ColossalAI/tests/test_comm/test_comm.py

from functools import partial

import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from colossalai.communication import all_gather, all_reduce, reduce_scatter
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.utils import free_port, get_current_device

# Parallel configuration handed to `launch` in `check_layer`.
# NOTE(review): `data` is 8 while `test_comm` spawns 4 processes -- confirm that
# `launch` tolerates (or ignores) the mismatch.
CONFIG = {
    'parallel': {
        'data': 8,
        'pipeline': 1,
        'tensor': {'mode': None, 'size': 1},
    },
}

# Number of elements each rank contributes to every collective under test.
SIZE = 8


def check_all_gather():
    """Run an asynchronous ``all_gather`` over the global group and verify it.

    Each rank contributes ``SIZE`` consecutive integers starting at
    ``rank * SIZE``. The tensor is printed before the call, immediately after
    the asynchronous call returns, and once more after waiting on the returned
    handle. The final tensor is then compared with the expected concatenation.

    Raises:
        AssertionError: if the gathered tensor differs from
            ``0, 1, ..., world_size * SIZE - 1``.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    tensor = torch.arange(rank * SIZE, (rank + 1) * SIZE).to(get_current_device())
    print('Before:   Rank {0} - {1}'.format(rank, tensor))

    tensor, op = all_gather(tensor, 0, ParallelMode.GLOBAL, async_op=True)
    # Printed before waiting on purpose: the collective may still be in flight,
    # so the values shown here are not guaranteed to be final.
    print('After:    Rank {0} - {1}'.format(rank, tensor))

    op.wait()
    print('Complete: Rank {0} - {1}'.format(rank, tensor))
    torch.cuda.synchronize()

    # Gathering along dim 0 is expected to concatenate the per-rank ranges in
    # rank order (assumes the GLOBAL group spans all `world_size` processes).
    expected = torch.arange(world_size * SIZE, device=tensor.device)
    assert torch.equal(tensor, expected), \
        'all_gather mismatch on rank {0}: got {1}, expected {2}'.format(rank, tensor, expected)


def check_reduce_scatter():
    """Run an asynchronous ``reduce_scatter`` over the global group and verify it.

    Each rank contributes ``SIZE`` consecutive integers starting at
    ``rank * SIZE``. The tensor is printed before the call, immediately after
    the asynchronous call returns, and once more after waiting on the returned
    handle. The final tensor is then compared with the expected chunk.

    Raises:
        AssertionError: if the tensor received by this rank differs from its
            chunk of the element-wise sum of all ranks' inputs.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    tensor = torch.arange(rank * SIZE, (rank + 1) * SIZE).to(get_current_device())
    print('Before:   Rank {0} - {1}'.format(rank, tensor))

    tensor, op = reduce_scatter(tensor, 0, ParallelMode.GLOBAL, async_op=True)
    # Printed before waiting on purpose: the collective may still be in flight,
    # so the values shown here are not guaranteed to be final.
    print('After:    Rank {0} - {1}'.format(rank, tensor))

    op.wait()
    print('Complete: Rank {0} - {1}'.format(rank, tensor))
    torch.cuda.synchronize()

    # Rebuild every rank's input locally, reduce them, and keep this rank's slice.
    # NOTE(review): assumes the default reduction is SUM and that rank r receives
    # the r-th chunk along dim 0 -- confirm against colossalai.communication.
    all_inputs = torch.stack([
        torch.arange(r * SIZE, (r + 1) * SIZE, device=tensor.device) for r in range(world_size)
    ])
    expected = torch.chunk(all_inputs.sum(dim=0), world_size, dim=0)[rank]
    assert torch.equal(tensor, expected), \
        'reduce_scatter mismatch on rank {0}: got {1}, expected {2}'.format(rank, tensor, expected)


def check_all_reduce():
    """Run an asynchronous ``all_reduce`` over the global group and verify it.

    Each rank contributes ``SIZE`` consecutive integers starting at
    ``rank * SIZE``. The tensor is printed before the call, immediately after
    the asynchronous call returns, and once more after waiting on the returned
    handle. The final tensor is then compared with the expected reduction.

    Raises:
        AssertionError: if the reduced tensor differs from the element-wise
            sum of all ranks' inputs.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    tensor = torch.arange(rank * SIZE, (rank + 1) * SIZE).to(get_current_device())
    print('Before:   Rank {0} - {1}'.format(rank, tensor))

    tensor, op = all_reduce(tensor, ParallelMode.GLOBAL, async_op=True)
    # Printed before waiting on purpose: the collective may still be in flight,
    # so the values shown here are not guaranteed to be final.
    print('After:    Rank {0} - {1}'.format(rank, tensor))

    op.wait()
    print('Complete: Rank {0} - {1}'.format(rank, tensor))
    torch.cuda.synchronize()

    # Rebuild every rank's input locally and reduce them for comparison.
    # NOTE(review): assumes the default reduction is SUM -- confirm against
    # colossalai.communication.
    all_inputs = torch.stack([
        torch.arange(r * SIZE, (r + 1) * SIZE, device=tensor.device) for r in range(world_size)
    ])
    expected = all_inputs.sum(dim=0)
    assert torch.equal(tensor, expected), \
        'all_reduce mismatch on rank {0}: got {1}, expected {2}'.format(rank, tensor, expected)


def check_layer(rank, world_size, port):
    """Per-process entry point: launch the distributed context and run all checks.

    Args:
        rank: rank of this process, forwarded to ``launch``.
        world_size: total number of processes, forwarded to ``launch``.
        port: port forwarded to ``launch`` (the host is always ``'localhost'``).
    """
    launch(
        config=CONFIG,
        rank=rank,
        world_size=world_size,
        host='localhost',
        port=port,
        backend='nccl',
    )

    # The rank reported by torch.distributed must agree with the global context.
    current_rank = dist.get_rank()
    assert current_rank == gpc.get_global_rank()
    print('Rank {} / {}'.format(current_rank, dist.get_world_size()))

    for run_check in (check_all_gather, check_reduce_scatter, check_all_reduce):
        run_check()

    # Tear down the global context, then empty the CUDA cache.
    gpc.destroy()
    torch.cuda.empty_cache()


@pytest.mark.dist
def test_comm():
    """Spawn four worker processes that each run ``check_layer``."""
    num_procs = 4
    # One free port is picked up front and shared by every spawned worker.
    shared_port = free_port()
    worker = partial(check_layer, world_size=num_procs, port=shared_port)
    mp.spawn(worker, nprocs=num_procs)


# Allow running this test directly as a script, without going through pytest.
if __name__ == '__main__':
    test_comm()
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`from functools import partial`

			`import pytest`
			`import torch`
			`import torch.distributed as dist`
			`import torch.multiprocessing as mp`
			`from colossalai.communication import all_gather, all_reduce, reduce_scatter`
			`from colossalai.context import ParallelMode`
			`from colossalai.core import global_context as gpc`
			`from colossalai.initialize import launch`
Hotfix/Colossalai layers (#92) * optimized 1d layer apis; reorganized nn.layer modules; fixed tests * fixed 2.5d runtime issue * reworked split batch, now called in trainer.schedule.load_batch Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-29 15:32:10 +00:00			`from colossalai.utils import free_port, get_current_device`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00
			`CONFIG = dict(parallel=dict(data=8, pipeline=1, tensor=dict(mode=None, size=1)))`

			`SIZE = 8`


			`def check_all_gather():`
			`tensor = torch.tensor([dist.get_rank() * SIZE + j for j in range(SIZE)])`
			`tensor = tensor.to(get_current_device())`
			`print('Before: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`tensor, op = all_gather(tensor, 0, ParallelMode.GLOBAL, async_op=True)`
			`print('After: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`op.wait()`
			`print('Complete: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`torch.cuda.synchronize()`


			`def check_reduce_scatter():`
			`tensor = torch.tensor([dist.get_rank() * SIZE + j for j in range(SIZE)])`
			`tensor = tensor.to(get_current_device())`
			`print('Before: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`tensor, op = reduce_scatter(tensor, 0, ParallelMode.GLOBAL, async_op=True)`
			`print('After: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`op.wait()`
			`print('Complete: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`torch.cuda.synchronize()`


			`def check_all_reduce():`
			`tensor = torch.tensor([dist.get_rank() * SIZE + j for j in range(SIZE)])`
			`tensor = tensor.to(get_current_device())`
			`print('Before: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`tensor, op = all_reduce(tensor, ParallelMode.GLOBAL, async_op=True)`
			`print('After: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`op.wait()`
			`print('Complete: Rank {0} - {1}'.format(dist.get_rank(), tensor))`
			`torch.cuda.synchronize()`


Hotfix/Colossalai layers (#92) * optimized 1d layer apis; reorganized nn.layer modules; fixed tests * fixed 2.5d runtime issue * reworked split batch, now called in trainer.schedule.load_batch Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-29 15:32:10 +00:00			`def check_layer(rank, world_size, port):`
			`launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00
			`assert dist.get_rank() == gpc.get_global_rank()`
			`print('Rank {} / {}'.format(dist.get_rank(), dist.get_world_size()))`

			`check_all_gather()`
			`check_reduce_scatter()`
			`check_all_reduce()`

			`gpc.destroy()`
			`torch.cuda.empty_cache()`


			`@pytest.mark.dist`
			`def test_comm():`
			`world_size = 4`
Hotfix/Colossalai layers (#92) * optimized 1d layer apis; reorganized nn.layer modules; fixed tests * fixed 2.5d runtime issue * reworked split batch, now called in trainer.schedule.load_batch Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-29 15:32:10 +00:00			`run_func = partial(check_layer, world_size=world_size, port=free_port())`
Layer integration (#83) * integrated parallel layers for ease of building models * integrated 2.5d layers * cleaned codes and unit tests * added log metric by step hook; updated imagenet benchmark; fixed some bugs * reworked initialization; cleaned codes Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com> 2021-12-27 07:04:32 +00:00			`mp.spawn(run_func, nprocs=world_size)`


			`if __name__ == '__main__':`
			`test_comm()`