mirror of https://github.com/hpcaitech/ColossalAI
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
43 lines
1.4 KiB
43 lines
1.4 KiB
2 weeks ago
|
import numpy as np
|
||
|
import pytest
|
||
|
import torch
|
||
|
import torch.distributed as dist
|
||
|
|
||
|
import colossalai
|
||
|
from colossalai.cluster import ProcessGroupMesh
|
||
|
from colossalai.testing import rerun_if_address_is_in_use, spawn
|
||
|
from colossalai.testing.random import seed_all
|
||
|
from colossalai.utils import get_current_device
|
||
|
from colossalai.zero.low_level._utils import all_gather_into_flat_tensor_nd
|
||
|
|
||
|
|
||
|
def check_all_gather_2d():
    """Check ``all_gather_into_flat_tensor_nd`` over a 2 x 2 process-group mesh.

    A 128-element random tensor is created after seeding with 1024
    (presumably this makes the tensor identical on every rank -- the test
    relies on that; confirm against ``seed_all``).  Each rank keeps one
    chunk of it, selected by the row-major flattening of its
    (axis-0 rank, axis-1 rank) coordinates, then gathers across both mesh
    axes into a zero-initialised buffer and asserts that the buffer equals
    the full tensor.
    """
    seed_all(1024)
    full = torch.rand(128, device=get_current_device())

    # Axis 0 is the "extra" data-parallel dimension, axis 1 the "inner" one.
    mesh_shape = (2, 2)
    mesh = ProcessGroupMesh(*mesh_shape)
    groups = tuple(mesh.get_group_along_axis(axis) for axis in range(len(mesh_shape)))

    # This process's coordinates inside the mesh and the extent of each axis.
    coords = [dist.get_rank(pg) for pg in groups]
    dims = [dist.get_world_size(pg) for pg in groups]

    # Pick the shard whose flat (C-order) index matches the mesh coordinates.
    flat_index = np.ravel_multi_index(coords, dims)
    shards = full.chunk(dist.get_world_size())
    local_shard = shards[flat_index].clone()

    gathered = torch.zeros_like(full)
    all_gather_into_flat_tensor_nd(gathered, local_shard, group=groups)
    assert torch.equal(gathered, full)
|
||
|
|
||
|
|
||
|
def run_dist(rank, world_size, port):
    """Per-process entry point: launch colossalai, then run the 2D all-gather check.

    Args:
        rank: Rank forwarded to ``colossalai.launch``.
        world_size: World size forwarded to ``colossalai.launch``.
        port: Port forwarded to ``colossalai.launch``; the host is always
            ``"localhost"``.
    """
    launch_kwargs = dict(rank=rank, world_size=world_size, port=port, host="localhost")
    colossalai.launch(**launch_kwargs)

    check_all_gather_2d()
|
||
|
|
||
|
|
||
|
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_comm_nd():
    """Spawn four processes running ``run_dist`` to exercise the N-d all-gather."""
    # Four processes match the 2 x 2 mesh built in check_all_gather_2d.
    nprocs = 4
    spawn(run_dist, nprocs)
|
||
|
|
||
|
|
||
|
# Allow running this test directly as a script, without going through pytest.
if __name__ == "__main__":
    test_comm_nd()
|