mirror of https://github.com/hpcaitech/ColossalAI
HELSON committed 3 years ago (committed by GitHub)
13 changed files with 263 additions and 499 deletions
@@ -1,119 +0,0 @@
import torch.distributed as dist

from colossalai.registry import DIST_GROUP_INITIALIZER
from colossalai.global_variables import moe_env
from .process_group_initializer import ProcessGroupInitializer
from ..parallel_mode import ParallelMode


@DIST_GROUP_INITIALIZER.register_module
class Initializer_Moemodel(ProcessGroupInitializer):
    """Model parallel initialization for MoE system.

    :param moe_model: Size of moe model parallel
    :param moe_data: Size of moe data parallel
    :param args: Args used in base class
    :param kwargs: Kwargs used in base class

    :type moe_model: int
    :type moe_data: int
    """
    def __init__(self, moe_model, moe_data, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.moe_model = moe_model
        self.moe_data = moe_data

    def init_dist_group(self):
        """Initialize model parallel groups in moe parallel environment,
        and assign local_ranks and groups to each gpu.

        :return: MoE model parallelism's information
        :rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.MOE_MODEL

        for i in range(self.moe_data):
            ranks = [i * self.moe_model + j for j in range(self.moe_model)]
            group = dist.new_group(ranks)

            if self.rank in ranks:
                local_rank = ranks.index(self.rank)
                group_world_size = len(ranks)
                process_group = group
                ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode


@DIST_GROUP_INITIALIZER.register_module
class Initializer_Moedata(ProcessGroupInitializer):
    """Data parallel initialization for MoE system.

    :param moe_model: Size of moe model parallel
    :param moe_data: Size of moe data parallel
    :param args: Args used in base class
    :param kwargs: Kwargs used in base class

    :type moe_model: int
    :type moe_data: int
    """
    def __init__(self, moe_model, moe_data, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.moe_model = moe_model
        self.moe_data = moe_data

    def init_dist_group(self):
        """Initialize data parallel groups in moe parallel environment,
        and assign local_ranks and groups to each gpu.

        :return: MoE data parallelism's information
        :rtype: Tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.MOE_DATA

        for i in range(self.moe_model):
            ranks = [i + j * self.moe_model for j in range(self.moe_data)]
            group = dist.new_group(ranks)

            if self.rank in ranks:
                local_rank = ranks.index(self.rank)
                group_world_size = len(ranks)
                process_group = group
                ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode


@DIST_GROUP_INITIALIZER.register_module
class Initializer_Moe(ProcessGroupInitializer):
    """Serves as the single entry point to MoE parallel initialization.

    :param args: Args used to initialize ProcessGroupInitializer
    :param kwargs: Kwargs used to initialize ProcessGroupInitializer
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.moe_model = moe_env.model_parallel_size
        self.moe_data = moe_env.data_parallel_size
        self.model_initializer = Initializer_Moemodel(
            self.moe_model, self.moe_data, *args, **kwargs)
        self.data_initializer = Initializer_Moedata(
            self.moe_model, self.moe_data, *args, **kwargs)

    def init_dist_group(self):
        """Initializes MoE parallel communication groups.

        :return: MoE parallelism's information
        :rtype: list of Tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        parallel_setting = [self.model_initializer.init_dist_group(),
                            self.data_initializer.init_dist_group()]
        return parallel_setting
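For orientation, the two deleted initializers split the global ranks in complementary ways: Initializer_Moemodel builds contiguous blocks of size moe_model, while Initializer_Moedata builds strided groups of size moe_data. A minimal sketch of the resulting rank layout (illustration only, assuming world_size = moe_model * moe_data = 4; not part of the diff):

# Hypothetical illustration: reproduce the rank lists the initializers above
# would pass to dist.new_group() for moe_model = moe_data = 2.
moe_model, moe_data = 2, 2
model_groups = [[i * moe_model + j for j in range(moe_model)] for i in range(moe_data)]
data_groups = [[i + j * moe_model for j in range(moe_data)] for i in range(moe_model)]
print(model_groups)    # [[0, 1], [2, 3]]  -> ParallelMode.MOE_MODEL groups
print(data_groups)     # [[0, 2], [1, 3]]  -> ParallelMode.MOE_DATA groups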
@@ -0,0 +1,72 @@
from functools import partial
import pytest
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
import colossalai
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Top1Router, UniformNoiseGenerator, MoeLayer, Experts
from colossalai.core import MOE_CONTEXT
from colossalai.utils.moe import sync_moe_model_param
from colossalai.engine.gradient_handler import MoeGradientHandler
from colossalai.testing import assert_equal_in_group

BATCH_SIZE = 4
DIM = 16
CONFIG = dict()


def run_test(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    expert_module = nn.Linear
    expert_factor = dict(in_features=DIM, out_features=DIM, device=get_current_device())

    MOE_CONTEXT.setup(42)    # MOE initialization
    noisy_func = UniformNoiseGenerator()
    router = Top1Router(noisy_func=noisy_func)
    num_experts_list = [1, 2, 4]
    layer_list = []
    for num_experts in num_experts_list:
        exp = Experts(expert_module, num_experts, **expert_factor)
        moe_layer = MoeLayer(DIM, num_experts, router, exp)
        layer_list.append(moe_layer)

    model = nn.Sequential(*layer_list)
    model = model.to(get_current_device())
    sync_moe_model_param(model)

    dist_dict = MOE_CONTEXT.information
    assert_equal_in_group(layer_list[0].experts.experts[0].weight.data, dist_dict[1].dp_group)
    assert_equal_in_group(layer_list[1].experts.experts[0].weight.data, dist_dict[2].dp_group)
    # MoE model synchronization passed

    grad_handler = MoeGradientHandler(model, 0)

    rank = dist.get_rank()
    torch.cuda.manual_seed(78 + rank)
    data = torch.randn(BATCH_SIZE, DIM, device=get_current_device())
    grad = torch.randn_like(data)

    MOE_CONTEXT.reset_loss()
    outputs = model(data)
    outputs.backward(grad)
    grad_handler.handle_gradient()

    assert_equal_in_group(layer_list[0].experts.experts[0].weight.grad, dist_dict[1].dp_group)
    assert_equal_in_group(layer_list[0].experts.experts[0].bias.grad, dist_dict[1].dp_group)

    assert_equal_in_group(layer_list[1].experts.experts[0].weight.grad, dist_dict[2].dp_group)
    assert_equal_in_group(layer_list[1].experts.experts[0].bias.grad, dist_dict[2].dp_group)
    # MoE grad handler test passed


@pytest.mark.dist
def test_grad_handler():
    world_size = 4
    run_func = partial(run_test, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_grad_handler()
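The test above checks that, after handle_gradient(), each expert's gradients are identical across its data-parallel group. A minimal sketch of that kind of synchronization, an all-reduce average over a process group (illustration only, not the MoeGradientHandler implementation):

import torch.distributed as dist

def average_grads_in_group(params, group):
    # Average each parameter's gradient over `group`; this is the property
    # the assert_equal_in_group calls above verify for the expert parameters.
    world_size = dist.get_world_size(group)
    for p in params:
        if p.grad is not None:
            dist.all_reduce(p.grad.data, op=dist.ReduceOp.SUM, group=group)
            p.grad.data.div_(world_size)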
@@ -0,0 +1,70 @@
from functools import partial
import pytest
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import torch.distributed as dist
import colossalai
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Experts
from colossalai.core import MOE_CONTEXT
from colossalai.utils.moe import sync_moe_model_param
from colossalai.testing import assert_equal_in_group

D_MODEL = 4
D_FF = 8
CONFIG = dict()


def run_test(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    expert_module = nn.Linear
    expert_factor = dict(in_features=D_MODEL, out_features=D_FF, device=get_current_device())

    MOE_CONTEXT.setup(42)    # MOE environment initialization
    exp0 = Experts(expert_module, 1, **expert_factor)
    exp1 = Experts(expert_module, 2, **expert_factor)
    exp2 = Experts(expert_module, 4, **expert_factor)
    exp3 = Experts(expert_module, 8, **expert_factor)

    assert exp0.num_local_experts == 1
    assert exp1.num_local_experts == 1
    assert exp2.num_local_experts == 1
    assert exp3.num_local_experts == 2
    # experts deployment passed

    dist_dict = MOE_CONTEXT.information
    rank = dist.get_rank()

    assert len(dist_dict) == 3
    assert dist.get_rank(dist_dict[4].ep_group) == rank
    assert dist.get_rank(dist_dict[2].ep_group) == rank % 2
    assert dist.get_rank(dist_dict[1].ep_group) == 0

    assert dist.get_rank(dist_dict[4].dp_group) == 0
    assert dist.get_rank(dist_dict[2].dp_group) == rank // 2
    assert dist.get_rank(dist_dict[1].dp_group) == rank
    # group creation passed

    model = nn.ModuleList([exp0, exp1, exp2, exp3])
    model = model.to(get_current_device())
    sync_moe_model_param(model)

    assert_equal_in_group(exp0.experts[0].weight.data, dist_dict[1].dp_group)
    assert_equal_in_group(exp0.experts[0].bias.data, dist_dict[1].dp_group)
    # MOE experts layout success when ep_size = 1

    assert_equal_in_group(exp1.experts[0].weight.data, dist_dict[2].dp_group)
    assert_equal_in_group(exp1.experts[0].bias.data, dist_dict[2].dp_group)
    # MOE experts layout success when ep_size = 2


@pytest.mark.dist
def test_moe_initialization():
    world_size = 4
    run_func = partial(run_test, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_moe_initialization()
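The assertions above pin down how experts are spread over the four ranks. A minimal sketch of the implied bookkeeping (formulas inferred from the assertions, not taken from the Experts implementation), for world_size = 4:

# Hypothetical reconstruction of the expert placement arithmetic.
world_size = 4
for num_experts in (1, 2, 4, 8):
    ep_size = min(num_experts, world_size)       # ranks that jointly hold the experts
    num_local_experts = num_experts // ep_size   # experts kept on each rank
    dp_size = world_size // ep_size              # data-parallel replicas of each expert
    print(num_experts, ep_size, num_local_experts, dp_size)
# -> (1, 1, 1, 4), (2, 2, 1, 2), (4, 4, 1, 1), (8, 4, 2, 1), matching the asserts above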
@@ -1,97 +0,0 @@
from functools import partial
import pytest
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Top1Router, MoeLayer
from colossalai.global_variables import moe_env

BATCH_SIZE = 32
NUM_EXPERTS = 4
CONFIG = dict(parallel=dict(moe=dict(size=4)))


def check_equal(A, B, atol=1e-06):
    assert torch.allclose(A, B, rtol=0, atol=atol)


def run_routing(rank, world_size, port, rs=2, hidden_size=128, data_type=torch.float32):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')

    # torch.set_printoptions(precision=30)
    torch.backends.cuda.matmul.allow_tf32 = False
    local_rank = gpc.get_local_rank(ParallelMode.GLOBAL)
    torch.manual_seed(rs + local_rank)
    moe_env.reset_loss()
    tokens = torch.randn(BATCH_SIZE, hidden_size, dtype=data_type, device=get_current_device(), requires_grad=True)
    # print(f"tokens:\n{tokens}")
    router = Top1Router(1)
    layer = MoeLayer(hidden_size, NUM_EXPERTS, router, nn.Identity())
    if data_type == torch.float16:
        layer = layer.half()
    layer.cuda_mode = False

    old_out = layer(tokens)
    # print(f"old output:\n{old_out}")

    ech = old_out.shape
    grad = torch.randn(ech, device=get_current_device())
    old_out.backward(grad)

    o_tk_grad = tokens.grad.data.clone()
    o_gt_grad = layer.gate.weight.grad.data.clone()

    tokens.grad.zero_()
    layer.gate.weight.grad.zero_()

    layer.cuda_mode = True
    new_out = layer(tokens)

    # print(torch.max(torch.abs(old_out - new_out)))
    if data_type == torch.float32:
        check_equal(old_out, new_out)
    else:
        check_equal(old_out, new_out, 1e-2)
    # print(f"forward functions passed")

    # print(f"new output:\n{new_out}")
    new_out.backward(grad)
    n_tk_grad = tokens.grad.data.clone()
    n_gt_grad = layer.gate.weight.grad.data.clone()

    # print(torch.max(torch.abs(o_tk_grad - n_tk_grad)))
    if data_type == torch.float32:
        check_equal(o_tk_grad, n_tk_grad)
    else:
        check_equal(o_tk_grad, n_tk_grad, 1e-2)
    # print(f"tokens gradient passed")

    # print(torch.max(torch.abs(o_gt_grad - n_gt_grad)))
    if data_type == torch.float32:
        check_equal(o_gt_grad, n_gt_grad, 5e-05)
    else:
        check_equal(o_gt_grad, n_gt_grad, 2e-01)
    # print(f"linear weight gradient passed")


@pytest.mark.skip(reason="Should be activated for detailed tests")
@pytest.mark.parametrize("rs", [2, 42, 60])
@pytest.mark.parametrize("hidden_size", [128, 256, 512, 768, 1024, 2048])
@pytest.mark.parametrize("data_type", [torch.float32, torch.float16])
def test_moe_top2(rs, hidden_size, data_type):
    world_size = 4
    run_func = partial(run_routing,
                       world_size=world_size,
                       port=free_port(),
                       rs=rs,
                       hidden_size=hidden_size,
                       data_type=data_type)
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_moe_top2(60, 512, torch.float16)
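Both deleted routing tests follow the same pattern: run the layer once with cuda_mode disabled as a reference, once with it enabled, and compare outputs and gradients at dtype-dependent tolerances. A minimal sketch of that comparison idiom (hypothetical reference_fn/fast_fn callables, not a ColossalAI API):

import torch

def compare_paths(reference_fn, fast_fn, x, dtype=torch.float32):
    # Tolerances mirror the 1e-6 (fp32) / 1e-2 (fp16) split used in the tests above.
    atol = 1e-06 if dtype == torch.float32 else 1e-2
    ref, fast = reference_fn(x), fast_fn(x)
    assert torch.allclose(ref, fast, rtol=0, atol=atol)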
@@ -1,97 +0,0 @@
from functools import partial
import pytest
import torch
import torch.nn as nn
import torch.multiprocessing as mp
import colossalai
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import free_port, get_current_device
from colossalai.nn.layer.moe import Top2Router, MoeLayer
from colossalai.global_variables import moe_env

BATCH_SIZE = 32
NUM_EXPERTS = 4
CONFIG = dict(parallel=dict(moe=dict(size=4)))


def check_equal(A, B, atol=1e-06):
    assert torch.allclose(A, B, rtol=0, atol=atol)


def run_routing(rank, world_size, port, rs=2, hidden_size=128, data_type=torch.float32):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')

    # torch.set_printoptions(precision=30)
    torch.backends.cuda.matmul.allow_tf32 = False
    local_rank = gpc.get_local_rank(ParallelMode.GLOBAL)
    torch.manual_seed(rs + local_rank)
    moe_env.reset_loss()
    tokens = torch.randn(BATCH_SIZE, hidden_size, dtype=data_type, device=get_current_device(), requires_grad=True)
    # print(f"tokens:\n{tokens}")
    router = Top2Router(1)
    layer = MoeLayer(hidden_size, NUM_EXPERTS, router, nn.Identity())
    if data_type == torch.float16:
        layer = layer.half()
    layer.cuda_mode = False

    old_out = layer(tokens)
    # print(f"old output:\n{old_out}")

    ech = old_out.shape
    grad = torch.randn(ech, device=get_current_device())
    old_out.backward(grad)

    o_tk_grad = tokens.grad.data.clone()
    o_gt_grad = layer.gate.weight.grad.data.clone()

    tokens.grad.zero_()
    layer.gate.weight.grad.zero_()

    layer.cuda_mode = True
    new_out = layer(tokens)

    # print(torch.max(torch.abs(old_out - new_out)))
    if data_type == torch.float32:
        check_equal(old_out, new_out)
    else:
        check_equal(old_out, new_out, 1e-2)
    # print(f"forward functions passed")

    # print(f"new output:\n{new_out}")
    new_out.backward(grad)
    n_tk_grad = tokens.grad.data.clone()
    n_gt_grad = layer.gate.weight.grad.data.clone()

    # print(torch.max(torch.abs(o_tk_grad - n_tk_grad)))
    if data_type == torch.float32:
        check_equal(o_tk_grad, n_tk_grad)
    else:
        check_equal(o_tk_grad, n_tk_grad, 1e-2)
    # print(f"tokens gradient passed")

    # print(torch.max(torch.abs(o_gt_grad - n_gt_grad)))
    if data_type == torch.float32:
        check_equal(o_gt_grad, n_gt_grad, 5e-05)
    else:
        check_equal(o_gt_grad, n_gt_grad, 2e-01)
    # print(f"linear weight gradient passed")


@pytest.mark.skip(reason="Should be activated for detailed tests")
@pytest.mark.parametrize("rs", [2, 42, 60])
@pytest.mark.parametrize("hidden_size", [128, 256, 512, 768, 1024, 2048])
@pytest.mark.parametrize("data_type", [torch.float32, torch.float16])
def test_moe_top2(rs, hidden_size, data_type):
    world_size = 4
    run_func = partial(run_routing,
                       world_size=world_size,
                       port=free_port(),
                       rs=rs,
                       hidden_size=hidden_size,
                       data_type=data_type)
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_moe_top2(2, 256, torch.float16)