Making large AI models cheaper, faster and more accessible
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

132 lines
4.4 KiB

from copy import deepcopy
import pytest
import torch
import torch.distributed as dist
from transformers.models.mixtral.configuration_mixtral import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralModel
import colossalai
from colossalai.booster.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.testing.random import seed_all
from tests.test_moe.moe_utils import assert_loose_close
# Shape of the random input fed to both models:
# (NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS).
NUM_BATCH = 4
NUM_TOK_PER_BATCH = 7
# Passed to MixtralConfig as `num_local_experts`.
NUM_EXPERTS = 4
# The model's hidden size is HIDDEN_SIZE_PER_HEAD * NUM_HEADS.
HIDDEN_SIZE_PER_HEAD = 4
# Used for both `num_attention_heads` and `num_key_value_heads`.
NUM_HEADS = 4
# Passed to MixtralConfig as `num_experts_per_tok`.
TOP_K = 2
@parameterize("stage", [1])
@parameterize("ep_size", [2])
def run_zero_with_original_model(stage: int, ep_size: int):
    """Compare a MoE-hybrid-parallel Mixtral against a hybrid-parallel reference.

    A small ``MixtralModel`` is built once and deep-copied. The copy is boosted
    with ``MoeHybridParallelPlugin`` and the original with
    ``HybridParallelPlugin``. Both then run two SGD steps on the same per-rank
    random input; after each step the loss, the gradients and the updated
    parameters are compared with ``assert_loose_close``. Tensors whose shapes
    differ between the two models are skipped.

    Requires an initialised ``torch.distributed`` process group; the CUDA
    device with index equal to the rank is selected. The arguments are
    supplied by the ``parameterize`` decorators (the function is invoked
    without arguments).

    Args:
        stage: ZeRO stage, forwarded to both plugins as ``zero_stage``.
        ep_size: Expert-parallel size for the MoE plugin. The tensor-parallel
            size is derived as ``world_size // ep_size``.
    """
    rank = dist.get_rank()
    tp_size = dist.get_world_size() // ep_size
    dtype = torch.bfloat16
    hidden_size = HIDDEN_SIZE_PER_HEAD * NUM_HEADS
    torch.cuda.set_device(rank)

    # Same seed on every rank so all ranks build identical initial weights.
    seed_all(10086)
    mixtral_config = MixtralConfig(
        hidden_size=hidden_size,
        intermediate_size=hidden_size * 2,
        num_hidden_layers=2,
        num_attention_heads=NUM_HEADS,
        num_key_value_heads=NUM_HEADS,
        num_local_experts=NUM_EXPERTS,
        num_experts_per_tok=TOP_K,
    )
    torch_model = MixtralModel(mixtral_config).to(dtype).cuda()

    # Settings common to the MoE plugin and the reference plugin.
    shared_plugin_kwargs = dict(
        tp_size=tp_size,
        pp_size=1,
        zero_stage=stage,
        overlap_communication=False,
        initial_scale=1,
    )

    # Model under test: a copy of the weights, boosted with the MoE plugin.
    zero_model = deepcopy(torch_model).to(dtype)
    zero_sgd = torch.optim.SGD(zero_model.parameters(), lr=1)
    moe_plugin = MoeHybridParallelPlugin(moe_tp_size=tp_size, ep_size=ep_size, **shared_plugin_kwargs)
    moe_booster = Booster(plugin=moe_plugin)
    zero_model, zero_optimizer, _, _, _ = moe_booster.boost(zero_model, zero_sgd)

    # Reference: the original model, boosted with the plain hybrid plugin.
    hybrid_plugin = HybridParallelPlugin(**shared_plugin_kwargs)
    hybrid_booster = Booster(plugin=hybrid_plugin)
    hybrid_sgd = torch.optim.SGD(torch_model.parameters(), lr=1)
    hybrid_model, hybrid_optimizer, _, _, _ = hybrid_booster.boost(torch_model, hybrid_sgd)

    # create different input on each rank
    seed_all(1453 + rank)

    hybrid_model.train()
    zero_model.train()
    for _ in range(2):
        raw_input = torch.rand(NUM_BATCH, NUM_TOK_PER_BATCH, hidden_size, requires_grad=True).cuda()

        # model under test: forward + backward
        zero_output = zero_model(inputs_embeds=raw_input.to(dtype)).last_hidden_state.mean()
        zero_optimizer.backward(zero_output)

        # reference model: forward, loss comparison, backward
        hybrid_output = hybrid_model(inputs_embeds=raw_input.to(dtype)).last_hidden_state.mean()
        assert_loose_close(zero_output, hybrid_output, dtype=dtype)
        hybrid_optimizer.backward(hybrid_output)

        # check grad
        hybrid_params = dict(hybrid_model.named_parameters())
        for name, zero_param in zero_model.named_parameters():
            zero_grad = zero_optimizer.get_param_grad(zero_param)
            reference = hybrid_params[name]
            if reference.grad is None:
                # No reference gradient: install zeros and skip the comparison.
                reference.grad = torch.zeros_like(reference)
            elif zero_grad.shape == reference.grad.shape:  # TODO check sharded and sliced moe
                assert_loose_close(zero_grad, reference.grad, dtype=dtype, name=name)

        # optimizer steps: model under test first, then the reference
        zero_optimizer.step()
        hybrid_optimizer.step()

        # check updated param
        for name, zero_param in zero_model.named_parameters():
            reference = hybrid_params[name]
            if zero_param.data.shape == reference.data.shape:  # TODO check sharded and sliced moe
                assert_loose_close(zero_param.data, reference.data, dtype=dtype, name=name)

    print(f"{dist.get_rank()} test passed")
def run_dist(rank, world_size, port):
    """Per-process entry point: launch ColossalAI, then run the comparison.

    Args:
        rank: Rank of this process, forwarded to ``colossalai.launch``.
        world_size: Total number of processes, forwarded to ``colossalai.launch``.
        port: Port forwarded to ``colossalai.launch``; the host is ``localhost``.
    """
    launch_kwargs = dict(
        rank=rank,
        world_size=world_size,
        host="localhost",
        port=port,
        backend="nccl",
    )
    colossalai.launch(**launch_kwargs)
    run_zero_with_original_model()
@pytest.mark.skip("tested in corresponding sharderformer")
@pytest.mark.dist
@pytest.mark.parametrize("world_size", [4])
@rerun_if_address_is_in_use()
def test_moe_ep_tp(world_size):
    """Spawn ``world_size`` processes that each execute ``run_dist``.

    Marked as skipped under pytest (see the skip reason on the decorator).

    Args:
        world_size: Number of processes handed to ``spawn``.
    """
    spawn(run_dist, world_size)
# Script entry point: run the distributed test directly with four processes.
if __name__ == "__main__":
    test_moe_ep_tp(world_size=4)