# ColossalAI/tests/test_moe/test_moe_ep_tp.py

from copy import deepcopy

import pytest
import torch
import torch.distributed as dist
from transformers.models.mixtral.configuration_mixtral import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralModel

import colossalai
from colossalai.booster.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.testing.random import seed_all
from tests.test_moe.moe_utils import loose_close

# Sizes of the random input and of the small Mixtral model built in this test.
NUM_BATCH = 4  # first dimension of the random input tensor
NUM_TOK_PER_BATCH = 7  # second dimension of the random input tensor
NUM_EXPERTS = 4  # passed to MixtralConfig as num_local_experts
HIDDEN_SIZE_PER_HEAD = 4  # hidden_size is HIDDEN_SIZE_PER_HEAD * NUM_HEADS
NUM_HEADS = 4  # used for both attention heads and key/value heads
TOP_K = 2  # passed to MixtralConfig as num_experts_per_tok


@parameterize("stage", [1])
@parameterize("ep_size", [1, 2, 4])
def run_zero_with_original_model(stage: int, ep_size: int):
    """Compare a MoE-plugin-boosted Mixtral model against a hybrid-plugin-boosted one.

    A small ``MixtralModel`` is built in bfloat16 and deep-copied. The copy is
    boosted with ``MoeHybridParallelPlugin`` and the original with
    ``HybridParallelPlugin``. For two iterations both models are fed the same
    random ``inputs_embeds``; the mean of ``last_hidden_state``, the
    gradients and the post-step parameters are compared with ``loose_close``.
    Parameters whose shapes differ between the two models are skipped (see
    the TODO markers below).

    Must run inside an initialised ``torch.distributed`` process group, since
    it queries the world size and rank.

    Args:
        stage: passed as ``zero_stage`` to both plugins.
        ep_size: passed as ``ep_size`` to the MoE plugin; the tensor-parallel
            size is derived as ``world_size // ep_size``.

    NOTE(review): ``run_dist`` calls this function with no arguments, so the
    ``parameterize`` decorators presumably invoke the body once per listed
    value -- confirm against ``colossalai.testing``.
    """
    # Tensor-parallel size is whatever remains of the world after expert parallelism.
    tp_size = dist.get_world_size() // ep_size
    dtype = torch.bfloat16

    rank = torch.distributed.get_rank()
    torch.cuda.set_device(dist.get_rank())

    # The same seed value is used on every rank before building the model;
    # presumably so all ranks start from identical weights -- confirm seed_all semantics.
    seed_all(10086)

    config = MixtralConfig(
        hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
        intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
        num_hidden_layers=2,
        num_attention_heads=NUM_HEADS,
        num_key_value_heads=NUM_HEADS,
        num_local_experts=NUM_EXPERTS,
        num_experts_per_tok=TOP_K,
    )
    torch_model = MixtralModel(config).to(dtype).cuda()

    # Deep copy so that both boosted models start from the same parameter values.
    zero_model = deepcopy(torch_model).to(dtype)
    zero_optimizer = torch.optim.SGD(zero_model.parameters(), lr=1)
    # NOTE(review): the two boosters are kept in distinctly named variables; the
    # file's history mentions a hang fixed by renaming the booster variable, so
    # do not merge or reuse these names without confirming.
    moe_booster = Booster(
        plugin=MoeHybridParallelPlugin(
            tp_size=tp_size,
            moe_tp_size=tp_size,
            pp_size=1,
            ep_size=ep_size,
            zero_stage=stage,
            overlap_communication=False,
            initial_scale=1,
        )
    )
    zero_model, zero_optimizer, _, _, _ = moe_booster.boost(zero_model, zero_optimizer)

    # Reference setup: same tp_size / zero_stage, but the non-MoE hybrid plugin.
    hybird_booster = Booster(
        plugin=HybridParallelPlugin(
            tp_size=tp_size,
            pp_size=1,
            zero_stage=stage,
            overlap_communication=False,
            initial_scale=1,
        )
    )
    hybrid_model, hybrid_optimizer, _, _, _ = hybird_booster.boost(
        torch_model, torch.optim.SGD(torch_model.parameters(), lr=1)
    )
    # Re-seed with a rank-dependent value so each rank draws a different input.
    seed_all(1453 + rank)

    hybrid_model.train()
    zero_model.train()
    for _ in range(2):
        # MoE-plugin model forward; the same input_data is reused for the reference model below.
        input_data = torch.rand(
            NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
        ).cuda()
        zero_output = zero_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
        # MoE-plugin model backward (through the boosted optimizer)
        zero_optimizer.backward(zero_output)
        # reference (HybridParallelPlugin) model forward
        hybrid_output = hybrid_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
        loose_close(zero_output, hybrid_output, dtype=dtype)
        # reference model backward (through the boosted optimizer)
        hybrid_optimizer.backward(hybrid_output)

        # check grad: look up reference parameters by name
        name_to_p = {n: p for n, p in hybrid_model.named_parameters()}
        for n, p in zero_model.named_parameters():
            zero_grad = zero_optimizer.get_param_grad(p)
            if name_to_p[n].grad is None:
                # Reference parameter has no .grad: fill it with zeros and
                # skip the gradient comparison for this parameter.
                name_to_p[n].grad = torch.zeros_like(name_to_p[n])
                continue
            if zero_grad.shape != name_to_p[n].grad.shape:  # TODO check sharded and sliced moe
                continue
            loose_close(zero_grad, name_to_p[n].grad, dtype=dtype, name=n)

        # MoE-plugin optimizer step
        zero_optimizer.step()

        # reference optimizer step
        hybrid_optimizer.step()

        # check updated param (same-shape parameters only)
        for n, p in zero_model.named_parameters():
            if p.data.shape != name_to_p[n].data.shape:  # TODO check sharded and sliced moe
                continue
            loose_close(p.data, name_to_p[n].data, dtype=dtype, name=n)

    print(f"{dist.get_rank()} test passed")


def run_dist(rank, world_size, port):
    """Per-process entry point: launch ColossalAI, then run the comparison.

    Args:
        rank: forwarded to ``colossalai.launch`` as ``rank``.
        world_size: forwarded to ``colossalai.launch`` as ``world_size``.
        port: forwarded to ``colossalai.launch`` as ``port``; the host is
            fixed to ``"localhost"`` and the backend to ``"nccl"``.
    """
    launch_kwargs = {
        "rank": rank,
        "world_size": world_size,
        "host": "localhost",
        "port": port,
        "backend": "nccl",
    }
    colossalai.launch(**launch_kwargs)
    run_zero_with_original_model()


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [4])
@rerun_if_address_is_in_use()
def test_moe_ep_tp(world_size):
    """Pytest entry point: hand ``run_dist`` and ``world_size`` to ``spawn``.

    Args:
        world_size: value supplied by the ``parametrize`` mark (currently 4)
            and passed through to ``spawn`` as its second argument.
    """
    spawn(run_dist, world_size)


if __name__ == "__main__":
    # Run the test directly when this file is executed as a script.
    test_moe_ep_tp(world_size=4)
[zero] solve hang 5 months ago			`from copy import deepcopy`
[moe]: fix ep/tp tests, add hierarchical all2all (#4982) * fix: add warning for EP different behavior * fix: use shard_data in ep & tp model * to: add used_capacity * fix: fix router test * feat: add create_ep_node_group * feat: add create_ep_hierarchical_group fn * feat: add HierarchicalAllToAll * test: add hierarchical all2all test * fix: fix test errors * fix: simplify create_ep_hierarchical_group * fix: add hierarchical_alltoall arg * fix: fix environ typo * revert: revert process mesh order * to: add todo mark * fix: skip hierarchical_comm if torch < 1.13.1 1 year ago
[moe] merge moe into main (#4978) * update moe module * support openmoe 1 year ago			`import pytest`
			`import torch`
			`import torch.distributed as dist`
[zero] solve hang 5 months ago			`from transformers.models.mixtral.configuration_mixtral import MixtralConfig`
			`from transformers.models.mixtral.modeling_mixtral import MixtralModel`
[moe] merge moe into main (#4978) * update moe module * support openmoe 1 year ago
			`import colossalai`
[zero] solve hang 5 months ago			`from colossalai.booster.booster import Booster`
			`from colossalai.booster.plugin import HybridParallelPlugin`
[misc] solve booster hang by rename the variable 5 months ago			`from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin`
[zero] solve hang 5 months ago			`from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn`
			`from colossalai.testing.random import seed_all`
			`from tests.test_moe.moe_utils import loose_close`

[misc] solve booster hang by rename the variable 5 months ago			`NUM_BATCH = 4`
[zero] solve hang 5 months ago			`NUM_TOK_PER_BATCH, NUM_EXPERTS = 7, 4`
			`HIDDEN_SIZE_PER_HEAD = 4`
[misc] solve booster hang by rename the variable 5 months ago			`NUM_HEADS = 4`
[zero] solve hang 5 months ago			`TOP_K = 2`


			`@parameterize("stage", [1])`
			`@parameterize("ep_size", [1, 2, 4])`
[moe] implement tp 4 months ago			`def run_zero_with_original_model(stage: int, ep_size: int):`
			`tp_size = dist.get_world_size() // ep_size`
[zero] solve hang 5 months ago			`dtype = torch.bfloat16`

			`rank = torch.distributed.get_rank()`
			`torch.cuda.set_device(dist.get_rank())`

			`seed_all(10086)`

			`config = MixtralConfig(`
			`hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,`
			`intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,`
			`num_hidden_layers=2,`
			`num_attention_heads=NUM_HEADS,`
			`num_key_value_heads=NUM_HEADS,`
			`num_local_experts=NUM_EXPERTS,`
			`num_experts_per_tok=TOP_K,`
[moe]: fix ep/tp tests, add hierarchical all2all (#4982) * fix: add warning for EP different behavior * fix: use shard_data in ep & tp model * to: add used_capacity * fix: fix router test * feat: add create_ep_node_group * feat: add create_ep_hierarchical_group fn * feat: add HierarchicalAllToAll * test: add hierarchical all2all test * fix: fix test errors * fix: simplify create_ep_hierarchical_group * fix: add hierarchical_alltoall arg * fix: fix environ typo * revert: revert process mesh order * to: add todo mark * fix: skip hierarchical_comm if torch < 1.13.1 1 year ago			`)`
[zero] solve hang 5 months ago			`torch_model = MixtralModel(config).to(dtype).cuda()`

			`zero_model = deepcopy(torch_model).to(dtype)`
			`zero_optimizer = torch.optim.SGD(zero_model.parameters(), lr=1)`
[misc] solve booster hang by rename the variable 5 months ago			`moe_booster = Booster(`
[zero] solve hang 5 months ago			`plugin=MoeHybridParallelPlugin(`
[moe] implement tp 4 months ago			`tp_size=tp_size,`
			`moe_tp_size=tp_size,`
			`pp_size=1,`
			`ep_size=ep_size,`
			`zero_stage=stage,`
			`overlap_communication=False,`
			`initial_scale=1,`
[moe]: fix ep/tp tests, add hierarchical all2all (#4982) * fix: add warning for EP different behavior * fix: use shard_data in ep & tp model * to: add used_capacity * fix: fix router test * feat: add create_ep_node_group * feat: add create_ep_hierarchical_group fn * feat: add HierarchicalAllToAll * test: add hierarchical all2all test * fix: fix test errors * fix: simplify create_ep_hierarchical_group * fix: add hierarchical_alltoall arg * fix: fix environ typo * revert: revert process mesh order * to: add todo mark * fix: skip hierarchical_comm if torch < 1.13.1 1 year ago			`)`
[zero] solve hang 5 months ago			`)`
[misc] solve booster hang by rename the variable 5 months ago			`zero_model, zero_optimizer, _, _, _ = moe_booster.boost(zero_model, zero_optimizer)`
[zero] solve hang 5 months ago
[misc] solve booster hang by rename the variable 5 months ago			`hybird_booster = Booster(`
[zero] solve hang 5 months ago			`plugin=HybridParallelPlugin(`
			`tp_size=tp_size,`
			`pp_size=1,`
			`zero_stage=stage,`
			`overlap_communication=False,`
			`initial_scale=1,`
[moe]: fix ep/tp tests, add hierarchical all2all (#4982) * fix: add warning for EP different behavior * fix: use shard_data in ep & tp model * to: add used_capacity * fix: fix router test * feat: add create_ep_node_group * feat: add create_ep_hierarchical_group fn * feat: add HierarchicalAllToAll * test: add hierarchical all2all test * fix: fix test errors * fix: simplify create_ep_hierarchical_group * fix: add hierarchical_alltoall arg * fix: fix environ typo * revert: revert process mesh order * to: add todo mark * fix: skip hierarchical_comm if torch < 1.13.1 1 year ago			`)`
[zero] solve hang 5 months ago			`)`
[misc] solve booster hang by rename the variable 5 months ago			`hybrid_model, hybrid_optimizer, _, _, _ = hybird_booster.boost(`
			`torch_model, torch.optim.SGD(torch_model.parameters(), lr=1)`
			`)`
[zero] solve hang 5 months ago			`# create different input`
			`seed_all(1453 + rank)`

			`hybrid_model.train()`
			`zero_model.train()`
			`for _ in range(2):`
			`# zero-dp forward`
[misc] solve booster hang by rename the variable 5 months ago			`input_data = torch.rand(`
			`NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True`
			`).cuda()`
[zero] solve hang 5 months ago			`zero_output = zero_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()`
			`# zero-dp backward`
			`zero_optimizer.backward(zero_output)`
			`# torch-ddp forward`
			`hybrid_output = hybrid_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()`
			`loose_close(zero_output, hybrid_output, dtype=dtype)`
			`# torch-ddp backward`
			`hybrid_optimizer.backward(hybrid_output)`

			`# check grad`
			`name_to_p = {n: p for n, p in hybrid_model.named_parameters()}`
			`for n, p in zero_model.named_parameters():`
			`zero_grad = zero_optimizer.get_param_grad(p)`
			`if name_to_p[n].grad is None:`
			`name_to_p[n].grad = torch.zeros_like(name_to_p[n])`
			`continue`
[moe] implement tp 4 months ago			`if zero_grad.shape != name_to_p[n].grad.shape: # TODO check sharded and sliced moe`
			`continue`
[zero] solve hang 5 months ago			`loose_close(zero_grad, name_to_p[n].grad, dtype=dtype, name=n)`

			`# zero-dp step`
			`zero_optimizer.step()`

			`# original model step`
			`hybrid_optimizer.step()`

			`# check updated param`
			`for n, p in zero_model.named_parameters():`
[moe] implement tp 4 months ago			`if p.data.shape != name_to_p[n].data.shape: # TODO check sharded and sliced moe`
			`continue`
[zero] solve hang 5 months ago			`loose_close(p.data, name_to_p[n].data, dtype=dtype, name=n)`

			`print(f"{dist.get_rank()} test passed")`


			`def run_dist(rank, world_size, port):`
			`colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")`
			`run_zero_with_original_model()`
[moe] merge moe into main (#4978) * update moe module * support openmoe 1 year ago

			`@pytest.mark.dist`
[zero] solve hang 5 months ago			`@pytest.mark.parametrize("world_size", [4])`
[moe] merge moe into main (#4978) * update moe module * support openmoe 1 year ago			`@rerun_if_address_is_in_use()`
[zero] solve hang 5 months ago			`def test_moe_ep_tp(world_size):`
			`spawn(run_dist, world_size)`
[moe] merge moe into main (#4978) * update moe module * support openmoe 1 year ago

[npu] change device to accelerator api (#5239) * update accelerator * fix timer * fix amp * update * fix * update bug * add error raise * fix autocast * fix set device * remove doc accelerator * update doc * update doc * update doc * use nullcontext * update cpu * update null context * change time limit for example * udpate * update * update * update * [npu] polish accelerator code --------- Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com> Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com> 11 months ago			`if __name__ == "__main__":`
[zero] solve hang 5 months ago			`test_moe_ep_tp(world_size=4)`