mirror of https://github.com/hpcaitech/ColossalAI

commit e76308c6e6 (parent 90939b77e0)

[fix] rm use_zbv flag in Shardconfig; rm debug info;

Summary of the diff: the `use_zbv` field is removed from `ShardConfig`, and the plugins stop passing it in; the shardformer policies instead derive the flag from `self.pipeline_stage_manager.use_zbv`. Debug leftovers are removed (a global `torch.autograd.set_detect_anomaly(True)` call and a standalone dx/dw debug script), and the zero-bubble pipeline test gains a `HybridParallelPlugin` run over a Llama model.
@@ -1201,7 +1201,6 @@ class HybridParallelPlugin(PipelinePluginBase):
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
             inner_ring_size=inner_ring_size,
-            use_zbv=(pp_style == "zbv"),
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
@@ -373,7 +373,6 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
-            use_zbv=(pp_style == "zbv"),
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
@@ -60,6 +60,11 @@ class LlamaPolicy(Policy):
         else:
             norm_cls = RMSNorm
 
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
+
         sp_mode = self.shard_config.sequence_parallelism_mode or None
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
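The hunks that follow all repeat this one pattern: every place that used to read `self.shard_config.use_zbv` now derives the flag from the pipeline stage manager. A minimal sketch of the pattern outside the library (the helper name `derive_use_zbv` is hypothetical; only the `use_zbv` attribute on the stage manager is taken from the hunk above):

# Hypothetical helper, not part of ColossalAI: it mirrors the if/else
# block this commit adds to each policy's module_policy().
def derive_use_zbv(stage_manager) -> bool:
    # Zero-bubble-V is a property of the pipeline schedule, so the flag
    # is only meaningful when a pipeline stage manager exists.
    if stage_manager is not None:
        return stage_manager.use_zbv
    return False

# Usage, as in the hunks below:
#     use_zbv = derive_use_zbv(self.pipeline_stage_manager)

Keeping the flag on the stage manager gives it a single source of truth: previously the plugin copied `pp_style == "zbv"` into `ShardConfig`, and the two could drift apart.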
@@ -129,7 +134,7 @@ class LlamaPolicy(Policy):
                     kwargs=dict(
                         seq_parallel_mode=sp_mode,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 ),
                 SubModuleReplacementDescription(
@@ -138,7 +143,7 @@ class LlamaPolicy(Policy):
                     kwargs=dict(
                         seq_parallel_mode=sp_mode,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 ),
                 SubModuleReplacementDescription(
@@ -147,7 +152,7 @@ class LlamaPolicy(Policy):
                     kwargs=dict(
                         seq_parallel_mode=sp_mode,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 ),
                 SubModuleReplacementDescription(
@@ -156,7 +161,7 @@ class LlamaPolicy(Policy):
                     kwargs=dict(
                         seq_parallel_mode=sp_mode,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 ),
                 SubModuleReplacementDescription(
@@ -165,7 +170,7 @@ class LlamaPolicy(Policy):
                     kwargs=dict(
                         seq_parallel_mode=sp_mode,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 ),
                 SubModuleReplacementDescription(
@@ -174,7 +179,7 @@ class LlamaPolicy(Policy):
                     kwargs=dict(
                         seq_parallel_mode=sp_mode,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 ),
                 SubModuleReplacementDescription(
@@ -183,7 +188,7 @@ class LlamaPolicy(Policy):
                     kwargs=dict(
                         seq_parallel_mode=sp_mode,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 ),
             ],
@@ -413,6 +418,10 @@ class LlamaForSequenceClassificationPolicy(LlamaPolicy):
         from transformers import LlamaForSequenceClassification
 
         policy = super().module_policy()
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
 
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for sequence classification
@@ -425,6 +434,7 @@ class LlamaForSequenceClassificationPolicy(LlamaPolicy):
                     kwargs=dict(
                         gather_output=True,
                         fp8_communication=self.shard_config.fp8_communication,
+                        use_zbv=use_zbv,
                     ),
                 )
             ]
@@ -52,6 +52,10 @@ class MixtralPolicy(Policy):
         sp_group = self.shard_config.sequence_parallel_process_group or None
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
         tp_size = self.shard_config.tensor_parallel_size
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
 
         # modified for both SP and TP
         num_q_heads = self.model.config.num_attention_heads
@@ -126,7 +130,7 @@ class MixtralPolicy(Policy):
                     target_module=Linear1D_Col,
                     kwargs={
                         "fp8_communication": self.shard_config.fp8_communication,
-                        "use_zbv": self.shard_config.use_zbv,
+                        "use_zbv": use_zbv,
                     },
                 ),
                 SubModuleReplacementDescription(
@@ -134,7 +138,7 @@ class MixtralPolicy(Policy):
                     target_module=Linear1D_Col,
                     kwargs={
                         "fp8_communication": self.shard_config.fp8_communication,
-                        "use_zbv": self.shard_config.use_zbv,
+                        "use_zbv": use_zbv,
                     },
                 ),
                 SubModuleReplacementDescription(
@@ -142,7 +146,7 @@ class MixtralPolicy(Policy):
                     target_module=Linear1D_Col,
                     kwargs={
                         "fp8_communication": self.shard_config.fp8_communication,
-                        "use_zbv": self.shard_config.use_zbv,
+                        "use_zbv": use_zbv,
                     },
                 ),
                 SubModuleReplacementDescription(
@@ -150,7 +154,7 @@ class MixtralPolicy(Policy):
                     target_module=Linear1D_Row,
                     kwargs={
                         "fp8_communication": self.shard_config.fp8_communication,
-                        "use_zbv": self.shard_config.use_zbv,
+                        "use_zbv": use_zbv,
                     },
                 ),
                 SubModuleReplacementDescription(
@@ -159,7 +163,7 @@ class MixtralPolicy(Policy):
                     kwargs={
                         "gather_output": True,
                         "fp8_communication": self.shard_config.fp8_communication,
-                        "use_zbv": self.shard_config.use_zbv,
+                        "use_zbv": use_zbv,
                     },
                 ),
             ],
@@ -195,7 +199,7 @@ class MixtralPolicy(Policy):
                         "tp_group": self.shard_config.tensor_parallel_process_group,
                         "moe_dp_group": self.shard_config.moe_dp_group,
                         "fp8_communication": self.shard_config.fp8_communication,
-                        "use_zbv": self.shard_config.use_zbv,
+                        "use_zbv": use_zbv,
                     },
                 )
             ],
@@ -330,6 +334,10 @@ class MixtralModelPolicy(MixtralPolicy):
 class MixtralForCausalLMPolicy(MixtralPolicy):
     def module_policy(self):
         policy = super().module_policy()
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
         # TODO: assign pg mesh from plugin to all modules
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for causal lm
@@ -342,7 +350,7 @@ class MixtralForCausalLMPolicy(MixtralPolicy):
                     kwargs=dict(
                         gather_output=True,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 )
             ],
@@ -392,6 +400,10 @@ class MixtralForSequenceClassificationPolicy(MixtralPolicy):
         from transformers import MixtralForSequenceClassification
 
         policy = super().module_policy()
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
 
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for sequence classification
@@ -404,7 +416,7 @@ class MixtralForSequenceClassificationPolicy(MixtralPolicy):
                     kwargs=dict(
                         gather_output=True,
                         fp8_communication=self.shard_config.fp8_communication,
-                        use_zbv=self.shard_config.use_zbv,
+                        use_zbv=use_zbv,
                     ),
                 )
             ]
@@ -49,7 +49,6 @@ class ShardConfig:
     make_vocab_size_divisible_by: int = 64
     gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
-    use_zbv: bool = False
 
     # For ring attention
     inner_ring_size: Optional[int] = None
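After this hunk, `ShardConfig` no longer carries `use_zbv` at all, so any downstream read of `shard_config.use_zbv` must migrate. A sketch of a few of the fields visible in this hunk after the removal (illustrative only; the real dataclass has many more fields):

from dataclasses import dataclass, field
from typing import Any, Dict, Optional

# Sketch of the slimmed-down config after this commit (hypothetical name).
@dataclass
class ShardConfigSketch:
    make_vocab_size_divisible_by: int = 64
    extra_kwargs: Dict[str, Any] = field(default_factory=dict)
    # For ring attention
    inner_ring_size: Optional[int] = None

# Migration for callers:
#     before: use_zbv = shard_config.use_zbv
#     after:  use_zbv = stage_manager.use_zbv if stage_manager else False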
@@ -5,8 +5,6 @@ import warnings
 from contextlib import nullcontext
 
 import torch
-
-torch.autograd.set_detect_anomaly(True)
 import torch.distributed as dist
 from data_utils import RandomDataset
 from model_utils import format_numel_str, get_model_numel
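The removed `set_detect_anomaly(True)` is part of the debug info this commit cleans up: it enables anomaly detection process-wide, recording a traceback for every autograd operation and checking backward for NaNs, which slows training substantially. If anomaly tracing is ever needed again, the scoped context-manager form keeps the overhead local; a small sketch using the standard PyTorch API:

import torch

x = torch.randn(4, requires_grad=True)
loss = (x * 2).sum()

# Only this backward pays the anomaly-detection overhead, unlike the
# process-wide set_detect_anomaly(True) call that the commit removes.
with torch.autograd.detect_anomaly():
    loss.backward()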
@@ -8,12 +8,14 @@ import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.testing import assert_close
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaModel
 from transformers.models.mixtral.configuration_mixtral import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralModel
 
 import colossalai
 from colossalai.booster.booster import Booster
-from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import HybridParallelPlugin, MoeHybridParallelPlugin
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.interface import OptimizerWrapper
 from colossalai.logging import disable_existing_loggers
@@ -918,11 +920,181 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
     torch.cuda.empty_cache()
 
 
+@parameterize(
+    "config",
+    [
+        (0, 4, 1, 1),
+        # (1, 2, 2, 1),
+        # (1, 2, 1, 2),
+        # (1, 1, 2, 2),
+    ],
+)
+def run_with_booster_hybridplugin(config: Tuple[int, ...]):
+    stage, pp_size, tp_size, sp_size = config
+    num_microbatches = pp_size
+    dist.get_world_size()
+    rank = dist.get_rank()
+    dtype, precision = torch.float16, "fp16"
+    torch.cuda.set_device(dist.get_rank())
+
+    ########
+    # init base model
+    ########
+    assert pp_size <= NUM_LAYERS, "pp_size should be less than or equal to NUM_LAYERS"
+    config = LlamaConfig(
+        hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
+        intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
+        num_hidden_layers=NUM_LAYERS,
+        num_attention_heads=NUM_HEADS,
+        num_key_value_heads=NUM_HEADS,
+        attn_implementation="flash_attention_2",
+    )
+
+    # init model with the same seed
+    seed_all(10086)
+
+    torch_model = LlamaModel(config).to(dtype).cuda()
+    # TODO: Support MixtralForCausalLM
+    # torch_model = MixtralForCausalLM(config).to(dtype).cuda()
+    torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1)
+    # init schedule
+    h, a, s = config.hidden_size, config.num_attention_heads, 1024
+    mem_f = 34 * h + 5 * a * s
+    mem_w = -32 * h
+    mem_b = -mem_w - mem_f
+    graph = PipelineGraph(
+        n_stage=pp_size,
+        n_micro=num_microbatches,
+        f_cost=1,
+        b_cost=1,
+        w_cost=1,
+        c_cost=1,
+        f_mem=mem_f,
+        b_mem=mem_b,
+        w_mem=mem_w,
+    )
+
+    zbv_schedule = graph.get_v_schedule()
+
+    # init MoeHybridPlugin
+    plugin = HybridParallelPlugin(
+        pp_size=pp_size,
+        num_microbatches=pp_size,
+        tp_size=tp_size,
+        sp_size=sp_size,
+        zero_stage=stage,
+        enable_sequence_parallelism=sp_size > 1,
+        sequence_parallelism_mode="all_to_all" if sp_size > 1 else None,
+        overlap_communication=False,
+        initial_scale=1,
+        precision=precision,
+        find_unused_parameters=True,
+        pp_style="zbv",
+        scheduler_nodes=zbv_schedule,
+        num_model_chunks=2,
+    )
+
+    dp_size = plugin.dp_size
+
+    booster = Booster(plugin=plugin)
+
+    ########
+    # init pp model
+    ########
+
+    parallel_model = deepcopy(torch_model)
+    parallel_optimizer = torch.optim.SGD(parallel_model.parameters(), lr=1)
+    parallel_model, parallel_optimizer, _, _, _ = booster.boost(parallel_model, parallel_optimizer)
+    # create different input along dp axis
+    seed_all(1453 + rank)
+
+    torch_model.train()
+    parallel_model.train()
+    for i in range(2):
+        # gen random input
+        # input = torch.rand(
+        #     NUM_BATCH, NUM_TOK_PER_BATCH, NUM_HEADS, HIDDEN_SIZE_PER_HEAD, requires_grad=True
+        # ).cuda()
+        input_ids = torch.randint(0, torch_model.vocab_size, (NUM_BATCH, config.max_position_embeddings)).cuda()
+        attention_mask = torch.ones_like(input_ids).cuda()
+        input_ids.clone().cuda()
+        input_data = {"input_ids": input_ids, "attention_mask": attention_mask}
+
+        # dist.all_reduce(
+        #     input, group=plugin.pp_group
+        # )  # pp inputs except the first stage doesn't matter, but need to be replicate for torch model check
+
+        # dist.all_reduce(input, group=plugin.tp_group)  # tp group duplicate input
+        # dist.all_reduce(input, group=plugin.sp_group)  # sp group duplicate input
+
+        # run the model with hybrid parallel
+        if booster.plugin.stage_manager is not None:
+            # for test with pp
+            data_iter = iter([input_data])
+            sharded_output = booster.execute_pipeline(
+                data_iter,
+                parallel_model,
+                lambda x, y: x.last_hidden_state.mean(),
+                parallel_optimizer,
+                return_loss=True,
+                return_outputs=True,
+            )
+            # stage 0 chunk 0
+            parallel_output = None
+            if (
+                booster.plugin.stage_manager.is_first_stage(ignore_chunk=True)
+                and rank == dist.get_process_group_ranks(plugin.pp_group)[0]
+            ):
+                parallel_output = sharded_output["loss"]
+            else:
+                parallel_output = torch.tensor(12345.0, device="cuda")
+            # broadcast along pp axis
+            dist.broadcast(parallel_output, src=dist.get_process_group_ranks(plugin.pp_group)[0], group=plugin.pp_group)
+
+        else:
+            # for test without pp
+            parallel_output = parallel_model(
+                input_ids=input_data["input_ids"],
+                attention_mask=input_data["attention_mask"],
+            ).last_hidden_state.mean()
+            parallel_optimizer.backward(parallel_output)
+        parallel_optimizer.step()
+        parallel_optimizer.zero_grad()
+        dist.all_reduce(parallel_output, group=plugin.dp_group)
+
+        # ===================================================================================
+        # run normal model with all dp(different) inputs
+        all_inputs = [input_data for _ in range(dp_size)]
+        # dist.all_gather(all_inputs, input, group=plugin.dp_group)
+        torch_output_sum = 0
+        for input_data_ in all_inputs:
+            torch_output = torch_model(
+                input_ids=input_data_["input_ids"],
+                attention_mask=input_data_["attention_mask"],
+            ).last_hidden_state.mean()
+            torch_output.backward()
+            torch_output_sum += torch_output.detach()
+        # print(f"parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
+        # avg dp grads follows zero optimizer
+        for p in torch_model.parameters():
+            if p.grad is not None:
+                p.grad /= dp_size
+        torch_optimizer.step()
+        torch_optimizer.zero_grad()
+
+        print(f"loop {i} rank {dist.get_rank()} parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
+        # assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
+        # print(f"rank {dist.get_rank()} config {test_config} test passed")
+    clear_layout_converter()
+    Randomizer.reset_index()
+    torch.cuda.empty_cache()
+
+
 def run_dist(rank, world_size, port):
     disable_existing_loggers()
     colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-    # run_fwd_bwd_vschedule_with_optim()
     run_with_booster_moehybridplugin()
+    # run_with_booster_hybridplugin()
 
 
 @pytest.mark.dist
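The `mem_f`/`mem_w`/`mem_b` constants in the new test are the per-layer memory deltas fed to the V-schedule solver: `34 * h + 5 * a * s` matches the common per-token transformer activation estimate (roughly 34h for linear/norm activations plus 5as for attention maps), `mem_w` models memory freed when weight gradients complete, and `mem_b` is defined so the three phases cancel. A quick check of that invariant, reusing the formulas from the hunk (the concrete h, a, s values here are illustrative):

# Per-microbatch, per-layer memory deltas for the F (forward), B (input-grad)
# and W (weight-grad) phases, as in run_with_booster_hybridplugin above.
h, a, s = 4096, 32, 1024  # hidden size, attention heads, sequence length
mem_f = 34 * h + 5 * a * s  # forward allocates activations
mem_w = -32 * h  # weight-grad phase frees its share
mem_b = -mem_w - mem_f  # input-grad phase frees the rest

# F + B + W nets to zero, so a steady-state schedule has bounded memory.
assert mem_f + mem_b + mem_w == 0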
@@ -1,628 +0,0 @@
-import gc
-import time
-from copy import deepcopy
-
-import torch
-import torch.nn as nn
-from torch.testing import assert_close
-
-
-def get_model_numel(model):
-    return sum(p.numel() for p in model.parameters()) / 1024**2
-
-
-# Step1: dx = w*dy
-def backward_b(loss, x, model):
-    torch.autograd.backward(loss, inputs=x, retain_graph=True)
-
-
-# Step2: dummy dw = x*dy
-def backward_w(loss, model):
-    torch.autograd.backward(loss, inputs=list(model.parameters()))
-
-
-def test_double_dx_dw_split_nsync():
-    device = "cuda:0"
-    model = nn.Linear(4096, 4096, bias=None).to(device=device)
-    # print(f"model numel {get_model_numel(model)}")  # 4GB
-    x1 = torch.rand(4096, 4096).to(device=device)
-    x2 = torch.rand(4096, 4096).to(device=device)
-    ref_model = deepcopy(model)
-    ref_x1 = x1.clone()
-    ref_x2 = x1.clone()
-
-    # first step
-    x1.requires_grad_()
-    x2.requires_grad_()
-    ref_x1.requires_grad_()
-    ref_x2.requires_grad_()
-
-    # loss for dx_dw bwd
-    loss1 = model(x1).sum()
-    loss2 = model(x2).sum()
-
-    # loss for common bwd
-    ref_loss1 = ref_model(ref_x1).sum()
-    ref_loss2 = ref_model(ref_x2).sum()
-
-    # dx1
-    torch.cuda.synchronize()
-    bwd_b_start_time = time.time()
-    backward_b(loss1, x1, model)
-    bwd_b_end_time = time.time()
-    print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
-
-    for p in model.parameters():
-        assert p.grad is None
-    assert x1.grad is not None
-
-    # dx2
-    torch.cuda.synchronize()
-    bwd_b_start_time = time.time()
-    backward_b(loss2, x2, model)
-    bwd_b_end_time = time.time()
-    print(f"loss_2 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
-
-    # dw1
-    torch.cuda.synchronize()
-    bwd_w_start_time = time.time()
-    backward_w(loss1, model)
-    bwd_w_end_time = time.time()
-    print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
-    for p in model.parameters():
-        assert p.grad is not None
-
-    # common bwd 1
-    torch.cuda.synchronize()
-    comm_bwd_start_time = time.time()
-    ref_loss1.backward()
-    comm_bwd_end_time = time.time()
-    print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
-
-    # # assert dx1 & dw1 == bwd 1
-    # assert_close(x1.grad, ref_x1.grad)
-    # for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-    #     assert_close(p1, p2)
-    #     assert_close(p1.grad, p2.grad)
-
-    # dw2
-    torch.cuda.synchronize()
-    bwd_w_start_time = time.time()
-    backward_w(loss2, model)
-    bwd_w_end_time = time.time()
-    print(f"loss_2 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
-
-    # common bwd 2
-    torch.cuda.synchronize()
-    comm_bwd_start_time = time.time()
-    ref_loss2.backward()
-    comm_bwd_end_time = time.time()
-    print(f"loss_2 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
-
-    # # assert dx2 & dw2 == bwd 2
-    # assert_close(x2.grad, ref_x2.grad)
-    # for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-    #     print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
-    #     assert_close(p1, p2)
-    #     assert_close(p1.grad, p2.grad)
-
-
-def test_double_dx_dw_split_sync():
-    device = "cuda:0"
-    model = nn.Linear(8, 8, bias=None).to(device=device)
-    print(f"model size {get_model_numel(model)} ")  # 4GB
-    x1 = torch.rand(8, 8).to(device=device)
-    x2 = torch.rand(8, 8).to(device=device)
-
-    # x1 = torch.ones(8, 8).to(device=device)
-    # x2 = torch.ones(8, 8).to(device=device)
-
-    ref_model = deepcopy(model)
-    ref_x1 = x1.clone()
-    ref_x2 = x2.clone()
-
-    x1.requires_grad_()
-    x2.requires_grad_()
-    ref_x1.requires_grad_()
-    ref_x2.requires_grad_()
-
-    ############
-    # step1:
-    ############
-
-    # loss1
-    loss1 = model(x1).sum()
-
-    # ref_loss1
-    ref_model(ref_x1).sum()
-
-    # dx1
-    backward_b(loss1, x1, model)
-    for p in model.parameters():
-        assert p.grad is None
-    assert x1.grad is not None
-
-    # dw1
-    backward_w(loss1, model)
-    for p in model.parameters():
-        assert p.grad is not None
-
-    # common bwd 1
-    # ref_loss1.backward()
-
-    # assert dx1 & dw1 == bwd 1
-    assert_close(x1.grad, ref_x1.grad)
-    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-        assert_close(p1, p2)
-        assert_close(p1.grad, p2.grad)
-
-    ############
-    # step2:
-    ############
-
-    # loss2
-    loss2 = model(x2).sum()
-
-    # ref_loss2
-    ref_loss2 = ref_model(ref_x2).sum()
-
-    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-        print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
-        assert_close(p1, p2)
-        assert_close(p1.grad, p2.grad)
-
-    # dx2
-    backward_b(loss2, x2, model)
-
-    # dw2
-    backward_w(loss2, model)
-
-    # common bwd 2
-    ref_loss2.backward()
-
-    # assert dx2 & dw2 == bwd 2
-    assert_close(x2.grad, ref_x2.grad)
-    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-        print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
-        assert_close(p1, p2)
-        assert_close(p1.grad, p2.grad)
-
-
-def deallocate_output_tensor(out):
-    """Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.
-
-    This method should be called right after the output tensor has been
-    sent to the next pipeline stage. At this point, the output tensor is
-    only useful for its '.grad_fn' field, and not its '.data'.
-    """
-    assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__
-    assert out._base is None, "counter-productive to free a view of another tensor."
-    out.data = torch.empty(
-        (1,),
-        device=out.device,
-        dtype=out.dtype,
-    )
-
-
-IN_DIM = 8192
-OUT_DIM = 8192
-NUM_LAYER = 3
-
-
-class MlpModel(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.layers = nn.ModuleList([nn.Linear(IN_DIM, OUT_DIM, bias=None) for _ in range(NUM_LAYER)])
-
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-        return x
-
-
-class Attention(nn.Module):
-    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0, with_qkv=True):
-        super().__init__()
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim**-0.5
-        self.with_qkv = with_qkv
-        if self.with_qkv:
-            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-            self.proj = nn.Linear(dim, dim)
-            self.proj_drop = nn.Dropout(proj_drop)
-        self.attn_drop = nn.Dropout(attn_drop)
-
-    def forward(self, x):
-        B, N, C = x.shape
-        if self.with_qkv:
-            qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-            q, k, v = qkv[0], qkv[1], qkv[2]
-        else:
-            qkv = x.reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
-            q, k, v = qkv, qkv, qkv
-
-        attn = (q @ k.transpose(-2, -1)) * self.scale
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
-        if self.with_qkv:
-            x = self.proj(x)
-            x = self.proj_drop(x)
-        return x
-
-
-def mem_dx_dw():
-    device = "cuda:0"
-    # model = nn.Linear(IN_DIM, OUT_DIM, bias=None).to(device=device)
-    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    model = MlpModel().to(device=device)
-    print(f"model numel {get_model_numel(model)}")  # 4GB
-    print(f"After init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    print(f"Before init x1&2&3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    x1 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-    x2 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-    x3 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-
-    x1.requires_grad_()
-    x2.requires_grad_()
-    x3.requires_grad_()
-    print(f"After init x1&2&3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    ############
-    # step1:
-    ############
-    print(f"\nStep1")
-
-    # loss1
-    print(f"Before Fwd x1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    y1 = model(x1)
-    print(f"After Fwd x1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    print(f"Before loss1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    loss1 = y1.sum()
-    print(f"After loss1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    # dx1
-    backward_b(loss1, x1, model)
-
-    # dw1
-    backward_w(loss1, model)
-
-    deallocate_output_tensor(x1)
-    deallocate_output_tensor(y1)
-    # del x1
-    # del y1
-    print(f"After del x1&y1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    # print(f"\n Step1:collect:{gc.collect()}")
-    # print(f"object: {gc.get_objects()}")
-    # print(f"garbage: {gc.garbage}")
-
-    ############
-    # step2:
-    ############
-    print(f"\nStep2")
-
-    # loss2
-    y2 = model(x2)
-    loss2 = y2.sum()
-
-    # dx2
-    backward_b(loss2, x2, model)
-
-    # dw2
-    backward_w(loss2, model)
-    deallocate_output_tensor(x2)
-    deallocate_output_tensor(y2)
-    # del x2
-    # del y2
-    print(f"After del x2&y2: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    print(f"\n Step2:collect:{gc.collect()}")
-    # print(f"object: {gc.get_objects()}")
-    print(f"garbage: {gc.garbage}")
-
-    ############
-    # step3:
-    ############
-
-    print(f"\nStep3")
-
-    # loss3
-    y3 = model(x3)
-    loss3 = y3.sum()
-
-    # dx2
-    backward_b(loss3, x3, model)
-
-    # dw2
-    backward_w(loss3, model)
-
-    deallocate_output_tensor(x3)
-    deallocate_output_tensor(y3)
-    # del x3
-    # del y3
-
-    print(f"After del x3&y3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    print(f"\n Step3:collect:{gc.collect()}")
-    # print(f"object: {gc.get_objects()}")
-    print(f"garbage: {gc.garbage}")
-
-
-# del activation
-def activation_dx_dw():
-    device = "cuda:0"
-    # model = nn.Linear(IN_DIM, OUT_DIM, bias=None).to(device=device)
-    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    model = MlpModel().to(device=device)
-    x1 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-    x2 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-    x3 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-
-    x1.requires_grad_()
-    x2.requires_grad_()
-    x3.requires_grad_()
-    print(f"After init Model, x1,x2,x3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    activations = {}
-
-    def register_hooks(module):
-        def activation_hook(module, input, output):
-            activations[f"{module.__class__.__name__}_{id(module)}"] = output.detach()
-
-        def bwd_hook(module, grad_input, grad_output):
-            del activations[f"{module.__class__.__name__}_{id(module)}"]
-
-        module.register_forward_hook(activation_hook)
-        module.register_backward_hook(bwd_hook)
-
-    model.apply(register_hooks)
-
-    ############
-    # step1:
-    ############
-    print(f"\nStep1")
-
-    # loss1
-    loss1 = model(x1).sum()
-
-    # dx1
-    backward_b(loss1, x1, model)
-
-    # dw1
-    backward_w(loss1, model)
-
-    del loss1, x1
-    print(f"After del x1&y1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    ############
-    # step2:
-    ############
-    print(f"\nStep2")
-
-    # loss2
-    loss2 = model(x2).sum()
-
-    # dx2
-    backward_b(loss2, x2, model)
-
-    # dw2
-    backward_w(loss2, model)
-
-    # deallocate_output_tensor(x2)
-    # deallocate_output_tensor(loss2)
-    del x2, loss2
-    print(f"After del x2&y2: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    ############
-    # step3:
-    ############
-    print(f"\nStep3")
-
-    # loss3
-    loss3 = model(x3).sum()
-
-    # dx2
-    backward_b(loss3, x3, model)
-
-    # dw2
-    backward_w(loss3, model)
-
-    del x3, loss3
-
-    print(f"After del x3&y3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-
-# text dx dw in model chunk
-def model_chunk_dx_dw():
-    device = "cuda:0"
-    num_layers = 4
-    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    model = MlpModel(in_dim=4096, out_dim=4096, num_layers=num_layers).to(device=device)
-    x = torch.rand(4096, 4096).to(device=device)
-    x.requires_grad_()
-
-    model_chunk_0 = torch.nn.ModuleList()  # for layer 1 & 2
-    model_chunk_1 = torch.nn.ModuleList()  # for layer 3 & 4
-
-    for idx, sub_model in enumerate(model.layers):
-        if idx < 2:
-            model_chunk_0.append(sub_model).cuda()
-        else:
-            model_chunk_1.append(sub_model).cuda()
-
-    print(f"After init Model & input: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    # Step1:chunk 0 fwd
-    activation = dict()  # layer_id: activation
-    out = x
-    for i in range(len(model_chunk_0)):
-        layer = model_chunk_0[i]
-        activation[i] = layer(out)
-    print(f"After chunk0 fwd: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    # Step2:chunk 1 fwd
-    for i in range(len(model_chunk_1)):
-        layer = model_chunk_0[i]
-        activation[i + 2] = layer(out)
-    print(f"After chunk1 fwd: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    # Step3:chunk 1 bwd b: dx=w*dy & bwd w:dw=x*dy
-    # visit layer reversely
-    for i in range(len(model_chunk_1) - 1, -1, -1):
-        layer = model_chunk_1[i]
-        global_layer_idx = i + 2
-        prev_global_layer_idx = i + 1 if i + 1 > 0 else None
-        i + 3 if i + 3 < 4 else None
-
-        # bwd b
-        if global_layer_idx == num_layers - 1:  # last layer in last chunk; calculate loss
-            loss = activation[global_layer_idx].sum()
-            x = activation[prev_global_layer_idx]
-            backward_b(loss, x, layer)
-        else:
-            loss = activation[global_layer_idx].sum()
-            x = activation[prev_global_layer_idx]
-            backward_b(loss, x, layer)
-
-        # bwd w
-        backward_w(loss, layer)
-
-
-def test_dx_dw_linear_benchmark():
-    device = "cuda:0"
-    model = nn.Linear(4096, 4096, bias=None).to(device=device)
-    # print(f"model numel {get_model_numel(model)}")  # 4GB
-    x1 = torch.rand(4096, 4096).to(device=device)
-    # x2 = torch.rand(4096, 4096).to(device=device)
-    ref_model = deepcopy(model)
-    ref_x1 = x1.clone()
-    # ref_x2 = x1.clone()
-
-    # first step
-    x1.requires_grad_()
-    # x2.requires_grad_()
-    ref_x1.requires_grad_()
-    # ref_x2.requires_grad_()
-
-    # loss for dx_dw bwd
-    loss1 = model(x1).sum()
-    # loss2 = model(x2).sum()
-
-    # loss for common bwd
-    ref_model(ref_x1).sum()
-    # ref_loss2 = ref_model(ref_x2).sum()
-
-    with torch.profiler.profile(
-        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
-        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler(
-            f"/home/nvme-share/home/duanjunwen/ColossalAI/tests/test_pipeline/test_schedule"
-        ),
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True,
-        with_flops=True,
-    ) as prof:
-        # dx1
-        torch.cuda.synchronize()
-        bwd_b_start_time = time.time()
-        backward_b(loss1, x1, model)
-        bwd_b_end_time = time.time()
-        print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
-
-        for p in model.parameters():
-            assert p.grad is None
-        assert x1.grad is not None
-
-        # dw1
-        torch.cuda.synchronize()
-        bwd_w_start_time = time.time()
-        backward_w(loss1, model)
-        bwd_w_end_time = time.time()
-        print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
-        for p in model.parameters():
-            assert p.grad is not None
-
-        # # common bwd 1
-        # torch.cuda.synchronize()
-        # comm_bwd_start_time = time.time()
-        # ref_loss1.backward()
-        # comm_bwd_end_time = time.time()
-        # print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
-
-
-def test_dx_dw_attn_benchmark():
-    device = "cuda:0"
-    model = Attention(dim=4096).to(device=device)
-    # print(f"model numel {get_model_numel(model)}")  # 4GB
-    x1 = torch.rand(1, 256, 4096).to(device=device)
-    # x2 = torch.rand(1, 256, 4096).to(device=device)
-    ref_model = deepcopy(model)
-    ref_x1 = x1.clone()
-    # ref_x2 = x1.clone()
-
-    # first step
-    x1.requires_grad_()
-    # x2.requires_grad_()
-    ref_x1.requires_grad_()
-    # ref_x2.requires_grad_()
-
-    # loss for dx_dw bwd
-    loss1 = model(x1).sum()
-    # loss2 = model(x2).sum()
-
-    # loss for common bwd
-    ref_model(ref_x1).sum()
-    # ref_loss2 = ref_model(ref_x2).sum()
-
-    with torch.profiler.profile(
-        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
-        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler(
-            f"/home/nvme-share/home/duanjunwen/ColossalAI/tests/test_pipeline/test_schedule"
-        ),
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True,
-        with_flops=True,
-    ) as prof:
-        # dx1
-        torch.cuda.synchronize()
-        bwd_b_start_time = time.time()
-        backward_b(loss1, x1, model)
-        bwd_b_end_time = time.time()
-        print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
-
-        for p in model.parameters():
-            assert p.grad is None
-        assert x1.grad is not None
-
-        # dw1
-        torch.cuda.synchronize()
-        bwd_w_start_time = time.time()
-        backward_w(loss1, model)
-        bwd_w_end_time = time.time()
-        print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
-        for p in model.parameters():
-            assert p.grad is not None
-
-        # # common bwd 1
-        # torch.cuda.synchronize()
-        # comm_bwd_start_time = time.time()
-        # ref_loss1.backward()
-        # comm_bwd_end_time = time.time()
-        # print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
-
-
-if __name__ == "__main__":
-    # test_dx_dw_split()
-    # test_double_dx_dw_split_nsync()
-    # test_double_dx_dw_split_sync()
-    # mem_dx_dw()
-    # activation_dx_dw()
-    # test_dx_dw_linear_benchmark()
-    test_dx_dw_attn_benchmark()
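The file deleted above was a scratch script for the dx/dw split that zero-bubble scheduling relies on: `backward_b` computes only input gradients (the B phase, via `torch.autograd.backward(..., inputs=x, retain_graph=True)`) and `backward_w` later computes only weight gradients (the W phase). A self-contained sketch of the same correctness check, kept here since the script is gone (CPU-only, tiny shapes; the variable names are illustrative):

import torch
import torch.nn as nn
from copy import deepcopy
from torch.testing import assert_close

model = nn.Linear(8, 8, bias=False)
ref_model = deepcopy(model)
x = torch.rand(4, 8, requires_grad=True)
ref_x = x.detach().clone().requires_grad_()

loss = model(x).sum()
# B phase: input grads only; keep the graph alive for the W phase.
torch.autograd.backward(loss, inputs=[x], retain_graph=True)
assert all(p.grad is None for p in model.parameters())
# W phase: weight grads only.
torch.autograd.backward(loss, inputs=list(model.parameters()))

# The split must match one fused backward on an identical copy.
ref_model(ref_x).sum().backward()
assert_close(x.grad, ref_x.grad)
for p, q in zip(model.parameters(), ref_model.parameters()):
    assert_close(p.grad, q.grad)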
@@ -277,7 +277,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        # TODO: assert layer error
+        # # TODO: assert layer error
         # {
         #     "tp_size": 2,
         #     "pp_size": 2,