mirror of https://github.com/hpcaitech/ColossalAI
[fix] fix fail case test_shard_llama
parent 2eca112c90
commit d0ec221b38
@@ -3,6 +3,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import torch
 import torch.cuda
+import torch.distributed
 from torch.nn import Module, ModuleList
 from torch.utils._pytree import tree_flatten, tree_map
 
@@ -544,7 +545,6 @@ class ZeroBubbleVPipeScheduler(PipelineSchedule):
             ctx = optimizer.no_sync()
         except AttributeError:
             ctx = model_chunk.no_sync()
-
         with ctx:
             optimizer.backward_by_grad(
                 tensor=output_obj_,
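The try/except in this hunk prefers the optimizer's no_sync() and falls back to the module's own no_sync() when the optimizer does not provide one. A minimal standalone sketch of that fallback pattern, with an illustrative helper name (not ColossalAI's API):

from contextlib import nullcontext

def grad_sync_context(optimizer, model_chunk):
    # Prefer the optimizer's no_sync() (e.g. a ZeRO/hybrid wrapper exposes one).
    try:
        return optimizer.no_sync()
    except AttributeError:
        # Plain torch.optim optimizers have no no_sync(); fall back to the model
        # (DDP-style), or do nothing if the model is not wrapped either.
        return getattr(model_chunk, "no_sync", nullcontext)()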
@@ -228,5 +228,4 @@ class PipelineStageManager:
         start_position = (num_stages * num_model_chunks) // 2 - remainder // 2
         for i in range(start_position, start_position + remainder):
             layers_per_stage[i] += 1
-        # print(f"layers_per_stage {layers_per_stage}")
         return layers_per_stage
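The lines around the removed debug print assign the leftover layers to the middle pipeline stages. A self-contained sketch of that distribution, assuming the surrounding method first divides num_layers evenly (the standalone wrapper function is illustrative):

def distribute_layers(num_layers: int, num_stages: int, num_model_chunks: int) -> list:
    quotient, remainder = divmod(num_layers, num_stages * num_model_chunks)
    layers_per_stage = [quotient] * (num_stages * num_model_chunks)
    # Hand the remainder to the middle virtual stages so the outer stages,
    # which typically also hold the embedding and LM head, stay lighter.
    start_position = (num_stages * num_model_chunks) // 2 - remainder // 2
    for i in range(start_position, start_position + remainder):
        layers_per_stage[i] += 1
    return layers_per_stage

# e.g. 10 layers over 4 stages (1 model chunk) -> [2, 3, 3, 2]
assert distribute_layers(10, 4, 1) == [2, 3, 3, 2]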
@@ -32,7 +32,6 @@ from colossalai.shardformer.shard import ShardConfig
 from ..layer import ColoAttention, RingAttention, dist_cross_entropy
 
 _SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
-_GLOBAL_ORDER_ = 0
 
 
 class LlamaPipelineForwards:
@@ -194,10 +193,6 @@ class LlamaPipelineForwards:
         assert num_ckpt_layers <= end_idx - start_idx
 
         for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
-            # global _GLOBAL_ORDER_
-            # if torch.distributed.get_rank() == 0:
-            #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} hidden_states require grad{hidden_states.requires_grad}")
-            # #     _GLOBAL_ORDER_ += 1
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
             if idx - start_idx < num_ckpt_layers:
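The surviving condition at the end of this hunk, if idx - start_idx < num_ckpt_layers, checkpoints only the first num_ckpt_layers of the layers owned by this pipeline stage. A minimal standalone sketch of that selective-checkpointing pattern (the layer list and call signature are illustrative, not the model's real interface):

from torch.utils.checkpoint import checkpoint

def forward_slice(layers, hidden_states, start_idx, end_idx, num_ckpt_layers):
    for idx, layer in enumerate(layers[start_idx:end_idx], start=start_idx):
        if idx - start_idx < num_ckpt_layers:
            # Recompute this layer's activations during backward to save memory.
            hidden_states = checkpoint(layer, hidden_states, use_reentrant=False)
        else:
            hidden_states = layer(hidden_states)
    return hidden_states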
@@ -221,8 +216,6 @@ class LlamaPipelineForwards:
                 use_cache=use_cache,
                 cache_position=cache_position,
             )
-            # if torch.distributed.get_rank() == 0:
-            #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} layer_outputs require grad {layer_outputs[0].requires_grad}")
             hidden_states = layer_outputs[0]
 
             if use_cache:
@@ -287,6 +287,11 @@ def main():
     # ==============================
     dp_size = getattr(plugin, "dp_size", coordinator.world_size)
 
+    if args.config in MODEL_CONFIGS:
+        config = MODEL_CONFIGS[args.config]
+    else:
+        config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)
+
     torch.cuda.manual_seed(42)
     dataset = RandomDataset(
         num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
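The added branch lets the benchmark accept either a named preset or an arbitrary Hugging Face checkpoint. A standalone sketch of the same resolution pattern, assuming an illustrative MODEL_CONFIGS table and helper name:

from transformers import AutoConfig, LlamaConfig

MODEL_CONFIGS = {
    "7b": LlamaConfig(),  # placeholder preset; the real table defines the model sizes
}

def resolve_config(name_or_path: str):
    if name_or_path in MODEL_CONFIGS:
        return MODEL_CONFIGS[name_or_path]
    # Otherwise treat the argument as a model id or local path and read its config.
    return AutoConfig.from_pretrained(name_or_path, trust_remote_code=True)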
@@ -923,10 +923,11 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
 @parameterize(
     "config",
     [
-        # (0, 4, 1, 1),
-        (1, 2, 2, 1),
+        # (1, 2, 2, 1), # Pass
+        # TODO: only supports pp + tp acceleration; will support pp-only and no-tp hybrid in the future
+        (0, 4, 1, 1),
         # (1, 2, 1, 2),
-        # (1, 1, 2, 2), # TODO: no pp show gather result err
+        # (1, 1, 2, 2),
     ],
 )
 def run_with_booster_hybridplugin(config: Tuple[int, ...]):
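In the test above, each tuple in the config list drives one run of the decorated test body. A hedged sketch of how a parameterize-style decorator expands such a list (an illustration, not ColossalAI's own implementation):

from functools import wraps

def parameterize(arg_name, values):
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            # Call the wrapped test once per configuration value.
            for value in values:
                fn(*args, **{**kwargs, arg_name: value})
        return wrapper
    return decorator

@parameterize("config", [(0, 4, 1, 1)])
def run_case(config):
    print("running with config", config)

run_case()  # prints: running with config (0, 4, 1, 1)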