From d6e3d7d2a3364bc7d8d315ee0b5b6042aabf8a98 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Wed, 28 Aug 2024 02:41:05 +0000
Subject: [PATCH] [feat] fix CI; add assert

---
 .../test_schedule/test_zerobubble_pp.py      | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index e09805dee..65aa0db5a 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -479,15 +479,20 @@ def test_run_fwd_bwd_with_vschedule(
     rank: int,
     world_size: int,
     port: int,
+    num_microbatch: int,
+    batch_size: int,
+    num_model_chunk: int,
 ):
     # init dist
     colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     rank = dist.get_rank()
     pp_size = world_size
     pg_mesh = ProcessGroupMesh(pp_size)
-    num_microbatch = 4
+    num_microbatch = num_microbatch
     # stage_manager
-    stage_manager = PipelineStageManager(pg_mesh, pipeline_axis=0, enable_interleave=True, num_model_chunks=pp_size)
+    stage_manager = PipelineStageManager(
+        pg_mesh, pipeline_axis=0, enable_interleave=True, num_model_chunks=num_model_chunk
+    )
 
     h, a, s = 4096, 32, 1024
     mem_f = 34 * h + 5 * a * s
@@ -511,7 +516,7 @@ def test_run_fwd_bwd_with_vschedule(
     scheduler = ZeroBubbleVPipeScheduler(
         schedule=zbv_schedule[rank],  # hint: send whole schedule or local schedule only ?
         stage_manager=stage_manager,
-        num_model_chunks=pp_size,
+        num_model_chunks=num_model_chunk,
         num_microbatch=num_microbatch,
         overlap_p2p=False,
     )
@@ -520,8 +525,9 @@ def test_run_fwd_bwd_with_vschedule(
         return (x * x).mean()
 
     # init model and input
-    batch_size = 4
+    batch_size = batch_size
     num_layers = 8
+    assert num_layers % num_model_chunk == 0, f"Model with {num_layers} layers cannot be evenly split into {num_model_chunk} chunks"
     in_dim = out_dim = 8
     print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB on device {stage_manager.get_rank()};")
     model = MlpModel(in_dim=in_dim, out_dim=out_dim, num_layers=num_layers).to(rank)
@@ -611,16 +617,19 @@
 
 
 @pytest.mark.dist
-# @pytest.mark.parametrize("num_microbatch", [4])
-# @pytest.mark.parametrize("batch_size", [4])
-# @pytest.mark.parametrize("num_model_chunk", [2])
+@pytest.mark.parametrize("num_microbatch", [4])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("num_model_chunk", [4])
 @rerun_if_address_is_in_use()
-def test_pp():
+def test_pp(num_microbatch: int, batch_size: int, num_model_chunk: int):
     spawn(
         test_run_fwd_bwd_with_vschedule,
         nprocs=4,
+        num_microbatch=num_microbatch,
+        batch_size=batch_size,
+        num_model_chunk=num_model_chunk,
     )
 
 
 if __name__ == "__main__":
-    test_pp()
+    test_pp(num_microbatch=4, batch_size=4, num_model_chunk=4)
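
--
The kwargs added to spawn() above only take effect because the test harness
forwards them into each spawned rank, and the new assert fires only when
num_layers does not divide evenly across model chunks. Below is a minimal,
self-contained sketch of that plumbing and guard, using torch.multiprocessing
directly rather than colossalai.testing.spawn; the function and variable names
are illustrative, not the library's API:

    import torch.multiprocessing as mp

    def worker(rank: int, world_size: int, num_microbatch: int,
               batch_size: int, num_model_chunk: int):
        # The guard this patch adds: the layer count must split evenly
        # across model chunks, or the per-chunk stages would be uneven.
        num_layers = 8
        assert num_layers % num_model_chunk == 0, (
            f"Model with {num_layers} layers cannot be evenly split into "
            f"{num_model_chunk} chunks"
        )
        print(
            f"rank {rank}/{world_size}: {num_layers // num_model_chunk} "
            f"layer(s) per chunk, {num_microbatch} microbatches, "
            f"batch size {batch_size}"
        )

    def run(nprocs: int = 4, **kwargs):
        # torch's spawn passes positional args only (fn is called as
        # fn(rank, *args)), so order the forwarded kwargs explicitly.
        extras = tuple(kwargs[k] for k in ("num_microbatch", "batch_size", "num_model_chunk"))
        mp.spawn(worker, args=(nprocs,) + extras, nprocs=nprocs)

    if __name__ == "__main__":
        run(num_microbatch=4, batch_size=4, num_model_chunk=4)

Run with the same values as the parametrized test (4 ranks, 4 chunks) and each
rank reports 2 layers per chunk; change num_model_chunk to 3 and the assert
trips before any model is built.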