From 29868a9ec18c103c945d574f90ebc566e7c9284e Mon Sep 17 00:00:00 2001
From: Jiarui Fang
Date: Tue, 27 Dec 2022 17:39:53 +0800
Subject: [PATCH] [example] update gpt readme with performance (#2206)

---
 examples/language/gpt/README.md         | 33 +++++++++++++++++++++++++
 examples/language/gpt/run.sh            |  4 +--
 examples/language/gpt/train_gpt_demo.py | 20 +++++++++------
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index 1f0454273..b540960c5 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -53,3 +53,36 @@ The `train_gpt_demo.py` provides three distributed plans, you can choose the pla
 - ZeRO2 (Colossal-AI)
 - Pytorch DDP
 - Pytorch ZeRO
+
+
+## Performance
+
+Testbed: a single node with 8x A100 (80 GB) GPUs and 1x AMD EPYC 7543 32-core CPU (512 GB RAM); the GPUs are connected via PCIe.
+ColossalAI version 0.1.13.
+
+How does the batch size affect efficiency?
+
+| model    | #GPU | placement policy | TP degree | batch size | TFLOPS  |
+| -------- | ---- | ---------------- | --------- | ---------- | ------- |
+| gpt2_10b | 2    | cpu              | 1         | 32         | 122.046 |
+| gpt2_10b | 2    | cpu              | 1         | 16         | 82.649  |
+| gpt2_10b | 2    | cpu              | 1         | 8          | 61.354  |
+
+
+How does the placement policy affect efficiency?
+
+| model    | #GPU | placement policy | TP degree | batch size | TFLOPS |
+| -------- | ---- | ---------------- | --------- | ---------- | ------ |
+| gpt2_10b | 4    | auto             | 1         | 8          | 88.657 |
+| gpt2_10b | 4    | cuda             | 1         | 8          | OOM    |
+| gpt2_10b | 4    | cpu              | 1         | 8          | 61.354 |
+| gpt2_10b | 4    | const            | 1         | 8          | 82.137 |
+
+How does the tensor parallel degree affect efficiency?
+
+| model    | #GPU | placement policy | TP degree | batch size | TFLOPS |
+| -------- | ---- | ---------------- | --------- | ---------- | ------ |
+| gpt2_10b | 4    | auto             | 1         | 8          | 88.657 |
+| gpt2_10b | 4    | auto             | 2         | 8          | 56.687 |
+| gpt2_10b | 4    | auto             | 4         | 8          | 29.019 |
+| gpt2_10b | 4    | auto             | 4         | 64         | 50.411 |
diff --git a/examples/language/gpt/run.sh b/examples/language/gpt/run.sh
index 5d3d2c559..15ca25c49 100644
--- a/examples/language/gpt/run.sh
+++ b/examples/language/gpt/run.sh
@@ -2,9 +2,9 @@ export DISTPAN="colossalai"
 
 # The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
+export TPDEGREE=4
 export GPUNUM=4
-export PLACEMENT='cpu'
+export PLACEMENT='auto'
 export USE_SHARD_INIT=False
 
 env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
 
diff --git a/examples/language/gpt/train_gpt_demo.py b/examples/language/gpt/train_gpt_demo.py
index 3b22f05a6..1c36fd222 100644
--- a/examples/language/gpt/train_gpt_demo.py
+++ b/examples/language/gpt/train_gpt_demo.py
@@ -179,13 +179,17 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
     cai_version = colossalai.__version__
+    from colossalai.gemini import ChunkManager, GeminiManager
     if version.parse(cai_version) > version.parse("0.1.10"):
         from colossalai.nn.parallel import GeminiDDP
         model = GeminiDDP(model,
                           device=get_current_device(),
                           placement_policy=placememt_policy,
                           pin_memory=True,
-                          search_range_mb=32)
+                          hidden_dim=4096,
+                          search_range_mb=64)
+        if placememt_policy == 'const':
+            model.gemini_manager._placement_policy.set_const_memory_boundary(10 * 1024)
     elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
         from colossalai.gemini import ChunkManager, GeminiManager
         chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32)
@@ -206,9 +210,10 @@ def main():
     if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
         raise TypeError(f"{args.distplan} is error")
 
-    BATCH_SIZE = 8
+    BATCH_SIZE = 64
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
+    NUM_STEPS = 10
 
     disable_existing_loggers()
 
@@ -227,22 +232,21 @@ def main():
         default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None
 
         # build GPT model
-        with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
-            model = gpt2_medium(checkpoint=True)
+        with ColoInitContext(device=get_current_device(), default_dist_spec=default_dist_spec, default_pg=default_pg):
+            model = gpt2_10b(checkpoint=True)
 
         pg = default_pg
         # Tensor Parallelism (TP)
         tensor_parallelize(model, pg)
 
+        # Gemini + ZeRO DP, Note it must be used after TP
         model = gemini_zero_dpp(model, pg, args.placement)
 
-        # build optimizer
+        # build highly optimized cpu optimizer
         optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
-        # optimizer = HybridAdam(model.parameters(), lr=1e-3)
-        # optimizer = ZeroOptimizer(optimizer, model, initial_scale=2**5)
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
-        model = gpt2_medium(checkpoint=True).cuda()
+        model = gpt2_10b(checkpoint=True).cuda()
 
     if args.distplan.startswith("torch"):
         model = DDP(model)
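
For readers skimming the diff, the Gemini + ZeRO DP path that this commit tunes boils down to the sketch below. It is a minimal illustration written against ColossalAI ~0.1.13, not the demo script itself: the function names are invented for the example, and the GeminiAdamOptimizer import path is an assumption that may differ between versions.

import torch

from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer  # assumed path; may vary by version
from colossalai.nn.parallel import GeminiDDP
from colossalai.utils import get_current_device


def wrap_with_gemini(model: torch.nn.Module, placement_policy: str = "auto") -> GeminiDDP:
    # Let Gemini manage parameter chunks and ZeRO-style data parallelism, using the
    # knobs this patch sets: a hidden_dim hint for the chunk-size search and a wider
    # search range (in MB) suited to a 10B-parameter GPT.
    model = GeminiDDP(model,
                      device=get_current_device(),
                      placement_policy=placement_policy,
                      pin_memory=True,
                      hidden_dim=4096,
                      search_range_mb=64)
    if placement_policy == "const":
        # As in the patch: give the 'const' placement policy a fixed CUDA memory
        # boundary of 10 * 1024 (10 GB if the unit is MB).
        model.gemini_manager._placement_policy.set_const_memory_boundary(10 * 1024)
    return model


def build_optimizer(model: GeminiDDP) -> GeminiAdamOptimizer:
    # Hybrid CPU/GPU Adam managed together with Gemini, matching the demo's hyperparameters.
    return GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)

As the demo's comment notes, the model must be wrapped this way only after tensor parallelism has been applied; run.sh then selects the placement policy and TP degree that the README tables compare.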