From 29868a9ec18c103c945d574f90ebc566e7c9284e Mon Sep 17 00:00:00 2001
From: Jiarui Fang
Date: Tue, 27 Dec 2022 17:39:53 +0800
Subject: [PATCH] [example] update gpt readme with performance (#2206)

---
 examples/language/gpt/README.md         | 33 +++++++++++++++++++++++++
 examples/language/gpt/run.sh            |  4 +--
 examples/language/gpt/train_gpt_demo.py | 20 +++++++++------
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index 1f0454273..b540960c5 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -53,3 +53,36 @@ The `train_gpt_demo.py` provides three distributed plans, you can choose the pla
 - ZeRO2 (Colossal-AI)
 - Pytorch DDP
 - Pytorch ZeRO
+
+
+## Performance
+
+Testbed: a single node with 8x A100 (80 GB) GPUs and 1x AMD EPYC 7543 32-core CPU (512 GB RAM); the GPUs are connected via PCIe.
+ColossalAI version 0.1.13.
+
+How does the batch size affect efficiency?
+
+| model    | #GPU | placement policy | TP degree | batch size | TFLOPS  |
+| -------- | ---- | ---------------- | --------- | ---------- | ------- |
+| gpt2_10b | 2    | cpu              | 1         | 32         | 122.046 |
+| gpt2_10b | 2    | cpu              | 1         | 16         | 82.649  |
+| gpt2_10b | 2    | cpu              | 1         | 8          | 61.354  |
+
+
+How does the placement policy affect efficiency?
+
+| model    | #GPU | placement policy | TP degree | batch size | TFLOPS |
+| -------- | ---- | ---------------- | --------- | ---------- | ------ |
+| gpt2_10b | 4    | auto             | 1         | 8          | 88.657 |
+| gpt2_10b | 4    | cuda             | 1         | 8          | OOM    |
+| gpt2_10b | 4    | cpu              | 1         | 8          | 61.354 |
+| gpt2_10b | 4    | const            | 1         | 8          | 82.137 |
+
+How does the tensor parallel degree affect efficiency?
+
+| model    | #GPU | placement policy | TP degree | batch size | TFLOPS |
+| -------- | ---- | ---------------- | --------- | ---------- | ------ |
+| gpt2_10b | 4    | auto             | 1         | 8          | 88.657 |
+| gpt2_10b | 4    | auto             | 2         | 8          | 56.687 |
+| gpt2_10b | 4    | auto             | 4         | 8          | 29.019 |
+| gpt2_10b | 4    | auto             | 4         | 64         | 50.411 |
diff --git a/examples/language/gpt/run.sh b/examples/language/gpt/run.sh
index 5d3d2c559..15ca25c49 100644
--- a/examples/language/gpt/run.sh
+++ b/examples/language/gpt/run.sh
@@ -2,9 +2,9 @@ export DISTPAN="colossalai"
 
 # The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
+export TPDEGREE=4
 export GPUNUM=4
-export PLACEMENT='cpu'
+export PLACEMENT='auto'
 export USE_SHARD_INIT=False
 
 env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
 
diff --git a/examples/language/gpt/train_gpt_demo.py b/examples/language/gpt/train_gpt_demo.py
index 3b22f05a6..1c36fd222 100644
--- a/examples/language/gpt/train_gpt_demo.py
+++ b/examples/language/gpt/train_gpt_demo.py
@@ -179,13 +179,17 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
     cai_version = colossalai.__version__
+    from colossalai.gemini import ChunkManager, GeminiManager
     if version.parse(cai_version) > version.parse("0.1.10"):
         from colossalai.nn.parallel import GeminiDDP
         model = GeminiDDP(model,
                           device=get_current_device(),
                           placement_policy=placememt_policy,
                           pin_memory=True,
-                          search_range_mb=32)
+                          hidden_dim=4096,
+                          search_range_mb=64)
+        if placememt_policy == 'const':
+            model.gemini_manager._placement_policy.set_const_memory_boundary(10 * 1024)
     elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
         from colossalai.gemini import ChunkManager, GeminiManager
         chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32)
@@ -206,9 +210,10 @@ def main():
     if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
         raise TypeError(f"{args.distplan} is error")
 
-    BATCH_SIZE = 8
+    BATCH_SIZE = 64
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
+    NUM_STEPS = 10
 
     disable_existing_loggers()
 
@@ -227,22 +232,21 @@ def main():
         default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None
 
         # build GPT model
-        with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
-            model = gpt2_medium(checkpoint=True)
+        with ColoInitContext(device=get_current_device(), default_dist_spec=default_dist_spec, default_pg=default_pg):
+            model = gpt2_10b(checkpoint=True)
 
         pg = default_pg
         # Tensor Parallelism (TP)
         tensor_parallelize(model, pg)
 
+        # Gemini + ZeRO DP, Note it must be used after TP
         model = gemini_zero_dpp(model, pg, args.placement)
 
-        # build optimizer
+        # build highly optimized cpu optimizer
         optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
-        # optimizer = HybridAdam(model.parameters(), lr=1e-3)
-        # optimizer = ZeroOptimizer(optimizer, model, initial_scale=2**5)
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
-        model = gpt2_medium(checkpoint=True).cuda()
+        model = gpt2_10b(checkpoint=True).cuda()
 
     if args.distplan.startswith("torch"):
         model = DDP(model)
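
For readers skimming the diff, the Gemini + ZeRO DP path that this commit tunes boils down to the sketch below. It is a minimal illustration written against ColossalAI ~0.1.13, not the demo script itself: the function names are invented for the example, and the GeminiAdamOptimizer import path is an assumption that may differ between versions.

import torch

from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer  # assumed path; may vary by version
from colossalai.nn.parallel import GeminiDDP
from colossalai.utils import get_current_device


def wrap_with_gemini(model: torch.nn.Module, placement_policy: str = "auto") -> GeminiDDP:
    # Let Gemini manage parameter chunks and ZeRO-style data parallelism, using the
    # knobs this patch sets: a hidden_dim hint for the chunk-size search and a wider
    # search range (in MB) suited to a 10B-parameter GPT.
    model = GeminiDDP(model,
                      device=get_current_device(),
                      placement_policy=placement_policy,
                      pin_memory=True,
                      hidden_dim=4096,
                      search_range_mb=64)
    if placement_policy == "const":
        # As in the patch: give the 'const' placement policy a fixed CUDA memory
        # boundary of 10 * 1024 (10 GB if the unit is MB).
        model.gemini_manager._placement_policy.set_const_memory_boundary(10 * 1024)
    return model


def build_optimizer(model: GeminiDDP) -> GeminiAdamOptimizer:
    # Hybrid CPU/GPU Adam managed together with Gemini, matching the demo's hyperparameters.
    return GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)

As the demo's comment notes, the model must be wrapped this way only after tensor parallelism has been applied; run.sh then selects the placement policy and TP degree that the README tables compare.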