From 49c601da21bc7ad6bafbb85af9e0330993781802 Mon Sep 17 00:00:00 2001 From: Jiarui Fang Date: Thu, 29 Dec 2022 12:00:00 +0800 Subject: [PATCH] [example] add benchmark.sh for gpt (#2226) --- examples/language/gpt/benchmark.sh | 22 ++++++++++++++++++++++ examples/language/gpt/model_zoo.py | 4 +++- examples/language/gpt/run.sh | 14 +++++++------- examples/language/gpt/train_gpt_demo.py | 6 ++++-- 4 files changed, 36 insertions(+), 10 deletions(-) create mode 100644 examples/language/gpt/benchmark.sh diff --git a/examples/language/gpt/benchmark.sh b/examples/language/gpt/benchmark.sh new file mode 100644 index 000000000..ad519bf2b --- /dev/null +++ b/examples/language/gpt/benchmark.sh @@ -0,0 +1,22 @@ +for MODEL_NAME in "GPT2small" +do +for BATCH_SIZE in 8 +do +for GPUNUM in 1 2 4 8 +do +for TPDEGREE in 1 2 4 8 +do +if [ ${TPDEGREE} -gt ${GPUNUM} ] +then + continue +fi +echo "****************** Begin ***************************" +echo "* benchmarking MODEL_NAME ${MODEL_NAME} BS ${BATCH_SIZE} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE}" +bash ./run.sh +echo "****************** Finished ***************************" +echo "" +echo "" +done +done +done +done diff --git a/examples/language/gpt/model_zoo.py b/examples/language/gpt/model_zoo.py index e41f1272c..1fff3eb28 100644 --- a/examples/language/gpt/model_zoo.py +++ b/examples/language/gpt/model_zoo.py @@ -53,7 +53,7 @@ def gpt2_24b(checkpoint=True): return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16, checkpoint=checkpoint) -def model_builder(model_size: str): +def model_builder(model_size: str) -> callable: if model_size == "gpt2_medium": return gpt2_medium elif model_size == "gpt2_xl": @@ -66,6 +66,8 @@ return gpt2_20b elif model_size == "gpt2_24b": return gpt2_24b + else: + raise TypeError(f"model_builder {model_size}") __all__ = ['model_builder'] diff --git a/examples/language/gpt/run.sh b/examples/language/gpt/run.sh index 6e17b0dfc..b0a1e35b6 100644 --- 
a/examples/language/gpt/run.sh +++ b/examples/language/gpt/run.sh @@ -1,13 +1,13 @@ # distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"] -export DISTPAN="colossalai" +export DISTPAN=${DISTPAN:-"colossalai"} # The following options only valid when DISTPAN="colossalai" -export TPDEGREE=1 -export GPUNUM=1 -export PLACEMENT='const' -export USE_SHARD_INIT=False -export BATCH_SIZE=32 -# export MODEL_TYPE="gpt2_10b" +export TPDEGREE=${TPDEGREE:-1} +export GPUNUM=${GPUNUM:-1} +export PLACEMENT=${PLACEMENT:-'const'} +export USE_SHARD_INIT=${USE_SHARD_INIT:-False} +export BATCH_SIZE=${BATCH_SIZE:-8} +export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"} mkdir -p logs torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log diff --git a/examples/language/gpt/train_gpt_demo.py b/examples/language/gpt/train_gpt_demo.py index a1c49cdcb..1437bffc4 100644 --- a/examples/language/gpt/train_gpt_demo.py +++ b/examples/language/gpt/train_gpt_demo.py @@ -5,7 +5,6 @@ from time import time import psutil import torch import torch.nn as nn -from model_zoo import model_builder from packaging import version from torch.nn.parallel import DistributedDataParallel as DDP @@ -17,6 +16,7 @@ from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, Proces from colossalai.utils import get_current_device from colossalai.utils.model.colo_init_context import ColoInitContext from colossalai.zero.sharded_optim import LowLevelZeroOptimizer +from model_zoo import model_builder def parse_args(): @@ -55,7 +55,7 @@ def parse_args(): parser.add_argument( "--model_type", type=str, - default='gpt2_medium', + default="gpt2_medium", help="model model scale", ) args = parser.parse_args() @@ -309,6 +309,8 @@ def main(): if n >= 
WARMUP_STEPS: tflops_list.append(step_tflops) + logger.info(f"max memory {torch.cuda.memory_allocated() / 1024**2} MB", ranks=[0]) + tflops_list.sort() median_index = ((NUM_STEPS - WARMUP_STEPS) >> 1) + WARMUP_STEPS logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")