From d84e7479750f820040ca53ca8bbf4589ae6f645c Mon Sep 17 00:00:00 2001
From: HELSON
Date: Tue, 10 Jan 2023 11:39:25 +0800
Subject: [PATCH] [hotfix] add DISTPAN argument for benchmark (#2412)

* change the benchmark config file

* change config

* revert config file

* rename distpan to distplan
---
 examples/language/gpt/gemini/benchmark_gemini.sh | 6 +++---
 examples/language/gpt/gemini/run_gemini.sh       | 8 ++++----
 examples/language/gpt/gemini/train_gpt_demo.py   | 2 ++
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/examples/language/gpt/gemini/benchmark_gemini.sh b/examples/language/gpt/gemini/benchmark_gemini.sh
index 464ea03da..9a630b2ff 100644
--- a/examples/language/gpt/gemini/benchmark_gemini.sh
+++ b/examples/language/gpt/gemini/benchmark_gemini.sh
@@ -1,5 +1,5 @@
 for MODEL_TYPE in "gpt2_medium"; do
-  for DISPAN in "colossalai"; do
+  for DISTPLAN in "colossalai"; do
     for BATCH_SIZE in 16; do
       for GPUNUM in 1 2 4 8; do
         for TPDEGREE in 1 2 4 8; do
@@ -8,8 +8,8 @@ for MODEL_TYPE in "gpt2_medium"; do
           fi
           for PLACEMENT in "cpu" "auto"; do
             echo "****************** Begin ***************************"
-            echo "+ benchmrking MODEL ${MODEL_TYPE} DISPAN ${DISPAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
-            MODEL_TYPE=${MODEL_TYPE} DISPAN=${DISPAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+            echo "+ benchmrking MODEL ${MODEL_TYPE} DISTPLAN ${DISTPLAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
+            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
             bash ./run_gemini.sh
             echo "****************** Finished ***************************"
             echo ""
diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh
index ad577c350..0c2ea660f 100644
--- a/examples/language/gpt/gemini/run_gemini.sh
+++ b/examples/language/gpt/gemini/run_gemini.sh
@@ -1,8 +1,8 @@
 set -x
 # distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
-export DISTPAN=${DISTPAN:-"colossalai"}
+export DISTPLAN=${DISTPLAN:-"colossalai"}
 
-# The following options only valid when DISTPAN="colossalai"
+# The following options only valid when DISTPLAN="colossalai"
 export GPUNUM=${GPUNUM:-1}
 export TPDEGREE=${TPDEGREE:-1}
 export PLACEMENT=${PLACEMENT:-"cpu"}
@@ -20,5 +20,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
 --batch_size=${BATCH_SIZE} \
 --placement=${PLACEMENT} \
 --shardinit=${USE_SHARD_INIT} \
---distplan=${DISTPAN} \
-2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
+--distplan=${DISTPLAN} \
+2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 891b1de15..92cb7393c 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -290,9 +290,11 @@ def main():
         from torch.distributed.optim import ZeroRedundancyOptimizer
         optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
     elif args.distplan.startswith("zero"):
+        model = model.half()
         partition_flag = args.distplan == "zero2"
         optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
         optimizer = LowLevelZeroOptimizer(optimizer,
+                                          reduce_bucket_size=12 * 1024 * 1024,
                                           overlap_communication=True,
                                           partition_grad=partition_flag,
                                           verbose=True)
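
Usage note: a minimal sketch of launching a single benchmark run with the renamed DISTPLAN variable, assuming the command is issued from examples/language/gpt/gemini/ and that the values shown (taken from the loop in benchmark_gemini.sh above) suit your hardware; anything left unset falls back to the defaults exported in run_gemini.sh.

    # Precreate the directory that run_gemini.sh tees its log into,
    # in case the script does not create it itself.
    mkdir -p gemini_logs
    # DISTPLAN replaces the old DISTPAN name; GPUNUM, TPDEGREE, etc. default
    # to the values exported in run_gemini.sh when not set here.
    MODEL_TYPE=gpt2_medium DISTPLAN=colossalai GPUNUM=4 TPDEGREE=2 \
    BATCH_SIZE=16 PLACEMENT=auto \
    bash ./run_gemini.sh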