diff --git a/examples/language/gpt/gemini/benchmark_gemini.sh b/examples/language/gpt/gemini/benchmark_gemini.sh
index 13086666e..464ea03da 100644
--- a/examples/language/gpt/gemini/benchmark_gemini.sh
+++ b/examples/language/gpt/gemini/benchmark_gemini.sh
@@ -1,18 +1,20 @@
 for MODEL_TYPE in "gpt2_medium"; do
-  for BATCH_SIZE in 16; do
-    for GPUNUM in 1 2 4 8; do
-      for TPDEGREE in 1 2 4 8; do
-        if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
-          continue
-        fi
-        for PLACEMENT in "cpu" "auto"; do
-          echo "****************** Begin ***************************"
-          echo "* benchmrking MODEL_TYPE ${MODEL_TYPE} BS ${BATCH_SIZE} BS ${BS} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE} PLACEMENT ${PLACEMENT}"
-          MODEL_TYPE=${MODEL_TYPE} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
-          bash ./gemini/run_gemini.sh
-          echo "****************** Finished ***************************"
-          echo ""
-          echo ""
+  for DISPAN in "colossalai"; do
+    for BATCH_SIZE in 16; do
+      for GPUNUM in 1 2 4 8; do
+        for TPDEGREE in 1 2 4 8; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          for PLACEMENT in "cpu" "auto"; do
+            echo "****************** Begin ***************************"
+            echo "+ benchmarking MODEL ${MODEL_TYPE} DISPAN ${DISPAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
+            MODEL_TYPE=${MODEL_TYPE} DISPAN=${DISPAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+            bash ./run_gemini.sh
+            echo "****************** Finished ***************************"
+            echo ""
+            echo ""
+          done
         done
       done
     done
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index 29f8c8ef1..891b1de15 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -270,6 +270,7 @@ def main():
         tp_pg = ProcessGroup(tp_degree=args.tp_degree)
 
         # Tensor Parallelism (TP)
+        # Note that v0.1.10 is not compatible with TP degree > 1
         tensor_parallelize(model, tp_pg)
 
         # build a Gemini model and a highly optimized cpu optimizer
@@ -278,6 +279,7 @@ def main():
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
 
     else:
+        assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples."
         model = model_builder(args.model_type)(checkpoint=True).cuda()
 
     if args.distplan.startswith("torch"):
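
Note: each iteration of the benchmark sweep above simply exports the loop variables into the environment and invokes run_gemini.sh. As a minimal sketch of a single hand-picked configuration (the concrete values below are illustrative, and run_gemini.sh is assumed to read these variables from the environment exactly as the loop passes them):

    # one illustrative configuration from the sweep above
    # (TPDEGREE must not exceed GPUNUM, matching the guard in the loop)
    MODEL_TYPE=gpt2_medium DISPAN=colossalai BATCH_SIZE=16 GPUNUM=4 TPDEGREE=2 PLACEMENT=auto \
        bash ./run_gemini.sh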