Mirror of https://github.com/hpcaitech/ColossalAI
Topics: ai, big-model, data-parallelism, deep-learning, distributed-computing, foundation-models, heterogeneous-training, hpc, inference, large-scale, model-parallelism, pipeline-parallelism
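The launch script below runs the GPT-2 Gemini training demo (train_gpt_demo.py) with a configurable distribution plan and writes logs under gemini_logs/: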
set -x

# distplan in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]
export DISTPLAN=${DISTPLAN:-"CAI_Gemini"}

# The following options are only valid for the ColossalAI ("CAI_*") distplans.
export GPUNUM=${GPUNUM:-1}
export BATCH_SIZE=${BATCH_SIZE:-16}
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
export TRAIN_STEP=${TRAIN_STEP:-10}
# export PYTHONPATH=$PWD:$PYTHONPATH

mkdir -p gemini_logs
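# Launch the demo with torchrun on ${GPUNUM} local processes;
# stdout/stderr are also teed to a per-configuration log file.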
torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
--model_type=${MODEL_TYPE} \
--batch_size=${BATCH_SIZE} \
--distplan=${DISTPLAN} \
--train_step=${TRAIN_STEP} \
2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}.log
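Every setting is read from an environment variable with a default, so a run can be customized without editing the script. A minimal usage sketch, assuming the script above is saved as run_gemini.sh (an assumed filename):

# Train gpt2_medium with ZeRO stage 2 on 4 GPUs and a larger batch size.
# "run_gemini.sh" is an assumed name for the script above.
DISTPLAN="CAI_ZeRO2" GPUNUM=4 BATCH_SIZE=32 bash run_gemini.sh

# Baseline comparison with plain PyTorch DDP on the same model.
DISTPLAN="Pytorch_DDP" GPUNUM=4 bash run_gemini.sh

Each run's log lands in gemini_logs/, named by model type, distplan, GPU count, and batch size.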