#!/bin/bash
# Launch script for the ColossalAI GPT training demo (train_gpt_demo.py).

# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
export DISTPLAN="colossalai"
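# (Expected mapping, worth confirming against train_gpt_demo.py: "zero1"/"zero2"
# shard optimizer states ZeRO-style, "torch_ddp" uses plain PyTorch DDP, and
# "torch_zero" pairs DDP with torch's ZeroRedundancyOptimizer.)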

# The following options are only valid when DISTPLAN="colossalai"
# Tensor-parallel degree (number of GPUs each model replica is split across).
export TPDEGREE=4
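# With GPUNUM=8 below and TPDEGREE=4, each model replica spans 4 GPUs, leaving
# 8 / 4 = 2 data-parallel replicas (assuming the usual TP x DP decomposition);
# TPDEGREE should therefore divide GPUNUM evenly.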
# Number of GPUs on this node (passed to torchrun --nproc_per_node).
export GPUNUM=8
# Placement policy for model data (parameters, gradients, optimizer states).
export PLACEMENT='cpu'
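# 'cpu' keeps tensors in host memory and moves them to the GPU on demand,
# trading step time for capacity. ColossalAI's Gemini also documents 'cuda'
# and 'auto' policies; check train_gpt_demo.py for the exact values accepted.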
# Whether to shard model parameters across processes during initialization.
export USE_SHARD_INIT=False
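# Sharded init builds each parameter already partitioned across ranks instead
# of materializing a full copy per process first, which mainly matters for the
# multi-billion-parameter presets (an inference from the flag name; the
# --shardinit handling in train_gpt_demo.py is authoritative).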
# Batch size forwarded to --batch_size below.
export BATCH_SIZE=32
# MODEL_TYPE must be set: the launch command below passes it to --model_type
# and uses it in the log file name. "gpt2_medium" is an assumed default here;
# pick any preset train_gpt_demo.py supports, e.g. the 24B variant below.
export MODEL_TYPE="gpt2_medium"
# export MODEL_TYPE="gpt2_24b"

# Each run tees its output into a configuration-named log file under ./logs.
mkdir -p logs

env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} \
  train_gpt_demo.py \
  --tp_degree=${TPDEGREE} \
  --model_type=${MODEL_TYPE} \
  --batch_size=${BATCH_SIZE} \
  --placement ${PLACEMENT} \
  --shardinit ${USE_SHARD_INIT} \
  --distplan ${DISTPLAN} \
  2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
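# Example variation: for a quick single-GPU sanity check with plain PyTorch
# DDP, set DISTPLAN="torch_ddp", GPUNUM=1, and TPDEGREE=1 above; the
# colossalai-only options are then ignored.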