mirror of https://github.com/hpcaitech/ColossalAI
Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into dev0116
commit
236b4195ff
@ -0,0 +1,35 @@
|
||||
# CI smoke test: sweeps a small grid of configurations and launches
# ./run_gemini.sh once per combination. Each setting is handed to the child
# process through an environment variable of the same name.

# Trace every command so the CI log shows which configuration is running.
set -x

# Run from the directory containing this script so that ./run_gemini.sh
# resolves no matter where the caller started. The previous form,
# $(cd `dirname $0`;pwd), changed directory only inside a subshell and then
# tried to execute the printed path as a command.
cd -- "$(dirname -- "$0")" || exit 1

# Exported so every child process launched below inherits it.
export TRAIN_STEP=4

for MODEL_TYPE in "gpt2_medium"; do
  for DISTPLAN in "colossalai"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1 2; do
          # Skip combinations where TPDEGREE exceeds GPUNUM.
          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
            continue
          fi
          for PLACEMENT in "cpu" "auto"; do
            # Abort on the first failing run; without this the script's exit
            # status would reflect only the very last run of the sweep.
            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
              bash ./run_gemini.sh || exit 1
          done
        done
      done
    done
  done

  for DISTPLAN in "zero1" "zero2"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1; do
          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
            continue
          fi
          # PLACEMENT is deliberately not passed for these plans, as in the
          # original. The space before the backslash matters: without it the
          # continuation can glue the TPDEGREE value to the next word.
          MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} \
            bash ./run_gemini.sh || exit 1
        done
      done
    done
  done
done
|
@ -1,15 +1,2 @@
|
||||
# Install the Python packages listed in requirements.txt.
pip install -r requirements.txt

# Launch the GPT demo through the colossalai launcher (4 processes per node,
# 4 steps) with the extra flags given as arguments. A failing run terminates
# the whole script with status 1.
run_demo() {
  colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 "$@" || exit 1
}

# test colossalai
for TP in 1 2; do
  for PLACEMENT in "cpu" "cuda" "auto" "const"; do
    for SHARD in "True" "False"; do
      run_demo --distplan colossalai --tp_degree "$TP" --placement "$PLACEMENT" --shardinit "$SHARD"
    done
  done
done

# test zero1&2
for DIST in "zero1" "zero2"; do
  run_demo --distplan "$DIST"
done
|
||||
# Echo each command before it runs so the CI log shows what was executed.
set -o xtrace

# Hand off to the CI script inside the gemini directory; it is launched only
# when entering that directory succeeds.
cd gemini \
  && bash test_ci.sh
|
||||
|
@ -0,0 +1,4 @@
|
||||
# CI smoke test: launch ./run_gemini.sh with BS=2 and MODEL=125m, once with
# GPUNUM=2 and once with GPUNUM=1. The settings are passed to the child
# process through its environment.
for GPUNUM in 2 1; do
  # Stop at the first failing run. Without this check the script's exit status
  # is that of the last iteration only, so a failure with GPUNUM=2 would be
  # hidden by a later successful GPUNUM=1 run.
  env BS=2 MODEL="125m" GPUNUM="${GPUNUM}" bash ./run_gemini.sh || exit 1
done
|
@ -0,0 +1,9 @@
|
||||
# CI smoke test: runs train.py under torchrun with --dummy_data=True for each
# batch-size / process-count combination, mirroring the combined output into
# run.log. Intended to be run with bash (uses `set -o pipefail`).

# Make the pipeline below report torchrun's failure; by default its status
# would be that of `tee`, which practically always succeeds.
set -o pipefail

# Run from the directory containing this script so that train.py resolves.
# The previous form, $(cd `dirname $0`;pwd), changed directory only inside a
# subshell and then tried to execute the printed path as a command.
cd -- "$(dirname -- "$0")" || exit 1

for BATCH_SIZE in 2; do
  for GPUNUM in 1 4; do
    # run.log is rewritten on every iteration, so it holds the latest run only.
    # Abort on the first failing run so CI sees the failure.
    env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node="${GPUNUM}" --master_port 29501 \
      train.py --dummy_data=True --batch_size="${BATCH_SIZE}" 2>&1 | tee run.log || exit 1
  done
done
|
Loading…
Reference in new issue