mirror of https://github.com/hpcaitech/ColossalAI
Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into dev0116
commit
236b4195ff
@ -0,0 +1,35 @@
|
|||||||
|
# CI sweep: runs ./run_gemini.sh over combinations of distributed plan,
# batch size, GPU count, tensor-parallel degree and (for the "colossalai"
# plan) placement policy. Every setting is handed to run_gemini.sh through
# its environment.
set -x

# Work from the directory that holds this script so that ./run_gemini.sh
# resolves no matter where the caller started. (The previous form,
# $(cd `dirname $0`;pwd), tried to execute the printed path as a command and
# left the working directory unchanged.)
cd "$(dirname "$0")" || exit 1

export TRAIN_STEP=4

for MODEL_TYPE in "gpt2_medium"; do
  # "colossalai" plan: sweep GPU count, TP degree and placement.
  for DISTPLAN in "colossalai"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1 2; do
          # A TP degree larger than the number of GPUs is skipped.
          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
            continue
          fi
          for PLACEMENT in "cpu" "auto"; do
            # Stop at the first failing configuration instead of silently
            # moving on to the next one.
            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
              bash ./run_gemini.sh || exit 1
          done
        done
      done
    done
  done

  # "zero1"/"zero2" plans: TP degree fixed at 1, PLACEMENT is not passed.
  for DISTPLAN in "zero1" "zero2"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1; do
          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
            continue
          fi
          # Keep a space before the line continuation so the next line can
          # never be glued onto the TPDEGREE value.
          MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} \
            bash ./run_gemini.sh || exit 1
        done
      done
    done
  done
done
|
@ -1,15 +1,2 @@
|
|||||||
# NOTE(review): this span is a rendered diff hunk, not one coherent script.
# The hunk header (-1,15 +1,2) suggests that only `set -x` and
# `cd gemini && bash test_ci.sh` belong to the new file and that every other
# command below is the removed version - confirm against the repository.
pip install -r requirements.txt
|
# trace each command as it is executed
set -x
|
||||||
|
# run test_ci.sh from inside gemini/; it is skipped if the cd fails
cd gemini && bash test_ci.sh
|
||||||
# test colossalai
|
|
||||||
# every combination of TP degree, placement and shardinit on 4 processes
for TP in 1 2; do
|
|
||||||
for PLACEMENT in "cpu" "cuda" "auto" "const"; do
|
|
||||||
for SHARD in "True" "False"; do
|
|
||||||
# `|| exit 1` aborts the whole script on the first failing run
colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
|
|
||||||
done
|
|
||||||
done
|
|
||||||
done
|
|
||||||
|
|
||||||
# test zero1&2
|
|
||||||
# one 4-process run per plan; no --tp_degree/--placement/--shardinit passed
for DIST in "zero1" "zero2"; do
|
|
||||||
colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
|
|
||||||
done
|
|
||||||
|
@ -0,0 +1,4 @@
|
|||||||
|
# Launch ./run_gemini.sh once per GPU count (2 first, then 1). BS, MODEL and
# GPUNUM are handed to the child process through its environment.
for GPUNUM in 2 1; do
  BS=2 MODEL="125m" GPUNUM="${GPUNUM}" bash ./run_gemini.sh
done
|
@ -0,0 +1,9 @@
|
|||||||
|
# Runs train.py with dummy data under torchrun for each batch size / GPU
# count combination and mirrors the combined stdout+stderr into run.log.

# Work from the directory that holds this script so that train.py and
# run.log resolve no matter where the caller started. (The previous form,
# $(cd `dirname $0`;pwd), tried to execute the printed path as a command and
# left the working directory unchanged.)
cd "$(dirname "$0")" || exit 1

for BATCH_SIZE in 2; do
  for GPUNUM in 1 4; do
    # tee truncates run.log on every iteration, so only the last run's
    # output is kept; the pipeline's exit status is tee's, not torchrun's.
    env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node="${GPUNUM}" --master_port 29501 \
      train.py --dummy_data=True --batch_size="${BATCH_SIZE}" 2>&1 | tee run.log
  done
done
|
Loading…
Reference in new issue