mirror of https://github.com/hpcaitech/ColossalAI
jiaruifang
2 years ago
8 changed files with 86 additions and 24 deletions
@ -0,0 +1,35 @@
|
||||
# CI smoke test: invokes ./run_gemini.sh once per combination of
# MODEL_TYPE / DISTPLAN / BATCH_SIZE / GPUNUM / TPDEGREE (and PLACEMENT for
# the "colossalai" plan). Stops at the first failing combination.
set -x

# Run from the directory containing this script so that ./run_gemini.sh
# resolves regardless of the caller's working directory. (The previous form,
# $(cd `dirname $0`;pwd), tried to *execute* the printed path and never
# changed directory.)
cd "$(dirname "$0")" || exit 1

# Exported so it is inherited by every child process started below
# (presumably consumed by run_gemini.sh -- confirm there).
export TRAIN_STEP=4

for MODEL_TYPE in "gpt2_medium"; do
  # --- "colossalai" plan: additionally sweeps PLACEMENT ---
  for DISTPLAN in "colossalai"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1 2; do
          # Skip combinations where TPDEGREE exceeds GPUNUM.
          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
            continue
          fi
          for PLACEMENT in "cpu" "auto"; do
            # Settings are passed as per-command environment variables.
            # "|| exit 1" makes a failed run fail the whole CI script.
            MODEL_TYPE="${MODEL_TYPE}" DISTPLAN="${DISTPLAN}" BATCH_SIZE="${BATCH_SIZE}" \
              GPUNUM="${GPUNUM}" TPDEGREE="${TPDEGREE}" PLACEMENT="${PLACEMENT}" \
              bash ./run_gemini.sh || exit 1
          done
        done
      done
    done
  done

  # --- "zero1" / "zero2" plans: TPDEGREE fixed to 1, PLACEMENT not passed ---
  for DISTPLAN in "zero1" "zero2"; do
    for BATCH_SIZE in 2; do
      for GPUNUM in 1 4; do
        for TPDEGREE in 1; do
          # Skip combinations where TPDEGREE exceeds GPUNUM.
          if [ "${TPDEGREE}" -gt "${GPUNUM}" ]; then
            continue
          fi
          MODEL_TYPE="${MODEL_TYPE}" DISTPLAN="${DISTPLAN}" BATCH_SIZE="${BATCH_SIZE}" \
            GPUNUM="${GPUNUM}" TPDEGREE="${TPDEGREE}" \
            bash ./run_gemini.sh || exit 1
        done
      done
    done
  done
done
@ -1,15 +1,2 @@
|
||||
pip install -r requirements.txt

# Launch the GPT demo through the colossalai launcher with the shared
# arguments; any extra arguments are appended. Exits the script with
# status 1 if the launch fails.
run_demo() {
  colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 "$@" || exit 1
}

# "colossalai" plan: every TP degree x placement x shard-init combination.
for TP in 1 2; do
  for PLACEMENT in "cpu" "cuda" "auto" "const"; do
    for SHARD in "True" "False"; do
      run_demo --distplan colossalai --tp_degree "$TP" --placement "$PLACEMENT" --shardinit "$SHARD"
    done
  done
done

# "zero1" and "zero2" plans: no further options.
for DIST in "zero1" "zero2"; do
  run_demo --distplan "$DIST"
done
||||
# Turn on command tracing, then hand off to the CI script inside gemini/.
# The script is only started if the directory change succeeds.
set -o xtrace
cd gemini && bash test_ci.sh
||||
|
@ -0,0 +1,4 @@
|
||||
# CI smoke test: run ./run_gemini.sh with BS=2 and MODEL=125m, first with
# GPUNUM=2 and then with GPUNUM=1.
for GPUNUM in 2 1; do
  # Abort on the first failure. Without this, the loop's exit status is that
  # of the last iteration only, so a failed GPUNUM=2 run would go unnoticed.
  env BS=2 MODEL="125m" GPUNUM="${GPUNUM}" bash ./run_gemini.sh || exit 1
done
@ -0,0 +1,9 @@
|
||||
# CI smoke test: launch train.py through torchrun with --dummy_data=True for
# each BATCH_SIZE / GPUNUM combination, copying all output to run.log.
# Stops at the first failing run.

# Bash option: make the pipeline below report torchrun's failure instead of
# tee's (always successful) exit status.
set -o pipefail

# Run from the directory containing this script so that train.py and run.log
# resolve regardless of the caller's working directory. (The previous form,
# $(cd `dirname $0`;pwd), tried to *execute* the printed path and never
# changed directory.)
cd "$(dirname "$0")" || exit 1

for BATCH_SIZE in 2; do
  for GPUNUM in 1 4; do
    # NOTE: tee truncates run.log on every iteration, so the file ends up
    # holding only the output of the most recent run.
    env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node="${GPUNUM}" --master_port 29501 \
      train.py --dummy_data=True --batch_size="${BATCH_SIZE}" 2>&1 | tee run.log || exit 1
  done
done
Loading…
Reference in new issue