#!/usr/bin/env bash
# Usage: $0 <?number-of-gpus> <?strategy> <?model>
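# Example: passing `8 colossalai_zero2 s` benchmarks the "s" model with the
# colossalai_zero2 strategy on 8 GPUs; omitted arguments fall back to the
# default sweep lists defined below.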
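# -x: print each command as it runs; -u: treat unset variables as errors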
set -xu

BASE=$(realpath "$(dirname "$0")")

PY_SCRIPT=${BASE}/benchmark_gpt_dummy.py
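# Cap OpenMP threads per process, presumably to avoid CPU oversubscription
# when torchrun spawns one worker per GPU.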
export OMP_NUM_THREADS=8

function tune_batch_size() {
    # We found that when the experience batch size equals the train batch size,
    # the peak CUDA memory usage of the experience-making phase is at most that
    # of the training phase, so the experience batch size can safely be larger
    # than or equal to the train batch size.
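    # Sweep doubling batch sizes; the first failing run (typically CUDA OOM)
    # makes this function return 1 so the caller can stop early.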
    for bs in 1 2 4 8 16 32 64 128 256; do
        torchrun --standalone --nproc_per_node "$1" "$PY_SCRIPT" --model "$2" --strategy "$3" --experience_batch_size "$bs" --train_batch_size "$bs" || return 1
    done
}
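
# With no arguments, sweep all defaults below; positional arguments 1..3 narrow
# each dimension. The unquoted $1/$2/$3 assignments let a quoted list such as
# "1 2" expand into multiple array entries.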
if [ $# -eq 0 ]; then
    num_gpus=(1 2 4 8)
else
    num_gpus=($1)
fi

if [ $# -le 1 ]; then
    strategies=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu")
else
    strategies=($2)
fi

if [ $# -le 2 ]; then
    models=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b")
else
    models=($3)
fi
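
# Grid sweep over every (gpu count, strategy, model) combination; `|| break`
# abandons the remaining, larger models for the current strategy once a model
# fails at some batch size, since they would need even more memory.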
for num_gpu in "${num_gpus[@]}"; do
    for strategy in "${strategies[@]}"; do
        for model in "${models[@]}"; do
            tune_batch_size "$num_gpu" "$model" "$strategy" || break
        done
    done
done