ColossalAI/applications/Chat/benchmarks/benchmark_gpt_dummy.sh

#!/usr/bin/env bash
# Usage: $0 <?number-of-gpus> <?strategy> <?model>
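# Example invocation (illustrative; strategy/model names come from the lists below):
#   ./benchmark_gpt_dummy.sh 4 colossalai_zero2 2b
# Omitted arguments fall back to the default sweeps defined below.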
set -xu
BASE=$(realpath "$(dirname "$0")")
PY_SCRIPT=${BASE}/benchmark_gpt_dummy.py
export OMP_NUM_THREADS=8
# Sweep the batch size for a given (num-gpus, model, strategy) configuration.
# Args: $1 = number of GPUs, $2 = model size, $3 = strategy
function tune_batch_size() {
    # We found that when the experience batch size equals the train batch size,
    # peak CUDA memory usage of the experience-making phase is less than or equal
    # to that of the training phase. Thus the experience batch size can be set
    # greater than or equal to the train batch size.
    for bs in 1 2 4 8 16 32 64 128 256; do
        # Stop at the first batch size that fails (e.g. out of memory).
        torchrun --standalone --nproc_per_node $1 $PY_SCRIPT --model $2 --strategy $3 --experience_batch_size $bs --train_batch_size $bs || return 1
    done
}
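# Illustrative standalone call (assuming the model/strategy names used below):
#   tune_batch_size 8 2b colossalai_zero2
# sweeps batch sizes 1..256 on 8 GPUs with the 2b model under colossalai_zero2;
# in normal use the loops at the bottom of this script invoke it.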
if [ $# -eq 0 ]; then
    num_gpus=(1 2 4 8)
else
    num_gpus=($1)
fi

if [ $# -le 1 ]; then
    strategies=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu")
else
    strategies=($2)
fi

if [ $# -le 2 ]; then
    models=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b")
else
    models=($3)
fi
for num_gpu in "${num_gpus[@]}"; do
    for strategy in "${strategies[@]}"; do
        for model in "${models[@]}"; do
            # Skip the remaining (larger) models for this strategy once one fails.
            tune_batch_size $num_gpu $model $strategy || break
        done
    done
done