#!/usr/bin/env bash
#
# Sweeps benchmark_gpt_dummy.py over GPU counts, strategies and model sizes,
# growing the batch size for each combination until a run fails.
#
# Usage: $0 [num_gpus] [strategies] [models]
#   num_gpus    whitespace-separated list of GPU counts  (default: 1 2 4 8)
#   strategies  whitespace-separated list of strategies  (default: all below)
#   models      whitespace-separated list of model sizes (default: all below)
# Each argument is a single word list, e.g.: $0 "1 2" "ddp" "s m"

# -x traces every command, -u rejects unset variables. -e is deliberately not
# set: a failing torchrun is an expected signal that ends a sweep, not an error.
set -xu

# Resolve the benchmark script relative to this file. Quoted so that a
# checkout path containing whitespace stays a single argument.
BASE=$(realpath "$(dirname "$0")")
PY_SCRIPT="${BASE}/benchmark_gpt_dummy.py"
export OMP_NUM_THREADS=8

#######################################
# Runs the benchmark with doubling batch sizes until one run fails.
# Globals:   PY_SCRIPT (read)
# Arguments: $1 - number of processes per node passed to torchrun
#            $2 - model name passed as --model
#            $3 - strategy name passed as --strategy
# Returns:   0 if every batch size succeeded, 1 at the first failing run
#######################################
tune_batch_size() {
  local nproc=$1
  local model=$2
  local strategy=$3
  local bs

  # we found that when the experience batch size is equal to the train batch
  # size, peak CUDA memory usage of the making-experience phase is less than
  # or equal to that of the training phase; thus, the experience batch size
  # can be larger than or equal to the train batch size
  for bs in 1 2 4 8 16 32 64 128 256; do
    torchrun --standalone --nproc_per_node "$nproc" "$PY_SCRIPT" \
      --model "$model" \
      --strategy "$strategy" \
      --experience_batch_size "$bs" \
      --train_batch_size "$bs" || return 1
  done
}

#######################################
# Parses the optional word-list arguments and runs the full sweep.
# Arguments: $1 - GPU counts, $2 - strategies, $3 - models (all optional)
#######################################
main() {
  local -a num_gpus strategies models
  local num_gpu strategy model

  # The unquoted expansions below are intentional: each argument is a
  # whitespace-separated list that must be split into array elements.
  if [[ $# -eq 0 ]]; then
    num_gpus=(1 2 4 8)
  else
    # shellcheck disable=SC2206
    num_gpus=($1)
  fi

  if [[ $# -le 1 ]]; then
    strategies=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu")
  else
    # shellcheck disable=SC2206
    strategies=($2)
  fi

  if [[ $# -le 2 ]]; then
    models=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b")
  else
    # shellcheck disable=SC2206
    models=($3)
  fi

  for num_gpu in "${num_gpus[@]}"; do
    for strategy in "${strategies[@]}"; do
      for model in "${models[@]}"; do
        # On the first failure, skip the remaining models in the list for
        # this (num_gpu, strategy) pair and move on to the next strategy.
        tune_batch_size "$num_gpu" "$model" "$strategy" || break
      done
    done
  done
}

main "$@"