# ColossalAI/applications/ColossalChat/benchmarks/benchmark_ppo.sh

#!/usr/bin/env bash

# Restrict this process to the N GPUs that currently have the least memory in use.
#
# Arguments:
#   $1 - how many GPUs to select (default 9999, i.e. effectively every GPU listed).
# Globals:
#   CUDA_VISIBLE_DEVICES (exported) - comma-separated GPU indices, least-used first.
# Outputs:
#   The per-GPU memory table and the resulting setting on stdout; a warning on
#   stderr when no GPU could be listed.
# Returns:
#   0 on success; 1 if nvidia-smi produced no rows (CUDA_VISIBLE_DEVICES is then
#   left untouched, because exporting it empty would hide every GPU).
set_n_least_used_CUDA_VISIBLE_DEVICES() {
    local n=${1:-"9999"}
    # Declared separately from the assignments so `local` does not mask failures.
    local usage_table gpu_ids

    echo "GPU Memory Usage:"
    # Drop the CSV header row and number the remaining rows from 0. The row
    # number is used as the GPU index (assumes nvidia-smi lists GPUs in index
    # order -- TODO confirm).
    usage_table=$(nvidia-smi --query-gpu=memory.used --format=csv |
        tail -n +2 |
        nl -v 0)
    if [[ -z "$usage_table" ]]; then
        echo "nvidia-smi reported no GPUs; leaving CUDA_VISIBLE_DEVICES unchanged" >&2
        return 1
    fi
    # Print the table on stdout instead of teeing to /dev/tty, which does not
    # exist when the script runs without a controlling terminal (CI, nohup).
    printf '%s\n' "$usage_table"

    # Sort rows by used memory (column 2), keep the first N row numbers and
    # join them with commas.
    gpu_ids=$(printf '%s\n' "$usage_table" |
        sort -g -k 2 |
        awk '{print $1}' |
        head -n "$n" |
        paste -sd, -)
    export CUDA_VISIBLE_DEVICES="$gpu_ids"
    echo "Now CUDA_VISIBLE_DEVICES is set to:"
    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}

# Pin the run to the 8 GPUs that currently have the least memory in use.
set_n_least_used_CUDA_VISIBLE_DEVICES 8

# From here on: trace every command and treat unset variables as errors.
set -xu

NUM_RETRY=3
# Two directory levels above this script's resolved location.
BASE_DIR=$(dirname "$(dirname "$(realpath "${BASH_SOURCE[0]}")")")
EXAMPLES_DIR="$BASE_DIR/examples"
TEMP_DIR="$BASE_DIR/temp"
MODEL_SAVE_PATH="$TEMP_DIR/rlhf_models"
MODELS_DIR="$TEMP_DIR/models_config"
# To benchmark different models, change the following line
# MODELS=('125m' '350m' '700m' '1.3b' '2.7b' '3.5b' '5.5b' '6.7b' '10b' '13b')
MODELS=('125m')
# To benchmark different strategies, change the following line
# (bash array elements are separated by whitespace, not commas)
# PLUGINS=('zero2' 'zero2_cpu' '3d')
PLUGINS=('zero2')
LORA_RANK=('0')

export OMP_NUM_THREADS=8

# Remove result files left in the current directory by a previous run.
# -f keeps a first run, where the files do not exist yet, from printing errors.
rm -f ./benchmark_memory_consumption.txt
rm -f ./benchmark_performance_summarization.txt

# install requirements
pip install -r "$EXAMPLES_DIR/requirements.txt"

# Print one of the given arguments, picked pseudo-randomly via $RANDOM.
#
# Arguments:
#   $@ - candidate values (a candidate may be the empty string).
# Outputs:
#   The chosen candidate, followed by a newline, on stdout.
# Returns:
#   0 on success; 1 (printing nothing) when called without candidates, instead
#   of failing on a division by zero.
random_choice() {
    if (($# == 0)); then
        return 1
    fi
    local candidates=("$@")
    # Quoted, and printed with printf, so the chosen value is emitted verbatim:
    # no word splitting, no glob expansion, no echo option parsing.
    printf '%s\n' "${candidates[RANDOM % $#]}"
}

echo "[Test]: testing ppo ..."

# Combinations to skip, written as "<model>-<plugin>-<lora_rank>" or
# "<model>-<plugin>".
SKIPPED_TESTS=(
)

# To also exercise gradient checkpointing, use:
# GRAD_CKPTS=('' '--grad_checkpoint')
GRAD_CKPTS=('')

# The ten dataset shards (part-00000 .. part-00009) are the same for every
# run, so the list is built once instead of once per attempt.
declare -a prompt_dataset=()
for split in {0..9}; do
    printf -v shard '%05d' "$split"
    prompt_dataset+=("$TEMP_DIR/benchmark/arrow/part-$shard")
done

# Run benchmark_ppo.py for every (lora_rank, model, plugin) combination, with
# the plugins of each model visited in shuffled order. Each combination gets up
# to NUM_RETRY attempts; the script exits with status 1 as soon as one
# combination fails every attempt.
for lora_rank in "${LORA_RANK[@]}"; do
    for model in "${MODELS[@]}"; do
        mapfile -t plugins < <(shuf -e "${PLUGINS[@]}")
        for plugin in "${plugins[@]}"; do
            # "${SKIPPED_TESTS[*]-}" keeps `set -u` from rejecting the empty
            # array on older bash versions.
            if [[ " ${SKIPPED_TESTS[*]-} " =~ " $model-$plugin-$lora_rank " ]]; then
                echo "[Test]: Skipped $model-$plugin-$lora_rank"
                continue
            elif [[ " ${SKIPPED_TESTS[*]-} " =~ " $model-$plugin " ]]; then
                echo "[Test]: Skipped $model-$plugin"
                continue
            fi
            pretrain=$model
            tokenizer_dir="facebook/opt-125m"
            grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
            # Only the 3d plugin is run with a tensor-parallel size above 1.
            tp='1'
            if [[ "$plugin" == "3d" ]]; then
                tp='4'
            fi
            # Start as "failed" so the check after the loop is well defined
            # (and safe under `set -u`) even if no attempt is made.
            passed=1
            for ((i = 1; i <= NUM_RETRY; i++)); do
                echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
                # NOTE(review): the original passed --lr twice (9e-6, then 2e-5).
                # Only the later value is kept, presuming the parser honoured the
                # last occurrence -- verify against benchmark_ppo.py.
                # $grad_ckpt is deliberately unquoted: when empty it must expand
                # to no argument at all.
                # shellcheck disable=SC2086
                colossalai run --nproc_per_node 8 --master_port 28547 "$BASE_DIR/benchmarks/benchmark_ppo.py" \
                    --pretrain "$pretrain" \
                    --tokenizer_dir "$tokenizer_dir" \
                    --prompt_dataset "${prompt_dataset[@]}" \
                    --ptx_coef 0 \
                    --save_path "$MODEL_SAVE_PATH" \
                    --conversation_template_config ./Opt.json \
                    --lora_rank "$lora_rank" \
                    --plugin "$plugin" \
                    --num_episodes 5 \
                    --num_collect_steps 1 \
                    --num_update_steps 1 \
                    --max_seq_len 128 \
                    --max_length 512 \
                    --experience_batch_size 32 \
                    --train_batch_size 32 \
                    --accumulation_steps 1 \
                    --mixed_precision "bf16" \
                    --grad_clip 1.0 \
                    --use_flash_attn \
                    --tp "$tp" \
                    --lr 2e-5 \
                    $grad_ckpt
                passed=$?
                if ((passed == 0)); then
                    # Clear saved artifacts before the next combination. The :?
                    # guard aborts rather than expanding to "/*" if a path
                    # variable is ever empty.
                    rm -rf -- "${MODEL_SAVE_PATH:?}"/*
                    rm -rf -- "${MODELS_DIR:?}"/*
                    break
                fi
            done
            if ((passed != 0)); then
                echo "[Test]: Failed $model-$plugin-$lora_rank"
                exit 1
            fi
        done
    done
done
[ColossalChat] Update RLHF V2 (#5286) * Add dpo. Fix sft, ppo, lora. Refactor all * fix and tested ppo * 2 nd round refactor * add ci tests * fix ci * fix ci * fix readme, style * fix readme style * fix style, fix benchmark * reproduce benchmark result, remove useless files * rename to ColossalChat * use new image * fix ci workflow * fix ci * use local model/tokenizer for ci tests * fix ci * fix ci * fix ci * fix ci timeout * fix rm progress bar. fix ci timeout * fix ci * fix ci typo * remove 3d plugin from ci temporary * test environment * cannot save optimizer * support chat template * fix readme * fix path * test ci locally * restore build_or_pr * fix ci data path * fix benchmark * fix ci, move ci tests to 3080, disable fast tokenizer * move ci to 85 * support flash attention 2 * add all-in-one data preparation script. Fix colossal-llama2-chat chat template * add hardware requirements * move ci test data * fix save_model, add unwrap * fix missing bos * fix missing bos; support grad accumulation with gemini * fix ci * fix ci * fix ci * fix llama2 chat template config * debug sft * debug sft * fix colossalai version requirement * fix ci * add sanity check to prevent NaN loss * fix requirements * add dummy data generation script * add dummy data generation script * add dummy data generation script * add dummy data generation script * update readme * update readme * update readme and ignore * fix logger bug * support parallel_output * modify data preparation logic * fix tokenization * update lr * fix inference * run pre-commit --------- Co-authored-by: Tong Li <tong.li352711588@gmail.com> 2024-03-29 06:12:29 +00:00			`#!/usr/bin/env bash`

			`set_n_least_used_CUDA_VISIBLE_DEVICES() {`
			`local n=${1:-"9999"}`
			`echo "GPU Memory Usage:"`
			`local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \|`
			`tail -n +2 \|`
			`nl -v 0 \|`
			`tee /dev/tty \|`
			`sort -g -k 2 \|`
			`awk '{print $1}' \|`
			`head -n $n)`
			`export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS \| sed 's/ /,/g')`
			`echo "Now CUDA_VISIBLE_DEVICES is set to:"`
			`echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"`
			`}`

			`set_n_least_used_CUDA_VISIBLE_DEVICES 8`

			`set -xu`

			`NUM_RETRY=3`
			`BASE_DIR=$(dirname $(dirname $(realpath $BASH_SOURCE)))`
			`EXAMPLES_DIR=$BASE_DIR/examples`
			`TEMP_DIR=$BASE_DIR/temp`
			`MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models`
			`MODELS_DIR=$TEMP_DIR/models_config`
			`# To benchmark different models, change the following line`
			`# MODELS=('125m' '350m' '700m' '1.3b' '2.7b' '3.5b' '5.5b' '6.7b' '10b' '13b')`
			`MODELS=('125m')`
			`# To benchmark different strategies, change the following line`
			`# PLUGINS=('zero2', 'zero2_cpu', '3d')`
			`PLUGINS=('zero2')`
			`LORA_RANK=('0')`

			`export OMP_NUM_THREADS=8`

			`rm ./benchmark_memory_consumption.txt`
			`rm ./benchmark_performance_summarization.txt`

			`# install requirements`
			`pip install -r $EXAMPLES_DIR/requirements.txt`

			`random_choice() {`
			`local arr=("$@")`
			`local len=${#arr[@]}`
			`local idx=$((RANDOM % len))`
			`echo ${arr[$idx]}`
			`}`

			`echo "[Test]: testing ppo ..."`

			`SKIPPED_TESTS=(`
			`)`

			`GRAD_CKPTS=('' '--grad_checkpoint')`
			`GRAD_CKPTS=('')`
			`for lora_rank in ${LORA_RANK[@]}; do`
			`for model in ${MODELS[@]}; do`
			`plugins=($(shuf -e "${PLUGINS[@]}"))`
			`for plugin in ${plugins[@]}; do`
			`if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then`
			`echo "[Test]: Skipped $model-$plugin-$lora_rank"`
			`continue`
			`elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin " ]]; then`
			`echo "[Test]: Skipped $model-$plugin"`
			`continue`
			`fi`
			`pretrain=$model`
			`tokenizer_dir="facebook/opt-125m"`
			`grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")`
			`tp='1'`
			`if [[ $plugin == "3d" ]]; then`
			`tp='4'`
			`fi`
			`for i in $(seq $NUM_RETRY); do`
			`echo "[Test]: $model-$plugin-$lora_rank, attempt $i"`
			`declare -a prompt_dataset=()`
			`for split in $(seq -f "%05g" 0 9); do`
			`prompt_dataset+=("$TEMP_DIR/benchmark/arrow/part-$split")`
			`done`
			`colossalai run --nproc_per_node 8 --master_port 28547 $BASE_DIR/benchmarks/benchmark_ppo.py \`
			`--pretrain $pretrain \`
			`--tokenizer_dir $tokenizer_dir \`
			`--prompt_dataset ${prompt_dataset[@]} \`
			`--ptx_coef 0 \`
			`--save_path $MODEL_SAVE_PATH \`
			`--conversation_template_config ./Opt.json \`
			`--lora_rank $lora_rank \`
			`--plugin $plugin \`
			`--num_episodes 5 \`
			`--num_collect_steps 1 \`
			`--num_update_steps 1 \`
			`--max_seq_len 128 \`
			`--max_length 512 \`
			`--experience_batch_size 32 \`
			`--train_batch_size 32 \`
			`--accumulation_steps 1 \`
			`--lr 9e-6 \`
			`--mixed_precision "bf16" \`
			`--grad_clip 1.0 \`
			`--use_flash_attn \`
			`--tp $tp \`
			`--lr 2e-5 \`
			`$grad_ckpt`
			`passed=$?`
			`if [ $passed -eq 0 ]; then`
			`rm -rf $MODEL_SAVE_PATH/*`
			`rm -rf $MODELS_DIR/*`
			`break`
			`fi`
			`done`
			`if [ $passed -ne 0 ]; then`
			`echo "[Test]: Failed $model-$plugin-$lora_rank"`
			`exit 1`
			`fi`
			`done`
			`done`
			`done`