diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 2935a6369..ee916ead2 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -91,7 +91,7 @@ SKIPPED_TESTS=(
     llama-gemini_auto-20 # gemini_auto plugin doesn't support lora
     llama-gemini-20 # gemini doesn't support lora
 )
-skip_eval=false
+
 GRAD_CKPTS=('--grad_checkpoint')
 for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
@@ -134,13 +134,11 @@ for lora_rank in ${LORA_RANK[@]}; do
             bs='8'
             pp='2'
             plugin='3d'
-            skip_eval=true
         fi
         if [[ $plugin == "pp" ]]; then
             bs='8'
             pp='2'
             plugin='3d'
-            skip_eval=true
         fi
         if [[ $plugin == "sp_split_gather" ]]; then
             enable_sequence_parallelism='--enable_sequence_parallelism'
@@ -178,53 +176,29 @@ for lora_rank in ${LORA_RANK[@]}; do
             for split in $(seq -f "%05g" 0 0); do
                 dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
             done
-
-            if [[ $skip_eval ]]; then
-                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
-                    --pretrain $pretrain \
-                    --tokenizer_dir $tokenizer_dir \
-                    --dataset ${dataset[@]} \
-                    --save_path $MODEL_SAVE_PATH \
-                    --config_file $MODELS_DIR/config.jsonl \
-                    $lora_config \
-                    --plugin $plugin \
-                    --batch_size $bs \
-                    --max_epochs 1 \
-                    --accumulation_steps $grad_accu \
-                    --tp $tp \
-                    --pp $pp \
-                    --zero_stage $zero_stage \
-                    --sp $sp \
-                    --sp_mode $sp_mode \
-                    $enable_sequence_parallelism \
-                    --lr 2e-5 \
-                    $grad_ckpt \
-                    --max_len 400 \
-                    --use_flash_attn
-            else
-                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
-                    --pretrain $pretrain \
-                    --tokenizer_dir $tokenizer_dir \
-                    --dataset ${dataset[@]} \
-                    --eval_dataset ${dataset[@]} \
-                    --save_path $MODEL_SAVE_PATH \
-                    --config_file $MODELS_DIR/config.jsonl \
-                    $lora_config \
-                    --plugin $plugin \
-                    --batch_size $bs \
-                    --max_epochs 1 \
-                    --accumulation_steps $grad_accu \
-                    --tp $tp \
-                    --pp $pp \
-                    --zero_stage $zero_stage \
-                    --sp $sp \
-                    --sp_mode $sp_mode \
-                    $enable_sequence_parallelism \
-                    --lr 2e-5 \
-                    $grad_ckpt \
-                    --max_len 400 \
-                    --use_flash_attn
-            fi
+            colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
+                --pretrain $pretrain \
+                --tokenizer_dir $tokenizer_dir \
+                --dataset ${dataset[@]} \
+                --eval_dataset ${dataset[@]} \
+                --save_path $MODEL_SAVE_PATH \
+                --config_file $MODELS_DIR/config.jsonl \
+                $lora_config \
+                --plugin $plugin \
+                --batch_size $bs \
+                --max_epochs 1 \
+                --accumulation_steps $grad_accu \
+                --tp $tp \
+                --pp $pp \
+                --zero_stage $zero_stage \
+                --sp $sp \
+                --sp_mode $sp_mode \
+                $enable_sequence_parallelism \
+                --lr 2e-5 \
+                $grad_ckpt \
+                --max_len 400 \
+                --use_flash_attn
+            # fi
             passed=$?
             if [ $passed -eq 0 ]; then
                 rm -rf ${MODEL_SAVE_PATH:?}/*