diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml
index a3d4cd9..e8b1790 100644
--- a/.github/workflows/demo_in_readme.yaml
+++ b/.github/workflows/demo_in_readme.yaml
@@ -35,7 +35,7 @@ jobs:
         fi
 
   dataset-preparation:
-    if: ${{ always() }}
+    if: ${{ !cancelled() }}
     needs: check-requirements
     runs-on: [t_cluster]
     steps:
@@ -55,7 +55,7 @@ jobs:
         sh ./ci_scripts/data/tokenizer_alpaca.sh
 
   train:
-    if: ${{ always() }}
+    if: ${{ !cancelled() }}
     needs: check-requirements
     runs-on: [t_cluster]
     timeout-minutes: 30
@@ -92,7 +92,7 @@ jobs:
         rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
 
   convert-model-then-load:
-    if: ${{ always() }}
+    if: ${{ !cancelled() }}
     needs: check-requirements
     runs-on: [t_cluster]
     timeout-minutes: 15
@@ -108,11 +108,11 @@ jobs:
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/model/convert_to_hf.sh
         cd ./hf_ckpt
-        srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
         cd ..
         rsync -av --remove-source-files $GITHUB_WORKSPACE/hf_ckpt ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
   load-chat-model-in-hf:
-    if: ${{ always() }}
+    if: ${{ !cancelled() }}
     needs: check-requirements
     runs-on: [t_cluster]
     timeout-minutes: 15
@@ -125,4 +125,4 @@ jobs:
     - name: chat-model-in-hf
       run: |
         source activate internlm-env-test
-        srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py
diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 75edf17..139eda6 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -19,4 +19,4 @@ jobs:
     - name: training_8GPU
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
diff --git a/.github/workflows/pr_merged.yaml b/.github/workflows/pr_merged.yaml
new file mode 100644
index 0000000..65e273b
--- /dev/null
+++ b/.github/workflows/pr_merged.yaml
@@ -0,0 +1,52 @@
+name: pr-merged
+on:
+  push:
+    branches:
+      - "develop"
+      - "main"
+    paths-ignore:
+      - "cmds/**"
+      - "**.md"
+env:
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
+  SLURM_PARTITION: llm_s
+
+jobs:
+  check-requirements:
+    runs-on: [t_cluster]
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+    - name: check-requirements
+      run: |
+        changed_files=$(git diff --name-only -r HEAD^1 HEAD)
+        echo $changed_files
+        if [[ $changed_files =~ "runtime.txt" ]]; then
+          pip install -r requirements/runtime.txt
+        fi
+
+        if [[ $changed_files =~ "torch.txt" ]]; then
+          pip install -r requirements/torch.txt
+        fi
+
+
+  acc_tests:
+    if: ${{ !cancelled() }}
+    needs: check-requirements
+    runs-on: [t_cluster]
+    timeout-minutes: 30
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+
+    - name: acc_tests
+      run: |
+        source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py
diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml
index dc8c7e1..3f49868 100644
--- a/.github/workflows/unit_tests.yaml
+++ b/.github/workflows/unit_tests.yaml
@@ -42,7 +42,7 @@ jobs:
 
 
   unit_tests_core_pipeline:
-    if: ${{ always() }}
+    if: ${{ !cancelled() }}
     needs: check-requirements
     runs-on: [t_cluster]
     timeout-minutes: 20
@@ -56,10 +56,10 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py
 
   unit_tests_utils_storage_manager:
-    if: ${{ always() }}
+    if: ${{ !cancelled() }}
     needs: check-requirements
     runs-on: [t_cluster]
     timeout-minutes: 20
@@ -73,4 +73,4 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
diff --git a/.github/workflows/weekly_test.yaml b/.github/workflows/weekly_test.yaml
index 880d097..bf360c8 100644
--- a/.github/workflows/weekly_test.yaml
+++ b/.github/workflows/weekly_test.yaml
@@ -17,7 +17,7 @@ jobs:
     - name: training_8GPU
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
 
   training_16GPU_8DP2TP:
     runs-on: [t_cluster]
@@ -29,7 +29,7 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
 
   training_16GPU_8DP2TPSP:
     runs-on: [t_cluster]
@@ -42,7 +42,7 @@ jobs:
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         sed -i 's/^.*tensor=.*/ tensor=2,/' ./configs/7B_sft.py
         sed -i 's/^.*sequence_parallel=.*/ sequence_parallel=True,/' ./configs/7B_sft.py
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
 
   training_16GPU_8DP2PP:
     runs-on: [t_cluster]
@@ -54,7 +54,7 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2),/' ./configs/7B_sft.py
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
 
   training_16GPU_8DP2PP_InterleavedOverlap:
     runs-on: [t_cluster]
@@ -67,7 +67,7 @@ jobs:
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         sed -i 's/^.*pipeline=.*/ pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
         sed -i 's/^.*num_chunks=.*/ num_chunks=2,/' ./configs/7B_sft.py
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
 
   unit_test_optimizer:
     runs-on: [t_cluster]
@@ -78,7 +78,7 @@ jobs:
     - name: test_optimizer
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py
 
   unit_test_model:
     runs-on: [t_cluster]
@@ -89,14 +89,14 @@ jobs:
     - name: test_embedding_accuracy
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py
 
     - name: test_model_internlm_accuracy
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py
 
     - name: test_norm_accuracy
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py
diff --git a/ci_scripts/data/tokenizer_chinese.sh b/ci_scripts/data/tokenizer_chinese.sh
index 81a5198..0715ed0 100644
--- a/ci_scripts/data/tokenizer_chinese.sh
+++ b/ci_scripts/data/tokenizer_chinese.sh
@@ -23,7 +23,7 @@ if [[ ${num} -gt 0 ]]; then
     fi
 fi
 
-srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --quotatype=spot --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
 [[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); }
 
 file_list=($RESULT $RESULT_META)
diff --git a/ci_scripts/train/load_ckpt.sh b/ci_scripts/train/load_ckpt.sh
index 06c6c1e..d6d7e62 100644
--- a/ci_scripts/train/load_ckpt.sh
+++ b/ci_scripts/train/load_ckpt.sh
@@ -22,7 +22,7 @@ if [[ ! -f ${file} ]]; then
     exit_code=$(($exit_code + 1))
 fi
 
-srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --quotatype=spot --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
 
 
diff --git a/ci_scripts/train/slurm_train.sh b/ci_scripts/train/slurm_train.sh
index 3871fc4..5bb79ee 100644
--- a/ci_scripts/train/slurm_train.sh
+++ b/ci_scripts/train/slurm_train.sh
@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi
 
-srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --quotatype=spot --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
 
 num=$(num_files "${CKPTS20_OUTPUT}")
diff --git a/ci_scripts/train/torchrun.sh b/ci_scripts/train/torchrun.sh
index 29ed54f..941bb4f 100644
--- a/ci_scripts/train/torchrun.sh
+++ b/ci_scripts/train/torchrun.sh
@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi
 
-srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --quotatype=spot --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 [[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }
 
 num=$(num_files "${CKPTS_OUTPUT}")