From b79d5ea7ae326b31d732d888a7f322af0fc0e3fe Mon Sep 17 00:00:00 2001
From: kkscilife <126147887+kkscilife@users.noreply.github.com>
Date: Thu, 30 Nov 2023 11:04:07 +0800
Subject: [PATCH] test(workflow): add workflow for loss test and change
 trigger event (#513)

* add workflow for loss test

* change trigger event

* optimize trigger event

---------

Co-authored-by: wangmengke
---
 .github/workflows/pr_merged.yaml   | 17 +++++++++++++++++
 .github/workflows/weekly_test.yaml | 53 +++++++++++++++++++++++++++++++++++--------------------
 2 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/pr_merged.yaml b/.github/workflows/pr_merged.yaml
index 65e273b..5a09019 100644
--- a/.github/workflows/pr_merged.yaml
+++ b/.github/workflows/pr_merged.yaml
@@ -50,3 +50,20 @@ jobs:
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
         srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py
+
+  check_loss_when_swapping_micro_num_and_micro_bsz:
+    if: ${{ !cancelled() }}
+    needs: check-requirements
+    runs-on: [t_cluster]
+    timeout-minutes: 40
+    steps:
+      - name: mask env
+        run: |
+          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+      - uses: actions/checkout@v3
+
+      - name: loss_tests
+        run: |
+          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-loss-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_training/test_swap_nb_loss_and_gradnorm.py
diff --git a/.github/workflows/weekly_test.yaml b/.github/workflows/weekly_test.yaml
index bf360c8..133bccc 100644
--- a/.github/workflows/weekly_test.yaml
+++ b/.github/workflows/weekly_test.yaml
@@ -1,102 +1,115 @@
 name: weekly-tests
 on:
-  push:
-    branches:
-      - "main"
-      - "develop"
+  workflow_dispatch:
+  schedule:
+    - cron: '56 18 * * 5'
 
 env:
   SLURM_PARTITION: llm_s
 
 jobs:
   training_8GPU:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}
 
       - name: training_8GPU
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
 
   training_16GPU_8DP2TP:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}
 
       - name: training_16GPU_8DP2TP
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
           sed -i 's/^.*tensor=.*/    tensor=2,/' ./configs/7B_sft.py
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
 
   training_16GPU_8DP2TPSP:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}
 
       - name: training_16GPU_8DP2TPSP
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
           sed -i 's/^.*tensor=.*/    tensor=2,/' ./configs/7B_sft.py
           sed -i 's/^.*sequence_parallel=.*/    sequence_parallel=True,/' ./configs/7B_sft.py
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
 
   training_16GPU_8DP2PP:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}
 
       - name: training_16GPU_8DP2PP
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
           sed -i 's/^.*pipeline=.*/    pipeline=dict(size=2),/' ./configs/7B_sft.py
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
 
   training_16GPU_8DP2PP_InterleavedOverlap:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}
 
       - name: training_16GPU_8DP2PP_InterleavedOverlap
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
           sed -i 's/^.*pipeline=.*/    pipeline=dict(size=2, interleaved_overlap=True),/' ./configs/7B_sft.py
           sed -i 's/^.*num_chunks=.*/    num_chunks=2,/' ./configs/7B_sft.py
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
 
   unit_test_optimizer:
     runs-on: [t_cluster]
-    timeout-minutes: 30
+    timeout-minutes: 35
     steps:
       - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}
 
       - name: test_optimizer
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_solver/test_optimizer.py
 
   unit_test_model:
     runs-on: [t_cluster]
-    timeout-minutes: 5
+    timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}
 
       - name: test_embedding_accuracy
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_embedding.py
 
       - name: test_model_internlm_accuracy
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_model_internlm.py
 
       - name: test_norm_accuracy
         run: |
           source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s ./tests/test_model/test_norm.py
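
Reviewer note on the trigger change (an explanatory sketch, not part of the applied diff): the weekly suite previously ran on every push to main or develop; after this patch it runs only on a cron schedule or a manual dispatch. GitHub Actions evaluates cron expressions in UTC, so the new schedule fires on Fridays at 18:56 UTC, i.e. early Saturday morning in the committer's UTC+8 timezone:

    on:
      workflow_dispatch:        # manual runs from the Actions tab, on any branch
      schedule:
        - cron: '56 18 * * 5'   # minute hour day-of-month month day-of-week; 5 = Friday, UTC

The checkout ref expression added to every job resolves per event. Actions expressions group `a && b || c && d` as `(a && b) || (c && d)`, so on a schedule event the expression yields 'develop', and on workflow_dispatch it yields the empty string, which actions/checkout@v3 treats as "fall back to the ref the run was dispatched on":

    ref: ${{ github.event_name == 'schedule' && 'develop' || github.event_name == 'workflow_dispatch' && '' }}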
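
Two smaller behavioral points worth calling out, hedged since they rest on the cluster setup implied by the patch. Dropping --quotatype=spot from every srun invocation appears to move the jobs off preemptible spot quota (--quotatype is a flag of this Slurm deployment, not stock Slurm), so scheduled runs are less likely to be preempted mid-test; the timeout-minutes bumps (5 to 10, 30 to 35) presumably absorb the longer queue waits that follow. The new check_loss_when_swapping_micro_num_and_micro_bsz job in pr_merged.yaml, judging by its name and the test file test_swap_nb_loss_and_gradnorm.py, checks that swapping micro_num and micro_bsz leaves loss and grad norm unchanged; the invariant it relies on, sketched with illustrative numbers:

    # with SEQ_LEN and dp_size fixed, tokens per optimizer step
    # = micro_num * micro_bsz * SEQ_LEN * dp_size,
    # so (micro_num=4, micro_bsz=2) and (micro_num=2, micro_bsz=4)
    # consume the same tokens per step and their loss/grad-norm curves should match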