From a58bf853dbacb8d668149dd388ef2baebf048ded Mon Sep 17 00:00:00 2001
From: kkscilife <126147887+kkscilife@users.noreply.github.com>
Date: Wed, 20 Dec 2023 14:41:09 +0800
Subject: [PATCH] change into reserved (#550)

Co-authored-by: kkscilife
---
 .github/workflows/demo_in_readme.yaml  |  4 ++--
 .github/workflows/e2e_test.yaml        |  2 +-
 .github/workflows/pr_before_merge.yaml |  2 +-
 .github/workflows/pr_merged.yaml       |  2 +-
 .github/workflows/unit_tests.yaml      | 12 ++++++------
 ci_scripts/data/tokenizer_chinese.sh   |  2 +-
 ci_scripts/train/load_ckpt.sh          |  2 +-
 ci_scripts/train/slurm_train.sh        |  2 +-
 ci_scripts/train/torchrun.sh           |  2 +-
 9 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/demo_in_readme.yaml b/.github/workflows/demo_in_readme.yaml
index e8b1790..f4a01e8 100644
--- a/.github/workflows/demo_in_readme.yaml
+++ b/.github/workflows/demo_in_readme.yaml
@@ -108,7 +108,7 @@ jobs:
         export PYTHONPATH=$PWD:$PYTHONPATH
         sh ./ci_scripts/model/convert_to_hf.sh
         cd ./hf_ckpt
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
         cd ..
         rsync -av --remove-source-files $GITHUB_WORKSPACE/hf_ckpt ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
   load-chat-model-in-hf:
@@ -125,4 +125,4 @@ jobs:
     - name: chat-model-in-hf
       run: |
         source activate internlm-env-test
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py
diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 139eda6..805f822 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -19,4 +19,4 @@ jobs:
     - name: training_8GPU
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
diff --git a/.github/workflows/pr_before_merge.yaml b/.github/workflows/pr_before_merge.yaml
index a853645..212bc05 100644
--- a/.github/workflows/pr_before_merge.yaml
+++ b/.github/workflows/pr_before_merge.yaml
@@ -49,4 +49,4 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_init.py --seed=1024
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_init.py --seed=1024
diff --git a/.github/workflows/pr_merged.yaml b/.github/workflows/pr_merged.yaml
index 5a09019..945e3e8 100644
--- a/.github/workflows/pr_merged.yaml
+++ b/.github/workflows/pr_merged.yaml
@@ -49,7 +49,7 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-acc-test-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python ./tests/test_training/train_CI.py --config ./tests/test_training/7B_check_acc.py

   check_loss_when_swapping_micro_num_and_micro_bsz:
     if: ${{ !cancelled() }}
diff --git a/.github/workflows/unit_tests.yaml b/.github/workflows/unit_tests.yaml
index 581b293..91cc20a 100644
--- a/.github/workflows/unit_tests.yaml
+++ b/.github/workflows/unit_tests.yaml
@@ -56,7 +56,7 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py

   unit_tests_utils_storage_manager:
     if: ${{ !cancelled() }}
@@ -73,7 +73,7 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py

   unit_tests_model_fused_precision:
     if: ${{ !cancelled() }}
@@ -90,7 +90,7 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_model/test_fused_precision/test_fused_precision.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_model/test_fused_precision/test_fused_precision.py

   unit_tests_data_batch_sampler:
     if: ${{ !cancelled() }}
@@ -107,7 +107,7 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_data/test_batch_sampler.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_data/test_batch_sampler.py

   unit_tests_utils_timeout:
     if: ${{ !cancelled() }}
@@ -124,7 +124,7 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:1 python -m pytest -s -v ./tests/test_utils/test_timeout.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:1 python -m pytest -s -v ./tests/test_utils/test_timeout.py

   unit_tests_utils_model_checkpoint:
     if: ${{ !cancelled() }}
@@ -141,4 +141,4 @@ jobs:
       run: |
         source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
         export PYTHONPATH=$PWD:$PYTHONPATH
-        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:2 python -m pytest -s -v ./tests/test_utils/test_model_checkpoint.py
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} -N 1 -n 1 --gres=gpu:2 python -m pytest -s -v ./tests/test_utils/test_model_checkpoint.py
diff --git a/ci_scripts/data/tokenizer_chinese.sh b/ci_scripts/data/tokenizer_chinese.sh
index 0715ed0..c32d29e 100644
--- a/ci_scripts/data/tokenizer_chinese.sh
+++ b/ci_scripts/data/tokenizer_chinese.sh
@@ -23,7 +23,7 @@ if [[ ${num} -gt 0 ]]; then
     fi
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --quotatype=spot --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
 [[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); }

 file_list=($RESULT $RESULT_META)
diff --git a/ci_scripts/train/load_ckpt.sh b/ci_scripts/train/load_ckpt.sh
index d6d7e62..287adbd 100644
--- a/ci_scripts/train/load_ckpt.sh
+++ b/ci_scripts/train/load_ckpt.sh
@@ -22,7 +22,7 @@ if [[ ! -f ${file} ]]; then
     exit_code=$(($exit_code + 1))
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --quotatype=spot --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

diff --git a/ci_scripts/train/slurm_train.sh b/ci_scripts/train/slurm_train.sh
index 5bb79ee..b3117a1 100644
--- a/ci_scripts/train/slurm_train.sh
+++ b/ci_scripts/train/slurm_train.sh
@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --quotatype=spot --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }

 num=$(num_files "${CKPTS20_OUTPUT}")
diff --git a/ci_scripts/train/torchrun.sh b/ci_scripts/train/torchrun.sh
index 941bb4f..31681d0 100644
--- a/ci_scripts/train/torchrun.sh
+++ b/ci_scripts/train/torchrun.sh
@@ -22,7 +22,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi

-srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --quotatype=spot --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
+srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 [[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }

 num=$(num_files "${CKPTS_OUTPUT}")
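
Note after the diff: every hunk above makes the same one-line change, dropping
--quotatype=spot from an srun invocation so CI jobs stop requesting preemptible
spot quota and instead fall back to the partition default (reserved, per the
subject line). A minimal before/after sketch, taken from the tokenizer hunk and
assuming a cluster whose Slurm build accepts the site-specific --quotatype
option as used in this repo:

    # before: request spot quota; the job can be preempted mid-run
    srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --quotatype=spot --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}

    # after: no --quotatype flag, so the partition's default quota type applies
    srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}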