mirror of https://github.com/InternLM/InternLM
test(ci_scripts): add timeout settings and clean work after the slurm job (#185)
* restore pr test on develop branch
* add mask
* add post action to cancel slurm job
* remove readonly attribute on job log
* add debug info
* debug job log
* try stdin
* use stdin
* set default value to avoid error
* try setting readonly on job log
* performance echo
* remove debug info
* use squeue to check slurm job status
* restore the lost param
* limit retry times
* use exclusive to avoid "port already in use"
* optimize loop body
* remove partition
* add {} for variables
* set env variable for slurm partition

Co-authored-by: qa-caif-cicd <qa-caif-cicd@pjlab.org.cn>

parent 7cfea534e7
commit ccb06a98e4
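The changes below thread one Slurm job name through the whole pipeline: the workflow composes ${GITHUB_RUN_ID}-${GITHUB_JOB}, passes it to each CI script as its first argument, the scripts forward it to srun via --job-name, and a post action can then find and cancel whatever is still queued or running under that name. A minimal sketch of that handoff, with placeholder values (the run id and job id below are invented; in CI they come from the Actions runtime):

#!/bin/bash
# Placeholder values; GitHub Actions provides the real ones at runtime.
GITHUB_RUN_ID=5678901234
GITHUB_JOB=training-8GPU
job_name="${GITHUB_RUN_ID}-${GITHUB_JOB}"

# The workflow passes the name as the first argument to each CI script ...
sh ./ci_scripts/train/slurm_train.sh "${job_name}"

# ... which forwards it to srun as --job-name=$1, so any leftover job can later
# be found by name (the new post action then runs scancel on the matching ids):
squeue -o "%A %j" -u "$USER" | grep "${job_name}" | awk '{print $1}'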
@@ -3,14 +3,21 @@ on:
   pull_request:
     branches:
       - "main"
+      - "develop"
     paths-ignore:
       - "docs/**"
       - "**.md"
+env:
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
+  SLURM_PARTITION: llm
 
 jobs:
   check-requirements:
     runs-on: [lmtest]
     steps:
+      - name: mask env
+        run: |
+          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
       - uses: actions/checkout@v3
         with:
           fetch-depth: 2
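The new "mask env" step relies on the add-mask workflow command: once a value is registered, GitHub Actions redacts it as *** in later log output, which keeps the runner's workspace prefix out of the public job logs. A small sketch of the behavior (the final echo is only there to show the effect):

# Compute the prefix the same way the new env block does, then register the mask.
WORKSPACE_PREFIX=$(echo "$GITHUB_WORKSPACE" | cut -d '/' -f 1-4)
echo "::add-mask::${WORKSPACE_PREFIX}"

# Any later output containing that value is shown masked in the job log:
echo "checkout path: ${GITHUB_WORKSPACE}"    # rendered as "checkout path: ***/..."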
@@ -32,12 +39,15 @@ jobs:
     needs: check-requirements
     runs-on: [lmtest]
     steps:
+      - name: mask env
+        run: |
+          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
       - uses: actions/checkout@v3
 
       - name: raw-chinese-data
         run: |
           source activate internlm-env-test
-          sh ./ci_scripts/data/tokenizer_chinese.sh
+          sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
 
       - name: alpaca-data
         run: |
@@ -48,19 +58,23 @@ jobs:
     if: ${{ always() }}
     needs: check-requirements
     runs-on: [lmtest]
+    timeout-minutes: 30
     steps:
+      - name: mask env
+        run: |
+          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
       - uses: actions/checkout@v3
 
       - name: slurm-train
         run: |
           source activate internlm-env-test
-          sh ./ci_scripts/train/slurm_train.sh
+          sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rm -rf $GITHUB_WORKSPACE/llm_ckpts
 
       - name: torchrun-train
         run: |
           source activate internlm-env-test
-          sh ./ci_scripts/train/torchrun.sh
+          sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
           rm -rf $GITHUB_WORKSPACE/llm_ckpts
 
   convert-model-then-load:
@@ -68,6 +82,9 @@ jobs:
     needs: check-requirements
     runs-on: [lmtest]
     steps:
+      - name: mask env
+        run: |
+          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
       - uses: actions/checkout@v3
 
       - name: convert-model-then-load
@@ -76,7 +93,7 @@ jobs:
           export PYTHONPATH=$PWD:$PYTHONPATH
           sh ./ci_scripts/model/convert_to_hf.sh
           cd ./hf_ckpt
-          srun -p llm python ../ci_scripts/model/loaded_as_transformer.py
+          srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
           cd ..
           rm -rf $GITHUB_WORKSPACE/hf_ckpt
 
@@ -85,9 +102,12 @@ jobs:
     needs: check-requirements
     runs-on: [lmtest]
     steps:
+      - name: mask env
+        run: |
+          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
       - uses: actions/checkout@v3
 
       - name: chat-model-in-hf
         run: |
           source activate internlm-env-test
-          srun -p llm python ./ci_scripts/model/demo_load_7B_chat_model.py
+          srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py
@@ -10,8 +10,7 @@ on:
 jobs:
   # lint check can be auto-executed by the workflow
   lint-check:
-    runs-on: [internlm]
-    if: github.repository == 'InternLM/InternLM'
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
 
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -x
+
+retry_times=3
+for ((i=1;i<=$retry_times;i++));do
+    jobid=$(squeue -o "%A %j" -u $USER | grep ${GITHUB_RUN_ID}-${GITHUB_JOB} | awk '{print $1}')
+    if [[ -n "$jobid" ]];then
+        echo "The job $jobid will be canceled."
+        scancel $jobid
+        sleep 0.5
+    else
+        echo "There are no more jobs that need to be canceled."
+        break
+    fi
+done
+
+if [[ $i -gt $retry_times ]];then
+    echo "There have been tried $retry_times times. Please contact user $USER to confirm the job status."
+fi
+
+exit 0
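The new file above is the post action script described in the commit message; this view does not show its path, so the name used below is assumed. A hedged usage sketch with placeholder values:

# Hypothetical manual invocation; the script path and the exported values are placeholders.
export GITHUB_RUN_ID=5678901234
export GITHUB_JOB=training-8GPU
bash ./ci_scripts/post_action.sh    # assumed location of the new script

# Expected effect: up to retry_times (3) passes over squeue; every job owned by
# $USER whose name contains "${GITHUB_RUN_ID}-${GITHUB_JOB}" is cancelled, and the
# loop exits early once no matching job id remains. If matches persist after the
# third pass, the script only prints a warning and still exits 0.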
@@ -22,7 +22,7 @@ if [[ ${num} -gt 0 ]]; then
     fi
 fi
 
-srun -p llm python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
+srun -p ${SLURM_PARTITION} --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
 [[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); }
 
 file_list=($RESULT $RESULT_META)
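The updated scripts receive the job name as the positional parameter $1. The commit message also mentions setting a default value to avoid errors; the guard itself is not visible in these hunks, but a common pattern for it looks like this (a sketch under that assumption, not the scripts' actual code):

# SLURM_PARTITION, DATA and RESULT are assumed to be defined earlier, as in the script.
# Fall back to a generic name if no argument was passed, so --job-name never
# expands to an empty string and breaks the srun invocation.
JOB_NAME=${1:-"internlm-ci-default"}
srun -p "${SLURM_PARTITION}" --job-name="${JOB_NAME}" --gpus-per-task=1 \
    python tools/tokenizer.py --text_input_path "${DATA}" --bin_output_path "${RESULT}"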
@@ -19,7 +19,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi
 
-srun -p llm --quotatype=spot -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
+srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
 [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
 
 num=$(num_files "${CKPTS20_OUTPUT}")
@@ -19,7 +19,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
     fi
 fi
 
-srun -p llm -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
+srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 [[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }
 
 num=$(num_files "${CKPTS_OUTPUT}")
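The two training hunks also swap in --exclusive, which matches the commit note about avoiding "port already in use": the torchrun launch pins --master_port=29501, so two CI runs scheduled onto the same node would race for that port. A quick illustration of the conflict being avoided (not part of the change):

# With a shared node, a second torchrun using the same fixed master port fails to
# bind; requesting the node exclusively prevents the overlap in the first place.
if ss -ltn | grep -q ':29501 '; then
    echo "port 29501 is already in use on $(hostname); a co-scheduled job would fail here"
fi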