mirror of https://github.com/InternLM/InternLM
				
				
				
			test(ci_scripts): add timeout settings and clean work after the slurm job (#185)
* restore pr test on develop branch
* add mask
* add post action to cancel slurm job
* remove readonly attribute on job log
* add debug info
* debug job log
* try stdin
* use stdin
* set default value avoid error
* try setting readonly on job log
* performance echo
* remove debug info
* use squeue to check slurm job status
* restore the lost param
* limit retry times
* use exclusive to avoid port already in use
* optimize loop body
* remove partition
* add {} for variables
* set env variable for slurm partition
---------
Co-authored-by: qa-caif-cicd <qa-caif-cicd@pjlab.org.cn>
			
			
				pull/203/head
			
			
		
							parent
							
								
									7cfea534e7
								
							
						
					
					
						commit
						ccb06a98e4
					
				| 
						 | 
				
			
			@ -3,14 +3,21 @@ on:
 | 
			
		|||
  pull_request:
 | 
			
		||||
    branches:
 | 
			
		||||
      - "main"
 | 
			
		||||
      - "develop"
 | 
			
		||||
    paths-ignore:
 | 
			
		||||
      - "docs/**"
 | 
			
		||||
      - "**.md"
 | 
			
		||||
env:
 | 
			
		||||
  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
 | 
			
		||||
  SLURM_PARTITION: llm
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  check-requirements:
 | 
			
		||||
    runs-on: [lmtest]
 | 
			
		||||
    steps:
 | 
			
		||||
    - name: mask env
 | 
			
		||||
      run: |
 | 
			
		||||
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
      with:
 | 
			
		||||
         fetch-depth: 2
 | 
			
		||||
| 
						 | 
				
			
			@ -32,12 +39,15 @@ jobs:
 | 
			
		|||
    needs: check-requirements
 | 
			
		||||
    runs-on: [lmtest]
 | 
			
		||||
    steps:
 | 
			
		||||
    - name: mask env 
 | 
			
		||||
      run: |
 | 
			
		||||
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
 | 
			
		||||
    - name: raw-chinese-data
 | 
			
		||||
      run: |
 | 
			
		||||
        source activate internlm-env-test
 | 
			
		||||
        sh ./ci_scripts/data/tokenizer_chinese.sh
 | 
			
		||||
        sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
 | 
			
		||||
 | 
			
		||||
    - name: alpaca-data
 | 
			
		||||
      run: |
 | 
			
		||||
| 
						 | 
				
			
			@ -48,19 +58,23 @@ jobs:
 | 
			
		|||
    if: ${{ always() }}
 | 
			
		||||
    needs: check-requirements
 | 
			
		||||
    runs-on: [lmtest]
 | 
			
		||||
    timeout-minutes: 30
 | 
			
		||||
    steps:
 | 
			
		||||
    - name: mask env 
 | 
			
		||||
      run: |
 | 
			
		||||
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
 | 
			
		||||
    - name: slurm-train
 | 
			
		||||
      run: |
 | 
			
		||||
        source activate internlm-env-test
 | 
			
		||||
        sh ./ci_scripts/train/slurm_train.sh
 | 
			
		||||
        sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
 | 
			
		||||
        rm -rf $GITHUB_WORKSPACE/llm_ckpts
 | 
			
		||||
 | 
			
		||||
    - name: torchrun-train
 | 
			
		||||
      run: |
 | 
			
		||||
        source activate internlm-env-test
 | 
			
		||||
        sh ./ci_scripts/train/torchrun.sh
 | 
			
		||||
        sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
 | 
			
		||||
        rm -rf $GITHUB_WORKSPACE/llm_ckpts
 | 
			
		||||
 | 
			
		||||
  convert-model-then-load:
 | 
			
		||||
| 
						 | 
				
			
			@ -68,6 +82,9 @@ jobs:
 | 
			
		|||
    needs: check-requirements
 | 
			
		||||
    runs-on: [lmtest]
 | 
			
		||||
    steps:
 | 
			
		||||
    - name: mask env
 | 
			
		||||
      run: |
 | 
			
		||||
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
 | 
			
		||||
    - name: convert-model-then-load
 | 
			
		||||
| 
						 | 
				
			
			@ -76,7 +93,7 @@ jobs:
 | 
			
		|||
        export PYTHONPATH=$PWD:$PYTHONPATH
 | 
			
		||||
        sh ./ci_scripts/model/convert_to_hf.sh 
 | 
			
		||||
        cd ./hf_ckpt
 | 
			
		||||
        srun -p llm python ../ci_scripts/model/loaded_as_transformer.py
 | 
			
		||||
        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
 | 
			
		||||
        cd ..
 | 
			
		||||
        rm -rf $GITHUB_WORKSPACE/hf_ckpt
 | 
			
		||||
  
 | 
			
		||||
| 
						 | 
				
			
			@ -85,9 +102,12 @@ jobs:
 | 
			
		|||
    needs: check-requirements
 | 
			
		||||
    runs-on: [lmtest]
 | 
			
		||||
    steps:
 | 
			
		||||
    - name: mask env 
 | 
			
		||||
      run: |
 | 
			
		||||
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
 | 
			
		||||
    - name: chat-model-in-hf
 | 
			
		||||
      run: |
 | 
			
		||||
        source activate internlm-env-test
 | 
			
		||||
        srun -p llm python ./ci_scripts/model/demo_load_7B_chat_model.py
 | 
			
		||||
        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -10,8 +10,7 @@ on:
 | 
			
		|||
jobs:
 | 
			
		||||
  # lint check can be auto-executed by the workflow
 | 
			
		||||
  lint-check:
 | 
			
		||||
    runs-on: [internlm]
 | 
			
		||||
    if: github.repository == 'InternLM/InternLM'
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,21 @@
 | 
			
		|||
#!/bin/bash
# Cancel any Slurm job left behind by the current GitHub Actions job.
#
# Jobs are looked up by the name "${GITHUB_RUN_ID}-${GITHUB_JOB}", which is the
# value the CI steps pass to `srun --job-name`. The script always exits 0 so
# that a failed cleanup never changes the result of the workflow itself.
set -x

retry_times=3

# Without both identifiers the job name degenerates to "-" (or half a name),
# which could select unrelated jobs of the same user. Refuse to cancel then.
if [[ -z "${GITHUB_RUN_ID:-}" || -z "${GITHUB_JOB:-}" ]]; then
    echo "GITHUB_RUN_ID or GITHUB_JOB is not set; skip canceling slurm jobs."
    exit 0
fi

job_name="${GITHUB_RUN_ID}-${GITHUB_JOB}"
# Fall back to the effective user name when USER is not exported.
job_user="${USER:-$(id -un)}"

for ((i=1;i<=retry_times;i++));do
    # -h drops the header line; --name matches the full job name exactly, so a
    # run id that merely occurs inside another job's id or name is not selected.
    jobid=$(squeue -h -o "%A" -u "${job_user}" --name="${job_name}")
    if [[ -n "$jobid" ]];then
        echo "The job $jobid will be canceled."
        # Intentionally unquoted: squeue prints one id per line and each id
        # must reach scancel as a separate argument.
        scancel $jobid
        # Give the scheduler a moment before checking the queue again.
        sleep 0.5
    else
        echo "There are no more jobs that need to be canceled."
        break
    fi
done

# The loop counter only exceeds retry_times when every attempt still found a job.
if [[ $i -gt $retry_times ]];then
    echo "There have been tried $retry_times times. Please contact user $USER to confirm the job status."
fi

exit 0
 | 
			
		||||
| 
						 | 
				
			
			@ -22,7 +22,7 @@ if [[ ${num} -gt 0 ]]; then
 | 
			
		|||
    fi
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
srun -p llm python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
 | 
			
		||||
srun -p ${SLURM_PARTITION} --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
 | 
			
		||||
[[ $? -ne 0 ]] && { echo "test tokenizer.py failed.";  exit_code=$(($exit_code + 1)); }
 | 
			
		||||
 | 
			
		||||
file_list=($RESULT $RESULT_META)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -19,7 +19,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
 | 
			
		|||
    fi
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
srun -p llm --quotatype=spot -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
 | 
			
		||||
srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
 | 
			
		||||
[[ $? -ne 0 ]] && { echo "test slurm training failed.";  exit_code=$(($exit_code + 1)); }
 | 
			
		||||
 | 
			
		||||
num=$(num_files "${CKPTS20_OUTPUT}")
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -19,7 +19,7 @@ if [[ -d ${CKPTS20_PATH} ]]; then
 | 
			
		|||
    fi
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
srun -p llm -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 | 
			
		||||
srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
 | 
			
		||||
[[ $? -ne 0 ]] && { echo "test torch training failed.";  exit_code=$(($exit_code + 1)); }
 | 
			
		||||
 | 
			
		||||
num=$(num_files "${CKPTS_OUTPUT}")
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue