test(ci_scripts): move ci env (#317)

* change partition and runner label

* change rm action to mv

* use spot quota (`--quotatype=spot`) for srun jobs

* use rsync to move test files

* remove `*` glob

* remove `*` glob

* switch to the llm_s partition

---------

Co-authored-by: wangmengke <wangmengke@pjlab.org.cn>
pull/328/head
kkscilife 2023-09-19 14:52:32 +08:00 committed by GitHub
parent 2710fa7343
commit bfefc4ea3c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 48 additions and 31 deletions

View File

@ -9,11 +9,11 @@ on:
- "**.md" - "**.md"
env: env:
WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4) WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
SLURM_PARTITION: llm SLURM_PARTITION: llm_s
jobs: jobs:
check-requirements: check-requirements:
runs-on: [lmtest] runs-on: [t_cluster]
steps: steps:
- name: mask env - name: mask env
run: | run: |
@ -37,7 +37,7 @@ jobs:
dataset-preparation: dataset-preparation:
if: ${{ always() }} if: ${{ always() }}
needs: check-requirements needs: check-requirements
runs-on: [lmtest] runs-on: [t_cluster]
steps: steps:
- name: mask env - name: mask env
run: | run: |
@ -57,7 +57,7 @@ jobs:
train: train:
if: ${{ always() }} if: ${{ always() }}
needs: check-requirements needs: check-requirements
runs-on: [lmtest] runs-on: [t_cluster]
timeout-minutes: 30 timeout-minutes: 30
steps: steps:
- name: mask env - name: mask env
@ -83,18 +83,19 @@ jobs:
source activate internlm-env-test source activate internlm-env-test
export PYTHONPATH=$PWD:$PYTHONPATH export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB} sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
- name: torchrun-train - name: torchrun-train
run: | run: |
source activate internlm-env-test source activate internlm-env-test
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB} sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts rsync -av --remove-source-files $GITHUB_WORKSPACE/llm_ckpts ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
convert-model-then-load: convert-model-then-load:
if: ${{ always() }} if: ${{ always() }}
needs: check-requirements needs: check-requirements
runs-on: [lmtest] runs-on: [t_cluster]
timeout-minutes: 15
steps: steps:
- name: mask env - name: mask env
run: | run: |
@ -107,13 +108,14 @@ jobs:
export PYTHONPATH=$PWD:$PYTHONPATH export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/model/convert_to_hf.sh sh ./ci_scripts/model/convert_to_hf.sh
cd ./hf_ckpt cd ./hf_ckpt
srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
cd .. cd ..
rm -rf $GITHUB_WORKSPACE/hf_ckpt rsync -av --remove-source-files $GITHUB_WORKSPACE/hf_ckpt ${{env.WORKSPACE_PREFIX}}/ci_clean_bak
load-chat-model-in-hf: load-chat-model-in-hf:
if: ${{ always() }} if: ${{ always() }}
needs: check-requirements needs: check-requirements
runs-on: [lmtest] runs-on: [t_cluster]
timeout-minutes: 15
steps: steps:
- name: mask env - name: mask env
run: | run: |
@ -123,4 +125,4 @@ jobs:
- name: chat-model-in-hf - name: chat-model-in-hf
run: | run: |
source activate internlm-env-test source activate internlm-env-test
srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ./ci_scripts/model/demo_load_7B_chat_model.py

View File

@ -1,3 +1,4 @@
#!/bin/bash #!/bin/bash
readonly DATA_VOLUME=$(echo $GITHUB_WORKSPACE | cut -d '/' -f 1-4)/data readonly DATA_VOLUME=$(echo $GITHUB_WORKSPACE | cut -d '/' -f 1-4)/data
readonly CLEAN_PATH=$(echo $GITHUB_WORKSPACE | cut -d '/' -f 1-4)/ci_clean_bak

View File

@ -3,6 +3,7 @@ set -x
source ./ci_scripts/common/variables.sh source ./ci_scripts/common/variables.sh
[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; } [[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }
[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
readonly SRC_DATASET_META=${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json readonly SRC_DATASET_META=${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json
readonly RESULTS=${DATA_VOLUME}/lm_data/alpaca_data/result readonly RESULTS=${DATA_VOLUME}/lm_data/alpaca_data/result
@ -19,7 +20,7 @@ source ./ci_scripts/common/basic_func.sh
echo "start to test alpaca_tokenizer.py." echo "start to test alpaca_tokenizer.py."
if [[ -d ${RESULTS} ]]; then if [[ -d ${RESULTS} ]]; then
if ! rm -rf ${RESULTS}/*; then if ! rsync -av --remove-source-files ${RESULTS} ${CLEAN_PATH}; then
echo "cleaning test data in ${RESULTS} failed, exit." echo "cleaning test data in ${RESULTS} failed, exit."
exit 1 exit 1
fi fi
@ -41,8 +42,8 @@ for file in ${file_list[@]}; do
fi fi
done done
# clean the test files. # move the test files.
if ! rm -rf ${RESULTS}/*; then if ! rsync -av --remove-source-files ${RESULTS} ${CLEAN_PATH}; then
echo "cleaning test data in ${RESULTS} failed." echo "cleaning test data in ${RESULTS} failed."
exit_code=$(($exit_code + 1)) exit_code=$(($exit_code + 1))
fi fi

View File

@ -2,7 +2,8 @@
set -x set -x
source ./ci_scripts/common/variables.sh source ./ci_scripts/common/variables.sh
[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci."; exit 1; } [[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }
[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
readonly DATA=${DATA_VOLUME}/lm_data/cn_data/raw_data.txt readonly DATA=${DATA_VOLUME}/lm_data/cn_data/raw_data.txt
readonly RESULT=${DATA_VOLUME}/lm_data/cn_data/result.bin readonly RESULT=${DATA_VOLUME}/lm_data/cn_data/result.bin
@ -16,13 +17,13 @@ echo "start to test tokenizer.py."
num=$(num_files "${RESULTS}") num=$(num_files "${RESULTS}")
if [[ ${num} -gt 0 ]]; then if [[ ${num} -gt 0 ]]; then
if ! rm -rf ${RESULTS}; then if ! rsync -av --remove-source-files ${RESULTS} ${CLEAN_PATH}; then
echo "cleaning test data ${RESULTS} failed, exit." echo "cleaning test data ${RESULTS} failed, exit."
exit 1 exit 1
fi fi
fi fi
srun -p ${SLURM_PARTITION} --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT} srun -p ${SLURM_PARTITION} --quotatype=spot --job-name=$1 --gpus-per-task=1 python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
[[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); } [[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); }
file_list=($RESULT $RESULT_META) file_list=($RESULT $RESULT_META)
@ -33,8 +34,8 @@ for file in ${file_list[@]}; do
fi fi
done done
# clean the test files. # move the test files.
if ! rm -rf ${RESULTS}/*; then if ! rsync -av --remove-source-files ${RESULTS} ${CLEAN_PATH}; then
echo "cleaning cached file in ${RESULTS} failed." echo "cleaning cached file in ${RESULTS} failed."
exit_code=$(($exit_code + 1)) exit_code=$(($exit_code + 1))
fi fi

View File

@ -4,6 +4,7 @@ set -x
source ./ci_scripts/common/variables.sh source ./ci_scripts/common/variables.sh
[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; } [[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }
[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
readonly CKPTS_INPUT="${DATA_VOLUME}/lm_data/alpaca_data/llm_ckpts/20" readonly CKPTS_INPUT="${DATA_VOLUME}/lm_data/alpaca_data/llm_ckpts/20"
readonly CKPTS_OUTPUT="${GITHUB_WORKSPACE}/hf_ckpt" readonly CKPTS_OUTPUT="${GITHUB_WORKSPACE}/hf_ckpt"
@ -18,7 +19,7 @@ source ./ci_scripts/common/basic_func.sh
echo "start to test convert2hf.py." echo "start to test convert2hf.py."
if [[ -d ${CKPTS_OUTPUT} ]]; then if [[ -d ${CKPTS_OUTPUT} ]]; then
if ! rm -rf ${CKPTS_OUTPUT}/*; then if ! rsync -av --remove-source-files ${CKPTS_OUTPUT}/* ${CLEAN_PATH}; then
echo "cleaning cached file in ${CKPTS_OUTPUT} failed, exit." echo "cleaning cached file in ${CKPTS_OUTPUT} failed, exit."
exit 1 exit 1
fi fi

View File

@ -1,7 +1,10 @@
#!/bin/bash #!/bin/bash
set -x set -x
source ./ci_scripts/common/variables.sh
[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts" readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40" readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt" readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
@ -19,7 +22,7 @@ if [[ ! -f ${file} ]]; then
exit_code=$(($exit_code + 1)) exit_code=$(($exit_code + 1))
fi fi
srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file} srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); } [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
@ -29,10 +32,12 @@ if [[ ${num} -ne ${expected_num} ]]; then
exit_code=$(($exit_code + 1)) exit_code=$(($exit_code + 1))
fi fi
# clean the test files. # move the test files.
if ! rm -rf ${CKPTS_PATH}/*; then if [[ -d ${CKPTS_PATH} ]]; then
if ! rsync -av --remove-source-files ${CKPTS_PATH} ${CLEAN_PATH}; then
echo "cleaning cached file in ${CKPTS_PATH} failed." echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1)) exit_code=$(($exit_code + 1))
fi
fi fi
exit $exit_code exit $exit_code

View File

@ -1,7 +1,10 @@
#!/bin/bash #!/bin/bash
set -x set -x
source ./ci_scripts/common/variables.sh
[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts" readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20" readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt" readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt"
@ -13,13 +16,13 @@ source ./ci_scripts/common/basic_func.sh
echo "start to test slurm training." echo "start to test slurm training."
if [[ -d ${CKPTS20_PATH} ]]; then if [[ -d ${CKPTS20_PATH} ]]; then
if ! rm -rf ${CKPTS20_PATH}/*; then if ! rsync -av --remove-source-files ${CKPTS20_PATH} ${CLEAN_PATH}; then
echo "cleaning cached file in ${CKPTS20_PATH} failed, exit." echo "cleaning cached file in ${CKPTS20_PATH} failed, exit."
exit 1 exit 1
fi fi
fi fi
srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); } [[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
num=$(num_files "${CKPTS20_OUTPUT}") num=$(num_files "${CKPTS20_OUTPUT}")

View File

@ -1,7 +1,10 @@
#!/bin/bash #!/bin/bash
set -x set -x
source ./ci_scripts/common/variables.sh
[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
[[ -n ${CLEAN_PATH} ]] || { echo "should set CLEAN_PATH first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts" readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20" readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt" readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt"
@ -13,13 +16,13 @@ source ./ci_scripts/common/basic_func.sh
echo "start to test torch training." echo "start to test torch training."
if [[ -d ${CKPTS20_PATH} ]]; then if [[ -d ${CKPTS20_PATH} ]]; then
if ! rm -rf ${CKPTS20_PATH}/*; then if ! rsync -av --remove-source-files ${CKPTS20_PATH} ${CLEAN_PATH}; then
echo "cleaning cached file in ${CKPTS20_PATH} failed, exit." echo "cleaning cached file in ${CKPTS20_PATH} failed, exit."
exit 1 exit 1
fi fi
fi fi
srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch srun -p ${SLURM_PARTITION} --exclusive --quotatype=spot --job-name=$1 -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
[[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); } [[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }
num=$(num_files "${CKPTS_OUTPUT}") num=$(num_files "${CKPTS_OUTPUT}")
@ -28,8 +31,8 @@ if [[ ${num} -ne ${expected_num} ]]; then
exit_code=$(($exit_code + 1)) exit_code=$(($exit_code + 1))
fi fi
# clean the test files. # move the test files.
if ! rm -rf ${CKPTS_PATH}/*; then if ! rsync -av --remove-source-files ${CKPTS_PATH}/* ${CLEAN_PATH}; then
echo "cleaning cached file in ${CKPTS_PATH} failed." echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1)) exit_code=$(($exit_code + 1))
fi fi