test(ci_scripts): clean test data after test, remove unnecessary global variables, and other optimizations (#165)

* test: optimization of ci scripts(variables, test data cleaning, etc).

* chore(workflows): disable ci job on push.

* fix: update partition
pull/178/head
zachtzy 2023-08-03 11:26:51 +08:00 committed by GitHub
parent 7fbf85eac9
commit 585071c95b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 191 additions and 78 deletions

View File

@ -1,14 +1,18 @@
#!/bin/bash #!/bin/bash
export exit_code=0 #######################################
# Calculate the number of files in a directory.
function if_exist() { # Call this function like this: num_files "${file_path}".
ls -l $file_path # Globals:
exit_code_now=$? # None
exit_code=$(($exit_code + $exit_code_now)) # Arguments:
} # $1: the directory path
# Returns:
function num_files() { # the number of files in the directory
file_num=$(ls -l $file_dir |wc -l) #######################################
echo "there are $file_num files in $file_dir" num_files() {
[[ $# -eq 1 ]] || return 1
local file_num
file_num=$(ls -l $1 | grep '^-' | wc -l)
echo $file_num
} }

View File

@ -0,0 +1,4 @@
#!/bin/bash
# TODO: variable definition should be in repo configuration.
readonly DATA_VOLUME=${DATA_VOLUME:-"/mnt/petrelfs/qa-caif-cicd/data"}

View File

@ -1,22 +1,50 @@
#!/bin/bash #!/bin/bash
set -x
rm -rf /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/* source ./ci_scripts/common/variables.sh
[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }
python tools/alpaca_tokenizer.py /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/alpaca_data.json /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result tools/V7_sft.model --split_ratio 0.1 readonly SRC_DATASET_META=${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json
readonly RESULTS=${DATA_VOLUME}/lm_data/alpaca_data/result
readonly TRAIN_DATASET=${RESULTS}/train/en/dataset.bin
readonly TRAIN_DATASET_META=${RESULTS}/train/en/dataset.bin.meta
readonly VALID_DATASET=${RESULTS}/valid/en/dataset.bin
readonly VALID_DATASET_META=${RESULTS}/valid/en/dataset.bin.meta
file_one="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/train/en/dataset.bin" split_ratio=0.1
file_two="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/train/en/dataset.bin.meta" exit_code=0
file_three="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/valid/en/dataset.bin"
file_four="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/valid/en/dataset.bin.meta"
file_list=($file_one $file_two $file_three $file_four)
source ./ci_scripts/common/basic_func.sh source ./ci_scripts/common/basic_func.sh
for file_path in ${file_list[@]};
do echo "start to test alpaca_tokenizer.py."
if_exist $file_path
if [[ -d ${RESULTS} ]]; then
if ! rm -rf ${RESULTS}/*; then
echo "cleaning test data in ${RESULTS} failed, exit."
exit 1
fi
fi
if [[ ! -f ${SRC_DATASET_META} ]]; then
echo "${SRC_DATASET_META} should be exist, exit."
exit 1
fi
python tools/alpaca_tokenizer.py ${SRC_DATASET_META} ${RESULTS} tools/V7_sft.model --split_ratio ${split_ratio}
[[ $? -ne 0 ]] && { echo "test alpaca_tokenizer.py failed."; exit_code=$(($exit_code + 1)); }
file_list=(${TRAIN_DATASET} ${TRAIN_DATASET_META} ${VALID_DATASET} ${VALID_DATASET_META})
for file in ${file_list[@]}; do
if [[ ! -f ${file} ]]; then
echo "expect: ${file} exists, actual: not exist."
exit_code=$(($exit_code + 1))
fi
done done
if [ $exit_code -ne 0 ] # clean the test files.
then if ! rm -rf ${RESULTS}/*; then
exit 1 echo "cleaning test data in ${RESULTS} failed."
exit_code=$(($exit_code + 1))
fi fi
exit $exit_code

View File

@ -1,19 +1,42 @@
#!/bin/bash #!/bin/bash
set -x
rm -rf /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.* source ./ci_scripts/common/variables.sh
srun -p llm python tools/tokenizer.py --text_input_path /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/raw_data.txt --bin_output_path /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin [[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci."; exit 1; }
file_one="/mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin" readonly DATA=${DATA_VOLUME}/lm_data/cn_data/raw_data.txt
file_two="/mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin.meta" readonly RESULT=${DATA_VOLUME}/lm_data/cn_data/result.bin
file_list=($file_one $file_two) readonly RESULT_META=${DATA_VOLUME}/lm_data/cn_data/result.bin.meta
readonly RESULTS=${DATA_VOLUME}/lm_data/cn_data/result.*
exit_code=0
source ./ci_scripts/common/basic_func.sh source ./ci_scripts/common/basic_func.sh
for file_path in ${file_list[@]};
do echo "start to test tokenizer.py."
if_exist $file_path
num=$(num_files "${RESULTS}")
if [[ ${num} -gt 0 ]]; then
if ! rm -rf ${RESULTS}; then
echo "cleaning test data ${RESULTS} failed, exit."
exit 1
fi
fi
srun -p llm python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
[[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); }
file_list=($RESULT $RESULT_META)
for file in ${file_list[@]}; do
if [[ ! -f ${file} ]]; then
echo "expect: ${file} exists, actual: not exist."
exit_code=$(($exit_code + 1))
fi
done done
if [ $exit_code -ne 0 ] # clean the test files.
then if ! rm -rf ${RESULTS}/*; then
exit 1 echo "cleaning cached file in ${RESULTS} failed."
exit_code=$(($exit_code + 1))
fi fi
exit $exit_code

View File

@ -1,33 +1,47 @@
#!/bin/bash #!/bin/bash
set -x
rm -rf ./hf_ckpt/* source ./ci_scripts/common/variables.sh
python ./tools/transformers/convert2hf.py --src_folder /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/llm_ckpts/20 --tgt_folder hf_ckpt/ --tokenizer ./tools/V7_sft.model [[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }
[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
#assert exists model readonly CKPTS_INPUT="${DATA_VOLUME}/lm_data/alpaca_data/llm_ckpts/20"
file_one="$GITHUB_WORKSPACE/hf_ckpt/tokenizer.model" readonly CKPTS_OUTPUT="${GITHUB_WORKSPACE}/hf_ckpt"
file_two="$GITHUB_WORKSPACE/hf_ckpt/config.json" readonly TOKENIZER="${GITHUB_WORKSPACE}/hf_ckpt/tokenizer.model"
file_three="$GITHUB_WORKSPACE/hf_ckpt/modeling_internlm.py" readonly CONFIG="${GITHUB_WORKSPACE}/hf_ckpt/config.json"
file_list=($file_one $file_two $file_three) readonly INERNLM="${GITHUB_WORKSPACE}/hf_ckpt/modeling_internlm.py"
file_dir="$GITHUB_WORKSPACE/hf_ckpt/*" exit_code=0
expected_num=9
source ./ci_scripts/common/basic_func.sh source ./ci_scripts/common/basic_func.sh
for file_path in ${file_list[@]}; echo "start to test convert2hf.py."
do
if_exist $file_path if [[ -d ${CKPTS_OUTPUT} ]]; then
if ! rm -rf ${CKPTS_OUTPUT}/*; then
echo "cleaning cached file in ${CKPTS_OUTPUT} failed, exit."
exit 1
fi
fi
python ./tools/transformers/convert2hf.py --src_folder ${CKPTS_INPUT} --tgt_folder ${CKPTS_OUTPUT} --tokenizer ./tools/V7_sft.model
[[ $? -ne 0 ]] && { echo "test convert2hf.py failed."; exit_code=$(($exit_code + 1)); }
#assert exists model
file_list=($TOKENIZER $CONFIG $INERNLM)
for file in ${file_list[@]}; do
if [[ ! -f ${file} ]];then
echo "file ${file} does not exist."
exit_code=$(($exit_code + 1))
fi
done done
num=$(num_files "${CKPTS_OUTPUT}")
num_files ${file_dir} if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
if [ $file_num -ne 9 ]
then
echo "The num of files is not right"
ls -l $file_dir
exit_code=$(($exit_code + 1)) exit_code=$(($exit_code + 1))
fi fi
if [ $exit_code -ne 0 ] # NOTICE: should not remove the cached files, because the cached files will be used in the next test case.
then exit $exit_code
exit 1
fi

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from transformers import AutoTokenizer, AutoModelForCausalLM from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)

View File

@ -1,3 +1,5 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from transformers import AutoModel from transformers import AutoModel
model = AutoModel.from_pretrained("../hf_ckpt/", trust_remote_code=True).cuda() model = AutoModel.from_pretrained("../hf_ckpt/", trust_remote_code=True).cuda()

View File

@ -1,20 +1,37 @@
#!/bin/bash #!/bin/bash
set -x
rm -rf $GITHUB_WORKSPACE/llm_ckpts/20 [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt"
expected_num=21
exit_code=0
srun -p llm --quotatype=spot -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
file_dir="$GITHUB_WORKSPACE/llm_ckpts/20/*.pt"
source ./ci_scripts/common/basic_func.sh source ./ci_scripts/common/basic_func.sh
num_files ${file_dir} echo "start to test slurm training."
if [ $file_num -ne 21 ] if [[ -d ${CKPTS20_PATH} ]]; then
then if ! rm -rf ${CKPTS20_PATH}/*; then
echo "The num of files is not right" echo "cleaning cached file in ${CKPTS20_PATH} failed, exit."
ls -l $file_dir exit 1
rm -rf $GITHUB_WORKSPACE/llm_ckpts fi
exit 1
fi fi
srun -p llm --quotatype=spot -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
num=$(num_files "${CKPTS20_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
exit_code=$(($exit_code + 1))
fi
# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1))
fi
exit $exit_code

View File

@ -1,17 +1,37 @@
#!/bin/bash #!/bin/bash
set -x
rm -rf $GITHUB_WORKSPACE/llm_ckpts/20 [[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
srun -p llm -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher "torch" readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt"
expected_num=21
exit_code=0
file_dir="$GITHUB_WORKSPACE/llm_ckpts/20/*.pt"
source ./ci_scripts/common/basic_func.sh source ./ci_scripts/common/basic_func.sh
num_files ${file_dir} echo "start to test torch training."
if [ $file_num -ne 21 ] if [[ -d ${CKPTS20_PATH} ]]; then
then if ! rm -rf ${CKPTS20_PATH}/*; then
echo "The num of files is not right" echo "cleaning cached file in ${CKPTS20_PATH} failed, exit."
ls -l $file_dir exit 1
rm -rf $GITHUB_WORKSPACE/llm_ckpts fi
exit 1
fi fi
srun -p llm -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch
[[ $? -ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); }
num=$(num_files "${CKPTS_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
exit_code=$(($exit_code + 1))
fi
# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1))
fi
exit $exit_code