# Unified diff (git format) refactoring the CI helper scripts under ci_scripts/.
# NOTE(review): the patch's line breaks appear to have been collapsed in this
# copy, so the "@@ -a,b +c,d @@" hunk line counts cannot be re-checked here.
#
# What the patch changes, file by file:
#   ci_scripts/common/basic_func.sh
#     - drops `export exit_code=0` and the if_exist() helper.
#     - num_files() now takes the path as $1, returns 1 unless exactly one
#       argument is passed, and echoes the number of `ls -l` lines that start
#       with '-' instead of printing a "there are N files" message.
#       $1 is expanded unquoted inside the function, and callers pass glob
#       patterns (".../result.*", ".../20/*.pt") as a quoted string.
#   ci_scripts/common/variables.sh (new file)
#     - declares readonly DATA_VOLUME, defaulting to
#       /mnt/petrelfs/qa-caif-cicd/data when not already set.
#   ci_scripts/data/tokenizer_alpaca.sh, ci_scripts/data/tokenizer_chinese.sh
#     - add `set -x`, source variables.sh, build every path from DATA_VOLUME,
#       clean old results before the run, count failures in exit_code
#       (tool exit status + one per missing expected file), clean up after the
#       run and finish with `exit $exit_code`.
#   ci_scripts/model/convert_to_hf.sh
#     - same structure; requires DATA_VOLUME and GITHUB_WORKSPACE, checks three
#       expected files and compares num_files of the output dir with
#       expected_num=9; deliberately leaves the output in place (see the
#       NOTICE comment in the hunk).
#   ci_scripts/model/demo_load_7B_chat_model.py, loaded_as_transformer.py
#     - only add a shebang and an encoding header line.
#   ci_scripts/train/slurm_train.sh, ci_scripts/train/torchrun.sh
#     - same structure; require GITHUB_WORKSPACE, expect 21 "*.pt" files under
#       llm_ckpts/20, then remove everything under llm_ckpts and exit with the
#       accumulated exit_code (the old code only cleaned up on a count mismatch).
#
# NOTE(review): in tokenizer_chinese.sh the final cleanup runs
#   `rm -rf ${RESULTS}/*` while RESULTS is ".../cn_data/result.*"; that pattern
#   only matches entries inside directories named result.*, so result.bin and
#   result.bin.meta look like they are left behind. The pre-run cleanup uses
#   `rm -rf ${RESULTS}`, which is presumably what was intended here too - confirm.
# NOTE(review): num_files() parses `ls -l` output and depends on unquoted
#   expansion of $1; paths containing whitespace would be miscounted - confirm
#   that CI paths never contain spaces.
# NOTE(review): spelling in the added lines - "defination" (definition),
#   "should be exist" (should exist), and the constant INERNLM (presumably
#   INTERNLM); left untouched here because they are part of the patch content.
diff --git a/ci_scripts/common/basic_func.sh b/ci_scripts/common/basic_func.sh index f9bb196..8ce1c54 100644 --- a/ci_scripts/common/basic_func.sh +++ b/ci_scripts/common/basic_func.sh @@ -1,14 +1,18 @@ #!/bin/bash -export exit_code=0 - -function if_exist() { -ls -l $file_path -exit_code_now=$? -exit_code=$(($exit_code + $exit_code_now)) -} - -function num_files() { -file_num=$(ls -l $file_dir |wc -l) -echo "there are $file_num files in $file_dir" +####################################### +# Calculate the number of files in a directory. +# Call this function like this: num_files "${file_path}". +# Globals: +# None +# Arguments: +# $1: the directory path +# Returns: +# the number of files in the directory +####################################### +num_files() { + [[ $# -eq 1 ]] || return 1 + local file_num + file_num=$(ls -l $1 | grep '^-' | wc -l) + echo $file_num } diff --git a/ci_scripts/common/variables.sh b/ci_scripts/common/variables.sh new file mode 100644 index 0000000..5dcc5fa --- /dev/null +++ b/ci_scripts/common/variables.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +# TODO: variable defination should be in repo configures. 
+readonly DATA_VOLUME=${DATA_VOLUME:-"/mnt/petrelfs/qa-caif-cicd/data"} \ No newline at end of file diff --git a/ci_scripts/data/tokenizer_alpaca.sh b/ci_scripts/data/tokenizer_alpaca.sh index e8ccac7..0d06455 100644 --- a/ci_scripts/data/tokenizer_alpaca.sh +++ b/ci_scripts/data/tokenizer_alpaca.sh @@ -1,22 +1,50 @@ #!/bin/bash +set -x -rm -rf /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/* +source ./ci_scripts/common/variables.sh +[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; } -python tools/alpaca_tokenizer.py /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/alpaca_data.json /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result tools/V7_sft.model --split_ratio 0.1 +readonly SRC_DATASET_META=${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json +readonly RESULTS=${DATA_VOLUME}/lm_data/alpaca_data/result +readonly TRAIN_DATASET=${RESULTS}/train/en/dataset.bin +readonly TRAIN_DATASET_META=${RESULTS}/train/en/dataset.bin.meta +readonly VALID_DATASET=${RESULTS}/valid/en/dataset.bin +readonly VALID_DATASET_META=${RESULTS}/valid/en/dataset.bin.meta -file_one="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/train/en/dataset.bin" -file_two="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/train/en/dataset.bin.meta" -file_three="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/valid/en/dataset.bin" -file_four="/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/result/valid/en/dataset.bin.meta" -file_list=($file_one $file_two $file_three $file_four) +split_ratio=0.1 +exit_code=0 source ./ci_scripts/common/basic_func.sh -for file_path in ${file_list[@]}; -do -if_exist $file_path + +echo "start to test alpaca_tokenizer.py." + +if [[ -d ${RESULTS} ]]; then + if ! rm -rf ${RESULTS}/*; then + echo "cleaning test data in ${RESULTS} failed, exit." + exit 1 + fi +fi + +if [[ ! -f ${SRC_DATASET_META} ]]; then + echo "${SRC_DATASET_META} should be exist, exit." 
+ exit 1 +fi + +python tools/alpaca_tokenizer.py ${SRC_DATASET_META} ${RESULTS} tools/V7_sft.model --split_ratio ${split_ratio} +[[ $? -ne 0 ]] && { echo "test alpaca_tokenizer.py failed."; exit_code=$(($exit_code + 1)); } + +file_list=(${TRAIN_DATASET} ${TRAIN_DATASET_META} ${VALID_DATASET} ${VALID_DATASET_META}) +for file in ${file_list[@]}; do + if [[ ! -f ${file} ]]; then + echo "expect: ${file} exists, actual: not exist." + exit_code=$(($exit_code + 1)) + fi done -if [ $exit_code -ne 0 ] -then - exit 1 +# clean the test files. +if ! rm -rf ${RESULTS}/*; then + echo "cleaning test data in ${RESULTS} failed." + exit_code=$(($exit_code + 1)) fi + +exit $exit_code diff --git a/ci_scripts/data/tokenizer_chinese.sh b/ci_scripts/data/tokenizer_chinese.sh index 99241e7..d427c0b 100644 --- a/ci_scripts/data/tokenizer_chinese.sh +++ b/ci_scripts/data/tokenizer_chinese.sh @@ -1,19 +1,42 @@ #!/bin/bash +set -x -rm -rf /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.* -srun -p llm python tools/tokenizer.py --text_input_path /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/raw_data.txt --bin_output_path /mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin +source ./ci_scripts/common/variables.sh +[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci."; exit 1; } -file_one="/mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin" -file_two="/mnt/petrelfs/qa-caif-cicd/data/lm_data/cn_data/result.bin.meta" -file_list=($file_one $file_two) +readonly DATA=${DATA_VOLUME}/lm_data/cn_data/raw_data.txt +readonly RESULT=${DATA_VOLUME}/lm_data/cn_data/result.bin +readonly RESULT_META=${DATA_VOLUME}/lm_data/cn_data/result.bin.meta +readonly RESULTS=${DATA_VOLUME}/lm_data/cn_data/result.* +exit_code=0 source ./ci_scripts/common/basic_func.sh -for file_path in ${file_list[@]}; -do -if_exist $file_path + +echo "start to test tokenizer.py." + +num=$(num_files "${RESULTS}") +if [[ ${num} -gt 0 ]]; then + if ! 
rm -rf ${RESULTS}; then + echo "cleaning test data ${RESULTS} failed, exit." + exit 1 + fi +fi + +srun -p llm python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT} +[[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); } + +file_list=($RESULT $RESULT_META) +for file in ${file_list[@]}; do + if [[ ! -f ${file} ]]; then + echo "expect: ${file} exists, actual: not exist." + exit_code=$(($exit_code + 1)) + fi done -if [ $exit_code -ne 0 ] -then - exit 1 +# clean the test files. +if ! rm -rf ${RESULTS}/*; then + echo "cleaning cached file in ${RESULTS} failed." + exit_code=$(($exit_code + 1)) fi + +exit $exit_code diff --git a/ci_scripts/model/convert_to_hf.sh b/ci_scripts/model/convert_to_hf.sh index 385bba5..162946d 100644 --- a/ci_scripts/model/convert_to_hf.sh +++ b/ci_scripts/model/convert_to_hf.sh @@ -1,33 +1,47 @@ #!/bin/bash +set -x -rm -rf ./hf_ckpt/* -python ./tools/transformers/convert2hf.py --src_folder /mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/llm_ckpts/20 --tgt_folder hf_ckpt/ --tokenizer ./tools/V7_sft.model +source ./ci_scripts/common/variables.sh +[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; } +[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } -#assert exists model -file_one="$GITHUB_WORKSPACE/hf_ckpt/tokenizer.model" -file_two="$GITHUB_WORKSPACE/hf_ckpt/config.json" -file_three="$GITHUB_WORKSPACE/hf_ckpt/modeling_internlm.py" -file_list=($file_one $file_two $file_three) -file_dir="$GITHUB_WORKSPACE/hf_ckpt/*" +readonly CKPTS_INPUT="${DATA_VOLUME}/lm_data/alpaca_data/llm_ckpts/20" +readonly CKPTS_OUTPUT="${GITHUB_WORKSPACE}/hf_ckpt" +readonly TOKENIZER="${GITHUB_WORKSPACE}/hf_ckpt/tokenizer.model" +readonly CONFIG="${GITHUB_WORKSPACE}/hf_ckpt/config.json" +readonly INERNLM="${GITHUB_WORKSPACE}/hf_ckpt/modeling_internlm.py" +exit_code=0 +expected_num=9 source 
./ci_scripts/common/basic_func.sh -for file_path in ${file_list[@]}; -do -if_exist $file_path +echo "start to test convert2hf.py." + +if [[ -d ${CKPTS_OUTPUT} ]]; then + if ! rm -rf ${CKPTS_OUTPUT}/*; then + echo "cleaning cached file in ${CKPTS_OUTPUT} failed, exit." + exit 1 + fi +fi + +python ./tools/transformers/convert2hf.py --src_folder ${CKPTS_INPUT} --tgt_folder ${CKPTS_OUTPUT} --tokenizer ./tools/V7_sft.model +[[ $? -ne 0 ]] && { echo "test convert2hf.py failed."; exit_code=$(($exit_code + 1)); } + +#assert exists model +file_list=($TOKENIZER $CONFIG $INERNLM) +for file in ${file_list[@]}; do + if [[ ! -f ${file} ]];then + echo "file ${file} does not exist." + exit_code=$(($exit_code + 1)) + fi done +num=$(num_files "${CKPTS_OUTPUT}") -num_files ${file_dir} - -if [ $file_num -ne 9 ] -then - echo "The num of files is not right" - ls -l $file_dir +if [[ ${num} -ne ${expected_num} ]]; then + echo "expect: ${expected_num} files, actual: ${num} files." exit_code=$(($exit_code + 1)) fi -if [ $exit_code -ne 0 ] -then - exit 1 -fi +# NOTICE: should not remove the cached files, because the cached files will be used in the next test case. 
+exit $exit_code diff --git a/ci_scripts/model/demo_load_7B_chat_model.py b/ci_scripts/model/demo_load_7B_chat_model.py index 61cec0d..695be27 100644 --- a/ci_scripts/model/demo_load_7B_chat_model.py +++ b/ci_scripts/model/demo_load_7B_chat_model.py @@ -1,4 +1,5 @@ - +#!/usr/bin/env python +# -*- encoding: utf-8 -*- from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True) diff --git a/ci_scripts/model/loaded_as_transformer.py b/ci_scripts/model/loaded_as_transformer.py index 5e3d28d..5254fb9 100644 --- a/ci_scripts/model/loaded_as_transformer.py +++ b/ci_scripts/model/loaded_as_transformer.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- from transformers import AutoModel model = AutoModel.from_pretrained("../hf_ckpt/", trust_remote_code=True).cuda() diff --git a/ci_scripts/train/slurm_train.sh b/ci_scripts/train/slurm_train.sh index 9759c0e..2ece5e5 100644 --- a/ci_scripts/train/slurm_train.sh +++ b/ci_scripts/train/slurm_train.sh @@ -1,20 +1,37 @@ #!/bin/bash +set -x -rm -rf $GITHUB_WORKSPACE/llm_ckpts/20 +[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } +readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts" +readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20" +readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt" +expected_num=21 +exit_code=0 -srun -p llm --quotatype=spot -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py - -file_dir="$GITHUB_WORKSPACE/llm_ckpts/20/*.pt" source ./ci_scripts/common/basic_func.sh -num_files ${file_dir} +echo "start to test slurm training." -if [ $file_num -ne 21 ] -then - echo "The num of files is not right" - ls -l $file_dir - rm -rf $GITHUB_WORKSPACE/llm_ckpts - exit 1 +if [[ -d ${CKPTS20_PATH} ]]; then + if ! rm -rf ${CKPTS20_PATH}/*; then + echo "cleaning cached file in ${CKPTS20_PATH} failed, exit." 
+ exit 1 + fi fi +srun -p llm --quotatype=spot -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./ci_scripts/train/ci_7B_sft.py +[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); } +num=$(num_files "${CKPTS20_OUTPUT}") +if [[ ${num} -ne ${expected_num} ]]; then + echo "expect: ${expected_num} files, actual: ${num} files." + exit_code=$(($exit_code + 1)) +fi + +# clean the test files. +if ! rm -rf ${CKPTS_PATH}/*; then + echo "cleaning cached file in ${CKPTS_PATH} failed." + exit_code=$(($exit_code + 1)) +fi + +exit $exit_code diff --git a/ci_scripts/train/torchrun.sh b/ci_scripts/train/torchrun.sh index db781f0..9eb4220 100644 --- a/ci_scripts/train/torchrun.sh +++ b/ci_scripts/train/torchrun.sh @@ -1,17 +1,37 @@ #!/bin/bash +set -x -rm -rf $GITHUB_WORKSPACE/llm_ckpts/20 -srun -p llm -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher "torch" +[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; } +readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts" +readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20" +readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt" +expected_num=21 +exit_code=0 -file_dir="$GITHUB_WORKSPACE/llm_ckpts/20/*.pt" source ./ci_scripts/common/basic_func.sh -num_files ${file_dir} +echo "start to test torch training." -if [ $file_num -ne 21 ] -then - echo "The num of files is not right" - ls -l $file_dir - rm -rf $GITHUB_WORKSPACE/llm_ckpts - exit 1 +if [[ -d ${CKPTS20_PATH} ]]; then + if ! rm -rf ${CKPTS20_PATH}/*; then + echo "cleaning cached file in ${CKPTS20_PATH} failed, exit." + exit 1 + fi fi + +srun -p llm -N 1 torchrun --nnodes=1 --nproc_per_node=8 --master_port=29501 train.py --config ./ci_scripts/train/ci_7B_sft.py --launcher torch +[[ $? 
-ne 0 ]] && { echo "test torch training failed."; exit_code=$(($exit_code + 1)); } + +num=$(num_files "${CKPTS_OUTPUT}") +if [[ ${num} -ne ${expected_num} ]]; then + echo "expect: ${expected_num} files, actual: ${num} files." + exit_code=$(($exit_code + 1)) +fi + +# clean the test files. +if ! rm -rf ${CKPTS_PATH}/*; then + echo "cleaning cached file in ${CKPTS_PATH} failed." + exit_code=$(($exit_code + 1)) +fi + +exit $exit_code