#!/bin/bash
#
# CI check for tools/alpaca_tokenizer.py.
#
# Runs the tokenizer on the alpaca JSON under ${DATA_VOLUME} and verifies that
# the train/valid dataset.bin files and their .meta companions were written.
#
# Requirements:
#   - Must be run from the repository root (all helper paths are relative).
#   - DATA_VOLUME must be non-empty once ci_scripts/common/variables.sh has
#     been sourced.
#
# Exit status:
#   0   every check passed
#   1   a precondition failed (DATA_VOLUME unset, stale results could not be
#       removed, or the source JSON is missing)
#   N   otherwise, the number of failed checks (tokenizer run, each missing
#       output file, final cleanup)

# Trace every command so the CI log shows exactly what was executed.
set -x

source ./ci_scripts/common/variables.sh

[[ -n "${DATA_VOLUME}" ]] || {
  echo "should set DATA_VOLUME first before ci, exit."
  exit 1
}

readonly SRC_DATASET_META="${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json"
readonly RESULTS="${DATA_VOLUME}/lm_data/alpaca_data/result"
readonly TRAIN_DATASET="${RESULTS}/train/en/dataset.bin"
readonly TRAIN_DATASET_META="${RESULTS}/train/en/dataset.bin.meta"
readonly VALID_DATASET="${RESULTS}/valid/en/dataset.bin"
readonly VALID_DATASET_META="${RESULTS}/valid/en/dataset.bin.meta"

# Forwarded verbatim as --split_ratio; presumably the validation fraction --
# TODO confirm against tools/alpaca_tokenizer.py.
split_ratio=0.1

# Failures are counted rather than aborting immediately, so one run reports
# every broken check.
exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test alpaca_tokenizer.py."

# Remove output left over from a previous run so the existence checks below
# cannot pass on stale files. ${RESULTS:?} aborts instead of expanding to "/*"
# should the variable ever be empty.
if [[ -d "${RESULTS}" ]]; then
  if ! rm -rf -- "${RESULTS:?}"/*; then
    echo "cleaning test data in ${RESULTS} failed, exit."
    exit 1
  fi
fi

if [[ ! -f "${SRC_DATASET_META}" ]]; then
  echo "${SRC_DATASET_META} should be exist, exit."
  exit 1
fi

# Test the command's status directly instead of inspecting $? afterwards.
# A failure is recorded but does not stop the script: the file checks below
# still run and report which outputs are missing.
if ! python tools/alpaca_tokenizer.py \
  "${SRC_DATASET_META}" "${RESULTS}" tools/V7_sft.model \
  --split_ratio "${split_ratio}"; then
  echo "test alpaca_tokenizer.py failed."
  exit_code=$((exit_code + 1))
fi

file_list=(
  "${TRAIN_DATASET}"
  "${TRAIN_DATASET_META}"
  "${VALID_DATASET}"
  "${VALID_DATASET_META}"
)
for file in "${file_list[@]}"; do
  if [[ ! -f "${file}" ]]; then
    echo "expect: ${file} exists, actual: not exist."
    exit_code=$((exit_code + 1))
  fi
done

# clean the test files.
if ! rm -rf -- "${RESULTS:?}"/*; then
  echo "cleaning test data in ${RESULTS} failed."
  exit_code=$((exit_code + 1))
fi

exit "${exit_code}"