InternLM/ci_scripts/data/tokenizer_alpaca.sh

51 lines
1.5 KiB
Bash

#!/bin/bash
#
# CI check for tools/alpaca_tokenizer.py.
#
# Runs the tokenizer on the alpaca JSON file found under ${DATA_VOLUME} and
# verifies that the train/valid dataset.bin files and their .meta companions
# were produced.
#
# Environment:
#   DATA_VOLUME  must be non-empty once ci_scripts/common/variables.sh has
#                been sourced; root directory holding lm_data/alpaca_data/.
#
# All script and tool paths are relative, so this must be launched from a
# directory that contains ci_scripts/ and tools/.
#
# Exit status:
#   0  every check passed.
#   1  a precondition failed (DATA_VOLUME empty, stale results could not be
#      removed, or the source JSON is missing).
#   N  otherwise, the number of failed checks (tokenizer run, each missing
#      output file, final cleanup).
set -x
source ./ci_scripts/common/variables.sh

if [[ -z "${DATA_VOLUME}" ]]; then
  echo "should set DATA_VOLUME first before ci, exit."
  exit 1
fi

readonly SRC_DATASET_META="${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json"
readonly RESULTS="${DATA_VOLUME}/lm_data/alpaca_data/result"
readonly TRAIN_DATASET="${RESULTS}/train/en/dataset.bin"
readonly TRAIN_DATASET_META="${RESULTS}/train/en/dataset.bin.meta"
readonly VALID_DATASET="${RESULTS}/valid/en/dataset.bin"
readonly VALID_DATASET_META="${RESULTS}/valid/en/dataset.bin.meta"
split_ratio=0.1
exit_code=0

# NOTE(review): nothing below visibly calls a helper from basic_func.sh; it is
# still sourced at the same point as before in case it has side effects —
# TODO confirm whether it is needed.
source ./ci_scripts/common/basic_func.sh

echo "start to test alpaca_tokenizer.py."

# Start from an empty result directory so leftovers from an earlier run cannot
# satisfy the existence checks below. ${RESULTS:?} aborts instead of expanding
# to "/*" should the variable ever be empty; "--" stops option parsing.
if [[ -d "${RESULTS}" ]]; then
  if ! rm -rf -- "${RESULTS:?}"/*; then
    echo "cleaning test data in ${RESULTS} failed, exit."
    exit 1
  fi
fi

if [[ ! -f "${SRC_DATASET_META}" ]]; then
  echo "${SRC_DATASET_META} should be exist, exit."
  exit 1
fi

# Test the command directly rather than inspecting $? afterwards. A failure is
# counted but does not stop the script, so the file checks still report.
if ! python tools/alpaca_tokenizer.py "${SRC_DATASET_META}" "${RESULTS}" \
  tools/V7_sft.model --split_ratio "${split_ratio}"; then
  echo "test alpaca_tokenizer.py failed."
  exit_code=$((exit_code + 1))
fi

# Each expected output must exist as a regular file.
file_list=(
  "${TRAIN_DATASET}"
  "${TRAIN_DATASET_META}"
  "${VALID_DATASET}"
  "${VALID_DATASET_META}"
)
for file in "${file_list[@]}"; do
  if [[ ! -f "${file}" ]]; then
    echo "expect: ${file} exists, actual: not exist."
    exit_code=$((exit_code + 1))
  fi
done

# clean the test files.
if ! rm -rf -- "${RESULTS:?}"/*; then
  echo "cleaning test data in ${RESULTS} failed."
  exit_code=$((exit_code + 1))
fi

exit "${exit_code}"