mirror of https://github.com/InternLM/InternLM
51 lines
1.5 KiB
Bash
51 lines
1.5 KiB
Bash
#!/bin/bash
# CI check for tools/alpaca_tokenizer.py.
# DATA_VOLUME must be non-empty once variables.sh has been sourced; every
# dataset path below is derived from it.

set -x

source ./ci_scripts/common/variables.sh

# Without a data root none of the paths below are meaningful.
if [[ -z "${DATA_VOLUME}" ]]; then
  echo "should set DATA_VOLUME first before ci, exit."
  exit 1
fi

# Input JSON handed to the tokenizer, and the results directory.
readonly SRC_DATASET_META="${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json"
readonly RESULTS="${DATA_VOLUME}/lm_data/alpaca_data/result"

# Files whose existence is verified after the tokenizer has run.
readonly TRAIN_DATASET="${RESULTS}/train/en/dataset.bin"
readonly TRAIN_DATASET_META="${RESULTS}/train/en/dataset.bin.meta"
readonly VALID_DATASET="${RESULTS}/valid/en/dataset.bin"
readonly VALID_DATASET_META="${RESULTS}/valid/en/dataset.bin.meta"

# Value passed to the tokenizer through --split_ratio.
split_ratio=0.1
# Running count of failed checks; used as the script's exit status.
exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test alpaca_tokenizer.py."

# Empty the results directory before the run so files left over from an
# earlier run cannot satisfy the existence checks made after tokenizing.
# - The path is quoted so whitespace or glob characters in it cannot make
#   the recursive delete hit unintended targets.
# - ${RESULTS:?} aborts instead of expanding to "/*" if the variable is
#   ever empty or unset.
# - "--" stops rm from treating a path that starts with "-" as an option.
if [[ -d "${RESULTS}" ]]; then
  if ! rm -rf -- "${RESULTS:?}"/*; then
    echo "cleaning test data in ${RESULTS} failed, exit."
    exit 1
  fi
fi

# The source JSON is required input for the tokenizer; stop right away
# when it is not a regular file.
[[ -f "${SRC_DATASET_META}" ]] || {
  echo "${SRC_DATASET_META} should be exist, exit."
  exit 1
}

# Invoke the tokenizer with the source JSON, the results directory, the
# tokenizer model and the split ratio. Every expansion is quoted so a path
# containing whitespace or glob characters reaches python as one argument.
# The command is tested directly instead of inspecting $? afterwards, and a
# failure is only counted (not fatal) so the output checks still run.
if ! python tools/alpaca_tokenizer.py "${SRC_DATASET_META}" "${RESULTS}" \
  tools/V7_sft.model --split_ratio "${split_ratio}"; then
  echo "test alpaca_tokenizer.py failed."
  exit_code=$((exit_code + 1))
fi

# Each of these files must exist after the tokenizer has run; every missing
# one adds a failure. Elements are quoted when building and iterating the
# array so each path stays exactly one entry, even if it contains
# whitespace or glob characters (or is unexpectedly empty).
file_list=(
  "${TRAIN_DATASET}"
  "${TRAIN_DATASET_META}"
  "${VALID_DATASET}"
  "${VALID_DATASET_META}"
)
for file in "${file_list[@]}"; do
  if [[ ! -f "${file}" ]]; then
    echo "expect: ${file} exists, actual: not exist."
    exit_code=$((exit_code + 1))
  fi
done

# clean the test files.
# The path is quoted and guarded with ${RESULTS:?} so the recursive delete
# can neither be word-split nor expand to "/*" when the variable is empty;
# "--" ends option parsing. A failed cleanup is counted as one more failure
# rather than aborting, so the accumulated result is still reported.
if ! rm -rf -- "${RESULTS:?}"/*; then
  echo "cleaning test data in ${RESULTS} failed."
  exit_code=$((exit_code + 1))
fi

# Exit status is the number of failed checks; 0 means every check passed.
exit "${exit_code}"