mirror of https://github.com/InternLM/InternLM
43 lines
1.2 KiB
Bash
43 lines
1.2 KiB
Bash
#!/bin/bash
|
|
set -x
|
|
|
|
source ./ci_scripts/common/variables.sh
|
|
[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci."; exit 1; }
|
|
|
|
readonly DATA=${DATA_VOLUME}/lm_data/cn_data/raw_data.txt
|
|
readonly RESULT=${DATA_VOLUME}/lm_data/cn_data/result.bin
|
|
readonly RESULT_META=${DATA_VOLUME}/lm_data/cn_data/result.bin.meta
|
|
readonly RESULTS=${DATA_VOLUME}/lm_data/cn_data/result.*
|
|
exit_code=0
|
|
|
|
source ./ci_scripts/common/basic_func.sh
|
|
|
|
echo "start to test tokenizer.py."
|
|
|
|
num=$(num_files "${RESULTS}")
|
|
if [[ ${num} -gt 0 ]]; then
|
|
if ! rm -rf ${RESULTS}; then
|
|
echo "cleaning test data ${RESULTS} failed, exit."
|
|
exit 1
|
|
fi
|
|
fi
|
|
|
|
srun -p llm python tools/tokenizer.py --text_input_path ${DATA} --bin_output_path ${RESULT}
|
|
[[ $? -ne 0 ]] && { echo "test tokenizer.py failed."; exit_code=$(($exit_code + 1)); }
|
|
|
|
file_list=($RESULT $RESULT_META)
|
|
for file in ${file_list[@]}; do
|
|
if [[ ! -f ${file} ]]; then
|
|
echo "expect: ${file} exists, actual: not exist."
|
|
exit_code=$(($exit_code + 1))
|
|
fi
|
|
done
|
|
|
|
# clean the test files.
|
|
if ! rm -rf ${RESULTS}/*; then
|
|
echo "cleaning cached file in ${RESULTS} failed."
|
|
exit_code=$(($exit_code + 1))
|
|
fi
|
|
|
|
exit $exit_code
|