mirror of https://github.com/InternLM/InternLM
				
				
				
			
		
			
				
	
	
		
			43 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Bash
		
	
	
			
		
		
	
	
			43 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Bash
		
	
	
#!/bin/bash
#
# CI check for tools/tokenizer.py.
#
# Runs the tokenizer through Slurm on ${DATA_VOLUME}/lm_data/cn_data/raw_data.txt
# and verifies that both result.bin and result.bin.meta were produced, then
# removes the generated files again.
#
# Usage:        <this script> <slurm-job-name>
# Arguments:    $1 - value passed to srun's --job-name
# Environment:  DATA_VOLUME      (required) root of the CI data volume
#               SLURM_PARTITION  partition handed to srun -p; not set in this
#                                script — presumably provided by variables.sh
#                                or the caller's environment (TODO confirm)
# Exit status:  0 when every check passed; otherwise the number of failed
#               checks, or 1 when a precondition / pre-run cleanup fails.
set -x

source ./ci_scripts/common/variables.sh
[[ -n ${DATA_VOLUME} ]] || { echo "should set DATA_VOLUME first before ci."; exit 1; }

readonly DATA=${DATA_VOLUME}/lm_data/cn_data/raw_data.txt
readonly RESULT=${DATA_VOLUME}/lm_data/cn_data/result.bin
readonly RESULT_META=${DATA_VOLUME}/lm_data/cn_data/result.bin.meta
# RESULTS holds a glob *pattern*, not a path: assignments do not perform
# pathname expansion, so it is expanded only where it is used unquoted.
readonly RESULTS=${DATA_VOLUME}/lm_data/cn_data/result.*
exit_code=0

# Provides num_files (used below).
source ./ci_scripts/common/basic_func.sh

echo "start to test tokenizer.py." 

# Remove leftovers from a previous run so the existence checks below can only
# be satisfied by files written during this run.
num=$(num_files "${RESULTS}")
if [[ ${num} -gt 0 ]]; then
    # Intentionally unquoted: the pattern must be glob-expanded here.
    # shellcheck disable=SC2086
    if ! rm -rf -- ${RESULTS}; then
       echo "cleaning test data ${RESULTS} failed, exit."
       exit 1
    fi
fi

# Test the command directly instead of inspecting $? afterwards, and quote
# every argument so values containing whitespace stay a single word.
if ! srun -p "${SLURM_PARTITION}" --job-name="$1" --gpus-per-task=1 \
        python tools/tokenizer.py \
        --text_input_path "${DATA}" \
        --bin_output_path "${RESULT}"; then
    echo "test tokenizer.py failed."
    exit_code=$((exit_code + 1))
fi

# Both the binary output and its .meta companion must exist as regular files.
for file in "${RESULT}" "${RESULT_META}"; do
    if [[ ! -f ${file} ]]; then
        echo "expect: ${file} exists, actual: not exist."
        exit_code=$((exit_code + 1))
    fi
done

# clean the test files.
# The pattern itself is removed (not "${RESULTS}/*"): the outputs are regular
# files, so appending "/*" matched nothing and silently left them on disk.
# shellcheck disable=SC2086
if ! rm -rf -- ${RESULTS}; then
   echo "cleaning cached file in ${RESULTS} failed."
   exit_code=$((exit_code + 1))
fi

exit $exit_code