mirror of https://github.com/InternLM/InternLM
				
				
				
			
		
			
				
	
	
		
			51 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Bash
		
	
	
			
		
		
	
	
			51 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Bash
		
	
	
#!/bin/bash
#
# CI check for tools/alpaca_tokenizer.py.
#
# Runs the tokenizer on the alpaca JSON file found under ${DATA_VOLUME} and
# verifies that the four expected output files (train/valid dataset.bin and
# their .meta companions) exist afterwards.
#
# Required environment:
#   DATA_VOLUME - root directory holding lm_data/alpaca_data/alpaca_data.json.
#                 NOTE(review): presumably exported by the CI environment or
#                 by ci_scripts/common/variables.sh - confirm.
#
# Exit status:
#   0  every check passed
#   1  a precondition failed (DATA_VOLUME unset, source data missing, or the
#      initial clean-up of the result directory failed)
#   N  otherwise, the number of failed checks (tokenizer run, each missing
#      output file, final clean-up)

# Trace every command so CI logs show exactly what was executed.
set -x

source ./ci_scripts/common/variables.sh
[[ -n "${DATA_VOLUME}" ]] || { echo "should set DATA_VOLUME first before ci, exit."; exit 1; }

readonly SRC_DATASET_META="${DATA_VOLUME}/lm_data/alpaca_data/alpaca_data.json"
readonly RESULTS="${DATA_VOLUME}/lm_data/alpaca_data/result"
readonly TRAIN_DATASET="${RESULTS}/train/en/dataset.bin"
readonly TRAIN_DATASET_META="${RESULTS}/train/en/dataset.bin.meta"
readonly VALID_DATASET="${RESULTS}/valid/en/dataset.bin"
readonly VALID_DATASET_META="${RESULTS}/valid/en/dataset.bin.meta"

split_ratio=0.1
exit_code=0

# Sourced after the variables above are defined, as in the original ordering.
# NOTE(review): nothing from basic_func.sh is called directly in this script;
# confirm whether it relies on the variables defined above.
source ./ci_scripts/common/basic_func.sh

echo "start to test alpaca_tokenizer.py."

# Start from an empty result directory so stale outputs from an earlier run
# cannot satisfy the existence checks below.
if [[ -d "${RESULTS}" ]]; then
    # ${RESULTS:?} aborts instead of expanding to "/*" should the variable
    # ever be empty; "--" stops option parsing on odd file names.
    if ! rm -rf -- "${RESULTS:?}"/*; then
       echo "cleaning test data in ${RESULTS} failed, exit."
       exit 1
    fi
fi

if [[ ! -f "${SRC_DATASET_META}" ]]; then
   echo "${SRC_DATASET_META} should be exist, exit."
   exit 1
fi

# A tokenizer failure is counted but does not stop the script, so the
# per-file checks below still report which outputs are missing.
if ! python tools/alpaca_tokenizer.py "${SRC_DATASET_META}" "${RESULTS}" tools/V7_sft.model --split_ratio "${split_ratio}"; then
    echo "test alpaca_tokenizer.py failed."
    exit_code=$((exit_code + 1))
fi

file_list=("${TRAIN_DATASET}" "${TRAIN_DATASET_META}" "${VALID_DATASET}" "${VALID_DATASET_META}")
for file in "${file_list[@]}"; do
    if [[ ! -f "${file}" ]]; then
        echo "expect: ${file} exists, actual: not exist."
        exit_code=$((exit_code + 1))
    fi
done

# clean the test files.
if ! rm -rf -- "${RESULTS:?}"/*; then
    echo "cleaning test data in ${RESULTS} failed."
    exit_code=$((exit_code + 1))
fi

exit "${exit_code}"