From b0e15d563ee9b010f97520c1cad7ec779945af7c Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Mon, 22 Jul 2024 06:11:38 +0000 Subject: [PATCH] remove real data path --- applications/ColossalChat/benchmarks/benchmark_dpo.sh | 4 ++-- applications/ColossalChat/benchmarks/benchmark_kto.sh | 4 ++-- applications/ColossalChat/benchmarks/benchmark_orpo.sh | 4 ++-- applications/ColossalChat/benchmarks/benchmark_sft.sh | 4 ++-- applications/ColossalChat/benchmarks/benchmark_simpo.sh | 4 ++-- .../data_preparation_scripts/prepare_sft_dataset.sh | 6 +++--- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/applications/ColossalChat/benchmarks/benchmark_dpo.sh b/applications/ColossalChat/benchmarks/benchmark_dpo.sh index 08ce0629c..44d821a87 100755 --- a/applications/ColossalChat/benchmarks/benchmark_dpo.sh +++ b/applications/ColossalChat/benchmarks/benchmark_dpo.sh @@ -17,8 +17,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="dpo" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path BENCHMARK_DATA_DIR="./temp/dpo" # Path to benchmark data DATASET_SIZE=320 diff --git a/applications/ColossalChat/benchmarks/benchmark_kto.sh b/applications/ColossalChat/benchmarks/benchmark_kto.sh index 41de40f13..82d3e3421 100755 --- a/applications/ColossalChat/benchmarks/benchmark_kto.sh +++ b/applications/ColossalChat/benchmarks/benchmark_kto.sh @@ -17,8 +17,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="kto" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path BENCHMARK_DATA_DIR="./temp/kto" # Path to benchmark data DATASET_SIZE=80 diff --git a/applications/ColossalChat/benchmarks/benchmark_orpo.sh b/applications/ColossalChat/benchmarks/benchmark_orpo.sh index fa51a788f..f8fb264ae 100755 --- a/applications/ColossalChat/benchmarks/benchmark_orpo.sh +++ b/applications/ColossalChat/benchmarks/benchmark_orpo.sh @@ -17,8 +17,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 2 PROJECT_NAME="orpo" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path BENCHMARK_DATA_DIR="./temp/orpo" # Path to benchmark data DATASET_SIZE=160 diff --git a/applications/ColossalChat/benchmarks/benchmark_sft.sh b/applications/ColossalChat/benchmarks/benchmark_sft.sh index 3d7e2ec16..efcd428dd 100755 --- a/applications/ColossalChat/benchmarks/benchmark_sft.sh +++ b/applications/ColossalChat/benchmarks/benchmark_sft.sh @@ -17,8 +17,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="sft" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path BENCHMARK_DATA_DIR="./temp/sft" # Path to benchmark data DATASET_SIZE=640 diff --git a/applications/ColossalChat/benchmarks/benchmark_simpo.sh b/applications/ColossalChat/benchmarks/benchmark_simpo.sh index 5d41c34af..47dfc8595 100755 --- a/applications/ColossalChat/benchmarks/benchmark_simpo.sh +++ b/applications/ColossalChat/benchmarks/benchmark_simpo.sh @@ -17,8 +17,8 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="simpo" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path BENCHMARK_DATA_DIR="./temp/simpo" # Path to benchmark data DATASET_SIZE=640 diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh index dbd323438..c4b6fec9b 100755 --- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh +++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh @@ -5,9 +5,9 @@ rm -rf $SAVE_DIR/jsonl rm -rf $SAVE_DIR/arrow python prepare_dataset.py --type sft \ - --data_input_dirs /home/nvme-share/home/yeanbang/data/dataset/hh_rlhf/sft \ - --conversation_template_config /home/nvme-share/home/yeanbang/ColossalAI/applications/ColossalChat/config/conversation_template/llama2.json \ - --tokenizer_dir "/home/nvme-share/share/models/Sheared-LLaMA-1.3B" \ + --data_input_dirs /PATH/TO/SFT/DATASET \ + --conversation_template_config /PATH/TO/CHAT/TEMPLATE/CONFIG.json \ + --tokenizer_dir "" \ --data_cache_dir $SAVE_DIR/cache \ --data_jsonl_output_dir $SAVE_DIR/jsonl \ --data_arrow_output_dir $SAVE_DIR/arrow \