From 544b7a38a167cb05cdc7590cfc100e23c0ed5ab7 Mon Sep 17 00:00:00 2001
From: YeAnbang
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_kto_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_kto_dataset.sh index 274e2a3fd..0450d570d 100755 --- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_kto_dataset.sh +++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_kto_dataset.sh @@ -5,9 +5,9 @@ rm -rf $SAVE_DIR/jsonl rm -rf $SAVE_DIR/arrow python prepare_dataset.py --type kto \ - --data_input_dirs /home/nvme-share/home/yeanbang/data/dataset/hh_rlhf/kto_format/data \ - --conversation_template_config /home/nvme-share/home/yeanbang/ColossalAI/applications/ColossalChat/config/conversation_template/llama2.json \ - --tokenizer_dir "/home/nvme-share/share/models/Sheared-LLaMA-1.3B" \ + --data_input_dirs /PATH/TO/KTO/DATASET \ + --conversation_template_config /PATH/TO/CHAT/TEMPLATE/CONFIG.json \ + --tokenizer_dir "" \ --data_cache_dir $SAVE_DIR/cache \ --data_jsonl_output_dir $SAVE_DIR/jsonl \ --data_arrow_output_dir $SAVE_DIR/arrow \ diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh index b6546a21e..5c06b43fe 100755 --- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh +++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh @@ -10,4 +10,5 @@ python prepare_dataset.py --type preference \ --tokenizer_dir "" \ --data_cache_dir $SAVE_DIR/cache \ --data_jsonl_output_dir $SAVE_DIR/jsonl \ - --data_arrow_output_dir $SAVE_DIR/arrow + --data_arrow_output_dir $SAVE_DIR/arrow \ + --max_length 1024 diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh index 8d3d6c2c2..d74667889 100755 --- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh +++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh @@ -10,4 +10,5 @@ python prepare_dataset.py --type prompt \ --tokenizer_dir "" \ --data_cache_dir $SAVE_DIR/cache \ --data_jsonl_output_dir $SAVE_DIR/jsonl \ - --data_arrow_output_dir $SAVE_DIR/arrow + --data_arrow_output_dir $SAVE_DIR/arrow \ + --max_length 1024 diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh index 3f555883d..dbd323438 100755 --- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh +++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh @@ -11,3 +11,4 @@ python prepare_dataset.py --type sft \ --data_cache_dir $SAVE_DIR/cache \ --data_jsonl_output_dir $SAVE_DIR/jsonl \ --data_arrow_output_dir $SAVE_DIR/arrow \ + --max_length 4096 diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.sh b/applications/ColossalChat/examples/training_scripts/train_kto.sh index 86b5897f1..3dcf6af02 100755 --- a/applications/ColossalChat/examples/training_scripts/train_kto.sh +++ b/applications/ColossalChat/examples/training_scripts/train_kto.sh @@ -16,23 +16,23 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() { set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="kto" -PARENT_SAVE_DIR="/home/nvme-share/home/yeanbang/data/experiments/kto/checkpoint" # Path to a folder to save checkpoints -PARENT_TENSORBOARD_DIR="/home/nvme-share/home/yeanbang/data/experiments/kto/log" # Path to a folder to save logs -PARENT_CONFIG_FILE="/home/nvme-share/home/yeanbang/data/experiments/kto/log" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="/home/nvme-share/home/yeanbang/data/model/hh_rlhf_sheared_llamasft-2024-07-17-07-29-29/modeling" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="/home/nvme-share/share/models/Sheared-LLaMA-1.3B" # huggingface or local tokenizer path +PARENT_SAVE_DIR="" # Path to a folder to save checkpoints +PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs +PARENT_CONFIG_FILE="" # Path to a folder to save training config logs +PRETRAINED_MODEL_PATH="" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path declare -a dataset=( - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00000 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00001 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00002 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00003 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00004 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00005 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00006 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00007 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00008 - /home/nvme-share/home/yeanbang/data/experiments/kto/arrow/part-00009 + /Your/KTO/Data/arrow/part-00000 + /Your/KTO/Data/arrow/part-00001 + /Your/KTO/Data/arrow/part-00002 + /Your/KTO/Data/arrow/part-00003 + /Your/KTO/Data/arrow/part-00004 + /Your/KTO/Data/arrow/part-00005 + /Your/KTO/Data/arrow/part-00006 + /Your/KTO/Data/arrow/part-00007 + /Your/KTO/Data/arrow/part-00008 + /Your/KTO/Data/arrow/part-00009 ) TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.sh b/applications/ColossalChat/examples/training_scripts/train_sft.sh index 31c2ab562..f0a281475 100755 --- a/applications/ColossalChat/examples/training_scripts/train_sft.sh +++ b/applications/ColossalChat/examples/training_scripts/train_sft.sh @@ -15,22 +15,22 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() { set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="sft" -PARENT_SAVE_DIR="/home/nvme-share/home/yeanbang/data/model/hh_rlhf_sheared_llama" # Path to a folder to save checkpoints -PARENT_TENSORBOARD_DIR="/home/nvme-share/home/yeanbang/data/experiments/sft/log" # Path to a folder to save logs -PARENT_CONFIG_FILE="/home/nvme-share/home/yeanbang/data/experiments/kto/log" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="/home/nvme-share/share/models/Sheared-LLaMA-1.3B" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="/home/nvme-share/share/models/Sheared-LLaMA-1.3B" # huggingface or local tokenizer path +PARENT_SAVE_DIR="" # Path to a folder to save checkpoints +PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs +PARENT_CONFIG_FILE="" # Path to a folder to save training config logs +PRETRAINED_MODEL_PATH="" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path declare -a dataset=( - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00000 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00001 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00002 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00003 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00004 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00005 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00006 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00007 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00008 - /home/nvme-share/home/yeanbang/data/experiments/sft/arrow/part-00009 + /Your/SFT/Data/arrow/part-00000 + /Your/SFT/Data/arrow/part-00001 + /Your/SFT/Data/arrow/part-00002 + /Your/SFT/Data/arrow/part-00003 + /Your/SFT/Data/arrow/part-00004 + /Your/SFT/Data/arrow/part-00005 + /Your/SFT/Data/arrow/part-00006 + /Your/SFT/Data/arrow/part-00007 + /Your/SFT/Data/arrow/part-00008 + /Your/SFT/Data/arrow/part-00009 ) TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)