From 45195ac53d5e8384fde266b6fd7dcfc0bcec59e9 Mon Sep 17 00:00:00 2001
From: YeAnbang
Date: Wed, 29 May 2024 02:25:19 +0000
Subject: [PATCH] remove local data path

---
NOTE(review): this copy of the patch had its line breaks and indentation
collapsed; the layout below is reconstructed from the hunk headers and the
diffstat. Indentation inside the hunks is assumed (4 spaces) - verify against
blobs 47d828a6c..8562b47ee and 5d0f55f90..53c712901 before applying.

NOTE(review): with SAVE_DIR="" the unchanged `rm -rf $SAVE_DIR/cache` lines
expand to `rm -rf /cache` (likewise /jsonl and /arrow); consider guarding them
with "${SAVE_DIR:?}" in a follow-up change.

 .../prepare_sft_dataset.sh                 | 17 ++++------------
 .../examples/training_scripts/train_sft.sh | 20 +++++++++----------
 2 files changed, 14 insertions(+), 23 deletions(-)

diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
index 47d828a6c..8562b47ee 100755
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
@@ -1,22 +1,13 @@
-SAVE_DIR="/home/yeanbang/data/experiment/dataset/alpaca/test/Yi-1.5-6B"
+SAVE_DIR=""
 
 rm -rf $SAVE_DIR/cache
 rm -rf $SAVE_DIR/jsonl
 rm -rf $SAVE_DIR/arrow
 
-# python prepare_dataset.py --type sft \
-#     --data_input_dirs /home/yeanbang/data/experiment/dataset/sft_data/test/sft-data \
-#     --conversation_template_config /home/yeanbang/data/ColossalAI/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json \
-#     --tokenizer_dir "/mnt/jfs-hdd/home/data/models/ChatGlm-6B" \
-#     --data_cache_dir $SAVE_DIR/cache \
-#     --data_jsonl_output_dir $SAVE_DIR/jsonl \
-#     --data_arrow_output_dir $SAVE_DIR/arrow \
-
-
 python prepare_dataset.py --type sft \
-    --data_input_dirs /home/yeanbang/data/experiment/dataset/sft_data/test/sft-data \
-    --conversation_template_config /home/yeanbang/data/ColossalAI/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json \
-    --tokenizer_dir "/mnt/jfs-hdd/share/models/Yi-1.5-6B" \
+    --data_input_dirs "PATH/TO/SFT/DATA" \
+    --conversation_template_config /PATH/TO/CHAT/TEMPLATE/CONFIG.json \
+    --tokenizer_dir "" \
     --data_cache_dir $SAVE_DIR/cache \
     --data_jsonl_output_dir $SAVE_DIR/jsonl \
     --data_arrow_output_dir $SAVE_DIR/arrow \
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.sh b/applications/ColossalChat/examples/training_scripts/train_sft.sh
index 5d0f55f90..53c712901 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.sh
@@ -23,16 +23,16 @@ PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # huggingface or local model path
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
 declare -a dataset=(
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00000
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00001
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00002
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00003
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00004
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00005
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00006
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00007
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00008
-    YOUR/PREFERENCE/DATA/DIR/arrow/part-00009
+    YOUR/SFT/DATA/DIR/arrow/part-00000
+    YOUR/SFT/DATA/DIR/arrow/part-00001
+    YOUR/SFT/DATA/DIR/arrow/part-00002
+    YOUR/SFT/DATA/DIR/arrow/part-00003
+    YOUR/SFT/DATA/DIR/arrow/part-00004
+    YOUR/SFT/DATA/DIR/arrow/part-00005
+    YOUR/SFT/DATA/DIR/arrow/part-00006
+    YOUR/SFT/DATA/DIR/arrow/part-00007
+    YOUR/SFT/DATA/DIR/arrow/part-00008
+    YOUR/SFT/DATA/DIR/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)