From 07026d1821ed0651d72c7e52f395b80de16b6ca9 Mon Sep 17 00:00:00 2001
From: Yang Gao
Date: Fri, 10 Nov 2023 15:08:22 +0800
Subject: [PATCH] fix dataset types when using random dataset (#489)

---
 .gitignore                          | 1 +
 internlm/train/training_internlm.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 8992a0f..5e78704 100644
--- a/.gitignore
+++ b/.gitignore
@@ -145,3 +145,4 @@ core.*
 llm_ckpts
 events.*
 memory_trace
+RUN*/
diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py
index f79606a..9721c3d 100644
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@@ -226,6 +226,7 @@ def get_train_data_loader(
     dataset_types = list(get_dataset_type_ids_map(train_folder).keys())
 
     if not train_folder:
+        dataset_types = ["en", "cn", "code"]
         train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
         if data_cfg.pack_sample_into_one:
             train_ds = PackedDatasetWithoutCuSeqlen(