diff --git a/.gitignore b/.gitignore index 8992a0f..5e78704 100644 --- a/.gitignore +++ b/.gitignore @@ -145,3 +145,4 @@ core.* llm_ckpts events.* memory_trace +RUN*/ diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index f79606a..9721c3d 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -226,6 +226,7 @@ def get_train_data_loader( dataset_types = list(get_dataset_type_ids_map(train_folder).keys()) if not train_folder: + dataset_types = ["en", "cn", "code"] train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len) if data_cfg.pack_sample_into_one: train_ds = PackedDatasetWithoutCuSeqlen(