fix dataset types when using random dataset (#489)

pull/490/head^2
Yang Gao 2023-11-10 15:08:22 +08:00 committed by GitHub
parent 5d3242027a
commit 07026d1821
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 2 additions and 0 deletions

1
.gitignore vendored
View File

@@ -145,3 +145,4 @@ core.*
llm_ckpts
events.*
memory_trace
RUN*/

View File

@@ -226,6 +226,7 @@ def get_train_data_loader(
dataset_types = list(get_dataset_type_ids_map(train_folder).keys())
if not train_folder:
dataset_types = ["en", "cn", "code"]
train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
if data_cfg.pack_sample_into_one:
train_ds = PackedDatasetWithoutCuSeqlen(