diff --git a/.gitignore b/.gitignore
index 8992a0f..5e78704 100644
--- a/.gitignore
+++ b/.gitignore
@@ -145,3 +145,4 @@ core.*
 llm_ckpts
 events.*
 memory_trace
+RUN*/
diff --git a/doc/code-docs/source/example/20B_demo.rst b/doc/code-docs/source/example/20B_demo.rst
index 483c3ff..da7f1d2 100644
--- a/doc/code-docs/source/example/20B_demo.rst
+++ b/doc/code-docs/source/example/20B_demo.rst
@@ -199,4 +199,4 @@
 2023-11-10T15:06:06.988+08:00 INFO [training_internlm.py, line 601, in record_current_batch_training_metrics] - pid=78690 : tflops=127.89743136367036 step=2 loss=10.111495971679688 tgs (tokens/gpu/second)=1031.72 tgs/last_tgs_1=1031.72 tgs/tgs_all=600.43 tgs/tgs_avg=767.12 tgs/tgs_SMA=600.43 tgs/last_tgs_10=0 tgs/last_tgs_50=0 lr=8.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 76.99318912653898, '1_fp32': 0.0} micro_num=4 num_consumed_tokens=196608 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.4 acc=0.0704 perplexity=25907.623 acc/en=0.0704 acc/cn=0.0 acc/code=0.0 tokens/en=60244 tokens/cn=0 tokens/code=0 loss_from_metric=10.1623 loss/en=10.1623 loss/cn=nan loss/code=nan
 2023-11-10T15:06:10.994+08:00 INFO [training_internlm.py, line 601, in record_current_batch_training_metrics] - pid=78690 : tflops=127.89845291183941 step=3 loss=8.848427772521973 tgs (tokens/gpu/second)=1031.73 tgs/last_tgs_1=1031.73 tgs/tgs_all=670.5 tgs/tgs_avg=833.27 tgs/tgs_SMA=670.5 tgs/last_tgs_10=0 tgs/last_tgs_50=0 lr=1.0000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 60.47092413727133, '1_fp32': 0.0} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.41 acc=0.0783 perplexity=7380.229 acc/en=0.0783 acc/cn=0.0 acc/code=0.0 tokens/en=60328 tokens/cn=0 tokens/code=0 loss_from_metric=8.9066 loss/en=8.9066 loss/cn=nan loss/code=nan
 2023-11-10T15:06:15.041+08:00 INFO [training_internlm.py, line 601, in record_current_batch_training_metrics] - pid=78690 : tflops=126.55593705224216 step=4 loss=7.509810924530029 tgs (tokens/gpu/second)=1020.9 tgs/last_tgs_1=1020.9 tgs/tgs_all=719.92 tgs/tgs_avg=870.8 tgs/tgs_SMA=719.92 tgs/last_tgs_10=0 tgs/last_tgs_50=0 lr=1.2000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 42.36608180721121, '1_fp32': 0.0} micro_num=4 num_consumed_tokens=327680 inf_nan_skip_batches=0 num_samples_in_batch=22 largest_length=1893 largest_batch=8 smallest_batch=4 adam_beta2=0.95 fwd_bwd_time=3.43 acc=0.0706 perplexity=2728.5764 acc/en=0.0706 acc/cn=0.0 acc/code=0.0 tokens/en=61028 tokens/cn=0 tokens/code=0 loss_from_metric=7.9115 loss/en=7.9115 loss/cn=nan loss/code=nan
-2023-11-10T15:06:19.051+08:00 INFO [training_internlm.py, line 601, in record_current_batch_training_metrics] - pid=78690 : tflops=127.79902453659938 step=5 loss=7.049621105194092 tgs (tokens/gpu/second)=1030.92 tgs/last_tgs_1=1030.93 tgs/tgs_all=758.03 tgs/tgs_avg=897.49 tgs/tgs_SMA=758.03 tgs/last_tgs_10=0 tgs/last_tgs_50=0 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 32.49298677335042, '1_fp32': 0.0} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.42 acc=0.0726 perplexity=1169.7916 acc/en=0.0726 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=7.0646 loss/en=7.0646 loss/cn=nan loss/code=nan
\ No newline at end of file
+2023-11-10T15:06:19.051+08:00 INFO [training_internlm.py, line 601, in record_current_batch_training_metrics] - pid=78690 : tflops=127.79902453659938 step=5 loss=7.049621105194092 tgs (tokens/gpu/second)=1030.92 tgs/last_tgs_1=1030.93 tgs/tgs_all=758.03 tgs/tgs_avg=897.49 tgs/tgs_SMA=758.03 tgs/last_tgs_10=0 tgs/last_tgs_50=0 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 32.49298677335042, '1_fp32': 0.0} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.42 acc=0.0726 perplexity=1169.7916 acc/en=0.0726 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=7.0646 loss/en=7.0646 loss/cn=nan loss/code=nan
diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py
index f79606a..9721c3d 100644
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@@ -226,6 +226,7 @@ def get_train_data_loader(
         dataset_types = list(get_dataset_type_ids_map(train_folder).keys())
 
     if not train_folder:
+        dataset_types = ["en", "cn", "code"]
         train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
         if data_cfg.pack_sample_into_one:
            train_ds = PackedDatasetWithoutCuSeqlen(
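
Note on the internlm/train/training_internlm.py hunk: when data.train_folder is unset, training falls back to a synthetic RandomDataset, but on that path dataset_types was never populated (the get_dataset_type_ids_map call only runs for a real folder), which breaks the per-type metrics such as acc/en and loss/code that appear in the demo log above. The added line supplies a fixed default list. Below is a minimal standalone sketch of the resulting control flow; resolve_dataset_types is an illustrative helper invented here, and the get_dataset_type_ids_map stub merely stands in for the real InternLM implementation:

    # Sketch of the fallback introduced by the patch (not InternLM code).
    from typing import Dict, List, Optional

    def get_dataset_type_ids_map(train_folder: str) -> Dict[str, int]:
        # Hypothetical stub: the real helper inspects the training folder
        # to discover which dataset types are actually present.
        return {"en": 0, "cn": 1, "code": 2}

    def resolve_dataset_types(train_folder: Optional[str]) -> List[str]:
        if train_folder:
            # Real data on disk: derive the type list from the folder.
            return list(get_dataset_type_ids_map(train_folder).keys())
        # No train_folder: the RandomDataset path is taken, so use the
        # fixed default list that the patch assigns to dataset_types.
        return ["en", "cn", "code"]

    if __name__ == "__main__":
        assert resolve_dataset_types("/data/tokenized") == ["en", "cn", "code"]
        assert resolve_dataset_types(None) == ["en", "cn", "code"]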