diff --git a/internlm/core/engine.py b/internlm/core/engine.py
index b92b7f8..eb33e35 100644
--- a/internlm/core/engine.py
+++ b/internlm/core/engine.py
@@ -186,7 +186,7 @@ class Engine:
         if to_gpu:
             batch_data = move_to_device(batch_data)
 
-        # For packed-dataset, batch_data is (micro_num, micro_num*micro_bsz),
+        # For packed-dataset, batch_data is (micro_num, micro_bsz*seq_len),
         # therefore 'batch_size' is equal to 'micro_num'
         # For nopacked-dataset, batch_data is (micro_num*micro_bsz, seq_len),
         # therefore 'batch_size' is equal to 'micro_num*micro_bsz'
diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py
index 5a6da61..5b864ff 100644
--- a/internlm/core/scheduler/pipeline_scheduler.py
+++ b/internlm/core/scheduler/pipeline_scheduler.py
@@ -193,6 +193,9 @@ class PipelineScheduler(BaseScheduler):
         # Pipeline schedule just puts data in memory,
         batch_data, actual_batch_size = engine.load_batch(data_iter, to_gpu=False)
 
+        # Even if 'use_flash_attn' is False, the data seen when 'load_batch' is called is still packed,
+        # because InternLM's current training dataset is packed, even when using dummy data.
+        # The unpack operation is performed in load_micro_batch().
         if check_data_is_packed(batch_data):
            micro_num = actual_batch_size
        else:
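A minimal sketch of the shape convention these comments describe. The tensor shapes and the `infer_micro_num` helper below are illustrative assumptions, not InternLM code; only the branching on whether the data is packed mirrors the `check_data_is_packed` logic touched by this patch.

```python
import torch

# Illustrative sizes (assumptions, not values from the repo).
micro_num, micro_bsz, seq_len = 4, 2, 8

# Packed dataset: one row per micro-batch, sequences concatenated along the
# last dim, so batch_data is (micro_num, micro_bsz * seq_len) and the leading
# dimension ('batch_size') already equals micro_num.
packed_tokens = torch.zeros(micro_num, micro_bsz * seq_len, dtype=torch.long)

# Non-packed dataset: one row per sample, so batch_data is
# (micro_num * micro_bsz, seq_len) and 'batch_size' is micro_num * micro_bsz.
nopacked_tokens = torch.zeros(micro_num * micro_bsz, seq_len, dtype=torch.long)

def infer_micro_num(batch_size: int, is_packed: bool, micro_bsz: int) -> int:
    """Hypothetical helper mirroring the branch after check_data_is_packed():
    packed data's leading dim is already micro_num; otherwise divide by micro_bsz."""
    return batch_size if is_packed else batch_size // micro_bsz

assert infer_micro_num(packed_tokens.size(0), True, micro_bsz) == micro_num
assert infer_micro_num(nopacked_tokens.size(0), False, micro_bsz) == micro_num
```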