add memory print

2023-10-25 14:31:00 +08:00 · 2023-10-25 14:31:00 +08:00 · 363275b500
parent 918dff7257
commit 363275b500
2 changed files with 4 additions and 1 deletion
--- a/internlm/model/overlap_handler.py
+++ b/internlm/model/overlap_handler.py
@@ -316,7 +316,8 @@ class FSTPOverlapSchedulerHook(SchedulerHook):
            self._overlap_handler.set_forward_mode(True)
    def after_forward(self, scheduler, outputs) -> None:
-        pass
+        print("after forward allocated memory: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True)
+        print("after forward max memory: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True)
    def before_criterion(self, scheduler, outputs, label) -> None:
        pass
--- a/train.py
+++ b/train.py
@@ -255,6 +255,8 @@ def main(args):
            # update parameters, and returns (success_update, grad_norm)
            trainer_result = trainer.step()
            assert trainer_result is not None
+            print("after step: ", torch.cuda.memory_allocated() / 1024 / 1024 /1024, flush=True)
+            print("after step: ", torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024, flush=True)
            success_update, grad_norm_groups = trainer_result
            if success_update:  # update parameters successfully