Move gc.disable() (switch to manual garbage collection) to just before the training loop starts

2023-12-19 15:29:55 +08:00 · 2023-12-19 15:29:55 +08:00 · 78400c21b8
parent 2afeebe5b0
commit 78400c21b8
2 changed files with 4 additions and 3 deletions
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -533,9 +533,6 @@ def initialize_distributed_env(
        seed (int, optional): Specified random seed for every process. 1024 by default.
    """

-    # close automatic garbage collection
-    gc.disable()
-
    torch.cuda.empty_cache()

    if launcher == "torch":
--- a/train.py
+++ b/train.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

+import gc
 import socket
 import time
 import traceback
@@ -191,6 +192,9 @@ def main(args):
    # transfer the train data loader into train data iterator
    train_iter = iter(train_dl)

+    # close automatic garbage collection
+    gc.disable()
+
    with initialize_llm_profile(profiling=args.profiling, start_time=current_time) as prof:
        # start iterating the train data and begin training
        for batch_count in range(train_state.batch_count, total_steps):