diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index 491e2b0..ddf380a 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -533,9 +533,6 @@ def initialize_distributed_env(
         seed (int, optional): Specified random seed for every process. 1024 by default.
     """
 
-    # close automatic garbage collection
-    gc.disable()
-
     torch.cuda.empty_cache()
 
     if launcher == "torch":
diff --git a/train.py b/train.py
index 6874f9e..11ce3de 100644
--- a/train.py
+++ b/train.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
+import gc
 import socket
 import time
 import traceback
@@ -191,6 +192,9 @@ def main(args):
     # transfer the train data loader into train data iterator
     train_iter = iter(train_dl)
 
+    # close automatic garbage collection
+    gc.disable()
+
     with initialize_llm_profile(profiling=args.profiling, start_time=current_time) as prof:
         # start iterating the train data and begin training
         for batch_count in range(train_state.batch_count, total_steps):