diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index 8ae8ee0..323715e 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -2,6 +2,7 @@
 # -*- encoding: utf-8 -*-
 
 import argparse
+import gc
 import os
 from pathlib import Path
 from typing import Dict, Union
@@ -446,6 +447,8 @@ def initialize_distributed_env(
         master_port (str): The master port for distributed training. 8888 by default.
         seed (int, optional): Specified random seed for every process. 1024 by default.
     """
+    # close automatic garbage collection
+    gc.disable()
 
     torch.cuda.empty_cache()
 
diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py
index ddb4932..48877b9 100644
--- a/internlm/utils/gputest.py
+++ b/internlm/utils/gputest.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
+import gc
 import math
 import socket
 
@@ -41,6 +42,8 @@ def empty_cache_and_diag(batch_count, interval=50):
             bench_net()
     # do empty_cache after the bench
     torch.cuda.empty_cache()
+    # do garbage collection
+    gc.collect()
 
 
 def benchmark_forward(
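
The two hunks apply a single pattern: Python's automatic garbage collection is disabled once when the distributed environment is initialized, and a full collection is then triggered explicitly from empty_cache_and_diag, which already runs on a fixed batch interval, so GC pauses land at predictable points instead of in the middle of a training step. Below is a minimal standalone sketch of that pattern, not code from the repository; run_training, train_step, and gc_collect_interval are illustrative names only.

import gc


def train_step(step):
    # stand-in for the real forward/backward/optimizer work
    _ = [bytearray(1024) for _ in range(100)]


def run_training(num_steps, gc_collect_interval=50):
    gc.disable()  # stop the cyclic collector from running automatically
    try:
        for step in range(1, num_steps + 1):
            train_step(step)
            if step % gc_collect_interval == 0:
                gc.collect()  # reclaim cyclic garbage at a known, safe point
    finally:
        gc.enable()  # restore automatic collection when training exits


run_training(num_steps=200)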