From 3ea94f2e2ac6a7c81f89da8b8742be0d210c60ec Mon Sep 17 00:00:00 2001 From: Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Date: Thu, 19 Oct 2023 10:00:57 +0800 Subject: [PATCH] fix(utils): disable bench_net in gputest.py (#421) --- internlm/utils/gputest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 48877b9..85d4cdc 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -39,7 +39,9 @@ def empty_cache_and_diag(batch_count, interval=50): with torch.no_grad(): timer_diagnosis() bench_gpu() - bench_net() + # FIXME: Runtime benchmark diagnosis can easily cause the training process + # to exit due to NCCL errors. + # bench_net() # do empty_cache after the bench torch.cuda.empty_cache() # do garbage collection