From e3d128230bd7d1db42021b3ae92b42cfc5ee8f93 Mon Sep 17 00:00:00 2001 From: "877825076@qq.com" <877825076@qq.com> Date: Wed, 18 Oct 2023 12:26:24 +0800 Subject: [PATCH] fix(utils): disable bench_net in gputest.py --- internlm/utils/gputest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 48877b9..85d4cdc 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -39,7 +39,9 @@ def empty_cache_and_diag(batch_count, interval=50): with torch.no_grad(): timer_diagnosis() bench_gpu() - bench_net() + # FIXME: Runtime benchmark diagnosis can easily cause the training process + # to exit due to NCCL errors. + # bench_net() # do empty_cache after the bench torch.cuda.empty_cache() # do garbage collection