diff --git a/internlm/utils/gputest.py b/internlm/utils/gputest.py index 48877b9..85d4cdc 100644 --- a/internlm/utils/gputest.py +++ b/internlm/utils/gputest.py @@ -39,7 +39,9 @@ def empty_cache_and_diag(batch_count, interval=50): with torch.no_grad(): timer_diagnosis() bench_gpu() - bench_net() + # FIXME: Runtime benchmark diagnosis can easily cause the training process + # to exit due to NCCL errors. + # bench_net() # do empty_cache after the bench torch.cuda.empty_cache() # do garbage collection