From 056996f8b31e8c6b42ec31784e9f72dd26c22a11 Mon Sep 17 00:00:00 2001
From: huangting4201 <1538303371@qq.com>
Date: Tue, 26 Sep 2023 16:54:29 +0800
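Subject: [PATCH] fix(fsdp_optimizer.py): wait for async grad reduce

Wait on the handle returned by reduce_tensor for each parameter gradient
before the gradient norm is computed.

Also, in load_model_checkpoint, only assert the FSDP shard count when
gpc.config.parallel.use_fsdp is set, and drop a stray trailing backslash
from the docstring.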
---
 internlm/solver/optimizer/fsdp_optimizer.py | 3 ++-
 internlm/utils/model_checkpoint.py          | 9 +++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/internlm/solver/optimizer/fsdp_optimizer.py b/internlm/solver/optimizer/fsdp_optimizer.py
index c08b584..a19de0e 100644
--- a/internlm/solver/optimizer/fsdp_optimizer.py
+++ b/internlm/solver/optimizer/fsdp_optimizer.py
@@ -100,7 +100,8 @@ class FSDPadaptOptimizer(BaseOptimizer):
             params = self._fp16_param_groups[group_idx]
             for param in params:
                 if param.requires_grad:
-                    reduce_tensor(tensor=param.grad, parallel_mode=ParallelMode.ZERO3_DP)
+                    handle = reduce_tensor(tensor=param.grad, parallel_mode=ParallelMode.ZERO3_DP)
+                    handle.wait()
 
         # compute norm
         found_inf = False
diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py
index 75adab6..095a61b 100644
--- a/internlm/utils/model_checkpoint.py
+++ b/internlm/utils/model_checkpoint.py
@@ -309,7 +309,7 @@ def load_model_checkpoint(folder, model):
     """
     There should be weights with names similar to the following under the folder.
     - folder
-        - model_tp{tp_rank}_pp{pp_rank}.pt\
+        - model_tp{tp_rank}_pp{pp_rank}.pt
 
     If fsdp is activated, the saved weight is named:
     - folder
@@ -345,15 +345,16 @@ def load_model_checkpoint(folder, model):
                 max_pp = max(max_pp, int(segements[-1][2:]))
                 max_tp = max(max_tp, int(segements[-2][2:]))
 
-    assert (
-        zo_size == max_zo + 1
-    ), f"The weights are save for {max_zo+1} FSDP shards , while current has {zo_size} FSDP shards"
     assert (
         pp_size == max_pp + 1
     ), f"The weights are save for {max_pp+1} pipelines, while current has {pp_size} pipelines"
     assert (
         tp_size == max_tp + 1
     ), f"The weights are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism"
+    if gpc.config.parallel.use_fsdp:
+        assert (
+            zo_size == max_zo + 1
+        ), f"The weights are save for {max_zo+1} FSDP shards , while current has {zo_size} FSDP shards"
 
     if gpc.config.parallel.use_fsdp:
         should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank}.pt"