fix(fsdp_optimizer.py): wait for the async gradient reduce to finish

2023-09-26 16:54:29 +08:00 · 2023-09-26 16:54:29 +08:00 · 056996f8b3
parent c5a7e76ada
commit 056996f8b3
2 changed files with 7 additions and 5 deletions
--- a/internlm/solver/optimizer/fsdp_optimizer.py
+++ b/internlm/solver/optimizer/fsdp_optimizer.py
@@ -100,7 +100,8 @@ class FSDPadaptOptimizer(BaseOptimizer):
            params = self._fp16_param_groups[group_idx]
            for param in params:
                if param.requires_grad:
-                    reduce_tensor(tensor=param.grad, parallel_mode=ParallelMode.ZERO3_DP)
+                    handle = reduce_tensor(tensor=param.grad, parallel_mode=ParallelMode.ZERO3_DP)
+                    handle.wait()

        # compute norm
        found_inf = False
--- a/internlm/utils/model_checkpoint.py
+++ b/internlm/utils/model_checkpoint.py
@@ -309,7 +309,7 @@ def load_model_checkpoint(folder, model):
    """
    There should be weights with names similar to the following under the folder.
    - folder
-        - model_tp{tp_rank}_pp{pp_rank}.pt\
+        - model_tp{tp_rank}_pp{pp_rank}.pt

    If fsdp is activated, the saved weight is named:
    - folder
@@ -345,15 +345,16 @@ def load_model_checkpoint(folder, model):
                max_pp = max(max_pp, int(segements[-1][2:]))
                max_tp = max(max_tp, int(segements[-2][2:]))

-    assert (
-        zo_size == max_zo + 1
-    ), f"The weights are save for {max_zo+1} FSDP shards , while current has {zo_size} FSDP shards"
    assert (
        pp_size == max_pp + 1
    ), f"The weights are save for {max_pp+1} pipelines, while current has {pp_size} pipelines"
    assert (
        tp_size == max_tp + 1
    ), f"The weights are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism"
+    if gpc.config.parallel.use_fsdp:
+        assert (
+            zo_size == max_zo + 1
+        ), f"The weights are save for {max_zo+1} FSDP shards , while current has {zo_size} FSDP shards"

    if gpc.config.parallel.use_fsdp:
        should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank}.pt"