From 056996f8b31e8c6b42ec31784e9f72dd26c22a11 Mon Sep 17 00:00:00 2001 From: huangting4201 <1538303371@qq.com> Date: Tue, 26 Sep 2023 16:54:29 +0800 Subject: [PATCH] fix(fsdp_optimizer.py): wait grad async --- internlm/solver/optimizer/fsdp_optimizer.py | 3 ++- internlm/utils/model_checkpoint.py | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/internlm/solver/optimizer/fsdp_optimizer.py b/internlm/solver/optimizer/fsdp_optimizer.py index c08b584..a19de0e 100644 --- a/internlm/solver/optimizer/fsdp_optimizer.py +++ b/internlm/solver/optimizer/fsdp_optimizer.py @@ -100,7 +100,8 @@ class FSDPadaptOptimizer(BaseOptimizer): params = self._fp16_param_groups[group_idx] for param in params: if param.requires_grad: - reduce_tensor(tensor=param.grad, parallel_mode=ParallelMode.ZERO3_DP) + handle = reduce_tensor(tensor=param.grad, parallel_mode=ParallelMode.ZERO3_DP) + handle.wait() # compute norm found_inf = False diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py index 75adab6..095a61b 100644 --- a/internlm/utils/model_checkpoint.py +++ b/internlm/utils/model_checkpoint.py @@ -309,7 +309,7 @@ def load_model_checkpoint(folder, model): """ There should be weights with names similar to the following under the folder. - folder - - model_tp{tp_rank}_pp{pp_rank}.pt\ + - model_tp{tp_rank}_pp{pp_rank}.pt If fsdp is activated, the saved weight is named: - folder @@ -345,15 +345,16 @@ def load_model_checkpoint(folder, model): max_pp = max(max_pp, int(segements[-1][2:])) max_tp = max(max_tp, int(segements[-2][2:])) - assert ( - zo_size == max_zo + 1 - ), f"The weights are save for {max_zo+1} FSDP shards , while current has {zo_size} FSDP shards" assert ( pp_size == max_pp + 1 ), f"The weights are save for {max_pp+1} pipelines, while current has {pp_size} pipelines" assert ( tp_size == max_tp + 1 ), f"The weights are save for {max_tp+1} parallelism, while current has {tp_size} tensor parallelism" + if gpc.config.parallel.use_fsdp: + assert ( + zo_size == max_zo + 1 + ), f"The weights are save for {max_zo+1} FSDP shards , while current has {zo_size} FSDP shards" if gpc.config.parallel.use_fsdp: should_load_name = f"model_tp{tp_rank}_pp{pp_rank}_zo{zo_rank}.pt"