From a81a4a4ba823e76e0745476bd087a66d531a291b Mon Sep 17 00:00:00 2001 From: Qu Wenwen Date: Fri, 27 Oct 2023 11:05:17 +0800 Subject: [PATCH] delete old sync logic --- internlm/model/modeling_moe.py | 5 ----- internlm/model/utils.py | 12 ----------- internlm/moe/sharded_moe.py | 3 --- .../solver/optimizer/hybrid_zero_optim.py | 20 +++++++++---------- internlm/train/utils.py | 12 ++--------- 5 files changed, 12 insertions(+), 40 deletions(-) diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 43489bc..df6c7a8 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -127,11 +127,6 @@ class PackedFlashBaseLayer1D(nn.Module): else: self.norm1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) - - for param in self.norm1.parameters(): - param.is_norm = True - for param in self.norm2.parameters(): - param.is_norm = True set_fp32_attr_to_module(self.norm1) set_fp32_attr_to_module(self.norm2) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 570a86f..46fba59 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -215,18 +215,6 @@ def is_moe_param(param: torch.Tensor) -> bool: return False -def is_gate_param(param: torch.Tensor) -> bool: - if hasattr(param, "is_gate") and param.is_gate: - return True - return False - - -def is_norm_param(param: torch.Tensor) -> bool: - if hasattr(param, "is_norm") and param.is_norm: - return True - return False - - def Silu(w1_o, w2_o): return F.silu(w1_o) * w2_o diff --git a/internlm/moe/sharded_moe.py b/internlm/moe/sharded_moe.py index dbee2a4..5d695ac 100644 --- a/internlm/moe/sharded_moe.py +++ b/internlm/moe/sharded_moe.py @@ -352,9 +352,6 @@ class TopKGate(Module): self.drop_tokens = drop_tokens self.use_rts = use_rts - for param in self.wg.parameters(): - param.is_gate = True - def forward( self, inputs: torch.Tensor, used_token: torch.Tensor = None ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 5004f8a..5d81300 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -657,16 +657,16 @@ class HybridZeroOptimizer(BaseOptimizer): # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients. # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors. 
- is_tp_sync_groups = ( - self._is_norm_group(self.optim.param_groups[group_id]), - self._is_gate_group(self.optim.param_groups[group_id]), - ) - if any(is_tp_sync_groups): - dist.all_reduce( - flat_fp32_avg_grads, - op=dist.ReduceOp.AVG, - group=gpc.get_group(ParallelMode.TENSOR), - ) + # is_tp_sync_groups = ( + # self._is_norm_group(self.optim.param_groups[group_id]), + # self._is_gate_group(self.optim.param_groups[group_id]), + # ) + # if any(is_tp_sync_groups): + # dist.all_reduce( + # flat_fp32_avg_grads, + # op=dist.ReduceOp.AVG, + # group=gpc.get_group(ParallelMode.TENSOR), + # ) single_grad_partition_groups.append(flat_fp32_avg_grads) device = self._fp32_flat_param_groups_of_current_rank[group_id].device diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 9096a2a..ff59597 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -4,7 +4,7 @@ import torch from internlm.core.context.parallel_context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc -from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param +from internlm.model.utils import is_moe_param def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]: @@ -40,9 +40,6 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) new_groups = {} new_groups["fp32"] = {"name": "fp32", "params": [], "dp_mode": ParallelMode.DATA} if gpc.config.model.get("num_experts", 0) > 1: - # norm and gate are special group to force sync (when enable MoE). - for key in ["gate", "norm"]: - new_groups[key] = {"name": key, key: True, "params": [], "dp_mode": ParallelMode.DATA} for key in gpc.expert_parallel_group_names: new_groups[key] = {"name": key, "moe": True, "params": [], "dp_mode": ParallelMode.EXPERT_DATA} @@ -58,12 +55,7 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) # first split the norm and gate groups, which are special case to force sync (when enable MoE), # then fp32 group and the moe group. for param in pgroup["params"]: - if gpc.config.model.get("num_experts", 0) > 1 and is_norm_param(param): - new_groups["norm"]["params"].append(param) - # gate param means MoE is enabled - elif is_gate_param(param): - new_groups["gate"]["params"].append(param) - elif param.dtype == torch.float32: + if param.dtype == torch.float32: new_groups["fp32"]["params"].append(param) # moe param means MoE is enabled elif is_moe_param(param):
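
Note (illustrative only, not part of the patch and ignored by git apply): the synchronization that this change disables reduces to an all-reduce that averages the flattened fp32 gradient shard across the tensor-parallel ranks. A minimal sketch of that pattern, assuming torch.distributed is already initialized; the tp_group argument is a hypothetical stand-in for gpc.get_group(ParallelMode.TENSOR), and ReduceOp.AVG needs a recent PyTorch with the NCCL backend.

    import torch
    import torch.distributed as dist

    def sync_grads_across_tp_group(flat_fp32_avg_grads: torch.Tensor, tp_group) -> None:
        # Average the flattened fp32 gradients across every rank of the
        # tensor-parallel process group, mirroring the block this patch
        # comments out in hybrid_zero_optim.py.
        dist.all_reduce(flat_fp32_avg_grads, op=dist.ReduceOp.AVG, group=tp_group)

After this patch no such TP-group reduction is performed for the norm and gate parameters; they no longer get dedicated "norm"/"gate" groups and instead follow the ordinary fp32/MoE grouping path in split_params_into_different_groups_for_optimizer.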