From a81a4a4ba823e76e0745476bd087a66d531a291b Mon Sep 17 00:00:00 2001 From: Qu Wenwen Date: Fri, 27 Oct 2023 11:05:17 +0800 Subject: [PATCH] delete old sync logic --- internlm/model/modeling_moe.py | 5 ----- internlm/model/utils.py | 12 ----------- internlm/moe/sharded_moe.py | 3 --- .../solver/optimizer/hybrid_zero_optim.py | 20 +++++++++---------- internlm/train/utils.py | 12 ++--------- 5 files changed, 12 insertions(+), 40 deletions(-) diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py index 43489bc..df6c7a8 100644 --- a/internlm/model/modeling_moe.py +++ b/internlm/model/modeling_moe.py @@ -127,11 +127,6 @@ class PackedFlashBaseLayer1D(nn.Module): else: self.norm1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon) - - for param in self.norm1.parameters(): - param.is_norm = True - for param in self.norm2.parameters(): - param.is_norm = True set_fp32_attr_to_module(self.norm1) set_fp32_attr_to_module(self.norm2) diff --git a/internlm/model/utils.py b/internlm/model/utils.py index 570a86f..46fba59 100644 --- a/internlm/model/utils.py +++ b/internlm/model/utils.py @@ -215,18 +215,6 @@ def is_moe_param(param: torch.Tensor) -> bool: return False -def is_gate_param(param: torch.Tensor) -> bool: - if hasattr(param, "is_gate") and param.is_gate: - return True - return False - - -def is_norm_param(param: torch.Tensor) -> bool: - if hasattr(param, "is_norm") and param.is_norm: - return True - return False - - def Silu(w1_o, w2_o): return F.silu(w1_o) * w2_o diff --git a/internlm/moe/sharded_moe.py b/internlm/moe/sharded_moe.py index dbee2a4..5d695ac 100644 --- a/internlm/moe/sharded_moe.py +++ b/internlm/moe/sharded_moe.py @@ -352,9 +352,6 @@ class TopKGate(Module): self.drop_tokens = drop_tokens self.use_rts = use_rts - for param in self.wg.parameters(): - param.is_gate = True - def forward( self, inputs: torch.Tensor, used_token: torch.Tensor = None ) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py index 5004f8a..5d81300 100644 --- a/internlm/solver/optimizer/hybrid_zero_optim.py +++ b/internlm/solver/optimizer/hybrid_zero_optim.py @@ -657,16 +657,16 @@ class HybridZeroOptimizer(BaseOptimizer): # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients. # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors. 
- is_tp_sync_groups = ( - self._is_norm_group(self.optim.param_groups[group_id]), - self._is_gate_group(self.optim.param_groups[group_id]), - ) - if any(is_tp_sync_groups): - dist.all_reduce( - flat_fp32_avg_grads, - op=dist.ReduceOp.AVG, - group=gpc.get_group(ParallelMode.TENSOR), - ) + # is_tp_sync_groups = ( + # self._is_norm_group(self.optim.param_groups[group_id]), + # self._is_gate_group(self.optim.param_groups[group_id]), + # ) + # if any(is_tp_sync_groups): + # dist.all_reduce( + # flat_fp32_avg_grads, + # op=dist.ReduceOp.AVG, + # group=gpc.get_group(ParallelMode.TENSOR), + # ) single_grad_partition_groups.append(flat_fp32_avg_grads) device = self._fp32_flat_param_groups_of_current_rank[group_id].device diff --git a/internlm/train/utils.py b/internlm/train/utils.py index 9096a2a..ff59597 100644 --- a/internlm/train/utils.py +++ b/internlm/train/utils.py @@ -4,7 +4,7 @@ import torch from internlm.core.context.parallel_context import ParallelMode from internlm.core.context.parallel_context import global_context as gpc -from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param +from internlm.model.utils import is_moe_param def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]: @@ -40,9 +40,6 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) new_groups = {} new_groups["fp32"] = {"name": "fp32", "params": [], "dp_mode": ParallelMode.DATA} if gpc.config.model.get("num_experts", 0) > 1: - # norm and gate are special group to force sync (when enable MoE). - for key in ["gate", "norm"]: - new_groups[key] = {"name": key, key: True, "params": [], "dp_mode": ParallelMode.DATA} for key in gpc.expert_parallel_group_names: new_groups[key] = {"name": key, "moe": True, "params": [], "dp_mode": ParallelMode.EXPERT_DATA} @@ -58,12 +55,7 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) # first split the norm and gate groups, which are special case to force sync (when enable MoE), # then fp32 group and the moe group. for param in pgroup["params"]: - if gpc.config.model.get("num_experts", 0) > 1 and is_norm_param(param): - new_groups["norm"]["params"].append(param) - # gate param means MoE is enabled - elif is_gate_param(param): - new_groups["gate"]["params"].append(param) - elif param.dtype == torch.float32: + if param.dtype == torch.float32: new_groups["fp32"]["params"].append(param) # moe param means MoE is enabled elif is_moe_param(param):
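
Note (illustrative only, not part of the patch and ignored by git apply): the synchronization that this change disables reduces to an all-reduce that averages the flattened fp32 gradient shard across the tensor-parallel ranks. A minimal sketch of that pattern, assuming torch.distributed is already initialized; the tp_group argument is a hypothetical stand-in for gpc.get_group(ParallelMode.TENSOR), and ReduceOp.AVG needs a recent PyTorch with the NCCL backend.

    import torch
    import torch.distributed as dist

    def sync_grads_across_tp_group(flat_fp32_avg_grads: torch.Tensor, tp_group) -> None:
        # Average the flattened fp32 gradients across every rank of the
        # tensor-parallel process group, mirroring the block this patch
        # comments out in hybrid_zero_optim.py.
        dist.all_reduce(flat_fp32_avg_grads, op=dist.ReduceOp.AVG, group=tp_group)

After this patch no such TP-group reduction is performed for the norm and gate parameters; they no longer get dedicated "norm"/"gate" groups and instead follow the ordinary fp32/MoE grouping path in split_params_into_different_groups_for_optimizer.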