delete old sync logic

pull/448/head
Qu Wenwen 2023-10-27 11:05:17 +08:00
parent 15ff413362
commit a81a4a4ba8
5 changed files with 12 additions and 40 deletions

View File

@@ -127,11 +127,6 @@ class PackedFlashBaseLayer1D(nn.Module):
        else:
            self.norm1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
            self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)

-       for param in self.norm1.parameters():
-           param.is_norm = True
-       for param in self.norm2.parameters():
-           param.is_norm = True
-
        set_fp32_attr_to_module(self.norm1)
        set_fp32_attr_to_module(self.norm2)
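
Note: the removed per-parameter is_norm flags are replaced by a single set_fp32_attr_to_module call per norm module. A minimal, hypothetical sketch of what such a helper could look like (the actual helper in internlm may differ) is simply tagging every parameter of the module:

import torch.nn as nn

def set_fp32_attr_to_module(module: nn.Module) -> None:
    # Illustrative sketch only: flag each parameter so later passes
    # (gradient handling / optimizer) can keep this module in fp32.
    for param in module.parameters():
        setattr(param, "is_fp32", True)  # attribute name is an assumption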

View File

@@ -215,18 +215,6 @@ def is_moe_param(param: torch.Tensor) -> bool:
    return False


-def is_gate_param(param: torch.Tensor) -> bool:
-    if hasattr(param, "is_gate") and param.is_gate:
-        return True
-    return False
-
-
-def is_norm_param(param: torch.Tensor) -> bool:
-    if hasattr(param, "is_norm") and param.is_norm:
-        return True
-    return False
-
-
def Silu(w1_o, w2_o):
    return F.silu(w1_o) * w2_o
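
Note: the deleted helpers follow the same attribute-flag convention as the surviving is_moe_param: a module tags its parameters with a boolean attribute at construction time, and the helper later checks for it. A small self-contained illustration of that pattern (names are illustrative only):

import torch

def is_tagged_param(param: torch.Tensor, flag: str) -> bool:
    # Generic form of the removed is_gate_param / is_norm_param checks.
    return getattr(param, flag, False)

weight = torch.nn.Parameter(torch.randn(8, 8))
weight.is_gate = True                      # tagging, as the removed TopKGate loop below did
print(is_tagged_param(weight, "is_gate"))  # True
print(is_tagged_param(weight, "is_norm"))  # False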

View File

@@ -352,9 +352,6 @@ class TopKGate(Module):
        self.drop_tokens = drop_tokens
        self.use_rts = use_rts

-       for param in self.wg.parameters():
-           param.is_gate = True
-
    def forward(
        self, inputs: torch.Tensor, used_token: torch.Tensor = None
    ) -> Tuple[Tensor, Tensor, Tensor]:  # type: ignore

View File

@@ -657,16 +657,16 @@ class HybridZeroOptimizer(BaseOptimizer):
            # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
            # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
-           is_tp_sync_groups = (
-               self._is_norm_group(self.optim.param_groups[group_id]),
-               self._is_gate_group(self.optim.param_groups[group_id]),
-           )
-           if any(is_tp_sync_groups):
-               dist.all_reduce(
-                   flat_fp32_avg_grads,
-                   op=dist.ReduceOp.AVG,
-                   group=gpc.get_group(ParallelMode.TENSOR),
-               )
+           # is_tp_sync_groups = (
+           #     self._is_norm_group(self.optim.param_groups[group_id]),
+           #     self._is_gate_group(self.optim.param_groups[group_id]),
+           # )
+           # if any(is_tp_sync_groups):
+           #     dist.all_reduce(
+           #         flat_fp32_avg_grads,
+           #         op=dist.ReduceOp.AVG,
+           #         group=gpc.get_group(ParallelMode.TENSOR),
+           #     )
            single_grad_partition_groups.append(flat_fp32_avg_grads)
            device = self._fp32_flat_param_groups_of_current_rank[group_id].device
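
Note: the comment kept above the removed block explains the original intent: parameters replicated across a tensor-parallel group (norm weights, the MoE gate) can accumulate slightly different gradients on each rank, so their flattened fp32 gradients were averaged across the TP group before the optimizer step. A stripped-down sketch of that synchronization, with assumed names for the gradient buffer and process group:

import torch
import torch.distributed as dist

def sync_replicated_grads(flat_grads: torch.Tensor, tp_group: dist.ProcessGroup) -> None:
    # Average the flattened fp32 gradients across the tensor-parallel group
    # so replicated parameters stay bit-identical on every TP rank.
    # ReduceOp.AVG requires the NCCL backend (available since torch 1.11).
    dist.all_reduce(flat_grads, op=dist.ReduceOp.AVG, group=tp_group)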

View File

@@ -4,7 +4,7 @@ import torch

from internlm.core.context.parallel_context import ParallelMode
from internlm.core.context.parallel_context import global_context as gpc
-from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param
+from internlm.model.utils import is_moe_param


def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]:
@@ -40,9 +40,6 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict])
    new_groups = {}
    new_groups["fp32"] = {"name": "fp32", "params": [], "dp_mode": ParallelMode.DATA}
    if gpc.config.model.get("num_experts", 0) > 1:
-       # norm and gate are special group to force sync (when enable MoE).
-       for key in ["gate", "norm"]:
-           new_groups[key] = {"name": key, key: True, "params": [], "dp_mode": ParallelMode.DATA}
        for key in gpc.expert_parallel_group_names:
            new_groups[key] = {"name": key, "moe": True, "params": [], "dp_mode": ParallelMode.EXPERT_DATA}
@@ -58,12 +55,7 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict])
        # first split the norm and gate groups, which are special case to force sync (when enable MoE),
        # then fp32 group and the moe group.
        for param in pgroup["params"]:
-           if gpc.config.model.get("num_experts", 0) > 1 and is_norm_param(param):
-               new_groups["norm"]["params"].append(param)
-           # gate param means MoE is enabled
-           elif is_gate_param(param):
-               new_groups["gate"]["params"].append(param)
-           elif param.dtype == torch.float32:
+           if param.dtype == torch.float32:
                new_groups["fp32"]["params"].append(param)
            # moe param means MoE is enabled
            elif is_moe_param(param):
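
Note: after this change the splitting loop only distinguishes fp32 parameters and expert (MoE) parameters. The resulting dicts are ordinary torch.optim parameter groups; keys beyond "params" (such as "name", "moe" or "dp_mode") are carried along unchanged in optimizer.param_groups, which is presumably how checks like _is_gate_group identified groups before this commit. A small hypothetical usage sketch:

import torch

# Two illustrative groups in the shape produced above; extra keys are
# preserved verbatim inside optimizer.param_groups.
groups = [
    {"name": "fp32", "params": [torch.nn.Parameter(torch.randn(4))]},
    {"name": "expert0", "moe": True, "params": [torch.nn.Parameter(torch.randn(4))]},
]
optimizer = torch.optim.AdamW(groups, lr=1e-4)
print([(g["name"], g.get("moe", False)) for g in optimizer.param_groups])
# [('fp32', False), ('expert0', True)]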