delete old sync logic

pull/448/head
Qu Wenwen 2023-10-27 11:05:17 +08:00
parent 15ff413362
commit a81a4a4ba8
5 changed files with 12 additions and 40 deletions

View File

@@ -127,11 +127,6 @@ class PackedFlashBaseLayer1D(nn.Module):
        else:
            self.norm1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
            self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)

-       for param in self.norm1.parameters():
-           param.is_norm = True
-       for param in self.norm2.parameters():
-           param.is_norm = True
-
        set_fp32_attr_to_module(self.norm1)
        set_fp32_attr_to_module(self.norm2)
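
Note: the removed per-parameter is_norm flags are replaced by a single set_fp32_attr_to_module call per norm module. A minimal, hypothetical sketch of what such a helper could look like (the actual helper in internlm may differ) is simply tagging every parameter of the module:

import torch.nn as nn

def set_fp32_attr_to_module(module: nn.Module) -> None:
    # Illustrative sketch only: flag each parameter so later passes
    # (gradient handling / optimizer) can keep this module in fp32.
    for param in module.parameters():
        setattr(param, "is_fp32", True)  # attribute name is an assumption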

View File

@@ -215,18 +215,6 @@ def is_moe_param(param: torch.Tensor) -> bool:
    return False


-def is_gate_param(param: torch.Tensor) -> bool:
-    if hasattr(param, "is_gate") and param.is_gate:
-        return True
-    return False
-
-
-def is_norm_param(param: torch.Tensor) -> bool:
-    if hasattr(param, "is_norm") and param.is_norm:
-        return True
-    return False
-
-
def Silu(w1_o, w2_o):
    return F.silu(w1_o) * w2_o
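
Note: the deleted helpers follow the same attribute-flag convention as the surviving is_moe_param: a module tags its parameters with a boolean attribute at construction time, and the helper later checks for it. A small self-contained illustration of that pattern (names are illustrative only):

import torch

def is_tagged_param(param: torch.Tensor, flag: str) -> bool:
    # Generic form of the removed is_gate_param / is_norm_param checks.
    return getattr(param, flag, False)

weight = torch.nn.Parameter(torch.randn(8, 8))
weight.is_gate = True                      # tagging, as the removed TopKGate loop below did
print(is_tagged_param(weight, "is_gate"))  # True
print(is_tagged_param(weight, "is_norm"))  # False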

View File

@@ -352,9 +352,6 @@ class TopKGate(Module):
        self.drop_tokens = drop_tokens
        self.use_rts = use_rts

-       for param in self.wg.parameters():
-           param.is_gate = True
-
    def forward(
        self, inputs: torch.Tensor, used_token: torch.Tensor = None
    ) -> Tuple[Tensor, Tensor, Tensor]:  # type: ignore

View File

@@ -657,16 +657,16 @@ class HybridZeroOptimizer(BaseOptimizer):
            # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
            # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
-           is_tp_sync_groups = (
-               self._is_norm_group(self.optim.param_groups[group_id]),
-               self._is_gate_group(self.optim.param_groups[group_id]),
-           )
-           if any(is_tp_sync_groups):
-               dist.all_reduce(
-                   flat_fp32_avg_grads,
-                   op=dist.ReduceOp.AVG,
-                   group=gpc.get_group(ParallelMode.TENSOR),
-               )
+           # is_tp_sync_groups = (
+           #     self._is_norm_group(self.optim.param_groups[group_id]),
+           #     self._is_gate_group(self.optim.param_groups[group_id]),
+           # )
+           # if any(is_tp_sync_groups):
+           #     dist.all_reduce(
+           #         flat_fp32_avg_grads,
+           #         op=dist.ReduceOp.AVG,
+           #         group=gpc.get_group(ParallelMode.TENSOR),
+           #     )
            single_grad_partition_groups.append(flat_fp32_avg_grads)
            device = self._fp32_flat_param_groups_of_current_rank[group_id].device
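
Note: the comment kept above the removed block explains the original intent: parameters replicated across a tensor-parallel group (norm weights, the MoE gate) can accumulate slightly different gradients on each rank, so their flattened fp32 gradients were averaged across the TP group before the optimizer step. A stripped-down sketch of that synchronization, with assumed names for the gradient buffer and process group:

import torch
import torch.distributed as dist

def sync_replicated_grads(flat_grads: torch.Tensor, tp_group: dist.ProcessGroup) -> None:
    # Average the flattened fp32 gradients across the tensor-parallel group
    # so replicated parameters stay bit-identical on every TP rank.
    # ReduceOp.AVG requires the NCCL backend (available since torch 1.11).
    dist.all_reduce(flat_grads, op=dist.ReduceOp.AVG, group=tp_group)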

View File

@@ -4,7 +4,7 @@ import torch

from internlm.core.context.parallel_context import ParallelMode
from internlm.core.context.parallel_context import global_context as gpc
-from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param
+from internlm.model.utils import is_moe_param


def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]:
@@ -40,9 +40,6 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict])
    new_groups = {}
    new_groups["fp32"] = {"name": "fp32", "params": [], "dp_mode": ParallelMode.DATA}
    if gpc.config.model.get("num_experts", 0) > 1:
-       # norm and gate are special group to force sync (when enable MoE).
-       for key in ["gate", "norm"]:
-           new_groups[key] = {"name": key, key: True, "params": [], "dp_mode": ParallelMode.DATA}
        for key in gpc.expert_parallel_group_names:
            new_groups[key] = {"name": key, "moe": True, "params": [], "dp_mode": ParallelMode.EXPERT_DATA}
@@ -58,12 +55,7 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict])
        # first split the norm and gate groups, which are special case to force sync (when enable MoE),
        # then fp32 group and the moe group.
        for param in pgroup["params"]:
-           if gpc.config.model.get("num_experts", 0) > 1 and is_norm_param(param):
-               new_groups["norm"]["params"].append(param)
-           # gate param means MoE is enabled
-           elif is_gate_param(param):
-               new_groups["gate"]["params"].append(param)
-           elif param.dtype == torch.float32:
+           if param.dtype == torch.float32:
                new_groups["fp32"]["params"].append(param)
            # moe param means MoE is enabled
            elif is_moe_param(param):
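
Note: after this change the splitting loop only distinguishes fp32 parameters and expert (MoE) parameters. The resulting dicts are ordinary torch.optim parameter groups; keys beyond "params" (such as "name", "moe" or "dp_mode") are carried along unchanged in optimizer.param_groups, which is presumably how checks like _is_gate_group identified groups before this commit. A small hypothetical usage sketch:

import torch

# Two illustrative groups in the shape produced above; extra keys are
# preserved verbatim inside optimizer.param_groups.
groups = [
    {"name": "fp32", "params": [torch.nn.Parameter(torch.randn(4))]},
    {"name": "expert0", "moe": True, "params": [torch.nn.Parameter(torch.randn(4))]},
]
optimizer = torch.optim.AdamW(groups, lr=1e-4)
print([(g["name"], g.get("moe", False)) for g in optimizer.param_groups])
# [('fp32', False), ('expert0', True)]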