mirror of https://github.com/InternLM/InternLM

delete old sync logic

commit a81a4a4ba8 (parent 15ff413362)
@@ -127,11 +127,6 @@ class PackedFlashBaseLayer1D(nn.Module):
         else:
             self.norm1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
             self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
-
-        for param in self.norm1.parameters():
-            param.is_norm = True
-        for param in self.norm2.parameters():
-            param.is_norm = True
         set_fp32_attr_to_module(self.norm1)
         set_fp32_attr_to_module(self.norm2)

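For context on the lines removed above: they follow a simple attribute-tagging pattern, marking each LayerNorm parameter with an ad-hoc is_norm flag so later helpers can filter on it. A minimal stand-alone sketch of that pattern (illustrative, not code from this repo):

import torch.nn as nn

norm = nn.LayerNorm(8)
for p in norm.parameters():
    p.is_norm = True  # ad-hoc Python attribute set directly on the Parameter

# later, any code can recover the tagged parameters
tagged = [p for p in norm.parameters() if getattr(p, "is_norm", False)]
print(len(tagged))  # 2 -> LayerNorm weight and bias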
@@ -215,18 +215,6 @@ def is_moe_param(param: torch.Tensor) -> bool:
     return False


-def is_gate_param(param: torch.Tensor) -> bool:
-    if hasattr(param, "is_gate") and param.is_gate:
-        return True
-    return False
-
-
-def is_norm_param(param: torch.Tensor) -> bool:
-    if hasattr(param, "is_norm") and param.is_norm:
-        return True
-    return False
-
-
 def Silu(w1_o, w2_o):
     return F.silu(w1_o) * w2_o

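The Silu helper kept as context above implements SwiGLU-style gating: elementwise silu(w1_o) * w2_o, so both inputs must share a shape. A quick stand-alone check (not from the repo):

import torch
import torch.nn.functional as F

def Silu(w1_o, w2_o):
    return F.silu(w1_o) * w2_o

w1_o = torch.randn(4, 16)  # "gate" branch activation
w2_o = torch.randn(4, 16)  # "value" branch activation
print(Silu(w1_o, w2_o).shape)  # torch.Size([4, 16])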
@@ -352,9 +352,6 @@ class TopKGate(Module):
         self.drop_tokens = drop_tokens
         self.use_rts = use_rts

-        for param in self.wg.parameters():
-            param.is_gate = True
-
     def forward(
         self, inputs: torch.Tensor, used_token: torch.Tensor = None
     ) -> Tuple[Tensor, Tensor, Tensor]:  # type: ignore
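The removed loop tagged the gate's routing weights in the same way. Assuming wg is a bias-free Linear mapping hidden states to per-expert logits (as in DeepSpeed-style TopKGate implementations; this is an assumption, not code from this repo), the pattern looks like:

import torch
import torch.nn as nn

model_dim, num_experts = 16, 4
wg = nn.Linear(model_dim, num_experts, bias=False)  # assumed routing layer
for p in wg.parameters():
    p.is_gate = True  # the flag the removed lines used to set

logits = wg(torch.randn(2, model_dim))                 # per-token expert scores
print(torch.topk(logits, k=1, dim=-1).indices.shape)   # torch.Size([2, 1])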
@@ -657,16 +657,16 @@ class HybridZeroOptimizer(BaseOptimizer):

             # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
             # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
-            is_tp_sync_groups = (
-                self._is_norm_group(self.optim.param_groups[group_id]),
-                self._is_gate_group(self.optim.param_groups[group_id]),
-            )
-            if any(is_tp_sync_groups):
-                dist.all_reduce(
-                    flat_fp32_avg_grads,
-                    op=dist.ReduceOp.AVG,
-                    group=gpc.get_group(ParallelMode.TENSOR),
-                )
+            # is_tp_sync_groups = (
+            #     self._is_norm_group(self.optim.param_groups[group_id]),
+            #     self._is_gate_group(self.optim.param_groups[group_id]),
+            # )
+            # if any(is_tp_sync_groups):
+            #     dist.all_reduce(
+            #         flat_fp32_avg_grads,
+            #         op=dist.ReduceOp.AVG,
+            #         group=gpc.get_group(ParallelMode.TENSOR),
+            #     )

             single_grad_partition_groups.append(flat_fp32_avg_grads)
             device = self._fp32_flat_param_groups_of_current_rank[group_id].device
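For reference, the block disabled above averaged a flattened fp32 gradient tensor across the ranks of the tensor-parallel group, so parameters replicated in that group (norm, MoE gate) keep identical gradients. A single-process sketch of the same collective (gloo backend; gloo has no ReduceOp.AVG, so SUM plus a divide stands in for what ReduceOp.AVG does on NCCL groups):

import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

flat_grads = torch.randn(8)                       # stand-in for flat_fp32_avg_grads
dist.all_reduce(flat_grads, op=dist.ReduceOp.SUM) # sum across the group
flat_grads /= dist.get_world_size()               # equivalent of ReduceOp.AVG
print(flat_grads.shape)

dist.destroy_process_group()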
@@ -4,7 +4,7 @@ import torch

 from internlm.core.context.parallel_context import ParallelMode
 from internlm.core.context.parallel_context import global_context as gpc
-from internlm.model.utils import is_gate_param, is_moe_param, is_norm_param
+from internlm.model.utils import is_moe_param


 def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict]) -> Tuple[Dict]:
@@ -40,9 +40,6 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict])
     new_groups = {}
     new_groups["fp32"] = {"name": "fp32", "params": [], "dp_mode": ParallelMode.DATA}
     if gpc.config.model.get("num_experts", 0) > 1:
-        # norm and gate are special group to force sync (when enable MoE).
-        for key in ["gate", "norm"]:
-            new_groups[key] = {"name": key, key: True, "params": [], "dp_mode": ParallelMode.DATA}
         for key in gpc.expert_parallel_group_names:
             new_groups[key] = {"name": key, "moe": True, "params": [], "dp_mode": ParallelMode.EXPERT_DATA}

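After this change, new_groups starts with only the fp32 group plus one group per expert-parallel group when MoE is enabled. An illustrative literal of the resulting structure (the expert group name below is a placeholder; real keys come from gpc.expert_parallel_group_names):

from enum import Enum

class ParallelMode(Enum):  # stand-in for internlm.core.context.ParallelMode
    DATA = "data"
    EXPERT_DATA = "expert_data"

new_groups = {
    "fp32": {"name": "fp32", "params": [], "dp_mode": ParallelMode.DATA},
    # one entry like this per name in gpc.expert_parallel_group_names (only if num_experts > 1)
    "moe_ep_0": {"name": "moe_ep_0", "moe": True, "params": [], "dp_mode": ParallelMode.EXPERT_DATA},
}
print(list(new_groups))  # ['fp32', 'moe_ep_0']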
@@ -58,12 +55,7 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict])
     # first split the norm and gate groups, which are special case to force sync (when enable MoE),
     # then fp32 group and the moe group.
     for param in pgroup["params"]:
-        if gpc.config.model.get("num_experts", 0) > 1 and is_norm_param(param):
-            new_groups["norm"]["params"].append(param)
-        # gate param means MoE is enabled
-        elif is_gate_param(param):
-            new_groups["gate"]["params"].append(param)
-        elif param.dtype == torch.float32:
+        if param.dtype == torch.float32:
             new_groups["fp32"]["params"].append(param)
         # moe param means MoE is enabled
         elif is_moe_param(param):
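Putting the last two hunks together, the per-parameter routing reduces to: fp32 params go to the "fp32" group, MoE-tagged params to their expert group, and everything else stays in its original group. A compact stand-alone sketch (the is_expert attribute name is an assumption used only for this illustration):

import torch

def is_moe_param(param: torch.Tensor) -> bool:
    # mirrors the attribute-check style of internlm.model.utils.is_moe_param;
    # the exact attribute name is assumed here
    return getattr(param, "is_expert", False)

params = [
    torch.nn.Parameter(torch.zeros(2, dtype=torch.float32)),
    torch.nn.Parameter(torch.zeros(2, dtype=torch.bfloat16)),
]
params[1].is_expert = True

fp32_group, moe_group, origin = [], [], []
for param in params:
    if param.dtype == torch.float32:
        fp32_group.append(param)
    elif is_moe_param(param):
        moe_group.append(param)
    else:
        origin.append(param)
print(len(fp32_group), len(moe_group), len(origin))  # 1 1 0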