diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index 3ad185d..4cb64c3 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -160,37 +160,9 @@ class PackedFlashBaseLayer1D(nn.Module):
                 dtype=dtype,
             )
         else:
-            experts = torch.nn.ModuleList(
-                [
-                    FeedForward(
-                        hidden_size,
-                        int(hidden_size * gpc.config.model.mlp_ratio),
-                        out_features=hidden_size,
-                        process_group=gpc.get_group(ParallelMode.TENSOR),
-                        bias=False,
-                        device=torch.device("cuda"),
-                        dtype=torch.float,
-                    )
-                    for i in range(num_experts // ep_size)
-                ]
-            )
-
-            # residual network, see https://arxiv.org/pdf/2201.05596.pdf, seems useful for convergence
-            if moe_use_residual:
-                residual_mlp = FeedForward(
-                    hidden_size,
-                    int(hidden_size * gpc.config.model.mlp_ratio),
-                    out_features=hidden_size,
-                    process_group=gpc.get_group(ParallelMode.TENSOR),
-                    bias=False,
-                    device=torch.device("cuda"),
-                    dtype=torch.float,
-                )
-
             # replace mlp by MoE module. The expert in MoE is a FeedForward module.
             self.mlp = MoE(
                 hidden_size=hidden_size,
-                experts=experts,
                 num_experts=num_experts,
                 ep_size=ep_size,
                 k=moe_gate_k,
@@ -201,7 +173,6 @@ class PackedFlashBaseLayer1D(nn.Module):
                 drop_tokens=moe_drop_tokens,
                 use_rts=moe_use_rts,
                 use_residual=moe_use_residual,
-                residual_mlp=residual_mlp if moe_use_residual else None,
             )
 
         self.dropout2 = nn.Dropout(drop_rate)
diff --git a/internlm/model/moe.py b/internlm/model/moe.py
index 1504838..8ddbc48 100644
--- a/internlm/model/moe.py
+++ b/internlm/model/moe.py
@@ -5,6 +5,7 @@
 import torch
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
+from internlm.model.linear import FeedForward
 from internlm.moe.experts import Experts
 from internlm.moe.sharded_moe import MOELayer, TopKGate
 from internlm.utils.logger import get_logger
@@ -63,7 +64,6 @@ class MoE(torch.nn.Module):
     def __init__(
         self,
         hidden_size,
-        experts,
         num_experts=1,
         ep_size=1,
         k=1,
@@ -75,7 +75,6 @@ class MoE(torch.nn.Module):
         use_rts: bool = True,
         using_default_moe: bool = True,
         use_residual=False,
-        residual_mlp=None,
     ):
 
         super().__init__()
@@ -91,12 +90,26 @@ class MoE(torch.nn.Module):
                 f"Creating MoE layer with num_experts: {num_experts} | num_local_experts:"
                 f"{self.num_local_experts} | expert_parallel_size: {self.ep_size}"
             )
-
        assert noisy_gate_policy is None or noisy_gate_policy in ["None", "Jitter", "RSample"], (
            "Unsupported noisy_gate_policy: " + noisy_gate_policy
        )
 
+        # for elastic expert parallel, experts may have multiple groups
         expert_group_name = f"ep_size_{self.ep_size}"
+        experts = torch.nn.ModuleList(
+            [
+                FeedForward(
+                    hidden_size,
+                    int(hidden_size * gpc.config.model.mlp_ratio),
+                    out_features=hidden_size,
+                    process_group=gpc.get_group(ParallelMode.TENSOR),
+                    bias=False,
+                    device=torch.device("cuda"),
+                    dtype=torch.float,
+                )
+                for _ in range(self.num_local_experts)
+            ]
+        )
         experts = Experts(experts, self.num_local_experts, expert_group_name)
 
         if using_default_moe:
@@ -118,10 +131,19 @@ class MoE(torch.nn.Module):
                 self.num_local_experts,
             )
 
+        # residual network, see https://arxiv.org/pdf/2201.05596.pdf, seems useful for convergence
         self.use_residual = use_residual
         if use_residual:
-            self.residual_mlp = residual_mlp
-            # coefficient is used for weighted sum of the output of expert and mlp
+            self.residual_mlp = FeedForward(
+                hidden_size,
+                int(hidden_size * gpc.config.model.mlp_ratio),
+                out_features=hidden_size,
+                process_group=gpc.get_group(ParallelMode.TENSOR),
+                bias=False,
+                device=torch.device("cuda"),
+                dtype=torch.float,
+            )
+            # coefficient is used for weighted sum of the output of expert and residual mlp
             self.coefficient = torch.nn.Linear(hidden_size, 2)
 
     def forward(self, hidden_states, used_token=None):
diff --git a/internlm/moe/sharded_moe.py b/internlm/moe/sharded_moe.py
index 3bd529b..c450365 100644
--- a/internlm/moe/sharded_moe.py
+++ b/internlm/moe/sharded_moe.py
@@ -356,7 +356,6 @@ class TopKGate(Module):
         # Only top-1 and top-2 are supported at the moment.
         if k not in (1, 2):
             raise ValueError("Only top-1 and top-2 gatings are supported.")
-        # TODO: can we use tensor parallel here?
         # Deepspeed's mechisms, alway use fp32
         self.wg = torch.nn.Linear(model_dim, num_experts, bias=False).float()
         self.k = k
@@ -437,9 +436,6 @@ class MOELayer(Base):
         self.time_moe = 0.0
         self.wall_clock_breakdown = False
 
-    def _set_ep_group(self, ep_group):
-        self.ep_group = ep_group
-
     def forward(self, *inputs: Tensor) -> Tensor:
 
         if self.wall_clock_breakdown:
diff --git a/train.py b/train.py
index 1d43ce1..680f6e4 100644
--- a/train.py
+++ b/train.py
@@ -262,7 +262,7 @@ def main(args):
             start_time=start_time,
             loss=loss,
             moe_loss=moe_loss,
-            grad_norm=np.array(grad_norm_groups),
+            grad_norm=np.linalg.norm(grad_norm_groups),
             metric=metric,
             update_panel=uniscale_logger is not None,
         )
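
The refactor above changes the `MoE` construction API (callers no longer pass `experts` or `residual_mlp`; the module builds its own `FeedForward` experts), and in `train.py` the per-group gradient norms are reduced to a single scalar before logging. Two small illustrative sketches follow; the tensors and values in them are hypothetical and not taken from the repository.

The `coefficient = torch.nn.Linear(hidden_size, 2)` created under `use_residual` follows the residual-MoE idea from the cited paper (https://arxiv.org/pdf/2201.05596.pdf): a per-token softmax weight blends the expert output with the dense residual MLP output. A minimal stand-alone sketch, with random tensors standing in for the real expert and residual outputs:

```python
import torch
import torch.nn.functional as F

batch, seqlen, hidden_size = 2, 4, 8
hidden_states = torch.randn(batch, seqlen, hidden_size)
expert_out = torch.randn(batch, seqlen, hidden_size)    # stand-in for the MoE layer output
residual_out = torch.randn(batch, seqlen, hidden_size)  # stand-in for residual_mlp(hidden_states)

coefficient = torch.nn.Linear(hidden_size, 2)
coef = F.softmax(coefficient(hidden_states), dim=-1)    # per-token weights for (expert, residual)
output = expert_out * coef[..., 0:1] + residual_out * coef[..., 1:2]
print(output.shape)  # torch.Size([2, 4, 8])
```

The `train.py` change replaces the array of per-group gradient norms with their overall L2 norm, so the logger receives one scalar. A quick comparison using a made-up `grad_norm_groups` list:

```python
import numpy as np

grad_norm_groups = [0.8, 1.2, 0.5]          # hypothetical per-parameter-group norms

per_group = np.array(grad_norm_groups)      # before: array([0.8, 1.2, 0.5])
overall = np.linalg.norm(grad_norm_groups)  # after: sqrt(0.8**2 + 1.2**2 + 0.5**2) ≈ 1.53
print(per_group, overall)
```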