diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index 09a10c8..a8f1973 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -57,9 +57,11 @@ class PackedFlashBaseLayer1D(nn.Module):
         moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
         moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
         moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
-        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to
+            infinite capacity).
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
-        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
+            (https://arxiv.org/abs/2201.05596) layer.
     """

     def __init__(
@@ -184,7 +186,7 @@ class PackedFlashBaseLayer1D(nn.Module):
                     device=torch.device("cuda"),
                     dtype=torch.float,
                 )
-
+
                 self.mlp = MoE(
                     hidden_size=hidden_size,
                     experts=experts,
@@ -332,9 +334,11 @@ class PackedFlashInternLm1D(nn.Module):
         moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
         moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
         moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
-        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent
+            to infinite capacity).
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
-        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
+            (https://arxiv.org/abs/2201.05596) layer.
     """

     def __init__(
@@ -517,7 +521,7 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"),
     all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
     parts = all_parts[pipeline_rank]
     if gpc.is_rank_for_log():
-        logger.info(f"The layer sharding is {all_parts}.")
+        logger.info(f"The layer sharding is {all_parts}.")  # pylint: disable=W1203

     models = []

@@ -578,7 +582,7 @@ def build_model_with_cfg(
     moe_noisy_gate_policy: str = None,
     moe_drop_tokens: bool = True,
     moe_use_rts: bool = True,
-    moe_use_residual: bool = False,
+    moe_use_residual: bool = True,
 ):
     """
     Builde model with config
@@ -615,9 +619,11 @@ def build_model_with_cfg(
         moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
         moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
         moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
-        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent
+            to infinite capacity).
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
-        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
+        moe_use_residual (bool, optional): default=True, make this MoE layer a Residual MoE
+            (https://arxiv.org/abs/2201.05596) layer.
     """

     cfg = dict(
diff --git a/internlm/model/moe.py b/internlm/model/moe.py
index 4c2722c..04a39bd 100644
--- a/internlm/model/moe.py
+++ b/internlm/model/moe.py
@@ -50,9 +50,13 @@ class MoE(torch.nn.Module):
         min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
         noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'
             or 'None'.
+        using_default_moe (bool, optional): default=True, whether to use the default MoE layer.
         drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent
             to infinite capacity).
         use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        use_residual (bool, optional): default=True, make this MoE layer a Residual MoE
+            (https://arxiv.org/abs/2201.05596) layer.
+        residual_mlp (torch.nn.Module, optional): default=None, the torch module that defines the residual MLP.
     """

     def __init__(
@@ -70,7 +74,7 @@ class MoE(torch.nn.Module):
         use_rts: bool = True,
         using_default_moe: bool = True,
         use_residual=True,
-        residual_mlp=None
+        residual_mlp=None,
     ):
         super().__init__()

@@ -82,7 +86,7 @@ class MoE(torch.nn.Module):
         self.num_experts = num_experts
         self.num_local_experts = num_experts // self.ep_size

-        logger.info(
+        logger.info(  # pylint: disable=W1203
             f"Creating MoE layer with num_experts: {num_experts} | num_local_experts:"
             f"{self.num_local_experts} | expert_parallel_size: {self.ep_size}"
         )
@@ -136,11 +140,11 @@ class MoE(torch.nn.Module):
         """
         output = self.moe_layer(hidden_states, used_token)
         if self.use_residual:
-            # Residual MoE
-            output_mlp = self.residual_mlp(hidden_states)
-            if type(output_mlp) is tuple:
-                output_mlp = output_mlp[0]  # Ignore the bias term for now
-            coef = self.coefficient(hidden_states)
-            coef = torch.nn.functional.softmax(coef, dim=-1)
-            output = output * coef[..., 0:1] + output_mlp * coef[..., 1:]
+            # Residual MoE
+            output_mlp = self.residual_mlp(hidden_states)
+            if isinstance(output_mlp, tuple):
+                output_mlp = output_mlp[0]  # Ignore the bias term for now
+            coef = self.coefficient(hidden_states)
+            coef = torch.nn.functional.softmax(coef, dim=-1)
+            output = output * coef[..., 0:1] + output_mlp * coef[..., 1:]
         return output, self.moe_layer.l_aux, self.moe_layer.exp_counts
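
Note: the `forward` hunk above implements the residual-MoE combination described in the DeepSpeed-MoE paper (https://arxiv.org/abs/2201.05596): a learned two-way softmax gate mixes the sparse expert output with a dense MLP branch per token. Below is a minimal, self-contained sketch of just that mixing step; the tensor sizes and the stand-in modules are illustrative assumptions, not the real InternLM `moe_layer`, `residual_mlp`, or `coefficient` objects.

    import torch

    # Illustrative sizes; in InternLM these come from the model config.
    seq_len, hidden_size = 4, 16
    hidden_states = torch.randn(seq_len, hidden_size)

    moe_output = torch.randn_like(hidden_states)              # stands in for self.moe_layer(hidden_states, ...)
    residual_mlp = torch.nn.Linear(hidden_size, hidden_size)  # stands in for self.residual_mlp
    coefficient = torch.nn.Linear(hidden_size, 2)             # stands in for self.coefficient

    output_mlp = residual_mlp(hidden_states)
    # Per-token mixing weights over the two branches; softmax makes them sum to 1.
    coef = torch.nn.functional.softmax(coefficient(hidden_states), dim=-1)
    # Convex combination of the sparse MoE branch and the dense residual branch.
    output = moe_output * coef[..., 0:1] + output_mlp * coef[..., 1:]
    assert output.shape == hidden_states.shape

Because the weights are a softmax over two logits, each token's output is a convex blend of the two branches, which lets the dense MLP act as a fallback when the router drops or misroutes tokens.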