reformat code

pull/375/head
Wenwen Qu 2023-08-09 16:03:47 +08:00
parent 4a5cf5d1df
commit 5f2e082b21
2 changed files with 28 additions and 18 deletions

View File

@@ -57,9 +57,11 @@ class PackedFlashBaseLayer1D(nn.Module):
         moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
         moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
         moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
-        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to
+            infinite capacity).
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
-        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
+            (https://arxiv.org/abs/2201.05596) layer.
     """
 
     def __init__(
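
For context on the knobs documented above: moe_drop_tokens, moe_min_capacity, and the capacity factors interact through the usual GShard/DeepSpeed-style per-expert capacity. A minimal sketch of that computation (illustrative names, not this repository's actual implementation):

import math

def expert_capacity(num_tokens: int, num_experts: int, capacity_factor: float,
                    min_capacity: int, drop_tokens: bool) -> int:
    # Illustrative GShard-style budget: tokens routed beyond this per-expert
    # slot count are dropped when drop_tokens=True.
    if not drop_tokens:
        return num_tokens  # drop_tokens=False behaves like infinite capacity
    capacity = math.ceil(num_tokens / num_experts * capacity_factor)
    return max(capacity, min_capacity)

# e.g. 4096 tokens routed across 8 experts at capacity_factor=1.0 -> 512 slots each
assert expert_capacity(4096, 8, 1.0, 4, True) == 512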
@@ -332,9 +334,11 @@ class PackedFlashInternLm1D(nn.Module):
         moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
         moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
         moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
-        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent
+            to infinite capacity).
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
-        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
+            (https://arxiv.org/abs/2201.05596) layer.
     """
 
     def __init__(
@@ -517,7 +521,7 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"),
     all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
     parts = all_parts[pipeline_rank]
     if gpc.is_rank_for_log():
-        logger.info(f"The layer sharding is {all_parts}.")
+        logger.info(f"The layer sharding is {all_parts}.")  # pylint: disable=W1203
 
     models = []
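
A note on the # pylint: disable=W1203 suppressions introduced in this commit: W1203 (logging-fstring-interpolation) flags f-strings in logging calls because the string is formatted eagerly even when the log level filters the record out. The lazy alternative, shown for comparison (a generic example, not code from this diff):

import logging

logger = logging.getLogger(__name__)
all_parts = [[0, 16], [16, 32]]

# Eager: the f-string is built even if INFO is disabled; pylint emits W1203.
logger.info(f"The layer sharding is {all_parts}.")  # pylint: disable=W1203

# Lazy: %-formatting runs only if the record is actually emitted.
logger.info("The layer sharding is %s.", all_parts)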
@@ -578,7 +582,7 @@ def build_model_with_cfg(
     moe_noisy_gate_policy: str = None,
     moe_drop_tokens: bool = True,
     moe_use_rts: bool = True,
-    moe_use_residual: bool = False,
+    moe_use_residual: bool = True,
 ):
     """
     Build model with config
@@ -615,9 +619,11 @@ def build_model_with_cfg(
         moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
         moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
         moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
-        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent
+            to infinite capacity).
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
-        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
+            (https://arxiv.org/abs/2201.05596) layer.
     """
 
     cfg = dict(
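
On moe_noisy_gate_policy: 'Jitter' and 'RSample' follow the DeepSpeed gating options, where jitter multiplies the gate input by uniform noise during training and RSample perturbs the routing logits before sampling. A hedged sketch of the jitter variant only (assumed behavior; the epsilon value is illustrative):

import torch

def jitter(x: torch.Tensor, epsilon: float = 1e-2) -> torch.Tensor:
    # Multiplicative jitter on the gate input, typically applied in training only.
    noise = torch.empty_like(x).uniform_(1.0 - epsilon, 1.0 + epsilon)
    return x * noise

gate_input = torch.randn(8, 16)   # (tokens, hidden) stand-in
assert jitter(gate_input).shape == gate_input.shape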

View File

@@ -50,9 +50,13 @@ class MoE(torch.nn.Module):
         min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
         noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'
             or 'None'.
         using_default_moe (bool, optional): default=True, whether to use the default MoE layer.
+        drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to
+            infinite capacity).
         use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
+            (https://arxiv.org/abs/2201.05596) layer.
         residual_mlp (torch.nn.Module, optional): default=None, the torch module that defines the residual MLP.
     """
 
     def __init__(
@@ -70,7 +74,7 @@ class MoE(torch.nn.Module):
         use_rts: bool = True,
         using_default_moe: bool = True,
         use_residual=True,
-        residual_mlp=None
+        residual_mlp=None,
     ):
         super().__init__()
@@ -82,7 +86,7 @@ class MoE(torch.nn.Module):
         self.num_experts = num_experts
         self.num_local_experts = num_experts // self.ep_size
-        logger.info(
+        logger.info(  # pylint: disable=W1203
             f"Creating MoE layer with num_experts: {num_experts} | num_local_experts:"
             f"{self.num_local_experts} | expert_parallel_size: {self.ep_size}"
         )
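
The num_local_experts = num_experts // self.ep_size line above is plain expert parallelism: the experts are split evenly across the ep_size expert-parallel ranks, which is exactly what the reformatted log line reports. A toy check (values are illustrative):

num_experts = 32   # total experts in the layer
ep_size = 4        # expert-parallel world size
num_local_experts = num_experts // ep_size

assert num_local_experts == 8                        # each rank hosts 8 experts
assert num_local_experts * ep_size == num_experts    # assumes num_experts % ep_size == 0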
@@ -138,7 +142,7 @@ class MoE(torch.nn.Module):
         if self.use_residual:
             # Residual MoE
             output_mlp = self.residual_mlp(hidden_states)
-            if type(output_mlp) is tuple:
+            if isinstance(output_mlp, tuple):
                 output_mlp = output_mlp[0]  # Ignore the bias term for now
             coef = self.coefficient(hidden_states)
             coef = torch.nn.functional.softmax(coef, dim=-1)
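
Two notes on this last hunk. isinstance(output_mlp, tuple) is the idiomatic type check (pylint flags type(...) is tuple as C0123, unidiomatic-typecheck) and, unlike the identity comparison, also accepts tuple subclasses such as named tuples. The surrounding lines are the Residual MoE combination from the paper linked in the docstrings: a dense MLP branch and the expert branch are blended with a learned per-token coefficient. A self-contained sketch of that blend (shapes and names are illustrative, not the repository's actual classes):

import torch

tokens, hidden = 8, 16
torch.manual_seed(0)

hidden_states = torch.randn(tokens, hidden)
moe_output = torch.randn(tokens, hidden)         # stand-in for the expert branch
residual_mlp = torch.nn.Linear(hidden, hidden)   # dense branch shared by all tokens
coefficient = torch.nn.Linear(hidden, 2)         # learned per-token mixing weights

output_mlp = residual_mlp(hidden_states)
if isinstance(output_mlp, tuple):                # mirrors the guard in the diff
    output_mlp = output_mlp[0]
coef = torch.nn.functional.softmax(coefficient(hidden_states), dim=-1)

# Per-token convex combination of the expert output and the dense MLP output.
output = moe_output * coef[..., 0:1] + output_mlp * coef[..., 1:2]
assert output.shape == (tokens, hidden)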