mirror of https://github.com/InternLM/InternLM
add residual and other moe features
parent bc699ad46f
commit cdf3ed9533
@@ -73,6 +73,14 @@ class PackedFlashBaseLayer1D(nn.Module):
         use_swiglu: bool = True,
         use_flash_attn: bool = True,
         num_experts: int = 1,
+        moe_gate_k: int = 1,
+        moe_capacity_factor: float = 1.0,
+        moe_eval_capacity_factor: float = 1.0,
+        moe_min_capacity: int = 4,
+        moe_noisy_gate_policy: str = None,
+        moe_drop_tokens: bool = True,
+        moe_use_rts: bool = True,
+        moe_use_residual: bool = False,
     ):
         super().__init__()
         self.checkpoint = checkpoint
@@ -135,7 +143,7 @@ class PackedFlashBaseLayer1D(nn.Module):
                 dtype=dtype,
             )
         else:
-            expert = torch.nn.ModuleList(
+            experts = torch.nn.ModuleList(
                 [
                     FeedForward(
                         hidden_size,
@@ -149,9 +157,34 @@ class PackedFlashBaseLayer1D(nn.Module):
                     for i in range(num_experts // ep_size)
                 ]
             )
-            # TODO: test moe for now, need more parameter such as: capacity_factor,
-            # eval_capacity_factor, min_capacity, drop_tokens
-            self.mlp = MoE(hidden_size=hidden_size, expert=expert, ep_size=ep_size, num_experts=num_experts, k=1)
+            if moe_use_residual:
+                residual_mlp = FeedForward(
+                    hidden_size,
+                    int(hidden_size * gpc.config.model.mlp_ratio),
+                    out_features=hidden_size,
+                    process_group=gpc.get_group(ParallelMode.TENSOR),
+                    bias=False,
+                    device=torch.device("cuda"),
+                    dtype=torch.float,
+                )
+
+            self.mlp = MoE(
+                hidden_size=hidden_size,
+                experts=experts,
+                num_experts=num_experts,
+                ep_size=ep_size,
+                k=moe_gate_k,
+                capacity_factor=moe_capacity_factor,
+                eval_capacity_factor=moe_eval_capacity_factor,
+                min_capacity=moe_min_capacity,
+                noisy_gate_policy=moe_noisy_gate_policy,
+                drop_tokens=moe_drop_tokens,
+                use_rts=moe_use_rts,
+                use_residual=moe_use_residual,
+                residual_mlp=residual_mlp if moe_use_residual else None,
+            )
+
         self.dropout2 = nn.Dropout(drop_rate)
         self.use_swiglu = use_swiglu
         self.use_scaled_init = use_scaled_init
@@ -309,6 +342,14 @@ class PackedFlashInternLm1D(nn.Module):
         use_swiglu: bool = True,
         use_flash_attn: bool = True,
         num_experts: bool = 1,
+        moe_gate_k: int = 1,
+        moe_capacity_factor: float = 1.0,
+        moe_eval_capacity_factor: float = 1.0,
+        moe_min_capacity: int = 4,
+        moe_noisy_gate_policy: str = None,
+        moe_drop_tokens: bool = True,
+        moe_use_rts: bool = True,
+        moe_use_residual: bool = False,
     ):
         super().__init__()

@@ -361,6 +402,14 @@ class PackedFlashInternLm1D(nn.Module):
                     use_swiglu=use_swiglu,
                     use_flash_attn=use_flash_attn,
                     num_experts=num_experts,
+                    moe_gate_k=moe_gate_k,
+                    moe_capacity_factor=moe_capacity_factor,
+                    moe_eval_capacity_factor=moe_eval_capacity_factor,
+                    moe_min_capacity=moe_min_capacity,
+                    moe_noisy_gate_policy=moe_noisy_gate_policy,
+                    moe_drop_tokens=moe_drop_tokens,
+                    moe_use_rts=moe_use_rts,
+                    moe_use_residual=moe_use_residual,
                 )
                 for lid in range(num_layers)
             ]
@@ -499,6 +548,14 @@ def build_model_with_cfg(
    use_flash_attn: bool = True,
    sequence_parallel: bool = False,  # pylint: disable=W0613
    num_experts: int = 1,
+    moe_gate_k: int = 1,
+    moe_capacity_factor: float = 1.0,
+    moe_eval_capacity_factor: float = 1.0,
+    moe_min_capacity: int = 4,
+    moe_noisy_gate_policy: str = None,
+    moe_drop_tokens: bool = True,
+    moe_use_rts: bool = True,
+    moe_use_residual: bool = False,
 ):
     """
     Builde model with config
@@ -555,6 +612,14 @@ def build_model_with_cfg(
         use_flash_attn=use_flash_attn,
         sequence_parallel=sequence_parallel,
         num_experts=num_experts,
+        moe_gate_k=moe_gate_k,
+        moe_capacity_factor=moe_capacity_factor,
+        moe_eval_capacity_factor=moe_eval_capacity_factor,
+        moe_min_capacity=moe_min_capacity,
+        moe_noisy_gate_policy=moe_noisy_gate_policy,
+        moe_drop_tokens=moe_drop_tokens,
+        moe_use_rts=moe_use_rts,
+        moe_use_residual=moe_use_residual,
     )

     return _build_generic_model_1d(num_layers=num_layers, num_chunks=num_chunks, **cfg)
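For reference, the new build_model_with_cfg keyword arguments map one-to-one onto fields of the model config. A minimal sketch of how they might be set, assuming the usual dict-style model config; the values below are illustrative and not part of this commit:

# sketch: possible model-config fields for the new MoE options (values illustrative)
model = dict(
    num_experts=8,
    moe_gate_k=2,                   # top-k used by the gate
    moe_capacity_factor=1.0,
    moe_eval_capacity_factor=1.0,
    moe_min_capacity=4,
    moe_noisy_gate_policy=None,
    moe_drop_tokens=True,
    moe_use_rts=True,
    moe_use_residual=True,          # enable the dense residual branch added in this commit
)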
@@ -58,7 +58,7 @@ class MoE(torch.nn.Module):
     def __init__(
         self,
         hidden_size,
-        expert,
+        experts,
         num_experts=1,
         ep_size=1,
         k=1,
@@ -69,6 +69,8 @@ class MoE(torch.nn.Module):
         drop_tokens: bool = True,
         use_rts: bool = True,
         using_default_moe: bool = True,
+        use_residual=True,
+        residual_mlp=None
     ):

         super().__init__()
@@ -89,7 +91,7 @@ class MoE(torch.nn.Module):
                 "Unsupported noisy_gate_policy: " + noisy_gate_policy
             )

-        experts = Experts(expert, self.num_local_experts)
+        experts = Experts(experts, self.num_local_experts)

         if using_default_moe:
             self.moe_layer = MOELayer(
@@ -110,6 +112,12 @@ class MoE(torch.nn.Module):
                 self.num_local_experts,
             )

+        self.use_residual = use_residual
+        if use_residual:
+            self.residual_mlp = residual_mlp
+            # coefficient is used for weighted sum of the output of expert and mlp
+            self.coefficient = torch.nn.Linear(hidden_size, 2)
+
     def forward(self, hidden_states, used_token=None):
         """MoE forward

@@ -127,5 +135,12 @@ class MoE(torch.nn.Module):
             * exp_counts (int): expert count
         """
         output = self.moe_layer(hidden_states, used_token)
+        if self.use_residual:
+            # Residual MoE
+            output_mlp = self.residual_mlp(hidden_states)
+            if type(output_mlp) is tuple:
+                output_mlp = output_mlp[0]  # Ignore the bias term for now
+            coef = self.coefficient(hidden_states)
+            coef = torch.nn.functional.softmax(coef, dim=-1)
+            output = output * coef[..., 0:1] + output_mlp * coef[..., 1:]
         return output, self.moe_layer.l_aux, self.moe_layer.exp_counts
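The forward-path change above blends the expert output with the dense residual MLP output using the learned 2-way coefficient. A minimal standalone sketch of just that mixing step, with random tensors standing in for the MoE-layer and residual-MLP outputs (all names below are illustrative, not from the commit):

import torch

# Learned 2-way coefficient: softmax over the last dim gives per-token
# weights for the expert output vs. the dense residual MLP output.
hidden_size = 16
coefficient = torch.nn.Linear(hidden_size, 2)

hidden_states = torch.randn(4, hidden_size)   # (tokens, hidden)
expert_out = torch.randn(4, hidden_size)      # stands in for self.moe_layer(hidden_states, ...)
mlp_out = torch.randn(4, hidden_size)         # stands in for self.residual_mlp(hidden_states)

coef = torch.nn.functional.softmax(coefficient(hidden_states), dim=-1)   # (tokens, 2)
output = expert_out * coef[..., 0:1] + mlp_out * coef[..., 1:]           # weighted sum
print(output.shape)  # torch.Size([4, 16])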