From d1c7b607fa7ec219434a265b1e1da756392381df Mon Sep 17 00:00:00 2001
From: zhanglei
Date: Wed, 9 Aug 2023 15:37:53 +0800
Subject: [PATCH] add param arguments

---
 internlm/model/modeling_internlm.py | 34 +++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index ac2733e..09a10c8 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -52,6 +52,14 @@ class PackedFlashBaseLayer1D(nn.Module):
         norm_type (str): Use RMS norm or layernorm."rmsnorm" by default.
         use_flash_attn (bool): Whether use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     def __init__(
@@ -115,6 +123,14 @@ class PackedFlashBaseLayer1D(nn.Module):

         # TODO: replace num_experts and epsize with function parameter
         self.num_experts = num_experts
+        self.moe_gate_k = moe_gate_k
+        self.moe_capacity_factor = moe_capacity_factor
+        self.moe_eval_capacity_factor = moe_eval_capacity_factor
+        self.moe_min_capacity = moe_min_capacity
+        self.moe_noisy_gate_policy = moe_noisy_gate_policy
+        self.moe_drop_tokens = moe_drop_tokens
+        self.moe_use_rts = moe_use_rts
+        self.moe_use_residual = moe_use_residual
         ep_size = gpc.get_world_size(ParallelMode.EXPERT)
         if num_experts <= 1:  # dense, not MoE
             if use_swiglu:
@@ -311,7 +327,14 @@ class PackedFlashInternLm1D(nn.Module):
         norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
         use_flash_attn (bool): Whether to use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
-
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     def __init__(
@@ -587,7 +610,14 @@ def build_model_with_cfg(
         use_swiglu (bool): Whether to use swiglu. True by default.
         use_flash_attn (bool): Whether to use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
-
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     cfg = dict(
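
For reviewers unfamiliar with these knobs, below is a minimal, self-contained sketch of how moe_gate_k, moe_capacity_factor, moe_min_capacity and moe_drop_tokens typically interact in a top-k gate. It is an illustrative approximation, not the actual InternLM/DeepSpeed gate; the function name topk_gate and the greedy first-come-first-served assignment are assumptions made for clarity.

import torch

def topk_gate(logits, k=1, capacity_factor=1.0, min_capacity=4, drop_tokens=True):
    # logits: (num_tokens, num_experts) raw router scores for each token.
    num_tokens, num_experts = logits.shape
    gates = torch.softmax(logits, dim=-1)
    topk_vals, topk_idx = torch.topk(gates, k, dim=-1)  # each token's k best experts
    # Per-expert capacity: on average num_tokens * k / num_experts assignments,
    # scaled by capacity_factor and floored at min_capacity.
    capacity = max(min_capacity, int(capacity_factor * num_tokens * k / num_experts))
    if not drop_tokens:
        capacity = num_tokens * k  # drop_tokens=False behaves like infinite capacity
    keep = torch.zeros(num_tokens, k, dtype=torch.bool)
    load = torch.zeros(num_experts, dtype=torch.long)
    for t in range(num_tokens):  # greedy assignment; overflowing tokens are dropped
        for j in range(k):
            e = int(topk_idx[t, j])
            if load[e] < capacity:
                load[e] += 1
                keep[t, j] = True
    # Zero out the gate weight of dropped (token, expert) pairs.
    return topk_idx, topk_vals * keep, capacity

# Example: 16 tokens routed over 4 experts with top-2 gating.
idx, weights, cap = topk_gate(torch.randn(16, 4), k=2, capacity_factor=1.0)

The remaining flags adjust this picture: moe_noisy_gate_policy injects noise into the routing decision during training, moe_use_rts chooses the surviving tokens randomly rather than in arrival order when an expert overflows, and moe_use_residual additionally mixes the MoE output with a dense branch as described in the linked paper (https://arxiv.org/abs/2201.05596).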