add param arguments

pull/375/head
zhanglei 2023-08-09 15:37:53 +08:00
parent 0905da3e7b
commit d1c7b607fa
1 changed file with 32 additions and 2 deletions

@@ -52,6 +52,14 @@ class PackedFlashBaseLayer1D(nn.Module):
norm_type (str): Use RMS norm or layernorm."rmsnorm" by default.
use_flash_attn (bool): Whether use flash-attn. True by default.
num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
""" """
def __init__( def __init__(
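For reference, a minimal sketch (not part of this commit) of how the MoE options documented above could be collected and handed to the layer. The parameter names and defaults mirror the docstring; everything else, including how the remaining constructor arguments are supplied, is an assumption.

```python
# Hypothetical illustration only: gathers the MoE options documented above
# with their stated defaults. The full PackedFlashBaseLayer1D signature is
# not shown in this diff, so the call at the end is left as a sketch.
moe_kwargs = dict(
    num_experts=8,                  # >1 switches the layer from dense to MoE
    moe_gate_k=1,                   # top-k gating; only k=1 or k=2 supported
    moe_capacity_factor=1.0,        # expert capacity at training time
    moe_eval_capacity_factor=1.0,   # expert capacity at eval time
    moe_min_capacity=4,             # hard floor on per-expert capacity
    moe_noisy_gate_policy=None,     # 'Jitter' or 'RSample' to add gate noise
    moe_drop_tokens=True,           # False behaves like infinite capacity
    moe_use_rts=True,               # Random Token Selection
    moe_use_residual=False,         # Residual MoE (arXiv:2201.05596)
)

# layer = PackedFlashBaseLayer1D(..., **moe_kwargs)  # other arguments omitted
```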
@@ -115,6 +123,14 @@ class PackedFlashBaseLayer1D(nn.Module):
# TODO: replace num_experts and epsize with function parameter
self.num_experts = num_experts
self.moe_gate_k = moe_gate_k
self.moe_capacity_factor = moe_capacity_factor
self.moe_eval_capacity_factor = moe_eval_capacity_factor
self.moe_min_capacity = moe_min_capacity
self.moe_noisy_gate_policy = moe_noisy_gate_policy
self.moe_drop_tokens = moe_drop_tokens
self.moe_use_rts = moe_use_rts
self.moe_use_residual = moe_use_residual
ep_size = gpc.get_world_size(ParallelMode.EXPERT)
if num_experts <= 1: # dense, not MoE
if use_swiglu:
@@ -311,7 +327,14 @@ class PackedFlashInternLm1D(nn.Module):
norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
use_flash_attn (bool): Whether to use flash-attn. True by default.
num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
""" """
def __init__( def __init__(
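The capacity parameters repeated in the docstring above follow the naming of DeepSpeed-style MoE gating. As a rough guide (an assumption about intent, not something taken from this diff), per-expert capacity is usually derived from the token count, the capacity factor, and the minimum capacity like so:

```python
import math

# Illustrative sketch, not from this commit: the way capacity_factor,
# eval_capacity_factor and min_capacity typically combine in
# DeepSpeed-style top-k gating, which these parameter names suggest.
def expert_capacity(num_tokens: int, num_experts: int,
                    capacity_factor: float, min_capacity: int) -> int:
    # Each expert nominally handles num_tokens / num_experts tokens;
    # capacity_factor scales that budget and min_capacity is a hard floor.
    capacity = math.ceil(num_tokens / num_experts * capacity_factor)
    return max(capacity, min_capacity)

print(expert_capacity(4096, 8, capacity_factor=1.0, min_capacity=4))  # 512 at training time
print(expert_capacity(4096, 8, capacity_factor=2.0, min_capacity=4))  # 1024 with a looser eval factor
```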
@@ -587,7 +610,14 @@ def build_model_with_cfg(
use_swiglu (bool): Whether to use swiglu. True by default.
use_flash_attn (bool): Whether to use flash-attn. True by default.
num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
""" """
cfg = dict( cfg = dict(
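To round the section off, a hedged sketch of a configuration fragment that would exercise the new keyword arguments once it reaches build_model_with_cfg. Only the moe_* keys and their documented defaults come from this diff; every other field, and the assumption that the function is fed such a keyword dict directly, is illustrative.

```python
# Hypothetical config fragment: only the moe_* keys below are documented in
# this commit; the remaining choices and the exact way the dict reaches
# build_model_with_cfg are assumptions for illustration.
model_cfg = dict(
    num_experts=4,                    # enable MoE (values <= 1 stay dense)
    moe_gate_k=2,                     # top-2 gating
    moe_capacity_factor=1.25,         # extra headroom while training
    moe_eval_capacity_factor=2.0,     # looser capacity at eval time
    moe_min_capacity=4,
    moe_noisy_gate_policy="RSample",  # or 'Jitter', or None to disable noise
    moe_drop_tokens=True,             # drop overflow tokens past capacity
    moe_use_rts=True,                 # Random Token Selection
    moe_use_residual=False,           # set True for Residual MoE
)

# model = build_model_with_cfg(**model_cfg)  # sketch; other required args omitted
```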