From d1c7b607fa7ec219434a265b1e1da756392381df Mon Sep 17 00:00:00 2001
From: zhanglei
Date: Wed, 9 Aug 2023 15:37:53 +0800
Subject: [PATCH] add param arguments

---
 internlm/model/modeling_internlm.py | 34 +++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index ac2733e..09a10c8 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -52,6 +52,14 @@ class PackedFlashBaseLayer1D(nn.Module):
         norm_type (str): Use RMS norm or layernorm."rmsnorm" by default.
         use_flash_attn (bool): Whether use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     def __init__(
@@ -115,6 +123,14 @@ class PackedFlashBaseLayer1D(nn.Module):

         # TODO: replace num_experts and epsize with function parameter
         self.num_experts = num_experts
+        self.moe_gate_k = moe_gate_k
+        self.moe_capacity_factor = moe_capacity_factor
+        self.moe_eval_capacity_factor = moe_eval_capacity_factor
+        self.moe_min_capacity = moe_min_capacity
+        self.moe_noisy_gate_policy = moe_noisy_gate_policy
+        self.moe_drop_tokens = moe_drop_tokens
+        self.moe_use_rts = moe_use_rts
+        self.moe_use_residual = moe_use_residual
         ep_size = gpc.get_world_size(ParallelMode.EXPERT)
         if num_experts <= 1:  # dense, not MoE
             if use_swiglu:
@@ -311,7 +327,14 @@ class PackedFlashInternLm1D(nn.Module):
         norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
         use_flash_attn (bool): Whether to use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
-
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     def __init__(
@@ -587,7 +610,14 @@ def build_model_with_cfg(
         use_swiglu (bool): Whether to use swiglu. True by default.
         use_flash_attn (bool): Whether to use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
-
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     cfg = dict(
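
For reviewers unfamiliar with these knobs, below is a minimal, self-contained sketch of how moe_gate_k, moe_capacity_factor, moe_min_capacity and moe_drop_tokens typically interact in a top-k gate. It is an illustrative approximation, not the actual InternLM/DeepSpeed gate; the function name topk_gate and the greedy first-come-first-served assignment are assumptions made for clarity.

import torch

def topk_gate(logits, k=1, capacity_factor=1.0, min_capacity=4, drop_tokens=True):
    # logits: (num_tokens, num_experts) raw router scores for each token.
    num_tokens, num_experts = logits.shape
    gates = torch.softmax(logits, dim=-1)
    topk_vals, topk_idx = torch.topk(gates, k, dim=-1)  # each token's k best experts
    # Per-expert capacity: on average num_tokens * k / num_experts assignments,
    # scaled by capacity_factor and floored at min_capacity.
    capacity = max(min_capacity, int(capacity_factor * num_tokens * k / num_experts))
    if not drop_tokens:
        capacity = num_tokens * k  # drop_tokens=False behaves like infinite capacity
    keep = torch.zeros(num_tokens, k, dtype=torch.bool)
    load = torch.zeros(num_experts, dtype=torch.long)
    for t in range(num_tokens):  # greedy assignment; overflowing tokens are dropped
        for j in range(k):
            e = int(topk_idx[t, j])
            if load[e] < capacity:
                load[e] += 1
                keep[t, j] = True
    # Zero out the gate weight of dropped (token, expert) pairs.
    return topk_idx, topk_vals * keep, capacity

# Example: 16 tokens routed over 4 experts with top-2 gating.
idx, weights, cap = topk_gate(torch.randn(16, 4), k=2, capacity_factor=1.0)

The remaining flags adjust this picture: moe_noisy_gate_policy injects noise into the routing decision during training, moe_use_rts chooses the surviving tokens randomly rather than in arrival order when an expert overflows, and moe_use_residual additionally mixes the MoE output with a dense branch as described in the linked paper (https://arxiv.org/abs/2201.05596).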