mirror of https://github.com/InternLM/InternLM

add param arguments

parent 0905da3e7b
commit d1c7b607fa
@@ -52,6 +52,14 @@ class PackedFlashBaseLayer1D(nn.Module):
         norm_type (str): Use RMS norm or layernorm."rmsnorm" by default.
         use_flash_attn (bool): Whether use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     def __init__(
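The capacity arguments documented above mirror the usual DeepSpeed-style MoE convention: each expert gets a token budget derived from the capacity factor and clamped from below by the minimum capacity. A minimal illustrative sketch of that relationship (not code from this repository):

    import math

    def expert_capacity(num_tokens: int, num_experts: int,
                        capacity_factor: float = 1.0, min_capacity: int = 4) -> int:
        # Per-expert token budget: a "fair share" of the batch scaled by the
        # capacity factor, never smaller than moe_min_capacity.
        capacity = math.ceil(num_tokens / num_experts * capacity_factor)
        return max(capacity, min_capacity)

    # e.g. 4096 tokens routed over 8 experts with capacity_factor=1.0 -> 512 slots each
    print(expert_capacity(4096, 8))

At evaluation time the same kind of budget is computed with moe_eval_capacity_factor in place of moe_capacity_factor.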
@@ -115,6 +123,14 @@ class PackedFlashBaseLayer1D(nn.Module):

        # TODO: replace num_experts and epsize with function parameter
        self.num_experts = num_experts
+        self.moe_gate_k = moe_gate_k
+        self.moe_capacity_factor = moe_capacity_factor
+        self.moe_eval_capacity_factor = moe_eval_capacity_factor
+        self.moe_min_capacity = moe_min_capacity
+        self.moe_noisy_gate_policy = moe_noisy_gate_policy
+        self.moe_drop_tokens = moe_drop_tokens
+        self.moe_use_rts = moe_use_rts
+        self.moe_use_residual = moe_use_residual
        ep_size = gpc.get_world_size(ParallelMode.EXPERT)
        if num_experts <= 1:  # dense, not MoE
            if use_swiglu:
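These attributes feed the gate of the MoE layer that replaces the dense MLP whenever num_experts > 1. As a rough, self-contained sketch of what moe_gate_k and moe_noisy_gate_policy control (simplified; the repository's gate also handles capacity limits, token dropping and expert parallelism):

    from typing import Optional

    import torch
    import torch.nn.functional as F

    def topk_gate(logits: torch.Tensor, k: int = 1,
                  noisy_gate_policy: Optional[str] = None):
        # logits: (num_tokens, num_experts) raw router scores
        if noisy_gate_policy == "Jitter":
            # the usual 'Jitter' variant multiplies the gate *input* by uniform
            # noise; perturbing the logits directly keeps this sketch short
            logits = logits * torch.empty_like(logits).uniform_(0.98, 1.02)
        elif noisy_gate_policy == "RSample":
            # resample-style additive noise on the logits
            logits = logits + torch.randn_like(logits)
        probs = F.softmax(logits, dim=-1)
        gate_weights, expert_ids = probs.topk(k, dim=-1)   # (num_tokens, k)
        gate_weights = gate_weights / gate_weights.sum(-1, keepdim=True)
        return expert_ids, gate_weights

    # route 6 tokens over 4 experts with top-2 gating
    ids, weights = topk_gate(torch.randn(6, 4), k=2)

With moe_drop_tokens=True, tokens routed to an expert whose capacity budget is already full are dropped rather than overflowing it, and moe_use_rts picks the surviving tokens at random instead of by position.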
@@ -311,7 +327,14 @@ class PackedFlashInternLm1D(nn.Module):
         norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
         use_flash_attn (bool): Whether to use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     def __init__(
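moe_use_residual refers to the Residual MoE (PR-MoE) design from the linked DeepSpeed-MoE paper: the sparse expert output is treated as a correction on top of an always-on dense MLP, mixed by a learned per-token coefficient. A self-contained sketch of the idea (hypothetical class, not InternLM's module):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ResidualMoESketch(nn.Module):
        # Mix a dense MLP branch and an expert-routed branch with a learned
        # two-way, per-token coefficient (https://arxiv.org/abs/2201.05596).
        def __init__(self, hidden_size: int, moe_branch: nn.Module, dense_branch: nn.Module):
            super().__init__()
            self.moe = moe_branch
            self.mlp = dense_branch
            self.coef = nn.Linear(hidden_size, 2)

        def forward(self, x):
            c = F.softmax(self.coef(x), dim=-1)
            return self.moe(x) * c[..., 0:1] + self.mlp(x) * c[..., 1:2]

    # shape check with identity placeholders standing in for the two branches
    block = ResidualMoESketch(8, moe_branch=nn.Identity(), dense_branch=nn.Identity())
    print(block(torch.randn(3, 8)).shape)   # torch.Size([3, 8])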
@@ -587,7 +610,14 @@ def build_model_with_cfg(
         use_swiglu (bool): Whether to use swiglu. True by default.
         use_flash_attn (bool): Whether to use flash-attn. True by default.
         num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
+        moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
+        moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
+        moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
+        moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
+        moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
+        moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
+        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
+        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
     """

     cfg = dict(
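Assuming these keyword arguments are surfaced through the usual model = dict(...) block of a training config (the moe_* key names come from the docstring above; everything else here is hypothetical), enabling an 8-expert MoE might look like:

    # hypothetical config excerpt -- only the MoE keys are taken from this commit
    model = dict(
        num_experts=8,               # >1 switches the FFN blocks from dense to MoE
        moe_gate_k=2,                # top-2 gating
        moe_capacity_factor=1.0,
        moe_eval_capacity_factor=1.0,
        moe_min_capacity=4,
        moe_noisy_gate_policy="RSample",
        moe_drop_tokens=True,
        moe_use_rts=True,
        moe_use_residual=False,
    )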