add param arguments

pull/375/head
zhanglei 2023-08-09 15:37:53 +08:00
parent 0905da3e7b
commit d1c7b607fa
1 changed file with 32 additions and 2 deletions

@@ -52,6 +52,14 @@ class PackedFlashBaseLayer1D(nn.Module):
norm_type (str): Use RMS norm or layernorm."rmsnorm" by default.
use_flash_attn (bool): Whether use flash-attn. True by default.
num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
""" """
def __init__( def __init__(
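For reference, a minimal sketch (not part of this commit) of how the MoE options documented above could be collected and handed to the layer. The parameter names and defaults mirror the docstring; everything else, including how the remaining constructor arguments are supplied, is an assumption.

```python
# Hypothetical illustration only: gathers the MoE options documented above
# with their stated defaults. The full PackedFlashBaseLayer1D signature is
# not shown in this diff, so the call at the end is left as a sketch.
moe_kwargs = dict(
    num_experts=8,                  # >1 switches the layer from dense to MoE
    moe_gate_k=1,                   # top-k gating; only k=1 or k=2 supported
    moe_capacity_factor=1.0,        # expert capacity at training time
    moe_eval_capacity_factor=1.0,   # expert capacity at eval time
    moe_min_capacity=4,             # hard floor on per-expert capacity
    moe_noisy_gate_policy=None,     # 'Jitter' or 'RSample' to add gate noise
    moe_drop_tokens=True,           # False behaves like infinite capacity
    moe_use_rts=True,               # Random Token Selection
    moe_use_residual=False,         # Residual MoE (arXiv:2201.05596)
)

# layer = PackedFlashBaseLayer1D(..., **moe_kwargs)  # other arguments omitted
```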
@@ -115,6 +123,14 @@ class PackedFlashBaseLayer1D(nn.Module):
# TODO: replace num_experts and epsize with function parameter
self.num_experts = num_experts
self.moe_gate_k = moe_gate_k
self.moe_capacity_factor = moe_capacity_factor
self.moe_eval_capacity_factor = moe_eval_capacity_factor
self.moe_min_capacity = moe_min_capacity
self.moe_noisy_gate_policy = moe_noisy_gate_policy
self.moe_drop_tokens = moe_drop_tokens
self.moe_use_rts = moe_use_rts
self.moe_use_residual = moe_use_residual
ep_size = gpc.get_world_size(ParallelMode.EXPERT)
if num_experts <= 1: # dense, not MoE
if use_swiglu:
@@ -311,7 +327,14 @@ class PackedFlashInternLm1D(nn.Module):
norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default.
use_flash_attn (bool): Whether to use flash-attn. True by default.
num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
""" """
def __init__( def __init__(
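The capacity parameters repeated in the docstring above follow the naming of DeepSpeed-style MoE gating. As a rough guide (an assumption about intent, not something taken from this diff), per-expert capacity is usually derived from the token count, the capacity factor, and the minimum capacity like so:

```python
import math

# Illustrative sketch, not from this commit: the way capacity_factor,
# eval_capacity_factor and min_capacity typically combine in
# DeepSpeed-style top-k gating, which these parameter names suggest.
def expert_capacity(num_tokens: int, num_experts: int,
                    capacity_factor: float, min_capacity: int) -> int:
    # Each expert nominally handles num_tokens / num_experts tokens;
    # capacity_factor scales that budget and min_capacity is a hard floor.
    capacity = math.ceil(num_tokens / num_experts * capacity_factor)
    return max(capacity, min_capacity)

print(expert_capacity(4096, 8, capacity_factor=1.0, min_capacity=4))  # 512 at training time
print(expert_capacity(4096, 8, capacity_factor=2.0, min_capacity=4))  # 1024 with a looser eval factor
```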
@@ -587,7 +610,14 @@ def build_model_with_cfg(
use_swiglu (bool): Whether to use swiglu. True by default.
use_flash_attn (bool): Whether to use flash-attn. True by default.
num_experts (int): The number of experts. <=1 means dense, >1 means MoE. 1 by default.
moe_gate_k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
moe_capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
moe_eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
moe_min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
moe_noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample'.
moe_drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
""" """
cfg = dict( cfg = dict(
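To round the section off, a hedged sketch of a configuration fragment that would exercise the new keyword arguments once it reaches build_model_with_cfg. Only the moe_* keys and their documented defaults come from this diff; every other field, and the assumption that the function is fed such a keyword dict directly, is illustrative.

```python
# Hypothetical config fragment: only the moe_* keys below are documented in
# this commit; the remaining choices and the exact way the dict reaches
# build_model_with_cfg are assumptions for illustration.
model_cfg = dict(
    num_experts=4,                    # enable MoE (values <= 1 stay dense)
    moe_gate_k=2,                     # top-2 gating
    moe_capacity_factor=1.25,         # extra headroom while training
    moe_eval_capacity_factor=2.0,     # looser capacity at eval time
    moe_min_capacity=4,
    moe_noisy_gate_policy="RSample",  # or 'Jitter', or None to disable noise
    moe_drop_tokens=True,             # drop overflow tokens past capacity
    moe_use_rts=True,                 # Random Token Selection
    moe_use_residual=False,           # set True for Residual MoE
)

# model = build_model_with_cfg(**model_cfg)  # sketch; other required args omitted
```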