From 73998a0bb74d8b436589249db3c50f3375dc9ec4 Mon Sep 17 00:00:00 2001
From: Wenwen Qu
Date: Fri, 11 Aug 2023 11:58:22 +0800
Subject: [PATCH] modified: internlm/model/modeling_internlm.py

---
 internlm/model/modeling_internlm.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index a8f1973..19d68d7 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -123,7 +123,6 @@ class PackedFlashBaseLayer1D(nn.Module):
         self.norm1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
         self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
 
-        # TODO: replace num_experts and epsize with function parameter
         self.num_experts = num_experts
         self.moe_gate_k = moe_gate_k
         self.moe_capacity_factor = moe_capacity_factor
@@ -582,7 +581,7 @@ def build_model_with_cfg(
     moe_noisy_gate_policy: str = None,
     moe_drop_tokens: bool = True,
     moe_use_rts: bool = True,
-    moe_use_residual: bool = True,
+    moe_use_residual: bool = False,
 ):
     """
     Builde model with config
@@ -646,7 +645,6 @@ def build_model_with_cfg(
         use_scaled_init=use_scaled_init,
         use_swiglu=use_swiglu,
         use_flash_attn=use_flash_attn,
-        sequence_parallel=sequence_parallel,
         num_experts=num_experts,
         moe_gate_k=moe_gate_k,
         moe_capacity_factor=moe_capacity_factor,
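
The behavioral change in this patch is the default of moe_use_residual flipping from True to False: callers of build_model_with_cfg that relied on the old default now get a non-residual MoE unless they opt in explicitly. Below is a minimal Python sketch of that calling pattern; the stub function and its dict return value are hypothetical stand-ins for the real build_model_with_cfg, used only to illustrate the default flip.

    def build_model_with_cfg_stub(moe_use_residual: bool = False, **kwargs):
        """Hypothetical stand-in for build_model_with_cfg after this patch."""
        return {"moe_use_residual": moe_use_residual, **kwargs}

    # After this patch, omitting the flag yields the new default (False).
    default_cfg = build_model_with_cfg_stub(num_experts=8)
    assert default_cfg["moe_use_residual"] is False

    # Callers who want the old behavior must now request it explicitly.
    residual_cfg = build_model_with_cfg_stub(moe_use_residual=True, num_experts=8)
    assert residual_cfg["moe_use_residual"] is True

Note that the third hunk also drops the sequence_parallel keyword from the layer-construction call inside build_model_with_cfg, so any downstream code passing that argument through this path should be updated accordingly.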