update moe config to fit training on 8 GPUs

pull/548/head
Qu Wenwen 2023-12-18 14:02:33 +08:00
parent c801336732
commit 35778efff3
1 changed file with 1 addition and 1 deletion

View File

@ -141,7 +141,7 @@ model = dict(
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
num_experts=8,
num_experts=4,
moe_use_residual=False,
moe_gate_k=2,
)