mirror of https://github.com/InternLM/InternLM
fix(moe): fix moe zero mode bug (#548)
* fix moe zero mode bugs
* update moe config to fit training on 8 GPUs
parent bbb5651582
commit 513ebb9c3a
@@ -141,7 +141,7 @@ model = dict(
     layer_norm_epsilon=1e-5,
     use_flash_attn=True,
     num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-    num_experts=8,
+    num_experts=4,
     moe_use_residual=False,
     moe_gate_k=2,
 )
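The config hunk above halves the expert count so the MoE setup fits an 8-GPU run. As a reminder of what `num_experts` and `moe_gate_k` control, below is a toy top-k routing sketch; it is not InternLM's gate implementation, and the tensor shapes and names are illustrative only.

import torch

# Toy illustration of num_experts=4 with top-2 gating (moe_gate_k=2):
# every token gets routing scores over the 4 experts and is dispatched
# to its 2 highest-scoring ones. Not InternLM's gating code.
num_experts, top_k, hidden = 4, 2, 8
tokens = torch.randn(5, hidden)                  # 5 example tokens
gate = torch.nn.Linear(hidden, num_experts)      # per-token expert scores

scores = torch.softmax(gate(tokens), dim=-1)     # (5, 4) routing probabilities
topk_scores, topk_experts = scores.topk(top_k, dim=-1)
print(topk_experts)  # for each token, the indices of its 2 chosen experts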
@@ -150,7 +150,7 @@ class HybridZeroOptimizer(BaseOptimizer):
         # if zero is used, expert dp group will use ParallelMode.EXPERT_DATA as the real zero mode
         zero_mode = (
             ParallelMode.ZERO1
-            if param_group["dp_mode"] == gpc.get_world_size(ParallelMode.ZERO1) == 1 or ParallelMode.DATA
+            if gpc.get_world_size(ParallelMode.ZERO1) == 1 or param_group["dp_mode"] == ParallelMode.DATA
             else ParallelMode.EXPERT_DATA
         )
         self._zero_local_rank.append(gpc.get_local_rank(zero_mode))
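The removed condition was always true: `a == b == 1` is a chained comparison, and or-ing it with the enum member `ParallelMode.DATA` (which is truthy) means the expression could never be false, so expert parameter groups were wrongly assigned the ZERO1 mode instead of EXPERT_DATA. A minimal standalone sketch of the difference, using stand-in values rather than the real `gpc` state:

from enum import Enum

class ParallelMode(Enum):       # stand-in for the real ParallelMode enum
    DATA = "data"
    ZERO1 = "zero1"
    EXPERT_DATA = "expert_data"

zero1_world_size = 4                  # pretend gpc.get_world_size(ParallelMode.ZERO1)
dp_mode = ParallelMode.EXPERT_DATA    # pretend param_group["dp_mode"] of an expert group

# Old condition: the chained comparison is False, but `or ParallelMode.DATA`
# makes the whole expression truthy, so ZERO1 was always picked.
old = dp_mode == zero1_world_size == 1 or ParallelMode.DATA
print(bool(old))   # True

# Fixed condition: ZERO1 only when zero is effectively disabled (world size 1)
# or the group is a plain data-parallel group; expert groups otherwise
# fall through to EXPERT_DATA.
new = zero1_world_size == 1 or dp_mode == ParallelMode.DATA
print(new)         # False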