@@ -55,6 +55,7 @@ class MoeHybridParallelZeroOptimizer(HybridParallelZeroOptimizer):
         partition_grad: bool = False,  # ZeRO stage 2 flag
         cpu_offload: bool = False,  # CPU offload
         forced_dtype: Optional[torch.dtype] = None,
+        overlap_allgather: bool = False,
     ):
         WARN_STR = "Note that you need to make sure every expert is routed (i.e., every expert receives a gradient in the backward pass); otherwise this might lead to a program hang or inconsistent results"
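
For context, here is a minimal usage sketch of the new `overlap_allgather` flag. This is an illustration, not part of the diff: it assumes the `MoeHybridParallelPlugin` exposes `overlap_allgather` and forwards it down to `MoeHybridParallelZeroOptimizer`, and the parallel sizes, model, and optimizer names are placeholders.

```python
# Sketch only: assumes a distributed environment already launched via
# colossalai.launch / torchrun, and that MoeHybridParallelPlugin forwards
# overlap_allgather to MoeHybridParallelZeroOptimizer (per this hunk).
import torch
from colossalai.booster import Booster
from colossalai.booster.plugin import MoeHybridParallelPlugin

plugin = MoeHybridParallelPlugin(
    tp_size=1,                # placeholder parallel sizes
    pp_size=1,
    ep_size=2,                # expert parallelism for the MoE layers
    zero_stage=1,             # ZeRO stage handled by this optimizer
    overlap_allgather=True,   # the flag added in this hunk
)
booster = Booster(plugin=plugin)

# `model` and `optimizer` are placeholders for your MoE model and optimizer.
model, optimizer, *_ = booster.boost(model, optimizer)
```

Per the warning string above, enabling this path still requires that every expert participates in the backward pass, or the collective communication may hang or produce inconsistent results.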