diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py
index 3dd57eb..566bb0f 100644
--- a/internlm/utils/model_checkpoint.py
+++ b/internlm/utils/model_checkpoint.py
@@ -258,7 +258,14 @@ def save_model_checkpoint(folder, model):
         llm_save(topo_fp, saved_obj=topo)
 
     # try to save expert parameter to separate files if model have moe layer
-    try_save_moe_checkpoint(folder, model, tp_rank, pp_rank)
+    expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA)
+    expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA)
+    should_save_rank_pair.clear()
+    for i in range(tp_size):
+        should_save_rank_pair.add((i, i % expert_dp_size))
+
+    if (tp_rank, expert_dp_rank) in should_save_rank_pair:
+        try_save_moe_checkpoint(folder, model, tp_rank, pp_rank)
 
     torch.distributed.barrier()
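
The diff restricts the MoE expert save to one expert-data-parallel replica per tensor-parallel rank, so duplicate copies of the same expert weights are not written by every replica. Below is a minimal standalone sketch of that rank-pair selection, extracted from the diff for illustration; the helper name `select_saving_ranks` is hypothetical and not part of the repository.

```python
def select_saving_ranks(tp_size: int, expert_dp_size: int) -> set:
    """Return the (tp_rank, expert_dp_rank) pairs that should write the MoE
    expert checkpoint, mirroring the selection logic added in the diff."""
    return {(i, i % expert_dp_size) for i in range(tp_size)}


# Example: with 4 tensor-parallel ranks and 2 expert-data-parallel replicas,
# exactly one replica per tensor-parallel rank saves, spread across replicas:
# {(0, 0), (1, 1), (2, 0), (3, 1)}
print(select_saving_ranks(4, 2))
```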