From c4c43bf15752df4aacc3fec2b9beab8f91838675 Mon Sep 17 00:00:00 2001
From: Wenwen Qu
Date: Wed, 27 Sep 2023 19:00:36 +0800
Subject: [PATCH] fix model checkpoint for local dp mode of expert

---
 internlm/utils/model_checkpoint.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/internlm/utils/model_checkpoint.py b/internlm/utils/model_checkpoint.py
index 3dd57eb..566bb0f 100644
--- a/internlm/utils/model_checkpoint.py
+++ b/internlm/utils/model_checkpoint.py
@@ -258,7 +258,14 @@ def save_model_checkpoint(folder, model):
         llm_save(topo_fp, saved_obj=topo)
 
     # try to save expert parameter to separate files if model have moe layer
-    try_save_moe_checkpoint(folder, model, tp_rank, pp_rank)
+    expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA)
+    expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA)
+    should_save_rank_pair.clear()
+    for i in range(tp_size):
+        should_save_rank_pair.add((i, i % expert_dp_size))
+
+    if (tp_rank, expert_dp_rank) in should_save_rank_pair:
+        try_save_moe_checkpoint(folder, model, tp_rank, pp_rank)
 
     torch.distributed.barrier()
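
Note (not part of the patch): the added logic picks exactly one expert-data-parallel rank per tensor-parallel rank to write the expert (MoE) weights, spreading the writers round-robin over the expert DP group via `i % expert_dp_size`, so replicated expert parameters are not saved multiple times. The standalone sketch below mirrors that selection rule for illustration; `expert_saver_pairs` and the example sizes are hypothetical and do not exist in the InternLM codebase.

```python
# Standalone sketch of the rank-pair selection added in the patch.
# expert_saver_pairs is a hypothetical helper, not InternLM code.

def expert_saver_pairs(tp_size: int, expert_dp_size: int) -> set:
    """One expert-data-parallel rank is chosen per tensor-parallel rank,
    assigned round-robin across the expert DP group."""
    pairs = set()
    for i in range(tp_size):
        pairs.add((i, i % expert_dp_size))
    return pairs


if __name__ == "__main__":
    # Example: 4-way tensor parallel, 2-way expert data parallel.
    # Only these (tp_rank, expert_dp_rank) pairs would call
    # try_save_moe_checkpoint; the remaining expert DP replicas skip the
    # write, avoiding duplicate expert checkpoint files.
    print(sorted(expert_saver_pairs(tp_size=4, expert_dp_size=2)))
    # -> [(0, 0), (1, 1), (2, 0), (3, 1)]
```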