fix model checkpoint for local dp mode of expert

pull/376/head
Wenwen Qu 2023-09-27 19:00:36 +08:00
parent 2b863bd099
commit c4c43bf157
1 changed file with 8 additions and 1 deletion


@@ -258,6 +258,13 @@ def save_model_checkpoint(folder, model):
llm_save(topo_fp, saved_obj=topo)

# try to save expert parameters to separate files if the model has MoE layers
expert_dp_size = gpc.get_world_size(ParallelMode.EXPERT_DATA)
expert_dp_rank = gpc.get_local_rank(ParallelMode.EXPERT_DATA)
should_save_rank_pair.clear()
for i in range(tp_size):
    should_save_rank_pair.add((i, i % expert_dp_size))
if (tp_rank, expert_dp_rank) in should_save_rank_pair:
    try_save_moe_checkpoint(folder, model, tp_rank, pp_rank)
torch.distributed.barrier()
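As a rough illustration of the rank-selection logic in this hunk, the standalone sketch below mirrors the `should_save_rank_pair` construction with plain integers instead of the `gpc` / `ParallelMode.EXPERT_DATA` calls; the `tp_size` and `expert_dp_size` values are hypothetical and not taken from the commit.

```python
# Standalone sketch of the save-rank selection; tp_size and expert_dp_size
# are assumed example values, not values from the actual training setup.
tp_size = 4
expert_dp_size = 2

# Mirror the loop in the diff: pick one expert data-parallel rank per
# tensor-parallel rank, spreading the expert-checkpoint writes across
# the expert data-parallel group.
should_save_rank_pair = set()
for i in range(tp_size):
    should_save_rank_pair.add((i, i % expert_dp_size))

# Each rank checks whether its own (tp_rank, expert_dp_rank) pair was selected
# and, if so, would call try_save_moe_checkpoint in the real code path.
for tp_rank in range(tp_size):
    for expert_dp_rank in range(expert_dp_size):
        saves = (tp_rank, expert_dp_rank) in should_save_rank_pair
        print(f"tp_rank={tp_rank}, expert_dp_rank={expert_dp_rank}: saves={saves}")
```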