From db685e8a3157e38fd07920022f4528137ee5a32a Mon Sep 17 00:00:00 2001
From: zhanglei
Date: Mon, 21 Aug 2023 09:59:58 +0800
Subject: [PATCH] fix the pp moe bugs

---
 internlm/core/scheduler/pipeline_scheduler.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py
index d6db0e2..1b749e7 100644
--- a/internlm/core/scheduler/pipeline_scheduler.py
+++ b/internlm/core/scheduler/pipeline_scheduler.py
@@ -1326,9 +1326,11 @@ class InterleavedPipelineScheduler(PipelineScheduler):
             output, label = pack_return_tensors(self._return_tensors)
         else:
             output, label = (None, None)
-        accum_loss = self._accum_loss
 
-        logger.info(f"{gpc.get_local_rank(ParallelMode.PIPELINE)}, moe_loss: {accum_moe_loss.item()}")
+        accum_loss = self._accum_loss
+        accum_loss += self._accum_moe_loss
+
+        logger.info(f"{gpc.get_local_rank(ParallelMode.PIPELINE)}, moe_loss: {self._accum_moe_loss.item()}")
         dist.all_reduce(self._accum_moe_loss, group=gpc.get_group(ParallelMode.PIPELINE))
         accum_moe_loss = self._accum_moe_loss
 