diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py
index a29cdc5..6d18e02 100644
--- a/internlm/core/scheduler/pipeline_scheduler.py
+++ b/internlm/core/scheduler/pipeline_scheduler.py
@@ -646,7 +646,8 @@ class PipelineScheduler(BaseScheduler):
             return_loss (bool, optional): Whether returns the loss value. Default is true.
             return_output_label (bool, optional): If False, the output and label won't be returned.
 
         Returns:
-            Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss, loss), loss and label could be None.
+            Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss, moe_loss), loss and label could be None.
+                The loss would be returned only in the last stage. And the moe_loss is accumulated from all stages.
         """
         assert (
@@ -1316,8 +1317,8 @@ class InterleavedPipelineScheduler(PipelineScheduler):
             return_output_label (bool, optional): If False, the output and label won't be returned.
 
         Returns:
-            Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss), loss and label could be None.
-                The loss would be returned only in the last stage.
+            Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss, moe_loss), loss and label could be None.
+                The loss would be returned only in the last stage. And the moe_loss is accumulated from all stages.
         """
         assert (
             forward_only or return_loss
diff --git a/internlm/core/trainer.py b/internlm/core/trainer.py
index 7954bc6..705b17a 100644
--- a/internlm/core/trainer.py
+++ b/internlm/core/trainer.py
@@ -203,7 +203,7 @@ class Trainer:
             **kwargs: Additional keyword arguments.
 
         Returns:
-            Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss).
+            Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss, moe_loss).
         """
         output, label, loss, moe_loss = self._schedule.forward_backward_step(self._engine, data_iter, **kwargs)
         return output, label, loss, moe_loss
diff --git a/internlm/model/moe.py b/internlm/model/moe.py
index 414baa0..28e5ae6 100644
--- a/internlm/model/moe.py
+++ b/internlm/model/moe.py
@@ -9,12 +9,6 @@
 from internlm.moe.experts import Experts
 from internlm.moe.sharded_moe import MOELayer, TopKGate
 from internlm.utils.logger import get_logger
 
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-
 # global llm logger
 logger = get_logger(__file__)
diff --git a/internlm/moe/experts.py b/internlm/moe/experts.py
index ab93a0f..be06686 100644
--- a/internlm/moe/experts.py
+++ b/internlm/moe/experts.py
@@ -4,12 +4,6 @@ https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py
 Git commit hash: f3943cf9109226ed3ecf2d5dbb639a11cd925555
 We retain the following license from the original files:
 """
-
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
 from typing import Union, cast
 
 import torch
diff --git a/internlm/moe/sharded_moe.py b/internlm/moe/sharded_moe.py
index caddc8a..dbee2a4 100644
--- a/internlm/moe/sharded_moe.py
+++ b/internlm/moe/sharded_moe.py
@@ -4,13 +4,6 @@ https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py
 Git commit hash: f3943cf9109226ed3ecf2d5dbb639a11cd925555
 We retain the following license from the original files:
 """
-
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple
 
 import torch
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index c6897eb..ccdb53c 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -538,8 +538,7 @@ class HybridZeroOptimizer(BaseOptimizer):
 
     def _compute_norm_with_moe_group(self, group_id):
         params = self._param_store.get_fp16_params_by_rank_group(group_id=group_id, rank=self._zero_local_rank)
-        # wo do not get the average grad for moe parameters, so we have to constuct the gradients list here.
-        # Maybe this can be optimized.
+        # we do not get the average grad for moe parameters, so we have to construct the gradients list here.
         grads = [p.grad for p in params]
 
         if len(params) == 0:
@@ -696,14 +695,11 @@
 
             # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
            # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
-            if self._is_norm_group(self.optim.param_groups[group_id]):
-                dist.all_reduce(
-                    flat_fp32_avg_grads,
-                    op=dist.ReduceOp.AVG,
-                    group=gpc.get_group(ParallelMode.TENSOR),
-                )
-
-            if self._is_gate_group(self.optim.param_groups[group_id]):
+            is_tp_sync_groups = (
+                self._is_norm_group(self.optim.param_groups[group_id]),
+                self._is_gate_group(self.optim.param_groups[group_id]),
+            )
+            if any(is_tp_sync_groups):
                 dist.all_reduce(
                     flat_fp32_avg_grads,
                     op=dist.ReduceOp.AVG,
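
For context, a minimal caller-side sketch (not part of the patch) of the return contract documented in the pipeline_scheduler.py and trainer.py hunks above. The names scheduler, engine and data_iter are placeholders for an already-constructed pipeline scheduler, engine and training data iterator; only the keyword arguments and the (output, label, loss, moe_loss) unpacking come from the diff.

# Sketch only: `scheduler`, `engine` and `data_iter` are assumed to exist in the caller's scope.
output, label, loss, moe_loss = scheduler.forward_backward_step(
    engine,
    data_iter,
    forward_only=False,
    return_loss=True,
    return_output_label=False,
)

# Per the updated docstrings, `loss` (and `label`) may be None on every pipeline
# stage except the last, while `moe_loss` is accumulated from all stages.
# Assuming both are scalar tensors here:
if loss is not None:
    print(f"loss={loss.item()}, moe_loss={moe_loss.item()}")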