diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py
index d2f8412..cba39fb 100644
--- a/internlm/core/context/parallel_context.py
+++ b/internlm/core/context/parallel_context.py
@@ -491,7 +491,7 @@ class ParallelContext(metaclass=SingletonMeta):
             initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
         if self.pipeline_parallel_size > 1:
             initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
-        if self.config.model.num_experts > 1:
+        if self.config.model.get("num_experts", 1) > 1:
             initializers.append(pgroup_initializer.Initializer_Expert(*initializer_args))
         for initializer in initializers:
             parallel_setting = initializer.init_dist_group()
diff --git a/internlm/core/scheduler/pipeline_scheduler.py b/internlm/core/scheduler/pipeline_scheduler.py
index 58b36a3..a29cdc5 100644
--- a/internlm/core/scheduler/pipeline_scheduler.py
+++ b/internlm/core/scheduler/pipeline_scheduler.py
@@ -15,13 +15,10 @@ from internlm.core.context import global_context as gpc
 from internlm.core.engine import Engine
 from internlm.core.naive_amp import NaiveAMPModel
 from internlm.utils.common import get_current_device, move_to_device
-from internlm.utils.logger import get_logger
 from internlm.utils.timeout import llm_timeout
 
 from .base_scheduler import BaseScheduler, SchedulerHook
 
-logger = get_logger(__file__)
-
 
 def get_tensor_shape():
     if hasattr(gpc.config, "TENSOR_SHAPE"):
@@ -1347,8 +1344,6 @@ class InterleavedPipelineScheduler(PipelineScheduler):
         else:
             output, label = (None, None)
 
-        logger.info(f"{gpc.get_local_rank(ParallelMode.PIPELINE)}, moe_loss: {self._accum_moe_loss.item()}")
-
         dist.all_reduce(self._accum_moe_loss, group=gpc.get_group(ParallelMode.PIPELINE))
         accum_moe_loss = self._accum_moe_loss
 
diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index 9ab410c..29cd7bf 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -501,7 +501,7 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"),
     all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
     parts = all_parts[pipeline_rank]
     if gpc.is_rank_for_log():
-        logger.info(f"The layer sharding is {all_parts}.")  # pylint: disable=W1203
+        logger.info(f"The layer sharding is {all_parts}.")
 
     models = []
 
diff --git a/internlm/moe/sharded_moe.py b/internlm/moe/sharded_moe.py
index 6a6c511..caddc8a 100644
--- a/internlm/moe/sharded_moe.py
+++ b/internlm/moe/sharded_moe.py
@@ -69,10 +69,6 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor:
     return gumbel(shape)
 
 
-# einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity
-# See https://arxiv.org/pdf/2006.16668.pdf for details.
-
-
 # Based on https://github.com/pytorch/pytorch/pull/40762
 class _AllToAll(torch.autograd.Function):
     """
@@ -477,10 +473,10 @@
 
         combined_output = einsum("sec,ecm->sm", combine_weights.type_as(inputs[0]), expert_output)
 
-        a = combined_output.reshape(inputs[0].shape)
+        out = combined_output.reshape(inputs[0].shape)
 
         if self.wall_clock_breakdown:
             timer("moe").stop()
             self.time_moe = timer("moe").elapsed(reset=False)
 
-        return a
+        return out
diff --git a/internlm/solver/optimizer/hybrid_zero_optim.py b/internlm/solver/optimizer/hybrid_zero_optim.py
index 3fe3eb5..c6897eb 100644
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@@ -592,7 +592,7 @@ class HybridZeroOptimizer(BaseOptimizer):
         groups_norms = []
         for group_id in range(self.num_param_groups):
             if self._is_moe_group(self.optim.param_groups[group_id]):
-                groups_norms.append([])
+                groups_norms.append(None)
             else:
                 groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
 
diff --git a/internlm/train/utils.py b/internlm/train/utils.py
index f91f1f1..29f238a 100644
--- a/internlm/train/utils.py
+++ b/internlm/train/utils.py
@@ -9,12 +9,23 @@ def split_params_into_different_groups_for_optimizer(param_groups: Tuple[Dict])
     Compatiable with muiltiple param groups, each should have a name
 
     Args:
-        param_groups (Tuple[Dict]):
-            The list of parameter groups to split
+        param_groups (Tuple[Dict]): The list of parameter groups to split
+        Output Example:
+        >>> (
+        >>>     {'name': 'default', 'params': [tensor], 'weight_decay' :xxx},
+        >>>     ...,
+        >>> )
 
     Returns:
-        Tuple[Dict]:
-            list of MoE/non-MoE groups for optimizer
+        Tuple[Dict]: list of params groups for optimizer
+        Output Example:
+        >>> (
+        >>>     {'name': 'default','params': [tensor],'weight_decay' :xxx},
+        >>>     {'name': 'norm', 'norm': True, 'params': [tensor],'weight_decay' :xxx},
+        >>>     {'name': 'gate', 'gate': True, 'params': [tensor],'weight_decay' :xxx},
+        >>>     {'name': 'moe_ep_size_4', 'moe': True, 'params': [tensor],'weight_decay' :xxx},
+        >>>     ...,
+        >>> )
     """
     if isinstance(param_groups, tuple):
         param_groups = list(param_groups)  # Tuple cannot be modified
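
Note on the parallel_context.py hunk: dense (non-MoE) model configs may not define num_experts at all, so the check now falls back to a default of 1 via dict-style lookup instead of a direct member access. A minimal standalone sketch of the pattern, using a plain dict as a stand-in for gpc.config.model (the config keys and values below are hypothetical, not taken from this PR):

# Sketch only, not InternLM code: illustrates the .get("num_experts", 1) pattern.
dense_cfg = {"num_layers": 32, "hidden_size": 4096}                  # hypothetical dense config, no "num_experts"
moe_cfg = {"num_layers": 32, "hidden_size": 4096, "num_experts": 4}  # hypothetical MoE config

# A missing key falls back to the default of 1, so dense configs skip the
# expert process-group initializer instead of failing on the lookup.
assert not (dense_cfg.get("num_experts", 1) > 1)
assert moe_cfg.get("num_experts", 1) > 1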