diff --git a/internlm/initialize/initialize_trainer.py b/internlm/initialize/initialize_trainer.py
index e6c779c..ec65f49 100644
--- a/internlm/initialize/initialize_trainer.py
+++ b/internlm/initialize/initialize_trainer.py
@@ -70,8 +70,7 @@ def initialize_trainer(
     assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer"
 
-    # gradient handler, only support PipelineSharedModuleGradientHandler now
-    # TODO: can refactor code here
+    # gradient handler, supports PipelineSharedModuleGradientHandler and EmbeddingSharedModuleGradientHandler now
     if gpc.is_using_pp():
         gpc.config.gradient_handler = [
             dict(type="PipelineSharedModuleGradientHandler"),
diff --git a/internlm/model/linear.py b/internlm/model/linear.py
index 8ab8707..2b11180 100644
--- a/internlm/model/linear.py
+++ b/internlm/model/linear.py
@@ -58,7 +58,7 @@ class ScaleColumnParallelLinear(nn.Linear):
         # If not, then the input is already gathered.
         if shared_weight is None:
             if self.weight is None:
-                raise RuntimeError("weight was not given in forward pass " "and skip_weight_allocation is True.")
+                raise RuntimeError("weight was not given in forward pass and skip_weight_allocation is True.")
             shared_weight = self.weight
         if self.weight_scale != 1:
             weight = shared_weight * self.weight_scale + (1 - self.weight_scale) * shared_weight.detach()
@@ -102,7 +102,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
         skip_weight_alloction: bool = False,
     ) -> None:
         # TODO have not use RewardModelLinear for now
-        assert not skip_weight_alloction, "shared weight not support here for now"
+        assert not skip_weight_alloction, "shared weight is not supported in RewardModelLinear for now"
         super().__init__(in_features, out_features, process_group, bias, device, dtype, weight_scale)
         torch.distributed.broadcast(self.weight, gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], process_group)
diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py
index fa53d27..0f033ae 100644
--- a/internlm/model/modeling_moe.py
+++ b/internlm/model/modeling_moe.py
@@ -331,7 +331,7 @@ class PackedFlashInternLm1D(nn.Module):
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
         moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
                                            (https://arxiv.org/abs/2201.05596) layer.
-        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
+        tie_embeddings_and_output_weights: default=False, whether embedding and output layer share the same weight.
     """
 
     def __init__(
@@ -529,7 +529,7 @@ class PackedFlashInternLm1D(nn.Module):
        device: Optional[torch.device] = None,
     ):
         if not self.tie_embeddings_and_output_weights:
-            raise Exception("initialize_word_embeddings() was called but " "tie_embeddings_and_output_weights is false")
+            raise Exception("initialize_word_embeddings() was called but tie_embeddings_and_output_weights is false")
 
         # This function just initializes the word embeddings in the final stage
         # when we are using pipeline parallelism. Nothing to do if we aren't
@@ -546,9 +546,9 @@ class PackedFlashInternLm1D(nn.Module):
         # 2. Do an all-reduce between the first and last stage to ensure that
         #    the two copies of word_embeddings start off with the same
         #    parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of
-        #    the two word_embeddings layers to ensure that every applied weight
-        #    update is the same on both stages.
+        # 3. In the training loop, before the optimizer step, perform an all-reduce
+        #    between the grads of the two word_embeddings layers to ensure that
+        #    every applied weight update is the same on both stages.
         if gpc.is_pipeline_last_stage():
             assert not gpc.is_pipeline_first_stage()
             # set word_embeddings weights to 0 here, then copy first
@@ -692,7 +692,7 @@ def build_model_with_moe_cfg(
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
         moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
                                            (https://arxiv.org/abs/2201.05596) layer.
-        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
+        tie_embeddings_and_output_weights: default=False, whether embedding and output layer share the same weight.
     """
 
     cfg = dict(
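The step-3 comment above describes syncing the tied word-embedding gradient between the first and last pipeline stages before the optimizer step. Below is a minimal sketch of that all-reduce, assuming a hypothetical shared_embedding_weight() accessor on the model and an embedding_group process group that contains only the first and last pipeline ranks; in this PR the equivalent synchronization is presumably wired up through the EmbeddingSharedModuleGradientHandler referenced in initialize_trainer.py.

import torch.distributed as dist


def allreduce_tied_embedding_grads(model, embedding_group):
    # Step 3 from the comment above: all-reduce the grad of the tied word-embedding
    # weight so the first and last pipeline stages apply identical updates.
    # `shared_embedding_weight` and `embedding_group` are hypothetical names for this sketch.
    weight = model.shared_embedding_weight()
    if weight.grad is not None:
        dist.all_reduce(weight.grad, op=dist.ReduceOp.SUM, group=embedding_group)


# Usage sketch inside the training loop: after backward(), before optimizer.step():
#     allreduce_tied_embedding_grads(model, embedding_group)
#     optimizer.step()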