change code comments

pull/422/head
Wenwen Qu 2023-10-18 17:37:20 +08:00
parent bf6dbf07fa
commit c0d9063a8d
3 changed files with 9 additions and 10 deletions

@@ -70,8 +70,7 @@ def initialize_trainer(
     assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer"
-    # gradient handler, only support PipelineSharedModuleGradientHandler now
-    # TODO: can refactor code here
+    # gradient handler, support PipelineSharedModuleGradientHandler and EmbeddingSharedModuleGradientHandler now
     if gpc.is_using_pp():
         gpc.config.gradient_handler = [
             dict(type="PipelineSharedModuleGradientHandler"),
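
For context on the handlers the new comment names: a pipeline/embedding shared-module gradient handler simply all-reduces the gradients of parameters that are replicated on more than one stage, so the copies stay in sync. A minimal sketch of that idea (not the InternLM implementation; the class and argument names here are illustrative only):

```python
# Minimal sketch (illustrative, not the InternLM implementation): what a
# shared-module gradient handler has to do. When two pipeline stages each hold
# a copy of a shared module (e.g. tied embeddings), their local gradients must
# be summed across the sharing group before the optimizer step.
import torch
import torch.distributed as dist


class SharedParamGradientHandler:  # hypothetical name, for illustration only
    def __init__(self, shared_params, process_group=None):
        self.shared_params = list(shared_params)  # parameters replicated on several stages
        self.process_group = process_group        # group containing the stages that share them

    def handle_gradient(self):
        # Called after backward() and before optimizer.step(): sum the gradients
        # of every shared parameter across the group so the replicated copies
        # apply identical updates.
        for param in self.shared_params:
            if param.grad is not None:
                dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, group=self.process_group)
```

In the hunk above, handlers of this kind are registered through `gpc.config.gradient_handler` entries such as `dict(type="PipelineSharedModuleGradientHandler")`.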

@@ -58,7 +58,7 @@ class ScaleColumnParallelLinear(nn.Linear):
         # If not, then the input is already gathered.
         if shared_weight is None:
             if self.weight is None:
-                raise RuntimeError("weight was not given in forward pass " "and skip_weight_allocation is True.")
+                raise RuntimeError("weight was not given in forward pass and skip_weight_allocation is True.")
             shared_weight = self.weight
         if self.weight_scale != 1:
             weight = shared_weight * self.weight_scale + (1 - self.weight_scale) * shared_weight.detach()
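
The `weight_scale` line in the surrounding context uses a detach trick worth spelling out: the forward value stays numerically equal to the weight, while the gradient reaching the parameter is multiplied by `weight_scale`. A small standalone check of the arithmetic (not InternLM code):

```python
# Quick check of the weight_scale trick used in the forward above: the detach()
# term keeps the value of `weight` equal to the parameter, while the gradient
# that flows back into the parameter is scaled by weight_scale.
import torch

w = torch.ones(3, requires_grad=True)
scale = 2.0

weight = w * scale + (1 - scale) * w.detach()
print(torch.allclose(weight, w))  # True: forward value is unchanged

weight.sum().backward()
print(w.grad)  # tensor([2., 2., 2.]): gradient scaled by weight_scale
```
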
@@ -102,7 +102,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
         skip_weight_alloction: bool = False,
     ) -> None:
         # TODO have not use RewardModelLinear for now
-        assert not skip_weight_alloction, "shared weight not support here for now"
+        assert not skip_weight_alloction, "shared weight is not supported in RewardModelLinear for now"
         super().__init__(in_features, out_features, process_group, bias, device, dtype, weight_scale)
         torch.distributed.broadcast(self.weight, gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], process_group)
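
The `torch.distributed.broadcast` call above makes every rank in the tensor-parallel group start from the first rank's reward-head weight. A minimal sketch of that pattern (simplified helper, not part of the repository):

```python
# Minimal sketch of the broadcast pattern in RewardModelLinear.__init__ above:
# after construction, every rank in the group overwrites its randomly
# initialized weight with rank `src_rank`'s copy, so the reward head starts
# out identical everywhere. Group/rank handling is simplified for illustration.
import torch.distributed as dist
import torch.nn as nn


def sync_linear_weight(layer: nn.Linear, src_rank: int, group=None):
    # broadcast() copies src_rank's tensor into every other rank's tensor in place
    dist.broadcast(layer.weight.data, src=src_rank, group=group)
    if layer.bias is not None:
        dist.broadcast(layer.bias.data, src=src_rank, group=group)
```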

@@ -331,7 +331,7 @@ class PackedFlashInternLm1D(nn.Module):
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
         moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
                                            (https://arxiv.org/abs/2201.05596) layer.
-        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
+        tie_embeddings_and_output_weights: default=False, whether embedding and output layer share the same weight.
     """
 
     def __init__(
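
For readers unfamiliar with the `tie_embeddings_and_output_weights` option documented here: tying means the output projection reuses the token-embedding matrix rather than allocating a second one. A toy, self-contained illustration (not the InternLM model):

```python
# Toy illustration of weight tying: the output head shares the embedding's
# (vocab_size, hidden_size) parameter instead of owning its own copy.
import torch
import torch.nn as nn


class TinyTiedLM(nn.Module):  # illustrative toy model, not PackedFlashInternLm1D
    def __init__(self, vocab_size: int, hidden_size: int, tie_embeddings_and_output_weights: bool = False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.head = nn.Linear(hidden_size, vocab_size, bias=False)
        if tie_embeddings_and_output_weights:
            # one shared parameter for both layers
            self.head.weight = self.embedding.weight

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        return self.head(self.embedding(token_ids))


model = TinyTiedLM(vocab_size=100, hidden_size=16, tie_embeddings_and_output_weights=True)
print(model.head.weight is model.embedding.weight)  # True: a single shared tensor
```
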
@@ -529,7 +529,7 @@ class PackedFlashInternLm1D(nn.Module):
         device: Optional[torch.device] = None,
     ):
         if not self.tie_embeddings_and_output_weights:
-            raise Exception("initialize_word_embeddings() was called but " "tie_embeddings_and_output_weights is false")
+            raise Exception("initialize_word_embeddings() was called but tie_embeddings_and_output_weights is false")
         # This function just initializes the word embeddings in the final stage
         # when we are using pipeline parallelism. Nothing to do if we aren't
@@ -546,9 +546,9 @@ class PackedFlashInternLm1D(nn.Module):
         # 2. Do an all-reduce between the first and last stage to ensure that
         #    the two copies of word_embeddings start off with the same
         #    parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of
-        #    the two word_embeddings layers to ensure that every applied weight
-        #    update is the same on both stages.
+        # 3. In the training loop, before the optimizer step, perform an all-reduce
+        #    between the grads of the two word_embeddings layers to ensure that
+        #    every applied weight update is the same on both stages.
         if gpc.is_pipeline_last_stage():
             assert not gpc.is_pipeline_first_stage()
             # set word_embeddings weights to 0 here, then copy first
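
The three steps in the updated comment map onto two plain `torch.distributed` operations. A simplified sketch, assuming `embed_group` contains only the first and last pipeline ranks and `word_embeddings` is each stage's local copy of the embedding module:

```python
# Sketch of the three steps described in the comment above, in plain
# torch.distributed terms (simplified; names are illustrative assumptions).
import torch.distributed as dist


def init_tied_embeddings(word_embeddings, is_last_stage: bool, embed_group):
    # 1. The last stage zeroes its copy.
    # 2. An all-reduce then copies the first stage's initialized values into it,
    #    so both copies start off with the same parameter values.
    if is_last_stage:
        word_embeddings.weight.data.zero_()
    dist.all_reduce(word_embeddings.weight.data, group=embed_group)


def sync_tied_embedding_grads(word_embeddings, embed_group):
    # 3. Each iteration, before optimizer.step(): sum the two copies' gradients
    #    so both stages apply exactly the same weight update.
    if word_embeddings.weight.grad is not None:
        dist.all_reduce(word_embeddings.weight.grad, group=embed_group)
```

This bookkeeping is presumably what the EmbeddingSharedModuleGradientHandler mentioned in the first hunk takes care of during training.
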
@@ -692,7 +692,7 @@ def build_model_with_moe_cfg(
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
         moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
                                            (https://arxiv.org/abs/2201.05596) layer.
-        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
+        tie_embeddings_and_output_weights: default=False, whether embedding and output layer share the same weight.
     """
     cfg = dict(