diff --git a/internlm/initialize/initialize_trainer.py b/internlm/initialize/initialize_trainer.py
index e6c779c..ec65f49 100644
--- a/internlm/initialize/initialize_trainer.py
+++ b/internlm/initialize/initialize_trainer.py
@@ -70,8 +70,7 @@ def initialize_trainer(
     assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer"
 
-    # gradient handler, only support PipelineSharedModuleGradientHandler now
-    # TODO: can refactor code here
+    # gradient handler, supports PipelineSharedModuleGradientHandler and EmbeddingSharedModuleGradientHandler now
     if gpc.is_using_pp():
         gpc.config.gradient_handler = [
             dict(type="PipelineSharedModuleGradientHandler"),
diff --git a/internlm/model/linear.py b/internlm/model/linear.py
index 8ab8707..2b11180 100644
--- a/internlm/model/linear.py
+++ b/internlm/model/linear.py
@@ -58,7 +58,7 @@ class ScaleColumnParallelLinear(nn.Linear):
         # If not, then the input is already gathered.
         if shared_weight is None:
             if self.weight is None:
-                raise RuntimeError("weight was not given in forward pass " "and skip_weight_allocation is True.")
+                raise RuntimeError("weight was not given in forward pass and skip_weight_allocation is True.")
             shared_weight = self.weight
         if self.weight_scale != 1:
             weight = shared_weight * self.weight_scale + (1 - self.weight_scale) * shared_weight.detach()
@@ -102,7 +102,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
         skip_weight_alloction: bool = False,
     ) -> None:
         # TODO have not use RewardModelLinear for now
-        assert not skip_weight_alloction, "shared weight not support here for now"
+        assert not skip_weight_alloction, "shared weight is not supported in RewardModelLinear for now"
         super().__init__(in_features, out_features, process_group, bias, device, dtype, weight_scale)
         torch.distributed.broadcast(self.weight, gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], process_group)
diff --git a/internlm/model/modeling_moe.py b/internlm/model/modeling_moe.py
index fa53d27..0f033ae 100644
--- a/internlm/model/modeling_moe.py
+++ b/internlm/model/modeling_moe.py
@@ -331,7 +331,7 @@ class PackedFlashInternLm1D(nn.Module):
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
         moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
                                            (https://arxiv.org/abs/2201.05596) layer.
-        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
+        tie_embeddings_and_output_weights: default=False, whether embedding and output layer share the same weight.
     """
 
     def __init__(
@@ -529,7 +529,7 @@ class PackedFlashInternLm1D(nn.Module):
        device: Optional[torch.device] = None,
     ):
         if not self.tie_embeddings_and_output_weights:
-            raise Exception("initialize_word_embeddings() was called but " "tie_embeddings_and_output_weights is false")
+            raise Exception("initialize_word_embeddings() was called but tie_embeddings_and_output_weights is false")
 
         # This function just initializes the word embeddings in the final stage
         # when we are using pipeline parallelism. Nothing to do if we aren't
@@ -546,9 +546,9 @@ class PackedFlashInternLm1D(nn.Module):
         # 2. Do an all-reduce between the first and last stage to ensure that
         #    the two copies of word_embeddings start off with the same
         #    parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of
-        #    the two word_embeddings layers to ensure that every applied weight
-        #    update is the same on both stages.
+        # 3. In the training loop, before the optimizer step, perform an all-reduce
+        #    between the grads of the two word_embeddings layers to ensure that
+        #    every applied weight update is the same on both stages.
         if gpc.is_pipeline_last_stage():
             assert not gpc.is_pipeline_first_stage()
             # set word_embeddings weights to 0 here, then copy first
@@ -692,7 +692,7 @@ def build_model_with_moe_cfg(
         moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
         moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
                                            (https://arxiv.org/abs/2201.05596) layer.
-        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
+        tie_embeddings_and_output_weights: default=False, whether embedding and output layer share the same weight.
     """
 
     cfg = dict(
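The step-3 comment above describes syncing the tied word-embedding gradient between the first and last pipeline stages before the optimizer step. Below is a minimal sketch of that all-reduce, assuming a hypothetical shared_embedding_weight() accessor on the model and an embedding_group process group that contains only the first and last pipeline ranks; in this PR the equivalent synchronization is presumably wired up through the EmbeddingSharedModuleGradientHandler referenced in initialize_trainer.py.

import torch.distributed as dist


def allreduce_tied_embedding_grads(model, embedding_group):
    # Step 3 from the comment above: all-reduce the grad of the tied word-embedding
    # weight so the first and last pipeline stages apply identical updates.
    # `shared_embedding_weight` and `embedding_group` are hypothetical names for this sketch.
    weight = model.shared_embedding_weight()
    if weight.grad is not None:
        dist.all_reduce(weight.grad, op=dist.ReduceOp.SUM, group=embedding_group)


# Usage sketch inside the training loop: after backward(), before optimizer.step():
#     allreduce_tied_embedding_grads(model, embedding_group)
#     optimizer.step()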