diff --git a/internlm/core/context/parallel_context.py b/internlm/core/context/parallel_context.py index ceea60c..a8e8101 100644 --- a/internlm/core/context/parallel_context.py +++ b/internlm/core/context/parallel_context.py @@ -36,7 +36,7 @@ class Config(dict): config (dict): The dict object to be wrapped. """ - def __init__(self, config: dict = None): + def __init__(self, config: dict = None): # pylint: disable=W0231 if config is not None: for k, v in config.items(): self._add_item(k, v) @@ -100,7 +100,7 @@ class Config(dict): module_name = filepath.stem source_file = SourceFileLoader(fullname=str(module_name), path=str(filepath)) - module = source_file.load_module() # pylint: disable=W4902,E1120 + module = source_file.load_module() # pylint: disable=W4902,E1120,W1505 # load into config config = Config() @@ -527,6 +527,7 @@ class ParallelContext(metaclass=SingletonMeta): if dpseed_with_tpoffset: dp_seed = seed + pipeline_offset * 1024 add_seed(ParallelMode.DATA, dp_seed) + add_seed(ParallelMode.DUMMY, dp_seed) # model parallel seeds are different across ranks if self.is_initialized(ParallelMode.TENSOR): @@ -534,7 +535,11 @@ class ParallelContext(metaclass=SingletonMeta): tp_seed = seed + tp_rank + pipeline_offset * 1024 add_seed(ParallelMode.TENSOR, tp_seed) - set_mode(ParallelMode.DATA) + # we do not set the random state mode to ParallelMode.DATA until model is built (instead, we use a dummy mode + # during model construction), this is because the random state will be different in different tensor parallel + # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform + # additional random operations during the RowParallelLinear module building process.
+ set_mode(ParallelMode.DUMMY) seeds = get_seeds() seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()]) diff --git a/internlm/core/context/process_group_initializer.py b/internlm/core/context/process_group_initializer.py index fbed8bc..323c08c 100644 --- a/internlm/core/context/process_group_initializer.py +++ b/internlm/core/context/process_group_initializer.py @@ -37,6 +37,9 @@ class ParallelMode(Enum): # expert data parallel EXPERT_DATA = "expert_data" + # dummy mode, only used during model construction + DUMMY = "dummy" + class ProcessGroupInitializer(ABC): """An object, knowing the parallelism configuration, that initializes parallel groups. diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 9c73e61..ab73558 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -12,6 +12,7 @@ from torch.utils.data import ConcatDataset, DataLoader from internlm.core.context import ParallelMode from internlm.core.context import global_context as gpc +from internlm.core.context.random import set_mode from internlm.core.naive_amp import NaiveAMPModel from internlm.core.trainer import TrainState from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader @@ -81,6 +82,10 @@ def initialize_model(): # the same across tensor parallelism. sync_model_param_within_tp(model) + # Change random state mode to ParallelMode.DATA after model is built, guaranteeing the random + # states in the same dp group are all the same. + set_mode(ParallelMode.DATA) + return model @@ -414,7 +419,7 @@ def record_current_batch_training_metrics( "step": batch_count, "lr": lr, "num_consumed_tokens": train_state.num_consumed_tokens, - "loss": loss.item(), + "loss": loss.item() - moe_loss.item(), "flops": tflops, "tgs": tk_per_gpu, "acc": acc_perplex["acc"],