diff --git a/.gitignore b/.gitignore
index 055e7ad..8992a0f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -132,7 +132,6 @@ runs_bak/
 LLM_ALERT
 small_demo/
 7b_llama_nopp/
-test/
 
 # Pytorch
 *.pth
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8a43efd..19cd7c8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -50,4 +50,4 @@ repos:
           [
             '--rcfile=.pylintrc',
             '--disable=C0114,C0415,W0212,W0235,W0238,W0621,C0103,R1735,C2801,E0402,C0412,W0719,R1728,W1514,W0718,W0105,W0707,C0209,W0703,W1203'
-          ]
+          ]
\ No newline at end of file
diff --git a/internlm/utils/evaluation.py b/internlm/utils/evaluation.py
index d60ebd2..6128249 100644
--- a/internlm/utils/evaluation.py
+++ b/internlm/utils/evaluation.py
@@ -112,7 +112,7 @@ def evaluate_on_val_dls(
                     tensor_shape=tensor_shape,
                     metric_hook_list=[val_sche_metric_hook],
                 ):
-                    _, _, loss, _ = trainer.execute_schedule(
+                    _, _, loss, moe_loss = trainer.execute_schedule(
                         batch, forward_only=True, return_loss=True, return_output_label=False
                     )
             else:
@@ -126,11 +126,11 @@ def evaluate_on_val_dls(
                     grad_accum_batch_size=grad_accum_batch_size,
                     metric_hook_list=[val_sche_metric_hook],
                 ):
-                    _, _, loss, _ = trainer.execute_schedule(
+                    _, _, loss, moe_loss = trainer.execute_schedule(
                         batch, forward_only=True, return_loss=True, return_output_label=False
                     )
             if verbose:
-                val_loss += loss.item()
+                val_loss += loss.item() - moe_loss.item()

     assert val_idx != -1
     dist.barrier()
diff --git a/tests/test_training/test_loss.py b/tests/test_training/test_loss.py
index 2f52500..864fc24 100644
--- a/tests/test_training/test_loss.py
+++ b/tests/test_training/test_loss.py
@@ -186,11 +186,11 @@ def train(
         # do forward and backward
         timer("fwd-bwd").start()

-        _, _, loss = trainer.execute_schedule(batch, forward_only=False, return_loss=True, return_output_label=False)
+        _, _, loss, moe_loss = trainer.execute_schedule(batch, forward_only=False, return_loss=True, return_output_label=False)
         if gpc.is_rank_for_log():
            assert loss is not None and not math.isnan(loss.item())
            global cur_loss_list
-           cur_loss_list.append(loss.item())
+           cur_loss_list.append(loss.item() - moe_loss.item())
        timer("fwd-bwd").stop()

        # update parameters, and returns (success_update, grad_norm)
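Note on the loss hunks: trainer.execute_schedule now returns a 4-tuple whose last element is the auxiliary MoE balancing loss, and both the validation loop and the loss test subtract it from the scheduled loss before reporting, so the logged metric is the pure language-model loss rather than loss plus the MoE penalty. A minimal sketch of that reporting logic, assuming only the 4-tuple return shape shown in the diff; FakeTrainer and its constant losses are hypothetical stand-ins, not InternLM APIs.

# Sketch of the reporting change: subtract the auxiliary MoE loss from the
# scheduled loss so only the language-model loss is accumulated.
# FakeTrainer is a hypothetical stand-in for the real trainer.
import torch


class FakeTrainer:
    """Returns (output, label, loss, moe_loss), mirroring the new 4-tuple."""

    def execute_schedule(self, batch, forward_only=True, return_loss=True, return_output_label=False):
        lm_loss = torch.tensor(2.5)   # cross-entropy term (illustrative value)
        moe_loss = torch.tensor(0.1)  # load-balancing penalty folded into loss
        return None, None, lm_loss + moe_loss, moe_loss


trainer = FakeTrainer()
val_loss = 0.0
num_batches = 4
for batch in range(num_batches):
    _, _, loss, moe_loss = trainer.execute_schedule(batch, forward_only=True, return_loss=True)
    val_loss += loss.item() - moe_loss.item()  # mirrors the evaluation.py hunk
print(val_loss / num_batches)  # 2.5: the MoE penalty is excluded from the metric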