diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 047782aa9..bf450534f 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -113,9 +113,6 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
             )
             self.ddp_config["find_unused_parameters"] = True
 
-        if moe_tp_size != 1:
-            raise NotImplementedError
-
         world_size = dist.get_world_size()
         self.moe_dp_size = world_size // (self.pp_size * ep_size * moe_tp_size)
         self.ep_size = ep_size
@@ -182,6 +179,13 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
                 assert self.moe_tp_group is None
                 self.moe_tp_group = group
 
+        if dist.get_process_group_ranks(self.tp_group) != dist.get_process_group_ranks(self.moe_tp_group):
+            # NOTE: different tp settings between moe and non moe param require complex comm logic, where all_to_all might not be suitable
+            # this assertion implies that dp_size == moe_dp_size * ep_size
+            raise NotImplementedError(
+                f"Only support shared tp group between moe and non moe params, but found non-moe tp {dist.get_process_group_ranks(self.tp_group)}, moe tp {dist.get_process_group_ranks(self.moe_tp_group)}, please make sure tp_size == moe_tp_size"
+            )
+
         self.logger.info(
             f"rank {dist.get_rank()} moe_dp_group {dist.get_process_group_ranks(self.moe_dp_group)} ep_group {dist.get_process_group_ranks(self.ep_group)} moe_tp_group {dist.get_process_group_ranks(self.moe_tp_group)}",
             ranks=[0],
diff --git a/colossalai/checkpoint_io/moe_checkpoint.py b/colossalai/checkpoint_io/moe_checkpoint.py
index a0b625008..9181956b7 100644
--- a/colossalai/checkpoint_io/moe_checkpoint.py
+++ b/colossalai/checkpoint_io/moe_checkpoint.py
@@ -151,13 +151,10 @@ class MoECheckpointIO(HybridParallelCheckpointIO):
 
         # ep_rank 0 saves all the parameters and buffers.
         # other ep_ranks save only experts
-        ep_param_pattern = "experts." if self.ep_rank != 0 else None
 
         # Then collect the sharded parameters & buffers along tp_group.
         # Only devices with tp_rank == 0 are responsible for model saving.
-        state_dict_shard = MoECheckpointIO._model_sharder(
-            model, size_per_shard=size_per_shard, param_name_pattern=ep_param_pattern
-        )
+        state_dict_shard = MoECheckpointIO._model_sharder(model, size_per_shard=size_per_shard)
         weights_name, save_index_file = get_model_base_filenames(prefix, use_safetensors)
         index_file = CheckpointIndexFile(checkpoint)
         control_saving = self.tp_rank == 0
diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py
index abec2aa6e..230b40530 100644
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -443,7 +443,7 @@ def all_to_all_uneven(
 
 
 # ===========================================================
-# This code section was modified from 
+# This code section was modified from
 # https://github.com/microsoft/DeepSpeed/blob/3d347276ce80e1a29e777c839d1d7fabe8e5f034/deepspeed/moe/mappings.py
 # Copyright (c) Microsoft Corporation.
 
@@ -492,8 +492,9 @@ def _drop_tokens(input_, dim: int, tp_group: ProcessGroup):
     total_chunks = tp_group.size()
     this_chunk = tp_group.rank()
-    assert input_.shape[
-        dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})"
+    assert (
+        input_.shape[dim] % total_chunks == 0
+    ), f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})"
     chunk_size = input_.shape[dim] // total_chunks
     return torch.narrow(input_, dim, this_chunk * chunk_size, chunk_size)
 
 
@@ -531,15 +532,20 @@ def gather_tokens(input_, dim: int, tp_group: ProcessGroup):
     if tp_group.size() == 1:
         # no tensor parallelism for non-experts
         return input_
-    assert input_.requires_grad, "Input must require grad to assure that backward is executed, otherwise it might hang the program."
-    return _GatherTokens.apply(input_, dim)
+    assert (
+        input_.requires_grad
+    ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
+    return _GatherTokens.apply(input_, dim, tp_group)
 
 
 def drop_tokens(input_, dim: int, tp_group: ProcessGroup):
     if tp_group.size() == 1:
         # no tensor parallelism for non-experts
         return input_
-    assert input_.requires_grad, "Input must require grad to assure that backward is executed, otherwise it might hang the program."
+    assert (
+        input_.requires_grad
+    ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
     return _DropTokens.apply(input_, dim, tp_group)
+
 
 # ===========================================================
diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index 5a42a1073..86ef6c959 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -22,6 +22,7 @@ from colossalai.moe._operation import (
     all_to_all_uneven,
 )
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
 from colossalai.tensor.moe_tensor.api import set_moe_tensor_ep_group
@@ -64,6 +65,11 @@ class EPMixtralSparseMoeBlock(MixtralSparseMoeBlock):
 
         # setup moe tp group
         self.moe_tp_group = moe_tp_group
+        if self.moe_tp_group.size() > 1:
+            for expert in held_experts:
+                expert.w1 = Linear1D_Col.from_native_module(expert.w1, self.moe_tp_group)
+                expert.w3 = Linear1D_Col.from_native_module(expert.w3, self.moe_tp_group)
+                expert.w2 = Linear1D_Row.from_native_module(expert.w2, self.moe_tp_group)
 
     @staticmethod
     def from_native_module(
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 69bcc54ed..4b77a167f 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -76,9 +76,14 @@ class MixtralPolicy(Policy):
                     suffix="self_attn.o_proj",
                     target_module=Linear1D_Row,
                 ),
+                SubModuleReplacementDescription(
+                    suffix="block_sparse_moe.gate", target_module=Linear1D_Col, kwargs={"gather_output": True}
+                ),
             ],
         )
 
+        # TODO shard vocab embedding
+
        if self.shard_config.ep_group:
             # expert parallel
             self.append_or_create_submodule_replacement(
@@ -86,7 +91,12 @@ class MixtralPolicy(Policy):
                 SubModuleReplacementDescription(
                     suffix="block_sparse_moe",
                     target_module=EPMixtralSparseMoeBlock,
-                    kwargs={"ep_group": self.shard_config.ep_group, "tp_group": self.shard_config.tensor_parallel_process_group, "moe_dp_group": self.shard_config.moe_dp_group, "moe_tp_group": self.shard_config.moe_tp_group},
+                    kwargs={
+                        "ep_group": self.shard_config.ep_group,
+                        "tp_group": self.shard_config.tensor_parallel_process_group,
+                        "moe_dp_group": self.shard_config.moe_dp_group,
+                        "moe_tp_group": self.shard_config.moe_tp_group,
+                    },
                 )
             ],
             policy=policy,
diff --git a/colossalai/zero/low_level/bookkeeping/gradient_store.py b/colossalai/zero/low_level/bookkeeping/gradient_store.py
index b84be034a..8b6d403f1 100644
--- a/colossalai/zero/low_level/bookkeeping/gradient_store.py
+++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py
@@ -111,6 +111,7 @@ class GradientStore(BaseStore):
 
     def reset_all_gradients(self):
         self._grads_of_params = dict()
+        self.grad_to_param_mapping = dict()
 
     def get_param_id_for_grad(self, grad: Tensor) -> Optional[int]:
         """Return the id of a parameter which the gradient slice belongs to
diff --git a/tests/test_moe/modelling/test_mixtral.py b/tests/test_moe/modelling/test_mixtral.py
index 26fa81921..8309bfb22 100644
--- a/tests/test_moe/modelling/test_mixtral.py
+++ b/tests/test_moe/modelling/test_mixtral.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 from copy import deepcopy
+from typing import Tuple
 
 import pytest
 import torch
@@ -19,7 +20,7 @@ from tests.test_moe.test_moe_checkpoint import check_model_equal
 NUM_BATCH = 4
 NUM_TOK_PER_BATCH, NUM_EXPERTS = 7, 4
 HIDDEN_SIZE_PER_HEAD = 4
-NUM_HEADS = 2
+NUM_HEADS = 4
 TOP_K = 1
 
@@ -33,9 +34,9 @@ def split_grad(grad, world_size):
     return splited_grad
 
 
-@parameterize("stage", [1])
-@parameterize("ep_size", [1, 2, 4])
-def run_zero_with_original_model(stage: int, ep_size: int, tp_size: int):
+@parameterize("config", [(1, 1, 4), (1, 2, 2), (1, 4, 1)])
+def run_zero_with_original_model(config: Tuple[int, ...]):
+    stage, ep_size, tp_size = config
     dtype = torch.float32
 
     rank = torch.distributed.get_rank()
@@ -43,7 +44,8 @@ def run_zero_with_original_model(stage: int, ep_size: int, tp_size: int):
 
     plugin = MoeHybridParallelPlugin(
         pp_size=1,
-        tp_size=1,
+        tp_size=tp_size,
+        moe_tp_size=tp_size,
         ep_size=ep_size,
         zero_stage=stage,
         overlap_communication=False,
@@ -77,17 +79,16 @@ def run_zero_with_original_model(stage: int, ep_size: int, tp_size: int):
     torch_model.train()
     zero_model.train()
 
-    for _ in range(1):
-        # zero-dp forward
+    for _ in range(2):
         input_data = torch.rand(
             NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
         ).cuda()
+        dist.all_reduce(input_data, group=plugin.tp_group)  # tp requires duplicate input
+
         zero_output = zero_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
-        # zero-dp backward
-        print(zero_output.dtype)
         zero_optimizer.backward(zero_output)
         zero_optimizer.step()
-
+        zero_optimizer.zero_grad()
         dist.all_reduce(zero_output)
 
         all_inputs = [torch.empty_like(input_data) for _ in range(dist.get_world_size())]
@@ -98,28 +99,32 @@ def run_zero_with_original_model(stage: int, ep_size: int, tp_size: int):
             torch_output = torch_model(inputs_embeds=input_data_.to(dtype)).last_hidden_state.mean()
             torch_output.backward()
             torch_output_sum += torch_output.detach()
-
         # avg dp grads
         for p in torch_model.parameters():
             if p.grad is not None:
                 p.grad /= dist.get_world_size()
+        torch_optimizer.step()
+        torch_optimizer.zero_grad()
 
         loose_close(zero_output, torch_output_sum, dtype=dtype)
-        torch_optimizer.step()
 
-        # use checkpoint to load sharded zero model
-        model_dir = "./test_mixtral"
-        if dist.get_rank() == 0:
-            os.makedirs(model_dir, exist_ok=True)
+    # use checkpoint to load sharded zero model
+    model_dir = "./test_mixtral"
+    if dist.get_rank() == 0:
+        os.makedirs(model_dir, exist_ok=True)
 
-        dist.barrier()
-        booster.save_model(zero_model, model_dir, shard=True)
-        dist.barrier()
+    dist.barrier()
 
-        if dist.get_rank() == 0:
-            saved_model = MixtralModel.from_pretrained(model_dir).cuda()
-            check_model_equal(torch_model, saved_model)
-            shutil.rmtree(model_dir)
+    booster.save_model(zero_model, model_dir, shard=True)
+
+    dist.barrier()
+
+    saved_model = MixtralModel.from_pretrained(model_dir).cuda()
+    check_model_equal(torch_model, saved_model)
+
+    dist.barrier()
+    if dist.get_rank() == 0:
+        shutil.rmtree(model_dir)
 
     print(f"{dist.get_rank()} test passed")
 
diff --git a/tests/test_moe/test_moe_ep_tp.py b/tests/test_moe/test_moe_ep_tp.py
index cc5448e51..e944a8c0a 100644
--- a/tests/test_moe/test_moe_ep_tp.py
+++ b/tests/test_moe/test_moe_ep_tp.py
@@ -33,8 +33,8 @@ def split_grad(grad, world_size):
 
 @parameterize("stage", [1])
 @parameterize("ep_size", [1, 2, 4])
-@parameterize("tp_size", [1, 2, 4])
-def run_zero_with_original_model(stage: int, ep_size: int, tp_size: int):
+def run_zero_with_original_model(stage: int, ep_size: int):
+    tp_size = dist.get_world_size() // ep_size
     dtype = torch.bfloat16
 
     rank = torch.distributed.get_rank()
@@ -57,7 +57,13 @@ def run_zero_with_original_model(stage: int, ep_size: int, tp_size: int):
     zero_optimizer = torch.optim.SGD(zero_model.parameters(), lr=1)
     moe_booster = Booster(
         plugin=MoeHybridParallelPlugin(
-            tp_size=tp_size, pp_size=1, ep_size=ep_size, zero_stage=stage, overlap_communication=False, initial_scale=1
+            tp_size=tp_size,
+            moe_tp_size=tp_size,
+            pp_size=1,
+            ep_size=ep_size,
+            zero_stage=stage,
+            overlap_communication=False,
+            initial_scale=1,
         )
     )
     zero_model, zero_optimizer, _, _, _ = moe_booster.boost(zero_model, zero_optimizer)
@@ -100,6 +106,8 @@ def run_zero_with_original_model(stage: int, ep_size: int, tp_size: int):
             if name_to_p[n].grad is None:
                 name_to_p[n].grad = torch.zeros_like(name_to_p[n])
                 continue
+            if zero_grad.shape != name_to_p[n].grad.shape:  # TODO check sharded and sliced moe
+                continue
             loose_close(zero_grad, name_to_p[n].grad, dtype=dtype, name=n)
 
     # zero-dp step
@@ -110,6 +118,8 @@ def run_zero_with_original_model(stage: int, ep_size: int, tp_size: int):
 
     # check updated param
     for n, p in zero_model.named_parameters():
+        if p.data.shape != name_to_p[n].data.shape:  # TODO check sharded and sliced moe
+            continue
         loose_close(p.data, name_to_p[n].data, dtype=dtype, name=n)
 
     print(f"{dist.get_rank()} test passed")
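
Usage sketch (illustrative, not part of the patch above). The updated tests pin moe_tp_size to tp_size, which matches the new runtime check in MoeHybridParallelPlugin. The helper name build_moe_booster and the chosen sizes below are hypothetical; the constructor arguments are the ones exercised in the diff, and an initialized default process group (e.g. via colossalai.launch) is assumed.

import torch.distributed as dist

from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin


def build_moe_booster(ep_size: int, tp_size: int, zero_stage: int = 1) -> Booster:
    # The plugin now requires the MoE tp group and the regular tp group to hold
    # the same ranks, so moe_tp_size is kept identical to tp_size.
    plugin = MoeHybridParallelPlugin(
        pp_size=1,
        tp_size=tp_size,
        moe_tp_size=tp_size,
        ep_size=ep_size,
        zero_stage=zero_stage,
        overlap_communication=False,
        initial_scale=1,
    )
    # moe_dp_size = world_size // (pp_size * ep_size * moe_tp_size), so with
    # pp_size == 1 the world size must be divisible by ep_size * tp_size.
    assert dist.get_world_size() % (ep_size * tp_size) == 0
    return Booster(plugin=plugin)

With zero_stage=1 and overlap_communication=False this mirrors the configurations used by the revised tests; boosting a model and optimizer then proceeds exactly as in test_mixtral.py.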