diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 51d2e9c..5e3e0c9 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -5,7 +5,7 @@ SEQ_LEN = 2048
 HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
 MLP_RATIO = 8 / 3
-NUM_LAYER = 4
+NUM_LAYER = 32
 VOCAB_SIZE = 103168
 
 MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
index 0db99ad..8ac8c58 100644
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@@ -372,7 +372,6 @@ class PackedFlashInternLm1D(nn.Module):
 
     def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None):
         # attention_mask: compute attention on the places where the value is 1
-        import pdb; pdb.set_trace()
         if hasattr(self, "embedding"):
             hidden_states = self.embedding(input_ids)
             if self.embed_grad_scale != 1:
diff --git a/internlm/model/multi_head_attention.py b/internlm/model/multi_head_attention.py
index abb9f19..e6d0a29 100644
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@@ -115,14 +115,14 @@ class DistributedAttention(torch.nn.Module):
         """
         # TODO Merge three alltoall calls into one
         #in shape : e.g., [s/p:h:]
-        qkv = _SeqAllToAll.apply(self.spg, qkv, self.scatter_idx, self.gather_idx)
+        qkv = _SeqAllToAll.apply(self.spg, qkv, 2, 0)
         # key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx)
         # value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx)
 
         #out shape : e.g., [s:h/p:]
         context_layer = self.local_attn(qkv, **kwargs)
 
-        output = _SeqAllToAll.apply(self.spg, context_layer, 0, 2)
+        output = _SeqAllToAll.apply(self.spg, context_layer, 0, 1)
 
         #out e.g., [s/p::h]
         return output
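
Note on the `multi_head_attention.py` hunk: the second `_SeqAllToAll` call now gathers along dim 1 instead of dim 2 because `context_layer` has one fewer dimension than the packed `qkv` (the size-3 q/k/v packing dim is gone after attention), so the head dimension moves from index 2 to index 1. Below is a minimal shape sketch of that reasoning; it is not InternLM code, and the exact layouts and sizes are assumptions inferred from the in/out shape comments in the hunk.

```python
# Minimal, self-contained shape sketch (NOT InternLM code). The layouts
# [s/p, 3, h, d] for packed qkv and [s, h/p, d] for the attention output,
# as well as the sizes below, are assumptions.

def seq_alltoall_shape(shape, scatter_idx, gather_idx, world_size):
    """Shape effect of a sequence-parallel all-to-all: the scatter dim is
    split across ranks while the gather dim is reassembled from all ranks."""
    out = list(shape)
    assert out[scatter_idx] % world_size == 0, "scatter dim must divide evenly"
    out[scatter_idx] //= world_size
    out[gather_idx] *= world_size
    return tuple(out)

P = 4                      # sequence-parallel group size (assumed)
S, H, D = 2048, 32, 128    # full sequence length, heads, head dim (assumed)

# Packed qkv enters as [s/p, 3, h, d]; scatter heads (dim 2), gather sequence (dim 0).
print(seq_alltoall_shape((S // P, 3, H, D), scatter_idx=2, gather_idx=0, world_size=P))
# -> (2048, 3, 8, 128): full sequence, local shard of heads

# The attention output has no packing dim, so heads sit at dim 1, not dim 2:
# scatter sequence (dim 0), gather heads (dim 1).
print(seq_alltoall_shape((S, H // P, D), scatter_idx=0, gather_idx=1, world_size=P))
# -> (512, 32, 128): back to the local sequence shard with all heads
```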