diff --git a/configs/13B_template.py b/configs/13B_template.py
index e0e016c..849c5aa 100644
--- a/configs/13B_template.py
+++ b/configs/13B_template.py
@@ -57,7 +57,7 @@ data = dict(
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
     valid_every=50,
-    pack_sample_into_one=False,
+    pack_sample_into_one=True,
     total_steps=20,
     skip_batches="",
     rampup_batch_size="",
@@ -65,7 +65,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    empty_cache_and_diag_interval=100,
     diag_outlier_ratio=1.1,
 )
 
diff --git a/configs/30B_template.py b/configs/30B_template.py
index 4ac99bf..d19ece6 100644
--- a/configs/30B_template.py
+++ b/configs/30B_template.py
@@ -2,7 +2,7 @@
 DO_ALERT = False
 
 SEQ_LEN = {seq_len}
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
+JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
 HIDDEN_SIZE = 6144
 NUM_ATTENTION_HEAD = 48
 MLP_RATIO = 8 / 3
@@ -57,7 +57,7 @@ data = dict(
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
     valid_every=50,
-    pack_sample_into_one=False,
+    pack_sample_into_one=True,
     total_steps=20,
     skip_batches="",
     rampup_batch_size="",
@@ -65,7 +65,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    empty_cache_and_diag_interval=100,
     diag_outlier_ratio=1.1,
 )
 
diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 4f48265..2d6a3be 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -57,7 +57,7 @@ data = dict(
     # defaults to 0, means disable evaluate
     valid_every=50,
     pack_sample_into_one=True,
-    total_steps=20,
+    total_steps=50,
     skip_batches="",
     rampup_batch_size="",
     # Datasets with less than 50 rows will be discarded
@@ -163,7 +163,7 @@ pipeline parallel (dict):
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
     pipeline=dict(size=1, interleaved_overlap=True),
 )
 
diff --git a/configs/generate.py b/configs/generate.py
index a8a5898..038998c 100644
--- a/configs/generate.py
+++ b/configs/generate.py
@@ -39,22 +39,7 @@ for idx, root_name in enumerate(root_names):
 
             log_name = root_name + "_" + output_file_name[:-3]
 
-            skip = True
-
-            if idx == 0 and i == 4: # 7b, intern_overlap = False
-                skip = False
-            if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True
-                skip = False
-            if idx == 1: # 13b
-                skip = False
-            if idx == 2: # 30b
-                skip = False
-
-            if skip:
-                import time; time.sleep(1)
-                print(f"skip {log_name}", flush=True)
-                continue
-
-            command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
+            print(log_name)
+            command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
             process = subprocess.Popen(command, shell=True, executable='/bin/bash')
             process.wait()
\ No newline at end of file
diff --git a/train.py b/train.py
index ae86728..f419596 100644
--- a/train.py
+++ b/train.py
@@ -309,8 +309,9 @@ def main(args):
 
         if memory_profiler is not None:
             memory_profiler.step()
-
-        prof.step()
+
+        if batch_count % 2 == 0:
+            prof.step()
 
         if gpc.fstp_handler is not None:
             gpc.fstp_handler.clear_memory_pool()