From 949431f228cdf0dbfdcd0909b905cb6075517eb6 Mon Sep 17 00:00:00 2001
From: yingtongxiong <974106207@qq.com>
Date: Mon, 9 Oct 2023 18:06:22 +0800
Subject: [PATCH] modify the config

---
 configs/7B_sft.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 3e1d078..dd4104a 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -55,7 +55,7 @@ data = dict(
     # defaults to the value of micro_num
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
-    valid_every=10,
+    valid_every=50,
     pack_sample_into_one=False,
     total_steps=50000,
     skip_batches="",
@@ -64,7 +64,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
+    empty_cache_and_diag_interval=10,
     diag_outlier_ratio=1.1,
 )
 
@@ -135,7 +135,7 @@ model = dict(
     num_layers=NUM_LAYER,
     mlp_ratio=MLP_RATIO,
     apply_post_layer_norm=False,
-    dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
     use_flash_attn=True,
@@ -155,9 +155,9 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
     zero1=-1,
-    tensor=dict(size=2, mode='origin_tp'),  # the mode should be 'origin_tp' or 'fstp'
+    tensor=dict(size=2, mode='origin_tp'),  # the mode should be 'origin_tp' or 'fstp'; if the mode is 'fstp', sequence_parallel must be True
     pipeline=dict(size=1, interleaved_overlap=True),
-    sequence_parallel=True,
+    sequence_parallel=False,
 )
 
 cudnn_deterministic = False
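
For reference, a minimal sketch of the consistency rule stated in the tensor-parallel comment above: the 'fstp' mode requires sequence_parallel=True. The check_parallel_config helper is hypothetical (it is not part of this patch or of the InternLM codebase) and is shown only to make the constraint concrete.

# Hypothetical sanity check (not part of this patch or the InternLM codebase)
# for the constraint noted in the diff comment: the 'fstp' tensor-parallel
# mode requires sequence_parallel=True.
def check_parallel_config(parallel: dict) -> None:
    tensor = parallel.get("tensor", {})
    mode = tensor.get("mode", "origin_tp")
    if mode not in ("origin_tp", "fstp"):
        raise ValueError(f"unknown tensor parallel mode: {mode!r}")
    if mode == "fstp" and not parallel.get("sequence_parallel", False):
        raise ValueError("tensor parallel mode 'fstp' requires sequence_parallel=True")

# The config as patched passes the check, since the mode stays 'origin_tp':
check_parallel_config(
    dict(
        zero1=-1,
        tensor=dict(size=2, mode="origin_tp"),
        pipeline=dict(size=1, interleaved_overlap=True),
        sequence_parallel=False,
    )
)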