modify the config

pull/407/head
yingtongxiong 2023-10-09 18:06:22 +08:00
parent 21c1a7fa47
commit 949431f228
1 changed file with 5 additions and 5 deletions

@@ -55,7 +55,7 @@ data = dict(
     # defaults to the value of micro_num
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
-    valid_every=10,
+    valid_every=50,
     pack_sample_into_one=False,
     total_steps=50000,
     skip_batches="",
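Side note on the changed value: valid_every is a step interval, so 50 means evaluation runs every 50 training steps, while 0 (per the comment above) disables evaluation entirely. A minimal sketch of that trigger logic, using a hypothetical should_validate helper rather than the repo's actual code:

def should_validate(step: int, valid_every: int) -> bool:
    # 0 disables evaluation (see the config comment); otherwise validate
    # whenever the step count hits a multiple of valid_every.
    return valid_every > 0 and step % valid_every == 0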
@@ -64,7 +64,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
+    empty_cache_and_diag_interval=10,
     diag_outlier_ratio=1.1,
 )
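empty_cache_and_diag_interval controls how often the trainer clears the CUDA allocator cache and runs the outlier diagnosis tied to diag_outlier_ratio; lowering it from 100 to 10 makes that housekeeping run ten times as often. A rough sketch of how such an interval is typically consumed (the helper name and loop placement are assumptions, not InternLM's implementation):

import torch

def maybe_empty_cache_and_diag(step: int, interval: int) -> None:
    # Run periodic housekeeping every `interval` steps; 0 would disable it.
    if interval > 0 and step % interval == 0:
        torch.cuda.empty_cache()  # return cached GPU blocks to the driver
        # per-rank outlier diagnosis (e.g. against diag_outlier_ratio) would go here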
@@ -135,7 +135,7 @@ model = dict(
     num_layers=NUM_LAYER,
     mlp_ratio=MLP_RATIO,
     apply_post_layer_norm=False,
-    dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
     use_flash_attn=True,
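dtype is stored as a string and switches here from "torch.float16" to "torch.bfloat16". A hedged sketch of how such a string can be resolved to a real torch dtype (illustrative only, not the repo's parsing code; "torch.tf32" is treated as a precision flag because tf32 is not a tensor dtype):

import torch

def resolve_dtype(name: str) -> torch.dtype:
    if name == "torch.tf32":
        # tf32 is a matmul/cuDNN precision mode, not a tensor dtype.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
        return torch.float32
    return getattr(torch, name.split(".")[-1])  # e.g. "torch.bfloat16" -> torch.bfloat16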
@@ -155,9 +155,9 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
     zero1=-1,
-    tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'
+    tensor=dict(size=2, mode='origin_tp'), # the mode should be 'origin_tp' or 'fstp'. If the mode is 'fstp', sequence_parallel should be True
     pipeline=dict(size=1, interleaved_overlap=True),
-    sequence_parallel=True,
+    sequence_parallel=False,
 )
 cudnn_deterministic = False
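The comment added to the tensor line couples mode 'fstp' to sequence_parallel=True, which is why this commit also flips sequence_parallel to False while keeping mode='origin_tp'. For illustration only, the two self-consistent combinations implied by that comment look like this (values copied from the diff; not an additional change):

# classic tensor parallelism, as configured by this commit
parallel = dict(
    zero1=-1,
    tensor=dict(size=2, mode='origin_tp'),
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)

# fstp variant, which per the comment requires sequence parallelism
# parallel = dict(
#     zero1=-1,
#     tensor=dict(size=2, mode='fstp'),
#     pipeline=dict(size=1, interleaved_overlap=True),
#     sequence_parallel=True,
# )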