From 949431f228cdf0dbfdcd0909b905cb6075517eb6 Mon Sep 17 00:00:00 2001
From: yingtongxiong <974106207@qq.com>
Date: Mon, 9 Oct 2023 18:06:22 +0800
Subject: [PATCH] modify the config

---
 configs/7B_sft.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 3e1d078..dd4104a 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -55,7 +55,7 @@ data = dict(
     # defaults to the value of micro_num
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
-    valid_every=10,
+    valid_every=50,
     pack_sample_into_one=False,
     total_steps=50000,
     skip_batches="",
@@ -64,7 +64,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=100,
+    empty_cache_and_diag_interval=10,
     diag_outlier_ratio=1.1,
 )
 
@@ -135,7 +135,7 @@ model = dict(
     num_layers=NUM_LAYER,
     mlp_ratio=MLP_RATIO,
     apply_post_layer_norm=False,
-    dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
     norm_type="rmsnorm",
     layer_norm_epsilon=1e-5,
     use_flash_attn=True,
@@ -155,9 +155,9 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
     zero1=-1,
-    tensor=dict(size=2, mode='origin_tp'),  # the mode should be 'origin_tp' or 'fstp'
+    tensor=dict(size=2, mode='origin_tp'),  # the mode should be 'origin_tp' or 'fstp'; if the mode is 'fstp', sequence_parallel must be True
     pipeline=dict(size=1, interleaved_overlap=True),
-    sequence_parallel=True,
+    sequence_parallel=False,
 )
 
 cudnn_deterministic = False
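
For reference, a minimal sketch of the consistency rule stated in the tensor-parallel comment above: the 'fstp' mode requires sequence_parallel=True. The check_parallel_config helper is hypothetical (it is not part of this patch or of the InternLM codebase) and is shown only to make the constraint concrete.

# Hypothetical sanity check (not part of this patch or the InternLM codebase)
# for the constraint noted in the diff comment: the 'fstp' tensor-parallel
# mode requires sequence_parallel=True.
def check_parallel_config(parallel: dict) -> None:
    tensor = parallel.get("tensor", {})
    mode = tensor.get("mode", "origin_tp")
    if mode not in ("origin_tp", "fstp"):
        raise ValueError(f"unknown tensor parallel mode: {mode!r}")
    if mode == "fstp" and not parallel.get("sequence_parallel", False):
        raise ValueError("tensor parallel mode 'fstp' requires sequence_parallel=True")

# The config as patched passes the check, since the mode stays 'origin_tp':
check_parallel_config(
    dict(
        zero1=-1,
        tensor=dict(size=2, mode="origin_tp"),
        pipeline=dict(size=1, interleaved_overlap=True),
        sequence_parallel=False,
    )
)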