From 37dbe6398bd0c5f0bda3c24cc6ad70f86c70c3d0 Mon Sep 17 00:00:00 2001
From: zaglc
Date: Thu, 14 Sep 2023 17:15:08 +0800
Subject: [PATCH] restore 7B_sft

---
 configs/7B_sft.py | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 8cb1e04..eb9ef92 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -1,33 +1,28 @@
 JOB_NAME = "7b_train"
 DO_ALERT = False
 
-SEQ_LEN = 256
-HIDDEN_SIZE = 512
+SEQ_LEN = 2048
+HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
 MLP_RATIO = 8 / 3
 NUM_LAYER = 32
 VOCAB_SIZE = 103168
 
-MODEL_ONLY_FOLDER = "local:llm_ckpts/20"
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
 # Ckpt folder format:
 # fs: 'local:/mnt/nfs/XXX'
 SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/20"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
 
 # boto3 Ckpt folder format:
 # import os
 # BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
 # SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
 # LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 20
+CHECKPOINT_EVERY = 50
 ckpt = dict(
     enable_save_ckpt=False,  # enable ckpt save.
     save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
-    load_given_ckpt = False,
-    # load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
-    load_optimizer=True,  # Wheter to load optimizer states when continuing training.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
     load_ckpt_folder="local:llm_ckpts/",
 
     # 'load_ckpt_info' setting guide:
@@ -35,14 +30,13 @@ ckpt = dict(
     # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
     # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported.
     load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    checkpoint_every=CHECKPOINT_EVERY,
     async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
     async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporarily files during asynchronous upload.
     oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
 )
 
 
-TRAIN_FOLDER = "../../train_data"#"/path/to/dataset"
+TRAIN_FOLDER = "/path/to/dataset"
 VALID_FOLDER = "/path/to/dataset"
 data = dict(
     seq_len=SEQ_LEN,
@@ -55,13 +49,15 @@ data = dict(
     # defaults to 0, means disable evaluate
     valid_every=50,
     pack_sample_into_one=False,
-    total_steps=30,
+    total_steps=50000,
     skip_batches="",
     rampup_batch_size="",
     # Datasets with less than 50 rows will be discarded
     min_length=50,
-    train_folder=TRAIN_FOLDER,
+    # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
 )
 
 grad_scaler = dict(
@@ -121,7 +117,7 @@ beta2_scheduler = dict(
 )
 
 model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing, the optional value are True/False/[0-1]
+    checkpoint=False,  # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
     num_attention_heads=NUM_ATTENTION_HEAD,
     embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
@@ -150,11 +146,9 @@ pipeline parallel (dict):
 tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
-    zero1=-1,
+    zero1=8,
     pipeline=dict(size=1, interleaved_overlap=True),
-    tensor=1,
     sequence_parallel=False,
-    use_fsdp=True,
 )
 
 cudnn_deterministic = False
@@ -167,4 +161,4 @@ monitor = dict(
         feishu_alert_address=None,  # feishu webhook to send alert message
         light_monitor_address=None,  # light_monitor address to send heartbeat
     ),
-)
+)
\ No newline at end of file