From 37dbe6398bd0c5f0bda3c24cc6ad70f86c70c3d0 Mon Sep 17 00:00:00 2001
From: zaglc
Date: Thu, 14 Sep 2023 17:15:08 +0800
Subject: [PATCH] restore 7B_sft

---
 configs/7B_sft.py | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 8cb1e04..eb9ef92 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -1,33 +1,28 @@
 JOB_NAME = "7b_train"
 DO_ALERT = False
 
-SEQ_LEN = 256
-HIDDEN_SIZE = 512
+SEQ_LEN = 2048
+HIDDEN_SIZE = 4096
 NUM_ATTENTION_HEAD = 32
 MLP_RATIO = 8 / 3
 NUM_LAYER = 32
 VOCAB_SIZE = 103168
 
-MODEL_ONLY_FOLDER = "local:llm_ckpts/20"
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
 # Ckpt folder format:
 # fs: 'local:/mnt/nfs/XXX'
 SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/20"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
 
 # boto3 Ckpt folder format:
 # import os
 # BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
 # SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
 # LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 20
+CHECKPOINT_EVERY = 50
 ckpt = dict(
     enable_save_ckpt=False,  # enable ckpt save.
     save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
-    load_given_ckpt = False,
-    # load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
-    load_optimizer=True,  # Wheter to load optimizer states when continuing training.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
     load_ckpt_folder="local:llm_ckpts/",
 
     # 'load_ckpt_info' setting guide:
@@ -35,14 +30,13 @@ ckpt = dict(
     # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
     # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported.
     load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    checkpoint_every=CHECKPOINT_EVERY,
     async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
     async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporarily files during asynchronous upload.
     oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
 )
 
 
-TRAIN_FOLDER = "../../train_data"#"/path/to/dataset"
+TRAIN_FOLDER = "/path/to/dataset"
 VALID_FOLDER = "/path/to/dataset"
 data = dict(
     seq_len=SEQ_LEN,
@@ -55,13 +49,15 @@ data = dict(
     # defaults to 0, means disable evaluate
     valid_every=50,
     pack_sample_into_one=False,
-    total_steps=30,
+    total_steps=50000,
     skip_batches="",
     rampup_batch_size="",
     # Datasets with less than 50 rows will be discarded
     min_length=50,
-    train_folder=TRAIN_FOLDER,
+    # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
 )
 
 grad_scaler = dict(
@@ -121,7 +117,7 @@ beta2_scheduler = dict(
 )
 
 model = dict(
-    checkpoint=True,  # The proportion of layers for activation checkpointing, the optional value are True/False/[0-1]
+    checkpoint=False,  # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
     num_attention_heads=NUM_ATTENTION_HEAD,
     embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
@@ -150,11 +146,9 @@ pipeline parallel (dict):
 tensor parallel: tensor parallel size, usually the number of GPUs per node.
 """
 parallel = dict(
-    zero1=-1,
+    zero1=8,
     pipeline=dict(size=1, interleaved_overlap=True),
-    tensor=1,
     sequence_parallel=False,
-    use_fsdp=True,
 )
 
 cudnn_deterministic = False
@@ -167,4 +161,4 @@ monitor = dict(
         feishu_alert_address=None,  # feishu webhook to send alert message
         light_monitor_address=None,  # light_monitor address to send heartbeat
     ),
-)
+)
\ No newline at end of file