restore 7B_sft

pull/293/head
zaglc 2023-09-14 17:15:08 +08:00
parent 9b1b0c5c20
commit 37dbe6398b
1 changed files with 13 additions and 19 deletions

View File

@ -1,33 +1,28 @@
JOB_NAME = "7b_train" JOB_NAME = "7b_train"
DO_ALERT = False DO_ALERT = False
SEQ_LEN = 256 SEQ_LEN = 2048
HIDDEN_SIZE = 512 HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32 NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3 MLP_RATIO = 8 / 3
NUM_LAYER = 32 NUM_LAYER = 32
VOCAB_SIZE = 103168 VOCAB_SIZE = 103168
MODEL_ONLY_FOLDER = "local:llm_ckpts/20" MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format: # Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX' # fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts" SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/20" LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
# boto3 Ckpt folder format: # boto3 Ckpt folder format:
# import os # import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint # BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" # SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" # LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 20 CHECKPOINT_EVERY = 50
ckpt = dict( ckpt = dict(
enable_save_ckpt=False, # enable ckpt save. enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
load_given_ckpt = False,
# load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
load_optimizer=True, # Whether to load optimizer states when continuing training.
# load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
load_ckpt_folder="local:llm_ckpts/", load_ckpt_folder="local:llm_ckpts/",
# 'load_ckpt_info' setting guide: # 'load_ckpt_info' setting guide:
@ -35,14 +30,13 @@ ckpt = dict(
# 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
# 3. the ckpt_type means the type of checkpoint to be loaded, now only 'normal' type is supported. # 3. the ckpt_type means the type of checkpoint to be loaded, now only 'normal' type is supported.
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
checkpoint_every=CHECKPOINT_EVERY, checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only work for boto3 ckpt) async_upload=True, # async ckpt upload. (only work for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload. async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
) )
TRAIN_FOLDER = "../../train_data"#"/path/to/dataset" TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset" VALID_FOLDER = "/path/to/dataset"
data = dict( data = dict(
seq_len=SEQ_LEN, seq_len=SEQ_LEN,
@ -55,13 +49,15 @@ data = dict(
# defaults to 0, means disable evaluate # defaults to 0, means disable evaluate
valid_every=50, valid_every=50,
pack_sample_into_one=False, pack_sample_into_one=False,
total_steps=30, total_steps=50000,
skip_batches="", skip_batches="",
rampup_batch_size="", rampup_batch_size="",
# Datasets with less than 50 rows will be discarded # Datasets with less than 50 rows will be discarded
min_length=50, min_length=50,
train_folder=TRAIN_FOLDER, # train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER, # valid_folder=VALID_FOLDER,
empty_cache_and_diag_interval=10,
diag_outlier_ratio=1.1,
) )
grad_scaler = dict( grad_scaler = dict(
@ -121,7 +117,7 @@ beta2_scheduler = dict(
) )
model = dict( model = dict(
checkpoint=True, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1] checkpoint=False, # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
num_attention_heads=NUM_ATTENTION_HEAD, num_attention_heads=NUM_ATTENTION_HEAD,
embed_split_hidden=True, embed_split_hidden=True,
vocab_size=VOCAB_SIZE, vocab_size=VOCAB_SIZE,
@ -150,11 +146,9 @@ pipeline parallel (dict):
tensor parallel: tensor parallel size, usually the number of GPUs per node. tensor parallel: tensor parallel size, usually the number of GPUs per node.
""" """
parallel = dict( parallel = dict(
zero1=-1, zero1=8,
pipeline=dict(size=1, interleaved_overlap=True), pipeline=dict(size=1, interleaved_overlap=True),
tensor=1,
sequence_parallel=False, sequence_parallel=False,
use_fsdp=True,
) )
cudnn_deterministic = False cudnn_deterministic = False