modify the config

pull/456/head
yingtongxiong 2023-10-26 17:41:17 +08:00
parent cc20fa271a
commit d831ddcc1d
5 changed files with 12 additions and 26 deletions

File 1 of 5

@@ -57,7 +57,7 @@ data = dict(
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
     valid_every=50,
-    pack_sample_into_one=False,
+    pack_sample_into_one=True,
     total_steps=20,
     skip_batches="",
     rampup_batch_size="",
@@ -65,7 +65,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    empty_cache_and_diag_interval=100,
     diag_outlier_ratio=1.1,
 )
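For context, a minimal sketch (not part of the diff) of how the changed fields in the data dict read after this commit; the comments paraphrase my assumed reading of each field, not the repository's documentation:

# Sketch of the post-change values; surrounding keys are assumed unchanged.
data = dict(
    valid_micro_num=4,
    valid_every=50,                     # evaluate every 50 steps (0 disables evaluation)
    pack_sample_into_one=True,          # changed from False: pack short samples into one seq_len-long sequence (assumed semantics)
    total_steps=20,
    empty_cache_and_diag_interval=100,  # changed from 10: cache emptying / diagnostics now run 10x less often (assumed semantics)
    diag_outlier_ratio=1.1,
)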

File 2 of 5

@@ -2,7 +2,7 @@
 DO_ALERT = False
 SEQ_LEN = {seq_len}
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
+JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
 HIDDEN_SIZE = 6144
 NUM_ATTENTION_HEAD = 48
 MLP_RATIO = 8 / 3
@@ -57,7 +57,7 @@ data = dict(
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
     valid_every=50,
-    pack_sample_into_one=False,
+    pack_sample_into_one=True,
     total_steps=20,
     skip_batches="",
     rampup_batch_size="",
@@ -65,7 +65,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    empty_cache_and_diag_interval=100,
     diag_outlier_ratio=1.1,
 )

File 3 of 5

@@ -57,7 +57,7 @@ data = dict(
     # defaults to 0, means disable evaluate
     valid_every=50,
     pack_sample_into_one=True,
-    total_steps=20,
+    total_steps=50,
     skip_batches="",
     rampup_batch_size="",
     # Datasets with less than 50 rows will be discarded
@@ -163,7 +163,7 @@ pipeline parallel (dict):
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
     pipeline=dict(size=1, interleaved_overlap=True),
 )
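For reference, a minimal sketch (not part of the diff) of the parallel section after this commit; the comments on sp and intern_overlap reflect my assumed reading of these options:

# Sketch of the post-change parallel config; other keys are assumed unchanged.
parallel = dict(
    zero1=dict(size=-1, fsdp=False),        # size=-1: ZeRO-1 group spans the full data-parallel group (assumed), FSDP off
    # sp switches from "none" to the "intern" sequence-parallel mode, and intern_overlap=True
    # is assumed to enable overlapping its communication with computation.
    tensor=dict(size=8, sp="intern", intern_overlap=True),
    pipeline=dict(size=1, interleaved_overlap=True),   # pipeline parallelism effectively disabled (size=1)
)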

File 4 of 5

@@ -39,22 +39,7 @@ for idx, root_name in enumerate(root_names):
     log_name = root_name + "_" + output_file_name[:-3]
-    skip = True
-    if idx == 0 and i == 4: # 7b, intern_overlap = False
-        skip = False
-    if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True
-        skip = False
-    if idx == 1: # 13b
-        skip = False
-    if idx == 2: # 30b
-        skip = False
-    if skip:
-        import time; time.sleep(1)
-        print(f"skip {log_name}", flush=True)
-        continue
-    command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
+    print(log_name)
+    command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
     process = subprocess.Popen(command, shell=True, executable='/bin/bash')
     process.wait()
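With the skip logic removed, every generated config is launched. A self-contained sketch of the resulting sweep loop, with hypothetical root_names and config paths standing in for the values built earlier in the script:

import subprocess

root_names = ["7b", "13b", "30b"]          # hypothetical model-size prefixes
config_files = ["configs/7B_sweep_0.py"]   # hypothetical generated config paths

for root_name in root_names:
    for write_file in config_files:
        output_file_name = write_file.split("/")[-1]
        log_name = root_name + "_" + output_file_name[:-3]   # drop the ".py" suffix
        print(log_name)
        command = (
            f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 "
            f"python train.py --config {write_file} --profiling "
            f"2>&1 | tee ./fstp_logs/{log_name}.log"
        )
        # Launch the job through bash and wait for it, so runs execute one at a time.
        process = subprocess.Popen(command, shell=True, executable="/bin/bash")
        process.wait()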

File 5 of 5

@@ -309,8 +309,9 @@ def main(args):
         if memory_profiler is not None:
             memory_profiler.step()
-        prof.step()
+        if batch_count % 2 == 0:
+            prof.step()
         if gpc.fstp_handler is not None:
             gpc.fstp_handler.clear_memory_pool()
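For context, a self-contained sketch (not the actual train.py) of what stepping the profiler on every second batch looks like; the diff does not say which profiler prof is, so torch.profiler and the handler below are assumptions standing in for what main() sets up:

import torch

# Hypothetical stand-ins for objects created earlier in main().
prof = torch.profiler.profile(schedule=torch.profiler.schedule(wait=1, warmup=1, active=2))
memory_profiler = None
fstp_handler = None

with prof:
    for batch_count in range(8):
        # ... forward / backward / optimizer step ...
        if memory_profiler is not None:
            memory_profiler.step()
        # Advance the profiler only on every second batch instead of on every batch.
        if batch_count % 2 == 0:
            prof.step()
        if fstp_handler is not None:
            fstp_handler.clear_memory_pool()  # assumed: frees the handler's pooled buffers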