diff --git a/configs/13B_template.py b/configs/13B_template.py
index e0e016c..849c5aa 100644
--- a/configs/13B_template.py
+++ b/configs/13B_template.py
@@ -57,7 +57,7 @@ data = dict(
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
     valid_every=50,
-    pack_sample_into_one=False,
+    pack_sample_into_one=True,
     total_steps=20,
     skip_batches="",
     rampup_batch_size="",
@@ -65,7 +65,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    empty_cache_and_diag_interval=100,
     diag_outlier_ratio=1.1,
 )
 
diff --git a/configs/30B_template.py b/configs/30B_template.py
index 4ac99bf..d19ece6 100644
--- a/configs/30B_template.py
+++ b/configs/30B_template.py
@@ -2,7 +2,7 @@
 DO_ALERT = False
 
 SEQ_LEN = {seq_len}
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
+JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
 HIDDEN_SIZE = 6144
 NUM_ATTENTION_HEAD = 48
 MLP_RATIO = 8 / 3
@@ -57,7 +57,7 @@ data = dict(
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
     valid_every=50,
-    pack_sample_into_one=False,
+    pack_sample_into_one=True,
     total_steps=20,
     skip_batches="",
     rampup_batch_size="",
@@ -65,7 +65,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    empty_cache_and_diag_interval=100,
     diag_outlier_ratio=1.1,
 )
 
diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 4f48265..2d6a3be 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -57,7 +57,7 @@ data = dict(
     # defaults to 0, means disable evaluate
     valid_every=50,
     pack_sample_into_one=True,
-    total_steps=20,
+    total_steps=50,
     skip_batches="",
     rampup_batch_size="",
     # Datasets with less than 50 rows will be discarded
@@ -163,7 +163,7 @@ pipeline parallel (dict):
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
     pipeline=dict(size=1, interleaved_overlap=True),
 )
 
diff --git a/configs/generate.py b/configs/generate.py
index a8a5898..038998c 100644
--- a/configs/generate.py
+++ b/configs/generate.py
@@ -39,22 +39,7 @@ for idx, root_name in enumerate(root_names):
 
             log_name = root_name + "_" + output_file_name[:-3]
 
-            skip = True
-
-            if idx == 0 and i == 4: # 7b, intern_overlap = False
-                skip = False
-            if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True
-                skip = False
-            if idx == 1: # 13b
-                skip = False
-            if idx == 2: # 30b
-                skip = False
-
-            if skip:
-                import time; time.sleep(1)
-                print(f"skip {log_name}", flush=True)
-                continue
-
-            command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
+            print(log_name)
+            command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
             process = subprocess.Popen(command, shell=True, executable='/bin/bash')
             process.wait()
\ No newline at end of file
diff --git a/train.py b/train.py
index ae86728..f419596 100644
--- a/train.py
+++ b/train.py
@@ -309,8 +309,9 @@ def main(args):
 
         if memory_profiler is not None:
             memory_profiler.step()
-
-        prof.step()
+
+        if batch_count % 2 == 0:
+            prof.step()
 
         if gpc.fstp_handler is not None:
             gpc.fstp_handler.clear_memory_pool()