mirror of https://github.com/InternLM/InternLM

modify the config

parent cc20fa271a
commit d831ddcc1d
@@ -57,7 +57,7 @@ data = dict(
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
     valid_every=50,
-    pack_sample_into_one=False,
+    pack_sample_into_one=True,
     total_steps=20,
     skip_batches="",
     rampup_batch_size="",
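For context (my reading of the option, not stated in the diff): pack_sample_into_one controls how tokenized samples are packed into fixed-length training sequences (SEQ_LEN). A minimal sketch of the packing idea, with all names hypothetical; per-sample boundaries are kept so attention could be masked per sample when packing into one is off:

from typing import Iterable, List, Tuple

def pack_samples(samples: Iterable[List[int]], seq_len: int) -> List[Tuple[List[int], List[int]]]:
    """Concatenate token lists into seq_len-sized buffers.

    Returns (tokens, boundaries) pairs; with pack_sample_into_one=True the
    boundaries would simply be ignored and each buffer treated as one sample.
    """
    packs, buf, bounds = [], [], [0]
    for tokens in samples:
        buf.extend(tokens)
        bounds.append(len(buf))
        while len(buf) >= seq_len:
            packs.append((buf[:seq_len], [b for b in bounds if b <= seq_len]))
            buf = buf[seq_len:]
            bounds = [0] + [b - seq_len for b in bounds if b > seq_len]
    return packs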
@@ -65,7 +65,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    empty_cache_and_diag_interval=100,
     diag_outlier_ratio=1.1,
 )

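Raising empty_cache_and_diag_interval from 10 to 100 makes the periodic allocator flush and performance diagnosis run a tenth as often. The usual shape of such a hook, as a sketch (not InternLM's exact code; the diagnosis step is only indicated):

import torch

def maybe_empty_cache_and_diag(step: int, interval: int = 100) -> None:
    # torch.cuda.empty_cache() is expensive and synchronizes the allocator,
    # so it is only run every `interval` steps.
    if interval > 0 and step % interval == 0:
        torch.cuda.empty_cache()
        # ...run step-time outlier diagnosis here (tuned by diag_outlier_ratio)...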
@@ -2,7 +2,7 @@
 DO_ALERT = False

 SEQ_LEN = {seq_len}
-JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
+JOB_NAME = "30b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
 HIDDEN_SIZE = 6144
 NUM_ATTENTION_HEAD = 48
 MLP_RATIO = 8 / 3
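The {seq_len}, {sp}, {intern_overlap}, {checkpoint} placeholders suggest this file is a template that the sweep script instantiates with str.format before training reads it. A guess at that step (path and values are illustrative; only write_file appears in the diff further down):

template = open("configs/30B_template.py").read()  # hypothetical template path
config_text = template.format(
    seq_len=4096, sp='"intern"', intern_overlap=True, checkpoint=False
)
with open(write_file, "w") as f:  # write_file is the --config later passed to train.py
    f.write(config_text)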
@@ -57,7 +57,7 @@ data = dict(
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
     valid_every=50,
-    pack_sample_into_one=False,
+    pack_sample_into_one=True,
     total_steps=20,
     skip_batches="",
     rampup_batch_size="",
@@ -65,7 +65,7 @@ data = dict(
     min_length=50,
     # train_folder=TRAIN_FOLDER,
     # valid_folder=VALID_FOLDER,
-    empty_cache_and_diag_interval=10,
+    empty_cache_and_diag_interval=100,
     diag_outlier_ratio=1.1,
 )

@@ -57,7 +57,7 @@ data = dict(
     # defaults to 0, means disable evaluate
     valid_every=50,
     pack_sample_into_one=True,
-    total_steps=20,
+    total_steps=50,
     skip_batches="",
     rampup_batch_size="",
     # Datasets with less than 50 rows will be discarded
@@ -163,7 +163,7 @@ pipeline parallel (dict):
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
+    tensor=dict(size=8, sp="intern", intern_overlap=True),
     pipeline=dict(size=1, interleaved_overlap=True),
 )

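This is the substantive switch of the commit: sp="intern" selects InternLM's own sequence-parallel mode and intern_overlap=True overlaps its communication with computation (the gpc.fstp_handler touched in the train.py hunk below belongs to that machinery). The resulting block, annotated (annotations are my reading, not from the diff):

parallel = dict(
    zero1=dict(size=-1, fsdp=False),      # -1: ZeRO-1 group spans the whole data-parallel world
    tensor=dict(
        size=8,                           # 8-way tensor parallelism
        sp="intern",                      # InternLM-style sequence parallelism
        intern_overlap=True,              # overlap its communication with compute
    ),
    pipeline=dict(size=1, interleaved_overlap=True),  # pipeline parallelism effectively off (size=1)
)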
@@ -39,22 +39,7 @@ for idx, root_name in enumerate(root_names):

         log_name = root_name + "_" + output_file_name[:-3]

-        skip = True
-        if idx == 0 and i == 4: # 7b, intern_overlap = False
-            skip = False
-        if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True
-            skip = False
-        if idx == 1: # 13b
-            skip = False
-        if idx == 2: # 30b
-            skip = False
-
-        if skip:
-            import time; time.sleep(1)
-            print(f"skip {log_name}", flush=True)
-            continue
-
-        command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
+        print(log_name)
+        command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
         process = subprocess.Popen(command, shell=True, executable='/bin/bash')
         process.wait()
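Net effect of the hunk: the hard-coded skip list is gone, so every generated config is launched, and the per-job srun time limit rises from 20 to 30 minutes. The launch pattern itself, extracted as a runnable sketch (partition, node counts and paths copied from the diff; run_job is my name for it):

import subprocess

def run_job(write_file: str, log_name: str, minutes: int = 30) -> int:
    """Launch one training run under srun and block until it exits."""
    command = (
        f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 "
        f"--time={minutes} python train.py --config {write_file} --profiling "
        f"2>&1 | tee ./fstp_logs/{log_name}.log"
    )
    process = subprocess.Popen(command, shell=True, executable='/bin/bash')
    return process.wait()  # serial sweep: the next job starts only after this one ends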
train.py

@@ -309,8 +309,9 @@ def main(args):

         if memory_profiler is not None:
             memory_profiler.step()

-        prof.step()
+        if batch_count % 2 == 0:
+            prof.step()

         if gpc.fstp_handler is not None:
             gpc.fstp_handler.clear_memory_pool()
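With this change the profiler only advances every other batch. Assuming prof is a torch.profiler.profile context (which is what a --profiling flag typically wires up; the schedule numbers below are invented), each wait/warmup/active phase then spans two training batches, halving trace volume:

import torch

def train_one_batch() -> None:  # hypothetical stand-in for the real training step
    pass

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./traces"),
) as prof:
    for batch_count in range(20):
        train_one_batch()
        if batch_count % 2 == 0:
            # schedule phases advance per prof.step() call, not per batch
            prof.step()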