mirror of https://github.com/InternLM/InternLM
fix(train): unify the exp paths (#492)
parent
3418898cbe
commit
626ed0fc5e
|
@ -391,7 +391,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None):
|
||||||
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
|
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
|
||||||
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
|
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
|
||||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||||
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
|
f"RUN/{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
|
||||||
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
|
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
|
||||||
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
|
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
|
||||||
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
|
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
|
||||||
|
|
|
@ -118,7 +118,7 @@ def filter_kwargs(func, kwargs):
|
||||||
def launch_time():
|
def launch_time():
|
||||||
global CURRENT_TIME
|
global CURRENT_TIME
|
||||||
if not CURRENT_TIME:
|
if not CURRENT_TIME:
|
||||||
CURRENT_TIME = datetime.now().strftime("%b%d_%H-%M-%S")
|
CURRENT_TIME = datetime.now().strftime("%m-%d:%H:%M:%S")
|
||||||
return CURRENT_TIME
|
return CURRENT_TIME
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ def init_tb_writer(
|
||||||
):
|
):
|
||||||
tb_log_file_name = file_name
|
tb_log_file_name = file_name
|
||||||
if not tensorboard_folder:
|
if not tensorboard_folder:
|
||||||
tb_folder = os.path.join(job_name, launch_time, "tensorboards")
|
tb_folder = os.path.join("RUN", job_name, launch_time, "tensorboards")
|
||||||
else:
|
else:
|
||||||
tb_folder = tensorboard_folder
|
tb_folder = tensorboard_folder
|
||||||
|
|
||||||
|
|
2
train.py
2
train.py
|
@ -176,7 +176,7 @@ def main(args):
|
||||||
memory_profiler = SimpleMemoryProfiler(
|
memory_profiler = SimpleMemoryProfiler(
|
||||||
model,
|
model,
|
||||||
optimizer.optim,
|
optimizer.optim,
|
||||||
log_folder=f"memory_trace/rank{gpc.get_global_rank()}_"
|
log_folder=f"RUN/{gpc.config.JOB_NAME}/{current_time}/memory_trace/rank{gpc.get_global_rank()}_"
|
||||||
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
|
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
|
||||||
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}",
|
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}",
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue