mirror of https://github.com/InternLM/InternLM
fix(train): unify the exp paths (#492)
parent
3418898cbe
commit
626ed0fc5e
|
@ -391,7 +391,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None):
|
|||
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
|
||||
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
|
||||
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
|
||||
f"RUN/{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
|
||||
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
|
||||
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
|
||||
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
|
||||
|
|
|
@ -118,7 +118,7 @@ def filter_kwargs(func, kwargs):
|
|||
def launch_time():
|
||||
global CURRENT_TIME
|
||||
if not CURRENT_TIME:
|
||||
CURRENT_TIME = datetime.now().strftime("%b%d_%H-%M-%S")
|
||||
CURRENT_TIME = datetime.now().strftime("%m-%d:%H:%M:%S")
|
||||
return CURRENT_TIME
|
||||
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ def init_tb_writer(
|
|||
):
|
||||
tb_log_file_name = file_name
|
||||
if not tensorboard_folder:
|
||||
tb_folder = os.path.join(job_name, launch_time, "tensorboards")
|
||||
tb_folder = os.path.join("RUN", job_name, launch_time, "tensorboards")
|
||||
else:
|
||||
tb_folder = tensorboard_folder
|
||||
|
||||
|
|
2
train.py
2
train.py
|
@ -176,7 +176,7 @@ def main(args):
|
|||
memory_profiler = SimpleMemoryProfiler(
|
||||
model,
|
||||
optimizer.optim,
|
||||
log_folder=f"memory_trace/rank{gpc.get_global_rank()}_"
|
||||
log_folder=f"RUN/{gpc.config.JOB_NAME}/{current_time}/memory_trace/rank{gpc.get_global_rank()}_"
|
||||
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
|
||||
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}",
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue