From 626ed0fc5e20ebd3a1a80becf822df73d9eb7b69 Mon Sep 17 00:00:00 2001 From: jiaopenglong <44927264+JiaoPL@users.noreply.github.com> Date: Sat, 11 Nov 2023 20:15:59 +0800 Subject: [PATCH] fix(train): unify the exp paths (#492) --- internlm/train/training_internlm.py | 2 +- internlm/utils/common.py | 2 +- internlm/utils/writer.py | 2 +- train.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internlm/train/training_internlm.py b/internlm/train/training_internlm.py index 9721c3d..71a47a0 100644 --- a/internlm/train/training_internlm.py +++ b/internlm/train/training_internlm.py @@ -391,7 +391,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None): activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA], schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1), on_trace_ready=torch.profiler.tensorboard_trace_handler( - f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" + f"RUN/{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_" + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_" + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}", diff --git a/internlm/utils/common.py b/internlm/utils/common.py index f3b58c0..6d7f7b2 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -118,7 +118,7 @@ def filter_kwargs(func, kwargs): def launch_time(): global CURRENT_TIME if not CURRENT_TIME: - CURRENT_TIME = datetime.now().strftime("%b%d_%H-%M-%S") + CURRENT_TIME = datetime.now().strftime("%m-%d:%H:%M:%S") return CURRENT_TIME diff --git a/internlm/utils/writer.py b/internlm/utils/writer.py index fb41fe5..018917a 100644 --- a/internlm/utils/writer.py +++ b/internlm/utils/writer.py @@ -33,7 +33,7 @@ def init_tb_writer( ): tb_log_file_name = file_name if not tensorboard_folder: - tb_folder = os.path.join(job_name, launch_time, "tensorboards") + tb_folder = os.path.join("RUN", job_name, launch_time, "tensorboards") else: tb_folder = tensorboard_folder diff --git a/train.py b/train.py index 139bac1..35e39fa 100644 --- a/train.py +++ b/train.py @@ -176,7 +176,7 @@ def main(args): memory_profiler = SimpleMemoryProfiler( model, optimizer.optim, - log_folder=f"memory_trace/rank{gpc.get_global_rank()}_" + log_folder=f"RUN/{gpc.config.JOB_NAME}/{current_time}/memory_trace/rank{gpc.get_global_rank()}_" + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_" + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}", )