fix(train): unify the exp paths (#492)

pull/499/head
jiaopenglong 2023-11-11 20:15:59 +08:00 committed by GitHub
parent 3418898cbe
commit 626ed0fc5e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 4 additions and 4 deletions

View File

@ -391,7 +391,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None):
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler(
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
f"RUN/{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",

View File

@ -118,7 +118,7 @@ def filter_kwargs(func, kwargs):
def launch_time():
    """Return the timestamp string that identifies this launch.

    The string is built on the first call, stored in the module-level
    ``CURRENT_TIME`` and returned unchanged by every later call, so all
    callers in the same process observe the same value.

    Returns:
        str: Local time of the first call, formatted as ``MM-DD:HH:MM:SS``
        (``strftime("%m-%d:%H:%M:%S")``).
    """
    global CURRENT_TIME
    if not CURRENT_TIME:
        # NOTE(review): the value contains ":" characters. If it is used as a
        # path component, that is invalid in Windows file names and awkward
        # for tools that treat ":" as a separator -- confirm target platforms.
        CURRENT_TIME = datetime.now().strftime("%m-%d:%H:%M:%S")
    return CURRENT_TIME

View File

@ -33,7 +33,7 @@ def init_tb_writer(
):
tb_log_file_name = file_name
if not tensorboard_folder:
tb_folder = os.path.join(job_name, launch_time, "tensorboards")
tb_folder = os.path.join("RUN", job_name, launch_time, "tensorboards")
else:
tb_folder = tensorboard_folder

View File

@ -176,7 +176,7 @@ def main(args):
memory_profiler = SimpleMemoryProfiler(
model,
optimizer.optim,
log_folder=f"memory_trace/rank{gpc.get_global_rank()}_"
log_folder=f"RUN/{gpc.config.JOB_NAME}/{current_time}/memory_trace/rank{gpc.get_global_rank()}_"
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}",
)