From f5aea7e08cb8a376d53b6da6884634b60245aa69 Mon Sep 17 00:00:00 2001 From: jiaopenglong <44927264+JiaoPL@users.noreply.github.com> Date: Tue, 21 Nov 2023 19:19:22 +0800 Subject: [PATCH] fix(timeout): larger timeout (#495) * larger initialize timeout * unify time format * update timeout thresholds --- internlm/utils/common.py | 2 +- internlm/utils/timeout.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/internlm/utils/common.py b/internlm/utils/common.py index 6d7f7b2..6c9cc68 100644 --- a/internlm/utils/common.py +++ b/internlm/utils/common.py @@ -118,7 +118,7 @@ def filter_kwargs(func, kwargs): def launch_time(): global CURRENT_TIME if not CURRENT_TIME: - CURRENT_TIME = datetime.now().strftime("%m-%d:%H:%M:%S") + CURRENT_TIME = datetime.now().strftime("%m-%d-%H:%M:%S") return CURRENT_TIME diff --git a/internlm/utils/timeout.py b/internlm/utils/timeout.py index 4e68ce9..711c6da 100644 --- a/internlm/utils/timeout.py +++ b/internlm/utils/timeout.py @@ -39,14 +39,14 @@ ENABLE_TIMEOUT = os.getenv("INTERNLM_ENABLE_TIMEOUT", None) timeout_threshold_dict = { - "initialize_distributed_env": 120, + "initialize_distributed_env": 240, "nopp_forward_backward_step": 360, - "initialize_model": 10, - "initialize_optimizer": 20, - "optim_step": 30, + "initialize_model": 60, + "initialize_optimizer": 60, + "optim_step": 60, "get_train_data_loader": 600, - "get_validation_data_loader": 60, - "load_new_batch": 10, + "get_validation_data_loader": 120, + "load_new_batch": 20, "record_current_batch_training_metrics": 10, "save_checkpoint": 1200, "interleaved_forward_backward_step": 600,