mirror of https://github.com/InternLM/InternLM
fix(timeout): larger timeout (#495)
* larger initialize timeout * unify time format * update timeout thresholds
pull/512/head
parent
eba2b859fc
commit
f5aea7e08c
|
@ -118,7 +118,7 @@ def filter_kwargs(func, kwargs):
|
||||||
def launch_time():
|
def launch_time():
|
||||||
global CURRENT_TIME
|
global CURRENT_TIME
|
||||||
if not CURRENT_TIME:
|
if not CURRENT_TIME:
|
||||||
CURRENT_TIME = datetime.now().strftime("%m-%d:%H:%M:%S")
|
CURRENT_TIME = datetime.now().strftime("%m-%d-%H:%M:%S")
|
||||||
return CURRENT_TIME
|
return CURRENT_TIME
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -39,14 +39,14 @@ ENABLE_TIMEOUT = os.getenv("INTERNLM_ENABLE_TIMEOUT", None)
|
||||||
|
|
||||||
|
|
||||||
timeout_threshold_dict = {
|
timeout_threshold_dict = {
|
||||||
"initialize_distributed_env": 120,
|
"initialize_distributed_env": 240,
|
||||||
"nopp_forward_backward_step": 360,
|
"nopp_forward_backward_step": 360,
|
||||||
"initialize_model": 10,
|
"initialize_model": 60,
|
||||||
"initialize_optimizer": 20,
|
"initialize_optimizer": 60,
|
||||||
"optim_step": 30,
|
"optim_step": 60,
|
||||||
"get_train_data_loader": 600,
|
"get_train_data_loader": 600,
|
||||||
"get_validation_data_loader": 60,
|
"get_validation_data_loader": 120,
|
||||||
"load_new_batch": 10,
|
"load_new_batch": 20,
|
||||||
"record_current_batch_training_metrics": 10,
|
"record_current_batch_training_metrics": 10,
|
||||||
"save_checkpoint": 1200,
|
"save_checkpoint": 1200,
|
||||||
"interleaved_forward_backward_step": 600,
|
"interleaved_forward_backward_step": 600,
|
||||||
|
|
Loading…
Reference in New Issue