mirror of https://github.com/InternLM/InternLM
monitor task only if DO_ALERT is True
parent
a8ff9acbfd
commit
d680876a9a
|
@ -136,9 +136,12 @@ class MonitorManager(metaclass=SingletonMeta):
|
||||||
self.last_step_loss = -1
|
self.last_step_loss = -1
|
||||||
self.send_exception = try_import_send_exception()
|
self.send_exception = try_import_send_exception()
|
||||||
self.alert_file_path = None
|
self.alert_file_path = None
|
||||||
|
self.enable_alert = False
|
||||||
|
self.light_monitor_address = None
|
||||||
|
|
||||||
def monitor_loss_spike(self, alert_address: str = None, step_count: int = 0, cur_step_loss: float = 0.0):
|
def monitor_loss_spike(self, alert_address: str = None, step_count: int = 0, cur_step_loss: float = 0.0):
|
||||||
"""Check loss value, if loss spike occurs, send alert message to Feishu."""
|
"""Check loss value, if loss spike occurs, send alert message to Feishu."""
|
||||||
|
if self.enable_alert:
|
||||||
set_env_var(key="LOSS", value=cur_step_loss)
|
set_env_var(key="LOSS", value=cur_step_loss)
|
||||||
set_env_var(key="STEP_ID", value=step_count)
|
set_env_var(key="STEP_ID", value=step_count)
|
||||||
|
|
||||||
|
@ -153,7 +156,6 @@ class MonitorManager(metaclass=SingletonMeta):
|
||||||
self.last_step_loss = cur_step_loss
|
self.last_step_loss = cur_step_loss
|
||||||
|
|
||||||
def exception_should_be_alert(self, msg: str, alert_address: str = None):
|
def exception_should_be_alert(self, msg: str, alert_address: str = None):
|
||||||
enable_alert = gpc.config.monitor.alert.get("enable_feishu_alert", False)
|
|
||||||
try:
|
try:
|
||||||
with open(self.alert_file_path, "a+") as f:
|
with open(self.alert_file_path, "a+") as f:
|
||||||
fcntl.flock(f, fcntl.LOCK_EX)
|
fcntl.flock(f, fcntl.LOCK_EX)
|
||||||
|
@ -165,25 +167,23 @@ class MonitorManager(metaclass=SingletonMeta):
|
||||||
|
|
||||||
f.write(msg)
|
f.write(msg)
|
||||||
fcntl.flock(f, fcntl.LOCK_UN)
|
fcntl.flock(f, fcntl.LOCK_UN)
|
||||||
return enable_alert and True
|
return True
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
send_alert_message(
|
send_alert_message(
|
||||||
address=alert_address,
|
address=alert_address,
|
||||||
message=f"Failed to open ALERT file: {err}",
|
message=f"Failed to open ALERT file: {err}",
|
||||||
)
|
)
|
||||||
return enable_alert and True
|
return True
|
||||||
|
|
||||||
def monitor_exception(self, alert_address: str = None, excp_info: str = None):
|
def monitor_exception(self, alert_address: str = None, excp_info: str = None):
|
||||||
"""Catch and format exception information, send alert message to Feishu."""
|
"""Catch and format exception information, send alert message to Feishu."""
|
||||||
|
if self.enable_alert:
|
||||||
filtered_trace = excp_info.split("\n")[-10:]
|
filtered_trace = excp_info.split("\n")[-10:]
|
||||||
format_trace = ""
|
format_trace = ""
|
||||||
for line in filtered_trace:
|
for line in filtered_trace:
|
||||||
format_trace += "\n" + line
|
format_trace += "\n" + line
|
||||||
if (
|
|
||||||
self.send_exception
|
if self.send_exception and self.light_monitor_address:
|
||||||
and gpc.config.monitor.alert.get("enable_feishu_alert", False)
|
|
||||||
and gpc.config.monitor.alert.get("light_monitor_address", None)
|
|
||||||
):
|
|
||||||
self.send_exception(format_trace, gpc.get_global_rank())
|
self.send_exception(format_trace, gpc.get_global_rank())
|
||||||
message = f"Catch Exception from {socket.gethostname()} with rank id {gpc.get_global_rank()}:{format_trace}"
|
message = f"Catch Exception from {socket.gethostname()} with rank id {gpc.get_global_rank()}:{format_trace}"
|
||||||
if self.alert_file_path:
|
if self.alert_file_path:
|
||||||
|
@ -200,14 +200,11 @@ class MonitorManager(metaclass=SingletonMeta):
|
||||||
"""Catch SIGTERM signal, and send alert message to Feishu."""
|
"""Catch SIGTERM signal, and send alert message to Feishu."""
|
||||||
|
|
||||||
def sigterm_handler(sys_signal, frame):
|
def sigterm_handler(sys_signal, frame):
|
||||||
|
if self.enable_alert:
|
||||||
print("receive frame: ", frame)
|
print("receive frame: ", frame)
|
||||||
print("receive signal: ", sys_signal)
|
print("receive signal: ", sys_signal)
|
||||||
message = f"Process received signal {signal} and exited."
|
message = f"Process received signal {signal} and exited."
|
||||||
if (
|
if self.send_exception and self.light_monitor_address:
|
||||||
self.send_exception
|
|
||||||
and gpc.config.monitor.alert.get("enable_feishu_alert", False)
|
|
||||||
and gpc.config.monitor.alert.get("light_monitor_address", None)
|
|
||||||
):
|
|
||||||
self.send_exception(message, gpc.get_global_rank())
|
self.send_exception(message, gpc.get_global_rank())
|
||||||
send_alert_message(
|
send_alert_message(
|
||||||
address=alert_address,
|
address=alert_address,
|
||||||
|
@ -236,7 +233,10 @@ class MonitorManager(metaclass=SingletonMeta):
|
||||||
|
|
||||||
# initialize some variables for monitoring
|
# initialize some variables for monitoring
|
||||||
set_env_var(key="JOB_NAME", value=job_name)
|
set_env_var(key="JOB_NAME", value=job_name)
|
||||||
|
self.enable_alert = gpc.config.monitor.alert.get("enable_feishu_alert", False)
|
||||||
|
|
||||||
|
if self.enable_alert:
|
||||||
|
self.light_monitor_address = gpc.config.monitor.alert.get("light_monitor_address", None)
|
||||||
# initialize alert file
|
# initialize alert file
|
||||||
self.alert_file_path = gpc.config.monitor.alert.get("alert_file_path")
|
self.alert_file_path = gpc.config.monitor.alert.get("alert_file_path")
|
||||||
if self.alert_file_path and gpc.is_rank_for_log():
|
if self.alert_file_path and gpc.is_rank_for_log():
|
||||||
|
|
Loading…
Reference in New Issue