From 4b5bdedff24d3e9491e850772894d97ff6a30819 Mon Sep 17 00:00:00 2001 From: jiaopenglong <44927264+JiaoPL@users.noreply.github.com> Date: Wed, 18 Oct 2023 21:00:21 +0800 Subject: [PATCH] feat(monitor): send exception to light monitor (#420) * send exception to light monitor * update try_import_send_exception --- internlm/monitor/monitor.py | 10 ++++++++-- internlm/monitor/utils.py | 13 +++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/internlm/monitor/monitor.py b/internlm/monitor/monitor.py index 6a3b9dc..8c3943d 100644 --- a/internlm/monitor/monitor.py +++ b/internlm/monitor/monitor.py @@ -9,7 +9,7 @@ from internlm.core.context import global_context as gpc from internlm.monitor.alert import send_feishu_msg_with_webhook from internlm.utils.common import SingletonMeta -from .utils import get_job_key, set_env_var +from .utils import get_job_key, set_env_var, try_import_send_exception def send_alert_message(address: str = None, title: str = None, message: str = None): @@ -132,6 +132,7 @@ class MonitorManager(metaclass=SingletonMeta): self.monitor_thread = None self.loss_spike_limit = loss_spike_limit self.last_step_loss = -1 + self.send_exception = try_import_send_exception() def monitor_loss_spike(self, alert_address: str = None, step_count: int = 0, cur_step_loss: float = 0.0): """Check loss value, if loss spike occurs, send alert message to Feishu.""" @@ -154,6 +155,8 @@ class MonitorManager(metaclass=SingletonMeta): format_trace = "" for line in filtered_trace: format_trace += "\n" + line + if self.send_exception: + self.send_exception(format_trace, gpc.get_global_rank()) send_alert_message( address=alert_address, message=f"Catch Exception from {socket.gethostname()} with rank id {gpc.get_global_rank()}:{format_trace}", @@ -165,9 +168,12 @@ class MonitorManager(metaclass=SingletonMeta): def sigterm_handler(sys_signal, frame): print("receive frame: ", frame) print("receive signal: ", sys_signal) + message = f"Process received signal {signal} and exited." + if self.send_exception: + self.send_exception(message, gpc.get_global_rank()) send_alert_message( address=alert_address, - message=f"Process received signal {signal} and exited.", + message=message, ) signal.signal(signal.SIGTERM, sigterm_handler) diff --git a/internlm/monitor/utils.py b/internlm/monitor/utils.py index 34360b5..0881897 100644 --- a/internlm/monitor/utils.py +++ b/internlm/monitor/utils.py @@ -32,3 +32,16 @@ def get_job_name(): def get_job_key(): return f"{get_job_id()}_{get_job_name()}" + + +def try_import_send_exception(): + """ + Try import send_exception from uniscale_monitoring, if failed, return None + + """ + try: + from uniscale_monitoring import send_exception_msg as send_exception + + return send_exception + except ImportError: + return None