From d537e454561b618031f7fe6654808dff9ff6ffae Mon Sep 17 00:00:00 2001
From: jiaopenglong <44927264+JiaoPL@users.noreply.github.com>
Date: Fri, 3 Nov 2023 10:55:16 +0800
Subject: [PATCH] send exception to light monitor only if the server is available (#465)

---
 internlm/monitor/monitor.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/internlm/monitor/monitor.py b/internlm/monitor/monitor.py
index 8c3943d..541b3ef 100644
--- a/internlm/monitor/monitor.py
+++ b/internlm/monitor/monitor.py
@@ -155,7 +155,11 @@ class MonitorManager(metaclass=SingletonMeta):
             format_trace = ""
             for line in filtered_trace:
                 format_trace += "\n" + line
-            if self.send_exception:
+            if (
+                self.send_exception
+                and gpc.config.monitor.alert.get("enable_feishu_alert", False)
+                and gpc.config.monitor.alert.get("light_monitor_address", None)
+            ):
                 self.send_exception(format_trace, gpc.get_global_rank())
             send_alert_message(
                 address=alert_address,
@@ -169,7 +173,11 @@ class MonitorManager(metaclass=SingletonMeta):
             print("receive frame: ", frame)
             print("receive signal: ", sys_signal)
             message = f"Process received signal {signal} and exited."
-            if self.send_exception:
+            if (
+                self.send_exception
+                and gpc.config.monitor.alert.get("enable_feishu_alert", False)
+                and gpc.config.monitor.alert.get("light_monitor_address", None)
+            ):
                 self.send_exception(message, gpc.get_global_rank())
             send_alert_message(
                 address=alert_address,