From 0763bf39726b796d1c6fbe126a7deeb9be4d37f1 Mon Sep 17 00:00:00 2001
From: jiaopenglong <44927264+JiaoPL@users.noreply.github.com>
Date: Thu, 9 Nov 2023 20:04:21 +0800
Subject: [PATCH] init light monitoring on all ranks (#462)

---
 internlm/initialize/launch.py | 4 ++--
 internlm/monitor/alert.py     | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index ad404f2..c50c77e 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -532,11 +532,11 @@ def initialize_distributed_env(
     # init light monitor client
     if gpc.config.get("monitor") and gpc.config.monitor.get("alert"):
         alert_config = gpc.config.monitor.alert
-        if alert_config.enable_feishu_alert and gpc.is_rank_for_log():
+        if alert_config.enable_feishu_alert:
             light_monitor_address = alert_config.light_monitor_address
             if light_monitor_address:
                 initialize_light_monitor(light_monitor_address)
-            else:
+            elif gpc.is_rank_for_log():
                 logger.warning("monitor address is none, monitor could not be used!")
 
 
diff --git a/internlm/monitor/alert.py b/internlm/monitor/alert.py
index e04aa0c..3df16f2 100644
--- a/internlm/monitor/alert.py
+++ b/internlm/monitor/alert.py
@@ -7,6 +7,7 @@ from typing import Dict
 
 import requests
 
+from internlm.core.context import global_context as gpc
 from internlm.utils.logger import get_logger
 
 logger = get_logger(__file__)
@@ -29,7 +30,7 @@ def initialize_light_monitor(monitor_address: str = None):
     try:
         from uniscale_monitoring import init_monitor
 
-        init_monitor(monitor_address)
+        init_monitor(monitor_address, is_root_rank=gpc.is_rank_for_log())
     except Exception as e:
         logger.warning(f"init monitor meet error: {e}")
 