feat(monitor): add light monitor (#275)

* add light monitor

* filter key of metrics dict

* test no light_monitor case

* mv init_light_monitor to initialize_distributed_env
pull/281/head
jiaopenglong 2023-09-05 19:24:01 +08:00 committed by GitHub
parent 9445faf5be
commit 8d8d811e10
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 72 additions and 2 deletions

View File

@ -10,6 +10,7 @@ import torch
from internlm.core.context import Config
from internlm.core.context import global_context as gpc
from internlm.monitor import initialize_light_monitor
from internlm.utils.common import get_master_node
from internlm.utils.logger import get_logger
@ -332,6 +333,14 @@ def launch(
f"tensor parallel size: {gpc.tensor_parallel_size}",
)
# init light monitor client
light_monitor_address = gpc.config.get("light_monitor_address", None)
if light_monitor_address is None:
if gpc.is_rank_for_log():
logger.warning("monitor address is none, monitor could not be used!")
else:
initialize_light_monitor(light_monitor_address)
def launch_from_slurm(
config: Union[str, Path, Config, Dict],

View File

@ -1,4 +1,11 @@
from .alert import initialize_light_monitor, send_heartbeat
from .monitor import initialize_monitor_manager, send_alert_message
from .utils import set_env_var
__all__ = ["send_alert_message", "initialize_monitor_manager", "set_env_var"]
__all__ = [
"send_alert_message",
"initialize_monitor_manager",
"set_env_var",
"initialize_light_monitor",
"send_heartbeat",
]

View File

@ -1,8 +1,59 @@
import json
import math
import os
import re
import time
from typing import Dict
import requests
from internlm.utils.logger import get_logger
logger = get_logger(__file__)
def initialize_light_monitor(monitor_address: str = None):
    """Start the light monitor client, logging instead of raising on failure.

    ``uniscale_monitoring`` is imported lazily inside the ``try`` so that a
    missing or broken package is reported as a warning rather than an error.

    Args:
        monitor_address: Value forwarded unchanged to
            ``uniscale_monitoring.init_monitor``.
    """
    try:
        from uniscale_monitoring import init_monitor as _start_monitor_client

        _start_monitor_client(monitor_address)
    except Exception as e:
        # Best effort: the failure is only logged, never propagated.
        logger.warning(f"init monitor meet error: {e}")
def send_heartbeat(msg_type: str, msg: Dict):
    """Flatten ``msg`` into a one-level payload and pass it to ``send_meta``.

    Values of ``msg`` that are dicts are expanded one level, with keys joined
    as ``<outer>_<inner>``. Every resulting key is cut at its first space and
    each character outside ``[a-zA-Z0-9_]`` is replaced by ``_``. Float NaN
    values are replaced by ``None``.

    If the ``CLUSTER_NAME`` environment variable is set and non-empty it is
    added under the ``"cluster"`` key. ``msg_type`` is added under the
    ``"msg_type"`` key only when it is one of ``"train_metrics"``,
    ``"init_time"`` or ``"stage_time"``.

    Any exception raised along the way, including a failed import of
    ``uniscale_monitoring``, is logged as a warning and not propagated.

    Args:
        msg_type: Category tag for the message.
        msg: Mapping from names to values, or to one level of nested dicts.
    """
    recognised_types = ("train_metrics", "init_time", "stage_time")

    def clean_value(value):
        # NaN floats are sent as None; everything else passes through as-is.
        return None if isinstance(value, float) and math.isnan(value) else value

    def clean_key(raw_key):
        # Keep only the text before the first space, then replace every
        # character outside [a-zA-Z0-9_] with an underscore.
        return re.sub(r"[^a-zA-Z0-9_]", "_", raw_key.split(" ")[0])

    try:
        from uniscale_monitoring import send_meta

        payload = {}
        for name, value in msg.items():
            if not isinstance(value, dict):
                payload[clean_key(name)] = clean_value(value)
                continue
            for sub_name, sub_value in value.items():
                payload[clean_key(f"{name}_{sub_name}")] = clean_value(sub_value)

        cluster_name = os.getenv("CLUSTER_NAME")
        if cluster_name:
            payload["cluster"] = cluster_name
        if msg_type in recognised_types:
            payload["msg_type"] = msg_type

        # timeout is presumably in seconds; confirm against uniscale_monitoring.
        send_meta(payload, timeout=0.1)
    except Exception as e:
        # Best effort: the failure is only logged, never propagated.
        logger.warning(f"send heartbeat meet error: {e}")
def send_feishu_msg_with_webhook(webhook: str, title: str, message: str):
"""

View File

@ -24,7 +24,7 @@ from internlm.data.packed_dataset import (
get_packed_dataset_without_short_length,
)
from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
from internlm.monitor import set_env_var
from internlm.monitor import send_heartbeat, set_env_var
from internlm.monitor.monitor import monitor_manager as mm
from internlm.solver.beta2_scheduler import Beta2Scheduler
from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
@ -394,6 +394,9 @@ def record_current_batch_training_metrics(
else:
writer.add_scalar(key=key, value=value, step=train_state.step_count)
if gpc.config.get("light_monitor_address", None) and batch_count % 50 == 0:
send_heartbeat("train_metrics", infos)
if update_panel:
# metrics shown with dashboard panels
panel_metrics = {