mirror of https://github.com/InternLM/InternLM
105 lines
2.9 KiB
Python
105 lines
2.9 KiB
Python
import json
|
|
import math
|
|
import os
|
|
import re
|
|
import time
|
|
from typing import Dict
|
|
|
|
import requests
|
|
|
|
from internlm.utils.logger import get_logger
|
|
|
|
# Module-level logger created from this file's path; the monitoring helpers
# below use it to report failures as warnings instead of raising.
logger = get_logger(__file__)
|
|
|
|
|
|
def initialize_light_monitor(monitor_address: str = None):
    """
    Start the light monitor client on a best-effort basis.

    Args:
        monitor_address (str): Address handed unchanged to
            ``uniscale_monitoring.init_monitor``. Defaults to None.

    Any failure, including the ``uniscale_monitoring`` package not being
    importable, is logged as a warning and is not propagated to the caller.
    """
    try:
        # Imported lazily so that the package is only required when monitoring is used.
        from uniscale_monitoring import init_monitor as start_monitor

        start_monitor(monitor_address)
    except Exception as err:
        logger.warning(f"init monitor meet error: {err}")
|
|
|
|
|
|
def send_heartbeat(msg_type: str, msg: Dict):
    """
    Flatten a heartbeat message and forward it through ``uniscale_monitoring.send_meta``.

    Values that are dictionaries are flattened one level deep into
    ``<key>_<subkey>`` entries. Every resulting key is cut at its first space
    and each character other than an ASCII letter, digit or underscore is
    replaced with ``_``. Float NaN values are sent as None.

    Args:
        msg_type (str): Kind of heartbeat. Only "train_metrics", "init_time"
            and "stage_time" are recorded in the payload under "msg_type";
            any other value is not added to the payload.
        msg (Dict): Mapping from metric name to a value or to a dict of values.

    Any exception, including a missing ``uniscale_monitoring`` package, is
    logged as a warning and is not propagated to the caller.
    """

    def sanitize_key(raw_key):
        # Keep the part before the first space, then mask unsupported characters.
        return re.sub(r"[^a-zA-Z0-9_]", "_", raw_key.split(" ")[0])

    def drop_nan(value):
        # NaN floats become None; every other value passes through untouched.
        is_nan = isinstance(value, float) and math.isnan(value)
        return None if is_nan else value

    try:
        from uniscale_monitoring import send_meta as push_meta

        payload = {}
        for key, value in msg.items():
            if not isinstance(value, Dict):
                payload[sanitize_key(key)] = drop_nan(value)
                continue
            # One level of nesting: prefix each inner key with the outer key.
            for sub_key, sub_value in value.items():
                payload[sanitize_key(f"{key}_{sub_key}")] = drop_nan(sub_value)

        cluster_name = os.getenv("CLUSTER_NAME")
        if cluster_name:
            payload["cluster"] = cluster_name

        # Only the recognised message types are tagged in the payload.
        for known_type in ("train_metrics", "init_time", "stage_time"):
            if msg_type == known_type:
                payload["msg_type"] = known_type
                break

        # Short timeout -- presumably so a slow monitor cannot stall the caller.
        push_meta(payload, timeout=0.1)
    except Exception as err:
        logger.warning(f"send heartbeat meet error: {err}")
|
|
|
|
|
|
def send_feishu_msg_with_webhook(webhook: str, title: str, message: str):
    """
    Post a rich-text ("post") message to a Feishu robot webhook.

    Args:
        webhook (str): URL of the Feishu robot webhook to post to.
        title (str): Title of the message.
        message (str): Plain text placed in the message body.

    Returns:
        The decoded JSON response of the request, or None when the request
        or the decoding fails. Failures are printed and never raised.
    """

    # Assemble the payload from the innermost element outwards.
    text_element = {"tag": "text", "text": message}
    localized_post = {"title": title, "content": [[text_element]]}
    payload = {
        "timestamp": int(time.time()),
        "msg_type": "post",
        "content": {"post": {"zh_cn": localized_post}},
    }
    request_headers = {"Content-Type": "application/json;charset=utf-8"}

    try:
        response = requests.post(webhook, data=json.dumps(payload), headers=request_headers, timeout=30)
        result = response.json()
        print(f"Feishu webhook response: {result}")
    except Exception as exc:  # pylint: disable=W0703
        # Alerting is best effort: report the problem and signal failure with None.
        print(f"HTTP Post error: {exc}")
        result = None

    return result
|