upgrade monitor module

pull/330/head
vapao 2021-04-25 22:26:20 +08:00
parent 540fc3511c
commit b4bcb2606a
11 changed files with 195 additions and 150 deletions

View File

@ -21,6 +21,7 @@ class Alarm(models.Model, ModelMixin):
)
name = models.CharField(max_length=50)
type = models.CharField(max_length=50)
target = models.CharField(max_length=100)
notify_mode = models.CharField(max_length=255)
notify_grp = models.CharField(max_length=255)
status = models.CharField(max_length=2, choices=STATUS)

View File

@ -12,6 +12,8 @@ import logging
MONITOR_WORKER_KEY = settings.MONITOR_WORKER_KEY
SCHEDULE_WORKER_KEY = settings.SCHEDULE_WORKER_KEY
logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(message)s')
class Worker:
def __init__(self):

View File

@ -2,13 +2,16 @@
# Copyright: (c) <spug.dev@gmail.com>
# Released under the AGPL-3.0 License.
from django.db import close_old_connections
from django_redis import get_redis_connection
from apps.host.models import Host
from apps.monitor.utils import handle_notify
from socket import socket
import subprocess
import platform
import requests
import logging
import json
import time
logging.captureWarnings(True)
@ -64,8 +67,8 @@ def host_executor(host, command):
def monitor_worker_handler(job):
print('enter: ', job)
task_id, tp, addr, extra = json.loads(job)
task_id, tp, addr, extra, threshold, quiet = json.loads(job)
target = addr
if tp == '1':
is_ok, message = site_check(addr, extra)
elif tp == '2':
@ -82,20 +85,24 @@ def monitor_worker_handler(job):
is_ok, message = False, f'unknown host id for {addr!r}'
else:
is_ok, message = host_executor(host, command)
target = f'{host.name}({host.hostname})'
rds, key, f_count, f_time = get_redis_connection(), f'spug:det:{task_id}', f'c_{addr}', f't_{addr}'
v_count, v_time = rds.hmget(key, f_count, f_time)
if is_ok:
if v_count:
rds.hdel(key, f_count, f_time)
if v_time:
logging.warning('send recovery notification')
handle_notify(task_id, target, is_ok, message, int(v_count) + 1)
return
v_count = rds.hincrby(key, f_count)
if v_count >= threshold:
if not v_time or int(time.time()) - int(v_time) >= quiet * 60:
rds.hset(key, f_time, int(time.time()))
logging.warning('send fault alarm notification')
handle_notify(task_id, target, is_ok, message, v_count)
# is_notified = True if obj.latest_notify_time else False
# if obj.latest_status in [0, None] and is_ok is False:
# obj.latest_fault_time = int(time.time())
# if is_ok:
# obj.latest_notify_time = 0
# obj.fault_times = 0
# else:
# obj.fault_times += 1
# obj.latest_status = 0 if is_ok else 1
# obj.latest_run_time = human_datetime(event.scheduled_run_time)
# obj.save()
# self._handle_notify(obj, is_notified, out)
def dispatch(tp, addr, extra):
if tp == '1':

View File

@ -16,13 +16,12 @@ class Detection(models.Model, ModelMixin):
('5', 'Ping检测'),
)
STATUS = (
(0, '成功'),
(1, '失败'),
(0, '正常'),
(1, '异常'),
)
name = models.CharField(max_length=50)
type = models.CharField(max_length=2, choices=TYPES)
group = models.CharField(max_length=255, null=True)
addr = models.CharField(max_length=255) # 要删除的
targets = models.TextField()
extra = models.TextField(null=True)
desc = models.CharField(max_length=255, null=True)
@ -33,10 +32,7 @@ class Detection(models.Model, ModelMixin):
fault_times = models.SmallIntegerField(default=0)
notify_mode = models.CharField(max_length=255)
notify_grp = models.CharField(max_length=255)
latest_status = models.SmallIntegerField(choices=STATUS, null=True)
latest_run_time = models.CharField(max_length=20, null=True)
latest_fault_time = models.IntegerField(null=True)
latest_notify_time = models.IntegerField(default=0)
created_at = models.CharField(max_length=20, default=human_datetime)
created_by = models.ForeignKey(User, models.PROTECT, related_name='+')
@ -46,7 +42,6 @@ class Detection(models.Model, ModelMixin):
def to_dict(self, *args, **kwargs):
tmp = super().to_dict(*args, **kwargs)
tmp['type_alias'] = self.get_type_display()
tmp['latest_status_alias'] = self.get_latest_status_display()
tmp['notify_mode'] = json.loads(self.notify_mode)
tmp['notify_grp'] = json.loads(self.notify_grp)
tmp['targets'] = json.loads(self.targets)

View File

@ -99,12 +99,12 @@ class Scheduler:
obj.save()
self._handle_notify(obj, is_notified, out)
def _dispatch(self, task_id, tp, targets, extra):
def _dispatch(self, task_id, tp, targets, extra, threshold, quiet):
close_old_connections()
Detection.objects.filter(pk=task_id).update(latest_run_time=human_datetime())
rds_cli = get_redis_connection()
for t in json.loads(targets):
rds_cli.rpush(MONITOR_WORKER_KEY, json.dumps([task_id, tp, t, extra]))
rds_cli.rpush(MONITOR_WORKER_KEY, json.dumps([task_id, tp, t, extra, threshold, quiet]))
def _init(self):
self.scheduler.start()
@ -114,7 +114,7 @@ class Scheduler:
self._dispatch,
trigger,
id=str(item.id),
args=(item.id, item.type, item.targets, item.extra),
args=(item.id, item.type, item.targets, item.extra, item.threshold, item.quiet),
)
def run(self):
@ -131,7 +131,7 @@ class Scheduler:
self._dispatch,
trigger,
id=str(task.id),
args=(task.id, task.type, task.targets, task.extra),
args=(task.id, task.type, task.targets, task.extra, task.threshold, task.quiet),
replace_existing=True
)
elif task.action == 'remove':

View File

@ -1,6 +1,13 @@
# Copyright: (c) OpenSpug Organization. https://github.com/openspug/spug
# Copyright: (c) <spug.dev@gmail.com>
# Released under the AGPL-3.0 License.
from django.db import close_old_connections
from apps.alarm.models import Alarm
from apps.monitor.models import Detection
from libs.spug import Notification
import json
def seconds_to_human(seconds):
text = ''
if seconds > 3600:
@ -9,4 +16,28 @@ def seconds_to_human(seconds):
if seconds > 60:
text += f'{int(seconds / 60)}分钟'
seconds = seconds % 60
return f'{text}{int(seconds)}'
if seconds:
text += f'{seconds}'
return text
def _record_alarm(det, target, duration, status):
    """Persist one alarm-history row for an event on ``target``.

    The alarm's name, type label and notification settings are copied from
    ``det``; ``target``, ``duration`` and ``status`` are stored as given.
    """
    fields = {
        'name': det.name,
        'type': det.get_type_display(),
        'target': target,
        'status': status,
        'duration': duration,
        'notify_grp': det.notify_grp,
        'notify_mode': det.notify_mode,
    }
    Alarm.objects.create(**fields)
def handle_notify(task_id, target, is_ok, out, fault_times):
    """Record an alarm entry and send notifications for one check result.

    :param task_id: primary key of the ``Detection`` that produced the result
    :param target: display name of the checked target
    :param is_ok: truthy selects event code ``'2'``, falsy selects ``'1'``
    :param out: message text handed to the notification
    :param fault_times: multiplied by ``det.rate * 60`` to get the duration
        in seconds before it is rendered by ``seconds_to_human``
    """
    close_old_connections()
    detection = Detection.objects.get(pk=task_id)
    elapsed = seconds_to_human(detection.rate * fault_times * 60)
    if is_ok:
        event = '2'
    else:
        event = '1'
    _record_alarm(detection, target, elapsed, event)
    groups = json.loads(detection.notify_grp)
    notifier = Notification(groups, event, target, detection.name, out, elapsed)
    notifier.dispatch(json.loads(detection.notify_mode))

View File

@ -32,7 +32,7 @@ class Notify(models.Model, ModelMixin):
if not with_quiet or time.time() - cache.get('spug:notify_quiet', 0) > 3600:
cache.set('spug:notify_quiet', time.time())
cls.objects.create(source=source, title=title, type=type, content=content)
Channel.send_notify(title, content)
Channel.send_notify(title, content)
def __repr__(self):
return '<Notify %r>' % self.title

View File

@ -9,125 +9,144 @@ from libs.utils import human_datetime
import requests
import json
spug_server = 'http://spug-wx.qbangmang.com'
spug_server = 'https://api.spug.cc'
notify_source = 'monitor'
def _parse_args(grp):
    """Return ``(spug_key, contacts)`` for the given notification groups.

    :param grp: iterable of ``Group`` ids to look up
    :return: the ``spug_key`` application setting and one flat list holding
        every entry of each matching group's JSON-encoded ``contacts`` field
    """
    spug_key = AppSetting.get_default('spug_key')
    # Flatten in a single pass: ``sum(lists, [])`` copies the growing
    # accumulator once per group, which is quadratic in the total size.
    contacts = [
        contact
        for group in Group.objects.filter(id__in=grp)
        for contact in json.loads(group.contacts)
    ]
    return spug_key, contacts
class Notification:
def __init__(self, grp, event, target, title, message, duration):
self.event = event
self.title = title
self.target = target
self.message = message
self.duration = duration
self.spug_key, self.u_ids = self._parse_args(grp)
def _parse_args(self, grp):
    """Return ``(spug_key, contacts)`` for the given notification groups.

    :param grp: iterable of ``Group`` ids to look up
    :return: the ``spug_key`` application setting and one flat list holding
        every entry of each matching group's JSON-encoded ``contacts`` field
    """
    spug_key = AppSetting.get_default('spug_key')
    # Flatten in a single pass: ``sum(lists, [])`` copies the growing
    # accumulator once per group, which is quadratic in the total size.
    contacts = [
        contact
        for group in Group.objects.filter(id__in=grp)
        for contact in json.loads(group.contacts)
    ]
    return spug_key, contacts
def _handle_response(res, mode):
    """Check a notification HTTP response and report failures via ``Notify``.

    :param res: response object exposing ``status_code``, ``url`` and ``json()``
    :param mode: ``'dd'`` or ``'wx'`` expect ``errcode == 0`` in the JSON
        body, ``'spug'`` expects a falsy ``error`` field; for any other mode
        only the status code is checked
    """
    if res.status_code != 200:
        Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回状态码:{res.status_code}, 请求URL{res.url}')
        # The failure is already reported and an error body is not guaranteed
        # to be JSON, so stop here instead of decoding it.
        return
    if mode not in ('dd', 'wx', 'spug'):
        return
    try:
        payload = res.json()
    except ValueError as e:
        # A 200 reply with an undecodable body is reported, not raised.
        Notify.make_notify(notify_source, '1', '告警通知发送失败', f'接口调用异常:{e}')
        return
    if mode == 'spug':
        if payload.get('error'):
            Notify.make_notify(notify_source, '1', '告警通知发送失败', f'错误信息:{payload}')
    elif payload.get('errcode') != 0:
        Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回数据:{payload}')
def _handle_request(self, mode, url, data):
    """POST ``data`` as JSON to ``url`` and report any delivery failure.

    Failures are recorded with ``Notify.make_notify`` rather than raised.

    :param mode: ``'dd'`` or ``'wx'`` expect ``errcode == 0`` in the JSON
        reply, ``'spug'`` expects a falsy ``error`` field; for any other mode
        only the status code is checked
    :param url: endpoint to post to
    :param data: JSON-serialisable request payload
    """
    try:
        res = requests.post(url, json=data, timeout=30)
    except Exception as e:
        Notify.make_notify(notify_source, '1', '告警通知发送失败', f'接口调用异常:{e}')
        return
    if res.status_code != 200:
        Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回状态码:{res.status_code}, 请求URL{res.url}')
        # The failure is already reported and an error body is not guaranteed
        # to be JSON, so stop here instead of decoding it.
        return
    if mode not in ('dd', 'wx', 'spug'):
        return
    try:
        payload = res.json()
    except ValueError as e:
        # A 200 reply with an undecodable body is reported, not raised.
        Notify.make_notify(notify_source, '1', '告警通知发送失败', f'接口调用异常:{e}')
        return
    if mode == 'spug':
        if payload.get('error'):
            Notify.make_notify(notify_source, '1', '告警通知发送失败', f'错误信息:{payload}')
    elif payload.get('errcode') != 0:
        Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回数据:{payload}')
def notify_by_wx(event, obj):
    """Send a notification to ``{spug_server}/apis/notify/wx/``.

    Recipients are the ``wx_token`` values of the contacts resolved from
    ``obj.grp``. A missing service key, an empty recipient set, or a failed
    request is reported through ``Notify.make_notify``.

    :param event: event code; ``'2'`` adds a duration remark to the payload
    :param obj: object providing ``grp``, ``name``, ``out`` and ``duration``
    """
    spug_key, u_ids = _parse_args(obj.grp)
    if not spug_key:
        Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
        return
    users = set(x.wx_token for x in Contact.objects.filter(id__in=u_ids, wx_token__isnull=False))
    if not users:
        Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象请确保设置了相关报警联系人的微信Token。')
        return
    data = {
        'token': spug_key,
        'event': event,
        'subject': obj.name,
        'desc': obj.out,
        'remark': f'故障持续{obj.duration}' if event == '2' else None,
        'users': list(users)
    }
    try:
        # Without a timeout an unresponsive server would block the caller
        # indefinitely.
        res = requests.post(f'{spug_server}/apis/notify/wx/', json=data, timeout=30)
    except requests.RequestException as e:
        Notify.make_notify(notify_source, '1', '告警通知发送失败', f'接口调用异常:{e}')
        return
    _handle_response(res, 'spug')
def notify_by_email(event, obj):
spug_key, u_ids = _parse_args(obj.grp)
users = set(x.email for x in Contact.objects.filter(id__in=u_ids, email__isnull=False))
if users:
mail_service = json.loads(AppSetting.get_default('mail_service', '{}'))
body = ['告警名称:' + obj.name, '告警时间:' + human_datetime(), '告警描述:' + obj.out]
if event == '2':
body.append('故障持续:' + obj.duration)
if mail_service.get('server'):
event_map = {'1': '告警发生', '2': '告警恢复'}
subject = f'{event_map[event]}-{obj.name}'
mail = Mail(**mail_service)
mail.send_text_mail(users, subject, '\r\n'.join(body) + '\r\n\r\n自动发送,请勿回复。')
elif spug_key:
def _by_wx(self):
if not self.spug_key:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
return
users = set(x.wx_token for x in Contact.objects.filter(id__in=self.u_ids, wx_token__isnull=False))
if users:
data = {
'token': spug_key,
'event': event,
'subject': obj.name,
'body': '\r\n'.join(body),
'token': self.spug_key,
'event': self.event,
'subject': self.title,
'desc': self.message,
'remark': f'故障持续{self.duration}' if self.event == '2' else None,
'users': list(users)
}
res = requests.post(f'{spug_server}/apis/notify/mail/', json=data)
_handle_response(res, 'spug')
self._handle_request('spug', f'{spug_server}/apis/notify/wx/', data)
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。')
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象请确保设置了相关报警联系人的微信Token。')
def _by_email(self):
    """Send the notification by e-mail to every contact that has an address.

    The configured ``mail_service`` is used when it names a server.
    Otherwise the message is posted to ``{spug_server}/apis/notify/mail/``
    if a ``spug_key`` is available. A missing recipient or missing
    credentials is reported through ``Notify.make_notify``.
    """
    recipients = {c.email for c in Contact.objects.filter(id__in=self.u_ids, email__isnull=False)}
    if not recipients:
        Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。')
        return

    mail_service = json.loads(AppSetting.get_default('mail_service', '{}'))
    body = [
        f'告警名称:{self.title}',
        f'告警对象:{self.target}',
        f'{"告警" if self.event == "1" else "恢复"}时间:{human_datetime()}',
        f'告警描述:{self.message}'
    ]
    if self.event == '2':
        body.append('故障持续:' + self.duration)
    content = '\r\n'.join(body)

    if mail_service.get('server'):
        event_map = {'1': '监控告警通知', '2': '告警恢复通知'}
        subject = f'{event_map[self.event]}-{self.title}'
        sender = Mail(**mail_service)
        sender.send_text_mail(recipients, subject, content + '\r\n\r\n自动发送,请勿回复。')
        return

    if self.spug_key:
        payload = {
            'token': self.spug_key,
            'event': self.event,
            'subject': self.title,
            'body': content,
            'users': list(recipients)
        }
        self._handle_request('spug', f'{spug_server}/apis/notify/mail/', payload)
        return

    Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
def notify_by_dd(event, obj):
_, u_ids = _parse_args(obj.grp)
users = set(x.ding for x in Contact.objects.filter(id__in=u_ids, ding__isnull=False))
if users:
texts = [
'## %s ## ' % ('监控告警通知' if event == '1' else '告警恢复通知'),
f'**告警名称:** <font color="#{"f90202" if event == "1" else "008000"}">{obj.name}</font> ',
f'**告警时间:** {human_datetime()} ',
f'**告警描述:** {obj.out} ',
]
if event == '2':
texts.append(f'**持续时间:** {obj.duration} ')
data = {
'msgtype': 'markdown',
'markdown': {
'title': '监控告警通知',
'text': '\n\n'.join(texts) + '\n\n> ###### 来自 Spug运维平台'
def _by_dd(self):
users = set(x.ding for x in Contact.objects.filter(id__in=self.u_ids, ding__isnull=False))
if users:
texts = [
'## %s ## ' % ('监控告警通知' if self.event == '1' else '告警恢复通知'),
f'**告警名称:** <font color="#{"f90202" if self.event == "1" else "008000"}">{self.title}</font> ',
f'**告警对象:** {self.target} ',
f'**{"告警" if self.event == "1" else "恢复"}时间:** {human_datetime()} ',
f'**告警描述:** {self.message} ',
]
if self.event == '2':
texts.append(f'**持续时间:** {self.duration} ')
data = {
'msgtype': 'markdown',
'markdown': {
'title': '监控告警通知',
'text': '\n\n'.join(texts) + '\n\n> ###### 来自 Spug运维平台'
}
}
}
for url in users:
res = requests.post(url, json=data)
_handle_response(res, 'dd')
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。')
for url in users:
self._handle_request('dd', url, data)
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。')
def notify_by_qy_wx(event, obj):
_, u_ids = _parse_args(obj.grp)
users = set(x.qy_wx for x in Contact.objects.filter(id__in=u_ids, qy_wx__isnull=False))
if users:
color, title = ('warning', '监控告警通知') if event == '1' else ('info', '告警恢复通知')
texts = [
f'## {title}',
f'**告警名称:** <font color="{color}">{obj.name}</font> ',
f'**告警时间:** {human_datetime()} ',
f'**告警描述:** {obj.out} ',
]
if event == '2':
texts.append(f'**持续时间:** {obj.duration} ')
data = {
'msgtype': 'markdown',
'markdown': {
'content': '\n'.join(texts) + '\n> 来自 Spug运维平台'
def _by_qy_wx(self):
users = set(x.qy_wx for x in Contact.objects.filter(id__in=self.u_ids, qy_wx__isnull=False))
if users:
color, title = ('warning', '监控告警通知') if self.event == '1' else ('info', '告警恢复通知')
texts = [
f'## {title}',
f'**告警名称:** <font color="{color}">{self.title}</font> ',
f'**告警对象:** {self.target}',
f'**{"告警" if self.event == "1" else "恢复"}时间:** {human_datetime()} ',
f'**告警描述:** {self.message} ',
]
if self.event == '2':
texts.append(f'**持续时间:** {self.duration} ')
data = {
'msgtype': 'markdown',
'markdown': {
'content': '\n'.join(texts) + '\n> 来自 Spug运维平台'
}
}
}
for url in users:
res = requests.post(url, json=data)
_handle_response(res, 'wx')
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的企业微信。')
for url in users:
self._handle_request('wx', url, data)
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的企业微信。')
def dispatch(self, modes):
    """Send the notification through every channel listed in ``modes``.

    Recognised codes are ``'1'`` (``_by_wx``), ``'3'`` (``_by_dd``),
    ``'4'`` (``_by_email``) and ``'5'`` (``_by_qy_wx``); any other value
    is ignored.
    """
    routes = (
        ('1', '_by_wx'),
        ('3', '_by_dd'),
        ('4', '_by_email'),
        ('5', '_by_qy_wx'),
    )
    for mode in modes:
        for code, sender_name in routes:
            if mode == code:
                # Look the sender up only once a code has matched.
                getattr(self, sender_name)()
                break

View File

@ -1,4 +1,4 @@
apscheduler==3.6.3
apscheduler==3.7.0
Django==2.2.13
channels==2.3.1
channels_redis==2.4.1

View File

@ -42,6 +42,9 @@ class ComTable extends React.Component {
}, {
title: '监控类型',
dataIndex: 'type',
}, {
title: '监控对象',
dataIndex: 'target'
}, {
title: '状态',
dataIndex: 'status',

View File

@ -99,25 +99,12 @@ class ComTable extends React.Component {
<Table.Column title="监控分组" dataIndex="group" />
<Table.Column title="监控名称" dataIndex="name"/>
<Table.Column title="类型" dataIndex="type_alias"/>
<Table.Column ellipsis title="地址" render={info => {
if ('34'.includes(info.type)) {
return lds.get(this.state.hosts, `${info.addr}.name`)
} else {
return info.addr
}
}}/>
<Table.Column title="频率" dataIndex="rate" render={value => `${value}分钟`}/>
<Table.Column title="状态" render={info => {
if (info.is_active) {
if (info['latest_status'] === 0) {
return <Tag color="green">正常</Tag>
} else if (info['latest_status'] === 1) {
return <Tag color="red">异常</Tag>
} else {
return <Tag color="orange">待检测</Tag>
}
return <Tag color="green">监控中</Tag>
} else {
return <Tag>未启用</Tag>
return <Tag color="red">未启用</Tag>
}
}}/>
<Table.Column title="更新于" dataIndex="latest_run_time_alias"