upgrade monitor module

pull/330/head
vapao 2021-04-25 22:26:20 +08:00
parent 540fc3511c
commit b4bcb2606a
11 changed files with 195 additions and 150 deletions

View File

@ -21,6 +21,7 @@ class Alarm(models.Model, ModelMixin):
) )
name = models.CharField(max_length=50) name = models.CharField(max_length=50)
type = models.CharField(max_length=50) type = models.CharField(max_length=50)
target = models.CharField(max_length=100)
notify_mode = models.CharField(max_length=255) notify_mode = models.CharField(max_length=255)
notify_grp = models.CharField(max_length=255) notify_grp = models.CharField(max_length=255)
status = models.CharField(max_length=2, choices=STATUS) status = models.CharField(max_length=2, choices=STATUS)

View File

@ -12,6 +12,8 @@ import logging
MONITOR_WORKER_KEY = settings.MONITOR_WORKER_KEY MONITOR_WORKER_KEY = settings.MONITOR_WORKER_KEY
SCHEDULE_WORKER_KEY = settings.SCHEDULE_WORKER_KEY SCHEDULE_WORKER_KEY = settings.SCHEDULE_WORKER_KEY
logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(message)s')
class Worker: class Worker:
def __init__(self): def __init__(self):

View File

@ -2,13 +2,16 @@
# Copyright: (c) <spug.dev@gmail.com> # Copyright: (c) <spug.dev@gmail.com>
# Released under the AGPL-3.0 License. # Released under the AGPL-3.0 License.
from django.db import close_old_connections from django.db import close_old_connections
from django_redis import get_redis_connection
from apps.host.models import Host from apps.host.models import Host
from apps.monitor.utils import handle_notify
from socket import socket from socket import socket
import subprocess import subprocess
import platform import platform
import requests import requests
import logging import logging
import json import json
import time
logging.captureWarnings(True) logging.captureWarnings(True)
@ -64,8 +67,8 @@ def host_executor(host, command):
def monitor_worker_handler(job): def monitor_worker_handler(job):
print('enter: ', job) task_id, tp, addr, extra, threshold, quiet = json.loads(job)
task_id, tp, addr, extra = json.loads(job) target = addr
if tp == '1': if tp == '1':
is_ok, message = site_check(addr, extra) is_ok, message = site_check(addr, extra)
elif tp == '2': elif tp == '2':
@ -82,20 +85,24 @@ def monitor_worker_handler(job):
is_ok, message = False, f'unknown host id for {addr!r}' is_ok, message = False, f'unknown host id for {addr!r}'
else: else:
is_ok, message = host_executor(host, command) is_ok, message = host_executor(host, command)
target = f'{host.name}({host.hostname})'
rds, key, f_count, f_time = get_redis_connection(), f'spug:det:{task_id}', f'c_{addr}', f't_{addr}'
v_count, v_time = rds.hmget(key, f_count, f_time)
if is_ok:
if v_count:
rds.hdel(key, f_count, f_time)
if v_time:
logging.warning('send recovery notification')
handle_notify(task_id, target, is_ok, message, int(v_count) + 1)
return
v_count = rds.hincrby(key, f_count)
if v_count >= threshold:
if not v_time or int(time.time()) - int(v_time) >= quiet * 60:
rds.hset(key, f_time, int(time.time()))
logging.warning('send fault alarm notification')
handle_notify(task_id, target, is_ok, message, v_count)
# is_notified = True if obj.latest_notify_time else False
# if obj.latest_status in [0, None] and is_ok is False:
# obj.latest_fault_time = int(time.time())
# if is_ok:
# obj.latest_notify_time = 0
# obj.fault_times = 0
# else:
# obj.fault_times += 1
# obj.latest_status = 0 if is_ok else 1
# obj.latest_run_time = human_datetime(event.scheduled_run_time)
# obj.save()
# self._handle_notify(obj, is_notified, out)
def dispatch(tp, addr, extra): def dispatch(tp, addr, extra):
if tp == '1': if tp == '1':

View File

@ -16,13 +16,12 @@ class Detection(models.Model, ModelMixin):
('5', 'Ping检测'), ('5', 'Ping检测'),
) )
STATUS = ( STATUS = (
(0, '成功'), (0, '正常'),
(1, '失败'), (1, '异常'),
) )
name = models.CharField(max_length=50) name = models.CharField(max_length=50)
type = models.CharField(max_length=2, choices=TYPES) type = models.CharField(max_length=2, choices=TYPES)
group = models.CharField(max_length=255, null=True) group = models.CharField(max_length=255, null=True)
addr = models.CharField(max_length=255) # 要删除的
targets = models.TextField() targets = models.TextField()
extra = models.TextField(null=True) extra = models.TextField(null=True)
desc = models.CharField(max_length=255, null=True) desc = models.CharField(max_length=255, null=True)
@ -33,10 +32,7 @@ class Detection(models.Model, ModelMixin):
fault_times = models.SmallIntegerField(default=0) fault_times = models.SmallIntegerField(default=0)
notify_mode = models.CharField(max_length=255) notify_mode = models.CharField(max_length=255)
notify_grp = models.CharField(max_length=255) notify_grp = models.CharField(max_length=255)
latest_status = models.SmallIntegerField(choices=STATUS, null=True)
latest_run_time = models.CharField(max_length=20, null=True) latest_run_time = models.CharField(max_length=20, null=True)
latest_fault_time = models.IntegerField(null=True)
latest_notify_time = models.IntegerField(default=0)
created_at = models.CharField(max_length=20, default=human_datetime) created_at = models.CharField(max_length=20, default=human_datetime)
created_by = models.ForeignKey(User, models.PROTECT, related_name='+') created_by = models.ForeignKey(User, models.PROTECT, related_name='+')
@ -46,7 +42,6 @@ class Detection(models.Model, ModelMixin):
def to_dict(self, *args, **kwargs): def to_dict(self, *args, **kwargs):
tmp = super().to_dict(*args, **kwargs) tmp = super().to_dict(*args, **kwargs)
tmp['type_alias'] = self.get_type_display() tmp['type_alias'] = self.get_type_display()
tmp['latest_status_alias'] = self.get_latest_status_display()
tmp['notify_mode'] = json.loads(self.notify_mode) tmp['notify_mode'] = json.loads(self.notify_mode)
tmp['notify_grp'] = json.loads(self.notify_grp) tmp['notify_grp'] = json.loads(self.notify_grp)
tmp['targets'] = json.loads(self.targets) tmp['targets'] = json.loads(self.targets)

View File

@ -99,12 +99,12 @@ class Scheduler:
obj.save() obj.save()
self._handle_notify(obj, is_notified, out) self._handle_notify(obj, is_notified, out)
def _dispatch(self, task_id, tp, targets, extra): def _dispatch(self, task_id, tp, targets, extra, threshold, quiet):
close_old_connections() close_old_connections()
Detection.objects.filter(pk=task_id).update(latest_run_time=human_datetime()) Detection.objects.filter(pk=task_id).update(latest_run_time=human_datetime())
rds_cli = get_redis_connection() rds_cli = get_redis_connection()
for t in json.loads(targets): for t in json.loads(targets):
rds_cli.rpush(MONITOR_WORKER_KEY, json.dumps([task_id, tp, t, extra])) rds_cli.rpush(MONITOR_WORKER_KEY, json.dumps([task_id, tp, t, extra, threshold, quiet]))
def _init(self): def _init(self):
self.scheduler.start() self.scheduler.start()
@ -114,7 +114,7 @@ class Scheduler:
self._dispatch, self._dispatch,
trigger, trigger,
id=str(item.id), id=str(item.id),
args=(item.id, item.type, item.targets, item.extra), args=(item.id, item.type, item.targets, item.extra, item.threshold, item.quiet),
) )
def run(self): def run(self):
@ -131,7 +131,7 @@ class Scheduler:
self._dispatch, self._dispatch,
trigger, trigger,
id=str(task.id), id=str(task.id),
args=(task.id, task.type, task.targets, task.extra), args=(task.id, task.type, task.targets, task.extra, task.threshold, task.quiet),
replace_existing=True replace_existing=True
) )
elif task.action == 'remove': elif task.action == 'remove':

View File

@ -1,6 +1,13 @@
# Copyright: (c) OpenSpug Organization. https://github.com/openspug/spug # Copyright: (c) OpenSpug Organization. https://github.com/openspug/spug
# Copyright: (c) <spug.dev@gmail.com> # Copyright: (c) <spug.dev@gmail.com>
# Released under the AGPL-3.0 License. # Released under the AGPL-3.0 License.
from django.db import close_old_connections
from apps.alarm.models import Alarm
from apps.monitor.models import Detection
from libs.spug import Notification
import json
def seconds_to_human(seconds): def seconds_to_human(seconds):
text = '' text = ''
if seconds > 3600: if seconds > 3600:
@ -9,4 +16,28 @@ def seconds_to_human(seconds):
if seconds > 60: if seconds > 60:
text += f'{int(seconds / 60)}分钟' text += f'{int(seconds / 60)}分钟'
seconds = seconds % 60 seconds = seconds % 60
return f'{text}{int(seconds)}' if seconds:
text += f'{seconds}'
return text
def _record_alarm(det, target, duration, status):
    """Persist one Alarm row for an event raised by a detection.

    :param det: detection object the event belongs to; its ``name``,
        ``get_type_display()``, ``notify_grp`` and ``notify_mode`` are
        copied onto the alarm record.
    :param target: label of the monitored object the event refers to.
    :param duration: already formatted, human readable duration text.
    :param status: status value stored on the alarm record.
    """
    # Collect the column values first so the create() call stays a one-liner.
    fields = {
        'name': det.name,
        'type': det.get_type_display(),
        'target': target,
        'status': status,
        'duration': duration,
        'notify_grp': det.notify_grp,
        'notify_mode': det.notify_mode,
    }
    Alarm.objects.create(**fields)
def handle_notify(task_id, target, is_ok, out, fault_times):
    """Record an alarm for a detection and send it to the configured channels.

    :param task_id: primary key of the ``Detection`` that produced the event.
    :param target: label of the monitored object the event refers to.
    :param is_ok: truthy for a recovery event, falsy for a fault event.
    :param out: message produced by the check; used as the notification body.
    :param fault_times: number of check runs the fault has lasted; combined
        with ``det.rate`` to compute the duration (rate is presumably in
        minutes, hence the ``* 60`` - confirm against the model).
    :return: ``None``. When the detection no longer exists, nothing is
        recorded or sent.
    """
    import logging

    # Callers run in long-lived worker code; drop stale DB connections first.
    close_old_connections()
    try:
        det = Detection.objects.get(pk=task_id)
    except Detection.DoesNotExist:
        # The detection can be removed while its job is still queued; there
        # is nobody left to notify, so skip instead of crashing the worker.
        logging.warning('skip notification, detection %s no longer exists', task_id)
        return

    duration = seconds_to_human(det.rate * fault_times * 60)
    # Event codes understood by Notification: '1' = fault, '2' = recovery.
    event = '2' if is_ok else '1'
    _record_alarm(det, target, duration, event)

    # notify_grp / notify_mode are stored as JSON encoded lists on the model.
    grp = json.loads(det.notify_grp)
    notify = Notification(grp, event, target, det.name, out, duration)
    notify.dispatch(json.loads(det.notify_mode))

View File

@ -32,7 +32,7 @@ class Notify(models.Model, ModelMixin):
if not with_quiet or time.time() - cache.get('spug:notify_quiet', 0) > 3600: if not with_quiet or time.time() - cache.get('spug:notify_quiet', 0) > 3600:
cache.set('spug:notify_quiet', time.time()) cache.set('spug:notify_quiet', time.time())
cls.objects.create(source=source, title=title, type=type, content=content) cls.objects.create(source=source, title=title, type=type, content=content)
Channel.send_notify(title, content) Channel.send_notify(title, content)
def __repr__(self): def __repr__(self):
return '<Notify %r>' % self.title return '<Notify %r>' % self.title

View File

@ -9,125 +9,144 @@ from libs.utils import human_datetime
import requests import requests
import json import json
spug_server = 'http://spug-wx.qbangmang.com' spug_server = 'https://api.spug.cc'
notify_source = 'monitor' notify_source = 'monitor'
def _parse_args(grp): class Notification:
spug_key = AppSetting.get_default('spug_key') def __init__(self, grp, event, target, title, message, duration):
return spug_key, sum([json.loads(x.contacts) for x in Group.objects.filter(id__in=grp)], []) self.event = event
self.title = title
self.target = target
self.message = message
self.duration = duration
self.spug_key, self.u_ids = self._parse_args(grp)
def _parse_args(self, grp):
spug_key = AppSetting.get_default('spug_key')
return spug_key, sum([json.loads(x.contacts) for x in Group.objects.filter(id__in=grp)], [])
def _handle_response(res, mode): def _handle_request(self, mode, url, data):
if res.status_code != 200: try:
Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回状态码:{res.status_code}, 请求URL{res.url}') res = requests.post(url, json=data, timeout=30)
if mode in ['dd', 'wx']: except Exception as e:
res = res.json() Notify.make_notify(notify_source, '1', '告警通知发送失败', f'接口调用异常:{e}')
if res.get('errcode') != 0: return
Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回数据:{res}') if res.status_code != 200:
if mode == 'spug': Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回状态码:{res.status_code}, 请求URL{res.url}')
res = res.json() if mode in ['dd', 'wx']:
if res.get('error'): res = res.json()
Notify.make_notify(notify_source, '1', '告警通知发送失败', f'错误信息:{res}') if res.get('errcode') != 0:
Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回数据:{res}')
if mode == 'spug':
res = res.json()
if res.get('error'):
Notify.make_notify(notify_source, '1', '告警通知发送失败', f'错误信息:{res}')
def _by_wx(self):
def notify_by_wx(event, obj): if not self.spug_key:
spug_key, u_ids = _parse_args(obj.grp) Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
if not spug_key: return
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。') users = set(x.wx_token for x in Contact.objects.filter(id__in=self.u_ids, wx_token__isnull=False))
return if users:
users = set(x.wx_token for x in Contact.objects.filter(id__in=u_ids, wx_token__isnull=False))
if users:
data = {
'token': spug_key,
'event': event,
'subject': obj.name,
'desc': obj.out,
'remark': f'故障持续{obj.duration}' if event == '2' else None,
'users': list(users)
}
res = requests.post(f'{spug_server}/apis/notify/wx/', json=data)
_handle_response(res, 'spug')
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象请确保设置了相关报警联系人的微信Token。')
def notify_by_email(event, obj):
spug_key, u_ids = _parse_args(obj.grp)
users = set(x.email for x in Contact.objects.filter(id__in=u_ids, email__isnull=False))
if users:
mail_service = json.loads(AppSetting.get_default('mail_service', '{}'))
body = ['告警名称:' + obj.name, '告警时间:' + human_datetime(), '告警描述:' + obj.out]
if event == '2':
body.append('故障持续:' + obj.duration)
if mail_service.get('server'):
event_map = {'1': '告警发生', '2': '告警恢复'}
subject = f'{event_map[event]}-{obj.name}'
mail = Mail(**mail_service)
mail.send_text_mail(users, subject, '\r\n'.join(body) + '\r\n\r\n自动发送,请勿回复。')
elif spug_key:
data = { data = {
'token': spug_key, 'token': self.spug_key,
'event': event, 'event': self.event,
'subject': obj.name, 'subject': self.title,
'body': '\r\n'.join(body), 'desc': self.message,
'remark': f'故障持续{self.duration}' if self.event == '2' else None,
'users': list(users) 'users': list(users)
} }
res = requests.post(f'{spug_server}/apis/notify/mail/', json=data) self._handle_request('spug', f'{spug_server}/apis/notify/wx/', data)
_handle_response(res, 'spug')
else: else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。') Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象请确保设置了相关报警联系人的微信Token。')
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。')
def _by_email(self):
users = set(x.email for x in Contact.objects.filter(id__in=self.u_ids, email__isnull=False))
if users:
mail_service = json.loads(AppSetting.get_default('mail_service', '{}'))
body = [
f'告警名称:{self.title}',
f'告警对象:{self.target}',
f'{"告警" if self.event == "1" else "恢复"}时间:{human_datetime()}',
f'告警描述:{self.message}'
]
if self.event == '2':
body.append('故障持续:' + self.duration)
if mail_service.get('server'):
event_map = {'1': '监控告警通知', '2': '告警恢复通知'}
subject = f'{event_map[self.event]}-{self.title}'
mail = Mail(**mail_service)
mail.send_text_mail(users, subject, '\r\n'.join(body) + '\r\n\r\n自动发送,请勿回复。')
elif self.spug_key:
data = {
'token': self.spug_key,
'event': self.event,
'subject': self.title,
'body': '\r\n'.join(body),
'users': list(users)
}
self._handle_request('spug', f'{spug_server}/apis/notify/mail/', data)
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。')
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。')
def notify_by_dd(event, obj): def _by_dd(self):
_, u_ids = _parse_args(obj.grp) users = set(x.ding for x in Contact.objects.filter(id__in=self.u_ids, ding__isnull=False))
users = set(x.ding for x in Contact.objects.filter(id__in=u_ids, ding__isnull=False)) if users:
if users: texts = [
texts = [ '## %s ## ' % ('监控告警通知' if self.event == '1' else '告警恢复通知'),
'## %s ## ' % ('监控告警通知' if event == '1' else '告警恢复通知'), f'**告警名称:** <font color="#{"f90202" if self.event == "1" else "008000"}">{self.title}</font> ',
f'**告警名称:** <font color="#{"f90202" if event == "1" else "008000"}">{obj.name}</font> ', f'**告警对象:** {self.target} ',
f'**告警时间:** {human_datetime()} ', f'**{"告警" if self.event == "1" else "恢复"}时间:** {human_datetime()} ',
f'**告警描述:** {obj.out} ', f'**告警描述:** {self.message} ',
] ]
if event == '2': if self.event == '2':
texts.append(f'**持续时间:** {obj.duration} ') texts.append(f'**持续时间:** {self.duration} ')
data = { data = {
'msgtype': 'markdown', 'msgtype': 'markdown',
'markdown': { 'markdown': {
'title': '监控告警通知', 'title': '监控告警通知',
'text': '\n\n'.join(texts) + '\n\n> ###### 来自 Spug运维平台' 'text': '\n\n'.join(texts) + '\n\n> ###### 来自 Spug运维平台'
}
} }
} for url in users:
for url in users: self._handle_request('dd', url, data)
res = requests.post(url, json=data) else:
_handle_response(res, 'dd') Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。')
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。')
def _by_qy_wx(self):
def notify_by_qy_wx(event, obj): users = set(x.qy_wx for x in Contact.objects.filter(id__in=self.u_ids, qy_wx__isnull=False))
_, u_ids = _parse_args(obj.grp) if users:
users = set(x.qy_wx for x in Contact.objects.filter(id__in=u_ids, qy_wx__isnull=False)) color, title = ('warning', '监控告警通知') if self.event == '1' else ('info', '告警恢复通知')
if users: texts = [
color, title = ('warning', '监控告警通知') if event == '1' else ('info', '告警恢复通知') f'## {title}',
texts = [ f'**告警名称:** <font color="{color}">{self.title}</font> ',
f'## {title}', f'**告警对象:** {self.target}',
f'**告警名称:** <font color="{color}">{obj.name}</font> ', f'**{"告警" if self.event == "1" else "恢复"}时间:** {human_datetime()} ',
f'**告警时间:** {human_datetime()} ', f'**告警描述:** {self.message} ',
f'**告警描述:** {obj.out} ', ]
] if self.event == '2':
if event == '2': texts.append(f'**持续时间:** {self.duration} ')
texts.append(f'**持续时间:** {obj.duration} ') data = {
data = { 'msgtype': 'markdown',
'msgtype': 'markdown', 'markdown': {
'markdown': { 'content': '\n'.join(texts) + '\n> 来自 Spug运维平台'
'content': '\n'.join(texts) + '\n> 来自 Spug运维平台' }
} }
} for url in users:
for url in users: self._handle_request('wx', url, data)
res = requests.post(url, json=data) else:
_handle_response(res, 'wx') Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的企业微信。')
else:
Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的企业微信。') def dispatch(self, modes):
for mode in modes:
if mode == '1':
self._by_wx()
elif mode == '3':
self._by_dd()
elif mode == '4':
self._by_email()
elif mode == '5':
self._by_qy_wx()

View File

@ -1,4 +1,4 @@
apscheduler==3.6.3 apscheduler==3.7.0
Django==2.2.13 Django==2.2.13
channels==2.3.1 channels==2.3.1
channels_redis==2.4.1 channels_redis==2.4.1

View File

@ -42,6 +42,9 @@ class ComTable extends React.Component {
}, { }, {
title: '监控类型', title: '监控类型',
dataIndex: 'type', dataIndex: 'type',
}, {
title: '监控对象',
dataIndex: 'target'
}, { }, {
title: '状态', title: '状态',
dataIndex: 'status', dataIndex: 'status',

View File

@ -99,25 +99,12 @@ class ComTable extends React.Component {
<Table.Column title="监控分组" dataIndex="group" /> <Table.Column title="监控分组" dataIndex="group" />
<Table.Column title="监控名称" dataIndex="name"/> <Table.Column title="监控名称" dataIndex="name"/>
<Table.Column title="类型" dataIndex="type_alias"/> <Table.Column title="类型" dataIndex="type_alias"/>
<Table.Column ellipsis title="地址" render={info => {
if ('34'.includes(info.type)) {
return lds.get(this.state.hosts, `${info.addr}.name`)
} else {
return info.addr
}
}}/>
<Table.Column title="频率" dataIndex="rate" render={value => `${value}分钟`}/> <Table.Column title="频率" dataIndex="rate" render={value => `${value}分钟`}/>
<Table.Column title="状态" render={info => { <Table.Column title="状态" render={info => {
if (info.is_active) { if (info.is_active) {
if (info['latest_status'] === 0) { return <Tag color="green">监控中</Tag>
return <Tag color="green">正常</Tag>
} else if (info['latest_status'] === 1) {
return <Tag color="red">异常</Tag>
} else {
return <Tag color="orange">待检测</Tag>
}
} else { } else {
return <Tag>未启用</Tag> return <Tag color="red">未启用</Tag>
} }
}}/> }}/>
<Table.Column title="更新于" dataIndex="latest_run_time_alias" <Table.Column title="更新于" dataIndex="latest_run_time_alias"