From b4bcb2606a99175ac74f430c43e73a4fa7d26b3d Mon Sep 17 00:00:00 2001 From: vapao Date: Sun, 25 Apr 2021 22:26:20 +0800 Subject: [PATCH] upgrade monitor module --- spug_api/apps/alarm/models.py | 1 + .../exec/management/commands/runexecutor.py | 2 + spug_api/apps/monitor/executors.py | 35 +-- spug_api/apps/monitor/models.py | 9 +- spug_api/apps/monitor/scheduler.py | 8 +- spug_api/apps/monitor/utils.py | 33 ++- spug_api/apps/notify/models.py | 2 +- spug_api/libs/spug.py | 233 ++++++++++-------- spug_api/requirements.txt | 2 +- spug_web/src/pages/alarm/alarm/Table.js | 3 + spug_web/src/pages/monitor/Table.js | 17 +- 11 files changed, 195 insertions(+), 150 deletions(-) diff --git a/spug_api/apps/alarm/models.py b/spug_api/apps/alarm/models.py index 6cafefa..3ffbc86 100644 --- a/spug_api/apps/alarm/models.py +++ b/spug_api/apps/alarm/models.py @@ -21,6 +21,7 @@ class Alarm(models.Model, ModelMixin): ) name = models.CharField(max_length=50) type = models.CharField(max_length=50) + target = models.CharField(max_length=100) notify_mode = models.CharField(max_length=255) notify_grp = models.CharField(max_length=255) status = models.CharField(max_length=2, choices=STATUS) diff --git a/spug_api/apps/exec/management/commands/runexecutor.py b/spug_api/apps/exec/management/commands/runexecutor.py index 2a51cc8..d9ef35a 100644 --- a/spug_api/apps/exec/management/commands/runexecutor.py +++ b/spug_api/apps/exec/management/commands/runexecutor.py @@ -12,6 +12,8 @@ import logging MONITOR_WORKER_KEY = settings.MONITOR_WORKER_KEY SCHEDULE_WORKER_KEY = settings.SCHEDULE_WORKER_KEY +logging.basicConfig(level=logging.WARNING, format='%(asctime)s %(message)s') + class Worker: def __init__(self): diff --git a/spug_api/apps/monitor/executors.py b/spug_api/apps/monitor/executors.py index b2cf0c7..7614e53 100644 --- a/spug_api/apps/monitor/executors.py +++ b/spug_api/apps/monitor/executors.py @@ -2,13 +2,16 @@ # Copyright: (c) # Released under the AGPL-3.0 License. from django.db import close_old_connections +from django_redis import get_redis_connection from apps.host.models import Host +from apps.monitor.utils import handle_notify from socket import socket import subprocess import platform import requests import logging import json +import time logging.captureWarnings(True) @@ -64,8 +67,8 @@ def host_executor(host, command): def monitor_worker_handler(job): - print('enter: ', job) - task_id, tp, addr, extra = json.loads(job) + task_id, tp, addr, extra, threshold, quiet = json.loads(job) + target = addr if tp == '1': is_ok, message = site_check(addr, extra) elif tp == '2': @@ -82,20 +85,24 @@ def monitor_worker_handler(job): is_ok, message = False, f'unknown host id for {addr!r}' else: is_ok, message = host_executor(host, command) + target = f'{host.name}({host.hostname})' + rds, key, f_count, f_time = get_redis_connection(), f'spug:det:{task_id}', f'c_{addr}', f't_{addr}' + v_count, v_time = rds.hmget(key, f_count, f_time) + if is_ok: + if v_count: + rds.hdel(key, f_count, f_time) + if v_time: + logging.warning('send recovery notification') + handle_notify(task_id, target, is_ok, message, int(v_count) + 1) + return + v_count = rds.hincrby(key, f_count) + if v_count >= threshold: + if not v_time or int(time.time()) - int(v_time) >= quiet * 60: + rds.hset(key, f_time, int(time.time())) + logging.warning('send fault alarm notification') + handle_notify(task_id, target, is_ok, message, v_count) - # is_notified = True if obj.latest_notify_time else False - # if obj.latest_status in [0, None] and is_ok is False: - # obj.latest_fault_time = int(time.time()) - # if is_ok: - # obj.latest_notify_time = 0 - # obj.fault_times = 0 - # else: - # obj.fault_times += 1 - # obj.latest_status = 0 if is_ok else 1 - # obj.latest_run_time = human_datetime(event.scheduled_run_time) - # obj.save() - # self._handle_notify(obj, is_notified, out) def dispatch(tp, addr, extra): if tp == '1': diff --git a/spug_api/apps/monitor/models.py b/spug_api/apps/monitor/models.py index 17e765b..afc4579 100644 --- a/spug_api/apps/monitor/models.py +++ b/spug_api/apps/monitor/models.py @@ -16,13 +16,12 @@ class Detection(models.Model, ModelMixin): ('5', 'Ping检测'), ) STATUS = ( - (0, '成功'), - (1, '失败'), + (0, '正常'), + (1, '异常'), ) name = models.CharField(max_length=50) type = models.CharField(max_length=2, choices=TYPES) group = models.CharField(max_length=255, null=True) - addr = models.CharField(max_length=255) # 要删除的 targets = models.TextField() extra = models.TextField(null=True) desc = models.CharField(max_length=255, null=True) @@ -33,10 +32,7 @@ class Detection(models.Model, ModelMixin): fault_times = models.SmallIntegerField(default=0) notify_mode = models.CharField(max_length=255) notify_grp = models.CharField(max_length=255) - latest_status = models.SmallIntegerField(choices=STATUS, null=True) latest_run_time = models.CharField(max_length=20, null=True) - latest_fault_time = models.IntegerField(null=True) - latest_notify_time = models.IntegerField(default=0) created_at = models.CharField(max_length=20, default=human_datetime) created_by = models.ForeignKey(User, models.PROTECT, related_name='+') @@ -46,7 +42,6 @@ class Detection(models.Model, ModelMixin): def to_dict(self, *args, **kwargs): tmp = super().to_dict(*args, **kwargs) tmp['type_alias'] = self.get_type_display() - tmp['latest_status_alias'] = self.get_latest_status_display() tmp['notify_mode'] = json.loads(self.notify_mode) tmp['notify_grp'] = json.loads(self.notify_grp) tmp['targets'] = json.loads(self.targets) diff --git a/spug_api/apps/monitor/scheduler.py b/spug_api/apps/monitor/scheduler.py index 6b49473..b0cace6 100644 --- a/spug_api/apps/monitor/scheduler.py +++ b/spug_api/apps/monitor/scheduler.py @@ -99,12 +99,12 @@ class Scheduler: obj.save() self._handle_notify(obj, is_notified, out) - def _dispatch(self, task_id, tp, targets, extra): + def _dispatch(self, task_id, tp, targets, extra, threshold, quiet): close_old_connections() Detection.objects.filter(pk=task_id).update(latest_run_time=human_datetime()) rds_cli = get_redis_connection() for t in json.loads(targets): - rds_cli.rpush(MONITOR_WORKER_KEY, json.dumps([task_id, tp, t, extra])) + rds_cli.rpush(MONITOR_WORKER_KEY, json.dumps([task_id, tp, t, extra, threshold, quiet])) def _init(self): self.scheduler.start() @@ -114,7 +114,7 @@ class Scheduler: self._dispatch, trigger, id=str(item.id), - args=(item.id, item.type, item.targets, item.extra), + args=(item.id, item.type, item.targets, item.extra, item.threshold, item.quiet), ) def run(self): @@ -131,7 +131,7 @@ class Scheduler: self._dispatch, trigger, id=str(task.id), - args=(task.id, task.type, task.targets, task.extra), + args=(task.id, task.type, task.targets, task.extra, task.threshold, task.quiet), replace_existing=True ) elif task.action == 'remove': diff --git a/spug_api/apps/monitor/utils.py b/spug_api/apps/monitor/utils.py index 4f26150..2fe9e5d 100644 --- a/spug_api/apps/monitor/utils.py +++ b/spug_api/apps/monitor/utils.py @@ -1,6 +1,13 @@ # Copyright: (c) OpenSpug Organization. https://github.com/openspug/spug # Copyright: (c) # Released under the AGPL-3.0 License. +from django.db import close_old_connections +from apps.alarm.models import Alarm +from apps.monitor.models import Detection +from libs.spug import Notification +import json + + def seconds_to_human(seconds): text = '' if seconds > 3600: @@ -9,4 +16,28 @@ def seconds_to_human(seconds): if seconds > 60: text += f'{int(seconds / 60)}分钟' seconds = seconds % 60 - return f'{text}{int(seconds)}秒' + if seconds: + text += f'{seconds}秒' + return text + + +def _record_alarm(det, target, duration, status): + Alarm.objects.create( + name=det.name, + type=det.get_type_display(), + target=target, + status=status, + duration=duration, + notify_grp=det.notify_grp, + notify_mode=det.notify_mode) + + +def handle_notify(task_id, target, is_ok, out, fault_times): + close_old_connections() + det = Detection.objects.get(pk=task_id) + duration = seconds_to_human(det.rate * fault_times * 60) + event = '2' if is_ok else '1' + _record_alarm(det, target, duration, event) + grp = json.loads(det.notify_grp) + notify = Notification(grp, event, target, det.name, out, duration) + notify.dispatch(json.loads(det.notify_mode)) diff --git a/spug_api/apps/notify/models.py b/spug_api/apps/notify/models.py index c5c7516..ecb346a 100644 --- a/spug_api/apps/notify/models.py +++ b/spug_api/apps/notify/models.py @@ -32,7 +32,7 @@ class Notify(models.Model, ModelMixin): if not with_quiet or time.time() - cache.get('spug:notify_quiet', 0) > 3600: cache.set('spug:notify_quiet', time.time()) cls.objects.create(source=source, title=title, type=type, content=content) - Channel.send_notify(title, content) + Channel.send_notify(title, content) def __repr__(self): return '' % self.title diff --git a/spug_api/libs/spug.py b/spug_api/libs/spug.py index 3e723bb..dac6850 100644 --- a/spug_api/libs/spug.py +++ b/spug_api/libs/spug.py @@ -9,125 +9,144 @@ from libs.utils import human_datetime import requests import json -spug_server = 'http://spug-wx.qbangmang.com' +spug_server = 'https://api.spug.cc' notify_source = 'monitor' -def _parse_args(grp): - spug_key = AppSetting.get_default('spug_key') - return spug_key, sum([json.loads(x.contacts) for x in Group.objects.filter(id__in=grp)], []) +class Notification: + def __init__(self, grp, event, target, title, message, duration): + self.event = event + self.title = title + self.target = target + self.message = message + self.duration = duration + self.spug_key, self.u_ids = self._parse_args(grp) + def _parse_args(self, grp): + spug_key = AppSetting.get_default('spug_key') + return spug_key, sum([json.loads(x.contacts) for x in Group.objects.filter(id__in=grp)], []) -def _handle_response(res, mode): - if res.status_code != 200: - Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回状态码:{res.status_code}, 请求URL:{res.url}') - if mode in ['dd', 'wx']: - res = res.json() - if res.get('errcode') != 0: - Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回数据:{res}') - if mode == 'spug': - res = res.json() - if res.get('error'): - Notify.make_notify(notify_source, '1', '告警通知发送失败', f'错误信息:{res}') + def _handle_request(self, mode, url, data): + try: + res = requests.post(url, json=data, timeout=30) + except Exception as e: + Notify.make_notify(notify_source, '1', '告警通知发送失败', f'接口调用异常:{e}') + return + if res.status_code != 200: + Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回状态码:{res.status_code}, 请求URL:{res.url}') + if mode in ['dd', 'wx']: + res = res.json() + if res.get('errcode') != 0: + Notify.make_notify(notify_source, '1', '告警通知发送失败', f'返回数据:{res}') + if mode == 'spug': + res = res.json() + if res.get('error'): + Notify.make_notify(notify_source, '1', '告警通知发送失败', f'错误信息:{res}') - -def notify_by_wx(event, obj): - spug_key, u_ids = _parse_args(obj.grp) - if not spug_key: - Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。') - return - users = set(x.wx_token for x in Contact.objects.filter(id__in=u_ids, wx_token__isnull=False)) - if users: - data = { - 'token': spug_key, - 'event': event, - 'subject': obj.name, - 'desc': obj.out, - 'remark': f'故障持续{obj.duration}' if event == '2' else None, - 'users': list(users) - } - res = requests.post(f'{spug_server}/apis/notify/wx/', json=data) - _handle_response(res, 'spug') - else: - Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的微信Token。') - - -def notify_by_email(event, obj): - spug_key, u_ids = _parse_args(obj.grp) - users = set(x.email for x in Contact.objects.filter(id__in=u_ids, email__isnull=False)) - if users: - mail_service = json.loads(AppSetting.get_default('mail_service', '{}')) - body = ['告警名称:' + obj.name, '告警时间:' + human_datetime(), '告警描述:' + obj.out] - if event == '2': - body.append('故障持续:' + obj.duration) - if mail_service.get('server'): - event_map = {'1': '告警发生', '2': '告警恢复'} - subject = f'{event_map[event]}-{obj.name}' - mail = Mail(**mail_service) - mail.send_text_mail(users, subject, '\r\n'.join(body) + '\r\n\r\n自动发送,请勿回复。') - elif spug_key: + def _by_wx(self): + if not self.spug_key: + Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。') + return + users = set(x.wx_token for x in Contact.objects.filter(id__in=self.u_ids, wx_token__isnull=False)) + if users: data = { - 'token': spug_key, - 'event': event, - 'subject': obj.name, - 'body': '\r\n'.join(body), + 'token': self.spug_key, + 'event': self.event, + 'subject': self.title, + 'desc': self.message, + 'remark': f'故障持续{self.duration}' if self.event == '2' else None, 'users': list(users) } - res = requests.post(f'{spug_server}/apis/notify/mail/', json=data) - _handle_response(res, 'spug') + self._handle_request('spug', f'{spug_server}/apis/notify/wx/', data) else: - Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。') - else: - Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。') + Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的微信Token。') + def _by_email(self): + users = set(x.email for x in Contact.objects.filter(id__in=self.u_ids, email__isnull=False)) + if users: + mail_service = json.loads(AppSetting.get_default('mail_service', '{}')) + body = [ + f'告警名称:{self.title}', + f'告警对象:{self.target}', + f'{"告警" if self.event == "1" else "恢复"}时间:{human_datetime()}', + f'告警描述:{self.message}' + ] + if self.event == '2': + body.append('故障持续:' + self.duration) + if mail_service.get('server'): + event_map = {'1': '监控告警通知', '2': '告警恢复通知'} + subject = f'{event_map[self.event]}-{self.title}' + mail = Mail(**mail_service) + mail.send_text_mail(users, subject, '\r\n'.join(body) + '\r\n\r\n自动发送,请勿回复。') + elif self.spug_key: + data = { + 'token': self.spug_key, + 'event': self.event, + 'subject': self.title, + 'body': '\r\n'.join(body), + 'users': list(users) + } + self._handle_request('spug', f'{spug_server}/apis/notify/mail/', data) + else: + Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。') + else: + Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。') -def notify_by_dd(event, obj): - _, u_ids = _parse_args(obj.grp) - users = set(x.ding for x in Contact.objects.filter(id__in=u_ids, ding__isnull=False)) - if users: - texts = [ - '## %s ## ' % ('监控告警通知' if event == '1' else '告警恢复通知'), - f'**告警名称:** {obj.name} ', - f'**告警时间:** {human_datetime()} ', - f'**告警描述:** {obj.out} ', - ] - if event == '2': - texts.append(f'**持续时间:** {obj.duration} ') - data = { - 'msgtype': 'markdown', - 'markdown': { - 'title': '监控告警通知', - 'text': '\n\n'.join(texts) + '\n\n> ###### 来自 Spug运维平台' + def _by_dd(self): + users = set(x.ding for x in Contact.objects.filter(id__in=self.u_ids, ding__isnull=False)) + if users: + texts = [ + '## %s ## ' % ('监控告警通知' if self.event == '1' else '告警恢复通知'), + f'**告警名称:** {self.title} ', + f'**告警对象:** {self.target} ', + f'**{"告警" if self.event == "1" else "恢复"}时间:** {human_datetime()} ', + f'**告警描述:** {self.message} ', + ] + if self.event == '2': + texts.append(f'**持续时间:** {self.duration} ') + data = { + 'msgtype': 'markdown', + 'markdown': { + 'title': '监控告警通知', + 'text': '\n\n'.join(texts) + '\n\n> ###### 来自 Spug运维平台' + } } - } - for url in users: - res = requests.post(url, json=data) - _handle_response(res, 'dd') - else: - Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。') + for url in users: + self._handle_request('dd', url, data) + else: + Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的钉钉。') - -def notify_by_qy_wx(event, obj): - _, u_ids = _parse_args(obj.grp) - users = set(x.qy_wx for x in Contact.objects.filter(id__in=u_ids, qy_wx__isnull=False)) - if users: - color, title = ('warning', '监控告警通知') if event == '1' else ('info', '告警恢复通知') - texts = [ - f'## {title}', - f'**告警名称:** {obj.name} ', - f'**告警时间:** {human_datetime()} ', - f'**告警描述:** {obj.out} ', - ] - if event == '2': - texts.append(f'**持续时间:** {obj.duration} ') - data = { - 'msgtype': 'markdown', - 'markdown': { - 'content': '\n'.join(texts) + '\n> 来自 Spug运维平台' + def _by_qy_wx(self): + users = set(x.qy_wx for x in Contact.objects.filter(id__in=self.u_ids, qy_wx__isnull=False)) + if users: + color, title = ('warning', '监控告警通知') if self.event == '1' else ('info', '告警恢复通知') + texts = [ + f'## {title}', + f'**告警名称:** {self.title} ', + f'**告警对象:** {self.target}', + f'**{"告警" if self.event == "1" else "恢复"}时间:** {human_datetime()} ', + f'**告警描述:** {self.message} ', + ] + if self.event == '2': + texts.append(f'**持续时间:** {self.duration} ') + data = { + 'msgtype': 'markdown', + 'markdown': { + 'content': '\n'.join(texts) + '\n> 来自 Spug运维平台' + } } - } - for url in users: - res = requests.post(url, json=data) - _handle_response(res, 'wx') - else: - Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的企业微信。') + for url in users: + self._handle_request('wx', url, data) + else: + Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的企业微信。') + + def dispatch(self, modes): + for mode in modes: + if mode == '1': + self._by_wx() + elif mode == '3': + self._by_dd() + elif mode == '4': + self._by_email() + elif mode == '5': + self._by_qy_wx() diff --git a/spug_api/requirements.txt b/spug_api/requirements.txt index 339e71e..469b744 100644 --- a/spug_api/requirements.txt +++ b/spug_api/requirements.txt @@ -1,4 +1,4 @@ -apscheduler==3.6.3 +apscheduler==3.7.0 Django==2.2.13 channels==2.3.1 channels_redis==2.4.1 diff --git a/spug_web/src/pages/alarm/alarm/Table.js b/spug_web/src/pages/alarm/alarm/Table.js index 32315e8..42afd6d 100644 --- a/spug_web/src/pages/alarm/alarm/Table.js +++ b/spug_web/src/pages/alarm/alarm/Table.js @@ -42,6 +42,9 @@ class ComTable extends React.Component { }, { title: '监控类型', dataIndex: 'type', + }, { + title: '监控对象', + dataIndex: 'target' }, { title: '状态', dataIndex: 'status', diff --git a/spug_web/src/pages/monitor/Table.js b/spug_web/src/pages/monitor/Table.js index e68f91f..601d979 100644 --- a/spug_web/src/pages/monitor/Table.js +++ b/spug_web/src/pages/monitor/Table.js @@ -99,25 +99,12 @@ class ComTable extends React.Component { - { - if ('34'.includes(info.type)) { - return lds.get(this.state.hosts, `${info.addr}.name`) - } else { - return info.addr - } - }}/> `${value}分钟`}/> { if (info.is_active) { - if (info['latest_status'] === 0) { - return 正常 - } else if (info['latest_status'] === 1) { - return 异常 - } else { - return 待检测 - } + return 监控中 } else { - return 未启用 + return 未启用 } }}/>