diff --git a/spug_api/apps/monitor/executors.py b/spug_api/apps/monitor/executors.py index 2df3697..2ae6f72 100644 --- a/spug_api/apps/monitor/executors.py +++ b/spug_api/apps/monitor/executors.py @@ -12,27 +12,30 @@ logging.captureWarnings(True) def site_check(url): - status_code = -1 try: res = requests.get(url, timeout=10, verify=False) - status_code = res.status_code - finally: - return status_code == 200 + return 200 <= res.status_code < 400, f'返回状态码:{res.status_code}' + except Exception as e: + return False, f'异常信息:{e}' def port_check(addr, port): - sock = socket() - sock.settimeout(5) - return sock.connect_ex((addr, int(port))) == 0 + try: + sock = socket() + sock.settimeout(5) + sock.connect((addr, int(port))) + return True, None + except Exception as e: + return False, f'异常信息:{e}' def host_executor(host, pkey, command): - exit_code = -1 try: cli = SSH(host.hostname, host.port, host.username, pkey=pkey) - exit_code, _ = cli.exec_command(command) - finally: - return exit_code == 0 + exit_code, out = cli.exec_command(command) + return exit_code == 0, out.decode() + except Exception as e: + return False, f'异常信息:{e}' def dispatch(tp, addr, extra): diff --git a/spug_api/apps/monitor/scheduler.py b/spug_api/apps/monitor/scheduler.py index 3572074..f5e9edb 100644 --- a/spug_api/apps/monitor/scheduler.py +++ b/spug_api/apps/monitor/scheduler.py @@ -13,7 +13,8 @@ from apps.monitor.executors import dispatch from apps.monitor.utils import seconds_to_human from apps.notify.models import Notify from django.conf import settings -from libs import spug, AttrDict, human_datetime +from libs import spug, AttrDict, human_datetime, human_diff_time +from datetime import datetime import logging import json import time @@ -40,22 +41,25 @@ class Scheduler: notify_grp=obj.notify_grp, notify_mode=obj.notify_mode) - def _do_notify(self, event, obj): - grp = json.loads(obj.notify_grp) + def _do_notify(self, event, obj, out): + obj.out = out + obj.grp = json.loads(obj.notify_grp) + if event == '2': + obj.duration = human_diff_time(datetime.now(), datetime.fromtimestamp(obj.latest_fault_time)) for mode in json.loads(obj.notify_mode): if mode == '1': - spug.notify_by_wx(event, obj.name, grp) + spug.notify_by_wx(event, obj) elif mode == '3': - spug.notify_by_dd(event, obj.name, grp) + spug.notify_by_dd(event, obj) elif mode == '4': - spug.notify_by_email(event, obj.name, grp) + spug.notify_by_email(event, obj) - def _handle_notify(self, obj, old_status): + def _handle_notify(self, obj, old_status, out): if obj.latest_status == 0: if old_status == 1: self._record_alarm(obj, '2') logger.info(f'{human_datetime()} recover job_id: {obj.id}') - self._do_notify('2', obj) + self._do_notify('2', obj, out) else: if obj.fault_times >= obj.threshold: if time.time() - obj.latest_notify_time >= obj.quiet * 60: @@ -63,7 +67,7 @@ class Scheduler: obj.save() self._record_alarm(obj, '1') logger.info(f'{human_datetime()} notify job_id: {obj.id}') - self._do_notify('1', obj) + self._do_notify('1', obj, out) def _handle_event(self, event): close_old_connections() @@ -78,11 +82,12 @@ class Scheduler: logger.info(f'EVENT_JOB_ERROR: job_id {event.job_id} exception: {event.exception}') Notify.make_notify('monitor', '1', f'{obj.name} - 执行异常', f'{event.exception}') elif event.code == EVENT_JOB_EXECUTED: + is_ok, out = event.retval obj = Detection.objects.filter(pk=event.job_id).first() old_status = obj.latest_status - obj.latest_status = 0 if event.retval else 1 + obj.latest_status = 0 if is_ok else 1 obj.latest_run_time = human_datetime(event.scheduled_run_time) - if old_status in [0, None] and event.retval is False: + if old_status in [0, None] and is_ok is False: obj.latest_fault_time = int(time.time()) if obj.latest_status == 0: obj.latest_notify_time = 0 @@ -90,7 +95,7 @@ class Scheduler: else: obj.fault_times += 1 obj.save() - self._handle_notify(obj, old_status) + self._handle_notify(obj, old_status, out) def _init(self): self.scheduler.start() diff --git a/spug_api/libs/spug.py b/spug_api/libs/spug.py index 515a9d7..04437e2 100644 --- a/spug_api/libs/spug.py +++ b/spug_api/libs/spug.py @@ -18,8 +18,8 @@ def _parse_args(grp): return spug_key, sum([json.loads(x.contacts) for x in Group.objects.filter(id__in=grp)], []) -def notify_by_wx(event, subject, n_grp): - spug_key, u_ids = _parse_args(n_grp) +def notify_by_wx(event, obj): + spug_key, u_ids = _parse_args(obj.grp) if not spug_key: Notify.make_notify(notify_source, '1', '发送报警信息失败', '未配置报警服务调用凭据,请在系统管理/系统设置/报警服务设置中配置。') return @@ -28,7 +28,9 @@ def notify_by_wx(event, subject, n_grp): data = { 'token': spug_key, 'event': event, - 'subject': subject, + 'subject': obj.name, + 'desc': obj.out, + 'remark': f'故障持续{obj.duration}' if event == '2' else None, 'users': list(users) } requests.post(f'{spug_server}/apis/notify/wx/', json=data) @@ -36,21 +38,25 @@ def notify_by_wx(event, subject, n_grp): Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的微信Token。') -def notify_by_email(event, subject, grp): - spug_key, u_ids = _parse_args(grp) +def notify_by_email(event, obj): + spug_key, u_ids = _parse_args(obj.grp) users = set(x.email for x in Contact.objects.filter(id__in=u_ids, email__isnull=False)) if users: mail_service = json.loads(AppSetting.get_default('mail_service', '{}')) + body = ['告警名称:' + obj.name, '告警时间:' + human_datetime(), '告警描述:' + obj.out] + if event == '2': + body.append('故障持续:' + obj.duration) if mail_service.get('server'): - event_map = {'1': '告警', '2': '恢复'} - subject = f'{event_map[event]}-{subject}' + event_map = {'1': '告警发生', '2': '告警恢复'} + subject = f'{event_map[event]}-{obj.name}' mail = Mail(**mail_service) - mail.send_text_mail(users, subject, f'{subject}\r\n\r\n自动发送,请勿回复。') + mail.send_text_mail(users, subject, '\r\n'.join(body) + '\r\n\r\n自动发送,请勿回复。') elif spug_key: data = { 'token': spug_key, 'event': event, - 'subject': subject, + 'subject': obj.name, + 'body': '\r\n'.join(body), 'users': list(users) } requests.post(f'{spug_server}/apis/notify/mail/', json=data) @@ -60,22 +66,23 @@ def notify_by_email(event, subject, grp): Notify.make_notify(notify_source, '1', '发送报警信息失败', '未找到可用的通知对象,请确保设置了相关报警联系人的邮件地址。') -def notify_by_dd(event, subject, grp): - _, u_ids = _parse_args(grp) +def notify_by_dd(event, obj): + _, u_ids = _parse_args(obj.grp) users = set(x.ding for x in Contact.objects.filter(id__in=u_ids, ding__isnull=False)) if users: texts = [ - '## %s ## ' % '监控告警通知' if event == '1' else '告警恢复通知', - f'**告警名称:** {subject} ', + '## %s ## ' % ('监控告警通知' if event == '1' else '告警恢复通知'), + f'**告警名称:** {obj.name} ', f'**告警时间:** {human_datetime()} ', - '**告警描述:** %s ' % '请在运维平台监控中心查看详情' if event == '1' else '告警已恢复', - '> ###### 来自 Spug运维平台' + f'**告警描述:** {obj.out} ', ] + if event == '2': + texts.append(f'**持续时间:** {obj.duration} ') data = { 'msgtype': 'markdown', 'markdown': { 'title': '监控告警通知', - 'text': '\n\n'.join(texts) + 'text': '\n\n'.join(texts) + '\n\n> ###### 来自 Spug运维平台' } } for url in users: