spug/spug_api/apps/monitor/executors.py

127 lines
4.1 KiB
Python

# Copyright: (c) OpenSpug Organization. https://github.com/openspug/spug
# Copyright: (c) <spug.dev@gmail.com>
# Released under the AGPL-3.0 License.
import json
import logging
import platform
import re
import shlex
import subprocess
import time
from socket import socket

import requests
from django_redis import get_redis_connection

from apps.host.models import Host
from apps.monitor.utils import handle_notify, handle_trigger_event
# Route messages issued through the ``warnings`` module into the logging system.
logging.captureWarnings(True)
# Captures the reason text that follows "Failed to establish a new connection: "
# in an exception message; site_check uses it to shorten connection errors.
regex = re.compile(r'Failed to establish a new connection: (.*)\'\)+')
def site_check(url, limit):
    """Probe a URL with an HTTP GET and report whether it looks healthy.

    The request uses a 30 second timeout. When ``limit`` is truthy it is read
    as a response-time budget in milliseconds, and a slower response is
    reported as a failure. Otherwise the check passes for status codes in
    the range 200-399.

    Returns:
        An ``(is_ok, message)`` tuple. Any exception raised along the way is
        turned into ``(False, text)``; when the text matches the module-level
        ``regex`` it is narrowed to the first captured fragment.
    """
    try:
        response = requests.get(url, timeout=30)
        if limit:
            elapsed_ms = int(response.elapsed.total_seconds() * 1000)
            if elapsed_ms > int(limit):
                return False, f'响应时间 {elapsed_ms}ms 大于 {limit}ms'
        code = response.status_code
        return 200 <= code < 400, f'返回HTTP状态码 {code}'
    except Exception as exc:
        detail = str(exc)
        reasons = regex.findall(detail)
        # Prefer the short connection-failure reason when one was captured.
        return False, (reasons[0] if reasons else detail)
def port_check(addr, port):
    """Check whether a TCP port accepts connections.

    Args:
        addr: Host name or IP address to connect to.
        port: Port number; any value accepted by ``int()``.

    Returns:
        ``(True, message)`` when a connection is established within the
        5 second timeout, otherwise ``(False, message)`` carrying the text
        of the exception that was raised.
    """
    try:
        # The context manager closes the socket on every path, including a
        # refused or timed-out connect and an invalid port value, so no
        # file descriptor is leaked when the check fails.
        with socket() as sock:
            sock.settimeout(5)
            sock.connect((addr, int(port)))
        return True, '端口状态检测正常'
    except Exception as e:
        return False, f'异常信息:{e}'
def ping_check(addr):
    """Send a single ICMP echo request to ``addr`` using the system ``ping``.

    Windows is called with ``-n 1 -w 3000`` and every other platform with
    ``-c 1 -W 3``.

    Args:
        addr: Host name or IP address to ping. Surrounding whitespace is
            ignored.

    Returns:
        ``(True, message)`` when ``ping`` exits with status 0,
        ``(False, message)`` when it exits non-zero, when the target is
        rejected, or when running the command raises.
    """
    try:
        target = str(addr).strip()
        # A ping target is exactly one token and must not look like an
        # option; anything else is rejected before a process is started.
        if len(target.split()) != 1 or target.startswith('-'):
            raise ValueError(f'invalid ping target {addr!r}')
        if platform.system().lower() == 'windows':
            command = ['ping', '-n', '1', '-w', '3000', target]
        else:
            command = ['ping', '-c', '1', '-W', '3', target]
        # An argument list with the default shell=False hands the target to
        # ping verbatim, so shell metacharacters in it are never interpreted.
        task = subprocess.run(command, stdout=subprocess.PIPE)
        if task.returncode == 0:
            return True, 'Ping检测正常'
        return False, 'Ping检测失败'
    except Exception as e:
        return False, f'异常信息:{e}'
def host_executor(host, command):
    """Run ``command`` on ``host`` over SSH and turn the outcome into a check result.

    Args:
        host: Object whose ``get_ssh()`` returns a context manager yielding
            a client with ``exec_command_raw(command)``.
        command: Command line to execute.

    Returns:
        ``(True, output)`` when the exit code is 0 and ``(False, output)``
        otherwise; an empty output is replaced by a default message. Any
        exception is reported as ``(False, text)``.
    """
    try:
        with host.get_ssh() as ssh:
            exit_code, out = ssh.exec_command_raw(command)
            if exit_code != 0:
                return False, out or f'退出状态码:{exit_code}'
            return True, out or '检测状态正常'
    except Exception as error:
        return False, f'异常信息:{error}'
def monitor_worker_handler(job):
    """Run one queued monitor check and send alarm or recovery notifications.

    Args:
        job: JSON array ``[task_id, tp, addr, extra, threshold, quiet]``.
            ``tp`` selects the check: ``'1'`` site, ``'2'`` port, ``'5'``
            ping, ``'3'`` process lookup on a host, ``'4'`` custom command
            on a host. For ``'3'`` and ``'4'`` ``addr`` is the primary key
            of a ``Host``. ``threshold`` is the failure count at which an
            alarm is raised and ``quiet`` the minimum number of minutes
            between two alarms.

    Failure state is kept in the Redis hash ``spug:det:<task_id>`` under the
    fields ``c_<addr>`` (failure counter) and ``t_<addr>`` (time of the last
    alarm).
    """
    task_id, tp, addr, extra, threshold, quiet = json.loads(job)
    target = addr
    if tp == '1':
        is_ok, message = site_check(addr, extra)
    elif tp == '2':
        is_ok, message = port_check(addr, extra)
    elif tp == '5':
        is_ok, message = ping_check(addr)
    elif tp not in ('3', '4'):
        is_ok, message = False, f'invalid monitor type for {tp!r}'
    else:
        if tp == '3':
            # shlex.quote yields a correctly shell-quoted word; Python's
            # repr() does not and breaks on keywords containing quotes or
            # backslashes.
            command = f'ps -ef|grep -v grep|grep {shlex.quote(str(extra))}'
        else:
            command = extra
        host = Host.objects.filter(pk=addr).first()
        if not host:
            is_ok, message = False, f'unknown host id for {addr!r}'
        else:
            is_ok, message = host_executor(host, command)
            target = f'{host.name}({host.hostname})'

    rds = get_redis_connection()
    key, f_count, f_time = f'spug:det:{task_id}', f'c_{addr}', f't_{addr}'
    v_count, v_time = rds.hmget(key, f_count, f_time)
    if is_ok:
        if v_count:
            # The check passed again, so the recorded failure state is cleared.
            rds.hdel(key, f_count, f_time)
        if v_time:
            # An alarm had been sent earlier; tell the recipients it recovered.
            logging.warning('send recovery notification')
            handle_notify(task_id, target, is_ok, message, int(v_count) + 1)
        return

    v_count = rds.hincrby(key, f_count)
    if v_count >= threshold:
        # Alarm only if none was sent yet or the quiet period has elapsed.
        if not v_time or int(time.time()) - int(v_time) >= quiet * 60:
            rds.hset(key, f_time, int(time.time()))
            logging.warning('send fault alarm notification')
            # The host id is passed on only for host-based checks.
            handle_trigger_event(task_id, addr if tp in ('3', '4') else None)
            handle_notify(task_id, target, is_ok, message, v_count)
def dispatch(tp, addr, extra):
    """Run a single check immediately and return its result.

    Args:
        tp: Check type: ``'1'`` site, ``'2'`` port, ``'5'`` ping, ``'3'``
            process lookup on a host, ``'4'`` custom command on a host.
        addr: URL or address to probe; for ``'3'`` and ``'4'`` the primary
            key of a ``Host``.
        extra: Type-specific argument: response-time limit, port number,
            process keyword or command line.

    Returns:
        The ``(is_ok, message)`` tuple produced by the selected check.

    Raises:
        TypeError: If ``tp`` is not one of the known type codes.
    """
    if tp == '1':
        return site_check(addr, extra)
    if tp == '2':
        return port_check(addr, extra)
    if tp == '5':
        return ping_check(addr)

    if tp == '3':
        # shlex.quote yields a correctly shell-quoted word; Python's repr()
        # does not and breaks on keywords containing quotes or backslashes.
        command = f'ps -ef|grep -v grep|grep {shlex.quote(str(extra))}'
    elif tp == '4':
        command = extra
    else:
        raise TypeError(f'invalid monitor type: {tp!r}')

    host = Host.objects.filter(pk=addr).first()
    if not host:
        # Report a missing host explicitly rather than passing None on.
        return False, f'unknown host id for {addr!r}'
    return host_executor(host, command)