feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive (#6564)

* feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive

* feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive 2

Co-authored-by: Bai <bugatti_it@163.com>
pull/6569/head
fit2bot 2021-07-30 15:42:06 +08:00 committed by GitHub
parent 67f6b1080e
commit 66b0173e20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 140 additions and 21 deletions

View File

@ -249,6 +249,18 @@ def get_disk_usage():
return usages
def get_cpu_load():
    """Return the 1-minute load average per CPU, rounded to two decimals.

    The 1-minute load average reported by ``psutil.getloadavg()`` is divided
    by ``psutil.cpu_count()`` so the result is comparable across hosts with
    different CPU counts.

    Returns:
        float: per-CPU 1-minute load average with two decimal places.
    """
    # Only the 1-minute figure is used; the 5- and 15-minute values are ignored.
    cpu_load_1 = psutil.getloadavg()[0]
    # psutil.cpu_count() returns None when the count cannot be determined;
    # fall back to 1 so the division below cannot raise.
    cpu_count = psutil.cpu_count() or 1
    return round(cpu_load_1 / cpu_count, 2)
def get_memory_used():
    """Return the ``percent`` field of ``psutil.virtual_memory()``."""
    memory = psutil.virtual_memory()
    return memory.percent
class Time:
def __init__(self):
self._timestamps = []

View File

@ -92,8 +92,9 @@ class Message(metaclass=MessageType):
def get_email_msg(self) -> dict:
    """Build the e-mail payload for this message.

    The body is the full common message. The subject is the same text,
    shortened to its first 20 characters followed by ``' ...'`` when the
    message is longer than 20 characters.

    Returns:
        dict: ``{'subject': <str>, 'message': <str>}``.
    """
    msg = self.get_common_msg()
    # Only mark the subject as truncated when characters were actually cut.
    subject = f'{msg[:20]} ...' if len(msg) > 20 else msg
    return {
        'subject': subject,
        'message': msg,
    }

View File

@ -1,11 +1,14 @@
from django.utils.translation import gettext_lazy as _
from django.conf import settings
from notifications.notifications import SystemMessage
from notifications.models import SystemMsgSubscription
from users.models import User
from notifications.backends import BACKEND
from common.utils import get_disk_usage, get_cpu_load, get_memory_used
from terminal.models import Status, Terminal
__all__ = ('ServerPerformanceMessage',)
__all__ = ('ServerPerformanceMessage', 'ServerPerformanceCheckUtil')
class ServerPerformanceMessage(SystemMessage):
@ -13,13 +16,11 @@ class ServerPerformanceMessage(SystemMessage):
category_label = _('Operations')
message_type_label = _('Server performance')
def __init__(self, msg):
    """Create a message carrying an already-rendered alarm text.

    Args:
        msg: the message text to publish; kept on ``self._msg``.
    """
    self._msg = msg
def get_common_msg(self):
    """Return the alarm text stored on ``self._msg``."""
    return self._msg
@classmethod
def post_insert_to_db(cls, subscription: SystemMsgSubscription):
@ -27,3 +28,117 @@ class ServerPerformanceMessage(SystemMessage):
subscription.users.add(*admins)
subscription.receive_backends = [BACKEND.EMAIL]
subscription.save()
class ServerPerformanceCheckUtil(object):
    """Collect performance alarms for the local host and the terminals.

    ``check()`` gathers alarm texts into ``alarm_messages``; ``publish()``
    joins them with ``<br>`` and sends them as one ``ServerPerformanceMessage``.
    The local host is reported under the name ``'Core'``; terminals are
    reported under their own ``name``.
    """

    def __init__(self):
        self.alarm_messages = []
        # Alarm thresholds. The values 20 / 1 / 20 previously assigned here
        # looked like test settings; the trailing comments next to them named
        # 80 / 5 / 85 as the intended values, which are used now.
        self.disk_usage_threshold = 80
        self.cpu_load_threshold = 5
        self.memory_usage_threshold = 85
        # Terminal currently being checked; None means the local host.
        self._terminal = None

    def check_and_publish(self):
        """Run all checks, then publish the collected alarms (if any)."""
        self.check()
        self.publish()

    def publish(self):
        """Send the collected alarms as one message; do nothing when empty."""
        if not self.alarm_messages:
            return
        msg = '<br>'.join(self.alarm_messages)
        ServerPerformanceMessage(msg).publish()

    def check(self):
        """Check the local host (when enabled) and every terminal."""
        local_items = ['disk_usage', 'cpu_load', 'memory_usage']

        # Check local. Clear any terminal left over from an earlier run so
        # these checks really target the local host.
        self._terminal = None
        if settings.DISK_CHECK_ENABLED:
            self.check_items(local_items)

        # Check terminal
        terminal_items = local_items + ['is_alive']
        for terminal in self.get_terminals():
            self._terminal = terminal
            self.check_items(terminal_items)
        # Do not leave the last terminal selected once the loop is done.
        self._terminal = None

    @staticmethod
    def get_terminals():
        """Return accepted, non-deleted, active terminals with ``status`` set.

        Each returned terminal has its ``status`` attribute replaced by the
        result of ``Status.get_terminal_latest_stat(terminal)``.
        """
        terminals = []
        for terminal in Terminal.objects.filter(is_accepted=True, is_deleted=False):
            if not terminal.is_active:
                continue
            terminal.status = Status.get_terminal_latest_stat(terminal)
            terminals.append(terminal)
        return terminals

    def check_items(self, items):
        """Run ``check_<item>`` for every item and collect its messages.

        Items without a matching ``check_<item>`` method are ignored.
        """
        for item in items:
            checker = getattr(self, f'check_{item}', None)
            if checker is None:
                continue
            # A checker returning None is treated as "no alarms".
            self.alarm_messages.extend(checker() or [])

    def check_is_alive(self):
        """Return an offline alarm when the current terminal is not alive."""
        messages = []
        terminal = self._terminal
        # Only terminals have an is_alive state; with no terminal selected
        # there is nothing to report.
        if terminal and not terminal.is_alive:
            msg = _('The terminal is offline: {}').format(terminal.name)
            messages.append(msg)
        return messages

    def check_disk_usage(self):
        """Return one alarm per disk whose usage exceeds the threshold."""
        if self._terminal:
            name = self._terminal.name
            disk_used = getattr(self._terminal.status, 'disk_used', None)
            # A terminal reports a single figure, shown under the path '/'.
            disks_used = [['/', disk_used]] if disk_used else []
        else:
            name = 'Core'
            disks_used = self._get_local_disk_usage()

        messages = []
        for disk, used in disks_used:
            if used > self.disk_usage_threshold:
                msg = _("Disk used more than {}%: {} => {} ({})").format(self.disk_usage_threshold, disk, used, name)
                messages.append(msg)
        return messages

    @staticmethod
    def _is_unchecked_path(path):
        """Return True when ``path`` starts with a prefix excluded from checks."""
        return path.startswith(('/etc', '/boot'))

    @staticmethod
    def _get_local_disk_usage():
        """Return ``[path, percent]`` pairs from ``get_disk_usage()``.

        Paths starting with ``/etc`` or ``/boot`` are left out.
        """
        return [
            [path, usage.percent]
            for path, usage in get_disk_usage().items()
            if not ServerPerformanceCheckUtil._is_unchecked_path(path)
        ]

    def check_cpu_load(self):
        """Return an alarm when the CPU load exceeds the threshold."""
        if self._terminal:
            name = self._terminal.name
            # "or 0" also covers a status whose cpu_load is present but None.
            cpu_load = getattr(self._terminal.status, 'cpu_load', 0) or 0
        else:
            name = 'Core'
            cpu_load = get_cpu_load()

        messages = []
        if cpu_load > self.cpu_load_threshold:
            msg = _('CPU load more than {}: => {} ({})').format(self.cpu_load_threshold, cpu_load, name)
            messages.append(msg)
        return messages

    def check_memory_usage(self):
        """Return an alarm when the memory usage exceeds the threshold."""
        if self._terminal:
            name = self._terminal.name
            # "or 0" also covers a status whose memory_usage is present but None.
            memory_usage = getattr(self._terminal.status, 'memory_usage', 0) or 0
        else:
            name = 'Core'
            memory_usage = get_memory_used()

        messages = []
        if memory_usage > self.memory_usage_threshold:
            msg = _('Memory used more than {}%: => {} ({})').format(self.memory_usage_threshold, memory_usage, name)
            messages.append(msg)
        return messages

View File

@ -9,7 +9,7 @@ from celery.exceptions import SoftTimeLimitExceeded
from django.utils import timezone
from django.utils.translation import ugettext_lazy as _
from common.utils import get_logger, get_object_or_none, get_disk_usage, get_log_keep_day
from common.utils import get_logger, get_object_or_none, get_log_keep_day
from orgs.utils import tmp_to_root_org, tmp_to_org
from .celery.decorator import (
register_as_period_task, after_app_shutdown_clean_periodic,
@ -20,7 +20,7 @@ from .celery.utils import (
disable_celery_periodic_task, delete_celery_periodic_task
)
from .models import Task, CommandExecution, CeleryTask
from .notifications import ServerPerformanceMessage
from .notifications import ServerPerformanceCheckUtil
logger = get_logger(__file__)
@ -132,18 +132,7 @@ def create_or_update_registered_periodic_tasks():
@shared_task
@register_as_period_task(interval=3600)
def check_server_performance_period():
    """Periodic task: run the server performance checks and publish alarms.

    All checking and publishing is delegated to
    ``ServerPerformanceCheckUtil.check_and_publish()``.
    """
    ServerPerformanceCheckUtil().check_and_publish()
@shared_task(queue="ansible")

View File

@ -184,6 +184,8 @@ class Terminal(StorageMixin, TerminalStatusMixin, models.Model):
status = "Deleted"
elif not self.is_active:
status = "Disable"
elif not self.is_alive:
status = 'Offline'
return '%s: %s' % (self.name, status)
class Meta: