From 66b0173e206a94036d7fd3ab1ac58a05b5850647 Mon Sep 17 00:00:00 2001
From: fit2bot <68588906+fit2bot@users.noreply.github.com>
Date: Fri, 30 Jul 2021 15:42:06 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=9C=8D=E5=8A=A1=E6=80=A7=E8=83=BD?=
 =?UTF-8?q?=E5=91=8A=E8=AD=A6=E6=8C=87=E6=A0=87=E5=8C=85=E5=90=AB=EF=BC=9A?=
 =?UTF-8?q?Core=E6=9C=8D=E5=8A=A1=E5=92=8C=E5=90=84=E7=BB=84=E4=BB=B6?=
 =?UTF-8?q?=E7=8A=B6=E6=80=81=EF=BC=9B=E6=8C=87=E6=A0=87=E5=8C=85=E6=8B=AC?=
 =?UTF-8?q?=EF=BC=9Acpu/disk/memory/is=5Falive=20(#6564)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive

* feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive 2

Co-authored-by: Bai
---
 apps/common/utils/common.py         |  12 +++
 apps/notifications/notifications.py |   3 +-
 apps/ops/notifications.py           | 127 ++++++++++++++++++++++++++--
 apps/ops/tasks.py                   |  17 +---
 apps/terminal/models/terminal.py    |   2 +
 5 files changed, 140 insertions(+), 21 deletions(-)

diff --git a/apps/common/utils/common.py b/apps/common/utils/common.py
index 1a9a79c05..ac2bc0c90 100644
--- a/apps/common/utils/common.py
+++ b/apps/common/utils/common.py
@@ -249,6 +249,18 @@ def get_disk_usage():
     return usages
 
 
+def get_cpu_load():
+    cpu_load_1, cpu_load_5, cpu_load_15 = psutil.getloadavg()
+    cpu_count = psutil.cpu_count() or 1  # cpu_count() may return None; avoid dividing by None
+    single_cpu_load_1 = cpu_load_1 / cpu_count
+    single_cpu_load_1 = '%.2f' % single_cpu_load_1
+    return float(single_cpu_load_1)
+
+
+def get_memory_used():
+    return psutil.virtual_memory().percent
+
+
 class Time:
     def __init__(self):
         self._timestamps = []
diff --git a/apps/notifications/notifications.py b/apps/notifications/notifications.py
index bbf9fe7ee..82086c618 100644
--- a/apps/notifications/notifications.py
+++ b/apps/notifications/notifications.py
@@ -92,8 +92,9 @@ class Message(metaclass=MessageType):
 
     def get_email_msg(self) -> dict:
         msg = self.get_common_msg()
+        subject = f'{msg[:20]} ...' if len(msg) > 20 else msg  # add the ellipsis only when truncating
         return {
-            'subject': msg,
+            'subject': subject,
             'message': msg
         }
 
diff --git a/apps/ops/notifications.py b/apps/ops/notifications.py
index 4a65d8a4e..5ca460c8f 100644
--- a/apps/ops/notifications.py
+++ b/apps/ops/notifications.py
@@ -1,11 +1,14 @@
 from django.utils.translation import gettext_lazy as _
+from django.conf import settings
 
 from notifications.notifications import SystemMessage
 from notifications.models import SystemMsgSubscription
 from users.models import User
 from notifications.backends import BACKEND
+from common.utils import get_disk_usage, get_cpu_load, get_memory_used
+from terminal.models import Status, Terminal
 
-__all__ = ('ServerPerformanceMessage',)
+__all__ = ('ServerPerformanceMessage', 'ServerPerformanceCheckUtil')
 
 
 class ServerPerformanceMessage(SystemMessage):
@@ -13,13 +16,11 @@ class ServerPerformanceMessage(SystemMessage):
     category_label = _('Operations')
     message_type_label = _('Server performance')
 
-    def __init__(self, path, usage):
-        self.path = path
-        self.usage = usage
+    def __init__(self, msg):
+        self._msg = msg
 
     def get_common_msg(self):
-        msg = _("Disk used more than 80%: {} => {}").format(self.path, self.usage.percent)
-        return msg
+        return self._msg
 
     @classmethod
     def post_insert_to_db(cls, subscription: SystemMsgSubscription):
@@ -27,3 +28,117 @@ class ServerPerformanceMessage(SystemMessage):
         subscription.users.add(*admins)
         subscription.receive_backends = [BACKEND.EMAIL]
         subscription.save()
+
+
+class ServerPerformanceCheckUtil(object):
+
+    def __init__(self):
+        self.alarm_messages = []
+        self.disk_usage_threshold = 80  # percent of disk space used
+        self.cpu_load_threshold = 5  # 1-minute load average per CPU
+        self.memory_usage_threshold = 85  # percent of memory used
+        # terminal currently being checked; None means the local Core host
+        self._terminal = None
+
+    def check_and_publish(self):
+        self.check()
+        self.publish()
+
+    def publish(self):
+        if not self.alarm_messages:
+            return
+        msg = '\n'.join(self.alarm_messages)
+        ServerPerformanceMessage(msg).publish()
+
+    def check(self):
+        check_items = ['disk_usage', 'cpu_load', 'memory_usage']
+
+        # Check local
+        if settings.DISK_CHECK_ENABLED:
+            self.check_items(check_items)
+
+        # Check terminal
+        check_items += ['is_alive']
+        terminals = self.get_terminals()
+        for terminal in terminals:
+            self._terminal = terminal
+            self.check_items(check_items)
+
+    @staticmethod
+    def get_terminals():
+        terminals = []
+        for terminal in Terminal.objects.filter(is_accepted=True, is_deleted=False):
+            if not terminal.is_active:
+                continue
+            terminal.status = Status.get_terminal_latest_stat(terminal)
+            terminals.append(terminal)
+        return terminals
+
+    def check_items(self, items):
+        for item in items:
+            messages = getattr(self, f'check_{item}', lambda: [])()  # unknown item -> no messages
+            self.alarm_messages.extend(messages)
+
+    def check_is_alive(self):
+        message = []
+        if self._terminal and not self._terminal.is_alive:  # liveness only applies to terminals
+            name = self._terminal.name
+            msg = _('The terminal is offline: {}').format(name)
+            message.append(msg)
+        return message
+
+    def check_disk_usage(self):
+        messages = []
+        if self._terminal:
+            name = self._terminal.name
+            disk_used = getattr(self._terminal.status, 'disk_used', None)
+            disks_used = [['/', disk_used]] if disk_used else []
+        else:
+            name = 'Core'
+            disks_used = self._get_local_disk_usage()
+
+        for disk, used in disks_used:
+            if used <= self.disk_usage_threshold:
+                continue
+            msg = _("Disk used more than {}%: {} => {} ({})").format(self.disk_usage_threshold, disk, used, name)
+            messages.append(msg)
+        return messages
+
+    @staticmethod
+    def _get_local_disk_usage():
+        disks_usage = []
+        usages = get_disk_usage()
+        uncheck_paths = ('/etc', '/boot')  # tuple so it can be passed to str.startswith
+        for path, usage in usages.items():
+            if path.startswith(uncheck_paths):  # skip mount points under excluded prefixes
+                continue
+            disks_usage.append([path, usage.percent])
+        return disks_usage
+
+    def check_cpu_load(self):
+        messages = []
+        if self._terminal:
+            name = self._terminal.name
+            cpu_load = getattr(self._terminal.status, 'cpu_load', 0)
+        else:
+            name = 'Core'
+            cpu_load = get_cpu_load()
+
+        if cpu_load > self.cpu_load_threshold:
+            msg = _('CPU load more than {}: => {} ({})').format(self.cpu_load_threshold, cpu_load, name)
+            messages.append(msg)
+        return messages
+
+    def check_memory_usage(self):
+        messages = []
+        if self._terminal:
+            name = self._terminal.name
+            memory_usage = getattr(self._terminal.status, 'memory_usage', 0)
+        else:
+            name = 'Core'
+            memory_usage = get_memory_used()
+
+        if memory_usage > self.memory_usage_threshold:
+            msg = _('Memory used more than {}%: => {} ({})').format(self.memory_usage_threshold, memory_usage, name)
+            messages.append(msg)
+        return messages
diff --git a/apps/ops/tasks.py b/apps/ops/tasks.py
index 60f639668..00a0027cd 100644
--- a/apps/ops/tasks.py
+++ b/apps/ops/tasks.py
@@ -9,7 +9,7 @@ from celery.exceptions import SoftTimeLimitExceeded
 from django.utils import timezone
 from django.utils.translation import ugettext_lazy as _
 
-from common.utils import get_logger, get_object_or_none, get_disk_usage, get_log_keep_day
+from common.utils import get_logger, get_object_or_none, get_log_keep_day
 from orgs.utils import tmp_to_root_org, tmp_to_org
 from .celery.decorator import (
     register_as_period_task, after_app_shutdown_clean_periodic,
@@ -20,7 +20,7 @@ from .celery.utils import (
     disable_celery_periodic_task, delete_celery_periodic_task
 )
 from .models import Task, CommandExecution, CeleryTask
-from .notifications import ServerPerformanceMessage
+from .notifications import ServerPerformanceCheckUtil
 
 logger = get_logger(__file__)
 
@@ -132,18 +132,7 @@ def create_or_update_registered_periodic_tasks():
 @shared_task
 @register_as_period_task(interval=3600)
 def check_server_performance_period():
-    if not settings.DISK_CHECK_ENABLED:
-        return
-    usages = get_disk_usage()
-    uncheck_paths = ['/etc', '/boot']
-
-    for path, usage in usages.items():
-        need_check = True
-        for uncheck_path in uncheck_paths:
-            if path.startswith(uncheck_path):
-                need_check = False
-        if need_check and usage.percent > 80:
-            ServerPerformanceMessage(path=path, usage=usage).publish()
+    ServerPerformanceCheckUtil().check_and_publish()
 
 
 @shared_task(queue="ansible")
diff --git a/apps/terminal/models/terminal.py b/apps/terminal/models/terminal.py
index 77c9b1ce8..4a69c1112 100644
--- a/apps/terminal/models/terminal.py
+++ b/apps/terminal/models/terminal.py
@@ -184,6 +184,8 @@ class Terminal(StorageMixin, TerminalStatusMixin, models.Model):
             status = "Deleted"
         elif not self.is_active:
             status = "Disable"
+        elif not self.is_alive:
+            status = 'Offline'
         return '%s: %s' % (self.name, status)
 
     class Meta: