feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive (#6564)

* feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive

* feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive 2

Co-authored-by: Bai <bugatti_it@163.com>
pull/6569/head
fit2bot 2021-07-30 15:42:06 +08:00 committed by GitHub
parent 67f6b1080e
commit 66b0173e20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 140 additions and 21 deletions

View File

@ -249,6 +249,18 @@ def get_disk_usage():
return usages return usages
def get_cpu_load():
    """Return the 1-minute load average divided by the CPU count.

    The result is rounded to two decimal places and returned as a float.
    """
    # Only the 1-minute figure is used; the 5- and 15-minute values are ignored.
    cpu_load_1 = psutil.getloadavg()[0]
    # psutil.cpu_count() returns None when the count cannot be determined;
    # fall back to 1 so the division cannot raise TypeError.
    cpu_count = psutil.cpu_count() or 1
    return round(cpu_load_1 / cpu_count, 2)
def get_memory_used():
    """Return the `percent` field reported by psutil.virtual_memory()."""
    memory = psutil.virtual_memory()
    return memory.percent
class Time: class Time:
def __init__(self): def __init__(self):
self._timestamps = [] self._timestamps = []

View File

@ -92,8 +92,9 @@ class Message(metaclass=MessageType):
def get_email_msg(self) -> dict:
    """Build the e-mail payload from the common message.

    Returns:
        A dict with 'subject' and 'message'. The subject is the message
        itself when it is at most 20 characters long; otherwise it is the
        first 20 characters followed by ' ...'.
    """
    msg = self.get_common_msg()
    # Strictly greater than: a message of exactly 20 characters is not
    # truncated, so it must not get the ' ...' suffix.
    subject = f'{msg[:20]} ...' if len(msg) > 20 else msg
    return {
        'subject': subject,
        'message': msg
    }

View File

@ -1,11 +1,14 @@
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from django.conf import settings
from notifications.notifications import SystemMessage from notifications.notifications import SystemMessage
from notifications.models import SystemMsgSubscription from notifications.models import SystemMsgSubscription
from users.models import User from users.models import User
from notifications.backends import BACKEND from notifications.backends import BACKEND
from common.utils import get_disk_usage, get_cpu_load, get_memory_used
from terminal.models import Status, Terminal
__all__ = ('ServerPerformanceMessage',) __all__ = ('ServerPerformanceMessage', 'ServerPerformanceCheckUtil')
class ServerPerformanceMessage(SystemMessage): class ServerPerformanceMessage(SystemMessage):
@ -13,13 +16,11 @@ class ServerPerformanceMessage(SystemMessage):
category_label = _('Operations') category_label = _('Operations')
message_type_label = _('Server performance') message_type_label = _('Server performance')
def __init__(self, path, usage): def __init__(self, msg):
self.path = path self._msg = msg
self.usage = usage
def get_common_msg(self): def get_common_msg(self):
msg = _("Disk used more than 80%: {} => {}").format(self.path, self.usage.percent) return self._msg
return msg
@classmethod @classmethod
def post_insert_to_db(cls, subscription: SystemMsgSubscription): def post_insert_to_db(cls, subscription: SystemMsgSubscription):
@ -27,3 +28,117 @@ class ServerPerformanceMessage(SystemMessage):
subscription.users.add(*admins) subscription.users.add(*admins)
subscription.receive_backends = [BACKEND.EMAIL] subscription.receive_backends = [BACKEND.EMAIL]
subscription.save() subscription.save()
class ServerPerformanceCheckUtil(object):
def __init__(self):
self.alarm_messages = []
self.disk_usage_threshold = 20 # 80
self.cpu_load_threshold = 1 # 5
self.memory_usage_threshold = 20 # 85
# checking terminal
self._terminal = None
def check_and_publish(self):
self.check()
self.publish()
def publish(self):
if not self.alarm_messages:
return
msg = '<br>'.join(self.alarm_messages)
ServerPerformanceMessage(msg).publish()
def check(self):
check_items = ['disk_usage', 'cpu_load', 'memory_usage']
# Check local
if settings.DISK_CHECK_ENABLED:
self.check_items(check_items)
# Check terminal
check_items += ['is_alive']
terminals = self.get_terminals()
for terminal in terminals:
self._terminal = terminal
self.check_items(check_items)
@staticmethod
def get_terminals():
terminals = []
for terminal in Terminal.objects.filter(is_accepted=True, is_deleted=False):
if not terminal.is_active:
continue
terminal.status = Status.get_terminal_latest_stat(terminal)
terminals.append(terminal)
return terminals
def check_items(self, items):
for item in items:
messages = getattr(self, f'check_{item}', lambda: None)()
self.alarm_messages.extend(messages)
def check_is_alive(self):
message = []
if not self._terminal and not self._terminal.is_alive:
name = self._terminal.name
msg = _('The terminal is offline: {}').format(name)
message.append(msg)
return message
def check_disk_usage(self):
messages = []
if self._terminal:
name = self._terminal.name
disk_used = getattr(self._terminal.status, 'disk_used', None)
disks_used = [['/', disk_used]] if disk_used else []
else:
name = 'Core'
disks_used = self._get_local_disk_usage()
for disk, used in disks_used:
if used <= self.disk_usage_threshold:
continue
msg = _("Disk used more than {}%: {} => {} ({})").format(self.disk_usage_threshold, disk, used, name)
messages.append(msg)
return messages
@staticmethod
def _get_local_disk_usage():
disks_usage = []
usages = get_disk_usage()
uncheck_paths = ['/etc', '/boot']
for path, usage in usages.items():
if len(path) > 4 and path[:4] in uncheck_paths:
continue
disks_usage.append([path, usage.percent])
return disks_usage
def check_cpu_load(self):
messages = []
if self._terminal:
name = self._terminal.name
cpu_load = getattr(self._terminal.status, 'cpu_load', 0)
else:
name = 'Core'
cpu_load = get_cpu_load()
if cpu_load > self.cpu_load_threshold:
msg = _('CPU load more than {}: => {} ({})').format(self.cpu_load_threshold, cpu_load, name)
messages.append(msg)
return messages
def check_memory_usage(self):
messages = []
if self._terminal:
name = self._terminal.name
memory_usage = getattr(self._terminal.status, 'memory_usage', 0)
else:
name = 'Core'
memory_usage = get_memory_used()
if memory_usage > self.memory_usage_threshold:
msg = _('Memory used more than {}%: => {} ({})').format(self.memory_usage_threshold, memory_usage, name)
messages.append(msg)
return messages

View File

@ -9,7 +9,7 @@ from celery.exceptions import SoftTimeLimitExceeded
from django.utils import timezone from django.utils import timezone
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
from common.utils import get_logger, get_object_or_none, get_disk_usage, get_log_keep_day from common.utils import get_logger, get_object_or_none, get_log_keep_day
from orgs.utils import tmp_to_root_org, tmp_to_org from orgs.utils import tmp_to_root_org, tmp_to_org
from .celery.decorator import ( from .celery.decorator import (
register_as_period_task, after_app_shutdown_clean_periodic, register_as_period_task, after_app_shutdown_clean_periodic,
@ -20,7 +20,7 @@ from .celery.utils import (
disable_celery_periodic_task, delete_celery_periodic_task disable_celery_periodic_task, delete_celery_periodic_task
) )
from .models import Task, CommandExecution, CeleryTask from .models import Task, CommandExecution, CeleryTask
from .notifications import ServerPerformanceMessage from .notifications import ServerPerformanceCheckUtil
logger = get_logger(__file__) logger = get_logger(__file__)
@ -132,18 +132,7 @@ def create_or_update_registered_periodic_tasks():
@shared_task @shared_task
@register_as_period_task(interval=3600) @register_as_period_task(interval=3600)
def check_server_performance_period(): def check_server_performance_period():
if not settings.DISK_CHECK_ENABLED: ServerPerformanceCheckUtil().check_and_publish()
return
usages = get_disk_usage()
uncheck_paths = ['/etc', '/boot']
for path, usage in usages.items():
need_check = True
for uncheck_path in uncheck_paths:
if path.startswith(uncheck_path):
need_check = False
if need_check and usage.percent > 80:
ServerPerformanceMessage(path=path, usage=usage).publish()
@shared_task(queue="ansible") @shared_task(queue="ansible")

View File

@ -184,6 +184,8 @@ class Terminal(StorageMixin, TerminalStatusMixin, models.Model):
status = "Deleted" status = "Deleted"
elif not self.is_active: elif not self.is_active:
status = "Disable" status = "Disable"
elif not self.is_alive:
status = 'Offline'
return '%s: %s' % (self.name, status) return '%s: %s' % (self.name, status)
class Meta: class Meta: