mirror of https://github.com/jumpserver/jumpserver
feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive (#6564)
* feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive * feat: 服务性能告警指标包含:Core服务和各组件状态;指标包括:cpu/disk/memory/is_alive 2 Co-authored-by: Bai <bugatti_it@163.com>pull/6569/head
parent
67f6b1080e
commit
66b0173e20
|
@ -249,6 +249,18 @@ def get_disk_usage():
|
|||
return usages
|
||||
|
||||
|
||||
def get_cpu_load():
    """Return the 1-minute load average per CPU, rounded to two decimals.

    The system-wide 1-minute load average reported by ``psutil.getloadavg()``
    is divided by the CPU count so the value is comparable across hosts with
    different core counts.

    Returns:
        float: per-CPU 1-minute load, rounded to 2 decimal places.
    """
    cpu_load_1, _cpu_load_5, _cpu_load_15 = psutil.getloadavg()
    # psutil.cpu_count() returns None when the count cannot be determined;
    # fall back to 1 so the division cannot raise TypeError.
    cpu_count = psutil.cpu_count() or 1
    return round(cpu_load_1 / cpu_count, 2)
|
||||
|
||||
|
||||
def get_memory_used():
    """Return the ``percent`` field of ``psutil.virtual_memory()``."""
    memory = psutil.virtual_memory()
    return memory.percent
|
||||
|
||||
|
||||
class Time:
|
||||
def __init__(self):
|
||||
self._timestamps = []
|
||||
|
|
|
@ -92,8 +92,9 @@ class Message(metaclass=MessageType):
|
|||
|
||||
def get_email_msg(self) -> dict:
    """Build the e-mail payload from the common message.

    The subject is the first 20 characters of the message followed by
    `` ...`` when the message is longer than that; otherwise it is the
    message itself. The full message is always sent as the body.

    Returns:
        dict: ``{'subject': <str>, 'message': <str>}``.
    """
    msg = self.get_common_msg()
    # Only add the ellipsis when characters were actually cut off; a message
    # of exactly 20 characters is shown in full.
    subject = f'{msg[:20]} ...' if len(msg) > 20 else msg
    return {
        'subject': subject,
        'message': msg,
    }
|
||||
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
from django.utils.translation import gettext_lazy as _
|
||||
from django.conf import settings
|
||||
|
||||
from notifications.notifications import SystemMessage
|
||||
from notifications.models import SystemMsgSubscription
|
||||
from users.models import User
|
||||
from notifications.backends import BACKEND
|
||||
from common.utils import get_disk_usage, get_cpu_load, get_memory_used
|
||||
from terminal.models import Status, Terminal
|
||||
|
||||
__all__ = ('ServerPerformanceMessage',)
|
||||
__all__ = ('ServerPerformanceMessage', 'ServerPerformanceCheckUtil')
|
||||
|
||||
|
||||
class ServerPerformanceMessage(SystemMessage):
|
||||
|
@ -13,13 +16,11 @@ class ServerPerformanceMessage(SystemMessage):
|
|||
category_label = _('Operations')
|
||||
message_type_label = _('Server performance')
|
||||
|
||||
def __init__(self, path, usage):
|
||||
self.path = path
|
||||
self.usage = usage
|
||||
def __init__(self, msg):
|
||||
self._msg = msg
|
||||
|
||||
def get_common_msg(self):
|
||||
msg = _("Disk used more than 80%: {} => {}").format(self.path, self.usage.percent)
|
||||
return msg
|
||||
return self._msg
|
||||
|
||||
@classmethod
|
||||
def post_insert_to_db(cls, subscription: SystemMsgSubscription):
|
||||
|
@ -27,3 +28,117 @@ class ServerPerformanceMessage(SystemMessage):
|
|||
subscription.users.add(*admins)
|
||||
subscription.receive_backends = [BACKEND.EMAIL]
|
||||
subscription.save()
|
||||
|
||||
|
||||
class ServerPerformanceCheckUtil(object):
    """Collect performance alarms for the Core host and accepted terminals.

    ``check`` appends one alarm string per exceeded threshold to
    ``alarm_messages``; ``publish`` joins them with ``<br>`` and sends them
    as a single ``ServerPerformanceMessage``.

    Args:
        disk_usage_threshold: disk usage percentage above which to alarm.
        cpu_load_threshold: per-CPU load above which to alarm.
        memory_usage_threshold: memory usage percentage above which to alarm.
    """

    def __init__(self, disk_usage_threshold=80, cpu_load_threshold=5,
                 memory_usage_threshold=85):
        self.alarm_messages = []
        self.disk_usage_threshold = disk_usage_threshold
        self.cpu_load_threshold = cpu_load_threshold
        self.memory_usage_threshold = memory_usage_threshold
        # Terminal currently being checked; None means "check the local host".
        self._terminal = None

    def check_and_publish(self):
        """Run every check, then publish the collected alarms (if any)."""
        self.check()
        self.publish()

    def publish(self):
        """Send all collected alarms as one message; do nothing if empty."""
        if not self.alarm_messages:
            return
        msg = '<br>'.join(self.alarm_messages)
        ServerPerformanceMessage(msg).publish()

    def check(self):
        """Check the local host (when enabled) and then every terminal."""
        local_items = ['disk_usage', 'cpu_load', 'memory_usage']

        # Check local. Reset the terminal first so a reused instance does not
        # evaluate the "local" checks against the last terminal it visited.
        self._terminal = None
        if settings.DISK_CHECK_ENABLED:
            self.check_items(local_items)

        # Check terminal
        terminal_items = local_items + ['is_alive']
        try:
            for terminal in self.get_terminals():
                self._terminal = terminal
                self.check_items(terminal_items)
        finally:
            self._terminal = None

    @staticmethod
    def get_terminals():
        """Return accepted, non-deleted, active terminals with ``status`` set.

        Each returned terminal has its ``status`` attribute assigned from
        ``Status.get_terminal_latest_stat(terminal)``.
        """
        terminals = []
        for terminal in Terminal.objects.filter(is_accepted=True, is_deleted=False):
            if not terminal.is_active:
                continue
            terminal.status = Status.get_terminal_latest_stat(terminal)
            terminals.append(terminal)
        return terminals

    def check_items(self, items):
        """Run ``check_<item>`` for each item and collect returned alarms.

        Items without a matching ``check_<item>`` method are skipped.
        """
        for item in items:
            checker = getattr(self, f'check_{item}', None)
            if checker is None:
                continue
            # A checker returning None is treated as "no alarms".
            self.alarm_messages.extend(checker() or [])

    def check_is_alive(self):
        """Return an alarm if the current terminal is not alive."""
        messages = []
        terminal = self._terminal
        # Liveness only applies to terminals; with no terminal set there is
        # nothing to report.
        if terminal is not None and not terminal.is_alive:
            msg = _('The terminal is offline: {}').format(terminal.name)
            messages.append(msg)
        return messages

    def check_disk_usage(self):
        """Return one alarm per disk whose usage exceeds the threshold."""
        messages = []
        if self._terminal:
            name = self._terminal.name
            disk_used = getattr(self._terminal.status, 'disk_used', None)
            # A terminal reports a single figure, shown under '/'.
            disks_used = [['/', disk_used]] if disk_used else []
        else:
            name = 'Core'
            disks_used = self._get_local_disk_usage()

        for disk, used in disks_used:
            if used <= self.disk_usage_threshold:
                continue
            msg = _("Disk used more than {}%: {} => {} ({})").format(
                self.disk_usage_threshold, disk, used, name
            )
            messages.append(msg)
        return messages

    @staticmethod
    def _is_unchecked_path(path):
        """Return True if ``path`` is, or lies under, ``/etc`` or ``/boot``."""
        uncheck_paths = ('/etc', '/boot')
        return any(
            path == prefix or path.startswith(prefix + '/')
            for prefix in uncheck_paths
        )

    @staticmethod
    def _get_local_disk_usage():
        """Return ``[path, percent]`` pairs for local disks to be checked."""
        return [
            [path, usage.percent]
            for path, usage in get_disk_usage().items()
            if not ServerPerformanceCheckUtil._is_unchecked_path(path)
        ]

    def check_cpu_load(self):
        """Return an alarm if the CPU load exceeds the threshold."""
        messages = []
        if self._terminal:
            name = self._terminal.name
            # Missing status or a None value counts as "no load reported".
            cpu_load = getattr(self._terminal.status, 'cpu_load', 0) or 0
        else:
            name = 'Core'
            cpu_load = get_cpu_load()

        if cpu_load > self.cpu_load_threshold:
            msg = _('CPU load more than {}: => {} ({})').format(
                self.cpu_load_threshold, cpu_load, name
            )
            messages.append(msg)
        return messages

    def check_memory_usage(self):
        """Return an alarm if the memory usage exceeds the threshold."""
        messages = []
        if self._terminal:
            name = self._terminal.name
            # NOTE(review): the disk check reads `disk_used` while this reads
            # `memory_usage`; confirm the attribute name on the status object,
            # otherwise the default 0 is always used and no alarm can fire.
            memory_usage = getattr(self._terminal.status, 'memory_usage', 0) or 0
        else:
            name = 'Core'
            memory_usage = get_memory_used()

        if memory_usage > self.memory_usage_threshold:
            msg = _('Memory used more than {}%: => {} ({})').format(
                self.memory_usage_threshold, memory_usage, name
            )
            messages.append(msg)
        return messages
|
||||
|
|
|
@ -9,7 +9,7 @@ from celery.exceptions import SoftTimeLimitExceeded
|
|||
from django.utils import timezone
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.utils import get_logger, get_object_or_none, get_disk_usage, get_log_keep_day
|
||||
from common.utils import get_logger, get_object_or_none, get_log_keep_day
|
||||
from orgs.utils import tmp_to_root_org, tmp_to_org
|
||||
from .celery.decorator import (
|
||||
register_as_period_task, after_app_shutdown_clean_periodic,
|
||||
|
@ -20,7 +20,7 @@ from .celery.utils import (
|
|||
disable_celery_periodic_task, delete_celery_periodic_task
|
||||
)
|
||||
from .models import Task, CommandExecution, CeleryTask
|
||||
from .notifications import ServerPerformanceMessage
|
||||
from .notifications import ServerPerformanceCheckUtil
|
||||
|
||||
logger = get_logger(__file__)
|
||||
|
||||
|
@ -132,18 +132,7 @@ def create_or_update_registered_periodic_tasks():
|
|||
@shared_task
@register_as_period_task(interval=3600)
def check_server_performance_period():
    """Periodic task: run the server performance checks and publish alarms.

    All checking logic lives in ``ServerPerformanceCheckUtil``; this task
    only triggers it.
    """
    ServerPerformanceCheckUtil().check_and_publish()
|
||||
|
||||
|
||||
@shared_task(queue="ansible")
|
||||
|
|
|
@ -184,6 +184,8 @@ class Terminal(StorageMixin, TerminalStatusMixin, models.Model):
|
|||
status = "Deleted"
|
||||
elif not self.is_active:
|
||||
status = "Disable"
|
||||
elif not self.is_alive:
|
||||
status = 'Offline'
|
||||
return '%s: %s' % (self.name, status)
|
||||
|
||||
class Meta:
|
||||
|
|
Loading…
Reference in New Issue