2022-04-09 01:35:50 +00:00
|
|
|
|
from config import consul_token,consul_url
|
|
|
|
|
def _consul_authority(url):
    """Return the ``host[:port]`` part of a Consul address.

    Accepts both ``scheme://host:port/path`` and a bare ``host:port``; the
    scheme and anything after the first ``/`` of the authority are dropped.
    """
    _scheme, separator, remainder = url.partition("://")
    authority = remainder if separator else url
    # lstrip keeps protocol-relative input ("//host:port/") working.
    return authority.lstrip("/").split("/", 1)[0]


def ecs_config(services_list,ostype_list):
    """Render one Prometheus scrape job per OS type as YAML text.

    Args:
        services_list: Consul service names. The value is interpolated with
            its Python ``repr`` (e.g. ``['a', 'b']``) into the ``services:``
            field, so it is expected to be a list of plain strings.
        ostype_list: Iterable of OS types; each must be ``'linux'`` or
            ``'windows'``.

    Returns:
        ``{'code': 20000, 'configs': <str>}`` where ``configs`` is the
        concatenation of all rendered job entries (empty string when
        ``ostype_list`` is empty).

    Raises:
        KeyError: If an entry of ``ostype_list`` has no exporter mapping.
    """
    consul_server = _consul_authority(consul_url)
    job_dict = {'linux': 'node_exporter', 'windows': 'windows_exporter'}

    # NOTE(review): the YAML indentation below is laid out for pasting under
    # a top-level ``scrape_configs:`` key -- confirm against the deployed file.
    job_blocks = []
    for ostype in ostype_list:
        job_name = job_dict[ostype]
        job_blocks.append(f"""
  - job_name: {job_name}
    scrape_interval: 15s
    scrape_timeout: 5s
    consul_sd_configs:
      - server: '{consul_server}'
        token: '{consul_token}'
        refresh_interval: 30s
        services: {services_list}
        tags: ['{ostype}']
    relabel_configs:
      - source_labels: ['__meta_consul_service']
        target_label: cservice
      - source_labels: ['__meta_consul_service_metadata_vendor']
        target_label: vendor
      - source_labels: ['__meta_consul_service_metadata_region']
        target_label: region
      - source_labels: ['__meta_consul_service_metadata_group']
        target_label: group
      - source_labels: ['__meta_consul_service_metadata_account']
        target_label: account
      - source_labels: ['__meta_consul_service_metadata_name']
        target_label: name
      - source_labels: ['__meta_consul_service_metadata_iid']
        target_label: iid
      - source_labels: ['__meta_consul_service_metadata_exp']
        target_label: exp
      - source_labels: ['__meta_consul_service_metadata_instance']
        target_label: instance
      - source_labels: [instance]
        target_label: __address__
""")

    # Join once instead of growing a string inside the loop.
    return {'code': 20000, 'configs': ''.join(job_blocks)}
|
2022-06-04 09:36:18 +00:00
|
|
|
|
def get_rules():
    """Return the built-in Prometheus rule file for node_exporter targets.

    The rule text defines three groups: recording rules for CPU and memory
    usage, host alerts (memory, CPU, load, disk, disk I/O, network) and an
    ``up == 0`` exporter-health alert.

    The CPU alert selects ``mode="idle"`` so that ``100 - idle%`` is the
    actual CPU usage, matching the ``cpu:usage:rate1m`` recording rule;
    without the matcher the average spans every CPU mode and stays near a
    constant value.

    Alert descriptions deliberately contain a literal backslash-n sequence
    (written as a doubled backslash in the Python source) so that the YAML
    double-quoted scalar, not Python, turns it into a line break.

    Returns:
        ``{"code": 20000, "rules": <str>}`` with the YAML rule text.
    """
    # NOTE(review): the string is a plain (non-f) literal, so the ``{{ }}``
    # template markers and ``{...}`` label matchers are emitted verbatim.
    rules = """
groups:
- name: node_usage_record_rules
  interval: 1m
  rules:
  - record: cpu:usage:rate1m
    expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100
  - record: mem:usage:rate1m
    expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100

- name: node-exporter
  rules:
  - alert: 内存使用率
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}:内存使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"

  - alert: CPU使用率
    expr: 100 - (avg by(instance,name,group,account) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 92
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}:CPU使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"

  - alert: 系统负载
    expr: node_load5 / on (instance,name,group,account) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance,name,group,account)) by(instance,name,group,account) > 1.7
    for: 10m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}:系统负载{{ $value | humanize }}倍\\n> {{ $labels.group }}-{{ $labels.instance }}"

  - alert: 磁盘使用率
    expr: |
      100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"ext.?|xfs",mountpoint!~".*pods.*|/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"

  - alert: K8S节点POD磁盘使用率
    expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{mountpoint=~"/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"

  - alert: NFS磁盘使用率
    expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype="nfs"} * 100) > 90
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}"

  - alert: 磁盘读写容量
    expr: (irate(node_disk_read_bytes_total[5m]) ) /1024 /1024 > 80 or (irate(node_disk_written_bytes_total[5m]) ) /1024 /1024 > 80
    for: 8m
    labels:
      alertype: disk
      severity: warning
    annotations:
      description: "{{ $labels.name }}_{{ $labels.device }}:当前IO为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}"

  - alert: 网络流入(下载)数据过多
    expr: sum by(device,instance, name, group, account) (irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70
    for: 5m
    labels:
      alertype: network
      severity: warning
    annotations:
      description: "{{ $labels.name }}:流入数据为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}"

  - alert: 网络流出(上传)数据过多
    expr: sum by(device,instance, name, group, account) (irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70
    for: 5m
    labels:
      alertype: network
      severity: warning
    annotations:
      description: "{{ $labels.name }}:流出数据为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}"

- name: Itself
  rules:
  - alert: Exporter状态
    expr: up == 0
    for: 3m
    labels:
      alertype: itself
      severity: critical
    annotations:
      description: "{{ $labels.job }}:异常\\n> {{ $labels.group }}-{{ $labels.name }}-{{ $labels.instance }}"
"""
    return {"code": 20000, "rules": rules}
|