From 230843e95af8edbbc6ef4163687ab54afb067467 Mon Sep 17 00:00:00 2001 From: "StarsL.cn" Date: Sat, 4 Jun 2022 17:36:18 +0800 Subject: [PATCH] v0.5.2 --- flask-consul/units/gen_config.py | 107 +++++++++++++++++++ flask-consul/units/selfnode_manager.py | 2 + flask-consul/units/token_auth.py | 7 +- flask-consul/views/login.py | 2 +- flask-consul/views/nodes.py | 3 +- flask-consul/views/selfnode.py | 1 + vue-consul/src/api/node-exporter.js | 6 ++ vue-consul/src/router/index.js | 6 ++ vue-consul/src/views/blackbox/index.vue | 2 +- vue-consul/src/views/dashboard/index.vue | 9 ++ vue-consul/src/views/login/index.vue | 2 +- vue-consul/src/views/node-exporter/rules.vue | 51 +++++++++ 12 files changed, 193 insertions(+), 5 deletions(-) create mode 100644 vue-consul/src/views/node-exporter/rules.vue diff --git a/flask-consul/units/gen_config.py b/flask-consul/units/gen_config.py index 0117089..f9aef97 100644 --- a/flask-consul/units/gen_config.py +++ b/flask-consul/units/gen_config.py @@ -39,3 +39,110 @@ def ecs_config(services_list,ostype_list): """ configs = configs + config_str return {'code': 20000,'configs': configs } +def get_rules(): + rules = """ +groups: +- name: node_usage_record_rules + interval: 1m + rules: + - record: cpu:usage:rate1m + expr: (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,vendor,account,group,name)) * 100 + - record: mem:usage:rate1m + expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 + +- name: node-exporter + rules: + - alert: 内存使用率 + expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90 + for: 5m + labels: + alertype: system + severity: warning + annotations: + description: "{{ $labels.name }}:内存使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}" + + - alert: CPU使用率 + expr: 100 - (avg by(instance,name,group,account) (irate(node_cpu_seconds_total[5m])) * 100) > 92 + for: 5m + labels: + alertype: system + severity: warning + annotations: + description: "{{ $labels.name }}:CPU使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}" + + - alert: 系统负载 + expr: node_load5 / on (instance,name,group,account) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance,name,group,account)) by(instance,name,group,account) > 1.7 + for: 10m + labels: + alertype: system + severity: warning + annotations: + description: "{{ $labels.name }}:系统负载{{ $value | humanize }}倍\\n> {{ $labels.group }}-{{ $labels.instance }}" + + - alert: 磁盘使用率 + expr: | + 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"ext.?|xfs",mountpoint!~".*pods.*|/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85 + for: 5m + labels: + alertype: system + severity: warning + annotations: + description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}" + + - alert: K8S节点POD磁盘使用率 + expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{mountpoint=~"/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85 + for: 5m + labels: + alertype: system + severity: warning + annotations: + description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}" + + - alert: NFS磁盘使用率 + expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype="nfs"} * 100) > 90 + for: 5m + labels: + alertype: system + severity: warning + annotations: + description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\\n> {{ $labels.group }}-{{ $labels.instance }}" + + - alert: 磁盘读写容量 + expr: (irate(node_disk_read_bytes_total[5m]) ) /1024 /1024 > 80 or (irate(node_disk_written_bytes_total[5m]) ) /1024 /1024 > 80 + for: 8m + labels: + alertype: disk + severity: warning + annotations: + description: "{{ $labels.name }}_{{ $labels.device }}:当前IO为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}" + + - alert: 网络流入(下载)数据过多 + expr: sum by(device,instance, name, group, account) (irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70 + for: 5m + labels: + alertype: network + severity: warning + annotations: + description: "{{ $labels.name }}:流入数据为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}" + + - alert: 网络流出(上传)数据过多 + expr: sum by(device,instance, name, group, account) (irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70 + for: 5m + labels: + alertype: network + severity: warning + annotations: + description: "{{ $labels.name }}:流出数据为{{ $value | humanize }}MB/s\\n> {{ $labels.group }}-{{ $labels.instance }}" + +- name: Itself + rules: + - alert: Exporter状态 + expr: up == 0 + for: 3m + labels: + alertype: itself + severity: critical + annotations: + description: "{{ $labels.job }}:异常\\n> {{ $labels.group }}-{{ $labels.name }}-{{ $labels.instance }}" +""" + return {"code": 20000, "rules": rules} diff --git a/flask-consul/units/selfnode_manager.py b/flask-consul/units/selfnode_manager.py index 75865a0..38669dc 100644 --- a/flask-consul/units/selfnode_manager.py +++ b/flask-consul/units/selfnode_manager.py @@ -39,6 +39,8 @@ def get_service(): return {'code': 50000, 'data': f'{response.status_code}:{response.text}'} def add_service(vendor,account,region,group,name,ip,port,os): + if port is None or name is None: + return {"code": 50000, "data": f"名称或IP不能为空!"} sid = f"{vendor}/{account}/{region}/{group}@{name}" instance = f'{ip}:{port}' if '//' in sid or sid.startswith('/') or sid.endswith('/'): diff --git a/flask-consul/units/token_auth.py b/flask-consul/units/token_auth.py index 98fa31c..e083332 100644 --- a/flask-consul/units/token_auth.py +++ b/flask-consul/units/token_auth.py @@ -2,13 +2,18 @@ from flask_httpauth import HTTPTokenAuth from itsdangerous import TimedJSONWebSignatureSerializer from units import consul_kv secret_key = consul_kv.get_value('ConsulManager/assets/secret/skey')['sk'] -s = TimedJSONWebSignatureSerializer(secret_key) +s = TimedJSONWebSignatureSerializer(secret_key,expires_in=28800) auth = HTTPTokenAuth() @auth.verify_token def verify_token(token): try: data = s.loads(token) + except BadSignature: + raise AuthFailed(msg='token不正确') + except SignatureExpired: + raise AuthFailed(msg='token过期') + return {"code": 40000, "data": "登录过期,请重新登录!"} except: return False return True diff --git a/flask-consul/views/login.py b/flask-consul/views/login.py index 25bb043..1df652a 100644 --- a/flask-consul/views/login.py +++ b/flask-consul/views/login.py @@ -6,7 +6,7 @@ sys.path.append("..") from config import admin_passwd from units import token_auth, consul_kv secret_key = consul_kv.get_value('ConsulManager/assets/secret/skey')['sk'] -s = TimedJSONWebSignatureSerializer(secret_key) +s = TimedJSONWebSignatureSerializer(secret_key,expires_in=28800) blueprint = Blueprint('login',__name__) api = Api(blueprint) diff --git a/flask-consul/views/nodes.py b/flask-consul/views/nodes.py index 1cfc266..93901f2 100644 --- a/flask-consul/views/nodes.py +++ b/flask-consul/views/nodes.py @@ -38,7 +38,8 @@ class Nodes(Resource): serivces = i.split("/") services_list.append(f'{serivces[0]}_{serivces[1]}_{serivces[2]}') return {'code': 20000,'services_list': sorted(set(services_list))} - + elif stype == 'rules': + return gen_config.get_rules() def post(self, stype): if stype == 'config': args = parser.parse_args() diff --git a/flask-consul/views/selfnode.py b/flask-consul/views/selfnode.py index 80e86f7..7609d21 100644 --- a/flask-consul/views/selfnode.py +++ b/flask-consul/views/selfnode.py @@ -31,6 +31,7 @@ class SelfnodeApi(Resource): return selfnode_manager.get_service() def post(self): args = parser.parse_args() + print('=======\n',args,flush=True) return selfnode_manager.add_service(args['vendor'],args['account'],args['region'], args['group'],args['name'],args['ip'],args['port'],args['os']) def put(self): diff --git a/vue-consul/src/api/node-exporter.js b/vue-consul/src/api/node-exporter.js index 7186800..1cdc261 100644 --- a/vue-consul/src/api/node-exporter.js +++ b/vue-consul/src/api/node-exporter.js @@ -59,3 +59,9 @@ export function getConfig(services_dict) { data: { services_dict } }) } +export function getRules() { + return request({ + url: '/api/nodes/rules', + method: 'get' + }) +} diff --git a/vue-consul/src/router/index.js b/vue-consul/src/router/index.js index 9095255..ab6a0ea 100644 --- a/vue-consul/src/router/index.js +++ b/vue-consul/src/router/index.js @@ -113,6 +113,12 @@ export const constantRoutes = [ component: () => import('@/views/node-exporter/pconfig'), meta: { title: 'Prometheus 配置', icon: 'el-icon-set-up' } }, + { + path: 'rules', + name: '告警规则', + component: () => import('@/views/node-exporter/rules'), + meta: { title: '告警规则', icon: 'el-icon-bell' } + }, { path: 'grafana', name: 'Grafana 看板', diff --git a/vue-consul/src/views/blackbox/index.vue b/vue-consul/src/views/blackbox/index.vue index 22f97f1..3bcb29f 100644 --- a/vue-consul/src/views/blackbox/index.vue +++ b/vue-consul/src/views/blackbox/index.vue @@ -115,7 +115,7 @@ - + 前5个字段组合后需唯一,重复会覆盖已有监控项! diff --git a/vue-consul/src/views/dashboard/index.vue b/vue-consul/src/views/dashboard/index.vue index 6dc54d0..566f14f 100644 --- a/vue-consul/src/views/dashboard/index.vue +++ b/vue-consul/src/views/dashboard/index.vue @@ -4,6 +4,15 @@ StarsL.cn + + +

v0.5.2

+

增加了node-exporter的告警规则。

+

修正了一个新增自建主机监控项的bug。

+

新增站点监控的描述做了优化。

+

登录过期时间修改为8小时。

+
+

v0.5.1

diff --git a/vue-consul/src/views/login/index.vue b/vue-consul/src/views/login/index.vue index a1a5bf9..ea17a95 100644 --- a/vue-consul/src/views/login/index.vue +++ b/vue-consul/src/views/login/index.vue @@ -46,7 +46,7 @@
- v0.5.1 + v0.5.2
diff --git a/vue-consul/src/views/node-exporter/rules.vue b/vue-consul/src/views/node-exporter/rules.vue new file mode 100644 index 0000000..e4a2c3e --- /dev/null +++ b/vue-consul/src/views/node-exporter/rules.vue @@ -0,0 +1,51 @@ + + + +