From 580c497261dd03026a83266d2a383d7546fbbfd8 Mon Sep 17 00:00:00 2001
From: Vitaly Zhuravlev
Date: Tue, 28 Mar 2023 06:58:17 +0800
Subject: [PATCH] Add NodeSystemSaturation and NodeMemoryMajorPagesFaults

NodeSystemSaturation (critical) fires when node_load1 divided by the
count of per-CPU idle series stays above 2 for 15 minutes.

NodeMemoryMajorPagesFaults (warning) fires when the 5m rate of
node_vmstat_pgmajfault stays above 500 for 15 minutes.

Also remove a stray trailing comma from the disk IO queue alert
description.

Signed-off-by: Vitaly Zhuravlev
---
 docs/node-mixin/alerts/alerts.libsonnet | 37 ++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
index 2ea61ba1..071cfd8d 100644
--- a/docs/node-mixin/alerts/alerts.libsonnet
+++ b/docs/node-mixin/alerts/alerts.libsonnet
@@ -323,6 +323,41 @@
               description: 'CPU usage at {{ $labels.instance }} has been above 80% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.',
             },
           },
+          {
+            alert: 'NodeSystemSaturation',
+            expr: |||
+              node_load1{%(nodeExporterSelector)s}
+              / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > 2
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'System saturated, load per core is very high.',
+              description: |||
+                System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+                This might indicate resource saturation on this instance and can cause it to become unresponsive.
+              |||,
+            },
+          },
+          {
+            alert: 'NodeMemoryMajorPagesFaults',
+            expr: |||
+              rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > 500
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Memory major page faults are occurring at very high rate.',
+              description: |||
+                Memory major page faults are occurring at very high rate at {{ $labels.instance }}, above 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+                Please check that there is enough memory available at this instance.
+              |||,
+            },
+          },
           {
             alert: 'NodeMemoryHighUtilization',
             expr: |||
@@ -352,7 +387,7 @@
               summary: 'Disk IO queue is high.',
               description: |||
                 Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
-                This symptom might indicate disk saturation.,
+                This symptom might indicate disk saturation.
               |||,
             },
           },