diff --git a/node-mixin/.gitignore b/node-mixin/.gitignore new file mode 100644 index 00000000..65d141bd --- /dev/null +++ b/node-mixin/.gitignore @@ -0,0 +1,3 @@ +/jsonnetfile.lock.json +/vendor/ + diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet new file mode 100644 index 00000000..8ea70cc7 --- /dev/null +++ b/node-mixin/alerts/alerts.libsonnet @@ -0,0 +1,165 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'node-exporter', + rules: [ + { + alert: 'NodeFilesystemSpaceFillingUp', // Warning tier: a linear fit over the last 6h predicts zero available bytes within 24h, the filesystem is below 40% available and is not read-only. + expr: ||| + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + and + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.', + }, + }, + { + alert: 'NodeFilesystemSpaceFillingUp', // Critical tier: same check with a 4h horizon and a 20% available-space ceiling. + expr: ||| + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + and + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', // Warning tier: less than 5% of the filesystem is available and it is not read-only. + expr: ||| + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + and + 
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', // Critical tier: less than 3% of the filesystem is available and it is not read-only. + expr: ||| + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', // Warning tier: a linear fit over 6h of node_filesystem_files_free predicts zero within 24h while less than 40% of files are free. + expr: ||| + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + and + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', // Critical tier: same check with a 4h horizon and less than 20% of files free. + expr: ||| + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + and + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to 
run out of files within the next 4 hours.', + }, + }, + { + alert: 'NodeFilesystemOutOfFiles', // Warning tier: less than 5% of files (inodes) are free on a filesystem that is not read-only. + expr: ||| + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', + }, + }, + { + alert: 'NodeFilesystemOutOfFiles', // Critical tier of the inode alert (less than 3% free); the expression reads node_filesystem_files_free, so name and message refer to files/inodes, not space. + expr: ||| + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', + }, + }, + { + alert: 'NodeNetworkReceiveErrs', // More than 10 receive errors within a 2m window, holding for 1h. + expr: ||| + increase(node_network_receive_errs_total[2m]) > 10 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).', + }, + }, + { + alert: 'NodeNetworkTransmitErrs', // More than 10 transmit errors within a 2m window, holding for 1h. + expr: ||| + increase(node_network_transmit_errs_total[2m]) > 10 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', + }, + }, + ], + }, + ], + }, +} diff --git a/node-mixin/config.libsonnet b/node-mixin/config.libsonnet new file mode 100644 index 00000000..6c5d6f74 --- /dev/null +++ b/node-mixin/config.libsonnet @@ -0,0 +1,11 @@ +{ + _config+:: { + // Selectors 
are inserted between {} in Prometheus queries. + nodeExporterSelector: 'job="node-exporter"', + + // Mainly extracted because they are repetitive, but also useful to customize. + fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', + + grafana_prefix: '', + }, +} diff --git a/node-mixin/dashboards/dashboards.libsonnet b/node-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 00000000..e6adbd4f --- /dev/null +++ b/node-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,2 @@ +(import 'node.libsonnet') + +(import 'use.libsonnet') diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet new file mode 100644 index 00000000..4594e3ed --- /dev/null +++ b/node-mixin/dashboards/node.libsonnet @@ -0,0 +1,170 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet'; +local gauge = promgrafonnet.gauge; + +{ + grafanaDashboards+:: { + 'nodes.json': + local idleCPU = // Local name kept; the query is 1 - idle, so the panel shows the busy ratio per CPU. + graphPanel.new( + 'CPU Usage', + datasource='$datasource', + span=6, + format='percentunit', + max=1, // percentunit values are ratios, so the axis tops out at 1 (100%). + min=0, + ) + .addTarget(prometheus.target( + ||| + 1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) + ||| % $._config, + legendFormat='{{cpu}}', + intervalFactor=10, + )); + + local systemLoad = + graphPanel.new( + 'System load', + datasource='$datasource', + span=6, + format='short', // Load averages are plain numbers, not ratios. + ) + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % 
$._config, legendFormat='load 15m')); + + local memoryGraph = + graphPanel.new( + 'Memory Usage', + datasource='$datasource', + span=9, + format='bytes', + ) + .addTarget(prometheus.target( + ||| + node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} + ||| % $._config, legendFormat='memory used' + )) + .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) + .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) + .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); + + local memoryGauge = gauge.new( + 'Memory Usage', // Usage ratio: 1 minus the available share of total memory. + ||| + 1 - node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"} + / + node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} + ||| % $._config, + ).withLowerBeingBetter(); + + local diskIO = + graphPanel.new( + 'Disk I/O', + datasource='$datasource', + span=9, + ) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + + { + seriesOverrides: [ + { + alias: 'read', + yaxis: 1, + }, + { + alias: 'io time', + yaxis: 2, + }, + ], + yaxes: [ + self.yaxe(format='bytes'), + 
self.yaxe(format='s'), // Second axis carries 'io time', which is read from a seconds counter. + ], + }; + + local diskSpaceUsage = gauge.new( + 'Disk Space Usage', // Used ratio: 1 minus free bytes over total bytes, summed across filesystems of the instance. + ||| + 1 - ( + sum(node_filesystem_free_bytes{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + / + sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + ) + ||| % $._config, + ).withLowerBeingBetter(); + + local networkReceived = + graphPanel.new( + 'Network Received', + datasource='$datasource', + span=6, + format='bytes', + ) + .addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); + + local networkTransmitted = + graphPanel.new( + 'Network Transmitted', + datasource='$datasource', + span=6, + format='bytes', + ) + .addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); + + dashboard.new('Nodes', time_from='now-1h') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + ) + .addTemplate( + template.new( + 'instance', + '$datasource', + 'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config, + refresh='time', + ) + ) + .addRow( + row.new() + .addPanel(idleCPU) + .addPanel(systemLoad) + ) + .addRow( + row.new() + .addPanel(memoryGraph) + .addPanel(memoryGauge) + ) + .addRow( + row.new() + .addPanel(diskIO) + .addPanel(diskSpaceUsage) + ) + .addRow( + row.new() + .addPanel(networkReceived) + .addPanel(networkTransmitted) + ), + }, +} diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet new file mode 100644 index 00000000..3e368c86 --- /dev/null +++ b/node-mixin/dashboards/use.libsonnet @@ -0,0 +1,151 @@ +local g = import 'grafana-builder/grafana.libsonnet'; + +{ + 
grafanaDashboards+:: { + 'node-cluster-rsrc-use.json': // Cluster-wide USE dashboard; legend entries link to the per-node dashboard defined in this file. + local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix; + + g.dashboard('USE Method / Cluster') + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('CPU Saturation (Load1)') + + g.queryPanel(||| + instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Utilisation') + + g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Memory Saturation (Swap I/O)') + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk IO Utilisation') + + // Full utilisation would be all disks on each node spending an average of + // 1 sec per second doing I/O, normalize by node count for stacked charts + g.queryPanel(||| + instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + g.queryPanel(||| + instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Net Utilisation 
(Transmitted)') + + g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + .addPanel( + g.panel('Net Saturation (Dropped)') + + g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Storage') // NOTE(review): the query below hard-codes fstype=~"ext[24]" and is not formatted with $._config, unlike the other panels; confirm this is intended. + .addPanel( + g.panel('Disk Capacity') + + g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ), + ), + + 'node-rsrc-use.json': // Per-node USE dashboard; panel queries select instance="$instance", the template registered by addTemplate below. + g.dashboard('USE Method / Node') + .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance') + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('CPU Saturation (Load1)') + + g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Utilisation') + + g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Memory Saturation (Swap I/O)') + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk IO Utilisation') + + g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + 
g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Net') + .addPanel( + g.panel('Net Utilisation (Transmitted)') + + g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('Bps') }, + ) + .addPanel( + g.panel('Net Saturation (Dropped)') + + g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') // Filesystem usage of the selected instance only, like every other panel on this dashboard. + .addPanel( + g.panel('Disk Utilisation') + + g.queryPanel('1 - sum(max by (device, instance) (node_filesystem_free_bytes{fstype=~"ext[24]", instance="$instance"})) / sum(max by (device, instance) (node_filesystem_size_bytes{fstype=~"ext[24]", instance="$instance"}))', 'Disk') + + { yaxes: g.yaxes('percentunit') }, + ), + ), + }, +} diff --git a/node-mixin/jsonnetfile.json b/node-mixin/jsonnetfile.json new file mode 100644 index 00000000..45326aad --- /dev/null +++ b/node-mixin/jsonnetfile.json @@ -0,0 +1,24 @@ +{ + "dependencies": [ + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/kausalco/public", + "subdir": "grafana-builder" + } + }, + "version": "master" + } + ] +} diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet new file mode 100644 index 00000000..43640b6d --- /dev/null +++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet @@ -0,0 +1,60 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local singlestat = grafana.singlestat; +local prometheus = grafana.prometheus; + +{ + new(title, query):: + singlestat.new( + title, + datasource='$datasource', + span=3, + format='percentunit', + valueName='current', + colors=[ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 
0.97)', + ], + thresholds='0.5, 0.8', // Ratio scale: the panel format is percentunit, so query values are expected in 0..1. + valueMaps=[ + { + op: '=', + text: 'N/A', + value: 'null', + }, + ], + ) + .addTarget( + prometheus.target( + query + ) + ) + { + gauge: { + maxValue: 1, // Full scale is a ratio of 1 (displayed as 100%). + minValue: 0, + show: true, + thresholdLabels: false, + thresholdMarkers: true, + }, + withTextNullValue(text):: self { + valueMaps: [ + { + op: '=', + text: text, + value: 'null', + }, + ], + }, + withSpanSize(size):: self { + span: size, + }, + withLowerBeingBetter():: self { + colors: [ + 'rgba(50, 172, 45, 0.97)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(245, 54, 54, 0.9)', + ], + thresholds: '0.8, 0.9', // Same ratio scale as the default thresholds, with the colour order reversed. + }, + }, +} diff --git a/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet new file mode 100644 index 00000000..bc1d6f6f --- /dev/null +++ b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet @@ -0,0 +1,48 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local singlestat = grafana.singlestat; +local prometheus = grafana.prometheus; + +{ + new(title, query):: + singlestat.new( + title, + datasource='prometheus', // NOTE(review): gauge.libsonnet uses the '$datasource' template variable here; confirm the fixed datasource name is intended. + span=3, + valueName='current', + valueMaps=[ + { + op: '=', + text: '0', + value: 'null', + }, + ], + ) + .addTarget( + prometheus.target( + query + ) + ) + { + withTextNullValue(text):: self { + valueMaps: [ + { + op: '=', + text: text, + value: 'null', + }, + ], + }, + withSpanSize(size):: self { + span: size, + }, + withPostfix(postfix):: self { + postfix: postfix, + }, + withSparkline():: self { + sparkline: { + show: true, + lineColor: 'rgb(31, 120, 193)', + fillColor: 'rgba(31, 118, 189, 0.18)', + }, + }, + }, +} diff --git a/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet new file mode 100644 index 00000000..013ff42b --- /dev/null +++ b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet @@ -0,0 +1,5 @@ +{ + numbersinglestat:: import 'numbersinglestat.libsonnet', + gauge:: import 'gauge.libsonnet', + percentlinegraph:: 
import 'percentlinegraph.libsonnet', // NOTE(review): percentlinegraph.libsonnet is not among the files visible in this diff; confirm it exists before this field is used. +} diff --git a/node-mixin/mixin.libsonnet b/node-mixin/mixin.libsonnet new file mode 100644 index 00000000..b9831f93 --- /dev/null +++ b/node-mixin/mixin.libsonnet @@ -0,0 +1,4 @@ +(import 'config.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet new file mode 100644 index 00000000..f836d0d0 --- /dev/null +++ b/node-mixin/rules/rules.libsonnet @@ -0,0 +1,106 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'node-exporter.rules', + rules: [ + { + // This rule gives the number of CPUs per node, counted as distinct cpu label values per instance. + record: 'instance:node_num_cpu:sum', + expr: ||| + count by (instance) ( + sum by (instance, cpu) ( + node_cpu_seconds_total{%(nodeExporterSelector)s} + ) + ) + ||| % $._config, + }, + { + // CPU utilisation is the fraction of CPU time not spent idle, averaged per instance over a 1m rate. + record: 'instance:node_cpu_utilisation:avg1m', + expr: ||| + 1 - avg by (instance) ( + rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m]) + ) + ||| % $._config, + }, + { + // CPU saturation is 1min avg run queue length / number of CPUs. + // Can go over 100%. >100% is bad. 
+ record: 'instance:node_cpu_saturation_load1:', + expr: ||| + sum by (instance) (node_load1{%(nodeExporterSelector)s}) + / + instance:node_num_cpu:sum + ||| % $._config, + }, + { + // Total memory per node, in bytes (sum of MemTotal per instance). + record: 'instance:node_memory_bytes_total:sum', + expr: ||| + sum by (instance) ( + node_memory_MemTotal_bytes{%(nodeExporterSelector)s} + ) + ||| % $._config, + }, + { + // Memory utilisation per node as a ratio: 1 minus MemAvailable over MemTotal. + record: 'instance:node_memory_utilisation:ratio', + expr: ||| + 1 - ( + node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} + / + node_memory_MemTotal_bytes{%(nodeExporterSelector)s} + ) + ||| % $._config, + }, + { + record: 'instance:node_memory_swap_io_bytes:sum_rate', // Per-instance 1m rate of pgpgin plus pgpgout, scaled by 1e3. NOTE(review): the factor presumably converts kilobytes to bytes; confirm the unit of these counters. + expr: ||| + 1e3 * sum by (instance) ( + (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) + + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])) + ) + ||| % $._config, + }, + { + // Disk utilisation: irate of the I/O time counter (seconds per second), summed per instance over sd* and xvd* devices. + record: 'instance:node_disk_utilisation:sum_irate', + expr: ||| + sum by (instance) ( + irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) + ) + ||| % $._config, + }, + { + // Disk saturation: irate of the weighted I/O time counter (seconds per second), summed per instance over sd* and xvd* devices. + record: 'instance:node_disk_saturation:sum_irate', + expr: ||| + sum by (instance) ( + irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) + ) + ||| % $._config, + }, + { + record: 'instance:node_net_utilisation:sum_irate', // Received plus transmitted bytes per second, summed per instance over eth<N> devices. + expr: ||| + sum by (instance) ( + (irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) + ) + ||| % $._config, + }, + { + record: 'instance:node_net_saturation:sum_irate', // Received plus transmitted drops per second, summed per instance over eth<N> devices. + expr: ||| + sum by (instance) ( + (irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + 
irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) + ) + ||| % $._config, // Every rule expression in this group is formatted with $._config to fill in nodeExporterSelector. + }, + ], + }, + ], + }, +}