From bafe1707f13f9da58c7a88b42f15ab596f649ba9 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 8 May 2018 12:10:29 +0200 Subject: [PATCH 01/31] Beginnings of a node-exporter monitoring mixin. Signed-off-by: Tom Wilkie --- node-mixin/alerts/alerts.libsonnet | 165 ++++++++++++++++ node-mixin/config.libsonnet | 11 ++ node-mixin/dashboards/dashboards.libsonnet | 2 + node-mixin/dashboards/node.libsonnet | 176 ++++++++++++++++++ node-mixin/dashboards/use.libsonnet | 151 +++++++++++++++ node-mixin/jsonnetfile.json | 24 +++ node-mixin/lib/promgrafonnet/gauge.libsonnet | 60 ++++++ .../promgrafonnet/numbersinglestat.libsonnet | 48 +++++ .../lib/promgrafonnet/promgrafonnet.libsonnet | 5 + node-mixin/mixin.libsonnet | 4 + node-mixin/rules/rules.libsonnet | 121 ++++++++++++ 11 files changed, 767 insertions(+) create mode 100644 node-mixin/alerts/alerts.libsonnet create mode 100644 node-mixin/config.libsonnet create mode 100644 node-mixin/dashboards/dashboards.libsonnet create mode 100644 node-mixin/dashboards/node.libsonnet create mode 100644 node-mixin/dashboards/use.libsonnet create mode 100644 node-mixin/jsonnetfile.json create mode 100644 node-mixin/lib/promgrafonnet/gauge.libsonnet create mode 100644 node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet create mode 100644 node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet create mode 100644 node-mixin/mixin.libsonnet create mode 100644 node-mixin/rules/rules.libsonnet diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet new file mode 100644 index 00000000..198e22fd --- /dev/null +++ b/node-mixin/alerts/alerts.libsonnet @@ -0,0 +1,165 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'node', + rules: [ + { + alert: 'NodeFilesystemSpaceFillingUp', + expr: ||| + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + AND + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / 
node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.', + }, + }, + { + alert: 'NodeFilesystemSpaceFillingUp', + expr: ||| + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + AND + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', + expr: ||| + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', + expr: ||| + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + 
alert: 'NodeFilesystemFilesFillingUp', + expr: ||| + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + AND + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', + expr: ||| + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + AND + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.', + }, + }, + { + alert: 'NodeFilesystemOutOfFiles', + expr: ||| + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', + expr: ||| + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + AND + 
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeNetworkReceiveErrs', + expr: ||| + increase(node_network_receive_errs[2m]) > 10 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).', + }, + }, + { + alert: 'NodeNetworkTransmitErrs', + expr: ||| + increase(node_network_transmit_errs[2m]) > 10 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', + }, + }, + ], + }, + ], + }, +} diff --git a/node-mixin/config.libsonnet b/node-mixin/config.libsonnet new file mode 100644 index 00000000..6c5d6f74 --- /dev/null +++ b/node-mixin/config.libsonnet @@ -0,0 +1,11 @@ +{ + _config+:: { + // Selectors are inserted between {} in Prometheus queries. + nodeExporterSelector: 'job="node-exporter"', + + // Mainly extracted because they are repetitive, but also useful to customize. 
+ fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', + + grafana_prefix: '', + }, +} diff --git a/node-mixin/dashboards/dashboards.libsonnet b/node-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 00000000..e6adbd4f --- /dev/null +++ b/node-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,2 @@ +(import 'node.libsonnet') + +(import 'use.libsonnet') diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet new file mode 100644 index 00000000..471c5b37 --- /dev/null +++ b/node-mixin/dashboards/node.libsonnet @@ -0,0 +1,176 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet'; +local gauge = promgrafonnet.gauge; + +{ + grafanaDashboards+:: { + 'nodes.json': + local idleCPU = + graphPanel.new( + 'Idle CPU', + datasource='$datasource', + span=6, + format='percent', + max=100, + min=0, + ) + .addTarget(prometheus.target( + ||| + 100 - (avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[5m])) * 100) + ||| % $._config, + legendFormat='{{cpu}}', + intervalFactor=10, + )); + + local systemLoad = + graphPanel.new( + 'System load', + datasource='$datasource', + span=6, + format='percent', + ) + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 1m')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 5m')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 15m')); + + local memoryGraph = + graphPanel.new( + 'Memory Usage', + datasource='$datasource', + span=9, + format='bytes', + ) + 
.addTarget(prometheus.target( + ||| + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + ||| % $._config, legendFormat='memory used' + )) + .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) + .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) + .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); + + local memoryGauge = gauge.new( + 'Memory Usage', + ||| + ( + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + ) * 100 + / + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + ||| % $._config, + ).withLowerBeingBetter(); + + local diskIO = + graphPanel.new( + 'Disk I/O', + datasource='$datasource', + span=9, + ) + .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_read{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='read')) + .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_written{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='written')) + .addTarget(prometheus.target('sum by (instance) (rate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='io time')) + + { + seriesOverrides: [ + { + alias: 'read', + yaxis: 1, + }, + { + alias: 'io time', + yaxis: 2, + }, + 
], + yaxes: [ + self.yaxe(format='bytes'), + self.yaxe(format='ms'), + ], + }; + + local diskSpaceUsage = gauge.new( + 'Disk Space Usage', + ||| + ( + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + ) * 100 + / + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + ||| % $._config, + ).withLowerBeingBetter(); + + local networkReceived = + graphPanel.new( + 'Network Received', + datasource='$datasource', + span=6, + format='bytes', + ) + .addTarget(prometheus.target('rate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + + local networkTransmitted = + graphPanel.new( + 'Network Transmitted', + datasource='$datasource', + span=6, + format='bytes', + ) + .addTarget(prometheus.target('rate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + + dashboard.new('Nodes', time_from='now-1h') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + ) + .addTemplate( + template.new( + 'instance', + '$datasource', + 'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config, + refresh='time', + ) + ) + .addRow( + row.new() + .addPanel(idleCPU) + .addPanel(systemLoad) + ) + .addRow( + row.new() + .addPanel(memoryGraph) + .addPanel(memoryGauge) + ) + .addRow( + row.new() + .addPanel(diskIO) + .addPanel(diskSpaceUsage) + ) + .addRow( + row.new() + .addPanel(networkReceived) + .addPanel(networkTransmitted) + ), + }, +} diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet new file mode 100644 index 
00000000..526002f6 --- /dev/null +++ b/node-mixin/dashboards/use.libsonnet @@ -0,0 +1,151 @@ +local g = import 'grafana-builder/grafana.libsonnet'; + +{ + grafanaDashboards+:: { + 'node-cluster-rsrc-use.json': + local legendLink = '%s/dashboard/file/k8s-node-rsrc-use.json' % $._config.grafana_prefix; + + g.dashboard('USE Method / Cluster') + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('CPU Saturation (Load1)') + + g.queryPanel(||| + instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Utilisation') + + g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Memory Saturation (Swap I/O)') + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk IO Utilisation') + + // Full utilisation would be all disks on each node spending an average of + // 1 sec per second doing I/O, normalize by node count for stacked charts + g.queryPanel(||| + instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + g.queryPanel(||| + instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', 
legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Net Utilisation (Transmitted)') + + g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + .addPanel( + g.panel('Net Saturation (Dropped)') + + g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Storage') + .addPanel( + g.panel('Disk Capacity') + + g.queryPanel('sum(max(node_filesystem_size{fstype=~"ext[24]"} - node_filesystem_free{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ), + ), + + 'k8s-node-rsrc-use.json': + g.dashboard('K8s / USE Method / Node') + .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance') + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('CPU Saturation (Load1)') + + g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Utilisation') + + g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Memory Saturation (Swap I/O)') + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk IO Utilisation') + + 
g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Net') + .addPanel( + g.panel('Net Utilisation (Transmitted)') + + g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('Bps') }, + ) + .addPanel( + g.panel('Net Saturation (Dropped)') + + g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk Utilisation') + + g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size{fstype=~"ext[24]"}))', 'Disk') + + { yaxes: g.yaxes('percentunit') }, + ), + ), + }, +} diff --git a/node-mixin/jsonnetfile.json b/node-mixin/jsonnetfile.json new file mode 100644 index 00000000..45326aad --- /dev/null +++ b/node-mixin/jsonnetfile.json @@ -0,0 +1,24 @@ +{ + "dependencies": [ + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/kausalco/public", + "subdir": "grafana-builder" + } + }, + "version": "master" + } + ] +} diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet new file mode 100644 index 00000000..ea6c1ab6 --- /dev/null +++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet @@ -0,0 +1,60 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local singlestat = grafana.singlestat; +local prometheus = grafana.prometheus; + +{ + new(title, query):: + singlestat.new( + title, + 
datasource='prometheus', + span=3, + format='percent', + valueName='current', + colors=[ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + thresholds='50, 80', + valueMaps=[ + { + op: '=', + text: 'N/A', + value: 'null', + }, + ], + ) + .addTarget( + prometheus.target( + query + ) + ) + { + gauge: { + maxValue: 100, + minValue: 0, + show: true, + thresholdLabels: false, + thresholdMarkers: true, + }, + withTextNullValue(text):: self { + valueMaps: [ + { + op: '=', + text: text, + value: 'null', + }, + ], + }, + withSpanSize(size):: self { + span: size, + }, + withLowerBeingBetter():: self { + colors: [ + 'rgba(50, 172, 45, 0.97)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(245, 54, 54, 0.9)', + ], + thresholds: '80, 90', + }, + }, +} diff --git a/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet new file mode 100644 index 00000000..bc1d6f6f --- /dev/null +++ b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet @@ -0,0 +1,48 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local singlestat = grafana.singlestat; +local prometheus = grafana.prometheus; + +{ + new(title, query):: + singlestat.new( + title, + datasource='prometheus', + span=3, + valueName='current', + valueMaps=[ + { + op: '=', + text: '0', + value: 'null', + }, + ], + ) + .addTarget( + prometheus.target( + query + ) + ) + { + withTextNullValue(text):: self { + valueMaps: [ + { + op: '=', + text: text, + value: 'null', + }, + ], + }, + withSpanSize(size):: self { + span: size, + }, + withPostfix(postfix):: self { + postfix: postfix, + }, + withSparkline():: self { + sparkline: { + show: true, + lineColor: 'rgb(31, 120, 193)', + fillColor: 'rgba(31, 118, 189, 0.18)', + }, + }, + }, +} diff --git a/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet new file mode 100644 index 00000000..013ff42b --- /dev/null +++ 
b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet @@ -0,0 +1,5 @@ +{ + numbersinglestat:: import 'numbersinglestat.libsonnet', + gauge:: import 'gauge.libsonnet', + percentlinegraph:: import 'percentlinegraph.libsonnet', +} diff --git a/node-mixin/mixin.libsonnet b/node-mixin/mixin.libsonnet new file mode 100644 index 00000000..b9831f93 --- /dev/null +++ b/node-mixin/mixin.libsonnet @@ -0,0 +1,4 @@ +(import 'config.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet new file mode 100644 index 00000000..ad1cc09b --- /dev/null +++ b/node-mixin/rules/rules.libsonnet @@ -0,0 +1,121 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'node.rules', + rules: [ + { + // This rule gives the number of CPUs per node. + record: 'instance:node_num_cpu:sum', + expr: ||| + count by (instance) ( + sum by (instance, cpu) ( + node_cpu{%(nodeExporterSelector)s} + ) + ) + ||| % $._config, + }, + { + // CPU utilisation is % CPU is not idle. + record: 'instance:node_cpu_utilisation:avg1m', + expr: ||| + 1 - avg by (instance) ( + rate(node_cpu{%(nodeExporterSelector)s,mode="idle"}[1m]) + ) + ||| % $._config, + }, + { + // CPU saturation is 1min avg run queue length / number of CPUs. + // Can go over 100%. >100% is bad. 
+ record: 'instance:node_cpu_saturation_load1:', + expr: ||| + sum by (instance) ( + node_load1{%(nodeExporterSelector)s} + ) + / + instance:node_num_cpu:sum + ||| % $._config, + }, + { + // Available memory per node + record: 'instance:node_memory_bytes_available:sum', + expr: ||| + sum by (instance) ( + (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s}) + ) + ||| % $._config, + }, + { + // Total memory per node + record: 'instance:node_memory_bytes_total:sum', + expr: ||| + sum by (instance) ( + node_memory_MemTotal{%(nodeExporterSelector)s} + ) + ||| % $._config, + }, + { + // Memory utilisation per node, normalized by per-node memory + record: 'instance:node_memory_utilisation:ratio', + expr: ||| + (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum) + / + scalar(sum(instance:node_memory_bytes_total:sum)) + |||, + }, + { + record: 'instance:node_memory_utilisation:', + expr: ||| + 1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum) + ||| % $._config, + }, + { + record: 'instance:node_memory_swap_io_bytes:sum_rate', + expr: ||| + 1e3 * sum by (instance) ( + (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) + + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])) + ) + ||| % $._config, + }, + { + // Disk utilisation (ms spent, by rate() it's bound by 1 second) + record: 'instance:node_disk_utilisation:avg_irate', + expr: ||| + avg by (instance) ( + irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + ) + ||| % $._config, + }, + { + // Disk saturation (ms spent, by rate() it's bound by 1 second) + record: 'instance:node_disk_saturation:avg_irate', + expr: ||| + avg by (instance) ( + irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + ) + ||| % $._config, + }, + { + record: 'instance:node_net_utilisation:sum_irate', + expr: ||| + 
sum by (instance) ( + (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) + + irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m])) + ) + ||| % $._config, + }, + { + record: 'instance:node_net_saturation:sum_irate', + expr: ||| + sum by (instance) ( + (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) + + irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m])) + ) + ||| % $._config, + }, + ], + }, + ], + }, +} From 9303cf78ff1713ac7e114c2f0fc9da9b99577ffa Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 10 May 2018 10:35:35 +0200 Subject: [PATCH 02/31] Lower case binary operators and fix indentation. Signed-off-by: Tom Wilkie --- node-mixin/alerts/alerts.libsonnet | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet index 198e22fd..c66d76db 100644 --- a/node-mixin/alerts/alerts.libsonnet +++ b/node-mixin/alerts/alerts.libsonnet @@ -7,11 +7,11 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 - AND - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + and + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -24,11 +24,11 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 - AND - 
node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + and + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -41,9 +41,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -56,9 +56,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -71,11 +71,11 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 - AND - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 - 
AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + and + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -88,11 +88,11 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 - AND - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + and + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -105,9 +105,9 @@ { alert: 'NodeFilesystemOutOfFiles', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -120,9 +120,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 
< 3 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { From 417316b0e498ac661f6502d3df0896f2137fa255 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 10 May 2018 11:05:59 +0200 Subject: [PATCH 03/31] Switch to irate[1m] for node dashboard. Signed-off-by: Tom Wilkie --- node-mixin/dashboards/node.libsonnet | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet index 471c5b37..943864f9 100644 --- a/node-mixin/dashboards/node.libsonnet +++ b/node-mixin/dashboards/node.libsonnet @@ -15,13 +15,13 @@ local gauge = promgrafonnet.gauge; 'Idle CPU', datasource='$datasource', span=6, - format='percent', + format='percentunit', max=100, min=0, ) .addTarget(prometheus.target( ||| - 100 - (avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[5m])) * 100) + 1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) ||| % $._config, legendFormat='{{cpu}}', intervalFactor=10, @@ -32,11 +32,11 @@ local gauge = promgrafonnet.gauge; 'System load', datasource='$datasource', span=6, - format='percent', + format='percentunit', ) - .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 1m')) - .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 5m')) - .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 15m')); + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, 
instance="$instance"}' % $._config, legendFormat='load 1m')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m')); local memoryGraph = graphPanel.new( @@ -77,9 +77,9 @@ local gauge = promgrafonnet.gauge; datasource='$datasource', span=9, ) - .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_read{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='read')) - .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_written{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='written')) - .addTarget(prometheus.target('sum by (instance) (rate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='io time')) + + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + { seriesOverrides: [ { @@ -116,7 +116,7 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('rate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + .addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); local networkTransmitted = graphPanel.new( @@ -125,7 
+125,7 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('rate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + .addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); dashboard.new('Nodes', time_from='now-1h') .addTemplate( From c34275d6e587fc7d3a76d208f1ffc058adc82098 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 10 May 2018 11:21:00 +0200 Subject: [PATCH 04/31] Switch gauges to percentunit. Signed-off-by: Tom Wilkie --- node-mixin/dashboards/node.libsonnet | 18 ++++++------------ node-mixin/lib/promgrafonnet/gauge.libsonnet | 2 +- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet index 943864f9..ba092cf7 100644 --- a/node-mixin/dashboards/node.libsonnet +++ b/node-mixin/dashboards/node.libsonnet @@ -60,14 +60,9 @@ local gauge = promgrafonnet.gauge; local memoryGauge = gauge.new( 'Memory Usage', ||| - ( + node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} + / node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} - ) * 100 - / - node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} ||| % $._config, ).withLowerBeingBetter(); @@ -100,12 +95,11 @@ local gauge = promgrafonnet.gauge; local diskSpaceUsage = gauge.new( 'Disk Space Usage', ||| - ( - sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) - - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) - ) * 100 + 1 - ( + 
sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} / - sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} + ) ||| % $._config, ).withLowerBeingBetter(); diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet index ea6c1ab6..f69a5cdc 100644 --- a/node-mixin/lib/promgrafonnet/gauge.libsonnet +++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet @@ -8,7 +8,7 @@ local prometheus = grafana.prometheus; title, datasource='prometheus', span=3, - format='percent', + format='percentunit', valueName='current', colors=[ 'rgba(245, 54, 54, 0.9)', From 642f67ffa1f3d2738ca89430d722e66a2398e673 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 10 May 2018 11:35:48 +0200 Subject: [PATCH 05/31] Fix up some of the USE metrics. Signed-off-by: Tom Wilkie --- node-mixin/dashboards/use.libsonnet | 10 +++--- node-mixin/rules/rules.libsonnet | 47 ++++++++++------------------- 2 files changed, 21 insertions(+), 36 deletions(-) diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet index 526002f6..9231a746 100644 --- a/node-mixin/dashboards/use.libsonnet +++ b/node-mixin/dashboards/use.libsonnet @@ -45,7 +45,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; // Full utilisation would be all disks on each node spending an average of // 1 sec per second doing I/O, normalize by node count for stacked charts g.queryPanel(||| - instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) ||| % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, @@ -53,7 +53,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('Disk IO Saturation') + g.queryPanel(||| - 
instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) ||| % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, @@ -104,7 +104,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Memory') .addPanel( g.panel('Memory Utilisation') + - g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') + + g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( @@ -117,12 +117,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk IO Utilisation') + - g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Disk IO Saturation') + - g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') + + g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index ad1cc09b..7c70540e 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -29,20 +29,9 @@ // Can go over 100%. >100% is bad. 
record: 'instance:node_cpu_saturation_load1:', expr: ||| - sum by (instance) ( - node_load1{%(nodeExporterSelector)s} - ) + sum by (instance) (node_load1{%(nodeExporterSelector)s}) / - instance:node_num_cpu:sum - ||| % $._config, - }, - { - // Available memory per node - record: 'instance:node_memory_bytes_available:sum', - expr: ||| - sum by (instance) ( - (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s}) - ) + instance:node_num_cpu:sum ||| % $._config, }, { @@ -58,17 +47,13 @@ // Memory utilisation per node, normalized by per-node memory record: 'instance:node_memory_utilisation:ratio', expr: ||| - (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum) - / - scalar(sum(instance:node_memory_bytes_total:sum)) + 1 - ( + node_memory_MemAvailable{%(nodeExporterSelector)s} + / + node_memory_MemTotal{%(nodeExporterSelector)s} + ) |||, }, - { - record: 'instance:node_memory_utilisation:', - expr: ||| - 1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum) - ||| % $._config, - }, { record: 'instance:node_memory_swap_io_bytes:sum_rate', expr: ||| @@ -79,19 +64,19 @@ ||| % $._config, }, { - // Disk utilisation (ms spent, by rate() it's bound by 1 second) - record: 'instance:node_disk_utilisation:avg_irate', + // Disk utilisation (ms spent, 1 second irate()) + record: 'instance:node_disk_utilisation:sum_irate', expr: ||| - avg by (instance) ( + sum by (instance) ( irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 ) ||| % $._config, }, { // Disk saturation (ms spent, by rate() it's bound by 1 second) - record: 'instance:node_disk_saturation:avg_irate', + record: 'instance:node_disk_saturation:sum_irate', expr: ||| - avg by (instance) ( + sum by (instance) ( irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 ) ||| % $._config, @@ -100,8 +85,8 @@ record: 
'instance:node_net_utilisation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) + - irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m])) + (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) ) ||| % $._config, }, @@ -109,8 +94,8 @@ record: 'instance:node_net_saturation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) + - irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m])) + (irate(node_network_receive_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) ) ||| % $._config, }, From bd648827fe430b2c61c19cff792cdc1e5abbaba5 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Fri, 11 May 2018 14:40:20 +0100 Subject: [PATCH 06/31] Remove k8s from dashboard title, make gauges use datasource variable. 
Signed-off-by: Tom Wilkie --- node-mixin/dashboards/use.libsonnet | 4 ++-- node-mixin/lib/promgrafonnet/gauge.libsonnet | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet index 9231a746..eeb72093 100644 --- a/node-mixin/dashboards/use.libsonnet +++ b/node-mixin/dashboards/use.libsonnet @@ -84,8 +84,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; ), ), - 'k8s-node-rsrc-use.json': - g.dashboard('K8s / USE Method / Node') + 'node-rsrc-use.json': + g.dashboard('USE Method / Node') .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance') .addRow( g.row('CPU') diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet index f69a5cdc..43640b6d 100644 --- a/node-mixin/lib/promgrafonnet/gauge.libsonnet +++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet @@ -6,7 +6,7 @@ local prometheus = grafana.prometheus; new(title, query):: singlestat.new( title, - datasource='prometheus', + datasource='$datasource', span=3, format='percentunit', valueName='current', From ff0a13d90056a88ef75cb135d32eeff45911ca7e Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 13 Jul 2018 15:01:01 +0200 Subject: [PATCH 07/31] Fix multiline strings Signed-off-by: Matthias Loibl --- node-mixin/alerts/alerts.libsonnet | 40 ++++++++++++++-------------- node-mixin/dashboards/node.libsonnet | 4 +-- node-mixin/rules/rules.libsonnet | 6 ++--- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet index c66d76db..17bbda8b 100644 --- a/node-mixin/alerts/alerts.libsonnet +++ b/node-mixin/alerts/alerts.libsonnet @@ -7,11 +7,11 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + 
predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 and - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -24,11 +24,11 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 and - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -41,9 +41,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -56,9 +56,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / 
node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -71,11 +71,11 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 and - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -88,11 +88,11 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 and - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -105,9 +105,9 @@ { alert: 'NodeFilesystemOutOfFiles', expr: ||| - 
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -120,9 +120,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet index ba092cf7..4594e3ed 100644 --- a/node-mixin/dashboards/node.libsonnet +++ b/node-mixin/dashboards/node.libsonnet @@ -60,9 +60,9 @@ local gauge = promgrafonnet.gauge; local memoryGauge = gauge.new( 'Memory Usage', ||| - node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} / - node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} ||| % $._config, ).withLowerBeingBetter(); diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index 7c70540e..72b18b9c 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -29,9 +29,9 @@ // Can go over 100%. >100% is bad. 
record: 'instance:node_cpu_saturation_load1:', expr: ||| - sum by (instance) (node_load1{%(nodeExporterSelector)s}) + sum by (instance) (node_load1{%(nodeExporterSelector)s}) / - instance:node_num_cpu:sum + instance:node_num_cpu:sum ||| % $._config, }, { @@ -52,7 +52,7 @@ / node_memory_MemTotal{%(nodeExporterSelector)s} ) - |||, + ||| % $._config, }, { record: 'instance:node_memory_swap_io_bytes:sum_rate', From 1482cc03095e1e85fc9c372edc3ca98949d7e5cf Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 6 Aug 2018 10:41:18 +0200 Subject: [PATCH 08/31] Rename group names to node-exporter to avoid naming collisions Signed-off-by: Matthias Loibl --- node-mixin/alerts/alerts.libsonnet | 2 +- node-mixin/rules/rules.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet index 17bbda8b..a0ca230f 100644 --- a/node-mixin/alerts/alerts.libsonnet +++ b/node-mixin/alerts/alerts.libsonnet @@ -2,7 +2,7 @@ prometheusAlerts+:: { groups+: [ { - name: 'node', + name: 'node-exporter', rules: [ { alert: 'NodeFilesystemSpaceFillingUp', diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index 72b18b9c..a9517119 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -2,7 +2,7 @@ prometheusRules+:: { groups+: [ { - name: 'node.rules', + name: 'node-exporter', rules: [ { // This rule gives the number of CPUs per node. 
From 961aa6770196407ac8282bea0a569365341e5775 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 6 Aug 2018 10:46:28 +0200 Subject: [PATCH 09/31] Append .rules to node_exporter.rules group name Signed-off-by: Matthias Loibl --- node-mixin/rules/rules.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index a9517119..c3f74ba7 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -2,7 +2,7 @@ prometheusRules+:: { groups+: [ { - name: 'node-exporter', + name: 'node-exporter.rules', rules: [ { // This rule gives the number of CPUs per node. From 619e23e5df8b4d9765c51740d893f0ac790aba2c Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 19 Nov 2018 16:00:48 +0100 Subject: [PATCH 10/31] node-mixin: Update rules to node_exporter v0.16 Signed-off-by: Matthias Loibl --- node-mixin/rules/rules.libsonnet | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index c3f74ba7..f836d0d0 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -10,7 +10,7 @@ expr: ||| count by (instance) ( sum by (instance, cpu) ( - node_cpu{%(nodeExporterSelector)s} + node_cpu_seconds_total{%(nodeExporterSelector)s} ) ) ||| % $._config, @@ -20,7 +20,7 @@ record: 'instance:node_cpu_utilisation:avg1m', expr: ||| 1 - avg by (instance) ( - rate(node_cpu{%(nodeExporterSelector)s,mode="idle"}[1m]) + rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m]) ) ||| % $._config, }, @@ -39,7 +39,7 @@ record: 'instance:node_memory_bytes_total:sum', expr: ||| sum by (instance) ( - node_memory_MemTotal{%(nodeExporterSelector)s} + node_memory_MemTotal_bytes{%(nodeExporterSelector)s} ) ||| % $._config, }, @@ -48,9 +48,9 @@ record: 'instance:node_memory_utilisation:ratio', expr: ||| 1 - ( - 
node_memory_MemAvailable{%(nodeExporterSelector)s} + node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / - node_memory_MemTotal{%(nodeExporterSelector)s} + node_memory_MemTotal_bytes{%(nodeExporterSelector)s} ) ||| % $._config, }, @@ -68,7 +68,7 @@ record: 'instance:node_disk_utilisation:sum_irate', expr: ||| sum by (instance) ( - irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) ) ||| % $._config, }, @@ -77,7 +77,7 @@ record: 'instance:node_disk_saturation:sum_irate', expr: ||| sum by (instance) ( - irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) ) ||| % $._config, }, @@ -85,8 +85,8 @@ record: 'instance:node_net_utilisation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + - irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) + (irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) ) ||| % $._config, }, @@ -94,8 +94,8 @@ record: 'instance:node_net_saturation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + - irate(node_network_transmit_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) + (irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) ) ||| % $._config, }, From 53e4093b64ec5348c99897bc2b26002f1d3332c7 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 19 Nov 2018 16:11:37 +0100 Subject: [PATCH 11/31] node-mixin: Update 
alerts to node_exporter v0.16 Signed-off-by: Matthias Loibl --- node-mixin/alerts/alerts.libsonnet | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet index a0ca230f..8ea70cc7 100644 --- a/node-mixin/alerts/alerts.libsonnet +++ b/node-mixin/alerts/alerts.libsonnet @@ -7,9 +7,9 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 and - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, @@ -24,9 +24,9 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 and - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, @@ -41,7 +41,7 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / 
node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, @@ -56,7 +56,7 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, @@ -96,7 +96,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.', @@ -135,7 +135,7 @@ { alert: 'NodeNetworkReceiveErrs', expr: ||| - increase(node_network_receive_errs[2m]) > 10 + increase(node_network_receive_errs_total[2m]) > 10 ||| % $._config, 'for': '1h', labels: { @@ -148,7 +148,7 @@ { alert: 'NodeNetworkTransmitErrs', expr: ||| - increase(node_network_transmit_errs[2m]) > 10 + increase(node_network_transmit_errs_total[2m]) > 10 ||| % $._config, 'for': '1h', labels: { From 61bc03adbed4737fa4c4b9a80d78f455f3998f74 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 19 Nov 2018 16:56:05 +0100 Subject: [PATCH 12/31] node-mixin: Ignore jsonnetfile.lock.json and vendor folder Signed-off-by: Matthias Loibl --- node-mixin/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 node-mixin/.gitignore diff --git a/node-mixin/.gitignore b/node-mixin/.gitignore new file mode 100644 index 00000000..65d141bd --- /dev/null +++ b/node-mixin/.gitignore @@ -0,0 +1,3 @@ +/jsonnetfile.lock.json +/vendor/ + From 0bcded8d2bc27a108c00aa06f812c3fbc7929faa Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 19 Nov 2018 17:40:30 +0100 Subject: [PATCH 13/31] 
node-mixin: Update dashboards to v0.16 Signed-off-by: Matthias Loibl --- node-mixin/dashboards/use.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet index eeb72093..3e368c86 100644 --- a/node-mixin/dashboards/use.libsonnet +++ b/node-mixin/dashboards/use.libsonnet @@ -78,7 +78,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Storage') .addPanel( g.panel('Disk Capacity') + - g.queryPanel('sum(max(node_filesystem_size{fstype=~"ext[24]"} - node_filesystem_free{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + + g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ), @@ -143,7 +143,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk Utilisation') + - g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size{fstype=~"ext[24]"}))', 'Disk') + + g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') + { yaxes: g.yaxes('percentunit') }, ), ), From 2df034c05512628fc1946f5031773790b644abfc Mon Sep 17 00:00:00 2001 From: beorn7 Date: Fri, 5 Jul 2019 19:38:03 +0200 Subject: [PATCH 14/31] Move node-mixin into docs directory Signed-off-by: beorn7 --- {node-mixin => docs/node-mixin}/.gitignore | 0 {node-mixin => docs/node-mixin}/alerts/alerts.libsonnet | 0 
{node-mixin => docs/node-mixin}/config.libsonnet | 0 {node-mixin => docs/node-mixin}/dashboards/dashboards.libsonnet | 0 {node-mixin => docs/node-mixin}/dashboards/node.libsonnet | 0 {node-mixin => docs/node-mixin}/dashboards/use.libsonnet | 0 {node-mixin => docs/node-mixin}/jsonnetfile.json | 0 {node-mixin => docs/node-mixin}/lib/promgrafonnet/gauge.libsonnet | 0 .../node-mixin}/lib/promgrafonnet/numbersinglestat.libsonnet | 0 .../node-mixin}/lib/promgrafonnet/promgrafonnet.libsonnet | 0 {node-mixin => docs/node-mixin}/mixin.libsonnet | 0 {node-mixin => docs/node-mixin}/rules/rules.libsonnet | 0 12 files changed, 0 insertions(+), 0 deletions(-) rename {node-mixin => docs/node-mixin}/.gitignore (100%) rename {node-mixin => docs/node-mixin}/alerts/alerts.libsonnet (100%) rename {node-mixin => docs/node-mixin}/config.libsonnet (100%) rename {node-mixin => docs/node-mixin}/dashboards/dashboards.libsonnet (100%) rename {node-mixin => docs/node-mixin}/dashboards/node.libsonnet (100%) rename {node-mixin => docs/node-mixin}/dashboards/use.libsonnet (100%) rename {node-mixin => docs/node-mixin}/jsonnetfile.json (100%) rename {node-mixin => docs/node-mixin}/lib/promgrafonnet/gauge.libsonnet (100%) rename {node-mixin => docs/node-mixin}/lib/promgrafonnet/numbersinglestat.libsonnet (100%) rename {node-mixin => docs/node-mixin}/lib/promgrafonnet/promgrafonnet.libsonnet (100%) rename {node-mixin => docs/node-mixin}/mixin.libsonnet (100%) rename {node-mixin => docs/node-mixin}/rules/rules.libsonnet (100%) diff --git a/node-mixin/.gitignore b/docs/node-mixin/.gitignore similarity index 100% rename from node-mixin/.gitignore rename to docs/node-mixin/.gitignore diff --git a/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet similarity index 100% rename from node-mixin/alerts/alerts.libsonnet rename to docs/node-mixin/alerts/alerts.libsonnet diff --git a/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet similarity index 100% rename from 
node-mixin/config.libsonnet rename to docs/node-mixin/config.libsonnet diff --git a/node-mixin/dashboards/dashboards.libsonnet b/docs/node-mixin/dashboards/dashboards.libsonnet similarity index 100% rename from node-mixin/dashboards/dashboards.libsonnet rename to docs/node-mixin/dashboards/dashboards.libsonnet diff --git a/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet similarity index 100% rename from node-mixin/dashboards/node.libsonnet rename to docs/node-mixin/dashboards/node.libsonnet diff --git a/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet similarity index 100% rename from node-mixin/dashboards/use.libsonnet rename to docs/node-mixin/dashboards/use.libsonnet diff --git a/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json similarity index 100% rename from node-mixin/jsonnetfile.json rename to docs/node-mixin/jsonnetfile.json diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet similarity index 100% rename from node-mixin/lib/promgrafonnet/gauge.libsonnet rename to docs/node-mixin/lib/promgrafonnet/gauge.libsonnet diff --git a/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet similarity index 100% rename from node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet rename to docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet diff --git a/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet similarity index 100% rename from node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet rename to docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet diff --git a/node-mixin/mixin.libsonnet b/docs/node-mixin/mixin.libsonnet similarity index 100% rename from node-mixin/mixin.libsonnet rename to docs/node-mixin/mixin.libsonnet diff --git a/node-mixin/rules/rules.libsonnet 
b/docs/node-mixin/rules/rules.libsonnet similarity index 100% rename from node-mixin/rules/rules.libsonnet rename to docs/node-mixin/rules/rules.libsonnet From cd2981f1b8ce8cc5dff2c1bdb654b51002024f0a Mon Sep 17 00:00:00 2001 From: beorn7 Date: Sat, 6 Jul 2019 20:10:47 +0200 Subject: [PATCH 15/31] Update vendoring to current location of jsonnet-libs Signed-off-by: beorn7 --- docs/node-mixin/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json index 45326aad..fc5aeee0 100644 --- a/docs/node-mixin/jsonnetfile.json +++ b/docs/node-mixin/jsonnetfile.json @@ -14,7 +14,7 @@ "name": "grafana-builder", "source": { "git": { - "remote": "https://github.com/kausalco/public", + "remote": "https://github.com/grafana/jsonnet-libs", "subdir": "grafana-builder" } }, From f17829c48b971a0507a47f14b0cae3950af249f7 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Sat, 6 Jul 2019 20:11:27 +0200 Subject: [PATCH 16/31] Create jsonnet files to create output files This allows to create YAML files with rules and JSON files with dashboard descriptions. 
Signed-off-by: beorn7 --- docs/node-mixin/alerts.jsonnet | 1 + docs/node-mixin/dashboards.jsonnet | 6 ++++++ docs/node-mixin/rules.jsonnet | 1 + 3 files changed, 8 insertions(+) create mode 100644 docs/node-mixin/alerts.jsonnet create mode 100644 docs/node-mixin/dashboards.jsonnet create mode 100644 docs/node-mixin/rules.jsonnet diff --git a/docs/node-mixin/alerts.jsonnet b/docs/node-mixin/alerts.jsonnet new file mode 100644 index 00000000..75e7c1b2 --- /dev/null +++ b/docs/node-mixin/alerts.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts) diff --git a/docs/node-mixin/dashboards.jsonnet b/docs/node-mixin/dashboards.jsonnet new file mode 100644 index 00000000..9d913ed3 --- /dev/null +++ b/docs/node-mixin/dashboards.jsonnet @@ -0,0 +1,6 @@ +local dashboards = (import 'mixin.libsonnet').grafanaDashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} diff --git a/docs/node-mixin/rules.jsonnet b/docs/node-mixin/rules.jsonnet new file mode 100644 index 00000000..dbe13f41 --- /dev/null +++ b/docs/node-mixin/rules.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules) From f2891703a54ca0f4b518c77bfd4dbe0df056a6bb Mon Sep 17 00:00:00 2001 From: beorn7 Date: Sat, 6 Jul 2019 20:21:56 +0200 Subject: [PATCH 17/31] Add Makefile to easily make output files and lint sources Signed-off-by: beorn7 --- docs/node-mixin/.gitignore | 7 ++++--- docs/node-mixin/Makefile | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 docs/node-mixin/Makefile diff --git a/docs/node-mixin/.gitignore b/docs/node-mixin/.gitignore index 65d141bd..522b99f0 100644 --- a/docs/node-mixin/.gitignore +++ b/docs/node-mixin/.gitignore @@ -1,3 +1,4 @@ -/jsonnetfile.lock.json -/vendor/ - +jsonnetfile.lock.json +vendor +*.yaml +dashboards_out diff --git a/docs/node-mixin/Makefile b/docs/node-mixin/Makefile new file mode 100644 index 00000000..012a4b50 --- /dev/null 
+++ b/docs/node-mixin/Makefile @@ -0,0 +1,28 @@ +JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s + +all: fmt node_alerts.yaml node_rules.yaml dashboards_out lint + +fmt: + find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNET_FMT) -i + +node_alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*) + jsonnet -S alerts.jsonnet > $@ + +node_rules.yaml: mixin.libsonnet config.libsonnet $(wildcard rules/*) + jsonnet -S rules.jsonnet > $@ + +dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*) + @mkdir -p dashboards_out + jsonnet -J vendor -m dashboards_out dashboards.jsonnet + +lint: node_alerts.yaml node_rules.yaml + find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + while read f; do \ + $(JSONNET_FMT) "$$f" | diff -u "$$f" -; \ + done + + promtool check rules node_alerts.yaml node_rules.yaml + +clean: + rm -rf dashboards_out node_alerts.yaml node_rules.yaml From e5266c242e4f6243e33d7893f0bd76e2c895dc75 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Sat, 6 Jul 2019 20:30:40 +0200 Subject: [PATCH 18/31] Add README.md Signed-off-by: beorn7 --- docs/node-mixin/README.md | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 docs/node-mixin/README.md diff --git a/docs/node-mixin/README.md b/docs/node-mixin/README.md new file mode 100644 index 00000000..489b599c --- /dev/null +++ b/docs/node-mixin/README.md @@ -0,0 +1,44 @@ +# Node Mixin + +_This is work in progress. We aim for it to become a good role model for alerts +and dashboards eventually, but it is not quite there yet._ + +The Node Mixin is a set of configurable, reusable, and extensible alerts and +dashboards based on the metrics exported by the Node Exporter. The mixin creates +recording and alerting rules for Prometheus and suitable dashboard descriptions +for Grafana. 
+ +To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you +have a working Go development environment, it's easiest to run the following: +```bash +$ go get github.com/google/go-jsonnet/cmd/jsonnet +$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb +``` + +_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is +currently not included in the Go implementation of `jsonnet`. For the time +being, you have to install the [C++ version of +jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint` +or `make fmt`._ + +Next, install the dependencies by running the following command in this +directory: +```bash +$ jb install +``` + +You can then build the Prometheus rules files `node_alerts.yaml` and +`node_rules.yaml`: +```bash +$ make node_alerts.yaml node_rules.yaml +``` + +You can also build a directory `dashboards_out` with the JSON dashboard files +for Grafana: +```bash +$ make dashboards_out +``` + +For more advanced uses of mixins, see +https://github.com/monitoring-mixins/docs. + From f331b308f36012d6c00d00914ce2165c996ed321 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Sat, 6 Jul 2019 21:09:17 +0200 Subject: [PATCH 19/31] Use promgrafonnet as a vendored library from its source The only deviation that happened so far is to use format="percentunit" in a Grafana gauge. This change wasn't even properly used in this repo so far, so I opted to stick with "upstream" for now. If changes are really needed, we can try to change upstream first. Another change was done in parallel here and upstream, but it was "more correct" in upstream. (Change datasource to $datasource variable, only partially applied here.) Which is another point for using the upstream and not copy it here. 
Signed-off-by: beorn7 --- docs/node-mixin/dashboards/node.libsonnet | 2 +- docs/node-mixin/jsonnetfile.json | 10 ++++ .../lib/promgrafonnet/gauge.libsonnet | 60 ------------------- .../promgrafonnet/numbersinglestat.libsonnet | 48 --------------- .../lib/promgrafonnet/promgrafonnet.libsonnet | 5 -- 5 files changed, 11 insertions(+), 114 deletions(-) delete mode 100644 docs/node-mixin/lib/promgrafonnet/gauge.libsonnet delete mode 100644 docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet delete mode 100644 docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 4594e3ed..80e2fa92 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -4,7 +4,7 @@ local row = grafana.row; local prometheus = grafana.prometheus; local template = grafana.template; local graphPanel = grafana.graphPanel; -local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet'; +local promgrafonnet = import 'promgrafonnet/promgrafonnet.libsonnet'; local gauge = promgrafonnet.gauge; { diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json index fc5aeee0..dc92880d 100644 --- a/docs/node-mixin/jsonnetfile.json +++ b/docs/node-mixin/jsonnetfile.json @@ -19,6 +19,16 @@ } }, "version": "master" + }, + { + "name": "promgrafonnet", + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "subdir": "lib/promgrafonnet" + } + }, + "version": "master" } ] } diff --git a/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet b/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet deleted file mode 100644 index 43640b6d..00000000 --- a/docs/node-mixin/lib/promgrafonnet/gauge.libsonnet +++ /dev/null @@ -1,60 +0,0 @@ -local grafana = import 'grafonnet/grafana.libsonnet'; -local singlestat = grafana.singlestat; -local prometheus = grafana.prometheus; - -{ - new(title, query):: - 
singlestat.new( - title, - datasource='$datasource', - span=3, - format='percentunit', - valueName='current', - colors=[ - 'rgba(245, 54, 54, 0.9)', - 'rgba(237, 129, 40, 0.89)', - 'rgba(50, 172, 45, 0.97)', - ], - thresholds='50, 80', - valueMaps=[ - { - op: '=', - text: 'N/A', - value: 'null', - }, - ], - ) - .addTarget( - prometheus.target( - query - ) - ) + { - gauge: { - maxValue: 100, - minValue: 0, - show: true, - thresholdLabels: false, - thresholdMarkers: true, - }, - withTextNullValue(text):: self { - valueMaps: [ - { - op: '=', - text: text, - value: 'null', - }, - ], - }, - withSpanSize(size):: self { - span: size, - }, - withLowerBeingBetter():: self { - colors: [ - 'rgba(50, 172, 45, 0.97)', - 'rgba(237, 129, 40, 0.89)', - 'rgba(245, 54, 54, 0.9)', - ], - thresholds: '80, 90', - }, - }, -} diff --git a/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet deleted file mode 100644 index bc1d6f6f..00000000 --- a/docs/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet +++ /dev/null @@ -1,48 +0,0 @@ -local grafana = import 'grafonnet/grafana.libsonnet'; -local singlestat = grafana.singlestat; -local prometheus = grafana.prometheus; - -{ - new(title, query):: - singlestat.new( - title, - datasource='prometheus', - span=3, - valueName='current', - valueMaps=[ - { - op: '=', - text: '0', - value: 'null', - }, - ], - ) - .addTarget( - prometheus.target( - query - ) - ) + { - withTextNullValue(text):: self { - valueMaps: [ - { - op: '=', - text: text, - value: 'null', - }, - ], - }, - withSpanSize(size):: self { - span: size, - }, - withPostfix(postfix):: self { - postfix: postfix, - }, - withSparkline():: self { - sparkline: { - show: true, - lineColor: 'rgb(31, 120, 193)', - fillColor: 'rgba(31, 118, 189, 0.18)', - }, - }, - }, -} diff --git a/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet deleted file mode 100644 
index 013ff42b..00000000 --- a/docs/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet +++ /dev/null @@ -1,5 +0,0 @@ -{ - numbersinglestat:: import 'numbersinglestat.libsonnet', - gauge:: import 'gauge.libsonnet', - percentlinegraph:: import 'percentlinegraph.libsonnet', -} From 9d7045e4830d5c177d79a1cdee6012e855bcefaa Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 10 Jul 2019 19:40:04 +0200 Subject: [PATCH 20/31] (Re-)adjust to Grafana gauge expecting percentage 0-100 (rather than 1-0) Signed-off-by: beorn7 --- docs/node-mixin/dashboards/node.libsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 80e2fa92..dd82b306 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -63,6 +63,7 @@ local gauge = promgrafonnet.gauge; node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} / node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + * 100 ||| % $._config, ).withLowerBeingBetter(); @@ -95,11 +96,11 @@ local gauge = promgrafonnet.gauge; local diskSpaceUsage = gauge.new( 'Disk Space Usage', ||| - 1 - ( + 100 - ( sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} / sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} - ) + ) * 100 ||| % $._config, ).withLowerBeingBetter(); From dec5b5b05324315f3bb68a811ebde1ccd9553aee Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 10 Jul 2019 20:07:20 +0200 Subject: [PATCH 21/31] Fix indentation Signed-off-by: beorn7 --- docs/node-mixin/alerts/alerts.libsonnet | 56 +++++++++++++++-------- docs/node-mixin/dashboards/node.libsonnet | 33 ++++++++----- docs/node-mixin/dashboards/use.libsonnet | 4 +- docs/node-mixin/rules/rules.libsonnet | 27 ++++++----- 4 files changed, 74 insertions(+), 46 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet 
b/docs/node-mixin/alerts/alerts.libsonnet index 8ea70cc7..70060ba5 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -7,11 +7,13 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + ( + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 and - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ) ||| % $._config, 'for': '1h', labels: { @@ -24,11 +26,13 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + ( + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 and - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ) ||| % $._config, 'for': '1h', labels: { @@ -41,9 +45,11 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + ( + 
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ) ||| % $._config, 'for': '1h', labels: { @@ -56,9 +62,11 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + ( + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ) ||| % $._config, 'for': '1h', labels: { @@ -71,11 +79,13 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + ( + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 and - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ) ||| % $._config, 'for': '1h', labels: { @@ -88,11 +98,13 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + ( + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 and - 
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ) ||| % $._config, 'for': '1h', labels: { @@ -105,9 +117,11 @@ { alert: 'NodeFilesystemOutOfFiles', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + ( + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ) ||| % $._config, 'for': '1h', labels: { @@ -120,9 +134,11 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + ( + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ) ||| % $._config, 'for': '1h', labels: { diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index dd82b306..115d98cf 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -47,10 +47,15 @@ local gauge = promgrafonnet.gauge; ) .addTarget(prometheus.target( ||| - node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} - - 
node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + ( + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} + - + node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + ) ||| % $._config, legendFormat='memory used' )) .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) @@ -60,10 +65,12 @@ local gauge = promgrafonnet.gauge; local memoryGauge = gauge.new( 'Memory Usage', ||| - node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} + ( + node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} / - node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} - * 100 + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + ) + * 100 ||| % $._config, ).withLowerBeingBetter(); @@ -96,11 +103,13 @@ local gauge = promgrafonnet.gauge; local diskSpaceUsage = gauge.new( 'Disk Space Usage', ||| - 100 - ( - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} - / - sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} - ) * 100 + 100 - + ( + sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} + / + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} + * 100 + ) ||| % $._config, ).withLowerBeingBetter(); diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 3e368c86..9bba6043 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -44,9 +44,7 @@ local g 
= import 'grafana-builder/grafana.libsonnet'; g.panel('Disk IO Utilisation') + // Full utilisation would be all disks on each node spending an average of // 1 sec per second doing I/O, normalize by node count for stacked charts - g.queryPanel(||| - instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) - ||| % $._config, '{{instance}}', legendLink) + + g.queryPanel('instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))' % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index f836d0d0..27636aa8 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -29,9 +29,11 @@ // Can go over 100%. >100% is bad. record: 'instance:node_cpu_saturation_load1:', expr: ||| - sum by (instance) (node_load1{%(nodeExporterSelector)s}) + ( + sum by (instance) (node_load1{%(nodeExporterSelector)s}) / - instance:node_num_cpu:sum + instance:node_num_cpu:sum + ) ||| % $._config, }, { @@ -48,9 +50,9 @@ record: 'instance:node_memory_utilisation:ratio', expr: ||| 1 - ( - node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} - / - node_memory_MemTotal_bytes{%(nodeExporterSelector)s} + node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} + / + node_memory_MemTotal_bytes{%(nodeExporterSelector)s} ) ||| % $._config, }, @@ -58,8 +60,9 @@ record: 'instance:node_memory_swap_io_bytes:sum_rate', expr: ||| 1e3 * sum by (instance) ( - (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) - + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])) + rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) + + + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]) ) ||| % $._config, }, @@ -85,8 +88,9 @@ record: 'instance:node_net_utilisation:sum_irate', expr: ||| sum by (instance) ( - 
(irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + - irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) + irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + + irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) ) ||| % $._config, }, @@ -94,8 +98,9 @@ record: 'instance:node_net_saturation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + - irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) + irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + + irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) ) ||| % $._config, }, From b3b47f2d0702ebc8086df96c127fd5d0bea34868 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 10 Jul 2019 20:09:01 +0200 Subject: [PATCH 22/31] Make selector naming consistent Signed-off-by: beorn7 --- docs/node-mixin/alerts/alerts.libsonnet | 40 ++++++++++++------------- docs/node-mixin/config.libsonnet | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 70060ba5..013a9ee3 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -8,11 +8,11 @@ alert: 'NodeFilesystemSpaceFillingUp', expr: ||| ( - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0 and - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / 
node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} < 0.4 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) ||| % $._config, 'for': '1h', @@ -27,11 +27,11 @@ alert: 'NodeFilesystemSpaceFillingUp', expr: ||| ( - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0 and - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} < 0.2 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) ||| % $._config, 'for': '1h', @@ -46,9 +46,9 @@ alert: 'NodeFilesystemOutOfSpace', expr: ||| ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) ||| % $._config, 'for': '1h', @@ -63,9 +63,9 @@ alert: 'NodeFilesystemOutOfSpace', expr: ||| ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 and - 
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) ||| % $._config, 'for': '1h', @@ -80,11 +80,11 @@ alert: 'NodeFilesystemFilesFillingUp', expr: ||| ( - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0 and - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} < 0.4 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) ||| % $._config, 'for': '1h', @@ -99,11 +99,11 @@ alert: 'NodeFilesystemFilesFillingUp', expr: ||| ( - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0 and - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} < 0.2 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) ||| % $._config, 'for': '1h', @@ -118,9 +118,9 @@ alert: 'NodeFilesystemOutOfFiles', expr: ||| ( - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / 
node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) ||| % $._config, 'for': '1h', @@ -135,9 +135,9 @@ alert: 'NodeFilesystemOutOfSpace', expr: ||| ( - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) ||| % $._config, 'for': '1h', diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 6c5d6f74..5406bdbc 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -4,7 +4,7 @@ nodeExporterSelector: 'job="node-exporter"', // Mainly extracted because they are repetitive, but also useful to customize. 
- fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', + fsSelector: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', grafana_prefix: '', }, From 2180c2f3bf65b38952076a51e410742323ba47bb Mon Sep 17 00:00:00 2001 From: beorn7 Date: Fri, 12 Jul 2019 22:58:43 +0200 Subject: [PATCH 23/31] Address first batch of old review comments Signed-off-by: beorn7 --- docs/node-mixin/dashboards/node.libsonnet | 50 ++++++++-------- docs/node-mixin/dashboards/use.libsonnet | 71 ++++++++++++++++++----- docs/node-mixin/rules/rules.libsonnet | 49 +++++++--------- 3 files changed, 105 insertions(+), 65 deletions(-) diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 115d98cf..040d60a3 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -21,22 +21,23 @@ local gauge = promgrafonnet.gauge; ) .addTarget(prometheus.target( ||| - 1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) + 1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) ||| % $._config, legendFormat='{{cpu}}', intervalFactor=10, )); + // TODO: Is this panel useful? 
local systemLoad = graphPanel.new( - 'System load', + 'Load Average', datasource='$datasource', span=6, - format='percentunit', + format='short', ) - .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m')) - .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m')) - .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m')); + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average')); local memoryGraph = graphPanel.new( @@ -48,27 +49,27 @@ local gauge = promgrafonnet.gauge; .addTarget(prometheus.target( ||| ( - node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} + node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} ) ||| % $._config, legendFormat='memory used' )) - .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) - .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 
legendFormat='memory cached')) - .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); + .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) + .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) + .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); local memoryGauge = gauge.new( 'Memory Usage', ||| ( - node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"} / - node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} ) * 100 ||| % $._config, @@ -80,9 +81,9 @@ local gauge = promgrafonnet.gauge; datasource='$datasource', span=9, ) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) + 
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + { seriesOverrides: [ { @@ -96,18 +97,19 @@ local gauge = promgrafonnet.gauge; ], yaxes: [ self.yaxe(format='bytes'), - self.yaxe(format='ms'), + self.yaxe(format='s'), ], }; + // TODO: Should this be partitioned by mountpoint? local diskSpaceUsage = gauge.new( 'Disk Space Usage', ||| 100 - ( - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} + sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"} / - sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} + sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"} * 100 ) ||| % $._config, @@ -120,7 +122,7 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); + .addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); local networkTransmitted = graphPanel.new( @@ -129,7 +131,7 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); + .addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); dashboard.new('Nodes', time_from='now-1h') .addTemplate( @@ -152,7 +154,7 @@ local gauge = promgrafonnet.gauge; template.new( 'instance', '$datasource', - 
'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config, + 'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config, refresh='time', ) ) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 9bba6043..96bf0f59 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -10,16 +10,30 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('CPU') .addPanel( g.panel('CPU Utilisation') + - g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) + + g.queryPanel(||| + ( + instance:node_cpu_utilisation:avg1m + * + instance:node_num_cpu:sum + / ignoring (instance) group_left + sum without (instance) (instance:node_num_cpu:sum) + ) + |||, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - g.panel('CPU Saturation (Load1)') + + // TODO: Is this a useful panel? + g.panel('CPU Saturation (load1 per CPU)') + g.queryPanel(||| - instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s})) - ||| % $._config, '{{instance}}', legendLink) + + ( + instance:node_load1_per_cpu:ratio + / ignoring (instance) group_left + count without (instance) (instance:node_load1_per_cpu:ratio) + ) + |||, '{{instance}}', legendLink) + g.stack + + // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios. 
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) ) @@ -43,16 +57,26 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('Disk IO Utilisation') + // Full utilisation would be all disks on each node spending an average of - // 1 sec per second doing I/O, normalize by node count for stacked charts - g.queryPanel('instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))' % $._config, '{{instance}}', legendLink) + + // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. + g.queryPanel(||| + ( + instance:node_disk_utilisation:sum_irate + / ignoring (instance) group_left + count without (instance) (instance:node_disk_utilisation:sum_irate) + ) + |||, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( g.panel('Disk IO Saturation') + g.queryPanel(||| - instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) - ||| % $._config, '{{instance}}', legendLink) + + ( + instance:node_disk_saturation:sum_irate + / ignoring (instance) group_left + count without (instance) (instance:node_disk_saturation:sum_irate) + ) + |||, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -76,7 +100,21 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Storage') .addPanel( g.panel('Disk Capacity') + - g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + + g.queryPanel(||| + ( + sum without (device) ( + max without (fstype, mountpoint) ( + node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_avail_bytes{fstype=~"ext[24]"} + ) + ) + / ignoring (instance) group_left + sum without (instance, device) ( + max without 
(fstype, mountpoint) ( + node_filesystem_size_bytes{fstype=~"ext[24]"} + ) + ) + ) + |||, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ), @@ -106,9 +144,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; { yaxes: g.yaxes('percentunit') }, ) .addPanel( - g.panel('Memory Saturation (Swap I/O)') + - g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') + - { yaxes: g.yaxes('Bps') }, + g.panel('Memory Saturation (pages swapped per second)') + + g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') + + { yaxes: g.yaxes('short') }, ) ) .addRow( @@ -141,7 +179,14 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk Utilisation') + - g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') + + g.queryPanel(||| + 1 - + ( + sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{fstype=~"ext[24]"})) + / + sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{fstype=~"ext[24]"})) + ) + |||, 'Disk') + { yaxes: g.yaxes('percentunit') }, ), ), diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index 27636aa8..6bd39a5f 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -8,8 +8,8 @@ // This rule gives the number of CPUs per node. record: 'instance:node_num_cpu:sum', expr: ||| - count by (instance) ( - sum by (instance, cpu) ( + count without (cpu) ( + sum without (mode) ( node_cpu_seconds_total{%(nodeExporterSelector)s} ) ) @@ -19,29 +19,20 @@ // CPU utilisation is % CPU is not idle. 
record: 'instance:node_cpu_utilisation:avg1m', expr: ||| - 1 - avg by (instance) ( - rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m]) + 1 - avg without (cpu, mode) ( + rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m]) ) ||| % $._config, }, { - // CPU saturation is 1min avg run queue length / number of CPUs. - // Can go over 100%. >100% is bad. - record: 'instance:node_cpu_saturation_load1:', + // This is CPU saturation: 1min avg run queue length / number of CPUs. + // Can go over 1. >1 is bad. + record: 'instance:node_load1_per_cpu:ratio', expr: ||| ( - sum by (instance) (node_load1{%(nodeExporterSelector)s}) + node_load1{%(nodeExporterSelector)s} / - instance:node_num_cpu:sum - ) - ||| % $._config, - }, - { - // Total memory per node - record: 'instance:node_memory_bytes_total:sum', - expr: ||| - sum by (instance) ( - node_memory_MemTotal_bytes{%(nodeExporterSelector)s} + instance:node_num_cpu:sum{%(nodeExporterSelector)s} ) ||| % $._config, }, @@ -57,9 +48,9 @@ ||| % $._config, }, { - record: 'instance:node_memory_swap_io_bytes:sum_rate', + record: 'instance:node_memory_swap_io_pages:sum_rate', expr: ||| - 1e3 * sum by (instance) ( + ( rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]) @@ -70,7 +61,7 @@ // Disk utilisation (ms spent, 1 second irate()) record: 'instance:node_disk_utilisation:sum_irate', expr: ||| - sum by (instance) ( + sum without (device) ( irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) ) ||| % $._config, @@ -79,28 +70,30 @@ // Disk saturation (ms spent, by rate() it's bound by 1 second) record: 'instance:node_disk_saturation:sum_irate', expr: ||| - sum by (instance) ( + sum without (device) ( irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) ) ||| % $._config, }, + // TODO: For the following two rules, consider configurable filtering to exclude more 
network + // device names than just "lo". { record: 'instance:node_net_utilisation:sum_irate', expr: ||| - sum by (instance) ( - irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + sum without (device) ( + irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + - irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) ) ||| % $._config, }, { record: 'instance:node_net_saturation:sum_irate', expr: ||| - sum by (instance) ( - irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + sum without (device) ( + irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + - irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) ) ||| % $._config, }, From 3ab1f41d12d55e1561bab58bc6f0ef1604c5dd65 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Tue, 16 Jul 2019 19:34:27 +0200 Subject: [PATCH 24/31] Make more use of config.libsonnet Signed-off-by: beorn7 --- docs/node-mixin/config.libsonnet | 7 ++++++- docs/node-mixin/dashboards/use.libsonnet | 12 ++++++------ docs/node-mixin/rules/rules.libsonnet | 4 ++-- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 5406bdbc..de84b9ee 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -1,11 +1,16 @@ { _config+:: { // Selectors are inserted between {} in Prometheus queries. + + // Select the metrics coming from the node exporter. nodeExporterSelector: 'job="node-exporter"', - // Mainly extracted because they are repetitive, but also useful to customize. + // Select the fstype for filesystem-related queries. 
fsSelector: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', + // Select the device for disk-related queries. + diskDeviceSelector: 'device=~"(sd|xvd).+"', + grafana_prefix: '', }, } diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 96bf0f59..115e893c 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -104,17 +104,17 @@ local g = import 'grafana-builder/grafana.libsonnet'; ( sum without (device) ( max without (fstype, mountpoint) ( - node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_avail_bytes{fstype=~"ext[24]"} + node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s} ) ) / ignoring (instance) group_left sum without (instance, device) ( max without (fstype, mountpoint) ( - node_filesystem_size_bytes{fstype=~"ext[24]"} + node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} ) ) ) - |||, '{{instance}}', legendLink) + + ||| % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ), @@ -182,11 +182,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.queryPanel(||| 1 - ( - sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{fstype=~"ext[24]"})) + sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s})) / - sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{fstype=~"ext[24]"})) + sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s})) ) - |||, 'Disk') + + ||| % $._config, 'Disk') + { yaxes: g.yaxes('percentunit') }, ), ), diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index 6bd39a5f..c4bc31a8 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -62,7 +62,7 @@ 
record: 'instance:node_disk_utilisation:sum_irate', expr: ||| sum without (device) ( - irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) + irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ) ||| % $._config, }, @@ -71,7 +71,7 @@ record: 'instance:node_disk_saturation:sum_irate', expr: ||| sum without (device) ( - irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) + irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ) ||| % $._config, }, From a92d1d7889ddcbaad50e821cb155795bf3e9758a Mon Sep 17 00:00:00 2001 From: beorn7 Date: Tue, 16 Jul 2019 21:18:17 +0200 Subject: [PATCH 25/31] Address review comments, batch 2 Signed-off-by: beorn7 --- docs/node-mixin/alerts/alerts.libsonnet | 12 +++--- docs/node-mixin/config.libsonnet | 5 ++- docs/node-mixin/dashboards/node.libsonnet | 16 +++++--- docs/node-mixin/dashboards/use.libsonnet | 34 ++++++++++------- docs/node-mixin/rules/rules.libsonnet | 46 ++++++++++++++--------- 5 files changed, 68 insertions(+), 45 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 013a9ee3..76bbb031 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -43,7 +43,7 @@ }, }, { - alert: 'NodeFilesystemOutOfSpace', + alert: 'NodeFilesystemAlmostOutOfSpace', expr: ||| ( node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 @@ -60,7 +60,7 @@ }, }, { - alert: 'NodeFilesystemOutOfSpace', + alert: 'NodeFilesystemAlmostOutOfSpace', expr: ||| ( node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 @@ -115,7 +115,7 @@ }, }, { - alert: 'NodeFilesystemOutOfFiles', + alert: 
'NodeFilesystemAlmostOutOfFiles', expr: ||| ( node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 @@ -132,7 +132,7 @@ }, }, { - alert: 'NodeFilesystemOutOfSpace', + alert: 'NodeFilesystemAlmostOutOfFiles', expr: ||| ( node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 @@ -155,7 +155,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).', @@ -168,7 +168,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index de84b9ee..701d9bea 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -3,10 +3,11 @@ // Selectors are inserted between {} in Prometheus queries. // Select the metrics coming from the node exporter. - nodeExporterSelector: 'job="node-exporter"', + nodeExporterSelector: 'job="node"', // Select the fstype for filesystem-related queries. - fsSelector: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', + // TODO: What is a good default selector here? + fsSelector: 'fstype=~"ext.|xfs|jfs|btrfs|vfat|ntfs"', // Select the device for disk-related queries. 
diskDeviceSelector: 'device=~"(sd|xvd).+"', diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 040d60a3..915cbe48 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -20,8 +20,9 @@ local gauge = promgrafonnet.gauge; min=0, ) .addTarget(prometheus.target( + // TODO: Consider using `${__interval}` as range and a 1m min step. ||| - 1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) + 1 - avg by (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) ||| % $._config, legendFormat='{{cpu}}', intervalFactor=10, @@ -81,9 +82,10 @@ local gauge = promgrafonnet.gauge; datasource='$datasource', span=9, ) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + + // TODO: Consider using `${__interval}` as range and a 1m min step. 
+ .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} read')) + .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} written')) + .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} io time')) + { seriesOverrides: [ { @@ -122,7 +124,8 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); + // TODO: Consider using `${__interval}` as range and a 1m min step. + .addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); local networkTransmitted = graphPanel.new( @@ -131,7 +134,8 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); + // TODO: Consider using `${__interval}` as range and a 1m min step. 
+ .addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); dashboard.new('Nodes', time_from='now-1h') .addTemplate( diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 115e893c..533f392b 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -12,7 +12,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('CPU Utilisation') + g.queryPanel(||| ( - instance:node_cpu_utilisation:avg1m + instance:node_cpu_utilisation:avg_rate1m * instance:node_num_cpu:sum / ignoring (instance) group_left @@ -60,9 +60,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. g.queryPanel(||| ( - instance:node_disk_utilisation:sum_irate + instance:node_disk_io_time:sum_rate1m / ignoring (instance) group_left - count without (instance) (instance:node_disk_utilisation:sum_irate) + count without (instance) (instance:node_disk_io_time:sum_rate1m) ) |||, '{{instance}}', legendLink) + g.stack + @@ -72,9 +72,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Disk IO Saturation') + g.queryPanel(||| ( - instance:node_disk_saturation:sum_irate + instance:node_disk_io_time_weighted:sum_rate1m / ignoring (instance) group_left - count without (instance) (instance:node_disk_saturation:sum_irate) + count without (instance) (instance:node_disk_io_time_weighted:sum_rate1m) ) |||, '{{instance}}', legendLink) + g.stack + @@ -127,7 +127,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('CPU') .addPanel( g.panel('CPU Utilisation') + - g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{instance="$instance"}', 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) 
.addPanel( @@ -145,7 +145,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; ) .addPanel( g.panel('Memory Saturation (pages swapped per second)') + - g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') + + g.queryPanel('instance:node_memory_swap_io_pages:rate1m{instance="$instance"}', 'Swap IO') + { yaxes: g.yaxes('short') }, ) ) @@ -153,26 +153,32 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk IO Utilisation') + - g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_disk_io_time:sum_rate1m{instance="$instance"}', 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Disk IO Saturation') + - g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') + + g.queryPanel('instance:node_disk_io_time_weighted:sum_rate1m{instance="$instance"}', 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) .addRow( g.row('Net') .addPanel( - g.panel('Net Utilisation (Transmitted)') + - g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + g.panel('Net Utilisation (Bytes Receive/Transmit)') + + g.queryPanel( + ['node_network_receive_bytes_total{instance="$instance"}', '-node_network_transmit_bytes_total{instance="$instance"}'], + ['Receive', 'Transmit'], + ) + { yaxes: g.yaxes('Bps') }, ) .addPanel( - g.panel('Net Saturation (Dropped)') + - g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') + - { yaxes: g.yaxes('Bps') }, + g.panel('Net Saturation (Drops Receive/Transmit)') + + g.queryPanel( + ['node_network_receive_drop_total{instance="$instance"}', '-node_network_transmit_drop_total{instance="$instance"}'], + ['Receive drops', 'Transmit drops'], + ) + + { yaxes: g.yaxes('rps') }, ) ) .addRow( diff --git a/docs/node-mixin/rules/rules.libsonnet 
b/docs/node-mixin/rules/rules.libsonnet index c4bc31a8..5422f443 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -17,7 +17,7 @@ }, { // CPU utilisation is % CPU is not idle. - record: 'instance:node_cpu_utilisation:avg1m', + record: 'instance:node_cpu_utilisation:avg_rate1m', expr: ||| 1 - avg without (cpu, mode) ( rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m]) @@ -48,7 +48,7 @@ ||| % $._config, }, { - record: 'instance:node_memory_swap_io_pages:sum_rate', + record: 'instance:node_memory_swap_io_pages:rate1m', expr: ||| ( rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) @@ -58,42 +58,54 @@ ||| % $._config, }, { - // Disk utilisation (ms spent, 1 second irate()) - record: 'instance:node_disk_utilisation:sum_irate', + // Disk utilisation (seconds spent, 1 second rate) + record: 'instance:node_disk_io_time:sum_rate1m', expr: ||| sum without (device) ( - irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) + rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ) ||| % $._config, }, { - // Disk saturation (ms spent, by rate() it's bound by 1 second) - record: 'instance:node_disk_saturation:sum_irate', + // Disk saturation (weighted seconds spent, 1 second rate) + record: 'instance:node_disk_io_time_weighted:sum_rate1m', expr: ||| sum without (device) ( - irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) + rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ) ||| % $._config, }, - // TODO: For the following two rules, consider configurable filtering to exclude more network + // TODO: For the following rules, consider configurable filtering to exclude more network // device names than just "lo". 
{ - record: 'instance:node_net_utilisation:sum_irate', + record: 'instance:node_network_receive_bytes:sum_rate1m', expr: ||| sum without (device) ( - irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) - + - irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) ) ||| % $._config, }, { - record: 'instance:node_net_saturation:sum_irate', + record: 'instance:node_network_transmit_bytes:sum_rate1m', expr: ||| sum without (device) ( - irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) - + - irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + ) + ||| % $._config, + }, + { + record: 'instance:node_network_receive_drop:sum_rate1m', + expr: ||| + sum without (device) ( + rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + ) + ||| % $._config, + }, + { + record: 'instance:node_network_transmit_drop:sum_rate1m', + expr: ||| + sum without (device) ( + rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) ) ||| % $._config, }, From 3a770a0b1d988cc81fc1d1c25f994d3de4cb0af7 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Tue, 16 Jul 2019 21:40:57 +0200 Subject: [PATCH 26/31] Convert annotations from message to summary/description Signed-off-by: beorn7 --- docs/node-mixin/alerts/alerts.libsonnet | 46 +++++++++++++++---------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 76bbb031..7b9fb890 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -8,9 +8,9 @@ alert: 'NodeFilesystemSpaceFillingUp', expr: ||| ( - 
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0 - and node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} < 0.4 + and + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) @@ -20,16 +20,17 @@ severity: 'warning', }, annotations: { - message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.', + summary: 'Filesystem is predicted to run out of space within the next 24 hours.', + description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', }, }, { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| ( - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0 - and node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} < 0.2 + and + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) @@ -39,7 +40,8 @@ severity: 'critical', }, annotations: { - message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.', + summary: 'Filesystem is predicted to run out of space within the next 4 hours.', + description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', }, }, { @@ -56,7 +58,8 @@ severity: 'warning', }, annotations: { - message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + 
summary: 'Filesystem has less than 5% space left.', + description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, { @@ -73,16 +76,17 @@ severity: 'critical', }, annotations: { - message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + summary: 'Filesystem has less than 3% space left.', + description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, { alert: 'NodeFilesystemFilesFillingUp', expr: ||| ( - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0 - and node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} < 0.4 + and + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) @@ -92,16 +96,17 @@ severity: 'warning', }, annotations: { - message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.', + summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.', + description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', }, }, { alert: 'NodeFilesystemFilesFillingUp', expr: ||| ( - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0 - and node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} < 0.2 + and + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) @@ 
-111,7 +116,8 @@ severity: 'critical', }, annotations: { - message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.', + summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', + description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', }, }, { @@ -128,7 +134,8 @@ severity: 'warning', }, annotations: { - message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', + summary: 'Filesystem has less than 5% inodes left.', + description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, { @@ -145,7 +152,8 @@ severity: 'critical', }, annotations: { - message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + summary: 'Filesystem has less than 3% inodes left.', + description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, { @@ -158,7 +166,8 @@ severity: 'warning', }, annotations: { - message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).', + summary: 'Network interface is reporting many receive errors.', + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.', }, }, { @@ -171,7 +180,8 @@ severity: 'warning', }, annotations: { - message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', + summary: 'Network interface is reporting many transmit errors.', + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ 
printf "%.0f" $value }} transmit errors in the last two minutes.', }, }, ], From 706511a49598db1c256a85b2b7dec4e6d754cabd Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 17 Jul 2019 23:54:31 +0200 Subject: [PATCH 27/31] Responses to review comments, round 3 Signed-off-by: beorn7 --- docs/node-mixin/config.libsonnet | 15 +++-- docs/node-mixin/dashboards/node.libsonnet | 18 ++++-- docs/node-mixin/dashboards/use.libsonnet | 76 ++++++++++++++--------- docs/node-mixin/rules/rules.libsonnet | 15 +++-- 4 files changed, 81 insertions(+), 43 deletions(-) diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 701d9bea..95070ca9 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -5,12 +5,17 @@ // Select the metrics coming from the node exporter. nodeExporterSelector: 'job="node"', - // Select the fstype for filesystem-related queries. - // TODO: What is a good default selector here? - fsSelector: 'fstype=~"ext.|xfs|jfs|btrfs|vfat|ntfs"', + // Select the fstype for filesystem-related queries. If left + // empty, all filesystems are selected. If you have unusual + // filesystems you don't want to include in dashboards and + // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'. + fsSelector: '', - // Select the device for disk-related queries. - diskDeviceSelector: 'device=~"(sd|xvd).+"', + // Select the device for disk-related queries. If left empty, all + // devices are selected. If you have unusual devices you don't + // want to include in dashboards and alerting, you can exclude + // them here, e.g. 'device!="tmpfs"'.
+ diskDeviceSelector: '', grafana_prefix: '', }, diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 915cbe48..c3c97f37 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -22,7 +22,7 @@ local gauge = promgrafonnet.gauge; .addTarget(prometheus.target( // TODO: Consider using `${__interval}` as range and a 1m min step. ||| - 1 - avg by (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) + 1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]) ||| % $._config, legendFormat='{{cpu}}', intervalFactor=10, @@ -64,15 +64,18 @@ local gauge = promgrafonnet.gauge; .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); + // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%. + // This needs to be added upstream in the promgrafonnet library and then changed here. local memoryGauge = gauge.new( 'Memory Usage', ||| + 100 - ( node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} - ) * 100 + ) ||| % $._config, ).withLowerBeingBetter(); @@ -82,10 +85,11 @@ local gauge = promgrafonnet.gauge; datasource='$datasource', span=9, ) + // TODO: Does it make sense to have those three in the same panel? // TODO: Consider using `${__interval}` as range and a 1m min step. 
- .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} read')) - .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} written')) - .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} io time')) + + .addTarget(prometheus.target('rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} read')) + .addTarget(prometheus.target('rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} written')) + .addTarget(prometheus.target('rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} io time')) + { seriesOverrides: [ { @@ -103,6 +107,8 @@ local gauge = promgrafonnet.gauge; ], }; + // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%. + // This needs to be added upstream in the promgrafonnet library and then changed here. // TODO: Should this be partitioned by mountpoint? 
local diskSpaceUsage = gauge.new( 'Disk Space Usage', @@ -158,7 +164,7 @@ local gauge = promgrafonnet.gauge; template.new( 'instance', '$datasource', - 'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config, + 'label_values(node_exporter_build_info{%(nodeExporterSelector)s}, instance)' % $._config, refresh='time', ) ) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 533f392b..e3739ac2 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -12,13 +12,13 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('CPU Utilisation') + g.queryPanel(||| ( - instance:node_cpu_utilisation:avg_rate1m + instance:node_cpu_utilisation:avg_rate1m{%(nodeExporterSelector)s} * - instance:node_num_cpu:sum + instance:node_num_cpu:sum{%(nodeExporterSelector)s} / ignoring (instance) group_left - sum without (instance) (instance:node_num_cpu:sum) + sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s}) ) - |||, '{{instance}}', legendLink) + + ||| % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -27,11 +27,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('CPU Saturation (load1 per CPU)') + g.queryPanel(||| ( - instance:node_load1_per_cpu:ratio + instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s} / ignoring (instance) group_left - count without (instance) (instance:node_load1_per_cpu:ratio) + count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}) ) - |||, '{{instance}}', legendLink) + + ||| % $._config, '{{instance}}', legendLink) + g.stack + // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios. 
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, @@ -41,13 +41,13 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Memory') .addPanel( g.panel('Memory Utilisation') + - g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) + + g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( g.panel('Memory Saturation (Swap I/O)') + - g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes('Bps') }, ) @@ -60,11 +60,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. g.queryPanel(||| ( - instance:node_disk_io_time:sum_rate1m + instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s} / ignoring (instance) group_left - count without (instance) (instance:node_disk_io_time:sum_rate1m) + count without (instance) (instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s}) ) - |||, '{{instance}}', legendLink) + + ||| % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -72,11 +72,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Disk IO Saturation') + g.queryPanel(||| ( - instance:node_disk_io_time_weighted:sum_rate1m + instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s} / ignoring (instance) group_left - count without (instance) (instance:node_disk_io_time_weighted:sum_rate1m) + count without (instance) (instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s}) ) - |||, '{{instance}}', legendLink) + + ||| % $._config, '{{instance}}', legendLink) + 
g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -84,16 +84,30 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addRow( g.row('Network') .addPanel( - g.panel('Net Utilisation (Transmitted)') + - g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) + + g.panel('Net Utilisation (Bytes Receive/Transmit)') + + g.queryPanel( + [ + 'instance:node_network_receive_bytes:sum_rate1m{%(nodeExporterSelector)s}' % $._config, + '-instance:node_network_transmit_bytes:sum_rate1m{%(nodeExporterSelector)s}' % $._config, + ], + ['{{instance}} Receive', '{{instance}} Transmit'], + legendLink, + ) + g.stack + { yaxes: g.yaxes('Bps') }, ) .addPanel( - g.panel('Net Saturation (Dropped)') + - g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) + + g.panel('Net Saturation (Drops Receive/Transmit)') + + g.queryPanel( + [ + 'instance:node_network_receive_drop:sum_rate1m{%(nodeExporterSelector)s}' % $._config, + '-instance:node_network_transmit_drop:sum_rate1m{%(nodeExporterSelector)s}' % $._config, + ], + ['{{instance}} Receive', '{{instance}} Transmit'], + legendLink, + ) + g.stack + - { yaxes: g.yaxes('Bps') }, + { yaxes: g.yaxes('rps') }, ) ) .addRow( @@ -127,12 +141,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('CPU') .addPanel( g.panel('CPU Utilisation') + - g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('CPU Saturation (Load1)') + - g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') + + g.queryPanel('instance:node_cpu_saturation_load1:{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) @@ -140,12 +154,12 @@ local g = import 
'grafana-builder/grafana.libsonnet'; g.row('Memory') .addPanel( g.panel('Memory Utilisation') + - g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') + + g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Memory Saturation (pages swapped per second)') + - g.queryPanel('instance:node_memory_swap_io_pages:rate1m{instance="$instance"}', 'Swap IO') + + g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Swap IO') + { yaxes: g.yaxes('short') }, ) ) @@ -153,12 +167,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk IO Utilisation') + - g.queryPanel('instance:node_disk_io_time:sum_rate1m{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Disk IO Saturation') + - g.queryPanel('instance:node_disk_io_time_weighted:sum_rate1m{instance="$instance"}', 'Saturation') + + g.queryPanel('instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) @@ -167,7 +181,10 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('Net Utilisation (Bytes Receive/Transmit)') + g.queryPanel( - ['node_network_receive_bytes_total{instance="$instance"}', '-node_network_transmit_bytes_total{instance="$instance"}'], + [ + 'instance:node_network_receive_bytes:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + '-instance:node_network_transmit_bytes:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + ], ['Receive', 'Transmit'], ) + { yaxes: 
g.yaxes('Bps') }, @@ -175,7 +192,10 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('Net Saturation (Drops Receive/Transmit)') + g.queryPanel( - ['node_network_receive_drop_total{instance="$instance"}', '-node_network_transmit_drop_total{instance="$instance"}'], + [ + 'instance:node_network_receive_drop:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + '-instance:node_network_transmit_drop:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + ], ['Receive drops', 'Transmit drops'], ) + { yaxes: g.yaxes('rps') }, diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index 5422f443..d8c0faed 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -9,7 +9,7 @@ record: 'instance:node_num_cpu:sum', expr: ||| count without (cpu) ( - sum without (mode) ( + count without (mode) ( node_cpu_seconds_total{%(nodeExporterSelector)s} ) ) @@ -26,7 +26,9 @@ }, { // This is CPU saturation: 1min avg run queue length / number of CPUs. - // Can go over 1. >1 is bad. + // Can go over 1. + // TODO: There are situation where a run queue >1/core is just normal and fine. + // We need to clarify how to lead this metric and if its usage is helpful at all. record: 'instance:node_load1_per_cpu:ratio', expr: ||| ( @@ -59,7 +61,9 @@ }, { // Disk utilisation (seconds spent, 1 second rate) - record: 'instance:node_disk_io_time:sum_rate1m', + // TODO: This should probably not aggregate over all devices but + // keep them separate. + record: 'instance:node_disk_io_time_seconds:sum_rate1m', expr: ||| sum without (device) ( rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) @@ -68,7 +72,9 @@ }, { // Disk saturation (weighted seconds spent, 1 second rate) - record: 'instance:node_disk_io_time_weighted:sum_rate1m', + // TODO: This should probably not aggregate over all devices but + // keep them separate. 
+ record: 'instance:node_disk_io_time_weighted_seconds:sum_rate1m', expr: ||| sum without (device) ( rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) @@ -93,6 +99,7 @@ ) ||| % $._config, }, + // TODO: Find out if those drops ever happen on modern switched networks. { record: 'instance:node_network_receive_drop:sum_rate1m', expr: ||| From b8c4b0cb298bf63ca701073e921c65e5fc99a0d7 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 18 Jul 2019 14:14:02 +0200 Subject: [PATCH 28/31] Removed unneeded `sum_` and `avg_` from rule names Signed-off-by: beorn7 --- docs/node-mixin/dashboards/use.libsonnet | 38 ++++++++++++------------ docs/node-mixin/rules/rules.libsonnet | 14 ++++----- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index e3739ac2..23cd2ea7 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -12,7 +12,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('CPU Utilisation') + g.queryPanel(||| ( - instance:node_cpu_utilisation:avg_rate1m{%(nodeExporterSelector)s} + instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s} * instance:node_num_cpu:sum{%(nodeExporterSelector)s} / ignoring (instance) group_left @@ -46,10 +46,10 @@ local g = import 'grafana-builder/grafana.libsonnet'; { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - g.panel('Memory Saturation (Swap I/O)') + - g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + + g.panel('Memory Saturation (Swapped Pages)') + + g.queryPanel('instance:node_memory_swap_io_pages:rate{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + g.stack + - { yaxes: g.yaxes('Bps') }, + { yaxes: g.yaxes('rps') }, ) ) .addRow( @@ -60,9 +60,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; // 1 
second per second doing I/O, normalize by metric cardinality for stacked charts. g.queryPanel(||| ( - instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s} + instance:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s} / ignoring (instance) group_left - count without (instance) (instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s}) + count without (instance) (instance:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}) ) ||| % $._config, '{{instance}}', legendLink) + g.stack + @@ -72,9 +72,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Disk IO Saturation') + g.queryPanel(||| ( - instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s} + instance:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s} / ignoring (instance) group_left - count without (instance) (instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s}) + count without (instance) (instance:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}) ) ||| % $._config, '{{instance}}', legendLink) + g.stack + @@ -87,8 +87,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Net Utilisation (Bytes Receive/Transmit)') + g.queryPanel( [ - 'instance:node_network_receive_bytes:sum_rate1m{%(nodeExporterSelector)s}' % $._config, - '-instance:node_network_transmit_bytes:sum_rate1m{%(nodeExporterSelector)s}' % $._config, + 'instance:node_network_receive_bytes:rate1m{%(nodeExporterSelector)s}' % $._config, + '-instance:node_network_transmit_bytes:rate1m{%(nodeExporterSelector)s}' % $._config, ], ['{{instance}} Receive', '{{instance}} Transmit'], legendLink, @@ -100,8 +100,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Net Saturation (Drops Receive/Transmit)') + g.queryPanel( [ - 'instance:node_network_receive_drop:sum_rate1m{%(nodeExporterSelector)s}' % $._config, - 
'-instance:node_network_transmit_drop:sum_rate1m{%(nodeExporterSelector)s}' % $._config, + 'instance:node_network_receive_drop:rate1m{%(nodeExporterSelector)s}' % $._config, + '-instance:node_network_transmit_drop:rate1m{%(nodeExporterSelector)s}' % $._config, ], ['{{instance}} Receive', '{{instance}} Transmit'], legendLink, @@ -141,7 +141,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('CPU') .addPanel( g.panel('CPU Utilisation') + - g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( @@ -167,12 +167,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk IO Utilisation') + - g.queryPanel('instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + + g.queryPanel('instance:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Disk IO Saturation') + - g.queryPanel('instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + + g.queryPanel('instance:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) @@ -182,8 +182,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Net Utilisation (Bytes Receive/Transmit)') + g.queryPanel( [ - 'instance:node_network_receive_bytes:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, - '-instance:node_network_transmit_bytes:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + 
'instance:node_network_receive_bytes:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + '-instance:node_network_transmit_bytes:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, ], ['Receive', 'Transmit'], ) + @@ -193,8 +193,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Net Saturation (Drops Receive/Transmit)') + g.queryPanel( [ - 'instance:node_network_receive_drop:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, - '-instance:node_network_transmit_drop:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + 'instance:node_network_receive_drop:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + '-instance:node_network_transmit_drop:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, ], ['Receive drops', 'Transmit drops'], ) + diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index d8c0faed..ad1b7171 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -17,7 +17,7 @@ }, { // CPU utilisation is % CPU is not idle. - record: 'instance:node_cpu_utilisation:avg_rate1m', + record: 'instance:node_cpu_utilisation:rate1m', expr: ||| 1 - avg without (cpu, mode) ( rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m]) @@ -63,7 +63,7 @@ // Disk utilisation (seconds spent, 1 second rate) // TODO: This should probably not aggregate over all devices but // keep them separate. - record: 'instance:node_disk_io_time_seconds:sum_rate1m', + record: 'instance:node_disk_io_time_seconds:rate1m', expr: ||| sum without (device) ( rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) @@ -74,7 +74,7 @@ // Disk saturation (weighted seconds spent, 1 second rate) // TODO: This should probably not aggregate over all devices but // keep them separate. 
- record: 'instance:node_disk_io_time_weighted_seconds:sum_rate1m', + record: 'instance:node_disk_io_time_weighted_seconds:rate1m', expr: ||| sum without (device) ( rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) @@ -84,7 +84,7 @@ // TODO: For the following rules, consider configurable filtering to exclude more network // device names than just "lo". { - record: 'instance:node_network_receive_bytes:sum_rate1m', + record: 'instance:node_network_receive_bytes:rate1m', expr: ||| sum without (device) ( rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) @@ -92,7 +92,7 @@ ||| % $._config, }, { - record: 'instance:node_network_transmit_bytes:sum_rate1m', + record: 'instance:node_network_transmit_bytes:rate1m', expr: ||| sum without (device) ( rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) @@ -101,7 +101,7 @@ }, // TODO: Find out if those drops ever happen on modern switched networks. 
{ - record: 'instance:node_network_receive_drop:sum_rate1m', + record: 'instance:node_network_receive_drop:rate1m', expr: ||| sum without (device) ( rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) @@ -109,7 +109,7 @@ ||| % $._config, }, { - record: 'instance:node_network_transmit_drop:sum_rate1m', + record: 'instance:node_network_transmit_drop:rate1m', expr: ||| sum without (device) ( rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) From e01d9f9e78536dbabf76836e355b4202535d690a Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 18 Jul 2019 15:59:35 +0200 Subject: [PATCH 29/31] Break out device in disk IO rules/dashboard Signed-off-by: beorn7 --- docs/node-mixin/dashboards/use.libsonnet | 20 ++++++++++---------- docs/node-mixin/rules/rules.libsonnet | 16 ++++------------ 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 23cd2ea7..160cfd8f 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -60,11 +60,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. 
g.queryPanel(||| ( - instance:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s} - / ignoring (instance) group_left - count without (instance) (instance:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}) + instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s} + / ignoring (instance, device) group_left + count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}) ) - ||| % $._config, '{{instance}}', legendLink) + + ||| % $._config, '{{instance}} {{device}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -72,11 +72,11 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Disk IO Saturation') + g.queryPanel(||| ( - instance:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s} - / ignoring (instance) group_left - count without (instance) (instance:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}) + instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s} + / ignoring (instance, device) group_left + count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}) ) - ||| % $._config, '{{instance}}', legendLink) + + ||| % $._config, '{{instance}} {{device}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -167,12 +167,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk IO Utilisation') + - g.queryPanel('instance:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + + g.queryPanel('instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation {{device}}') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Disk IO Saturation') + - 
g.queryPanel('instance:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + + g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation {{device}}') + { yaxes: g.yaxes('percentunit') }, ) ) diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index ad1b7171..b5efc6a2 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -61,24 +61,16 @@ }, { // Disk utilisation (seconds spent, 1 second rate) - // TODO: This should probably not aggregate over all devices but - // keep them separate. - record: 'instance:node_disk_io_time_seconds:rate1m', + record: 'instance_device:node_disk_io_time_seconds:rate1m', expr: ||| - sum without (device) ( - rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) - ) + rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ||| % $._config, }, { // Disk saturation (weighted seconds spent, 1 second rate) - // TODO: This should probably not aggregate over all devices but - // keep them separate. 
- record: 'instance:node_disk_io_time_weighted_seconds:rate1m', + record: 'instance_device:node_disk_io_time_weighted_seconds:rate1m', expr: ||| - sum without (device) ( - rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) - ) + rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ||| % $._config, }, // TODO: For the following rules, consider configurable filtering to exclude more network From 36dc7451c95dbac064fff72a2929d1c6dc82b187 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Mon, 22 Jul 2019 14:06:27 +0200 Subject: [PATCH 30/31] Improvement of comments and panel titles Signed-off-by: beorn7 --- docs/node-mixin/dashboards/use.libsonnet | 9 +++++++-- docs/node-mixin/rules/rules.libsonnet | 8 ++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 160cfd8f..7499493b 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -23,7 +23,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - // TODO: Is this a useful panel? + // TODO: Is this a useful panel? At least there should be some explanation how load + // average relates to the "CPU saturation" in the title. g.panel('CPU Saturation (load1 per CPU)') + g.queryPanel(||| ( @@ -58,6 +59,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Disk IO Utilisation') + // Full utilisation would be all disks on each node spending an average of // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. + // TODO: Does the partition by device make sense? Using the most utilized device per + // instance might make more sense. 
g.queryPanel(||| ( instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s} @@ -113,7 +116,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addRow( g.row('Storage') .addPanel( - g.panel('Disk Capacity') + + g.panel('Disk Space Utilisation') + g.queryPanel(||| ( sum without (device) ( @@ -145,6 +148,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; { yaxes: g.yaxes('percentunit') }, ) .addPanel( + // TODO: Is this a useful panel? At least there should be some explanation how load + // average relates to the "CPU saturation" in the title. g.panel('CPU Saturation (Load1)') + g.queryPanel('instance:node_cpu_saturation_load1:{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + { yaxes: g.yaxes('percentunit') }, diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index b5efc6a2..8bb37038 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -28,7 +28,7 @@ // This is CPU saturation: 1min avg run queue length / number of CPUs. // Can go over 1. // TODO: There are situation where a run queue >1/core is just normal and fine. - // We need to clarify how to lead this metric and if its usage is helpful at all. + // We need to clarify how to read this metric and if its usage is helpful at all. record: 'instance:node_load1_per_cpu:ratio', expr: ||| ( @@ -39,7 +39,7 @@ ||| % $._config, }, { - // Memory utilisation per node, normalized by per-node memory + // Memory utilisation (ratio of used memory per instance). record: 'instance:node_memory_utilisation:ratio', expr: ||| 1 - ( @@ -60,14 +60,14 @@ ||| % $._config, }, { - // Disk utilisation (seconds spent, 1 second rate) + // Disk utilisation (seconds spent, 1 second rate). 
record: 'instance_device:node_disk_io_time_seconds:rate1m', expr: ||| rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ||| % $._config, }, { - // Disk saturation (weighted seconds spent, 1 second rate) + // Disk saturation (weighted seconds spent, 1 second rate). record: 'instance_device:node_disk_io_time_weighted_seconds:rate1m', expr: ||| rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) From 79f0357e38464a794c30f8966a2181ec7a42bc35 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Mon, 22 Jul 2019 20:21:52 +0200 Subject: [PATCH 31/31] Added `_excluding_lo` to name of network rules that exclude lo Signed-off-by: beorn7 --- docs/node-mixin/dashboards/use.libsonnet | 16 ++++++++-------- docs/node-mixin/rules/rules.libsonnet | 10 ++++------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 7499493b..b74adef2 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -90,8 +90,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Net Utilisation (Bytes Receive/Transmit)') + g.queryPanel( [ - 'instance:node_network_receive_bytes:rate1m{%(nodeExporterSelector)s}' % $._config, - '-instance:node_network_transmit_bytes:rate1m{%(nodeExporterSelector)s}' % $._config, + 'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, + '-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, ], ['{{instance}} Receive', '{{instance}} Transmit'], legendLink, @@ -103,8 +103,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Net Saturation (Drops Receive/Transmit)') + g.queryPanel( [ - 'instance:node_network_receive_drop:rate1m{%(nodeExporterSelector)s}' % $._config, - '-instance:node_network_transmit_drop:rate1m{%(nodeExporterSelector)s}' % 
$._config, + 'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, + '-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, ], ['{{instance}} Receive', '{{instance}} Transmit'], legendLink, @@ -187,8 +187,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Net Utilisation (Bytes Receive/Transmit)') + g.queryPanel( [ - 'instance:node_network_receive_bytes:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, - '-instance:node_network_transmit_bytes:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + 'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + '-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, ], ['Receive', 'Transmit'], ) + @@ -198,8 +198,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Net Saturation (Drops Receive/Transmit)') + g.queryPanel( [ - 'instance:node_network_receive_drop:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, - '-instance:node_network_transmit_drop:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + 'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + '-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, ], ['Receive drops', 'Transmit drops'], ) + diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index 8bb37038..85f7618d 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -73,10 +73,8 @@ rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ||| % $._config, }, - // TODO: For the following rules, consider configurable filtering to exclude more 
network - // device names than just "lo". { - record: 'instance:node_network_receive_bytes:rate1m', + record: 'instance:node_network_receive_bytes_excluding_lo:rate1m', expr: ||| sum without (device) ( rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) @@ -84,7 +82,7 @@ ||| % $._config, }, { - record: 'instance:node_network_transmit_bytes:rate1m', + record: 'instance:node_network_transmit_bytes_excluding_lo:rate1m', expr: ||| sum without (device) ( rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) @@ -93,7 +91,7 @@ }, // TODO: Find out if those drops ever happen on modern switched networks. { - record: 'instance:node_network_receive_drop:rate1m', + record: 'instance:node_network_receive_drop_excluding_lo:rate1m', expr: ||| sum without (device) ( rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) @@ -101,7 +99,7 @@ ||| % $._config, }, { - record: 'instance:node_network_transmit_drop:rate1m', + record: 'instance:node_network_transmit_drop_excluding_lo:rate1m', expr: ||| sum without (device) ( rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])