From f350aaf87e9f484dcb774b9a8b7a2d50a8af7d96 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Tue, 13 Aug 2019 21:54:28 +0200 Subject: [PATCH] node-mixin: Fix various straight-forward issues in the USE dashboards - Normalize cluster memory utilisation. - Fix missing `1m` in memory saturation. - Have both disk-related rows next to each other instead of with the network row in between. - Correctly render transmit network traffic as negative, using `seriesOverrides` and `min: null` for the y-axis. - Make panel and row naming consistent. - Remove legend where it would just display a single entry with exactly the title of the panel. - Fix metric name in individual node CPU Saturation panel. - Break up disk space utilisation by device in the panel for an individual node. NB: All of that doesn't touch any more subtle issues captured in the various TODOs. Signed-off-by: beorn7 --- docs/node-mixin/dashboards/use.libsonnet | 192 ++++++++++++++++------- 1 file changed, 131 insertions(+), 61 deletions(-) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index b74adef2..1a75daa1 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -42,19 +42,80 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Memory') .addPanel( g.panel('Memory Utilisation') + - g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + + g.queryPanel(||| + ( + instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s} + / ignoring (instance) group_left + count without (instance) (instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}) + ) + ||| % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( g.panel('Memory Saturation (Swapped Pages)') + - g.queryPanel('instance:node_memory_swap_io_pages:rate{%(nodeExporterSelector)s}' % $._config, '{{instance}}', 
legendLink) + + g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes('rps') }, ) ) .addRow( - g.row('Disk') + g.row('Network') + .addPanel( + g.panel('Net Utilisation (Bytes Receive/Transmit)') + + g.queryPanel( + [ + 'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, + 'instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, + ], + ['{{instance}} Receive', '{{instance}} Transmit'], + legendLink, + ) + + g.stack + + { + yaxes: g.yaxes({ format: 'Bps', min: null }), + seriesOverrides: [ + { + alias: '/ Receive/', + stack: 'A', + }, + { + alias: '/ Transmit/', + stack: 'B', + transform: 'negative-Y', + }, + ], + }, + ) + .addPanel( + g.panel('Net Saturation (Drops Receive/Transmit)') + + g.queryPanel( + [ + 'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, + 'instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, + ], + ['{{instance}} Receive', '{{instance}} Transmit'], + legendLink, + ) + + g.stack + + { + yaxes: g.yaxes({ format: 'rps', min: null }), + seriesOverrides: [ + { + alias: '/ Receive/', + stack: 'A', + }, + { + alias: '/ Transmit/', + stack: 'B', + transform: 'negative-Y', + }, + ], + }, + ) + ) + .addRow( + g.row('Disk IO') .addPanel( g.panel('Disk IO Utilisation') + // Full utilisation would be all disks on each node spending an average of @@ -85,36 +146,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; ) ) .addRow( - g.row('Network') - .addPanel( - g.panel('Net Utilisation (Bytes Receive/Transmit)') + - g.queryPanel( - [ - 'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, - '-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, - ], - ['{{instance}} Receive', '{{instance}} Transmit'], 
- legendLink, - ) + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) - .addPanel( - g.panel('Net Saturation (Drops Receive/Transmit)') + - g.queryPanel( - [ - 'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, - '-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config, - ], - ['{{instance}} Receive', '{{instance}} Transmit'], - legendLink, - ) + - g.stack + - { yaxes: g.yaxes('rps') }, - ) - ) - .addRow( - g.row('Storage') + g.row('Disk Space') .addPanel( g.panel('Disk Space Utilisation') + g.queryPanel(||| @@ -145,14 +177,20 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('CPU Utilisation') + g.queryPanel('instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + - { yaxes: g.yaxes('percentunit') }, + { + yaxes: g.yaxes('percentunit'), + legend+: { show: false }, + }, ) .addPanel( // TODO: Is this a useful panel? At least there should be some explanation how load // average relates to the "CPU saturation" in the title. 
- g.panel('CPU Saturation (Load1)') + - g.queryPanel('instance:node_cpu_saturation_load1:{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + - { yaxes: g.yaxes('percentunit') }, + g.panel('CPU Saturation (Load1 per CPU)') + + g.queryPanel('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + + { + yaxes: g.yaxes('percentunit'), + legend+: { show: false }, + }, ) ) .addRow( @@ -165,20 +203,10 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('Memory Saturation (pages swapped per second)') + g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Swap IO') + - { yaxes: g.yaxes('short') }, - ) - ) - .addRow( - g.row('Disk') - .addPanel( - g.panel('Disk IO Utilisation') + - g.queryPanel('instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation {{device}}') + - { yaxes: g.yaxes('percentunit') }, - ) - .addPanel( - g.panel('Disk IO Saturation') + - g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation {{device}}') + - { yaxes: g.yaxes('percentunit') }, + { + yaxes: g.yaxes('short'), + legend+: { show: false }, + }, ) ) .addRow( @@ -188,37 +216,79 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.queryPanel( [ 'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, - '-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + 'instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, ], ['Receive', 'Transmit'], ) + - { yaxes: g.yaxes('Bps') }, + { + yaxes: g.yaxes({ format: 'Bps', min: null }), + seriesOverrides: [ + { + alias: '/Receive/', + stack: 
'A', + }, + { + alias: '/Transmit/', + stack: 'B', + transform: 'negative-Y', + }, + ], + }, ) .addPanel( g.panel('Net Saturation (Drops Receive/Transmit)') + g.queryPanel( [ 'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, - '-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + 'instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, ], ['Receive drops', 'Transmit drops'], ) + - { yaxes: g.yaxes('rps') }, + { + yaxes: g.yaxes({ format: 'rps', min: null }), + seriesOverrides: [ + { + alias: '/Receive/', + stack: 'A', + }, + { + alias: '/Transmit/', + stack: 'B', + transform: 'negative-Y', + }, + ], + }, ) ) .addRow( - g.row('Disk') + g.row('Disk IO') .addPanel( - g.panel('Disk Utilisation') + + g.panel('Disk IO Utilisation') + + g.queryPanel('instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, '{{device}}') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, '{{device}}') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Disk Space') + .addPanel( + g.panel('Disk Space Utilisation') + g.queryPanel(||| 1 - ( - sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s})) + max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}) / - sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s})) + max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}) ) - ||| % $._config, 'Disk') + - { yaxes: 
g.yaxes('percentunit') }, + ||| % $._config, '{{device}}') + + { + yaxes: g.yaxes('percentunit'), + legend+: { show: false }, + }, ), ), },