diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 160cfd8f..7499493b 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -23,7 +23,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - // TODO: Is this a useful panel? + // TODO: Is this a useful panel? At least there should be some explanation how load + // average relates to the "CPU saturation" in the title. g.panel('CPU Saturation (load1 per CPU)') + g.queryPanel(||| ( @@ -58,6 +59,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Disk IO Utilisation') + // Full utilisation would be all disks on each node spending an average of // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. + // TODO: Does the partition by device make sense? Using the most utilized device per + // instance might make more sense. g.queryPanel(||| ( instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s} @@ -113,7 +116,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addRow( g.row('Storage') .addPanel( - g.panel('Disk Capacity') + + g.panel('Disk Space Utilisation') + g.queryPanel(||| ( sum without (device) ( @@ -145,6 +148,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; { yaxes: g.yaxes('percentunit') }, ) .addPanel( + // TODO: Is this a useful panel? At least there should be some explanation how load + // average relates to the "CPU saturation" in the title. g.panel('CPU Saturation (Load1)') + g.queryPanel('instance:node_cpu_saturation_load1:{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + { yaxes: g.yaxes('percentunit') }, diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index b5efc6a2..8bb37038 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -28,7 +28,7 @@ // This is CPU saturation: 1min avg run queue length / number of CPUs. // Can go over 1. // TODO: There are situation where a run queue >1/core is just normal and fine. - // We need to clarify how to lead this metric and if its usage is helpful at all. + // We need to clarify how to read this metric and if its usage is helpful at all. record: 'instance:node_load1_per_cpu:ratio', expr: ||| ( @@ -39,7 +39,7 @@ ||| % $._config, }, { - // Memory utilisation per node, normalized by per-node memory + // Memory utilisation (ratio of used memory per instance). record: 'instance:node_memory_utilisation:ratio', expr: ||| 1 - ( @@ -60,14 +60,14 @@ ||| % $._config, }, { - // Disk utilisation (seconds spent, 1 second rate) + // Disk utilisation (seconds spent, 1 second rate). record: 'instance_device:node_disk_io_time_seconds:rate1m', expr: ||| rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ||| % $._config, }, { - // Disk saturation (weighted seconds spent, 1 second rate) + // Disk saturation (weighted seconds spent, 1 second rate). record: 'instance_device:node_disk_io_time_weighted_seconds:rate1m', expr: ||| rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])