diff --git a/docs/node-mixin/.gitignore b/docs/node-mixin/.gitignore
new file mode 100644
index 00000000..522b99f0
--- /dev/null
+++ b/docs/node-mixin/.gitignore
@@ -0,0 +1,4 @@
+jsonnetfile.lock.json
+vendor
+*.yaml
+dashboards_out
diff --git a/docs/node-mixin/Makefile b/docs/node-mixin/Makefile
new file mode 100644
index 00000000..012a4b50
--- /dev/null
+++ b/docs/node-mixin/Makefile
@@ -0,0 +1,28 @@
+JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
+
+all: fmt node_alerts.yaml node_rules.yaml dashboards_out lint
+
+fmt:
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		xargs -n 1 -- $(JSONNET_FMT) -i
+
+node_alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*)
+	jsonnet -S alerts.jsonnet > $@
+
+node_rules.yaml: mixin.libsonnet config.libsonnet $(wildcard rules/*)
+	jsonnet -S rules.jsonnet > $@
+
+dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*)
+	@mkdir -p dashboards_out
+	jsonnet -J vendor -m dashboards_out dashboards.jsonnet
+
+lint: node_alerts.yaml node_rules.yaml
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		while read f; do \
+			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
+		done
+
+	promtool check rules node_alerts.yaml node_rules.yaml
+
+clean:
+	rm -rf dashboards_out node_alerts.yaml node_rules.yaml
diff --git a/docs/node-mixin/README.md b/docs/node-mixin/README.md
new file mode 100644
index 00000000..489b599c
--- /dev/null
+++ b/docs/node-mixin/README.md
@@ -0,0 +1,44 @@
+# Node Mixin
+
+_This is work in progress. We aim for it to become a good role model for alerts
+and dashboards eventually, but it is not quite there yet._
+
+The Node Mixin is a set of configurable, reusable, and extensible alerts and
+dashboards based on the metrics exported by the Node Exporter. The mixin creates
+recording and alerting rules for Prometheus and suitable dashboard descriptions
+for Grafana.
+
+To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you
+have a working Go development environment, it's easiest to run the following:
+```bash
+$ go get github.com/google/go-jsonnet/cmd/jsonnet
+$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
+```
+
+_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is
+currently not included in the Go implementation of `jsonnet`. For the time
+being, you have to install the [C++ version of
+jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint`
+or `make fmt`._
+
+Next, install the dependencies by running the following command in this
+directory:
+```bash
+$ jb install
+```
+
+You can then build the Prometheus rules files `node_alerts.yaml` and
+`node_rules.yaml`:
+```bash
+$ make node_alerts.yaml node_rules.yaml
+```
+
+You can also build a directory `dashboards_out` with the JSON dashboard files
+for Grafana:
+```bash
+$ make dashboards_out
+```
+
+For more advanced uses of mixins, see
+https://github.com/monitoring-mixins/docs.
+
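The `_config` object in `config.libsonnet` (below) is what makes the mixin configurable: a consuming jsonnet file imports `mixin.libsonnet`, overrides the selectors, and renders the result. A minimal sketch of such a consumer — the file name `example.jsonnet` and the `job="node-exporter"` value are illustrative assumptions, not part of this change:

```jsonnet
// example.jsonnet (hypothetical consumer file, not part of this change).
// Override the mixin's selectors, then render the alerts as YAML,
// mirroring what alerts.jsonnet does with the defaults.
local mixin = (import 'mixin.libsonnet') + {
  _config+:: {
    // Assumption: your node exporter scrape job is named "node-exporter".
    nodeExporterSelector: 'job="node-exporter"',
    // Exclude tmpfs, as suggested by the comment in config.libsonnet.
    fsSelector: 'fstype!="tmpfs"',
  },
};

std.manifestYamlDoc(mixin.prometheusAlerts)
```

Rendering works the same way as the Makefile targets above, e.g. `jsonnet -S example.jsonnet > node_alerts.yaml`.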
diff --git a/docs/node-mixin/alerts.jsonnet b/docs/node-mixin/alerts.jsonnet
new file mode 100644
index 00000000..75e7c1b2
--- /dev/null
+++ b/docs/node-mixin/alerts.jsonnet
@@ -0,0 +1 @@
+std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
new file mode 100644
index 00000000..7b9fb890
--- /dev/null
+++ b/docs/node-mixin/alerts/alerts.libsonnet
@@ -0,0 +1,191 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'node-exporter',
+        rules: [
+          {
+            alert: 'NodeFilesystemSpaceFillingUp',
+            expr: |||
+              (
+                node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 40
+                and
+                predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0
+                and
+                node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
+              )
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Filesystem is predicted to run out of space within the next 24 hours.',
+              description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemSpaceFillingUp',
+            expr: |||
+              (
+                node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 20
+                and
+                predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
+                and
+                node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
+              )
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
+              description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemAlmostOutOfSpace',
+            expr: |||
+              (
+                node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
+                and
+                node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
+              )
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Filesystem has less than 5% space left.',
+              description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemAlmostOutOfSpace',
+            expr: |||
+              (
+                node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
+                and
+                node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
+              )
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Filesystem has less than 3% space left.',
+              description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemFilesFillingUp',
+            expr: |||
+              (
+                node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 40
+                and
+                predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0
+                and
+                node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
+              )
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.',
+              description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemFilesFillingUp',
+            expr: |||
+              (
+                node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 20
+                and
+                predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
+                and
+                node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
+              )
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
+              description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemAlmostOutOfFiles',
+            expr: |||
+              (
+                node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
+                and
+                node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
+              )
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Filesystem has less than 5% inodes left.',
+              description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemAlmostOutOfFiles',
+            expr: |||
+              (
+                node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
+                and
+                node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
+              )
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Filesystem has less than 3% inodes left.',
+              description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
+            },
+          },
+          {
+            alert: 'NodeNetworkReceiveErrs',
+            expr: |||
+              increase(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) > 10
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Network interface is reporting many receive errors.',
+              description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.',
+            },
+          },
+          {
+            alert: 'NodeNetworkTransmitErrs',
+            expr: |||
+              increase(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) > 10
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Network interface is reporting many transmit errors.',
+              description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
new file mode 100644
index 00000000..95070ca9
--- /dev/null
+++ b/docs/node-mixin/config.libsonnet
@@ -0,0 +1,22 @@
+{
+  _config+:: {
+    // Selectors are inserted between {} in Prometheus queries.
+
+    // Select the metrics coming from the node exporter.
+    nodeExporterSelector: 'job="node"',
+
+    // Select the fstype for filesystem-related queries. If left
+    // empty, all filesystems are selected. If you have unusual
+    // filesystems you don't want to include in dashboards and
+    // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
+    fsSelector: '',
+
+    // Select the device for disk-related queries. If left empty, all
+    // devices are selected. If you have unusual devices you don't
+    // want to include in dashboards and alerting, you can exclude
+    // them here, e.g. 'device!="sr0"'.
+    diskDeviceSelector: '',
+
+    grafana_prefix: '',
+  },
+}
diff --git a/docs/node-mixin/dashboards.jsonnet b/docs/node-mixin/dashboards.jsonnet
new file mode 100644
index 00000000..9d913ed3
--- /dev/null
+++ b/docs/node-mixin/dashboards.jsonnet
@@ -0,0 +1,6 @@
+local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
+
+{
+  [name]: dashboards[name]
+  for name in std.objectFields(dashboards)
+}
diff --git a/docs/node-mixin/dashboards/dashboards.libsonnet b/docs/node-mixin/dashboards/dashboards.libsonnet
new file mode 100644
index 00000000..e6adbd4f
--- /dev/null
+++ b/docs/node-mixin/dashboards/dashboards.libsonnet
@@ -0,0 +1,2 @@
+(import 'node.libsonnet') +
+(import 'use.libsonnet')
diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet
new file mode 100644
index 00000000..c3c97f37
--- /dev/null
+++ b/docs/node-mixin/dashboards/node.libsonnet
@@ -0,0 +1,192 @@
+local grafana = import 'grafonnet/grafana.libsonnet';
+local dashboard = grafana.dashboard;
+local row = grafana.row;
+local prometheus = grafana.prometheus;
+local template = grafana.template;
+local graphPanel = grafana.graphPanel;
+local promgrafonnet = import 'promgrafonnet/promgrafonnet.libsonnet';
+local gauge = promgrafonnet.gauge;
+
+{
+  grafanaDashboards+:: {
+    'nodes.json':
+      local idleCPU =
+        graphPanel.new(
+          'CPU Usage',
+          datasource='$datasource',
+          span=6,
+          format='percentunit',
+          max=1,
+          min=0,
+        )
+        .addTarget(prometheus.target(
+          // TODO: Consider using `${__interval}` as range and a 1m min step.
+          |||
+            1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])
+          ||| % $._config,
+          legendFormat='{{cpu}}',
+          intervalFactor=10,
+        ));
+
+      // TODO: Is this panel useful?
+      local systemLoad =
+        graphPanel.new(
+          'Load Average',
+          datasource='$datasource',
+          span=6,
+          format='short',
+        )
+        .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
+        .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
+        .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average'));
+
+      local memoryGraph =
+        graphPanel.new(
+          'Memory Usage',
+          datasource='$datasource',
+          span=9,
+          format='bytes',
+        )
+        .addTarget(prometheus.target(
+          |||
+            (
+              node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
+            -
+              node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}
+            -
+              node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}
+            -
+              node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}
+            )
+          ||| % $._config, legendFormat='memory used'
+        ))
+        .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
+        .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
+        .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
+
+      // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
+      // This needs to be added upstream in the promgrafonnet library and then changed here.
+      local memoryGauge = gauge.new(
+        'Memory Usage',
+        |||
+          100 -
+          (
+            node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
+          /
+            node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
+          * 100
+          )
+        ||| % $._config,
+      ).withLowerBeingBetter();
+
+      local diskIO =
+        graphPanel.new(
+          'Disk I/O',
+          datasource='$datasource',
+          span=9,
+        )
+        // TODO: Does it make sense to have those three in the same panel?
+        // TODO: Consider using `${__interval}` as range and a 1m min step.
+        .addTarget(prometheus.target('rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} read'))
+        .addTarget(prometheus.target('rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} written'))
+        .addTarget(prometheus.target('rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} io time')) +
+      {
+        seriesOverrides: [
+          {
+            alias: '/ read$/',
+            yaxis: 1,
+          },
+          {
+            alias: '/ io time$/',
+            yaxis: 2,
+          },
+        ],
+        yaxes: [
+          self.yaxe(format='bytes'),
+          self.yaxe(format='s'),
+        ],
+      };
+
+      // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
+      // This needs to be added upstream in the promgrafonnet library and then changed here.
+      // TODO: Should this be partitioned by mountpoint?
+      local diskSpaceUsage = gauge.new(
+        'Disk Space Usage',
+        |||
+          100 -
+          (
+            sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
+          /
+            sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
+          * 100
+          )
+        ||| % $._config,
+      ).withLowerBeingBetter();
+
+      local networkReceived =
+        graphPanel.new(
+          'Network Received',
+          datasource='$datasource',
+          span=6,
+          format='Bps',
+        )
+        // TODO: Consider using `${__interval}` as range and a 1m min step.
+        .addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
+
+      local networkTransmitted =
+        graphPanel.new(
+          'Network Transmitted',
+          datasource='$datasource',
+          span=6,
+          format='Bps',
+        )
+        // TODO: Consider using `${__interval}` as range and a 1m min step.
+        .addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
+
+      dashboard.new('Nodes', time_from='now-1h')
+      .addTemplate(
+        {
+          current: {
+            text: 'Prometheus',
+            value: 'Prometheus',
+          },
+          hide: 0,
+          label: null,
+          name: 'datasource',
+          options: [],
+          query: 'prometheus',
+          refresh: 1,
+          regex: '',
+          type: 'datasource',
+        },
+      )
+      .addTemplate(
+        template.new(
+          'instance',
+          '$datasource',
+          'label_values(node_exporter_build_info{%(nodeExporterSelector)s}, instance)' % $._config,
+          refresh='time',
+        )
+      )
+      .addRow(
+        row.new()
+        .addPanel(idleCPU)
+        .addPanel(systemLoad)
+      )
+      .addRow(
+        row.new()
+        .addPanel(memoryGraph)
+        .addPanel(memoryGauge)
+      )
+      .addRow(
+        row.new()
+        .addPanel(diskIO)
+        .addPanel(diskSpaceUsage)
+      )
+      .addRow(
+        row.new()
+        .addPanel(networkReceived)
+        .addPanel(networkTransmitted)
+      ),
+  },
+}
diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet
new file mode 100644
index 00000000..b74adef2
--- /dev/null
+++ b/docs/node-mixin/dashboards/use.libsonnet
@@ -0,0 +1,225 @@
+local g = import 'grafana-builder/grafana.libsonnet';
+
+{
+  grafanaDashboards+:: {
+    'node-cluster-rsrc-use.json':
+      local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix;
+
+      g.dashboard('USE Method / Cluster')
+      .addRow(
+        g.row('CPU')
+        .addPanel(
+          g.panel('CPU Utilisation') +
+          g.queryPanel(|||
+            (
+              instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s}
+              *
+              instance:node_num_cpu:sum{%(nodeExporterSelector)s}
+              / ignoring (instance) group_left
+              sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
+            )
+          ||| % $._config, '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+        .addPanel(
+          // TODO: Is this a useful panel? At least there should be some explanation how load
+          // average relates to the "CPU saturation" in the title.
+          g.panel('CPU Saturation (load1 per CPU)') +
+          g.queryPanel(|||
+            (
+              instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
+              / ignoring (instance) group_left
+              count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
+            )
+          ||| % $._config, '{{instance}}', legendLink) +
+          g.stack +
+          // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+      )
+      .addRow(
+        g.row('Memory')
+        .addPanel(
+          g.panel('Memory Utilisation') +
+          g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+        .addPanel(
+          g.panel('Memory Saturation (Swapped Pages)') +
+          g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes('rps') },
+        )
+      )
+      .addRow(
+        g.row('Disk')
+        .addPanel(
+          g.panel('Disk IO Utilisation') +
+          // Full utilisation would be all disks on each node spending an average of
+          // 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
+          // TODO: Does the partition by device make sense? Using the most utilized device per
+          // instance might make more sense.
+          g.queryPanel(|||
+            (
+              instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
+              / ignoring (instance, device) group_left
+              count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s})
+            )
+          ||| % $._config, '{{instance}} {{device}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+        .addPanel(
+          g.panel('Disk IO Saturation') +
+          g.queryPanel(|||
+            (
+              instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
+              / ignoring (instance, device) group_left
+              count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s})
+            )
+          ||| % $._config, '{{instance}} {{device}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+      )
+      .addRow(
+        g.row('Network')
+        .addPanel(
+          g.panel('Net Utilisation (Bytes Receive/Transmit)') +
+          g.queryPanel(
+            [
+              'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
+              '-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
+            ],
+            ['{{instance}} Receive', '{{instance}} Transmit'],
+            legendLink,
+          ) +
+          g.stack +
+          { yaxes: g.yaxes('Bps') },
+        )
+        .addPanel(
+          g.panel('Net Saturation (Drops Receive/Transmit)') +
+          g.queryPanel(
+            [
+              'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
+              '-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
+            ],
+            ['{{instance}} Receive', '{{instance}} Transmit'],
+            legendLink,
+          ) +
+          g.stack +
+          { yaxes: g.yaxes('rps') },
+        )
+      )
+      .addRow(
+        g.row('Storage')
+        .addPanel(
+          g.panel('Disk Space Utilisation') +
+          g.queryPanel(|||
+            (
+              sum without (device) (
+                max without (fstype, mountpoint) (
+                  node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
+                )
+              )
+              / ignoring (instance) group_left
+              sum without (instance, device) (
+                max without (fstype, mountpoint) (
+                  node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
+                )
+              )
+            )
+          ||| % $._config, '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        ),
+      ),
+
+    'node-rsrc-use.json':
+      g.dashboard('USE Method / Node')
+      .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
+      .addRow(
+        g.row('CPU')
+        .addPanel(
+          g.panel('CPU Utilisation') +
+          g.queryPanel('instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+        .addPanel(
+          // TODO: Is this a useful panel? At least there should be some explanation how load
+          // average relates to the "CPU saturation" in the title.
+          g.panel('CPU Saturation (Load1)') +
+          g.queryPanel('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+      )
+      .addRow(
+        g.row('Memory')
+        .addPanel(
+          g.panel('Memory Utilisation') +
+          g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+        .addPanel(
+          g.panel('Memory Saturation (pages swapped per second)') +
+          g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Swap IO') +
+          { yaxes: g.yaxes('short') },
+        )
+      )
+      .addRow(
+        g.row('Disk')
+        .addPanel(
+          g.panel('Disk IO Utilisation') +
+          g.queryPanel('instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation {{device}}') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+        .addPanel(
+          g.panel('Disk IO Saturation') +
+          g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation {{device}}') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+      )
+      .addRow(
+        g.row('Net')
+        .addPanel(
+          g.panel('Net Utilisation (Bytes Receive/Transmit)') +
+          g.queryPanel(
+            [
+              'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
+              '-instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
+            ],
+            ['Receive', 'Transmit'],
+          ) +
+          { yaxes: g.yaxes('Bps') },
+        )
+        .addPanel(
+          g.panel('Net Saturation (Drops Receive/Transmit)') +
+          g.queryPanel(
+            [
+              'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
+              '-instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
+            ],
+            ['Receive drops', 'Transmit drops'],
+          ) +
+          { yaxes: g.yaxes('rps') },
+        )
+      )
+      .addRow(
+        g.row('Storage')
+        .addPanel(
+          g.panel('Disk Space Utilisation') +
+          g.queryPanel(|||
+            1 -
+            (
+              sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}))
+            /
+              sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}))
+            )
+          ||| % $._config, 'Disk') +
+          { yaxes: g.yaxes('percentunit') },
+        ),
+      ),
+  },
+}
diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json
new file mode 100644
index 00000000..dc92880d
--- /dev/null
+++ b/docs/node-mixin/jsonnetfile.json
@@ -0,0 +1,34 @@
+{
+  "dependencies": [
+    {
+      "name": "grafonnet",
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet-lib",
+          "subdir": "grafonnet"
+        }
+      },
+      "version": "master"
+    },
+    {
+      "name": "grafana-builder",
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/jsonnet-libs",
+          "subdir": "grafana-builder"
+        }
+      },
+      "version": "master"
+    },
+    {
+      "name": "promgrafonnet",
+      "source": {
+        "git": {
+          "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"subdir": "lib/promgrafonnet" + } + }, + "version": "master" + } + ] +} diff --git a/docs/node-mixin/mixin.libsonnet b/docs/node-mixin/mixin.libsonnet new file mode 100644 index 00000000..b9831f93 --- /dev/null +++ b/docs/node-mixin/mixin.libsonnet @@ -0,0 +1,4 @@ +(import 'config.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') diff --git a/docs/node-mixin/rules.jsonnet b/docs/node-mixin/rules.jsonnet new file mode 100644 index 00000000..dbe13f41 --- /dev/null +++ b/docs/node-mixin/rules.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusRules) diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet new file mode 100644 index 00000000..85f7618d --- /dev/null +++ b/docs/node-mixin/rules/rules.libsonnet @@ -0,0 +1,113 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'node-exporter.rules', + rules: [ + { + // This rule gives the number of CPUs per node. + record: 'instance:node_num_cpu:sum', + expr: ||| + count without (cpu) ( + count without (mode) ( + node_cpu_seconds_total{%(nodeExporterSelector)s} + ) + ) + ||| % $._config, + }, + { + // CPU utilisation is % CPU is not idle. + record: 'instance:node_cpu_utilisation:rate1m', + expr: ||| + 1 - avg without (cpu, mode) ( + rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m]) + ) + ||| % $._config, + }, + { + // This is CPU saturation: 1min avg run queue length / number of CPUs. + // Can go over 1. + // TODO: There are situation where a run queue >1/core is just normal and fine. + // We need to clarify how to read this metric and if its usage is helpful at all. + record: 'instance:node_load1_per_cpu:ratio', + expr: ||| + ( + node_load1{%(nodeExporterSelector)s} + / + instance:node_num_cpu:sum{%(nodeExporterSelector)s} + ) + ||| % $._config, + }, + { + // Memory utilisation (ratio of used memory per instance). + record: 'instance:node_memory_utilisation:ratio', + expr: ||| + 1 - ( + node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} + / + node_memory_MemTotal_bytes{%(nodeExporterSelector)s} + ) + ||| % $._config, + }, + { + record: 'instance:node_memory_swap_io_pages:rate1m', + expr: ||| + ( + rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) + + + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]) + ) + ||| % $._config, + }, + { + // Disk utilisation (seconds spent, 1 second rate). + record: 'instance_device:node_disk_io_time_seconds:rate1m', + expr: ||| + rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) + ||| % $._config, + }, + { + // Disk saturation (weighted seconds spent, 1 second rate). + record: 'instance_device:node_disk_io_time_weighted_seconds:rate1m', + expr: ||| + rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) + ||| % $._config, + }, + { + record: 'instance:node_network_receive_bytes_excluding_lo:rate1m', + expr: ||| + sum without (device) ( + rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + ) + ||| % $._config, + }, + { + record: 'instance:node_network_transmit_bytes_excluding_lo:rate1m', + expr: ||| + sum without (device) ( + rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + ) + ||| % $._config, + }, + // TODO: Find out if those drops ever happen on modern switched networks. 
+          {
+            record: 'instance:node_network_receive_drop_excluding_lo:rate1m',
+            expr: |||
+              sum without (device) (
+                rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
+              )
+            ||| % $._config,
+          },
+          {
+            record: 'instance:node_network_transmit_drop_excluding_lo:rate1m',
+            expr: |||
+              sum without (device) (
+                rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
+              )
+            ||| % $._config,
+          },
+        ],
+      },
+    ],
+  },
+}
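Because `prometheusRules`, `prometheusAlerts`, and `grafanaDashboards` are hidden, mergeable fields (`+::`), a downstream user can extend the mixin without editing these files. A minimal sketch of appending one extra recording rule group — the file name `extra-rules.jsonnet` and the added rule are hypothetical, chosen only to illustrate the mechanism:

```jsonnet
// extra-rules.jsonnet (hypothetical, for illustration only).
// Append one more recording rule group to the mixin, then render
// all rule groups as YAML, the same way rules.jsonnet does.
local mixin = (import 'mixin.libsonnet') + {
  prometheusRules+:: {
    groups+: [
      {
        name: 'node-exporter.extra-rules',
        rules: [
          {
            // Received packets per second, summed over non-loopback devices.
            record: 'instance:node_network_receive_packets_excluding_lo:rate1m',
            expr: |||
              sum without (device) (
                rate(node_network_receive_packets_total{%(nodeExporterSelector)s, device!="lo"}[1m])
              )
            ||| % $._config,
          },
        ],
      },
    ],
  },
};

std.manifestYamlDoc(mixin.prometheusRules)
```

The `%(nodeExporterSelector)s` placeholder resolves against the merged `$._config` through jsonnet's late binding, which is the same mechanism `rules/rules.libsonnet` itself relies on.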