From bafe1707f13f9da58c7a88b42f15ab596f649ba9 Mon Sep 17 00:00:00 2001
From: Tom Wilkie <tom.wilkie@gmail.com>
Date: Tue, 8 May 2018 12:10:29 +0200
Subject: [PATCH] Beginnings of a node-exporter monitoring mixin.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
---
 node-mixin/alerts/alerts.libsonnet            | 165 ++++++++++++++++
 node-mixin/config.libsonnet                   |  11 ++
 node-mixin/dashboards/dashboards.libsonnet    |   2 +
 node-mixin/dashboards/node.libsonnet          | 176 ++++++++++++++++++
 node-mixin/dashboards/use.libsonnet           | 151 +++++++++++++++
 node-mixin/jsonnetfile.json                   |  24 +++
 node-mixin/lib/promgrafonnet/gauge.libsonnet  |  60 ++++++
 .../promgrafonnet/numbersinglestat.libsonnet  |  48 +++++
 .../lib/promgrafonnet/promgrafonnet.libsonnet |   5 +
 node-mixin/mixin.libsonnet                    |   4 +
 node-mixin/rules/rules.libsonnet              | 121 ++++++++++++
 11 files changed, 767 insertions(+)
 create mode 100644 node-mixin/alerts/alerts.libsonnet
 create mode 100644 node-mixin/config.libsonnet
 create mode 100644 node-mixin/dashboards/dashboards.libsonnet
 create mode 100644 node-mixin/dashboards/node.libsonnet
 create mode 100644 node-mixin/dashboards/use.libsonnet
 create mode 100644 node-mixin/jsonnetfile.json
 create mode 100644 node-mixin/lib/promgrafonnet/gauge.libsonnet
 create mode 100644 node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet
 create mode 100644 node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet
 create mode 100644 node-mixin/mixin.libsonnet
 create mode 100644 node-mixin/rules/rules.libsonnet

diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet
new file mode 100644
index 00000000..198e22fd
--- /dev/null
+++ b/node-mixin/alerts/alerts.libsonnet
@@ -0,0 +1,165 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'node',
+        rules: [
+          {
+            alert: 'NodeFilesystemSpaceFillingUp',
+            expr: |||
+              predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
+                AND
+              node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
+                AND
+              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemSpaceFillingUp',
+            expr: |||
+              predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
+                AND
+              node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
+                AND
+              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemOutOfSpace',
+            expr: |||
+              node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
+                AND
+              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemOutOfSpace',
+            expr: |||
+              node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
+                AND
+              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemFilesFillingUp',
+            expr: |||
+              predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
+                AND
+              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
+                AND
+              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemFilesFillingUp',  // inode exhaustion predicted within 4 hours
+            expr: |||
+              predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
+                AND
+              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
+                AND
+              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',  // the 4h tier is the urgent one; the 24h tier of this alert is 'warning'
+            },
+            annotations: {
+              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemOutOfFiles',
+            expr: |||
+              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
+                AND
+              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
+            },
+          },
+          {
+            alert: 'NodeFilesystemOutOfFiles',  // expr checks free inodes (files_free), not free bytes
+            expr: |||
+              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
+                AND
+              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',  // fires below 3% free inodes
+            },
+            annotations: {
+              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
+            },
+          },
+          {
+            alert: 'NodeNetworkReceiveErrs',  // more than 10 receive errors per 2m window, sustained for 1h
+            expr: |||
+              increase(node_network_receive_errs{%(nodeExporterSelector)s}[2m]) > 10
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).',
+            },
+          },
+          {
+            alert: 'NodeNetworkTransmitErrs',  // more than 10 transmit errors per 2m window, sustained for 1h
+            expr: |||
+              increase(node_network_transmit_errs{%(nodeExporterSelector)s}[2m]) > 10
+            ||| % $._config,
+            'for': '1h',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/node-mixin/config.libsonnet b/node-mixin/config.libsonnet
new file mode 100644
index 00000000..6c5d6f74
--- /dev/null
+++ b/node-mixin/config.libsonnet
@@ -0,0 +1,11 @@
+{
+  _config+:: {
+    // Selectors are inserted between {} in Prometheus queries.
+    nodeExporterSelector: 'job="node-exporter"',  // substituted as %(nodeExporterSelector)s in rules, alerts and dashboards
+
+    // Mainly extracted because they are repetitive, but also useful to customize.
+    fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"',  // used by the filesystem alerts
+
+    grafana_prefix: '',  // prepended to the dashboard link built in dashboards/use.libsonnet
+  },
+}
diff --git a/node-mixin/dashboards/dashboards.libsonnet b/node-mixin/dashboards/dashboards.libsonnet
new file mode 100644
index 00000000..e6adbd4f
--- /dev/null
+++ b/node-mixin/dashboards/dashboards.libsonnet
@@ -0,0 +1,2 @@
+(import 'node.libsonnet') +
+(import 'use.libsonnet')
diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet
new file mode 100644
index 00000000..471c5b37
--- /dev/null
+++ b/node-mixin/dashboards/node.libsonnet
@@ -0,0 +1,176 @@
+local grafana = import 'grafonnet/grafana.libsonnet';
+local dashboard = grafana.dashboard;
+local row = grafana.row;
+local prometheus = grafana.prometheus;
+local template = grafana.template;
+local graphPanel = grafana.graphPanel;
+local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet';
+local gauge = promgrafonnet.gauge;
+
+{
+  grafanaDashboards+:: {
+    'nodes.json':
+      local idleCPU =
+        graphPanel.new(
+          'CPU Usage',  // the query below plots 100 - idle%, i.e. busy CPU per core
+          datasource='$datasource',
+          span=6,
+          format='percent',
+          max=100,
+          min=0,
+        )
+        .addTarget(prometheus.target(
+          |||
+            100 - (avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[5m])) * 100)
+          ||| % $._config,
+          legendFormat='{{cpu}}',  // one series per CPU core
+          intervalFactor=10,
+        ));
+
+      local systemLoad =
+        graphPanel.new(
+          'System load',
+          datasource='$datasource',
+          span=6,
+          format='percent',
+        )
+        .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 1m'))
+        .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 5m'))
+        .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 15m'));
+
+      local memoryGraph =
+        graphPanel.new(
+          'Memory Usage',
+          datasource='$datasource',
+          span=9,
+          format='bytes',
+        )
+        .addTarget(prometheus.target(
+          |||
+            node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
+            - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
+            - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
+            - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
+          ||| % $._config, legendFormat='memory used'
+        ))
+        .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
+        .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
+        .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
+
+      local memoryGauge = gauge.new(
+        'Memory Usage',
+        |||
+          (
+            node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
+          - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
+          - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
+          - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
+          ) * 100
+            /
+          node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
+        ||| % $._config,
+      ).withLowerBeingBetter();
+
+      local diskIO =
+        graphPanel.new(
+          'Disk I/O',
+          datasource='$datasource',
+          span=9,
+        )
+        .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_read{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='read'))
+        .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_written{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='written'))
+        .addTarget(prometheus.target('sum by (instance) (rate(node_disk_io_time_ms{%(nodeExporterSelector)s,  instance="$instance"}[2m]))' % $._config, legendFormat='io time')) +
+        {
+          seriesOverrides: [
+            {
+              alias: 'read',
+              yaxis: 1,
+            },
+            {
+              alias: 'io time',
+              yaxis: 2,
+            },
+          ],
+          yaxes: [
+            self.yaxe(format='bytes'),
+            self.yaxe(format='ms'),
+          ],
+        };
+
+      local diskSpaceUsage = gauge.new(
+        'Disk Space Usage',
+        |||
+          (
+            sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
+          - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
+          ) * 100
+            /
+          sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
+        ||| % $._config,
+      ).withLowerBeingBetter();
+
+      local networkReceived =
+        graphPanel.new(
+          'Network Received',
+          datasource='$datasource',
+          span=6,
+          format='bytes',
+        )
+        .addTarget(prometheus.target('rate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}'));
+
+      local networkTransmitted =
+        graphPanel.new(
+          'Network Transmitted',
+          datasource='$datasource',
+          span=6,
+          format='bytes',
+        )
+        .addTarget(prometheus.target('rate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}'));
+
+      dashboard.new('Nodes', time_from='now-1h')
+      .addTemplate(
+        {
+          current: {
+            text: 'Prometheus',
+            value: 'Prometheus',
+          },
+          hide: 0,
+          label: null,
+          name: 'datasource',
+          options: [],
+          query: 'prometheus',
+          refresh: 1,
+          regex: '',
+          type: 'datasource',
+        },
+      )
+      .addTemplate(
+        template.new(
+          'instance',
+          '$datasource',
+          'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config,
+          refresh='time',
+        )
+      )
+      .addRow(
+        row.new()
+        .addPanel(idleCPU)
+        .addPanel(systemLoad)
+      )
+      .addRow(
+        row.new()
+        .addPanel(memoryGraph)
+        .addPanel(memoryGauge)
+      )
+      .addRow(
+        row.new()
+        .addPanel(diskIO)
+        .addPanel(diskSpaceUsage)
+      )
+      .addRow(
+        row.new()
+        .addPanel(networkReceived)
+        .addPanel(networkTransmitted)
+      ),
+  },
+}
diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet
new file mode 100644
index 00000000..526002f6
--- /dev/null
+++ b/node-mixin/dashboards/use.libsonnet
@@ -0,0 +1,151 @@
+local g = import 'grafana-builder/grafana.libsonnet';
+
+{
+  grafanaDashboards+:: {
+    'node-cluster-rsrc-use.json':
+      local legendLink = '%s/dashboard/file/k8s-node-rsrc-use.json' % $._config.grafana_prefix;
+
+      g.dashboard('USE Method / Cluster')
+      .addRow(
+        g.row('CPU')
+        .addPanel(
+          g.panel('CPU Utilisation') +
+          g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+        .addPanel(
+          g.panel('CPU Saturation (Load1)') +
+          g.queryPanel(|||
+            instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s}))
+          ||| % $._config, '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+      )
+      .addRow(
+        g.row('Memory')
+        .addPanel(
+          g.panel('Memory Utilisation') +
+          g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+        .addPanel(
+          g.panel('Memory Saturation (Swap I/O)') +
+          g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes('Bps') },
+        )
+      )
+      .addRow(
+        g.row('Disk')
+        .addPanel(
+          g.panel('Disk IO Utilisation') +
+          // Full utilisation would be all disks on each node spending an average of
+          // 1 sec per second doing I/O, normalize by node count for stacked charts
+          g.queryPanel(|||
+            instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s}))
+          ||| % $._config, '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+        .addPanel(
+          g.panel('Disk IO Saturation') +
+          g.queryPanel(|||
+            instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s}))
+          ||| % $._config, '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        )
+      )
+      .addRow(
+        g.row('Network')
+        .addPanel(
+          g.panel('Net Utilisation (Transmitted)') +
+          g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes('Bps') },
+        )
+        .addPanel(
+          g.panel('Net Saturation (Dropped)') +
+          g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes('Bps') },
+        )
+      )
+      .addRow(
+        g.row('Storage')
+        .addPanel(
+          g.panel('Disk Capacity') +
+          g.queryPanel('sum(max(node_filesystem_size{fstype=~"ext[24]"} - node_filesystem_free{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) +
+          g.stack +
+          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
+        ),
+      ),
+
+    'k8s-node-rsrc-use.json':
+      g.dashboard('K8s / USE Method / Node')
+      .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
+      .addRow(
+        g.row('CPU')
+        .addPanel(
+          g.panel('CPU Utilisation') +
+          g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+        .addPanel(
+          g.panel('CPU Saturation (Load1)') +
+          g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+      )
+      .addRow(
+        g.row('Memory')
+        .addPanel(
+          g.panel('Memory Utilisation') +
+          g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+        .addPanel(
+          g.panel('Memory Saturation (Swap I/O)') +
+          g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') +
+          { yaxes: g.yaxes('Bps') },
+        )
+      )
+      .addRow(
+        g.row('Disk')
+        .addPanel(
+          g.panel('Disk IO Utilisation') +
+          g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+        .addPanel(
+          g.panel('Disk IO Saturation') +
+          g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+      )
+      .addRow(
+        g.row('Net')
+        .addPanel(
+          g.panel('Net Utilisation (Transmitted)') +
+          g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
+          { yaxes: g.yaxes('Bps') },
+        )
+        .addPanel(
+          g.panel('Net Saturation (Dropped)') +
+          g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') +
+          { yaxes: g.yaxes('Bps') },
+        )
+      )
+      .addRow(
+        g.row('Disk')
+        .addPanel(
+          g.panel('Disk Utilisation') +  // scoped to the node chosen in the $instance template, like the other panels
+          g.queryPanel('1 - sum(max by (device, instance) (node_filesystem_free{fstype=~"ext[24]", instance="$instance"})) / sum(max by (device, instance) (node_filesystem_size{fstype=~"ext[24]", instance="$instance"}))', 'Disk') +
+          { yaxes: g.yaxes('percentunit') },
+        ),
+      ),
+  },
+}
diff --git a/node-mixin/jsonnetfile.json b/node-mixin/jsonnetfile.json
new file mode 100644
index 00000000..45326aad
--- /dev/null
+++ b/node-mixin/jsonnetfile.json
@@ -0,0 +1,24 @@
+{
+    "dependencies": [
+        {
+            "name": "grafonnet",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/grafana/grafonnet-lib",
+                    "subdir": "grafonnet"
+                }
+            },
+            "version": "master"
+        },
+        {
+            "name": "grafana-builder",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/kausalco/public",
+                    "subdir": "grafana-builder"
+                }
+            },
+            "version": "master"
+        }
+    ]
+}
diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet
new file mode 100644
index 00000000..ea6c1ab6
--- /dev/null
+++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet
@@ -0,0 +1,60 @@
+local grafana = import 'grafonnet/grafana.libsonnet';
+local singlestat = grafana.singlestat;
+local prometheus = grafana.prometheus;
+
+{
+  new(title, query, datasource='$datasource')::  // percent gauge panel; datasource defaults to the dashboard template variable
+    singlestat.new(
+      title,
+      datasource=datasource,
+      span=3,
+      format='percent',
+      valueName='current',
+      colors=[
+        'rgba(245, 54, 54, 0.9)',
+        'rgba(237, 129, 40, 0.89)',
+        'rgba(50, 172, 45, 0.97)',
+      ],
+      thresholds='50, 80',
+      valueMaps=[
+        {
+          op: '=',
+          text: 'N/A',  // text shown in place of a null value
+          value: 'null',
+        },
+      ],
+    )
+    .addTarget(
+      prometheus.target(
+        query
+      )
+    ) + {
+      gauge: {
+        maxValue: 100,
+        minValue: 0,
+        show: true,
+        thresholdLabels: false,
+        thresholdMarkers: true,
+      },
+      withTextNullValue(text):: self {  // override the text shown for null values
+        valueMaps: [
+          {
+            op: '=',
+            text: text,
+            value: 'null',
+          },
+        ],
+      },
+      withSpanSize(size):: self {  // override the panel span
+        span: size,
+      },
+      withLowerBeingBetter():: self {  // reversed colour order, with thresholds moved to 80/90
+        colors: [
+          'rgba(50, 172, 45, 0.97)',
+          'rgba(237, 129, 40, 0.89)',
+          'rgba(245, 54, 54, 0.9)',
+        ],
+        thresholds: '80, 90',
+      },
+    },
+}
diff --git a/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet
new file mode 100644
index 00000000..bc1d6f6f
--- /dev/null
+++ b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet
@@ -0,0 +1,48 @@
+local grafana = import 'grafonnet/grafana.libsonnet';
+local singlestat = grafana.singlestat;
+local prometheus = grafana.prometheus;
+
+{
+  new(title, query, datasource='$datasource')::  // numeric singlestat panel; datasource defaults to the dashboard template variable
+    singlestat.new(
+      title,
+      datasource=datasource,
+      span=3,
+      valueName='current',
+      valueMaps=[
+        {
+          op: '=',
+          text: '0',  // a null value is displayed as 0
+          value: 'null',
+        },
+      ],
+    )
+    .addTarget(
+      prometheus.target(
+        query
+      )
+    ) + {
+      withTextNullValue(text):: self {  // override the text shown for null values
+        valueMaps: [
+          {
+            op: '=',
+            text: text,
+            value: 'null',
+          },
+        ],
+      },
+      withSpanSize(size):: self {  // override the panel span
+        span: size,
+      },
+      withPostfix(postfix):: self {  // set the panel's postfix field
+        postfix: postfix,
+      },
+      withSparkline():: self {  // enable the sparkline with fixed line and fill colours
+        sparkline: {
+          show: true,
+          lineColor: 'rgb(31, 120, 193)',
+          fillColor: 'rgba(31, 118, 189, 0.18)',
+        },
+      },
+    },
+}
diff --git a/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet
new file mode 100644
index 00000000..013ff42b
--- /dev/null
+++ b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet
@@ -0,0 +1,5 @@
+{
+  numbersinglestat:: import 'numbersinglestat.libsonnet',
+  gauge:: import 'gauge.libsonnet',
+  // percentlinegraph is not exported: percentlinegraph.libsonnet is not added alongside this file.
+}
diff --git a/node-mixin/mixin.libsonnet b/node-mixin/mixin.libsonnet
new file mode 100644
index 00000000..b9831f93
--- /dev/null
+++ b/node-mixin/mixin.libsonnet
@@ -0,0 +1,4 @@
+(import 'config.libsonnet') +
+(import 'alerts/alerts.libsonnet') +
+(import 'dashboards/dashboards.libsonnet') +
+(import 'rules/rules.libsonnet')
diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet
new file mode 100644
index 00000000..ad1cc09b
--- /dev/null
+++ b/node-mixin/rules/rules.libsonnet
@@ -0,0 +1,121 @@
+{
+  prometheusRules+:: {
+    groups+: [
+      {
+        name: 'node.rules',
+        rules: [
+          {
+            // This rule gives the number of CPUs per node.
+            record: 'instance:node_num_cpu:sum',
+            expr: |||
+              count by (instance) (
+                sum by (instance, cpu) (
+                  node_cpu{%(nodeExporterSelector)s}
+                )
+              )
+            ||| % $._config,
+          },
+          {
+            // CPU utilisation is the fraction of CPU time that is not idle.
+            record: 'instance:node_cpu_utilisation:avg1m',
+            expr: |||
+              1 - avg by (instance) (
+                rate(node_cpu{%(nodeExporterSelector)s,mode="idle"}[1m])
+              )
+            ||| % $._config,
+          },
+          {
+            // CPU saturation is 1min avg run queue length / number of CPUs.
+            // Can go over 100%.  >100% is bad.
+            record: 'instance:node_cpu_saturation_load1:',
+            expr: |||
+              sum by (instance) (
+                node_load1{%(nodeExporterSelector)s}
+              )
+              /
+              instance:node_num_cpu:sum
+            ||| % $._config,
+          },
+          {
+            // Available memory per node
+            record: 'instance:node_memory_bytes_available:sum',
+            expr: |||
+              sum by (instance) (
+                (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s})
+              )
+            ||| % $._config,
+          },
+          {
+            // Total memory per node
+            record: 'instance:node_memory_bytes_total:sum',
+            expr: |||
+              sum by (instance) (
+                node_memory_MemTotal{%(nodeExporterSelector)s}
+              )
+            ||| % $._config,
+          },
+          {
+            // Memory utilisation per node, normalized by per-node memory
+            record: 'instance:node_memory_utilisation:ratio',
+            expr: |||
+              (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum)
+              /
+              scalar(sum(instance:node_memory_bytes_total:sum))
+            |||,
+          },
+          {
+            record: 'instance:node_memory_utilisation:',
+            expr: |||
+              1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum)
+            ||| % $._config,
+          },
+          {
+            record: 'instance:node_memory_swap_io_bytes:sum_rate',
+            expr: |||
+              1e3 * sum by (instance) (
+                (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
+                 + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]))
+              )
+            ||| % $._config,
+          },
+          {
+            // Disk utilisation: ms of I/O time per second (irate() of the ms counter), scaled by 1e3 to a ratio.
+            record: 'instance:node_disk_utilisation:avg_irate',
+            expr: |||
+              avg by (instance) (
+                irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3
+              )
+            ||| % $._config,
+          },
+          {
+            // Disk saturation: weighted ms of I/O time per second (irate() of the counter), scaled by 1e3.
+            record: 'instance:node_disk_saturation:avg_irate',
+            expr: |||
+              avg by (instance) (
+                irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3
+              )
+            ||| % $._config,
+          },
+          {
+            record: 'instance:node_net_utilisation:sum_irate',
+            expr: |||
+              sum by (instance) (
+                (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) +
+                 irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]))
+              )
+            ||| % $._config,
+          },
+          {
+            record: 'instance:node_net_saturation:sum_irate',
+            expr: |||
+              sum by (instance) (
+                (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) +
+                 irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m]))
+              )
+            ||| % $._config,
+          },
+        ],
+      },
+    ],
+  },
+}