Merge branch 'mixin' of git://github.com/tomwilkie/node_exporter into beorn7/mixin

pull/1429/head
beorn7 5 years ago
commit 61bcc5b468

@@ -0,0 +1,3 @@
/jsonnetfile.lock.json
/vendor/

@@ -0,0 +1,165 @@
{
  // Alerting rules for node_exporter filesystem and network health.
  // All selectors are injected from $._config (see config.libsonnet).
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'node-exporter',
        rules: [
          {
            // Warning: extrapolated to run out of space within 24h while
            // already more than 60% full (read-write filesystems only).
            alert: 'NodeFilesystemSpaceFillingUp',
            expr: |||
              predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
              and
              node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
              and
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.',
            },
          },
          {
            // Critical: extrapolated to run out of space within 4h while
            // already more than 80% full.
            alert: 'NodeFilesystemSpaceFillingUp',
            expr: |||
              predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
              and
              node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
              and
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.',
            },
          },
          {
            // Warning: less than 5% space left.
            alert: 'NodeFilesystemOutOfSpace',
            expr: |||
              node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
              and
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
            },
          },
          {
            // Critical: less than 3% space left.
            alert: 'NodeFilesystemOutOfSpace',
            expr: |||
              node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
              and
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
            },
          },
          {
            // Warning: extrapolated to run out of inodes within 24h.
            alert: 'NodeFilesystemFilesFillingUp',
            expr: |||
              predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
              and
              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
              and
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.',
            },
          },
          {
            // Critical: extrapolated to run out of inodes within 4h.
            alert: 'NodeFilesystemFilesFillingUp',
            expr: |||
              predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
              and
              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
              and
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.',
            },
          },
          {
            // Warning: less than 5% inodes left.
            alert: 'NodeFilesystemOutOfFiles',
            expr: |||
              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
              and
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
            },
          },
          {
            // BUG FIX: this rule was named 'NodeFilesystemOutOfSpace' and its
            // message said "space", but the expression checks inodes
            // (node_filesystem_files_free). It is the critical counterpart of
            // NodeFilesystemOutOfFiles above.
            alert: 'NodeFilesystemOutOfFiles',
            expr: |||
              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
              and
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
            },
          },
          {
            // FIX: scope the metric with the node-exporter job selector; the
            // expr is already formatted with $._config but used no placeholder,
            // so it matched every job's series.
            alert: 'NodeNetworkReceiveErrs',
            expr: |||
              increase(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) > 10
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).',
            },
          },
          {
            // FIX: same selector scoping as NodeNetworkReceiveErrs.
            alert: 'NodeNetworkTransmitErrs',
            expr: |||
              increase(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) > 10
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).',
            },
          },
        ],
      },
    ],
  },
}

@@ -0,0 +1,11 @@
{
  // Central configuration for the node-exporter mixin; every alert, rule and
  // dashboard pulls its label selectors from here via `% $._config`.
  _config+:: {
    // Selectors are inserted between {} in Prometheus queries.
    nodeExporterSelector: 'job="node-exporter"',
    // Mainly extracted because they are repetitive, but also useful to customize.
    fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"',
    // Prepended to inter-dashboard links, e.g. when Grafana is served under a
    // URL sub-path. Empty by default.
    grafana_prefix: '',
  },
}

@@ -0,0 +1,2 @@
// Aggregates all grafanaDashboards definitions of the node-exporter mixin
// (the single-node dashboard and the two USE Method dashboards).
(import 'node.libsonnet') +
(import 'use.libsonnet')

@@ -0,0 +1,170 @@
local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet';
local gauge = promgrafonnet.gauge;
{
  grafanaDashboards+:: {
    // Single-node overview dashboard: CPU, load, memory, disk and network,
    // filtered by the $instance template variable.
    'nodes.json':
      // Per-CPU idle fraction; dips indicate busy cores.
      local idleCPU =
        graphPanel.new(
          'Idle CPU',
          datasource='$datasource',
          span=6,
          format='percentunit',
          // NOTE(review): max=100 with format 'percentunit' (0-1 scale) looks
          // inconsistent — confirm the intended axis range.
          max=100,
          min=0,
        )
        .addTarget(prometheus.target(
          |||
            1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
          ||| % $._config,
          legendFormat='{{cpu}}',
          intervalFactor=10,
        ));

      local systemLoad =
        graphPanel.new(
          'System load',
          datasource='$datasource',
          span=6,
          // NOTE(review): load averages are not percentages — confirm the
          // 'percentunit' format choice.
          format='percentunit',
        )
        .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m'))
        .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m'))
        .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m'));

      // Stacked bytes view: used / buffers / cached / free.
      local memoryGraph =
        graphPanel.new(
          'Memory Usage',
          datasource='$datasource',
          span=9,
          format='bytes',
        )
        .addTarget(prometheus.target(
          |||
            node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
          ||| % $._config, legendFormat='memory used'
        ))
        .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
        .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
        .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));

      // BUG FIX: the gauge is titled 'Memory Usage' and styled
      // lower-is-better, but the original query returned the AVAILABLE
      // fraction (MemAvailable / MemTotal). Invert it so the gauge shows the
      // used fraction, matching the diskSpaceUsage gauge below.
      local memoryGauge = gauge.new(
        'Memory Usage',
        |||
          1 - (
            node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"}
            /
            node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
          )
        ||| % $._config,
      ).withLowerBeingBetter();

      local diskIO =
        graphPanel.new(
          'Disk I/O',
          datasource='$datasource',
          span=9,
        )
        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
        {
          seriesOverrides: [
            {
              alias: 'read',
              yaxis: 1,
            },
            {
              // io time is in ms, so it gets the right-hand axis.
              alias: 'io time',
              yaxis: 2,
            },
          ],
          yaxes: [
            self.yaxe(format='bytes'),
            self.yaxe(format='ms'),
          ],
        };

      // BUG FIX: the original query had unbalanced parentheses (neither sum()
      // call was closed), which is invalid PromQL. Balanced so the gauge
      // shows the used fraction across all non-rootfs filesystems.
      local diskSpaceUsage = gauge.new(
        'Disk Space Usage',
        |||
          1 - (
            sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
            /
            sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
          )
        ||| % $._config,
      ).withLowerBeingBetter();

      local networkReceived =
        graphPanel.new(
          'Network Received',
          datasource='$datasource',
          span=6,
          format='bytes',
        )
        .addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));

      local networkTransmitted =
        graphPanel.new(
          'Network Transmitted',
          datasource='$datasource',
          span=6,
          format='bytes',
        )
        .addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));

      dashboard.new('Nodes', time_from='now-1h')
      .addTemplate(
        // Data-source picker template variable.
        {
          current: {
            text: 'Prometheus',
            value: 'Prometheus',
          },
          hide: 0,
          label: null,
          name: 'datasource',
          options: [],
          query: 'prometheus',
          refresh: 1,
          regex: '',
          type: 'datasource',
        },
      )
      .addTemplate(
        // Instance picker, populated from node_boot_time.
        template.new(
          'instance',
          '$datasource',
          'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config,
          refresh='time',
        )
      )
      .addRow(
        row.new()
        .addPanel(idleCPU)
        .addPanel(systemLoad)
      )
      .addRow(
        row.new()
        .addPanel(memoryGraph)
        .addPanel(memoryGauge)
      )
      .addRow(
        row.new()
        .addPanel(diskIO)
        .addPanel(diskSpaceUsage)
      )
      .addRow(
        row.new()
        .addPanel(networkReceived)
        .addPanel(networkTransmitted)
      ),
  },
}

@@ -0,0 +1,151 @@
local g = import 'grafana-builder/grafana.libsonnet';
{
  grafanaDashboards+:: {
    // Cluster-level USE Method dashboard: per-instance utilisation and
    // saturation, normalized by node count so stacked series sum to 1.
    'node-cluster-rsrc-use.json':
      local legendLink = '%s/dashboard/file/k8s-node-rsrc-use.json' % $._config.grafana_prefix;
      g.dashboard('USE Method / Cluster')
      .addRow(
        g.row('CPU')
        .addPanel(
          g.panel('CPU Utilisation') +
          g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
        .addPanel(
          g.panel('CPU Saturation (Load1)') +
          g.queryPanel(|||
            instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
      )
      .addRow(
        g.row('Memory')
        .addPanel(
          g.panel('Memory Utilisation') +
          g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
        .addPanel(
          g.panel('Memory Saturation (Swap I/O)') +
          g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes('Bps') },
        )
      )
      .addRow(
        g.row('Disk')
        .addPanel(
          g.panel('Disk IO Utilisation') +
          // Full utilisation would be all disks on each node spending an average of
          // 1 sec per second doing I/O, normalize by node count for stacked charts
          g.queryPanel(|||
            instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
        .addPanel(
          g.panel('Disk IO Saturation') +
          g.queryPanel(|||
            instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
      )
      .addRow(
        g.row('Network')
        .addPanel(
          g.panel('Net Utilisation (Transmitted)') +
          g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes('Bps') },
        )
        .addPanel(
          g.panel('Net Saturation (Dropped)') +
          g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes('Bps') },
        )
      )
      .addRow(
        g.row('Storage')
        .addPanel(
          g.panel('Disk Capacity') +
          g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        ),
      ),
    // Single-node USE Method dashboard, scoped by the $instance variable.
    'node-rsrc-use.json':
      g.dashboard('USE Method / Node')
      .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
      .addRow(
        g.row('CPU')
        .addPanel(
          g.panel('CPU Utilisation') +
          g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') +
          { yaxes: g.yaxes('percentunit') },
        )
        .addPanel(
          g.panel('CPU Saturation (Load1)') +
          g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') +
          { yaxes: g.yaxes('percentunit') },
        )
      )
      .addRow(
        g.row('Memory')
        .addPanel(
          g.panel('Memory Utilisation') +
          g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') +
          { yaxes: g.yaxes('percentunit') },
        )
        .addPanel(
          g.panel('Memory Saturation (Swap I/O)') +
          g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') +
          { yaxes: g.yaxes('Bps') },
        )
      )
      .addRow(
        g.row('Disk')
        .addPanel(
          g.panel('Disk IO Utilisation') +
          g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
          { yaxes: g.yaxes('percentunit') },
        )
        .addPanel(
          g.panel('Disk IO Saturation') +
          g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') +
          { yaxes: g.yaxes('percentunit') },
        )
      )
      .addRow(
        g.row('Net')
        .addPanel(
          g.panel('Net Utilisation (Transmitted)') +
          g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
          { yaxes: g.yaxes('Bps') },
        )
        .addPanel(
          g.panel('Net Saturation (Dropped)') +
          g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') +
          { yaxes: g.yaxes('Bps') },
        )
      )
      .addRow(
        // FIX: this was a second row titled 'Disk', colliding with the IO row
        // above; renamed to 'Storage' for uniqueness and consistency with the
        // cluster dashboard.
        g.row('Storage')
        .addPanel(
          g.panel('Disk Utilisation') +
          g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') +
          { yaxes: g.yaxes('percentunit') },
        ),
      ),
  },
}

@@ -0,0 +1,24 @@
{
"dependencies": [
{
"name": "grafonnet",
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib",
"subdir": "grafonnet"
}
},
"version": "master"
},
{
"name": "grafana-builder",
"source": {
"git": {
"remote": "https://github.com/kausalco/public",
"subdir": "grafana-builder"
}
},
"version": "master"
}
]
}

@@ -0,0 +1,60 @@
local grafana = import 'grafonnet/grafana.libsonnet';
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;
{
  // Constructor for a percent-gauge singlestat panel.
  // `title`: panel title; `query`: PromQL expression expected to yield a
  // 0-1 ratio (format is 'percentunit').
  new(title, query)::
    singlestat.new(
      title,
      datasource='$datasource',
      span=3,
      format='percentunit',
      valueName='current',
      // Default color ramp: low values red, high values green
      // ("higher is better").
      colors=[
        'rgba(245, 54, 54, 0.9)',
        'rgba(237, 129, 40, 0.89)',
        'rgba(50, 172, 45, 0.97)',
      ],
      thresholds='50, 80',
      // Show 'N/A' when the query returns no data.
      valueMaps=[
        {
          op: '=',
          text: 'N/A',
          value: 'null',
        },
      ],
    )
    .addTarget(
      prometheus.target(
        query
      )
    ) + {
      gauge: {
        // NOTE(review): maxValue is 100 while the format is 'percentunit'
        // (0-1 scale) — confirm the intended value scale.
        maxValue: 100,
        minValue: 0,
        show: true,
        thresholdLabels: false,
        thresholdMarkers: true,
      },
      // Override the text shown for null values.
      withTextNullValue(text):: self {
        valueMaps: [
          {
            op: '=',
            text: text,
            value: 'null',
          },
        ],
      },
      // Override the panel width (in grid spans).
      withSpanSize(size):: self {
        span: size,
      },
      // Invert the color ramp for metrics where low values are healthy
      // (e.g. usage ratios): high values become red.
      withLowerBeingBetter():: self {
        colors: [
          'rgba(50, 172, 45, 0.97)',
          'rgba(237, 129, 40, 0.89)',
          'rgba(245, 54, 54, 0.9)',
        ],
        thresholds: '80, 90',
      },
    },
}

@@ -0,0 +1,48 @@
local grafana = import 'grafonnet/grafana.libsonnet';
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;
{
  // Constructor for a plain single-number stat panel.
  // `title`: panel title; `query`: PromQL expression for the current value.
  new(title, query)::
    singlestat.new(
      title,
      // FIX: use the dashboard's $datasource template variable instead of the
      // hard-coded 'prometheus' datasource name, for consistency with
      // gauge.libsonnet and the dashboards' datasource picker.
      datasource='$datasource',
      span=3,
      valueName='current',
      // Show '0' when the query returns no data.
      valueMaps=[
        {
          op: '=',
          text: '0',
          value: 'null',
        },
      ],
    )
    .addTarget(
      prometheus.target(
        query
      )
    ) + {
      // Override the text shown for null values.
      withTextNullValue(text):: self {
        valueMaps: [
          {
            op: '=',
            text: text,
            value: 'null',
          },
        ],
      },
      // Override the panel width (in grid spans).
      withSpanSize(size):: self {
        span: size,
      },
      // Append a unit/suffix after the displayed value.
      withPostfix(postfix):: self {
        postfix: postfix,
      },
      // Enable an inline sparkline below the number.
      withSparkline():: self {
        sparkline: {
          show: true,
          lineColor: 'rgb(31, 120, 193)',
          fillColor: 'rgba(31, 118, 189, 0.18)',
        },
      },
    },
}

@@ -0,0 +1,5 @@
{
  // Re-exports the promgrafonnet panel helpers under one namespace.
  numbersinglestat:: import 'numbersinglestat.libsonnet',
  gauge:: import 'gauge.libsonnet',
  // NOTE(review): percentlinegraph.libsonnet is referenced but not visible in
  // this change set — confirm the file exists alongside the others.
  percentlinegraph:: import 'percentlinegraph.libsonnet',
}

@@ -0,0 +1,4 @@
// Mixin entry point: merges the configuration, alerting rules, dashboards
// and recording rules into a single object.
(import 'config.libsonnet') +
(import 'alerts/alerts.libsonnet') +
(import 'dashboards/dashboards.libsonnet') +
(import 'rules/rules.libsonnet')

@@ -0,0 +1,106 @@
{
  // Recording rules backing the USE Method dashboards; names follow the
  // Prometheus level:metric:operation convention.
  prometheusRules+:: {
    groups+: [
      {
        name: 'node-exporter.rules',
        rules: [
          {
            // This rule gives the number of CPUs per node.
            record: 'instance:node_num_cpu:sum',
            expr: |||
              count by (instance) (
                sum by (instance, cpu) (
                  node_cpu_seconds_total{%(nodeExporterSelector)s}
                )
              )
            ||| % $._config,
          },
          {
            // CPU utilisation is % CPU is not idle.
            record: 'instance:node_cpu_utilisation:avg1m',
            expr: |||
              1 - avg by (instance) (
                rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m])
              )
            ||| % $._config,
          },
          {
            // CPU saturation is 1min avg run queue length / number of CPUs.
            // Can go over 100%. >100% is bad.
            record: 'instance:node_cpu_saturation_load1:',
            expr: |||
              sum by (instance) (node_load1{%(nodeExporterSelector)s})
              /
              instance:node_num_cpu:sum
            ||| % $._config,
          },
          {
            // Total memory per node, in bytes.
            record: 'instance:node_memory_bytes_total:sum',
            expr: |||
              sum by (instance) (
                node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
              )
            ||| % $._config,
          },
          {
            // Memory utilisation per node: fraction of memory NOT available.
            record: 'instance:node_memory_utilisation:ratio',
            expr: |||
              1 - (
                node_memory_MemAvailable_bytes{%(nodeExporterSelector)s}
                /
                node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
              )
            ||| % $._config,
          },
          {
            // Paging I/O rate, scaled by 1e3 (vmstat counters are in KB).
            // NOTE(review): pgpgin/pgpgout count ALL paging I/O, not only
            // swap; pswpin/pswpout would match the 'swap' name — confirm
            // which is intended.
            record: 'instance:node_memory_swap_io_bytes:sum_rate',
            expr: |||
              1e3 * sum by (instance) (
                (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
                + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]))
              )
            ||| % $._config,
          },
          {
            // Disk utilisation: seconds spent doing I/O per second (the
            // metric is in seconds — not ms — so irate() is bounded by 1 per
            // device).
            record: 'instance:node_disk_utilisation:sum_irate',
            expr: |||
              sum by (instance) (
                irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
              )
            ||| % $._config,
          },
          {
            // Disk saturation: weighted I/O time in seconds per second; can
            // exceed 1 when requests queue up.
            record: 'instance:node_disk_saturation:sum_irate',
            expr: |||
              sum by (instance) (
                irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
              )
            ||| % $._config,
          },
          {
            // Network utilisation: bytes received + transmitted per second
            // on eth* interfaces.
            record: 'instance:node_net_utilisation:sum_irate',
            expr: |||
              sum by (instance) (
                (irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) +
                irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]))
              )
            ||| % $._config,
          },
          {
            // Network saturation: packets dropped per second on eth*
            // interfaces.
            record: 'instance:node_net_saturation:sum_irate',
            expr: |||
              sum by (instance) (
                (irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) +
                irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]))
              )
            ||| % $._config,
          },
        ],
      },
    ],
  },
}
Loading…
Cancel
Save