prometheusmetricshost-metricsmachine-metricsnode-metricsprocfsprometheus-exportersystem-informationsystem-metrics
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
466 lines
24 KiB
466 lines
24 KiB
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; |
|
local dashboard = grafana.dashboard; |
|
local row = grafana.row; |
|
local prometheus = grafana.prometheus; |
|
local template = grafana.template; |
|
local graphPanel = grafana.graphPanel; |
|
|
|
local c = import '../config.libsonnet'; |
|
|
|
// Dashboard variable of type 'datasource' restricted to Prometheus sources.
// Panels and the other templates in this file refer to it as '$datasource'.
local datasourceTemplate = {
  type: 'datasource',
  name: 'datasource',
  label: 'Data Source',
  query: 'prometheus',
  regex: '',
  options: [],
  // Pre-selected entry shown before the viewer picks anything.
  current: { text: 'default', value: 'default' },
  hide: 0,
  refresh: 1,
};
|
|
|
// Half-width stacked graph for CPU utilisation. Targets are attached by each
// dashboard below; this only fixes the shared look of the panel.
local CPUUtilisation =
  graphPanel.new(
    'CPU Utilisation',
    span=6,
    datasource='$datasource',
    format='percentunit',
    fill=10,
    stack=true,
    legend_show=false,
  ) + { tooltip+: { sort: 2 } };
|
|
|
// Half-width stacked graph for the "Load1 per CPU" saturation signal.
local CPUSaturation =
  // TODO: Is this panel useful? At the very least it should explain how load
  // average relates to the "CPU saturation" named in the title.
  graphPanel.new(
    'CPU Saturation (Load1 per CPU)',
    span=6,
    datasource='$datasource',
    format='percentunit',
    fill=10,
    stack=true,
    legend_show=false,
  ) + { tooltip+: { sort: 2 } };
|
|
|
// Half-width stacked graph for memory utilisation; targets are added by the
// dashboards that embed it.
local memoryUtilisation =
  graphPanel.new(
    'Memory Utilisation',
    span=6,
    datasource='$datasource',
    format='percentunit',
    fill=10,
    stack=true,
    legend_show=false,
  ) + { tooltip+: { sort: 2 } };
|
|
|
// Half-width stacked graph for memory saturation, expressed as the rate of
// major page faults. Targets are added by the dashboards that embed it.
local memorySaturation =
  graphPanel.new(
    'Memory Saturation (Major Page Faults)',
    datasource='$datasource',
    span=6,
    // 'rds' is not a Grafana unit id; 'rps' is the built-in reads/sec unit,
    // which is what the original abbreviation appears to have meant.
    format='rps',
    stack=true,
    fill=10,
    legend_show=false,
  ) { tooltip+: { sort: 2 } };
|
|
|
// Half-width graph of network throughput in bytes/sec. Series whose legend
// contains "Receive" and "Transmit" are stacked separately, and the transmit
// side is mirrored below the X axis.
local networkUtilisation =
  graphPanel.new(
    'Network Utilisation (Bytes Receive/Transmit)',
    span=6,
    datasource='$datasource',
    format='Bps',
    fill=10,
    stack=true,
    legend_show=false,
  )
  .addSeriesOverride({ alias: '/Receive/', stack: 'A' })
  .addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' })
  + { tooltip+: { sort: 2 } };
|
|
|
// Half-width graph of network saturation, expressed as the rate of dropped
// packets. Receive and transmit series are stacked separately and the
// transmit side is mirrored below the X axis.
local networkSaturation =
  graphPanel.new(
    'Network Saturation (Drops Receive/Transmit)',
    datasource='$datasource',
    span=6,
    // Drop counters count packets, so show packets/sec rather than bytes/sec.
    format='pps',
    stack=true,
    fill=10,
    legend_show=false,
  )
  // Match on the bare word (no leading space) so the overrides apply both to
  // the per-node legends ('Receive' / 'Transmit') and to the prefixed ones
  // ('{{instance}} Receive', ...), the same way networkUtilisation does.
  .addSeriesOverride({ alias: '/Receive/', stack: 'A' })
  .addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' })
  { tooltip+: { sort: 2 } };
|
|
|
// Half-width stacked graph for disk IO utilisation; targets are added by the
// dashboards that embed it.
local diskIOUtilisation =
  graphPanel.new(
    'Disk IO Utilisation',
    span=6,
    datasource='$datasource',
    format='percentunit',
    fill=10,
    stack=true,
    legend_show=false,
  ) + { tooltip+: { sort: 2 } };
|
|
|
// Half-width stacked graph for disk IO saturation; targets are added by the
// dashboards that embed it.
local diskIOSaturation =
  graphPanel.new(
    'Disk IO Saturation',
    span=6,
    datasource='$datasource',
    format='percentunit',
    fill=10,
    stack=true,
    legend_show=false,
  ) + { tooltip+: { sort: 2 } };
|
|
|
// Full-width (span 12) stacked graph for disk space utilisation; targets are
// added by the dashboards that embed it.
local diskSpaceUtilisation =
  graphPanel.new(
    'Disk Space Utilisation',
    span=12,
    datasource='$datasource',
    format='percentunit',
    fill=10,
    stack=true,
    legend_show=false,
  ) + { tooltip+: { sort: 2 } };
|
|
|
{ |
|
// Hidden field holding the 'cluster' dashboard variable. Its values come from
// the configured cluster label on node_time_seconds, and it is passed a hide
// value of '2' unless multi-cluster support is switched on in $._config.
_clusterTemplate::
  local cfg = $._config;
  template.new(
    'cluster',
    '$datasource',
    'label_values(node_time_seconds, %s)' % cfg.clusterLabel,
    current='',
    hide=if !cfg.showMultiCluster then '2' else '',
    refresh=2,
    includeAll=false,
    sort=1
  ),
|
|
|
grafanaDashboards+:: { |
|
// Single-node USE dashboard: every query is pinned to the $instance and
// $cluster template variables.
'node-rsrc-use.json':
  // Appends the shared label selector and the "!= 0" filter to a recorded
  // series name, then expands the %(...)s placeholders from $._config in one
  // pass over the assembled template.
  local nodeQuery(series) =
    (series + '{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0') % $._config;
  // Prometheus target for one recorded series with a fixed legend.
  local nodeTarget(series, legend) =
    prometheus.target(nodeQuery(series), legendFormat=legend);

  dashboard.new(
    '%sUSE Method / Node' % $._config.dashboardNamePrefix,
    time_from='now-1h',
    tags=$._config.dashboardTags,
    timezone='utc',
    refresh='30s',
    graphTooltip='shared_crosshair'
  )
  .addTemplate(datasourceTemplate)
  .addTemplate($._clusterTemplate)
  .addTemplate(
    template.new(
      'instance',
      '$datasource',
      'label_values(node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}, instance)' % $._config,
      refresh='time',
      sort=1
    )
  )
  .addRow(
    row.new('CPU')
    .addPanel(CPUUtilisation.addTarget(nodeTarget('instance:node_cpu_utilisation:rate%(rateInterval)s', 'Utilisation')))
    .addPanel(CPUSaturation.addTarget(nodeTarget('instance:node_load1_per_cpu:ratio', 'Saturation')))
  )
  .addRow(
    row.new('Memory')
    .addPanel(memoryUtilisation.addTarget(nodeTarget('instance:node_memory_utilisation:ratio', 'Utilisation')))
    .addPanel(memorySaturation.addTarget(nodeTarget('instance:node_vmstat_pgmajfault:rate%(rateInterval)s', 'Major page Faults')))
  )
  .addRow(
    row.new('Network')
    .addPanel(
      networkUtilisation
      .addTarget(nodeTarget('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s', 'Receive'))
      .addTarget(nodeTarget('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s', 'Transmit'))
    )
    .addPanel(
      networkSaturation
      .addTarget(nodeTarget('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s', 'Receive'))
      .addTarget(nodeTarget('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s', 'Transmit'))
    )
  )
  .addRow(
    row.new('Disk IO')
    .addPanel(diskIOUtilisation.addTarget(nodeTarget('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s', '{{device}}')))
    .addPanel(diskIOSaturation.addTarget(nodeTarget('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s', '{{device}}')))
  )
  .addRow(
    row.new('Disk Space')
    .addPanel(
      // Used fraction per device: 1 - avail/size, largest first.
      diskSpaceUtilisation.addTarget(prometheus.target(
        |||
          sort_desc(1 -
            (
              max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"})
              /
              max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"})
            ) != 0
          )
        ||| % $._config, legendFormat='{{device}}'
      ))
    )
  ),
|
|
|
// Cluster-level USE dashboard: queries select on the $cluster variable only
// and keep one series per instance (or per instance and device). Ratio
// panels divide each series by a cluster-wide scalar so the stack adds up to
// a cluster total.
'node-cluster-rsrc-use.json':
  dashboard.new(
    '%sUSE Method / Cluster' % $._config.dashboardNamePrefix,
    time_from='now-1h',
    tags=($._config.dashboardTags),
    timezone='utc',
    refresh='30s',
    graphTooltip='shared_crosshair'
  )
  .addTemplate(datasourceTemplate)
  .addTemplate($._clusterTemplate)
  .addRow(
    row.new('CPU')
    .addPanel(
      CPUUtilisation
      // Weight each instance's utilisation by its CPU count, then normalise
      // by the total CPU count in the cluster.
      .addTarget(prometheus.target(
        |||
          ((
            instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            *
            instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
          ) != 0 )
          / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
        ||| % $._config, legendFormat='{{ instance }}'
      ))
    )
    .addPanel(
      CPUSaturation
      .addTarget(prometheus.target(
        |||
          (
            instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
          ) != 0
        ||| % $._config, legendFormat='{{instance}}'
      ))
    )
  )
  .addRow(
    row.new('Memory')
    .addPanel(
      memoryUtilisation
      .addTarget(prometheus.target(
        |||
          (
            instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
          ) != 0
        ||| % $._config, legendFormat='{{instance}}',
      ))
    )
    // Filter out zero-valued series like every other target in this file;
    // this was the only query missing the "!= 0" suffix.
    .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}}')))
  )
  .addRow(
    row.new('Network')
    .addPanel(
      networkUtilisation
      .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive'))
      .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit'))
    )
    .addPanel(
      networkSaturation
      .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive'))
      .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit'))
    )
  )
  .addRow(
    row.new('Disk IO')
    .addPanel(
      diskIOUtilisation
      .addTarget(prometheus.target(
        |||
          (
            instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
          ) != 0
        ||| % $._config, legendFormat='{{instance}} {{device}}'
      ))
    )
    .addPanel(
      diskIOSaturation
      .addTarget(prometheus.target(
        |||
          (
            instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
          ) != 0
        ||| % $._config, legendFormat='{{instance}} {{device}}'
      ))
    )
  )
  .addRow(
    row.new('Disk Space')
    .addPanel(
      diskSpaceUtilisation
      // Used bytes per instance (size - avail, de-duplicated across
      // mountpoints and summed over devices) as a share of the total
      // filesystem size in the cluster.
      .addTarget(prometheus.target(
        |||
          sum without (device) (
            max without (fstype, mountpoint) ((
              node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}
              -
              node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}
            ) != 0)
          )
          / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"})))
        ||| % $._config, legendFormat='{{instance}}'
      ))
    )
  ),
|
} + |
|
if $._config.showMultiCluster then { |
|
// Multi-cluster USE dashboard: no $cluster/$instance filtering; every query
// is aggregated by the configured cluster label (plus device for disk IO).
'node-multicluster-rsrc-use.json':
  // Legend template naming the cluster, e.g. '{{cluster}}' when the
  // configured cluster label is 'cluster'.
  local clusterLegend = '{{%(clusterLabel)s}}' % $._config;
  local receiveLegend = clusterLegend + ' Receive';
  local transmitLegend = clusterLegend + ' Transmit';
  local deviceLegend = clusterLegend + ' {{device}}';

  dashboard.new(
    '%sUSE Method / Multi-cluster' % $._config.dashboardNamePrefix,
    time_from='now-1h',
    tags=$._config.dashboardTags,
    timezone='utc',
    refresh='30s',
    graphTooltip='shared_crosshair'
  )
  .addTemplate(datasourceTemplate)
  .addRow(
    row.new('CPU')
    .addPanel(
      CPUUtilisation
      .addTarget(prometheus.target(
        |||
          sum(
            ((
              instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s}
              *
              instance:node_num_cpu:sum{%(nodeExporterSelector)s}
            ) != 0)
            / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s}))
          ) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=clusterLegend
      ))
    )
    .addPanel(
      CPUSaturation
      .addTarget(prometheus.target(
        |||
          sum((
            instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
            / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}))
          ) != 0) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=clusterLegend
      ))
    )
  )
  .addRow(
    row.new('Memory')
    .addPanel(
      memoryUtilisation
      .addTarget(prometheus.target(
        |||
          sum((
            instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
            / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}))
          ) != 0) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=clusterLegend
      ))
    )
    .addPanel(
      memorySaturation
      .addTarget(prometheus.target(
        |||
          sum((
            instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s}
          ) != 0) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=clusterLegend
      ))
    )
  )
  .addRow(
    row.new('Network')
    .addPanel(
      networkUtilisation
      .addTarget(prometheus.target(
        |||
          sum((
            instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
          ) != 0) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=receiveLegend
      ))
      .addTarget(prometheus.target(
        |||
          sum((
            instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
          ) != 0) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=transmitLegend
      ))
    )
    .addPanel(
      networkSaturation
      .addTarget(prometheus.target(
        |||
          sum((
            instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
          ) != 0) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=receiveLegend
      ))
      .addTarget(prometheus.target(
        |||
          sum((
            instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
          ) != 0) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=transmitLegend
      ))
    )
  )
  .addRow(
    row.new('Disk IO')
    .addPanel(
      diskIOUtilisation
      .addTarget(prometheus.target(
        |||
          sum((
            instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
            / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
          ) != 0) by (%(clusterLabel)s, device)
        ||| % $._config, legendFormat=deviceLegend
      ))
    )
    .addPanel(
      diskIOSaturation
      .addTarget(prometheus.target(
        |||
          sum((
            instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
            / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
          ) != 0) by (%(clusterLabel)s, device)
        ||| % $._config, legendFormat=deviceLegend
      ))
    )
  )
  .addRow(
    row.new('Disk Space')
    .addPanel(
      diskSpaceUtilisation
      .addTarget(prometheus.target(
        |||
          sum (
            sum without (device) (
              max without (fstype, mountpoint, instance, pod) ((
                node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}
              ) != 0)
            )
            / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s})))
          ) by (%(clusterLabel)s)
        ||| % $._config, legendFormat=clusterLegend
      ))
    )
  ),
|
} else {}, |
|
}
|
|
|