local g = import 'grafana-builder/grafana.libsonnet';
local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;
local graphPanel = grafana.graphPanel;
local tablePanel = grafana.tablePanel;
local template = grafana.template;
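
// This file is part of the Prometheus mixin and is meant to be rendered
// together with the mixin's config (which supplies the $._config object
// referenced below). A minimal sketch of rendering one dashboard to JSON,
// assuming the mixin entry point is mixin.libsonnet and the grafonnet and
// grafana-builder libraries are vendored (e.g. with jsonnet-bundler):
//
//   jsonnet -J vendor -e '(import "mixin.libsonnet").grafanaDashboards["prometheus.json"]' > prometheus.json
//
// The entry-point file name is an assumption; adjust it to your checkout.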
{
  grafanaDashboards+:: {
    'prometheus.json':
      g.dashboard('Prometheus Overview')
      .addMultiTemplate('job', 'prometheus_build_info', 'job')
      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
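      // The first (hidden) count query exists only to pull the job, instance,
      // and version labels into the table; the second query supplies the
      // Uptime value.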
      .addRow(
        g.row('Prometheus Stats')
        .addPanel(
          g.panel('Prometheus Stats') +
          g.tablePanel([
            'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
            'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
          ], {
            job: { alias: 'Job' },
            instance: { alias: 'Instance' },
            version: { alias: 'Version' },
            'Value #A': { alias: 'Count', type: 'hidden' },
            'Value #B': { alias: 'Uptime' },
          })
        )
      )
      .addRow(
        g.row('Discovery')
        .addPanel(
          g.panel('Target Sync') +
          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Targets') +
          g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
          g.stack
        )
      )
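      // Dividing the rate of the interval-length sum by the rate of its count
      // yields the mean observed scrape interval, converted to milliseconds.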
      .addRow(
        g.row('Retrieval')
        .addPanel(
          g.panel('Average Scrape Interval Duration') +
          g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Scrape failures') +
          g.queryPanel([
            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
          ], [
            'exceeded sample limit: {{job}}',
            'duplicate timestamp: {{job}}',
            'out of bounds: {{job}}',
            'out of order: {{job}}',
          ]) +
          g.stack
        )
        .addPanel(
          g.panel('Appended Samples') +
          g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
          g.stack
        )
      )
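      // Head series and head chunks track what is held in the in-memory TSDB
      // head block, i.e. data not yet compacted into persistent blocks.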
      .addRow(
        g.row('Storage')
        .addPanel(
          g.panel('Head Series') +
          g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
          g.stack
        )
        .addPanel(
          g.panel('Head Chunks') +
          g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
          g.stack
        )
      )
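      // The "inner_eval" slice covers expression evaluation, so the rate of
      // its duration count approximates the query rate per instance.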
      .addRow(
        g.row('Query')
        .addPanel(
          g.panel('Query Rate') +
          g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
          g.stack,
        )
        .addPanel(
          g.panel('Stage Duration') +
          g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
          { yaxes: g.yaxes('ms') } +
          g.stack,
        )
      ),
    // Remote write specific dashboard.
    'prometheus-remote-write.json':
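      // The difference between the highest timestamp received and the highest
      // timestamp sent shows remote-write delay in seconds. "ignoring" drops
      // the per-queue labels so the two series can match, and the one-to-many
      // match via "group_right" yields one result per configured remote queue.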
      local timestampComparison =
        graphPanel.new(
          'Highest Timestamp In vs. Highest Timestamp Sent',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          |||
            (
              prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}
              -
              ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}
            )
          |||,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
        ));
      local timestampComparisonRate =
        graphPanel.new(
          'Rate[5m]',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          |||
            (
              rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
              -
              ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
            )
          |||,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
        ));
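      // Incoming sample rate minus the rates of succeeded and dropped samples;
      // a sustained positive value means samples are backing up in the queue.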
      local samplesRate =
        graphPanel.new(
          'Rate, in vs. succeeded or dropped [5m]',
          datasource='$datasource',
          span=12,
        )
        .addTarget(prometheus.target(
          |||
            rate(prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])
            -
            ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])
            -
            rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])
          |||,
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
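      // Remote write parallelism: each queue scales its shard count between
      // the configured min and max, following the desired value that
      // Prometheus calculates from observed throughput.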
      local currentShards =
        graphPanel.new(
          'Current Shards',
          datasource='$datasource',
          span=12,
          min_span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
      local maxShards =
        graphPanel.new(
          'Max Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
      local minShards =
        graphPanel.new(
          'Min Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
      local desiredShards =
        graphPanel.new(
          'Desired Shards',
          datasource='$datasource',
          span=4,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
      local shardsCapacity =
        graphPanel.new(
          'Shard Capacity',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
      local pendingSamples =
        graphPanel.new(
          'Pending Samples',
          datasource='$datasource',
          span=6,
        )
        .addTarget(prometheus.target(
          'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
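      // Comparing the TSDB WAL's current segment with the segment the remote
      // write watcher is reading shows how far remote write lags behind the
      // WAL, measured in segments.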
      local walSegment =
        graphPanel.new(
          'TSDB Current Segment',
          datasource='$datasource',
          span=6,
          formatY1='none',
        )
        .addTarget(prometheus.target(
          'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}}'
        ));
      local queueSegment =
        graphPanel.new(
          'Remote Write Current Segment',
          datasource='$datasource',
          span=6,
          formatY1='none',
        )
        .addTarget(prometheus.target(
          'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}',
          legendFormat='{{cluster}}:{{instance}} {{consumer}}'
        ));
      local droppedSamples =
        graphPanel.new(
          'Dropped Samples',
          datasource='$datasource',
          span=3,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
      local failedSamples =
        graphPanel.new(
          'Failed Samples',
          datasource='$datasource',
          span=3,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
      local retriedSamples =
        graphPanel.new(
          'Retried Samples',
          datasource='$datasource',
          span=3,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
      local enqueueRetries =
        graphPanel.new(
          'Enqueue Retries',
          datasource='$datasource',
          span=3,
        )
        .addTarget(prometheus.target(
          'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
          legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
        ));
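      // The cluster variable below is derived from kube_pod_container_info,
      // so this dashboard assumes Prometheus runs on Kubernetes with
      // kube-state-metrics available; adjust that query for other setups.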
      dashboard.new('Prometheus Remote Write', editable=true)
      .addTemplate(
        {
          hide: 0,
          label: null,
          name: 'datasource',
          options: [],
          query: 'prometheus',
          refresh: 1,
          regex: '',
          type: 'datasource',
        },
      )
      .addTemplate(
        template.new(
          'instance',
          '$datasource',
          'label_values(prometheus_build_info, instance)' % $._config,
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        )
      )
      .addTemplate(
        template.new(
          'cluster',
          '$datasource',
          'label_values(kube_pod_container_info{image=~".*prometheus.*"}, cluster)' % $._config,
          refresh='time',
          current={
            selected: true,
            text: 'All',
            value: '$__all',
          },
          includeAll=true,
        )
      )
      .addTemplate(
        template.new(
          'url',
          '$datasource',
          'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)' % $._config,
          refresh='time',
          includeAll=true,
        )
      )
      .addRow(
        row.new('Timestamps')
        .addPanel(timestampComparison)
        .addPanel(timestampComparisonRate)
      )
      .addRow(
        row.new('Samples')
        .addPanel(samplesRate)
      )
      .addRow(
        row.new('Shards')
        .addPanel(currentShards)
        .addPanel(maxShards)
        .addPanel(minShards)
        .addPanel(desiredShards)
      )
      .addRow(
        row.new('Shard Details')
        .addPanel(shardsCapacity)
        .addPanel(pendingSamples)
      )
      .addRow(
        row.new('Segments')
        .addPanel(walSegment)
        .addPanel(queueSegment)
      )
      .addRow(
        row.new('Misc. Rates')
        .addPanel(droppedSamples)
        .addPanel(failedSamples)
        .addPanel(retriedSamples)
        .addPanel(enqueueRetries)
      ),
  },
}