Merge pull request #4474 from prometheus/mixin

Prometheus Monitoring Mixin for Prometheus itself.
5 years ago · 32c3a1beef
parent 588eb20018 4825585834
commit 32c3a1beef
10 changed files with 513 additions and 0 deletions
--- a/documentation/prometheus-mixin/.gitignore
+++ b/documentation/prometheus-mixin/.gitignore
@ -0,0 +1,4 @@
+*.yaml
+dashboards_out
+vendor
+jsonnetfile.lock.json
--- a/documentation/prometheus-mixin/Makefile
+++ b/documentation/prometheus-mixin/Makefile
@ -0,0 +1,25 @@
+JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
+
+# Remove a target whose recipe failed, so that a truncated
+# prometheus_alerts.yaml left behind by a failing `jsonnet ... > file` is not
+# mistaken for an up-to-date file on the next run.
+.DELETE_ON_ERROR:
+
+# These targets are commands, not files; keep them working even if a file of
+# the same name appears in this directory.
+.PHONY: all fmt lint clean
+
+all: fmt prometheus_alerts.yaml dashboards_out lint
+
+# Reformat all jsonnet sources in place, skipping the vendored dependencies.
+fmt:
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		xargs -n 1 -- $(JSONNET_FMT) -i
+
+# alerts.jsonnet is the file handed to jsonnet, so it is a prerequisite too.
+prometheus_alerts.yaml: alerts.jsonnet mixin.libsonnet config.libsonnet alerts.libsonnet
+	jsonnet -S alerts.jsonnet > $@
+
+# dashboards.jsonnet is the file handed to jsonnet, so it is a prerequisite
+# too. The final touch bumps the directory's timestamp: overwriting files that
+# already exist in it does not, and the target would otherwise stay out of date.
+dashboards_out: dashboards.jsonnet mixin.libsonnet config.libsonnet dashboards.libsonnet
+	@mkdir -p dashboards_out
+	jsonnet -J vendor -m dashboards_out dashboards.jsonnet
+	@touch dashboards_out
+
+# Fail if any source is not jsonnetfmt-clean or if the generated rules are
+# invalid. Every file is checked and its diff printed; a failure is remembered
+# in `status` because the exit code of a bare `while` loop would only reflect
+# the last file processed.
+lint: prometheus_alerts.yaml
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		{ status=0; while read f; do \
+			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || status=1; \
+		done; exit $$status; }
+
+	promtool check rules prometheus_alerts.yaml
+
+clean:
+	rm -rf dashboards_out prometheus_alerts.yaml
--- a/documentation/prometheus-mixin/README.md
+++ b/documentation/prometheus-mixin/README.md
@ -0,0 +1,36 @@
+# Prometheus Mixin
+
+_This is work in progress. We aim for it to become a good role model for alerts
+and dashboards eventually, but it is not quite there yet._
+
+The Prometheus Mixin is a set of configurable, reusable, and extensible alerts
+and dashboards for Prometheus.
+
+To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you
+have a working Go development environment, it's easiest to run the following:
+```bash
+$ go get github.com/google/go-jsonnet/cmd/jsonnet
+$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
+```
+
+_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is
+currently not included in the Go implementation of `jsonnet`. For the time
+being, you have to install the [C++ version of
+jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint`
+or `make fmt`._
+
+Next, install the dependencies by running the following command in this
+directory:
+```bash
+$ jb install
+```
+
+You can then build a `prometheus_alerts.yaml` with the alerts and a directory
+`dashboards_out` with the Grafana dashboard JSON files:
+```bash
+$ make prometheus_alerts.yaml
+$ make dashboards_out
+```
+
+For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs.
+
--- a/documentation/prometheus-mixin/alerts.jsonnet
+++ b/documentation/prometheus-mixin/alerts.jsonnet
@ -0,0 +1 @@
+// Renders the alerting rules of the mixin as a single YAML document string.
+local mixin = import 'mixin.libsonnet';
+
+std.manifestYamlDoc(mixin.prometheusAlerts)
--- a/documentation/prometheus-mixin/alerts.libsonnet
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@ -0,0 +1,260 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'prometheus',
+        rules: [
+          // Critical: the highest value of the reload-success gauge over the
+          // last 5m is 0, and this has been the case for 10m.
+          {
+            alert: 'PrometheusBadConfig',
+            expr: |||
+              # Without max_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Failed Prometheus configuration reload.',
+              description: 'Prometheus %(prometheusName)s has failed to reload its configuration.' % $._config,
+            },
+          },
+          // Warning: the notification queue length, linearly extrapolated from
+          // the last 5m to 30 minutes (60 * 30 seconds) ahead, exceeds the
+          // smallest queue capacity observed over the last 5m. Must hold for
+          // 15m.
+          {
+            alert: 'PrometheusNotificationQueueRunningFull',
+            expr: |||
+              # Without min_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              (
+                predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
+              >
+                min_over_time(prometheus_notifications_queue_capacity{%(prometheusSelector)s}[5m])
+              )
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus alert notification queue predicted to run full in less than 30m.',
+              description: 'Alert notification queue of Prometheus %(prometheusName)s is running full.' % $._config,
+            },
+          },
+          // Warning: the ratio of the 5m error rate to the 5m sent rate of alert
+          // notifications exceeds 1% for 15m. The ratio is not aggregated, so
+          // each series (the description reads its `alertmanager` label) is
+          // judged on its own.
+          {
+            alert: 'PrometheusErrorSendingAlertsToSomeAlertmanagers',
+            expr: |||
+              (
+                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
+              /
+                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
+              )
+              * 100
+              > 1
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.',
+              description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
+            },
+          },
+          // Critical: `min without(alertmanager)` picks the lowest error ratio
+          // among all Alertmanagers, so the 3% threshold is only crossed when
+          // every one of them is above it. Must hold for 15m.
+          {
+            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
+            expr: |||
+              min without(alertmanager) (
+                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
+              /
+                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
+              )
+              * 100
+              > 3
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
+              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
+            },
+          },
+          // Warning: the largest number of discovered Alertmanagers seen over
+          // the last 5m is below 1, and this has been the case for 10m.
+          {
+            alert: 'PrometheusNotConnectedToAlertmanagers',
+            expr: |||
+              # Without max_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              max_over_time(prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s}[5m]) < 1
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is not connected to any Alertmanagers.',
+              description: 'Prometheus %(prometheusName)s is not connected to any Alertmanagers.' % $._config,
+            },
+          },
+          // Warning: the TSDB reload failure counter has increased within the
+          // trailing 3h window, continuously for 4h.
+          {
+            alert: 'PrometheusTSDBReloadsFailing',
+            expr: |||
+              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[3h]) > 0
+            ||| % $._config,
+            'for': '4h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus has issues reloading blocks from disk.',
+              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} reload failures over the last 3h.' % $._config,
+            },
+          },
+          // Warning: the failed-compaction counter has increased within the
+          // trailing 3h window, continuously for 4h.
+          {
+            alert: 'PrometheusTSDBCompactionsFailing',
+            expr: |||
+              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[3h]) > 0
+            ||| % $._config,
+            'for': '4h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus has issues compacting blocks.',
+              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} compaction failures over the last 3h.' % $._config,
+            },
+          },
+          // Warning: the WAL corruption counter has increased within the
+          // trailing 3h window, continuously for 4h. Note the `prometheus_`
+          // prefix on the metric name: without it the selector matches no
+          // series exposed by Prometheus and the alert can never fire.
+          {
+            alert: 'PrometheusTSDBWALCorruptions',
+            expr: |||
+              increase(prometheus_tsdb_wal_corruptions_total{%(prometheusSelector)s}[3h]) > 0
+            ||| % $._config,
+            'for': '4h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is detecting WAL corruptions.',
+              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h.' % $._config,
+            },
+          },
+          // Warning: the 5m rate of samples appended to the TSDB head is zero
+          // (or below), and has been for 10m.
+          {
+            alert: 'PrometheusNotIngestingSamples',
+            expr: |||
+              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is not ingesting samples.',
+              description: 'Prometheus %(prometheusName)s is not ingesting samples.' % $._config,
+            },
+          },
+          // Warning: the 5m rate of scraped samples counted as having a
+          // duplicate timestamp is above zero, and has been for 10m.
+          {
+            alert: 'PrometheusDuplicateTimestamps',
+            expr: |||
+              rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is dropping samples with duplicate timestamps.',
+              description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp.' % $._config,
+            },
+          },
+          // Warning: the 5m rate of scraped samples counted as out of order is
+          // above zero, and has been for 10m.
+          {
+            alert: 'PrometheusOutOfOrderTimestamps',
+            expr: |||
+              rate(prometheus_target_scrapes_sample_out_of_order_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus drops samples with out-of-order timestamps.',
+              description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with timestamps arriving out of order.' % $._config,
+            },
+          },
+          // Critical: more than 1% of the samples handed to remote storage
+          // failed, computed from 5m rates as failed / (failed + succeeded).
+          // Must hold for 15m.
+          {
+            alert: 'PrometheusRemoteStorageFailures',
+            expr: |||
+              (
+                rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
+              /
+                (
+                  rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
+                +
+                  rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[5m])
+                )
+              )
+              * 100
+              > 1
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus fails to send samples to remote storage.',
+              description: 'Prometheus %(prometheusName)s failed to send {{ printf "%%.1f" $value }}%% of the samples to queue {{$labels.queue}}.' % $._config,
+            },
+          },
+          // Critical: the highest timestamp received is more than 120s ahead
+          // of the highest timestamp sent by a remote-write queue, for 15m.
+          // The two sides are matched on job and instance only; group_right
+          // keeps the labels of the right-hand (per-queue) series, which is
+          // presumably where the `queue` label used in the description comes
+          // from -- confirm against the exported metrics.
+          {
+            alert: 'PrometheusRemoteWriteBehind',
+            expr: |||
+              # Without max_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              (
+                max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}[5m])
+              - on(job, instance) group_right
+                max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}[5m])
+              )
+              > 120
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus remote write is behind.',
+              description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config,
+            },
+          },
+          // Critical: the rule evaluation failure counter has increased within
+          // the trailing 5m window, continuously for 15m.
+          {
+            alert: 'PrometheusRuleFailures',
+            expr: |||
+              increase(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus is failing rule evaluations.',
+              description: 'Prometheus %(prometheusName)s has failed to evaluate {{ printf "%%.0f" $value }} rules in the last 5m.' % $._config,
+            },
+          },
+          // Warning: the counter of missed rule group iterations has increased
+          // within the trailing 5m window, continuously for 15m.
+          {
+            alert: 'PrometheusMissingRuleEvaluations',
+            expr: |||
+              increase(prometheus_rule_group_iterations_missed_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is missing rule evaluations due to slow rule group evaluation.',
+              description: 'Prometheus %(prometheusName)s has missed {{ printf "%%.0f" $value }} rule group evaluations in the last 5m.' % $._config,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
--- a/documentation/prometheus-mixin/config.libsonnet
+++ b/documentation/prometheus-mixin/config.libsonnet
@ -0,0 +1,16 @@
+// Configuration values substituted into the alert expressions and annotations
+// via `% $._config`.
+{
+  _config+:: {
+    // prometheusSelector is inserted as part of the label selector in
+    // PromQL queries to identify metrics collected from Prometheus
+    // servers.
+    prometheusSelector: 'job="prometheus"',
+
+    // prometheusName is inserted into annotations to name the Prometheus
+    // instance affected by the alert.
+    prometheusName: '{{$labels.instance}}',
+    // If you run Prometheus on Kubernetes with the Prometheus
+    // Operator, you can make use of the configured target labels for
+    // nicer naming:
+    // prometheusName: '{{$labels.namespace}}/{{$labels.pod}}'
+  },
+}
--- a/documentation/prometheus-mixin/dashboards.jsonnet
+++ b/documentation/prometheus-mixin/dashboards.jsonnet
@ -0,0 +1,6 @@
+// Produces one top-level field per dashboard of the mixin, keyed by the
+// dashboard's file name.
+local mixin = import 'mixin.libsonnet';
+local boards = mixin.dashboards;
+
+{
+  [filename]: boards[filename]
+  for filename in std.objectFields(boards)
+}
--- a/documentation/prometheus-mixin/dashboards.libsonnet
+++ b/documentation/prometheus-mixin/dashboards.libsonnet
@ -0,0 +1,148 @@
+local g = import 'grafana-builder/grafana.libsonnet';
+
+{
+  dashboards+: {
+    // Overview dashboard for Prometheus servers. Every query is scoped to the
+    // $job and $instance template variables, both of which are populated from
+    // prometheus_build_info.
+    'prometheus.json':
+      g.dashboard('Prometheus')
+      .addMultiTemplate('job', 'prometheus_build_info', 'job')
+      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
+      .addRow(
+        g.row('Prometheus Stats')
+        .addPanel(
+          // Table of servers: query A only contributes the version label (its
+          // value column is hidden), query B is the time since process start.
+          g.panel('Prometheus Stats') +
+          g.tablePanel([
+            'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
+            'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
+          ], {
+            job: { alias: 'Job' },
+            instance: { alias: 'Instance' },
+            version: { alias: 'Version' },
+            'Value #A': { alias: 'Count', type: 'hidden' },
+            'Value #B': { alias: 'Uptime' },
+          })
+        )
+      )
+      .addRow(
+        g.row('Discovery')
+        .addPanel(
+          // Seconds are scaled by 1e3 to match the 'ms' axis.
+          g.panel('Target Sync') +
+          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
+          { yaxes: g.yaxes('ms') }
+        )
+        .addPanel(
+          g.panel('Targets') +
+          g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
+          g.stack
+        )
+      )
+      .addRow(
+        g.row('Retrieval')
+        .addPanel(
+          // Average = rate of the summary's _sum over rate of its _count,
+          // scaled by 1e3 to match the 'ms' axis.
+          g.panel('Average Scrape Interval Duration') +
+          g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
+          { yaxes: g.yaxes('ms') }
+        )
+        .addPanel(
+          // Filtered by $job/$instance like every other panel, so that the
+          // dashboard's template variables also apply here.
+          g.panel('Scrape failures') +
+          g.queryPanel([
+            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total{job=~"$job",instance=~"$instance"}[1m]))',
+            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~"$job",instance=~"$instance"}[1m]))',
+            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total{job=~"$job",instance=~"$instance"}[1m]))',
+            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total{job=~"$job",instance=~"$instance"}[1m]))',
+          ], [
+            'exceeded sample limit: {{job}}',
+            'duplicate timestamp: {{job}}',
+            'out of bounds: {{job}}',
+            'out of order: {{job}}',
+          ]) +
+          g.stack
+        )
+        .addPanel(
+          g.panel('Appended Samples') +
+          g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
+          g.stack
+        )
+      )
+      .addRow(
+        g.row('Storage')
+        .addPanel(
+          g.panel('Head Series') +
+          g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
+          g.stack
+        )
+        .addPanel(
+          g.panel('Head Chunks') +
+          g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
+          g.stack
+        )
+      )
+      .addRow(
+        g.row('Query')
+        .addPanel(
+          // Only the inner_eval slice is counted for the query rate.
+          g.panel('Query Rate') +
+          g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
+          g.stack,
+        )
+        .addPanel(
+          // 0.9 quantile per slice, scaled by 1e3 to match the 'ms' axis.
+          g.panel('Stage Duration') +
+          g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
+          { yaxes: g.yaxes('ms') } +
+          g.stack,
+        )
+      ),
+    // Remote write specific dashboard. Every query is scoped to the $cluster
+    // and $instance template variables. $cluster is populated from
+    // kube_pod_container_info, so that metric has to be available in the data
+    // source (presumably via kube-state-metrics -- TODO confirm) for the
+    // variable to have any values.
+    'prometheus-remote-write.json':
+      g.dashboard('Prometheus Remote Write')
+      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
+      .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*prometheus.*"}', 'cluster')
+      .addRow(
+        g.row('Timestamps')
+        .addPanel(
+          // Highest timestamp received minus the highest timestamp sent by
+          // each queue. `ignoring(queue) group_right` matches the two sides
+          // without the queue label and lets one left-hand series pair with
+          // several right-hand (per-queue) series.
+          g.panel('Highest Timestamp In vs. Highest Timestamp Sent') +
+          g.queryPanel('prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') +
+          { yaxes: g.yaxes('s') }
+        )
+        .addPanel(
+          // Same comparison as the previous panel, on 5m rates of the two
+          // timestamps instead of their raw values.
+          g.panel('Rate[5m]') +
+          g.queryPanel('rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])  - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+      )
+      .addRow(
+        g.row('Samples')
+        .addPanel(
+          // 5m rate of samples in, minus the rates of succeeded and dropped
+          // samples of each queue.
+          g.panel('Rate, in vs. succeeded or dropped [5m]') +
+          g.queryPanel('rate(prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])- ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+      )
+      .addRow(
+        g.row('Shards')
+        .addPanel(
+          g.panel('Num. Shards') +
+          g.queryPanel('prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+        .addPanel(
+          g.panel('Capacity') +
+          g.queryPanel('prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+      )
+      .addRow(
+        // Plain 5m rates of four remote-storage counters, one panel each.
+        g.row('Misc Rates.')
+        .addPanel(
+          g.panel('Dropped Samples') +
+          g.queryPanel('rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+        .addPanel(
+          g.panel('Failed Samples') +
+          g.queryPanel('rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+        .addPanel(
+          g.panel('Retried Samples') +
+          g.queryPanel('rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+        .addPanel(
+          g.panel('Enqueue Retries') +
+          g.queryPanel('rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+      ),
+  },
+}
--- a/documentation/prometheus-mixin/jsonnetfile.json
+++ b/documentation/prometheus-mixin/jsonnetfile.json
@ -0,0 +1,14 @@
+{
+    "dependencies": [
+        {
+            "name": "grafana-builder",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/grafana/jsonnet-libs",
+                    "subdir": "grafana-builder"
+                }
+            },
+            "version": "master"
+        }
+    ]
+}
--- a/documentation/prometheus-mixin/mixin.libsonnet
+++ b/documentation/prometheus-mixin/mixin.libsonnet
@ -0,0 +1,3 @@
+// The complete mixin: configuration, dashboards and alerts merged into a
+// single object, in that order.
+local config = import 'config.libsonnet';
+local dashboards = import 'dashboards.libsonnet';
+local alerts = import 'alerts.libsonnet';
+
+config + dashboards + alerts
				`@ -0,0 +1 @@`
				`std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)`