From 553f904f2dd476aebac2e304080acc06d0b9c5cc Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 3 Dec 2020 20:59:53 +0100 Subject: [PATCH] mixin: Add a capability to exclude non-prod AM instances Signed-off-by: beorn7 --- documentation/prometheus-mixin/alerts.libsonnet | 8 ++++---- documentation/prometheus-mixin/config.libsonnet | 10 ++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index d37b12b43..85bde6a02 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -267,9 +267,9 @@ alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager', expr: ||| min without (alertmanager) ( - rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) + rate(prometheus_notifications_errors_total{%(prometheusSelector)s,alertmanager!~`%(nonNotifyingAlertmanagerRegEx)s`}[5m]) / - rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) + rate(prometheus_notifications_sent_total{%(prometheusSelector)s,alertmanager!~`%(nonNotifyingAlertmanagerRegEx)s`}[5m]) ) * 100 > 3 @@ -289,9 +289,9 @@ alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager', expr: ||| min by (%(prometheusHAGroupLabels)s) ( - rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) + rate(prometheus_notifications_errors_total{%(prometheusSelector)s,alertmanager!~`%(nonNotifyingAlertmanagerRegEx)s`}[5m]) / - rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) + rate(prometheus_notifications_sent_total{%(prometheusSelector)s,alertmanager!~`%(nonNotifyingAlertmanagerRegEx)s`}[5m]) ) * 100 > 3 diff --git a/documentation/prometheus-mixin/config.libsonnet b/documentation/prometheus-mixin/config.libsonnet index 5c4d0123d..6b0c2f9f1 100644 --- a/documentation/prometheus-mixin/config.libsonnet +++ b/documentation/prometheus-mixin/config.libsonnet @@ -27,5 +27,15 @@ // HA group. 
All labels used here must also be present in // prometheusHAGroupLabels above. prometheusHAGroupName: '{{$labels.job}}', + + // nonNotifyingAlertmanagerRegEx can be used to mark Alertmanager + // instances that are not part of the Alertmanager cluster + // delivering production notifications. This is important for the + // PrometheusErrorSendingAlertsToAnyAlertmanager alert. Otherwise, + // a still-working test or auditing instance could mask a full + // failure of all the production instances. The provided regular + // expression is matched (fully anchored) against the `alertmanager` label. + // Example: @'http://test-alertmanager\..*' + nonNotifyingAlertmanagerRegEx: @'', }, }