From dde2e5eb73baac260f41e4b808043ba2aa309810 Mon Sep 17 00:00:00 2001 From: George Robinson Date: Thu, 25 Apr 2024 13:18:50 +0100 Subject: [PATCH] Improve comments around resending resolved alerts (#13990) Signed-off-by: George Robinson --- rules/alerting.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/rules/alerting.go b/rules/alerting.go index 50c67fa2d..edcdfe5e0 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -457,8 +457,17 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, } } - // If the alert was previously firing, keep it around for a given - // retention time so it is reported as resolved to the AlertManager. + // If the alert is resolved (was firing but is now inactive), keep it for + // at least the retention period. This is important for a number of reasons: + // + // 1. It allows Prometheus to be more resilient to network issues that + // would otherwise prevent a resolved alert from being reported as resolved + // to Alertmanager. + // + // 2. It helps reduce the chance of resolved notifications being lost if + // Alertmanager crashes or restarts between receiving the resolved alert + // from Prometheus and sending the resolved notification. This tends to + // occur for routes with large group intervals. if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) { delete(r.active, fp) }