From f1f8317fa5a98410bb9c1705f91b47c4154679d0 Mon Sep 17 00:00:00 2001
From: Tobias Schmidt
Date: Thu, 4 Feb 2016 23:42:55 -0500
Subject: [PATCH] Fix detection of flapping alerts

Alerts in the resolve retention period must be transitioned to the
active state again when their condition is met.
---
 rules/alerting.go     |  4 ++--
 rules/manager_test.go | 33 +++++++++++++++++++++------------
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/rules/alerting.go b/rules/alerting.go
index a2a389d31..fdd3d00a8 100644
--- a/rules/alerting.go
+++ b/rules/alerting.go
@@ -39,7 +39,7 @@ const (
 type AlertState int
 
 const (
-	// StateInactive is the state of an alert that is either firing nor pending.
+	// StateInactive is the state of an alert that is neither firing nor pending.
 	StateInactive AlertState = iota
 	// StatePending is the state of an alert that has been active for less than
 	// the configured threshold duration.
@@ -159,7 +159,7 @@ func (r *AlertingRule) eval(ts model.Time, engine *promql.Engine) (model.Vector,
 		fp := smpl.Metric.Fingerprint()
 		resultFPs[fp] = struct{}{}
 
-		if alert, ok := r.active[fp]; ok {
+		if alert, ok := r.active[fp]; ok && alert.State != StateInactive {
 			alert.Value = smpl.Value
 			continue
 		}
diff --git a/rules/manager_test.go b/rules/manager_test.go
index 463388e15..40e57203a 100644
--- a/rules/manager_test.go
+++ b/rules/manager_test.go
@@ -27,14 +27,8 @@ import (
 func TestAlertingRule(t *testing.T) {
 	suite, err := promql.NewTest(t, `
 		load 5m
-			http_requests{job="api-server", instance="0", group="production"}	0+10x10
-			http_requests{job="api-server", instance="1", group="production"}	0+20x10
-			http_requests{job="api-server", instance="0", group="canary"}		0+30x10
-			http_requests{job="api-server", instance="1", group="canary"}		0+40x10
-			http_requests{job="app-server", instance="0", group="production"}	0+50x10
-			http_requests{job="app-server", instance="1", group="production"}	0+60x10
-			http_requests{job="app-server", instance="0", group="canary"}		0+70x10
-			http_requests{job="app-server", instance="1", group="canary"}		0+80x10
+			http_requests{job="app-server", instance="0", group="canary"}	75 85 95 105 105 95 85
+			http_requests{job="app-server", instance="1", group="canary"}	80 90 100 110 120 130 140
 	`)
 	if err != nil {
 		t.Fatal(err)
@@ -79,17 +73,32 @@ func TestAlertingRule(t *testing.T) {
 		},
 		{
 			time: 10 * time.Minute,
 			result: []string{
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
 				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
-				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
 			},
 		},
 		{
-			time:   15 * time.Minute,
-			result: nil,
+			time: 15 * time.Minute,
+			result: []string{
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
+			},
 		},
 		{
 			time:   20 * time.Minute,
-			result: nil,
+			result: []string{},
+		},
+		{
+			time: 25 * time.Minute,
+			result: []string{
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
+			},
+		},
+		{
+			time: 30 * time.Minute,
+			result: []string{
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
+			},
 		},
 	}
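
Note (illustration only, not part of the patch): the sketch below is a
self-contained Go program that models the behaviour this change is after.
It keeps the state names from the patch (StateInactive, StatePending,
StateFiring) but otherwise uses simplified, hypothetical types and an
evalStep helper rather than the actual rules package internals. An alert
that is only retained in the inactive state after resolving must not be
reused as-is; when its condition is met again it has to start a fresh
pending -> firing cycle, which is what the added state check allows.

package main

import (
	"fmt"
	"time"
)

// AlertState mirrors the three states used by alerting rules.
type AlertState int

const (
	StateInactive AlertState = iota
	StatePending
	StateFiring
)

var stateNames = []string{"inactive", "pending", "firing"}

// Alert is a simplified stand-in for a tracked alert instance.
type Alert struct {
	State    AlertState
	ActiveAt time.Time
}

// evalStep models one rule evaluation for a single series fingerprint.
// The "a.State != StateInactive" condition is the crux of the fix: an
// alert kept around only for resolved-notification retention is not
// treated as still active, so a new pending alert is created instead.
func evalStep(active map[string]*Alert, fp string, conditionMet bool, now time.Time, hold time.Duration) {
	if conditionMet {
		if a, ok := active[fp]; ok && a.State != StateInactive {
			if a.State == StatePending && now.Sub(a.ActiveAt) >= hold {
				a.State = StateFiring
			}
			return
		}
		active[fp] = &Alert{State: StatePending, ActiveAt: now}
		return
	}
	if a, ok := active[fp]; ok && a.State != StateInactive {
		// Condition no longer met: resolve, but keep the entry around.
		a.State = StateInactive
	}
}

func main() {
	active := map[string]*Alert{}
	start := time.Unix(0, 0)
	hold := 5 * time.Minute

	// The condition flaps: met, met, not met, met again.
	for i, met := range []bool{true, true, false, true} {
		now := start.Add(time.Duration(i) * 5 * time.Minute)
		evalStep(active, "fp", met, now, hold)
		fmt.Printf("t=%2dm state=%s\n", i*5, stateNames[active["fp"].State])
	}
	// Prints pending, firing, inactive, pending. The last step is what the
	// patch restores: without the state check, the retained inactive alert
	// would simply have its value updated and never re-enter pending.
}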