From 4daaa59c081fc63bcdf3d73e77babeb6f5494a53 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 23 Apr 2024 19:40:10 +0100 Subject: [PATCH] Rule Manager: Only query once per alert rule when restoring alert state Prometheus restores alert state between restarts and updates. For each rule, it looks at the alerts that are meant to be active and then queries the `ALERTS_FOR_STATE` series for _each_ alert within the rules. If the alert rule has 120 instances (or series) it'll execute the same query with slightly different labels. This PR changes the approach so that we only query once per alert rule and then match the corresponding alert that we're about to restore against the series-set. While the approach might use a bit more memory at start-up (if at all), the restore process is only run once per restart so I'd consider this a big win. This builds on top of #13974 Signed-off-by: gotjosh --- rules/alerting.go | 32 +++++++++++++------------------- rules/group.go | 33 +++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/rules/alerting.go b/rules/alerting.go index 50c67fa2d..1bcf0a034 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -246,13 +246,16 @@ func (r *AlertingRule) sample(alert *Alert, ts time.Time) promql.Sample { return s } -// forStateSample returns the sample for ALERTS_FOR_STATE. +// forStateSample returns a promql.Sample with the rule labels, `ALERTS_FOR_STATE` as the metric name and the rule name as the `alertname` label. +// Optionally, if an alert is provided it'll copy the labels of the alert into the sample labels. 
func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) promql.Sample { lb := labels.NewBuilder(r.labels) - alert.Labels.Range(func(l labels.Label) { - lb.Set(l.Name, l.Value) - }) + if alert != nil { + alert.Labels.Range(func(l labels.Label) { + lb.Set(l.Name, l.Value) + }) + } lb.Set(labels.MetricName, alertForStateMetricName) lb.Set(labels.AlertName, r.name) @@ -265,9 +268,11 @@ func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) pro return s } -// QueryforStateSeries returns the series for ALERTS_FOR_STATE. -func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q storage.Querier) (storage.Series, error) { - smpl := r.forStateSample(alert, time.Now(), 0) +// QueryforStateSeries returns the series for ALERTS_FOR_STATE of the alert rule. +func (r *AlertingRule) QueryforStateSeries(ctx context.Context, q storage.Querier) (storage.SeriesSet, error) { + // We use a sample to ease the building of matchers. + // Don't provide an alert as we want matchers that match all series for the alert rule. + smpl := r.forStateSample(nil, time.Now(), 0) var matchers []*labels.Matcher smpl.Metric.Range(func(l labels.Label) { mt, err := labels.NewMatcher(labels.MatchEqual, l.Name, l.Value) @@ -278,18 +283,7 @@ func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q }) sset := q.Select(ctx, false, nil, matchers...) - var s storage.Series - for sset.Next() { - // Query assures that smpl.Metric is included in sset.At().Labels(), - // hence just checking the length would act like equality. - // (This is faster than calling labels.Compare again as we already have some info). - if sset.At().Labels().Len() == len(matchers) { - s = sset.At() - break - } - } - - return s, sset.Err() + return sset, sset.Err() } // SetEvaluationDuration updates evaluationDuration to the duration it took to evaluate the rule on its last evaluation. 
diff --git a/rules/group.go b/rules/group.go index 987136a00..81f7b1df2 100644 --- a/rules/group.go +++ b/rules/group.go @@ -664,19 +664,32 @@ func (g *Group) RestoreForState(ts time.Time) { continue } + sset, err := alertRule.QueryforStateSeries(g.opts.Context, q) + if err != nil { + level.Error(g.logger).Log( + "msg", "Failed to restore 'for' state", + labels.AlertName, alertRule.Name(), + "stage", "Select", + "err", err, + ) + continue + } + + // No series set was returned for this alert rule, so there is nothing to restore from. + if sset == nil { + level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name()) + continue + } + alertRule.ForEachActiveAlert(func(a *Alert) { var s storage.Series - s, err := alertRule.QueryforStateSeries(g.opts.Context, a, q) - if err != nil { - // Querier Warnings are ignored. We do not care unless we have an error. - level.Error(g.logger).Log( - "msg", "Failed to restore 'for' state", - labels.AlertName, alertRule.Name(), - "stage", "Select", - "err", err, - ) - return + // Find the series for the given alert from the set. + for sset.Next() { + if sset.At().Labels().Hash() == a.Labels.Hash() { + s = sset.At() + break + } } if s == nil {