From 4b5ab80ca6986b386b47839d7bcf96aeab41ee68 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Thu, 18 Mar 2021 15:44:33 +0100 Subject: [PATCH] [rule] Update rule health for append/commit fails (#8619) * [rule] Update rule health for append/commit fails Similar to https://github.com/prometheus/prometheus/pull/8410 will provide more context. Signed-off-by: Goutham Veeramachaneni * Add test for updating health on append fails Signed-off-by: Goutham Veeramachaneni --- rules/manager.go | 10 ++++++++ rules/manager_test.go | 57 +++++++++++++++++++++++++++++++++++++++++++ rules/recording.go | 2 -- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/rules/manager.go b/rules/manager.go index 348aa0ce8..04ce2f309 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -592,6 +592,9 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL) if err != nil { + rule.SetHealth(HealthBad) + rule.SetLastError(err) + // Canceled queries are intentional termination of queries. This normally // happens on shutdown and thus we skip logging of any errors here. if _, ok := err.(promql.ErrQueryCanceled); !ok { @@ -616,13 +619,20 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { seriesReturned := make(map[string]labels.Labels, len(g.seriesInPreviousEval[i])) defer func() { if err := app.Commit(); err != nil { + rule.SetHealth(HealthBad) + rule.SetLastError(err) + level.Warn(g.logger).Log("msg", "Rule sample appending failed", "err", err) return } g.seriesInPreviousEval[i] = seriesReturned }() + for _, s := range vector { if _, err := app.Append(0, s.Metric, s.T, s.V); err != nil { + rule.SetHealth(HealthBad) + rule.SetLastError(err) + switch errors.Cause(err) { case storage.ErrOutOfOrderSample: numOutOfOrder++ diff --git a/rules/manager_test.go b/rules/manager_test.go index 3f7810fa8..a6af68361 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -1163,3 +1163,60 @@ func TestGroupHasAlertingRules(t *testing.T) { require.Equal(t, test.want, got, "test case %d failed, expected:%t got:%t", i, test.want, got) } } + +func TestRuleHealthUpdates(t *testing.T) { + st := teststorage.New(t) + defer st.Close() + engineOpts := promql.EngineOpts{ + Logger: nil, + Reg: nil, + MaxSamples: 10, + Timeout: 10 * time.Second, + } + engine := promql.NewEngine(engineOpts) + opts := &ManagerOptions{ + QueryFunc: EngineQueryFunc(engine, st), + Appendable: st, + Queryable: st, + Context: context.Background(), + Logger: log.NewNopLogger(), + } + + expr, err := parser.ParseExpr("a + 1") + require.NoError(t, err) + rule := NewRecordingRule("a_plus_one", expr, labels.Labels{}) + group := NewGroup(GroupOptions{ + Name: "default", + Interval: time.Second, + Rules: []Rule{rule}, + ShouldRestore: true, + Opts: opts, + }) + + // A time series that has two samples. + app := st.Appender(context.Background()) + app.Append(0, labels.FromStrings(model.MetricNameLabel, "a"), 0, 1) + app.Append(0, labels.FromStrings(model.MetricNameLabel, "a"), 1000, 2) + err = app.Commit() + require.NoError(t, err) + + ctx := context.Background() + + rules := group.Rules()[0] + require.NoError(t, rules.LastError()) + require.Equal(t, HealthUnknown, rules.Health()) + + // Execute 2 times, it should be all green. + group.Eval(ctx, time.Unix(0, 0)) + group.Eval(ctx, time.Unix(1, 0)) + + rules = group.Rules()[0] + require.NoError(t, rules.LastError()) + require.Equal(t, HealthGood, rules.Health()) + + // Now execute the rule in the past again, this should cause append failures. + group.Eval(ctx, time.Unix(0, 0)) + rules = group.Rules()[0] + require.EqualError(t, rules.LastError(), storage.ErrOutOfOrderSample.Error()) + require.Equal(t, HealthBad, rules.Health()) +} diff --git a/rules/recording.go b/rules/recording.go index cec1a0fad..2665506af 100644 --- a/rules/recording.go +++ b/rules/recording.go @@ -76,8 +76,6 @@ func (rule *RecordingRule) Labels() labels.Labels { func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, _ *url.URL) (promql.Vector, error) { vector, err := query(ctx, rule.vector.String(), ts) if err != nil { - rule.SetHealth(HealthBad) - rule.SetLastError(err) return nil, err } // Override the metric name and labels.