diff --git a/rules/manager.go b/rules/manager.go index befcaa2f7..1e85cf5c5 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -21,6 +21,8 @@ import ( "sync" "time" + html_template "html/template" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/log" @@ -73,6 +75,20 @@ func init() { prometheus.MustRegister(evalDuration) } +// A Rule encapsulates a vector expression which is evaluated at a specified +// interval and acted upon (currently either recorded or used for alerting). +type Rule interface { + // Name returns the name of the rule. + Name() string + // eval evaluates the rule, including any associated recording or alerting actions. + eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error) + // String returns a human-readable string representation of the rule. + String() string + // HTMLSnippet returns a human-readable string representation of the rule, + // decorated with HTML elements for use in the web frontend. + HTMLSnippet(pathPrefix string) html_template.HTML +} + // The Manager manages recording and alerting rules. type Manager struct { // Protects the rules list. @@ -271,12 +287,39 @@ func (m *Manager) runIteration() { wg.Wait() } +// transferAlertState records the current alerting rules by name and returns a function +// that assigns their active alerts to the same-named alerting rules in m.rules when called. +func (m *Manager) transferAlertState() func() { + + alertingRules := map[string]*AlertingRule{} + for _, r := range m.rules { + if ar, ok := r.(*AlertingRule); ok { + alertingRules[ar.name] = ar + } + } + + return func() { + // Restore alerting rule state. + for _, r := range m.rules { + ar, ok := r.(*AlertingRule) + if !ok { + continue + } + if old, ok := alertingRules[ar.name]; ok { + ar.activeAlerts = old.activeAlerts + } + } + } +} + // ApplyConfig updates the rule manager's state as the config requires. If // loading the new rules failed the old rule set is restored. Returns true on success.
func (m *Manager) ApplyConfig(conf *config.Config) bool { m.Lock() defer m.Unlock() + defer m.transferAlertState()() + success := true m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval) @@ -300,6 +343,7 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool { log.Errorf("Error loading rules, previous rule set restored: %s", err) success = false } + return success } diff --git a/rules/manager_test.go b/rules/manager_test.go new file mode 100644 index 000000000..f51899f71 --- /dev/null +++ b/rules/manager_test.go @@ -0,0 +1,183 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package rules + +import ( + "fmt" + "reflect" + "strings" + "testing" + "time" + + clientmodel "github.com/prometheus/client_golang/model" + + "github.com/prometheus/prometheus/promql" +) + +func TestAlertingRule(t *testing.T) { + suite, err := promql.NewTest(t, ` + load 5m + http_requests{job="api-server", instance="0", group="production"} 0+10x10 + http_requests{job="api-server", instance="1", group="production"} 0+20x10 + http_requests{job="api-server", instance="0", group="canary"} 0+30x10 + http_requests{job="api-server", instance="1", group="canary"} 0+40x10 + http_requests{job="app-server", instance="0", group="production"} 0+50x10 + http_requests{job="app-server", instance="1", group="production"} 0+60x10 + http_requests{job="app-server", instance="0", group="canary"} 0+70x10 + http_requests{job="app-server", instance="1", group="canary"} 0+80x10 + `) + if err != nil { + t.Fatal(err) + } + defer suite.Close() + + if err := suite.Run(); err != nil { + t.Fatal(err) + } + + expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`) + if err != nil { + t.Fatalf("Unable to parse alert expression: %s", err) + } + + rule := NewAlertingRule( + "HTTPRequestRateLow", + expr, + time.Minute, + clientmodel.LabelSet{"severity": "critical"}, + "summary", "description", "runbook", + ) + + var tests = []struct { + time time.Duration + result []string + }{ + { + time: 0, + result: []string{ + `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`, + }, + }, { + time: 5 * time.Minute, + result: []string{ + `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", 
group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`, + }, + }, { + time: 10 * time.Minute, + result: []string{ + `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`, + }, + }, + { + time: 15 * time.Minute, + result: nil, + }, + { + time: 20 * time.Minute, + result: nil, + }, + } + + for i, test := range tests { + evalTime := clientmodel.Timestamp(0).Add(test.time) + + res, err := rule.eval(evalTime, suite.QueryEngine()) + if err != nil { + t.Fatalf("Error during alerting rule evaluation: %s", err) + } + + actual := strings.Split(res.String(), "\n") + expected := annotateWithTime(test.result, evalTime) + if actual[0] == "" { + actual = []string{} + } + + if len(actual) != len(expected) { + t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expected), len(actual)) + } + + for j, expectedSample := range expected { + found := false + for _, actualSample := range actual { + if actualSample == expectedSample { + found = true + } + } + if !found { + t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample) + } + } + + if t.Failed() { + t.Errorf("%d. 
Expected and actual outputs don't match:", i) + t.Fatalf("Expected:\n%v\n----\nActual:\n%v", strings.Join(expected, "\n"), strings.Join(actual, "\n")) + } + } +} + +func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string { + annotatedLines := []string{} + for _, line := range lines { + annotatedLines = append(annotatedLines, fmt.Sprintf(line, timestamp)) + } + return annotatedLines +} + +func TestTransferAlertState(t *testing.T) { + m := NewManager(&ManagerOptions{}) + + alert := &Alert{ + Name: "testalert", + State: StateFiring, + } + + arule := AlertingRule{ + name: "test", + activeAlerts: map[clientmodel.Fingerprint]*Alert{}, + } + aruleCopy := AlertingRule{name: "test", activeAlerts: map[clientmodel.Fingerprint]*Alert{}} + + m.rules = append(m.rules, &arule) + + // Set an alert on arule only; aruleCopy owns a separate, empty map and must not see it yet. + arule.activeAlerts[0] = alert + + // Save state and get the restore function. + restore := m.transferAlertState() + + // Remove arule from the rule list and add an unrelated rule and the + // stateless, same-named stand-in for arule (aruleCopy). + m.rules = []Rule{ + &AlertingRule{ + name: "test_other", + activeAlerts: map[clientmodel.Fingerprint]*Alert{}, + }, + &aruleCopy, + } + + // Apply the restore function; only the rule named "test" should receive the alert. + restore() + + if ar := m.rules[0].(*AlertingRule); len(ar.activeAlerts) != 0 { + t.Fatalf("unexpected alert for unrelated alerting rule") + } + if ar := m.rules[1].(*AlertingRule); !reflect.DeepEqual(ar.activeAlerts[0], alert) { + t.Fatalf("alert state was not restored") + } +} diff --git a/rules/rules.go b/rules/rules.go deleted file mode 100644 index 5d3a9cc6d..000000000 --- a/rules/rules.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2013 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package rules - -import ( - "html/template" - - clientmodel "github.com/prometheus/client_golang/model" - - "github.com/prometheus/prometheus/promql" -) - -// A Rule encapsulates a vector expression which is evaluated at a specified -// interval and acted upon (currently either recorded or used for alerting). -type Rule interface { - // Name returns the name of the rule. - Name() string - // Eval evaluates the rule, including any associated recording or alerting actions. - eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error) - // String returns a human-readable string representation of the rule. - String() string - // HTMLSnippet returns a human-readable string representation of the rule, - // decorated with HTML elements for use the web frontend. - HTMLSnippet(pathPrefix string) template.HTML -} diff --git a/rules/rules_test.go b/rules/rules_test.go deleted file mode 100644 index d29cc6088..000000000 --- a/rules/rules_test.go +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright 2013 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package rules - -import ( - "fmt" - "strings" - "testing" - "time" - - clientmodel "github.com/prometheus/client_golang/model" - - "github.com/prometheus/prometheus/promql" - "github.com/prometheus/prometheus/storage/local" - "github.com/prometheus/prometheus/storage/metric" -) - -var ( - testSampleInterval = time.Duration(5) * time.Minute - testStartTime = clientmodel.Timestamp(0) -) - -func getTestValueStream(startVal clientmodel.SampleValue, endVal clientmodel.SampleValue, stepVal clientmodel.SampleValue, startTime clientmodel.Timestamp) (resultValues metric.Values) { - currentTime := startTime - for currentVal := startVal; currentVal <= endVal; currentVal += stepVal { - sample := metric.SamplePair{ - Value: currentVal, - Timestamp: currentTime, - } - resultValues = append(resultValues, sample) - currentTime = currentTime.Add(testSampleInterval) - } - return resultValues -} - -func getTestVectorFromTestMatrix(matrix promql.Matrix) promql.Vector { - vector := promql.Vector{} - for _, sampleStream := range matrix { - lastSample := sampleStream.Values[len(sampleStream.Values)-1] - vector = append(vector, &promql.Sample{ - Metric: sampleStream.Metric, - Value: lastSample.Value, - Timestamp: lastSample.Timestamp, - }) - } - return vector -} - -func storeMatrix(storage local.Storage, matrix promql.Matrix) { - pendingSamples := clientmodel.Samples{} - for _, sampleStream := range matrix { - for _, sample := range sampleStream.Values { - pendingSamples = append(pendingSamples, &clientmodel.Sample{ - Metric: sampleStream.Metric.Metric, - Value: sample.Value, - Timestamp: sample.Timestamp, - }) - } - } - for _, s := range pendingSamples { - storage.Append(s) - } - storage.WaitForIndexing() -} - -func vectorComparisonString(expected []string, actual []string) string { - separator := "\n--------------\n" - return fmt.Sprintf("Expected:%v%v%v\nActual:%v%v%v ", - 
separator, - strings.Join(expected, "\n"), - separator, - separator, - strings.Join(actual, "\n"), - separator) -} - -func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string { - annotatedLines := []string{} - for _, line := range lines { - annotatedLines = append(annotatedLines, fmt.Sprintf(line, timestamp)) - } - return annotatedLines -} - -var testMatrix = promql.Matrix{ - { - Metric: clientmodel.COWMetric{ - Metric: clientmodel.Metric{ - clientmodel.MetricNameLabel: "http_requests", - clientmodel.JobLabel: "api-server", - "instance": "0", - "group": "canary", - }, - }, - Values: getTestValueStream(0, 300, 30, testStartTime), - }, - { - Metric: clientmodel.COWMetric{ - Metric: clientmodel.Metric{ - clientmodel.MetricNameLabel: "http_requests", - clientmodel.JobLabel: "api-server", - "instance": "1", - "group": "canary", - }, - }, - Values: getTestValueStream(0, 400, 40, testStartTime), - }, - { - Metric: clientmodel.COWMetric{ - Metric: clientmodel.Metric{ - clientmodel.MetricNameLabel: "http_requests", - clientmodel.JobLabel: "app-server", - "instance": "0", - "group": "canary", - }, - }, - Values: getTestValueStream(0, 700, 70, testStartTime), - }, - { - Metric: clientmodel.COWMetric{ - Metric: clientmodel.Metric{ - clientmodel.MetricNameLabel: "http_requests", - clientmodel.JobLabel: "app-server", - "instance": "1", - "group": "canary", - }, - }, - Values: getTestValueStream(0, 800, 80, testStartTime), - }, -} - -func TestAlertingRule(t *testing.T) { - // Labels in expected output need to be alphabetically sorted. 
- var evalOutputs = [][]string{ - { - `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`, - }, - { - `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`, - }, - { - `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`, - }, - { - /* empty */ - }, - { - /* empty */ - }, - } - - storage, closer := local.NewTestStorage(t, 1) - defer closer.Close() - - storeMatrix(storage, testMatrix) - - engine := promql.NewEngine(storage, nil) - defer engine.Stop() - - expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`) - if err != nil { - t.Fatalf("Unable to parse alert expression: %s", err) - } - - alertLabels := clientmodel.LabelSet{ - "severity": "critical", - } - rule := NewAlertingRule("HttpRequestRateLow", expr, time.Minute, alertLabels, "summary", "description", "runbook") - - for i, expectedLines := range evalOutputs { - evalTime := testStartTime.Add(testSampleInterval * time.Duration(i)) - - res, err := rule.eval(evalTime, engine) - if err != nil { - 
t.Fatalf("Error during alerting rule evaluation: %s", err) - } - - actualLines := strings.Split(res.String(), "\n") - expectedLines := annotateWithTime(expectedLines, evalTime) - if actualLines[0] == "" { - actualLines = []string{} - } - - failed := false - if len(actualLines) != len(expectedLines) { - t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expectedLines), len(actualLines)) - failed = true - } - - for j, expectedSample := range expectedLines { - found := false - for _, actualSample := range actualLines { - if actualSample == expectedSample { - found = true - } - } - if !found { - t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample) - failed = true - } - } - - if failed { - t.Fatalf("%d. Expected and actual outputs don't match:\n%v", i, vectorComparisonString(expectedLines, actualLines)) - } - } -}