From 9d9e86b33360c57b30aec2d346ec36e45bcf4982 Mon Sep 17 00:00:00 2001 From: Julien Date: Mon, 30 Sep 2024 12:55:11 +0200 Subject: [PATCH] Fix auto reload when a config file with a syntax error is reverted When we had a syntax error but restored the old file, we did not re-trigger the config reload, so the config reload metric was showing that config reload was unsucessful. I made magic to handle logs in cmd/prometheus. For now it is a separate file so we can backport this easily. I will generalize the helper in another PR. Signed-off-by: Julien --- cmd/prometheus/main.go | 11 +- cmd/prometheus/reload_test.go | 229 ++++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 6 deletions(-) create mode 100644 cmd/prometheus/reload_test.go diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 8fb6d4d38..af44331b0 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -1153,9 +1153,8 @@ func main() { if err := reloadConfig(cfg.configFile, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, callback, reloaders...); err != nil { logger.Error("Error reloading config", "err", err) } else if cfg.enableAutoReload { - if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil { - checksum = currentChecksum - } else { + checksum, err = config.GenerateChecksum(cfg.configFile) + if err != nil { logger.Error("Failed to generate checksum during configuration reload", "err", err) } } @@ -1166,9 +1165,8 @@ func main() { } else { rc <- nil if cfg.enableAutoReload { - if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil { - checksum = currentChecksum - } else { + checksum, err = config.GenerateChecksum(cfg.configFile) + if err != nil { logger.Error("Failed to generate checksum during configuration reload", "err", err) } } @@ -1179,6 +1177,7 @@ func main() { } currentChecksum, err := config.GenerateChecksum(cfg.configFile) if err != nil { + checksum = currentChecksum logger.Error("Failed to generate checksum during configuration reload", "err", err) } else if currentChecksum == checksum { continue diff --git a/cmd/prometheus/reload_test.go b/cmd/prometheus/reload_test.go new file mode 100644 index 000000000..18a7ff2ad --- /dev/null +++ b/cmd/prometheus/reload_test.go @@ -0,0 +1,229 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "bufio" + "encoding/json" + "io" + "net/http" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/util/testutil" +) + +const configReloadMetric = "prometheus_config_last_reload_successful" + +func TestAutoReloadConfig_ValidToValid(t *testing.T) { + steps := []struct { + configText string + expectedInterval string + expectedMetric float64 + }{ + { + configText: ` +global: + scrape_interval: 30s +`, + expectedInterval: "30s", + expectedMetric: 1, + }, + { + configText: ` +global: + scrape_interval: 15s +`, + expectedInterval: "15s", + expectedMetric: 1, + }, + { + configText: ` +global: + scrape_interval: 30s +`, + expectedInterval: "30s", + expectedMetric: 1, + }, + } + + runTestSteps(t, steps) +} + +func TestAutoReloadConfig_ValidToInvalidToValid(t *testing.T) { + steps := []struct { + configText string + expectedInterval string + expectedMetric float64 + }{ + { + configText: ` +global: + scrape_interval: 30s +`, + expectedInterval: "30s", + expectedMetric: 1, + }, + { + configText: ` +global: + scrape_interval: 15s +invalid_syntax +`, + expectedInterval: "30s", + expectedMetric: 0, + }, + { + configText: ` +global: + scrape_interval: 30s +`, + expectedInterval: "30s", + expectedMetric: 1, + }, + } + + runTestSteps(t, steps) +} + +func runTestSteps(t *testing.T, steps []struct { + configText string + expectedInterval string + expectedMetric float64 +}, +) { + configDir := t.TempDir() + configFilePath := filepath.Join(configDir, "prometheus.yml") + + t.Logf("Config file path: %s", configFilePath) + + require.NoError(t, os.WriteFile(configFilePath, []byte(steps[0].configText), 0o644), "Failed to write initial config file") + + port := testutil.RandomUnprivilegedPort(t) + runPrometheusWithLogging(t, configFilePath, port) + + baseURL := "http://localhost:" + strconv.Itoa(port) + require.Eventually(t, func() bool { + resp, err := http.Get(baseURL + "/-/ready") + if err != nil { + return false + } + defer resp.Body.Close() + return resp.StatusCode == http.StatusOK + }, 5*time.Second, 100*time.Millisecond, "Prometheus didn't become ready in time") + + for i, step := range steps { + t.Logf("Step %d", i) + require.NoError(t, os.WriteFile(configFilePath, []byte(step.configText), 0o644), "Failed to write config file for step") + + require.Eventually(t, func() bool { + return verifyScrapeInterval(t, baseURL, step.expectedInterval) && + verifyConfigReloadMetric(t, baseURL, step.expectedMetric) + }, 10*time.Second, 500*time.Millisecond, "Prometheus config reload didn't happen in time") + } +} + +func verifyScrapeInterval(t *testing.T, baseURL, expectedInterval string) bool { + resp, err := http.Get(baseURL + "/api/v1/status/config") + require.NoError(t, err) + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + config := struct { + Data struct { + YAML string `json:"yaml"` + } `json:"data"` + }{} + + require.NoError(t, json.Unmarshal(body, &config)) + return strings.Contains(config.Data.YAML, "scrape_interval: "+expectedInterval) +} + +func verifyConfigReloadMetric(t *testing.T, baseURL string, expectedValue float64) bool { + resp, err := http.Get(baseURL + "/metrics") + require.NoError(t, err) + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + lines := string(body) + var actualValue float64 + found := false + + for _, line := range strings.Split(lines, "\n") { + if strings.HasPrefix(line, configReloadMetric) { + parts := strings.Fields(line) + if len(parts) >= 2 { + actualValue, err = strconv.ParseFloat(parts[1], 64) + require.NoError(t, err) + found = true + break + } + } + } + + return found && actualValue == expectedValue +} + +func captureLogsToTLog(t *testing.T, r io.Reader) { + scanner := bufio.NewScanner(r) + for scanner.Scan() { + t.Log(scanner.Text()) + } + if err := scanner.Err(); err != nil { + t.Logf("Error reading logs: %v", err) + } +} + +func runPrometheusWithLogging(t *testing.T, configFilePath string, port int) { + stdoutPipe, stdoutWriter := io.Pipe() + stderrPipe, stderrWriter := io.Pipe() + + var wg sync.WaitGroup + wg.Add(2) + + prom := exec.Command(promPath, "-test.main", "--enable-feature=auto-reload-config", "--config.file="+configFilePath, "--config.auto-reload-interval=1s", "--web.listen-address=0.0.0.0:"+strconv.Itoa(port)) + prom.Stdout = stdoutWriter + prom.Stderr = stderrWriter + + go func() { + defer wg.Done() + captureLogsToTLog(t, stdoutPipe) + }() + go func() { + defer wg.Done() + captureLogsToTLog(t, stderrPipe) + }() + + t.Cleanup(func() { + prom.Process.Kill() + prom.Wait() + stdoutWriter.Close() + stderrWriter.Close() + wg.Wait() + }) + + require.NoError(t, prom.Start()) +}