Browse Source

Fix auto reload when a config file with a syntax error is reverted

When we had a syntax error but restored the old file, we did not
re-trigger the config reload, so the config reload metric was showing
that config reload was unsucessful.

I made magic to handle logs in cmd/prometheus.

For now it is a separate file so we can backport this easily.

I will generalize the helper in another PR.

Signed-off-by: Julien <roidelapluie@o11y.eu>
pull/15388/head
Julien 2 months ago
parent
commit
b93aec077c
  1. 11
      cmd/prometheus/main.go
  2. 229
      cmd/prometheus/reload_test.go

11
cmd/prometheus/main.go

@ -1152,9 +1152,8 @@ func main() {
if err := reloadConfig(cfg.configFile, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, callback, reloaders...); err != nil {
logger.Error("Error reloading config", "err", err)
} else if cfg.enableAutoReload {
if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil {
checksum = currentChecksum
} else {
checksum, err = config.GenerateChecksum(cfg.configFile)
if err != nil {
logger.Error("Failed to generate checksum during configuration reload", "err", err)
}
}
@ -1165,9 +1164,8 @@ func main() {
} else {
rc <- nil
if cfg.enableAutoReload {
if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil {
checksum = currentChecksum
} else {
checksum, err = config.GenerateChecksum(cfg.configFile)
if err != nil {
logger.Error("Failed to generate checksum during configuration reload", "err", err)
}
}
@ -1178,6 +1176,7 @@ func main() {
}
currentChecksum, err := config.GenerateChecksum(cfg.configFile)
if err != nil {
checksum = currentChecksum
logger.Error("Failed to generate checksum during configuration reload", "err", err)
} else if currentChecksum == checksum {
continue

229
cmd/prometheus/reload_test.go

@ -0,0 +1,229 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"bufio"
"encoding/json"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/util/testutil"
)
const configReloadMetric = "prometheus_config_last_reload_successful"
func TestAutoReloadConfig_ValidToValid(t *testing.T) {
steps := []struct {
configText string
expectedInterval string
expectedMetric float64
}{
{
configText: `
global:
scrape_interval: 30s
`,
expectedInterval: "30s",
expectedMetric: 1,
},
{
configText: `
global:
scrape_interval: 15s
`,
expectedInterval: "15s",
expectedMetric: 1,
},
{
configText: `
global:
scrape_interval: 30s
`,
expectedInterval: "30s",
expectedMetric: 1,
},
}
runTestSteps(t, steps)
}
func TestAutoReloadConfig_ValidToInvalidToValid(t *testing.T) {
steps := []struct {
configText string
expectedInterval string
expectedMetric float64
}{
{
configText: `
global:
scrape_interval: 30s
`,
expectedInterval: "30s",
expectedMetric: 1,
},
{
configText: `
global:
scrape_interval: 15s
invalid_syntax
`,
expectedInterval: "30s",
expectedMetric: 0,
},
{
configText: `
global:
scrape_interval: 30s
`,
expectedInterval: "30s",
expectedMetric: 1,
},
}
runTestSteps(t, steps)
}
func runTestSteps(t *testing.T, steps []struct {
configText string
expectedInterval string
expectedMetric float64
},
) {
configDir := t.TempDir()
configFilePath := filepath.Join(configDir, "prometheus.yml")
t.Logf("Config file path: %s", configFilePath)
require.NoError(t, os.WriteFile(configFilePath, []byte(steps[0].configText), 0o644), "Failed to write initial config file")
port := testutil.RandomUnprivilegedPort(t)
runPrometheusWithLogging(t, configFilePath, port)
baseURL := "http://localhost:" + strconv.Itoa(port)
require.Eventually(t, func() bool {
resp, err := http.Get(baseURL + "/-/ready")
if err != nil {
return false
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK
}, 5*time.Second, 100*time.Millisecond, "Prometheus didn't become ready in time")
for i, step := range steps {
t.Logf("Step %d", i)
require.NoError(t, os.WriteFile(configFilePath, []byte(step.configText), 0o644), "Failed to write config file for step")
require.Eventually(t, func() bool {
return verifyScrapeInterval(t, baseURL, step.expectedInterval) &&
verifyConfigReloadMetric(t, baseURL, step.expectedMetric)
}, 10*time.Second, 500*time.Millisecond, "Prometheus config reload didn't happen in time")
}
}
func verifyScrapeInterval(t *testing.T, baseURL, expectedInterval string) bool {
resp, err := http.Get(baseURL + "/api/v1/status/config")
require.NoError(t, err)
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
require.NoError(t, err)
config := struct {
Data struct {
YAML string `json:"yaml"`
} `json:"data"`
}{}
require.NoError(t, json.Unmarshal(body, &config))
return strings.Contains(config.Data.YAML, "scrape_interval: "+expectedInterval)
}
func verifyConfigReloadMetric(t *testing.T, baseURL string, expectedValue float64) bool {
resp, err := http.Get(baseURL + "/metrics")
require.NoError(t, err)
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
require.NoError(t, err)
lines := string(body)
var actualValue float64
found := false
for _, line := range strings.Split(lines, "\n") {
if strings.HasPrefix(line, configReloadMetric) {
parts := strings.Fields(line)
if len(parts) >= 2 {
actualValue, err = strconv.ParseFloat(parts[1], 64)
require.NoError(t, err)
found = true
break
}
}
}
return found && actualValue == expectedValue
}
func captureLogsToTLog(t *testing.T, r io.Reader) {
scanner := bufio.NewScanner(r)
for scanner.Scan() {
t.Log(scanner.Text())
}
if err := scanner.Err(); err != nil {
t.Logf("Error reading logs: %v", err)
}
}
func runPrometheusWithLogging(t *testing.T, configFilePath string, port int) {
stdoutPipe, stdoutWriter := io.Pipe()
stderrPipe, stderrWriter := io.Pipe()
var wg sync.WaitGroup
wg.Add(2)
prom := exec.Command(promPath, "-test.main", "--enable-feature=auto-reload-config", "--config.file="+configFilePath, "--config.auto-reload-interval=1s", "--web.listen-address=0.0.0.0:"+strconv.Itoa(port))
prom.Stdout = stdoutWriter
prom.Stderr = stderrWriter
go func() {
defer wg.Done()
captureLogsToTLog(t, stdoutPipe)
}()
go func() {
defer wg.Done()
captureLogsToTLog(t, stderrPipe)
}()
t.Cleanup(func() {
prom.Process.Kill()
prom.Wait()
stdoutWriter.Close()
stderrWriter.Close()
wg.Wait()
})
require.NoError(t, prom.Start())
}
Loading…
Cancel
Save