From 1cd2d0498b287a5698e3eda8fa7979643b752265 Mon Sep 17 00:00:00 2001 From: Julien Date: Fri, 30 Aug 2024 13:25:02 +0200 Subject: [PATCH] Support reload config automatically Signed-off-by: Julien --- cmd/prometheus/main.go | 55 +++++++- config/reload.go | 86 ++++++++++++ config/reload_test.go | 225 ++++++++++++++++++++++++++++++++ docs/command-line/prometheus.md | 3 +- docs/feature_flags.md | 12 ++ 5 files changed, 379 insertions(+), 2 deletions(-) create mode 100644 config/reload.go create mode 100644 config/reload_test.go diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 65ffd7de5..26f96db53 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -154,6 +154,9 @@ type flagConfig struct { RemoteFlushDeadline model.Duration nameEscapingScheme string + enableAutoReload bool + autoReloadInterval model.Duration + featureList []string memlimitRatio float64 // These options are extracted from featureList @@ -212,6 +215,12 @@ func (c *flagConfig) setFeatureListOptions(logger log.Logger) error { case "auto-gomaxprocs": c.enableAutoGOMAXPROCS = true level.Info(logger).Log("msg", "Automatically set GOMAXPROCS to match Linux container CPU quota") + case "auto-reload-config": + c.enableAutoReload = true + if s := time.Duration(c.autoReloadInterval).Seconds(); s > 0 && s < 1 { + c.autoReloadInterval, _ = model.ParseDuration("1s") + } + level.Info(logger).Log("msg", fmt.Sprintf("Enabled automatic configuration file reloading. Checking for configuration changes every %s.", c.autoReloadInterval)) case "auto-gomemlimit": c.enableAutoGOMEMLIMIT = true level.Info(logger).Log("msg", "Automatically set GOMEMLIMIT to match Linux container or system memory limit") @@ -302,6 +311,9 @@ func main() { a.Flag("config.file", "Prometheus configuration file path."). Default("prometheus.yml").StringVar(&cfg.configFile) + a.Flag("config.auto-reload-interval", "Specifies the interval for checking and automatically reloading the Prometheus configuration file upon detecting changes."). + Default("30s").SetValue(&cfg.autoReloadInterval) + a.Flag("web.listen-address", "Address to listen on for UI, API, and telemetry. Can be repeated."). Default("0.0.0.0:9090").StringsVar(&cfg.web.ListenAddresses) @@ -492,7 +504,7 @@ func main() { a.Flag("scrape.name-escaping-scheme", `Method for escaping legacy invalid names when sending to Prometheus that does not support UTF-8. Can be one of "values", "underscores", or "dots".`).Default(scrape.DefaultNameEscapingScheme.String()).StringVar(&cfg.nameEscapingScheme) - a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). + a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, auto-reload-config, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). Default("").StringsVar(&cfg.featureList) promlogflag.AddFlags(a, &cfg.promlogConfig) @@ -1120,6 +1132,15 @@ func main() { hup := make(chan os.Signal, 1) signal.Notify(hup, syscall.SIGHUP) cancel := make(chan struct{}) + + var checksum string + if cfg.enableAutoReload { + checksum, err = config.GenerateChecksum(cfg.configFile) + if err != nil { + level.Error(logger).Log("msg", "Failed to generate initial checksum for configuration file", "err", err) + } + } + g.Add( func() error { <-reloadReady.C @@ -1129,6 +1150,12 @@ func main() { case <-hup: if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { level.Error(logger).Log("msg", "Error reloading config", "err", err) + } else if cfg.enableAutoReload { + if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil { + checksum = currentChecksum + } else { + level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err) + } } case rc := <-webHandler.Reload(): if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { @@ -1137,6 +1164,32 @@ func main() { } else { rc <- nil } + if cfg.enableAutoReload { + if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil { + checksum = currentChecksum + } else { + level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err) + } + } + case <-time.Tick(time.Duration(cfg.autoReloadInterval)): + if !cfg.enableAutoReload { + continue + } + currentChecksum, err := config.GenerateChecksum(cfg.configFile) + if err != nil { + level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err) + continue + } + if currentChecksum == checksum { + continue + } + level.Info(logger).Log("msg", "Configuration file change detected, reloading the configuration.") + + if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { + level.Error(logger).Log("msg", "Error reloading config", "err", err) + } else { + checksum = currentChecksum + } case <-cancel: return nil } diff --git a/config/reload.go b/config/reload.go new file mode 100644 index 000000000..2f03ca463 --- /dev/null +++ b/config/reload.go @@ -0,0 +1,86 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package config + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "os" + "path/filepath" + + "gopkg.in/yaml.v2" +) + +type ExternalFilesConfig struct { + RuleFiles []string `yaml:"rule_files"` + ScrapeConfigFiles []string `yaml:"scrape_config_files"` +} + +// GenerateChecksum generates a checksum of the YAML file and the files it references. +func GenerateChecksum(yamlFilePath string) (string, error) { + hash := sha256.New() + + yamlContent, err := os.ReadFile(yamlFilePath) + if err != nil { + return "", fmt.Errorf("error reading YAML file: %w", err) + } + _, _ = hash.Write(yamlContent) + + var config ExternalFilesConfig + if err := yaml.Unmarshal(yamlContent, &config); err != nil { + return "", fmt.Errorf("error unmarshalling YAML: %w", err) + } + + dir := filepath.Dir(yamlFilePath) + + for i, file := range config.RuleFiles { + config.RuleFiles[i] = filepath.Join(dir, file) + } + for i, file := range config.ScrapeConfigFiles { + config.ScrapeConfigFiles[i] = filepath.Join(dir, file) + } + + files := map[string][]string{ + "r": config.RuleFiles, // "r" for rule files + "s": config.ScrapeConfigFiles, // "s" for scrape config files + } + + for _, prefix := range []string{"r", "s"} { + for _, pattern := range files[prefix] { + matchingFiles, err := filepath.Glob(pattern) + if err != nil { + return "", fmt.Errorf("error finding files with pattern %s: %w", pattern, err) + } + + for _, file := range matchingFiles { + // Write prefix to the hash ("r" or "s") followed by \0. + _, _ = hash.Write([]byte(prefix + "\x00")) + + // Write the file path to the hash, followed by \0 to ensure + // separation. + _, _ = hash.Write([]byte(file + "\x00")) + + // Read and hash the content of the file. + content, err := os.ReadFile(file) + if err != nil { + return "", fmt.Errorf("error reading file %s: %w", file, err) + } + _, _ = hash.Write(append(content, []byte("\x00")...)) + } + } + } + + return hex.EncodeToString(hash.Sum(nil)), nil +} diff --git a/config/reload_test.go b/config/reload_test.go new file mode 100644 index 000000000..0d8630911 --- /dev/null +++ b/config/reload_test.go @@ -0,0 +1,225 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package config + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestGenerateChecksum(t *testing.T) { + // Create a temporary directory to hold the test files. + tmpDir, err := os.MkdirTemp("", "checksum_test") + require.NoError(t, err) + defer os.RemoveAll(tmpDir) // Clean up. + + // Define paths for the temporary files. + yamlFilePath := filepath.Join(tmpDir, "test.yml") + ruleFilePath := filepath.Join(tmpDir, "rule_file.yml") + scrapeConfigFilePath := filepath.Join(tmpDir, "scrape_config.yml") + + // Define initial and modified content for the files. + originalRuleContent := "groups:\n- name: example\n rules:\n - alert: ExampleAlert" + modifiedRuleContent := "groups:\n- name: example\n rules:\n - alert: ModifiedAlert" + + originalScrapeConfigContent := "scrape_configs:\n- job_name: example" + modifiedScrapeConfigContent := "scrape_configs:\n- job_name: modified_example" + + // Define YAML content referencing the rule and scrape config files. + yamlContent := ` +rule_files: + - rule_file.yml +scrape_config_files: + - scrape_config.yml +` + + // Write initial content to files. + require.NoError(t, os.WriteFile(ruleFilePath, []byte(originalRuleContent), 0o644)) + require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(originalScrapeConfigContent), 0o644)) + require.NoError(t, os.WriteFile(yamlFilePath, []byte(yamlContent), 0o644)) + + // Generate the original checksum. + originalChecksum := calculateChecksum(t, yamlFilePath) + + t.Run("Rule File Change", func(t *testing.T) { + // Modify the rule file. + require.NoError(t, os.WriteFile(ruleFilePath, []byte(modifiedRuleContent), 0o644)) + + // Checksum should change. + modifiedChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, modifiedChecksum) + + // Revert the rule file. + require.NoError(t, os.WriteFile(ruleFilePath, []byte(originalRuleContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) + + t.Run("Scrape Config Change", func(t *testing.T) { + // Modify the scrape config file. + require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(modifiedScrapeConfigContent), 0o644)) + + // Checksum should change. + modifiedChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, modifiedChecksum) + + // Revert the scrape config file. + require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(originalScrapeConfigContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) + + t.Run("Rule File Deletion", func(t *testing.T) { + // Delete the rule file. + require.NoError(t, os.Remove(ruleFilePath)) + + // Checksum should change. + deletedChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, deletedChecksum) + + // Restore the rule file. + require.NoError(t, os.WriteFile(ruleFilePath, []byte(originalRuleContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) + + t.Run("Scrape Config Deletion", func(t *testing.T) { + // Delete the scrape config file. + require.NoError(t, os.Remove(scrapeConfigFilePath)) + + // Checksum should change. + deletedChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, deletedChecksum) + + // Restore the scrape config file. + require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(originalScrapeConfigContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) + + t.Run("Main File Change", func(t *testing.T) { + // Modify the main YAML file. + modifiedYamlContent := ` +global: + scrape_interval: 3s +rule_files: + - rule_file.yml +scrape_config_files: + - scrape_config.yml +` + require.NoError(t, os.WriteFile(yamlFilePath, []byte(modifiedYamlContent), 0o644)) + + // Checksum should change. + modifiedChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, modifiedChecksum) + + // Revert the main YAML file. + require.NoError(t, os.WriteFile(yamlFilePath, []byte(yamlContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) + + t.Run("Rule File Removed from YAML Config", func(t *testing.T) { + // Modify the YAML content to remove the rule file. + modifiedYamlContent := ` +scrape_config_files: + - scrape_config.yml +` + require.NoError(t, os.WriteFile(yamlFilePath, []byte(modifiedYamlContent), 0o644)) + + // Checksum should change. + modifiedChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, modifiedChecksum) + + // Revert the YAML content. + require.NoError(t, os.WriteFile(yamlFilePath, []byte(yamlContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) + + t.Run("Scrape Config Removed from YAML Config", func(t *testing.T) { + // Modify the YAML content to remove the scrape config file. + modifiedYamlContent := ` +rule_files: + - rule_file.yml +` + require.NoError(t, os.WriteFile(yamlFilePath, []byte(modifiedYamlContent), 0o644)) + + // Checksum should change. + modifiedChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, modifiedChecksum) + + // Revert the YAML content. + require.NoError(t, os.WriteFile(yamlFilePath, []byte(yamlContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) + + t.Run("Empty Rule File", func(t *testing.T) { + // Write an empty rule file. + require.NoError(t, os.WriteFile(ruleFilePath, []byte(""), 0o644)) + + // Checksum should change. + emptyChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, emptyChecksum) + + // Restore the rule file. + require.NoError(t, os.WriteFile(ruleFilePath, []byte(originalRuleContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) + + t.Run("Empty Scrape Config File", func(t *testing.T) { + // Write an empty scrape config file. + require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(""), 0o644)) + + // Checksum should change. + emptyChecksum := calculateChecksum(t, yamlFilePath) + require.NotEqual(t, originalChecksum, emptyChecksum) + + // Restore the scrape config file. + require.NoError(t, os.WriteFile(scrapeConfigFilePath, []byte(originalScrapeConfigContent), 0o644)) + + // Checksum should return to the original. + revertedChecksum := calculateChecksum(t, yamlFilePath) + require.Equal(t, originalChecksum, revertedChecksum) + }) +} + +// calculateChecksum generates a checksum for the given YAML file path. +func calculateChecksum(t *testing.T, yamlFilePath string) string { + checksum, err := GenerateChecksum(yamlFilePath) + require.NoError(t, err) + require.NotEmpty(t, checksum) + return checksum +} diff --git a/docs/command-line/prometheus.md b/docs/command-line/prometheus.md index 7d9e5a3c8..8fefa8ecc 100644 --- a/docs/command-line/prometheus.md +++ b/docs/command-line/prometheus.md @@ -15,6 +15,7 @@ The Prometheus monitoring server | -h, --help | Show context-sensitive help (also try --help-long and --help-man). | | | --version | Show application version. | | | --config.file | Prometheus configuration file path. | `prometheus.yml` | +| --config.auto-reload-interval | Specifies the interval for checking and automatically reloading the Prometheus configuration file upon detecting changes. | `30s` | | --web.listen-address ... | Address to listen on for UI, API, and telemetry. Can be repeated. | `0.0.0.0:9090` | | --auto-gomemlimit.ratio | The ratio of reserved GOMEMLIMIT memory to the detected maximum container or system memory | `0.9` | | --web.config.file | [EXPERIMENTAL] Path to configuration file that can enable TLS or authentication. | | @@ -57,7 +58,7 @@ The Prometheus monitoring server | --query.max-concurrency | Maximum number of queries executed concurrently. Use with server mode only. | `20` | | --query.max-samples | Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return. Use with server mode only. | `50000000` | | --scrape.name-escaping-scheme | Method for escaping legacy invalid names when sending to Prometheus that does not support UTF-8. Can be one of "values", "underscores", or "dots". | `values` | -| --enable-feature ... | Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | +| --enable-feature ... | Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, auto-reload-config, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | | --log.level | Only log messages with the given severity or above. One of: [debug, info, warn, error] | `info` | | --log.format | Output format of log messages. One of: [logfmt, json] | `logfmt` | diff --git a/docs/feature_flags.md b/docs/feature_flags.md index 7b07a04d0..1ef9efb9b 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -265,3 +265,15 @@ This allows optionally preserving the `__name__` label via the `label_replace` a When enabled, changes the metric and label name validation scheme inside Prometheus to allow the full UTF-8 character set. By itself, this flag does not enable the request of UTF-8 names via content negotiation. Users will also have to set `metric_name_validation_scheme` in scrape configs to enable the feature either on the global config or on a per-scrape config basis. + +## Auto Reload Config + +`--enable-feature=auto-reload-config` + +When enabled, Prometheus will automatically reload its configuration file at a +specified interval. The interval is defined by the +`--config.auto-reload-interval` flag, which defaults to `30s`. + +Configuration reloads are triggered by detecting changes in the checksum of the +main configuration file or any referenced files, such as rule and scrape +configurations.