mirror of https://github.com/prometheus/prometheus
Scrape: Add scrape_failure_log_file to log Scrape Failures
Signed-off-by: Julien <roidelapluie@o11y.eu>
pull/14734/head
parent
70bb219d33
commit
ce0f09b125
|
@ -755,6 +755,7 @@ func main() {
|
|||
scrapeManager, err := scrape.NewManager(
|
||||
&cfg.scrape,
|
||||
log.With(logger, "component", "scrape manager"),
|
||||
func(s string) (log.Logger, error) { return logging.NewJSONFileLogger(s) },
|
||||
fanoutStorage,
|
||||
prometheus.DefaultRegisterer,
|
||||
)
|
||||
|
|
|
@ -0,0 +1,193 @@
|
|||
// Copyright 2024 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/atomic"
|
||||
|
||||
"github.com/prometheus/prometheus/util/testutil"
|
||||
)
|
||||
|
||||
func TestScrapeFailureLogFile(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping test in short mode.")
|
||||
}
|
||||
|
||||
// Tracks the number of requests made to the mock server.
|
||||
var requestCount atomic.Int32
|
||||
|
||||
// Starts a server that always returns HTTP 500 errors.
|
||||
mockServerAddress := startGarbageServer(t, &requestCount)
|
||||
|
||||
// Create a temporary directory for Prometheus configuration and logs.
|
||||
tempDir := t.TempDir()
|
||||
|
||||
// Define file paths for the scrape failure log and Prometheus configuration.
|
||||
// Like other files, the scrape failure log file should be relative to the
|
||||
// config file. Therefore, we split the name we put in the file and the full
|
||||
// path used to check the content of the file.
|
||||
scrapeFailureLogFileName := "scrape_failure.log"
|
||||
scrapeFailureLogFile := filepath.Join(tempDir, scrapeFailureLogFileName)
|
||||
promConfigFile := filepath.Join(tempDir, "prometheus.yml")
|
||||
|
||||
// Step 1: Set up an initial Prometheus configuration that globally
|
||||
// specifies a scrape failure log file.
|
||||
promConfig := fmt.Sprintf(`
|
||||
global:
|
||||
scrape_interval: 500ms
|
||||
scrape_failure_log_file: %s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'test_job'
|
||||
static_configs:
|
||||
- targets: ['%s']
|
||||
`, scrapeFailureLogFileName, mockServerAddress)
|
||||
|
||||
err := os.WriteFile(promConfigFile, []byte(promConfig), 0o644)
|
||||
require.NoError(t, err, "Failed to write Prometheus configuration file")
|
||||
|
||||
// Start Prometheus with the generated configuration and a random port, enabling the lifecycle API.
|
||||
port := testutil.RandomUnprivilegedPort(t)
|
||||
params := []string{
|
||||
"-test.main",
|
||||
"--config.file=" + promConfigFile,
|
||||
"--storage.tsdb.path=" + filepath.Join(tempDir, "data"),
|
||||
fmt.Sprintf("--web.listen-address=127.0.0.1:%d", port),
|
||||
"--web.enable-lifecycle",
|
||||
}
|
||||
prometheusProcess := exec.Command(promPath, params...)
|
||||
prometheusProcess.Stdout = os.Stdout
|
||||
prometheusProcess.Stderr = os.Stderr
|
||||
|
||||
err = prometheusProcess.Start()
|
||||
require.NoError(t, err, "Failed to start Prometheus")
|
||||
defer prometheusProcess.Process.Kill()
|
||||
|
||||
// Wait until the mock server receives at least two requests from Prometheus.
|
||||
require.Eventually(t, func() bool {
|
||||
return requestCount.Load() >= 2
|
||||
}, 30*time.Second, 500*time.Millisecond, "Expected at least two requests to the mock server")
|
||||
|
||||
// Verify that the scrape failures have been logged to the specified file.
|
||||
content, err := os.ReadFile(scrapeFailureLogFile)
|
||||
require.NoError(t, err, "Failed to read scrape failure log")
|
||||
require.Contains(t, string(content), "server returned HTTP status 500 Internal Server Error", "Expected scrape failure log entry not found")
|
||||
|
||||
// Step 2: Update the Prometheus configuration to remove the scrape failure
|
||||
// log file setting.
|
||||
promConfig = fmt.Sprintf(`
|
||||
global:
|
||||
scrape_interval: 1s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'test_job'
|
||||
static_configs:
|
||||
- targets: ['%s']
|
||||
`, mockServerAddress)
|
||||
|
||||
err = os.WriteFile(promConfigFile, []byte(promConfig), 0o644)
|
||||
require.NoError(t, err, "Failed to update Prometheus configuration file")
|
||||
|
||||
// Reload Prometheus with the updated configuration.
|
||||
reloadPrometheus(t, port)
|
||||
|
||||
// Count the number of lines in the scrape failure log file before any
|
||||
// further requests.
|
||||
preReloadLogLineCount := countLinesInFile(scrapeFailureLogFile)
|
||||
|
||||
// Wait for at least two more requests to the mock server to ensure
|
||||
// Prometheus continues scraping.
|
||||
requestsBeforeReload := requestCount.Load()
|
||||
require.Eventually(t, func() bool {
|
||||
return requestCount.Load() >= requestsBeforeReload+2
|
||||
}, 30*time.Second, 500*time.Millisecond, "Expected two more requests to the mock server after configuration reload")
|
||||
|
||||
// Ensure that no new lines were added to the scrape failure log file after
|
||||
// the configuration change.
|
||||
require.Equal(t, preReloadLogLineCount, countLinesInFile(scrapeFailureLogFile), "No new lines should be added to the scrape failure log file after removing the log setting")
|
||||
|
||||
// Step 3: Re-add the scrape failure log file setting, but this time under
|
||||
// scrape_configs, and reload Prometheus.
|
||||
promConfig = fmt.Sprintf(`
|
||||
global:
|
||||
scrape_interval: 1s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'test_job'
|
||||
scrape_failure_log_file: %s
|
||||
static_configs:
|
||||
- targets: ['%s']
|
||||
`, scrapeFailureLogFileName, mockServerAddress)
|
||||
|
||||
err = os.WriteFile(promConfigFile, []byte(promConfig), 0o644)
|
||||
require.NoError(t, err, "Failed to update Prometheus configuration file")
|
||||
|
||||
// Reload Prometheus with the updated configuration.
|
||||
reloadPrometheus(t, port)
|
||||
|
||||
// Wait for at least two more requests to the mock server and verify that
|
||||
// new log entries are created.
|
||||
postReloadLogLineCount := countLinesInFile(scrapeFailureLogFile)
|
||||
requestsBeforeReAddingLog := requestCount.Load()
|
||||
require.Eventually(t, func() bool {
|
||||
return requestCount.Load() >= requestsBeforeReAddingLog+2
|
||||
}, 30*time.Second, 500*time.Millisecond, "Expected two additional requests after re-adding the log setting")
|
||||
|
||||
// Confirm that new lines were added to the scrape failure log file.
|
||||
require.Greater(t, countLinesInFile(scrapeFailureLogFile), postReloadLogLineCount, "New lines should be added to the scrape failure log file after re-adding the log setting")
|
||||
}
|
||||
|
||||
// reloadPrometheus sends a reload request to the Prometheus server to apply
|
||||
// updated configurations.
|
||||
func reloadPrometheus(t *testing.T, port int) {
|
||||
resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/-/reload", port), "", nil)
|
||||
require.NoError(t, err, "Failed to reload Prometheus")
|
||||
require.Equal(t, http.StatusOK, resp.StatusCode, "Unexpected status code when reloading Prometheus")
|
||||
}
|
||||
|
||||
// startGarbageServer sets up a mock server that returns a 500 Internal Server Error
|
||||
// for all requests. It also increments the request count each time it's hit.
|
||||
func startGarbageServer(t *testing.T, requestCount *atomic.Int32) string {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
requestCount.Inc()
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}))
|
||||
t.Cleanup(server.Close)
|
||||
|
||||
parsedURL, err := url.Parse(server.URL)
|
||||
require.NoError(t, err, "Failed to parse mock server URL")
|
||||
|
||||
return parsedURL.Host
|
||||
}
|
||||
|
||||
// countLinesInFile counts and returns the number of lines in the specified
// file. Every newline-terminated line counts as one, and a non-empty final
// line without a trailing newline is counted as well. It returns 0 if the
// file doesn't exist or can't be read.
func countLinesInFile(filePath string) int {
	data, err := os.ReadFile(filePath)
	if err != nil {
		return 0 // Best effort: a missing or unreadable file has no lines.
	}

	lines := bytes.Count(data, []byte{'\n'})
	// A trailing fragment that has not been newline-terminated is still a line.
	if len(data) > 0 && data[len(data)-1] != '\n' {
		lines++
	}
	return lines
}
|
|
@ -429,6 +429,8 @@ type GlobalConfig struct {
|
|||
RuleQueryOffset model.Duration `yaml:"rule_query_offset,omitempty"`
|
||||
// File to which PromQL queries are logged.
|
||||
QueryLogFile string `yaml:"query_log_file,omitempty"`
|
||||
// File to which scrape failures are logged.
|
||||
ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"`
|
||||
// The labels to add to any timeseries that this Prometheus instance scrapes.
|
||||
ExternalLabels labels.Labels `yaml:"external_labels,omitempty"`
|
||||
// An uncompressed response body larger than this many bytes will cause the
|
||||
|
@ -529,6 +531,7 @@ func validateAcceptScrapeProtocols(sps []ScrapeProtocol) error {
|
|||
// SetDirectory joins any relative file paths with dir.
|
||||
func (c *GlobalConfig) SetDirectory(dir string) {
|
||||
c.QueryLogFile = config.JoinDir(dir, c.QueryLogFile)
|
||||
c.ScrapeFailureLogFile = config.JoinDir(dir, c.ScrapeFailureLogFile)
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
|
@ -591,6 +594,7 @@ func (c *GlobalConfig) isZero() bool {
|
|||
c.EvaluationInterval == 0 &&
|
||||
c.RuleQueryOffset == 0 &&
|
||||
c.QueryLogFile == "" &&
|
||||
c.ScrapeFailureLogFile == "" &&
|
||||
c.ScrapeProtocols == nil
|
||||
}
|
||||
|
||||
|
@ -632,6 +636,8 @@ type ScrapeConfig struct {
|
|||
ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"`
|
||||
// Whether to scrape a classic histogram that is also exposed as a native histogram.
|
||||
ScrapeClassicHistograms bool `yaml:"scrape_classic_histograms,omitempty"`
|
||||
// File to which scrape failures are logged.
|
||||
ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"`
|
||||
// The HTTP resource path on which to fetch metrics from targets.
|
||||
MetricsPath string `yaml:"metrics_path,omitempty"`
|
||||
// The URL scheme with which to fetch metrics from targets.
|
||||
|
@ -684,6 +690,7 @@ type ScrapeConfig struct {
|
|||
func (c *ScrapeConfig) SetDirectory(dir string) {
|
||||
c.ServiceDiscoveryConfigs.SetDirectory(dir)
|
||||
c.HTTPClientConfig.SetDirectory(dir)
|
||||
c.ScrapeFailureLogFile = config.JoinDir(dir, c.ScrapeFailureLogFile)
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
|
@ -765,6 +772,9 @@ func (c *ScrapeConfig) Validate(globalConfig GlobalConfig) error {
|
|||
if c.KeepDroppedTargets == 0 {
|
||||
c.KeepDroppedTargets = globalConfig.KeepDroppedTargets
|
||||
}
|
||||
if c.ScrapeFailureLogFile == "" {
|
||||
c.ScrapeFailureLogFile = globalConfig.ScrapeFailureLogFile
|
||||
}
|
||||
|
||||
if c.ScrapeProtocols == nil {
|
||||
c.ScrapeProtocols = globalConfig.ScrapeProtocols
|
||||
|
|
|
@ -78,14 +78,16 @@ const (
|
|||
globLabelNameLengthLimit = 200
|
||||
globLabelValueLengthLimit = 200
|
||||
globalGoGC = 42
|
||||
globScrapeFailureLogFile = "testdata/fail.log"
|
||||
)
|
||||
|
||||
var expectedConf = &Config{
|
||||
GlobalConfig: GlobalConfig{
|
||||
ScrapeInterval: model.Duration(15 * time.Second),
|
||||
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
|
||||
EvaluationInterval: model.Duration(30 * time.Second),
|
||||
QueryLogFile: "",
|
||||
ScrapeInterval: model.Duration(15 * time.Second),
|
||||
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
|
||||
EvaluationInterval: model.Duration(30 * time.Second),
|
||||
QueryLogFile: "testdata/query.log",
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
ExternalLabels: labels.FromStrings("foo", "bar", "monitor", "codelab"),
|
||||
|
||||
|
@ -211,6 +213,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: "testdata/fail_prom.log",
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -314,6 +317,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: 210,
|
||||
LabelValueLengthLimit: 210,
|
||||
ScrapeProtocols: []ScrapeProtocol{PrometheusText0_0_4},
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
HTTPClientConfig: config.HTTPClientConfig{
|
||||
BasicAuth: &config.BasicAuth{
|
||||
|
@ -411,6 +415,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -466,6 +471,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: "/metrics",
|
||||
Scheme: "http",
|
||||
|
@ -499,6 +505,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -538,6 +545,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -577,6 +585,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -606,6 +615,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -643,6 +653,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -677,6 +688,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -718,6 +730,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -749,6 +762,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -783,6 +797,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -810,6 +825,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -840,6 +856,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: "/federate",
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -870,6 +887,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -900,6 +918,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -927,6 +946,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -962,6 +982,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -996,6 +1017,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -1027,6 +1049,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -1057,6 +1080,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -1091,6 +1115,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -1128,6 +1153,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -1184,6 +1210,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -1211,6 +1238,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
HTTPClientConfig: config.DefaultHTTPClientConfig,
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
|
@ -1249,6 +1277,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
HTTPClientConfig: config.DefaultHTTPClientConfig,
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
|
@ -1293,6 +1322,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -1328,6 +1358,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
HTTPClientConfig: config.DefaultHTTPClientConfig,
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
|
@ -1357,6 +1388,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
@ -1389,6 +1421,7 @@ var expectedConf = &Config{
|
|||
LabelNameLengthLimit: globLabelNameLengthLimit,
|
||||
LabelValueLengthLimit: globLabelValueLengthLimit,
|
||||
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
|
||||
ScrapeFailureLogFile: globScrapeFailureLogFile,
|
||||
|
||||
MetricsPath: DefaultScrapeConfig.MetricsPath,
|
||||
Scheme: DefaultScrapeConfig.Scheme,
|
||||
|
|
|
@ -8,6 +8,8 @@ global:
|
|||
label_limit: 30
|
||||
label_name_length_limit: 200
|
||||
label_value_length_limit: 200
|
||||
query_log_file: query.log
|
||||
scrape_failure_log_file: fail.log
|
||||
# scrape_timeout is set to the global default (10s).
|
||||
|
||||
external_labels:
|
||||
|
@ -72,6 +74,7 @@ scrape_configs:
|
|||
# metrics_path defaults to '/metrics'
|
||||
# scheme defaults to 'http'.
|
||||
|
||||
scrape_failure_log_file: fail_prom.log
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- foo/*.slow.json
|
||||
|
|
|
@ -84,6 +84,10 @@ global:
|
|||
# Reloading the configuration will reopen the file.
|
||||
[ query_log_file: <string> ]
|
||||
|
||||
# File to which scrape failures are logged.
|
||||
# Reloading the configuration will reopen the file.
|
||||
[ scrape_failure_log_file: <string> ]
|
||||
|
||||
# An uncompressed response body larger than this many bytes will cause the
|
||||
# scrape to fail. 0 means no limit. Example: 100MB.
|
||||
# This is an experimental feature, this behaviour could
|
||||
|
@ -319,6 +323,10 @@ http_headers:
|
|||
# Files to read header values from.
|
||||
[ files: [<string>, ...] ] ]
|
||||
|
||||
# File to which scrape failures are logged.
|
||||
# Reloading the configuration will reopen the file.
|
||||
[ scrape_failure_log_file: <string> ]
|
||||
|
||||
# List of Azure service discovery configurations.
|
||||
azure_sd_configs:
|
||||
[ - <azure_sd_config> ... ]
|
||||
|
|
|
@ -17,6 +17,7 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"io"
|
||||
"reflect"
|
||||
"sync"
|
||||
"time"
|
||||
|
@ -36,7 +37,7 @@ import (
|
|||
)
|
||||
|
||||
// NewManager is the Manager constructor.
|
||||
func NewManager(o *Options, logger log.Logger, app storage.Appendable, registerer prometheus.Registerer) (*Manager, error) {
|
||||
func NewManager(o *Options, logger log.Logger, newScrapeFailureLogger func(string) (log.Logger, error), app storage.Appendable, registerer prometheus.Registerer) (*Manager, error) {
|
||||
if o == nil {
|
||||
o = &Options{}
|
||||
}
|
||||
|
@ -50,15 +51,16 @@ func NewManager(o *Options, logger log.Logger, app storage.Appendable, registere
|
|||
}
|
||||
|
||||
m := &Manager{
|
||||
append: app,
|
||||
opts: o,
|
||||
logger: logger,
|
||||
scrapeConfigs: make(map[string]*config.ScrapeConfig),
|
||||
scrapePools: make(map[string]*scrapePool),
|
||||
graceShut: make(chan struct{}),
|
||||
triggerReload: make(chan struct{}, 1),
|
||||
metrics: sm,
|
||||
buffers: pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) }),
|
||||
append: app,
|
||||
opts: o,
|
||||
logger: logger,
|
||||
newScrapeFailureLogger: newScrapeFailureLogger,
|
||||
scrapeConfigs: make(map[string]*config.ScrapeConfig),
|
||||
scrapePools: make(map[string]*scrapePool),
|
||||
graceShut: make(chan struct{}),
|
||||
triggerReload: make(chan struct{}, 1),
|
||||
metrics: sm,
|
||||
buffers: pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) }),
|
||||
}
|
||||
|
||||
m.metrics.setTargetMetadataCacheGatherer(m)
|
||||
|
@ -103,12 +105,14 @@ type Manager struct {
|
|||
append storage.Appendable
|
||||
graceShut chan struct{}
|
||||
|
||||
offsetSeed uint64 // Global offsetSeed seed is used to spread scrape workload across HA setup.
|
||||
mtxScrape sync.Mutex // Guards the fields below.
|
||||
scrapeConfigs map[string]*config.ScrapeConfig
|
||||
scrapePools map[string]*scrapePool
|
||||
targetSets map[string][]*targetgroup.Group
|
||||
buffers *pool.Pool
|
||||
offsetSeed uint64 // Global offsetSeed seed is used to spread scrape workload across HA setup.
|
||||
mtxScrape sync.Mutex // Guards the fields below.
|
||||
scrapeConfigs map[string]*config.ScrapeConfig
|
||||
scrapePools map[string]*scrapePool
|
||||
newScrapeFailureLogger func(string) (log.Logger, error)
|
||||
scrapeFailureLoggers map[string]log.Logger
|
||||
targetSets map[string][]*targetgroup.Group
|
||||
buffers *pool.Pool
|
||||
|
||||
triggerReload chan struct{}
|
||||
|
||||
|
@ -183,6 +187,11 @@ func (m *Manager) reload() {
|
|||
continue
|
||||
}
|
||||
m.scrapePools[setName] = sp
|
||||
if l, ok := m.scrapeFailureLoggers[scrapeConfig.ScrapeFailureLogFile]; ok {
|
||||
sp.SetScrapeFailureLogger(l)
|
||||
} else {
|
||||
level.Error(sp.logger).Log("msg", "No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", setName)
|
||||
}
|
||||
}
|
||||
|
||||
wg.Add(1)
|
||||
|
@ -238,11 +247,36 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error {
|
|||
}
|
||||
|
||||
c := make(map[string]*config.ScrapeConfig)
|
||||
scrapeFailureLoggers := map[string]log.Logger{
|
||||
"": nil, // Emptying the file name sets the scrape logger to nil.
|
||||
}
|
||||
for _, scfg := range scfgs {
|
||||
c[scfg.JobName] = scfg
|
||||
if _, ok := scrapeFailureLoggers[scfg.ScrapeFailureLogFile]; !ok {
|
||||
// We promise to reopen the file on each reload.
|
||||
var (
|
||||
l log.Logger
|
||||
err error
|
||||
)
|
||||
if m.newScrapeFailureLogger != nil {
|
||||
if l, err = m.newScrapeFailureLogger(scfg.ScrapeFailureLogFile); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
scrapeFailureLoggers[scfg.ScrapeFailureLogFile] = l
|
||||
}
|
||||
}
|
||||
m.scrapeConfigs = c
|
||||
|
||||
oldScrapeFailureLoggers := m.scrapeFailureLoggers
|
||||
for _, s := range oldScrapeFailureLoggers {
|
||||
if closer, ok := s.(io.Closer); ok {
|
||||
defer closer.Close()
|
||||
}
|
||||
}
|
||||
|
||||
m.scrapeFailureLoggers = scrapeFailureLoggers
|
||||
|
||||
if err := m.setOffsetSeed(cfg.GlobalConfig.ExternalLabels); err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -260,6 +294,13 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error {
|
|||
level.Error(m.logger).Log("msg", "error reloading scrape pool", "err", err, "scrape_pool", name)
|
||||
failed = true
|
||||
}
|
||||
fallthrough
|
||||
case ok:
|
||||
if l, ok := m.scrapeFailureLoggers[cfg.ScrapeFailureLogFile]; ok {
|
||||
sp.SetScrapeFailureLogger(l)
|
||||
} else {
|
||||
level.Error(sp.logger).Log("msg", "No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -511,7 +511,7 @@ scrape_configs:
|
|||
)
|
||||
|
||||
opts := Options{}
|
||||
scrapeManager, err := NewManager(&opts, nil, nil, testRegistry)
|
||||
scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry)
|
||||
require.NoError(t, err)
|
||||
newLoop := func(scrapeLoopOptions) loop {
|
||||
ch <- struct{}{}
|
||||
|
@ -576,7 +576,7 @@ scrape_configs:
|
|||
func TestManagerTargetsUpdates(t *testing.T) {
|
||||
opts := Options{}
|
||||
testRegistry := prometheus.NewRegistry()
|
||||
m, err := NewManager(&opts, nil, nil, testRegistry)
|
||||
m, err := NewManager(&opts, nil, nil, nil, testRegistry)
|
||||
require.NoError(t, err)
|
||||
|
||||
ts := make(chan map[string][]*targetgroup.Group)
|
||||
|
@ -629,7 +629,7 @@ global:
|
|||
|
||||
opts := Options{}
|
||||
testRegistry := prometheus.NewRegistry()
|
||||
scrapeManager, err := NewManager(&opts, nil, nil, testRegistry)
|
||||
scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Load the first config.
|
||||
|
@ -706,7 +706,7 @@ scrape_configs:
|
|||
}
|
||||
|
||||
opts := Options{}
|
||||
scrapeManager, err := NewManager(&opts, nil, nil, testRegistry)
|
||||
scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry)
|
||||
require.NoError(t, err)
|
||||
|
||||
reload(scrapeManager, cfg1)
|
||||
|
@ -758,6 +758,7 @@ func TestManagerCTZeroIngestion(t *testing.T) {
|
|||
skipOffsetting: true,
|
||||
},
|
||||
log.NewLogfmtLogger(os.Stderr),
|
||||
nil,
|
||||
&collectResultAppendable{app},
|
||||
prometheus.NewRegistry(),
|
||||
)
|
||||
|
@ -857,7 +858,7 @@ func TestUnregisterMetrics(t *testing.T) {
|
|||
// Check that all metrics can be unregistered, allowing a second manager to be created.
|
||||
for i := 0; i < 2; i++ {
|
||||
opts := Options{}
|
||||
manager, err := NewManager(&opts, nil, nil, reg)
|
||||
manager, err := NewManager(&opts, nil, nil, nil, reg)
|
||||
require.NotNil(t, manager)
|
||||
require.NoError(t, err)
|
||||
// Unregister all metrics.
|
||||
|
@ -901,6 +902,7 @@ func runManagers(t *testing.T, ctx context.Context) (*discovery.Manager, *Manage
|
|||
scrapeManager, err := NewManager(
|
||||
&Options{DiscoveryReloadInterval: model.Duration(100 * time.Millisecond)},
|
||||
nil,
|
||||
nil,
|
||||
nopAppendable{},
|
||||
prometheus.NewRegistry(),
|
||||
)
|
||||
|
|
|
@ -90,6 +90,9 @@ type scrapePool struct {
|
|||
noDefaultPort bool
|
||||
|
||||
metrics *scrapeMetrics
|
||||
|
||||
scrapeFailureLogger log.Logger
|
||||
scrapeFailureLoggerMtx sync.RWMutex
|
||||
}
|
||||
|
||||
type labelLimits struct {
|
||||
|
@ -218,6 +221,27 @@ func (sp *scrapePool) DroppedTargetsCount() int {
|
|||
return sp.droppedTargetsCount
|
||||
}
|
||||
|
||||
func (sp *scrapePool) SetScrapeFailureLogger(l log.Logger) {
|
||||
sp.scrapeFailureLoggerMtx.Lock()
|
||||
defer sp.scrapeFailureLoggerMtx.Unlock()
|
||||
if l != nil {
|
||||
l = log.With(l, "job_name", sp.config.JobName)
|
||||
}
|
||||
sp.scrapeFailureLogger = l
|
||||
|
||||
sp.targetMtx.Lock()
|
||||
defer sp.targetMtx.Unlock()
|
||||
for _, s := range sp.loops {
|
||||
s.setScrapeFailureLogger(sp.scrapeFailureLogger)
|
||||
}
|
||||
}
|
||||
|
||||
func (sp *scrapePool) getScrapeFailureLogger() log.Logger {
|
||||
sp.scrapeFailureLoggerMtx.RLock()
|
||||
defer sp.scrapeFailureLoggerMtx.RUnlock()
|
||||
return sp.scrapeFailureLogger
|
||||
}
|
||||
|
||||
// stop terminates all scrape loops and returns after they all terminated.
|
||||
func (sp *scrapePool) stop() {
|
||||
sp.mtx.Lock()
|
||||
|
@ -361,6 +385,7 @@ func (sp *scrapePool) restartLoops(reuseCache bool) {
|
|||
wg.Done()
|
||||
|
||||
newLoop.setForcedError(forcedErr)
|
||||
newLoop.setScrapeFailureLogger(sp.getScrapeFailureLogger())
|
||||
newLoop.run(nil)
|
||||
}(oldLoop, newLoop)
|
||||
|
||||
|
@ -503,6 +528,7 @@ func (sp *scrapePool) sync(targets []*Target) {
|
|||
if err != nil {
|
||||
l.setForcedError(err)
|
||||
}
|
||||
l.setScrapeFailureLogger(sp.scrapeFailureLogger)
|
||||
|
||||
sp.activeTargets[hash] = t
|
||||
sp.loops[hash] = l
|
||||
|
@ -825,6 +851,7 @@ func (s *targetScraper) readResponse(ctx context.Context, resp *http.Response, w
|
|||
type loop interface {
|
||||
run(errc chan<- error)
|
||||
setForcedError(err error)
|
||||
setScrapeFailureLogger(log.Logger)
|
||||
stop()
|
||||
getCache() *scrapeCache
|
||||
disableEndOfRunStalenessMarkers()
|
||||
|
@ -840,6 +867,8 @@ type cacheEntry struct {
|
|||
type scrapeLoop struct {
|
||||
scraper scraper
|
||||
l log.Logger
|
||||
scrapeFailureLogger log.Logger
|
||||
scrapeFailureLoggerMtx sync.RWMutex
|
||||
cache *scrapeCache
|
||||
lastScrapeSize int
|
||||
buffers *pool.Pool
|
||||
|
@ -1223,6 +1252,15 @@ func newScrapeLoop(ctx context.Context,
|
|||
return sl
|
||||
}
|
||||
|
||||
func (sl *scrapeLoop) setScrapeFailureLogger(l log.Logger) {
|
||||
sl.scrapeFailureLoggerMtx.Lock()
|
||||
defer sl.scrapeFailureLoggerMtx.Unlock()
|
||||
if ts, ok := sl.scraper.(fmt.Stringer); ok && l != nil {
|
||||
l = log.With(l, "target", ts.String())
|
||||
}
|
||||
sl.scrapeFailureLogger = l
|
||||
}
|
||||
|
||||
func (sl *scrapeLoop) run(errc chan<- error) {
|
||||
if !sl.skipOffsetting {
|
||||
select {
|
||||
|
@ -1366,6 +1404,11 @@ func (sl *scrapeLoop) scrapeAndReport(last, appendTime time.Time, errc chan<- er
|
|||
bytesRead = len(b)
|
||||
} else {
|
||||
level.Debug(sl.l).Log("msg", "Scrape failed", "err", scrapeErr)
|
||||
sl.scrapeFailureLoggerMtx.RLock()
|
||||
if sl.scrapeFailureLogger != nil {
|
||||
sl.scrapeFailureLogger.Log("err", scrapeErr)
|
||||
}
|
||||
sl.scrapeFailureLoggerMtx.RUnlock()
|
||||
if errc != nil {
|
||||
errc <- scrapeErr
|
||||
}
|
||||
|
|
|
@ -158,6 +158,9 @@ type testLoop struct {
|
|||
timeout time.Duration
|
||||
}
|
||||
|
||||
func (l *testLoop) setScrapeFailureLogger(log.Logger) {
|
||||
}
|
||||
|
||||
func (l *testLoop) run(errc chan<- error) {
|
||||
if l.runOnce {
|
||||
panic("loop must be started only once")
|
||||
|
@ -3782,7 +3785,7 @@ scrape_configs:
|
|||
s.DB.EnableNativeHistograms()
|
||||
reg := prometheus.NewRegistry()
|
||||
|
||||
mng, err := NewManager(&Options{EnableNativeHistogramsIngestion: true}, nil, s, reg)
|
||||
mng, err := NewManager(&Options{EnableNativeHistogramsIngestion: true}, nil, nil, s, reg)
|
||||
require.NoError(t, err)
|
||||
cfg, err := config.Load(configStr, false, log.NewNopLogger())
|
||||
require.NoError(t, err)
|
||||
|
|
Loading…
Reference in New Issue