Scrape: Add scrape_failure_log_file to log Scrape Failures

Signed-off-by: Julien <roidelapluie@o11y.eu>
pull/14734/head
Julien 2024-08-26 11:41:56 +02:00
parent 70bb219d33
commit ce0f09b125
10 changed files with 363 additions and 26 deletions

View File

@ -755,6 +755,7 @@ func main() {
scrapeManager, err := scrape.NewManager(
&cfg.scrape,
log.With(logger, "component", "scrape manager"),
func(s string) (log.Logger, error) { return logging.NewJSONFileLogger(s) },
fanoutStorage,
prometheus.DefaultRegisterer,
)

View File

@ -0,0 +1,193 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"bytes"
"fmt"
"net/http"
"net/http/httptest"
"net/url"
"os"
"os/exec"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
"github.com/prometheus/prometheus/util/testutil"
)
// TestScrapeFailureLogFile is an end-to-end test of the scrape_failure_log_file
// setting. It runs the binary at promPath against a mock target that answers
// every request with HTTP 500 and then checks the failure log in three phases:
//  1. the file is configured globally: failures must appear in the file;
//  2. the setting is removed and the config reloaded: the file must stop growing;
//  3. the setting is re-added per scrape config and reloaded: the file must grow again.
//
// The test is skipped in short mode.
func TestScrapeFailureLogFile(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode.")
}
// Tracks the number of requests made to the mock server.
var requestCount atomic.Int32
// Starts a server that always returns HTTP 500 errors.
mockServerAddress := startGarbageServer(t, &requestCount)
// Create a temporary directory for Prometheus configuration and logs.
tempDir := t.TempDir()
// Define file paths for the scrape failure log and Prometheus configuration.
// Like other files, the scrape failure log file should be relative to the
// config file. Therefore, we split the name we put in the file and the full
// path used to check the content of the file.
scrapeFailureLogFileName := "scrape_failure.log"
scrapeFailureLogFile := filepath.Join(tempDir, scrapeFailureLogFileName)
promConfigFile := filepath.Join(tempDir, "prometheus.yml")
// Step 1: Set up an initial Prometheus configuration that globally
// specifies a scrape failure log file.
promConfig := fmt.Sprintf(`
global:
scrape_interval: 500ms
scrape_failure_log_file: %s
scrape_configs:
- job_name: 'test_job'
static_configs:
- targets: ['%s']
`, scrapeFailureLogFileName, mockServerAddress)
err := os.WriteFile(promConfigFile, []byte(promConfig), 0o644)
require.NoError(t, err, "Failed to write Prometheus configuration file")
// Start Prometheus with the generated configuration and a random port, enabling the lifecycle API.
// The lifecycle API is what reloadPrometheus uses (POST /-/reload) in steps 2 and 3.
port := testutil.RandomUnprivilegedPort(t)
params := []string{
// NOTE(review): presumably makes the test binary run Prometheus' main
// instead of the tests; promPath and this flag are defined outside this file.
"-test.main",
"--config.file=" + promConfigFile,
"--storage.tsdb.path=" + filepath.Join(tempDir, "data"),
fmt.Sprintf("--web.listen-address=127.0.0.1:%d", port),
"--web.enable-lifecycle",
}
prometheusProcess := exec.Command(promPath, params...)
prometheusProcess.Stdout = os.Stdout
prometheusProcess.Stderr = os.Stderr
err = prometheusProcess.Start()
require.NoError(t, err, "Failed to start Prometheus")
// The child process is killed when the test function returns; it is not
// waited on here.
defer prometheusProcess.Process.Kill()
// Wait until the mock server receives at least two requests from Prometheus.
require.Eventually(t, func() bool {
return requestCount.Load() >= 2
}, 30*time.Second, 500*time.Millisecond, "Expected at least two requests to the mock server")
// Verify that the scrape failures have been logged to the specified file.
content, err := os.ReadFile(scrapeFailureLogFile)
require.NoError(t, err, "Failed to read scrape failure log")
require.Contains(t, string(content), "server returned HTTP status 500 Internal Server Error", "Expected scrape failure log entry not found")
// Step 2: Update the Prometheus configuration to remove the scrape failure
// log file setting.
promConfig = fmt.Sprintf(`
global:
scrape_interval: 1s
scrape_configs:
- job_name: 'test_job'
static_configs:
- targets: ['%s']
`, mockServerAddress)
err = os.WriteFile(promConfigFile, []byte(promConfig), 0o644)
require.NoError(t, err, "Failed to update Prometheus configuration file")
// Reload Prometheus with the updated configuration.
reloadPrometheus(t, port)
// Count the number of lines in the scrape failure log file before any
// further requests.
// NOTE(review): the baseline is sampled after the reload call returns; this
// assumes the new configuration is fully applied by then - confirm.
preReloadLogLineCount := countLinesInFile(scrapeFailureLogFile)
// Wait for at least two more requests to the mock server to ensure
// Prometheus continues scraping.
requestsBeforeReload := requestCount.Load()
require.Eventually(t, func() bool {
return requestCount.Load() >= requestsBeforeReload+2
}, 30*time.Second, 500*time.Millisecond, "Expected two more requests to the mock server after configuration reload")
// Ensure that no new lines were added to the scrape failure log file after
// the configuration change.
require.Equal(t, preReloadLogLineCount, countLinesInFile(scrapeFailureLogFile), "No new lines should be added to the scrape failure log file after removing the log setting")
// Step 3: Re-add the scrape failure log file setting, but this time under
// scrape_configs, and reload Prometheus.
promConfig = fmt.Sprintf(`
global:
scrape_interval: 1s
scrape_configs:
- job_name: 'test_job'
scrape_failure_log_file: %s
static_configs:
- targets: ['%s']
`, scrapeFailureLogFileName, mockServerAddress)
err = os.WriteFile(promConfigFile, []byte(promConfig), 0o644)
require.NoError(t, err, "Failed to update Prometheus configuration file")
// Reload Prometheus with the updated configuration.
reloadPrometheus(t, port)
// Wait for at least two more requests to the mock server and verify that
// new log entries are created.
// The line count taken here is the baseline the final assertion compares against.
postReloadLogLineCount := countLinesInFile(scrapeFailureLogFile)
requestsBeforeReAddingLog := requestCount.Load()
require.Eventually(t, func() bool {
return requestCount.Load() >= requestsBeforeReAddingLog+2
}, 30*time.Second, 500*time.Millisecond, "Expected two additional requests after re-adding the log setting")
// Confirm that new lines were added to the scrape failure log file.
require.Greater(t, countLinesInFile(scrapeFailureLogFile), postReloadLogLineCount, "New lines should be added to the scrape failure log file after re-adding the log setting")
}
// reloadPrometheus asks the Prometheus server listening on 127.0.0.1:port to
// reload its configuration by sending a POST request to /-/reload. The test
// fails immediately if the request errors or the response status is not 200 OK.
func reloadPrometheus(t *testing.T, port int) {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()

	resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/-/reload", port), "", nil)
	require.NoError(t, err, "Failed to reload Prometheus")
	// The response body must always be closed, otherwise the underlying
	// connection is leaked for the remainder of the test binary's lifetime.
	defer resp.Body.Close()

	require.Equal(t, http.StatusOK, resp.StatusCode, "Unexpected status code when reloading Prometheus")
}
// startGarbageServer launches an httptest server whose handler first bumps
// requestCount and then replies 500 Internal Server Error, whatever the
// request. The server is closed through t.Cleanup. The returned string is the
// Host component of the server's URL, i.e. the address without the scheme.
func startGarbageServer(t *testing.T, requestCount *atomic.Int32) string {
	alwaysFail := func(w http.ResponseWriter, _ *http.Request) {
		requestCount.Inc()
		w.WriteHeader(http.StatusInternalServerError)
	}

	srv := httptest.NewServer(http.HandlerFunc(alwaysFail))
	t.Cleanup(srv.Close)

	// Callers want a bare target address, so strip everything but the host.
	u, err := url.Parse(srv.URL)
	require.NoError(t, err, "Failed to parse mock server URL")
	return u.Host
}
// countLinesInFile reports how many newline characters the file at filePath
// contains. A trailing line without a terminating newline is therefore not
// counted. If the file cannot be read for any reason, the result is 0.
func countLinesInFile(filePath string) int {
	contents, readErr := os.ReadFile(filePath)
	if readErr == nil {
		return bytes.Count(contents, []byte("\n"))
	}
	// A missing or unreadable file is treated as having no lines.
	return 0
}

View File

@ -429,6 +429,8 @@ type GlobalConfig struct {
RuleQueryOffset model.Duration `yaml:"rule_query_offset,omitempty"`
// File to which PromQL queries are logged.
QueryLogFile string `yaml:"query_log_file,omitempty"`
// File to which scrape failures are logged.
ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"`
// The labels to add to any timeseries that this Prometheus instance scrapes.
ExternalLabels labels.Labels `yaml:"external_labels,omitempty"`
// An uncompressed response body larger than this many bytes will cause the
@ -529,6 +531,7 @@ func validateAcceptScrapeProtocols(sps []ScrapeProtocol) error {
// SetDirectory joins any relative file paths with dir. The paths affected are
// the query log file and the scrape failure log file.
func (c *GlobalConfig) SetDirectory(dir string) {
	// Each field is rewritten in place with the result of config.JoinDir.
	for _, path := range []*string{&c.QueryLogFile, &c.ScrapeFailureLogFile} {
		*path = config.JoinDir(dir, *path)
	}
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
@ -591,6 +594,7 @@ func (c *GlobalConfig) isZero() bool {
c.EvaluationInterval == 0 &&
c.RuleQueryOffset == 0 &&
c.QueryLogFile == "" &&
c.ScrapeFailureLogFile == "" &&
c.ScrapeProtocols == nil
}
@ -632,6 +636,8 @@ type ScrapeConfig struct {
ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"`
// Whether to scrape a classic histogram that is also exposed as a native histogram.
ScrapeClassicHistograms bool `yaml:"scrape_classic_histograms,omitempty"`
// File to which scrape failures are logged.
ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"`
// The HTTP resource path on which to fetch metrics from targets.
MetricsPath string `yaml:"metrics_path,omitempty"`
// The URL scheme with which to fetch metrics from targets.
@ -684,6 +690,7 @@ type ScrapeConfig struct {
func (c *ScrapeConfig) SetDirectory(dir string) {
	// The service discovery and HTTP client sections handle dir themselves.
	c.ServiceDiscoveryConfigs.SetDirectory(dir)
	c.HTTPClientConfig.SetDirectory(dir)

	// The scrape failure log path is passed through config.JoinDir with dir.
	resolved := config.JoinDir(dir, c.ScrapeFailureLogFile)
	c.ScrapeFailureLogFile = resolved
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
@ -765,6 +772,9 @@ func (c *ScrapeConfig) Validate(globalConfig GlobalConfig) error {
if c.KeepDroppedTargets == 0 {
c.KeepDroppedTargets = globalConfig.KeepDroppedTargets
}
if c.ScrapeFailureLogFile == "" {
c.ScrapeFailureLogFile = globalConfig.ScrapeFailureLogFile
}
if c.ScrapeProtocols == nil {
c.ScrapeProtocols = globalConfig.ScrapeProtocols

View File

@ -78,14 +78,16 @@ const (
globLabelNameLengthLimit = 200
globLabelValueLengthLimit = 200
globalGoGC = 42
globScrapeFailureLogFile = "testdata/fail.log"
)
var expectedConf = &Config{
GlobalConfig: GlobalConfig{
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EvaluationInterval: model.Duration(30 * time.Second),
QueryLogFile: "",
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EvaluationInterval: model.Duration(30 * time.Second),
QueryLogFile: "testdata/query.log",
ScrapeFailureLogFile: globScrapeFailureLogFile,
ExternalLabels: labels.FromStrings("foo", "bar", "monitor", "codelab"),
@ -211,6 +213,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: "testdata/fail_prom.log",
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -314,6 +317,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: 210,
LabelValueLengthLimit: 210,
ScrapeProtocols: []ScrapeProtocol{PrometheusText0_0_4},
ScrapeFailureLogFile: globScrapeFailureLogFile,
HTTPClientConfig: config.HTTPClientConfig{
BasicAuth: &config.BasicAuth{
@ -411,6 +415,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -466,6 +471,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: "/metrics",
Scheme: "http",
@ -499,6 +505,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -538,6 +545,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -577,6 +585,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -606,6 +615,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -643,6 +653,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -677,6 +688,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -718,6 +730,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -749,6 +762,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -783,6 +797,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -810,6 +825,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -840,6 +856,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: "/federate",
Scheme: DefaultScrapeConfig.Scheme,
@ -870,6 +887,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -900,6 +918,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -927,6 +946,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -962,6 +982,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -996,6 +1017,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -1027,6 +1049,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -1057,6 +1080,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -1091,6 +1115,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -1128,6 +1153,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -1184,6 +1210,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -1211,6 +1238,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
HTTPClientConfig: config.DefaultHTTPClientConfig,
MetricsPath: DefaultScrapeConfig.MetricsPath,
@ -1249,6 +1277,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
HTTPClientConfig: config.DefaultHTTPClientConfig,
MetricsPath: DefaultScrapeConfig.MetricsPath,
@ -1293,6 +1322,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -1328,6 +1358,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
HTTPClientConfig: config.DefaultHTTPClientConfig,
MetricsPath: DefaultScrapeConfig.MetricsPath,
@ -1357,6 +1388,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
@ -1389,6 +1421,7 @@ var expectedConf = &Config{
LabelNameLengthLimit: globLabelNameLengthLimit,
LabelValueLengthLimit: globLabelValueLengthLimit,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
ScrapeFailureLogFile: globScrapeFailureLogFile,
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,

View File

@ -8,6 +8,8 @@ global:
label_limit: 30
label_name_length_limit: 200
label_value_length_limit: 200
query_log_file: query.log
scrape_failure_log_file: fail.log
# scrape_timeout is set to the global default (10s).
external_labels:
@ -72,6 +74,7 @@ scrape_configs:
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
scrape_failure_log_file: fail_prom.log
file_sd_configs:
- files:
- foo/*.slow.json

View File

@ -84,6 +84,10 @@ global:
# Reloading the configuration will reopen the file.
[ query_log_file: <string> ]
# File to which scrape failures are logged.
# Reloading the configuration will reopen the file.
[ scrape_failure_log_file: <string> ]
# An uncompressed response body larger than this many bytes will cause the
# scrape to fail. 0 means no limit. Example: 100MB.
# This is an experimental feature, this behaviour could
@ -319,6 +323,10 @@ http_headers:
# Files to read header values from.
[ files: [<string>, ...] ] ]
# File to which scrape failures are logged.
# Reloading the configuration will reopen the file.
[ scrape_failure_log_file: <string> ]
# List of Azure service discovery configurations.
azure_sd_configs:
[ - <azure_sd_config> ... ]

View File

@ -17,6 +17,7 @@ import (
"errors"
"fmt"
"hash/fnv"
"io"
"reflect"
"sync"
"time"
@ -36,7 +37,7 @@ import (
)
// NewManager is the Manager constructor.
func NewManager(o *Options, logger log.Logger, app storage.Appendable, registerer prometheus.Registerer) (*Manager, error) {
func NewManager(o *Options, logger log.Logger, newScrapeFailureLogger func(string) (log.Logger, error), app storage.Appendable, registerer prometheus.Registerer) (*Manager, error) {
if o == nil {
o = &Options{}
}
@ -50,15 +51,16 @@ func NewManager(o *Options, logger log.Logger, app storage.Appendable, registere
}
m := &Manager{
append: app,
opts: o,
logger: logger,
scrapeConfigs: make(map[string]*config.ScrapeConfig),
scrapePools: make(map[string]*scrapePool),
graceShut: make(chan struct{}),
triggerReload: make(chan struct{}, 1),
metrics: sm,
buffers: pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) }),
append: app,
opts: o,
logger: logger,
newScrapeFailureLogger: newScrapeFailureLogger,
scrapeConfigs: make(map[string]*config.ScrapeConfig),
scrapePools: make(map[string]*scrapePool),
graceShut: make(chan struct{}),
triggerReload: make(chan struct{}, 1),
metrics: sm,
buffers: pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) }),
}
m.metrics.setTargetMetadataCacheGatherer(m)
@ -103,12 +105,14 @@ type Manager struct {
append storage.Appendable
graceShut chan struct{}
offsetSeed uint64 // Global offsetSeed seed is used to spread scrape workload across HA setup.
mtxScrape sync.Mutex // Guards the fields below.
scrapeConfigs map[string]*config.ScrapeConfig
scrapePools map[string]*scrapePool
targetSets map[string][]*targetgroup.Group
buffers *pool.Pool
offsetSeed uint64 // Global offsetSeed seed is used to spread scrape workload across HA setup.
mtxScrape sync.Mutex // Guards the fields below.
scrapeConfigs map[string]*config.ScrapeConfig
scrapePools map[string]*scrapePool
newScrapeFailureLogger func(string) (log.Logger, error)
scrapeFailureLoggers map[string]log.Logger
targetSets map[string][]*targetgroup.Group
buffers *pool.Pool
triggerReload chan struct{}
@ -183,6 +187,11 @@ func (m *Manager) reload() {
continue
}
m.scrapePools[setName] = sp
if l, ok := m.scrapeFailureLoggers[scrapeConfig.ScrapeFailureLogFile]; ok {
sp.SetScrapeFailureLogger(l)
} else {
level.Error(sp.logger).Log("msg", "No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", setName)
}
}
wg.Add(1)
@ -238,11 +247,36 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error {
}
c := make(map[string]*config.ScrapeConfig)
scrapeFailureLoggers := map[string]log.Logger{
"": nil, // Emptying the file name sets the scrape logger to nil.
}
for _, scfg := range scfgs {
c[scfg.JobName] = scfg
if _, ok := scrapeFailureLoggers[scfg.ScrapeFailureLogFile]; !ok {
// We promise to reopen the file on each reload.
var (
l log.Logger
err error
)
if m.newScrapeFailureLogger != nil {
if l, err = m.newScrapeFailureLogger(scfg.ScrapeFailureLogFile); err != nil {
return err
}
}
scrapeFailureLoggers[scfg.ScrapeFailureLogFile] = l
}
}
m.scrapeConfigs = c
oldScrapeFailureLoggers := m.scrapeFailureLoggers
for _, s := range oldScrapeFailureLoggers {
if closer, ok := s.(io.Closer); ok {
defer closer.Close()
}
}
m.scrapeFailureLoggers = scrapeFailureLoggers
if err := m.setOffsetSeed(cfg.GlobalConfig.ExternalLabels); err != nil {
return err
}
@ -260,6 +294,13 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error {
level.Error(m.logger).Log("msg", "error reloading scrape pool", "err", err, "scrape_pool", name)
failed = true
}
fallthrough
case ok:
if l, ok := m.scrapeFailureLoggers[cfg.ScrapeFailureLogFile]; ok {
sp.SetScrapeFailureLogger(l)
} else {
level.Error(sp.logger).Log("msg", "No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", name)
}
}
}

View File

@ -511,7 +511,7 @@ scrape_configs:
)
opts := Options{}
scrapeManager, err := NewManager(&opts, nil, nil, testRegistry)
scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry)
require.NoError(t, err)
newLoop := func(scrapeLoopOptions) loop {
ch <- struct{}{}
@ -576,7 +576,7 @@ scrape_configs:
func TestManagerTargetsUpdates(t *testing.T) {
opts := Options{}
testRegistry := prometheus.NewRegistry()
m, err := NewManager(&opts, nil, nil, testRegistry)
m, err := NewManager(&opts, nil, nil, nil, testRegistry)
require.NoError(t, err)
ts := make(chan map[string][]*targetgroup.Group)
@ -629,7 +629,7 @@ global:
opts := Options{}
testRegistry := prometheus.NewRegistry()
scrapeManager, err := NewManager(&opts, nil, nil, testRegistry)
scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry)
require.NoError(t, err)
// Load the first config.
@ -706,7 +706,7 @@ scrape_configs:
}
opts := Options{}
scrapeManager, err := NewManager(&opts, nil, nil, testRegistry)
scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry)
require.NoError(t, err)
reload(scrapeManager, cfg1)
@ -758,6 +758,7 @@ func TestManagerCTZeroIngestion(t *testing.T) {
skipOffsetting: true,
},
log.NewLogfmtLogger(os.Stderr),
nil,
&collectResultAppendable{app},
prometheus.NewRegistry(),
)
@ -857,7 +858,7 @@ func TestUnregisterMetrics(t *testing.T) {
// Check that all metrics can be unregistered, allowing a second manager to be created.
for i := 0; i < 2; i++ {
opts := Options{}
manager, err := NewManager(&opts, nil, nil, reg)
manager, err := NewManager(&opts, nil, nil, nil, reg)
require.NotNil(t, manager)
require.NoError(t, err)
// Unregister all metrics.
@ -901,6 +902,7 @@ func runManagers(t *testing.T, ctx context.Context) (*discovery.Manager, *Manage
scrapeManager, err := NewManager(
&Options{DiscoveryReloadInterval: model.Duration(100 * time.Millisecond)},
nil,
nil,
nopAppendable{},
prometheus.NewRegistry(),
)

View File

@ -90,6 +90,9 @@ type scrapePool struct {
noDefaultPort bool
metrics *scrapeMetrics
scrapeFailureLogger log.Logger
scrapeFailureLoggerMtx sync.RWMutex
}
type labelLimits struct {
@ -218,6 +221,27 @@ func (sp *scrapePool) DroppedTargetsCount() int {
return sp.droppedTargetsCount
}
// SetScrapeFailureLogger installs l as the pool's scrape failure logger and
// hands it to every loop currently registered in sp.loops. A non-nil logger is
// first annotated with the pool's job name; a nil logger is stored and
// propagated as nil.
//
// scrapeFailureLoggerMtx is taken first, then targetMtx; both are held until
// the method returns.
func (sp *scrapePool) SetScrapeFailureLogger(l log.Logger) {
	sp.scrapeFailureLoggerMtx.Lock()
	defer sp.scrapeFailureLoggerMtx.Unlock()

	annotated := l
	if annotated != nil {
		annotated = log.With(annotated, "job_name", sp.config.JobName)
	}
	sp.scrapeFailureLogger = annotated

	sp.targetMtx.Lock()
	defer sp.targetMtx.Unlock()

	for key := range sp.loops {
		sp.loops[key].setScrapeFailureLogger(annotated)
	}
}
// getScrapeFailureLogger returns the pool's current scrape failure logger,
// which may be nil. The field is read under the read side of
// scrapeFailureLoggerMtx.
func (sp *scrapePool) getScrapeFailureLogger() log.Logger {
	sp.scrapeFailureLoggerMtx.RLock()
	current := sp.scrapeFailureLogger
	sp.scrapeFailureLoggerMtx.RUnlock()
	return current
}
// stop terminates all scrape loops and returns after they all terminated.
func (sp *scrapePool) stop() {
sp.mtx.Lock()
@ -361,6 +385,7 @@ func (sp *scrapePool) restartLoops(reuseCache bool) {
wg.Done()
newLoop.setForcedError(forcedErr)
newLoop.setScrapeFailureLogger(sp.getScrapeFailureLogger())
newLoop.run(nil)
}(oldLoop, newLoop)
@ -503,6 +528,7 @@ func (sp *scrapePool) sync(targets []*Target) {
if err != nil {
l.setForcedError(err)
}
l.setScrapeFailureLogger(sp.scrapeFailureLogger)
sp.activeTargets[hash] = t
sp.loops[hash] = l
@ -825,6 +851,7 @@ func (s *targetScraper) readResponse(ctx context.Context, resp *http.Response, w
type loop interface {
run(errc chan<- error)
setForcedError(err error)
setScrapeFailureLogger(log.Logger)
stop()
getCache() *scrapeCache
disableEndOfRunStalenessMarkers()
@ -840,6 +867,8 @@ type cacheEntry struct {
type scrapeLoop struct {
scraper scraper
l log.Logger
scrapeFailureLogger log.Logger
scrapeFailureLoggerMtx sync.RWMutex
cache *scrapeCache
lastScrapeSize int
buffers *pool.Pool
@ -1223,6 +1252,15 @@ func newScrapeLoop(ctx context.Context,
return sl
}
// setScrapeFailureLogger stores l as the loop's scrape failure logger while
// holding scrapeFailureLoggerMtx. When l is non-nil and the loop's scraper
// implements fmt.Stringer, the logger is annotated with a "target" key holding
// the scraper's string form. A nil l is stored unchanged.
func (sl *scrapeLoop) setScrapeFailureLogger(l log.Logger) {
	sl.scrapeFailureLoggerMtx.Lock()
	defer sl.scrapeFailureLoggerMtx.Unlock()

	if l != nil {
		if named, isStringer := sl.scraper.(fmt.Stringer); isStringer {
			l = log.With(l, "target", named.String())
		}
	}
	sl.scrapeFailureLogger = l
}
func (sl *scrapeLoop) run(errc chan<- error) {
if !sl.skipOffsetting {
select {
@ -1366,6 +1404,11 @@ func (sl *scrapeLoop) scrapeAndReport(last, appendTime time.Time, errc chan<- er
bytesRead = len(b)
} else {
level.Debug(sl.l).Log("msg", "Scrape failed", "err", scrapeErr)
sl.scrapeFailureLoggerMtx.RLock()
if sl.scrapeFailureLogger != nil {
sl.scrapeFailureLogger.Log("err", scrapeErr)
}
sl.scrapeFailureLoggerMtx.RUnlock()
if errc != nil {
errc <- scrapeErr
}

View File

@ -158,6 +158,9 @@ type testLoop struct {
timeout time.Duration
}
// setScrapeFailureLogger is a no-op: testLoop discards the logger it is given.
// NOTE(review): presumably present so testLoop keeps satisfying the loop
// interface, which declares this method - confirm.
func (l *testLoop) setScrapeFailureLogger(log.Logger) {
}
func (l *testLoop) run(errc chan<- error) {
if l.runOnce {
panic("loop must be started only once")
@ -3782,7 +3785,7 @@ scrape_configs:
s.DB.EnableNativeHistograms()
reg := prometheus.NewRegistry()
mng, err := NewManager(&Options{EnableNativeHistogramsIngestion: true}, nil, s, reg)
mng, err := NewManager(&Options{EnableNativeHistogramsIngestion: true}, nil, nil, s, reg)
require.NoError(t, err)
cfg, err := config.Load(configStr, false, log.NewNopLogger())
require.NoError(t, err)