From 1df05bfd497fa8070aa6a86b81dad1dece484eb1 Mon Sep 17 00:00:00 2001
From: hanjm
Date: Sun, 16 May 2021 10:19:22 +0800
Subject: [PATCH] Add body_size_limit to prevent a bad target's large response body from causing Prometheus server OOM (#8827)

Signed-off-by: hanjm
---
 config/config.go                              |  4 ++
 config/config_test.go                         |  6 ++
 config/testdata/conf.good.yml                 |  1 +
 .../testdata/scrape_body_size_limit.bad.yml   |  3 +
 docs/configuration/configuration.md           |  3 +
 .../prometheus-mixin/dashboards.libsonnet     |  2 +
 scrape/scrape.go                              | 52 +++++++++++----
 scrape/scrape_test.go                         | 64 +++++++++++++++++++
 8 files changed, 121 insertions(+), 14 deletions(-)
 create mode 100644 config/testdata/scrape_body_size_limit.bad.yml

diff --git a/config/config.go b/config/config.go
index d71f6b72e..ce0deedff 100644
--- a/config/config.go
+++ b/config/config.go
@@ -23,6 +23,7 @@ import (
     "strings"
     "time"
 
+    "github.com/alecthomas/units"
     "github.com/go-kit/kit/log"
     "github.com/go-kit/kit/log/level"
     "github.com/pkg/errors"
@@ -382,6 +383,9 @@ type ScrapeConfig struct {
     MetricsPath string `yaml:"metrics_path,omitempty"`
     // The URL scheme with which to fetch metrics from targets.
     Scheme string `yaml:"scheme,omitempty"`
+    // An uncompressed response body larger than this many bytes will cause the
+    // scrape to fail. 0 means no limit.
+    BodySizeLimit units.Base2Bytes `yaml:"body_size_limit,omitempty"`
     // More than this many samples post metric-relabeling will cause the scrape to
     // fail.
     SampleLimit uint `yaml:"sample_limit,omitempty"`
diff --git a/config/config_test.go b/config/config_test.go
index ddc014bc6..9afcfd9b9 100644
--- a/config/config_test.go
+++ b/config/config_test.go
@@ -23,6 +23,7 @@ import (
     "testing"
     "time"
 
+    "github.com/alecthomas/units"
     "github.com/go-kit/kit/log"
     "github.com/prometheus/common/config"
     "github.com/prometheus/common/model"
@@ -223,6 +224,7 @@ var expectedConf = &Config{
             HonorTimestamps: true,
             ScrapeInterval:  model.Duration(50 * time.Second),
             ScrapeTimeout:   model.Duration(5 * time.Second),
+            BodySizeLimit:   10 * units.MiB,
             SampleLimit:     1000,
 
             HTTPClientConfig: config.HTTPClientConfig{
@@ -1200,6 +1202,10 @@ var expectedErrors = []struct {
         filename: "scaleway_two_secrets.bad.yml",
         errMsg:   "at most one of secret_key & secret_key_file must be configured",
     },
+    {
+        filename: "scrape_body_size_limit.bad.yml",
+        errMsg:   "units: unknown unit in 100",
+    },
 }
 
 func TestBadConfigs(t *testing.T) {
diff --git a/config/testdata/conf.good.yml b/config/testdata/conf.good.yml
index d0f565d58..0895e23ae 100644
--- a/config/testdata/conf.good.yml
+++ b/config/testdata/conf.good.yml
@@ -97,6 +97,7 @@ scrape_configs:
 
   scrape_interval: 50s
   scrape_timeout: 5s
+  body_size_limit: 10MB
   sample_limit: 1000
 
   metrics_path: /my_path
diff --git a/config/testdata/scrape_body_size_limit.bad.yml b/config/testdata/scrape_body_size_limit.bad.yml
new file mode 100644
index 000000000..f463e1c04
--- /dev/null
+++ b/config/testdata/scrape_body_size_limit.bad.yml
@@ -0,0 +1,3 @@
+scrape_configs:
+- job_name: prometheus
+  body_size_limit: 100
diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md
index 71e32f14e..18ae2674f 100644
--- a/docs/configuration/configuration.md
+++ b/docs/configuration/configuration.md
@@ -283,6 +283,9 @@ relabel_configs:
 metric_relabel_configs:
   [ - <relabel_config> ... ]
 
+# An uncompressed response body larger than this many bytes will cause the
+# scrape to fail. 0 means no limit. Example: 100MB.
+[ body_size_limit: <size> | default = 0 ]
 # Per-scrape limit on number of scraped samples that will be accepted.
 # If more than this number of samples are present after metric relabeling
 # the entire scrape will be treated as failed. 0 means no limit.
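A quick illustration of what the new `body_size_limit` field accepts may help here. The field is typed as `units.Base2Bytes`, so a human-readable size such as `10MB` parses (and is asserted as `10 * units.MiB` in `config_test.go` above), while a bare number like `100` is rejected, which is exactly what the new `scrape_body_size_limit.bad.yml` case exercises. The sketch below is not part of the patch; it simply calls `units.ParseBase2Bytes` from the same `github.com/alecthomas/units` package to show both outcomes.

```go
package main

import (
	"fmt"

	"github.com/alecthomas/units"
)

func main() {
	// "10MB" parses; the units package treats it as a base-2 quantity,
	// which is why conf.good.yml's "10MB" corresponds to 10 * units.MiB
	// in the expectedConf assertion above.
	limit, err := units.ParseBase2Bytes("10MB")
	fmt.Println(int64(limit), err) // expected: 10485760 <nil>

	// A bare number with no unit fails, matching the expected error
	// ("units: unknown unit in 100") for scrape_body_size_limit.bad.yml.
	_, err = units.ParseBase2Bytes("100")
	fmt.Println(err)
}
```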
diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet
index c1336cd18..a26104230 100644
--- a/documentation/prometheus-mixin/dashboards.libsonnet
+++ b/documentation/prometheus-mixin/dashboards.libsonnet
@@ -54,11 +54,13 @@ local template = grafana.template;
       .addPanel(
         g.panel('Scrape failures') +
         g.queryPanel([
+          'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))',
           'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
           'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
           'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
           'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
         ], [
+          'exceeded body size limit: {{job}}',
           'exceeded sample limit: {{job}}',
           'duplicate timestamp: {{job}}',
           'out of bounds: {{job}}',
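Beyond the mixin panel, the new `prometheus_target_scrapes_exceeded_body_size_limit_total` counter can be checked ad hoc through the query API. The following sketch is illustrative only: the server address and the 5m rate window are assumptions, and it uses the standard `client_golang` API client rather than anything added by this patch.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Assumed address of a local Prometheus; adjust as needed.
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Same shape of expression as the dashboard panel, with an illustrative 5m window.
	query := `sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[5m]))`
	result, warnings, err := promAPI.Query(ctx, query, time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println(result)
}
```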
diff --git a/scrape/scrape.go b/scrape/scrape.go
index 63295e2a2..32bb5fb1c 100644
--- a/scrape/scrape.go
+++ b/scrape/scrape.go
@@ -134,6 +134,12 @@ var (
         },
         []string{"scrape_job"},
     )
+    targetScrapeExceededBodySizeLimit = prometheus.NewCounter(
+        prometheus.CounterOpts{
+            Name: "prometheus_target_scrapes_exceeded_body_size_limit_total",
+            Help: "Total number of scrapes that hit the body size limit",
+        },
+    )
     targetScrapeSampleLimit = prometheus.NewCounter(
         prometheus.CounterOpts{
             Name: "prometheus_target_scrapes_exceeded_sample_limit_total",
@@ -195,6 +201,7 @@ func init() {
         targetScrapePoolReloadsFailed,
         targetSyncIntervalLength,
         targetScrapePoolSyncsCounter,
+        targetScrapeExceededBodySizeLimit,
         targetScrapeSampleLimit,
         targetScrapeSampleDuplicate,
         targetScrapeSampleOutOfOrder,
@@ -381,11 +388,12 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
     targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit))
 
     var (
-        wg          sync.WaitGroup
-        interval    = time.Duration(sp.config.ScrapeInterval)
-        timeout     = time.Duration(sp.config.ScrapeTimeout)
-        sampleLimit = int(sp.config.SampleLimit)
-        labelLimits = &labelLimits{
+        wg            sync.WaitGroup
+        interval      = time.Duration(sp.config.ScrapeInterval)
+        timeout       = time.Duration(sp.config.ScrapeTimeout)
+        bodySizeLimit = int64(sp.config.BodySizeLimit)
+        sampleLimit   = int(sp.config.SampleLimit)
+        labelLimits   = &labelLimits{
             labelLimit:            int(sp.config.LabelLimit),
             labelNameLengthLimit:  int(sp.config.LabelNameLengthLimit),
             labelValueLengthLimit: int(sp.config.LabelValueLengthLimit),
@@ -408,7 +416,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
         }
         var (
             t       = sp.activeTargets[fp]
-            s       = &targetScraper{Target: t, client: sp.client, timeout: timeout}
+            s       = &targetScraper{Target: t, client: sp.client, timeout: timeout, bodySizeLimit: bodySizeLimit}
             newLoop = sp.newLoop(scrapeLoopOptions{
                 target:  t,
                 scraper: s,
@@ -481,11 +489,12 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
 // It returns after all stopped scrape loops terminated.
 func (sp *scrapePool) sync(targets []*Target) {
     var (
-        uniqueLoops = make(map[uint64]loop)
-        interval    = time.Duration(sp.config.ScrapeInterval)
-        timeout     = time.Duration(sp.config.ScrapeTimeout)
-        sampleLimit = int(sp.config.SampleLimit)
-        labelLimits = &labelLimits{
+        uniqueLoops   = make(map[uint64]loop)
+        interval      = time.Duration(sp.config.ScrapeInterval)
+        timeout       = time.Duration(sp.config.ScrapeTimeout)
+        bodySizeLimit = int64(sp.config.BodySizeLimit)
+        sampleLimit   = int(sp.config.SampleLimit)
+        labelLimits   = &labelLimits{
             labelLimit:            int(sp.config.LabelLimit),
             labelNameLengthLimit:  int(sp.config.LabelNameLengthLimit),
             labelValueLengthLimit: int(sp.config.LabelValueLengthLimit),
@@ -500,7 +509,7 @@ func (sp *scrapePool) sync(targets []*Target) {
         hash := t.hash()
 
         if _, ok := sp.activeTargets[hash]; !ok {
-            s := &targetScraper{Target: t, client: sp.client, timeout: timeout}
+            s := &targetScraper{Target: t, client: sp.client, timeout: timeout, bodySizeLimit: bodySizeLimit}
             l := sp.newLoop(scrapeLoopOptions{
                 target:  t,
                 scraper: s,
@@ -690,8 +699,12 @@ type targetScraper struct {
 
     gzipr *gzip.Reader
     buf   *bufio.Reader
+
+    bodySizeLimit int64
 }
 
+var errBodySizeLimit = errors.New("body size limit exceeded")
+
 const acceptHeader = `application/openmetrics-text; version=0.0.1,text/plain;version=0.0.4;q=0.5,*/*;q=0.1`
 
 var userAgentHeader = fmt.Sprintf("Prometheus/%s", version.Version)
@@ -723,11 +736,18 @@ func (s *targetScraper) scrape(ctx context.Context, w io.Writer) (string, error)
         return "", errors.Errorf("server returned HTTP status %s", resp.Status)
     }
 
+    if s.bodySizeLimit <= 0 {
+        s.bodySizeLimit = math.MaxInt64
+    }
     if resp.Header.Get("Content-Encoding") != "gzip" {
-        _, err = io.Copy(w, resp.Body)
+        n, err := io.Copy(w, io.LimitReader(resp.Body, s.bodySizeLimit))
         if err != nil {
             return "", err
         }
+        if n >= s.bodySizeLimit {
+            targetScrapeExceededBodySizeLimit.Inc()
+            return "", errBodySizeLimit
+        }
         return resp.Header.Get("Content-Type"), nil
     }
 
@@ -744,11 +764,15 @@ func (s *targetScraper) scrape(ctx context.Context, w io.Writer) (string, error)
         }
     }
 
-    _, err = io.Copy(w, s.gzipr)
+    n, err := io.Copy(w, io.LimitReader(s.gzipr, s.bodySizeLimit))
     s.gzipr.Close()
     if err != nil {
         return "", err
     }
+    if n >= s.bodySizeLimit {
+        targetScrapeExceededBodySizeLimit.Inc()
+        return "", errBodySizeLimit
+    }
     return resp.Header.Get("Content-Type"), nil
 }
 
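The enforcement pattern in `scrape()` above deserves a note: the body is copied through `io.LimitReader`, which gives no signal of its own when it truncates, so copying exactly `bodySizeLimit` bytes is treated as "limit hit" (hence `n >= s.bodySizeLimit` rather than `>`; a body exactly at the limit also fails, which the patch accepts instead of reading one extra byte). The standalone sketch below reproduces that detection logic using only the standard library; `copyLimited`, `errTooLarge`, and `maxBytes` are illustrative names, not part of the patch.

```go
package main

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"strings"
)

var errTooLarge = errors.New("body size limit exceeded")

// copyLimited mirrors the pattern used in targetScraper.scrape: copy at most
// maxBytes and treat reaching the limit as an error, since io.LimitReader
// gives no other indication that the source had more data.
func copyLimited(dst io.Writer, src io.Reader, maxBytes int64) (int64, error) {
	n, err := io.Copy(dst, io.LimitReader(src, maxBytes))
	if err != nil {
		return n, err
	}
	if n >= maxBytes {
		return n, errTooLarge
	}
	return n, nil
}

func main() {
	var buf bytes.Buffer
	body := strings.NewReader("metric_a 1\nmetric_b 2\n") // 22 bytes

	// A 15-byte limit truncates the copy and reports the error.
	n, err := copyLimited(&buf, body, 15)
	fmt.Println(n, err) // 15 body size limit exceeded

	// With a generous limit the full body is copied.
	buf.Reset()
	body = strings.NewReader("metric_a 1\nmetric_b 2\n")
	n, err = copyLimited(&buf, body, 1<<20)
	fmt.Println(n, err) // 22 <nil>
}
```

The 15-byte limit and 22-byte body mirror the constants used in the test that follows.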
diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go
index 93877035e..91f680763 100644
--- a/scrape/scrape_test.go
+++ b/scrape/scrape_test.go
@@ -15,6 +15,7 @@ package scrape
 
 import (
     "bytes"
+    "compress/gzip"
     "context"
     "fmt"
     "io"
@@ -1950,6 +1951,69 @@ func TestTargetScrapeScrapeNotFound(t *testing.T) {
     require.Contains(t, err.Error(), "404", "Expected \"404 NotFound\" error but got: %s", err)
 }
 
+func TestTargetScraperBodySizeLimit(t *testing.T) {
+    const (
+        bodySizeLimit = 15
+        responseBody  = "metric_a 1\nmetric_b 2\n"
+    )
+    var gzipResponse bool
+    server := httptest.NewServer(
+        http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+            w.Header().Set("Content-Type", `text/plain; version=0.0.4`)
+            if gzipResponse {
+                w.Header().Set("Content-Encoding", "gzip")
+                gw := gzip.NewWriter(w)
+                defer gw.Close()
+                gw.Write([]byte(responseBody))
+                return
+            }
+            w.Write([]byte(responseBody))
+        }),
+    )
+    defer server.Close()
+
+    serverURL, err := url.Parse(server.URL)
+    if err != nil {
+        panic(err)
+    }
+
+    ts := &targetScraper{
+        Target: &Target{
+            labels: labels.FromStrings(
+                model.SchemeLabel, serverURL.Scheme,
+                model.AddressLabel, serverURL.Host,
+            ),
+        },
+        client:        http.DefaultClient,
+        bodySizeLimit: bodySizeLimit,
+    }
+    var buf bytes.Buffer
+
+    // Target response uncompressed body, scrape with body size limit.
+    _, err = ts.scrape(context.Background(), &buf)
+    require.ErrorIs(t, err, errBodySizeLimit)
+    require.Equal(t, bodySizeLimit, buf.Len())
+    // Target response gzip compressed body, scrape with body size limit.
+    gzipResponse = true
+    buf.Reset()
+    _, err = ts.scrape(context.Background(), &buf)
+    require.ErrorIs(t, err, errBodySizeLimit)
+    require.Equal(t, bodySizeLimit, buf.Len())
+    // Target response uncompressed body, scrape without body size limit.
+    gzipResponse = false
+    buf.Reset()
+    ts.bodySizeLimit = 0
+    _, err = ts.scrape(context.Background(), &buf)
+    require.NoError(t, err)
+    require.Equal(t, len(responseBody), buf.Len())
+    // Target response gzip compressed body, scrape without body size limit.
+    gzipResponse = true
+    buf.Reset()
+    _, err = ts.scrape(context.Background(), &buf)
+    require.NoError(t, err)
+    require.Equal(t, len(responseBody), buf.Len())
+}
+
 // testScraper implements the scraper interface and allows setting values
 // returned by its methods. It also allows setting a custom scrape function.
 type testScraper struct {
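One detail worth spelling out, since the new config comment stresses the uncompressed body: in the gzip branch the `io.LimitReader` wraps `s.gzipr`, so the limit is compared against the inflated size, and a tiny compressed payload can still trip it. The sketch below is not part of the patch (the repetition count is arbitrary) and only shows how far the two sizes can diverge.

```go
package main

import (
	"bytes"
	"compress/gzip"
	"fmt"
	"io"
	"strings"
)

func main() {
	// Highly repetitive exposition compresses very well.
	body := strings.Repeat("metric_a 1\nmetric_b 2\n", 1000) // 22000 bytes uncompressed

	var compressed bytes.Buffer
	gw := gzip.NewWriter(&compressed)
	if _, err := gw.Write([]byte(body)); err != nil {
		panic(err)
	}
	if err := gw.Close(); err != nil {
		panic(err)
	}
	fmt.Println("uncompressed:", len(body), "compressed:", compressed.Len())

	// Decompressing restores the full size; this is the byte count the new
	// body_size_limit is compared against in the gzip code path.
	gr, err := gzip.NewReader(&compressed)
	if err != nil {
		panic(err)
	}
	n, err := io.Copy(io.Discard, gr)
	fmt.Println("decompressed:", n, err)
}
```

To exercise the new behaviour directly, `go test ./scrape -run TestTargetScraperBodySizeLimit` runs just the test added above.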