Add body_size_limit to prevent bad targets response large body cause Prometheus server OOM (#8827)

Signed-off-by: hanjm <hanjinming@outlook.com>
4 years ago · 1df05bfd49
8 changed files with 121 additions and 14 deletions
--- a/config/config.go
+++ b/config/config.go
@ -23,6 +23,7 @@ import (
 	"strings"
 	"time"

+	"github.com/alecthomas/units"
 	"github.com/go-kit/kit/log"
 	"github.com/go-kit/kit/log/level"
 	"github.com/pkg/errors"
@ -382,6 +383,9 @@ type ScrapeConfig struct {
 	MetricsPath string `yaml:"metrics_path,omitempty"`
 	// The URL scheme with which to fetch metrics from targets.
 	Scheme string `yaml:"scheme,omitempty"`
+	// An uncompressed response body larger than this many bytes will cause the
+	// scrape to fail. 0 means no limit.
+	BodySizeLimit units.Base2Bytes `yaml:"body_size_limit,omitempty"`
 	// More than this many samples post metric-relabeling will cause the scrape to
 	// fail.
 	SampleLimit uint `yaml:"sample_limit,omitempty"`
--- a/config/config_test.go
+++ b/config/config_test.go
@ -23,6 +23,7 @@ import (
 	"testing"
 	"time"

+	"github.com/alecthomas/units"
 	"github.com/go-kit/kit/log"
 	"github.com/prometheus/common/config"
 	"github.com/prometheus/common/model"
@ -223,6 +224,7 @@ var expectedConf = &Config{
 			HonorTimestamps: true,
 			ScrapeInterval:  model.Duration(50 * time.Second),
 			ScrapeTimeout:   model.Duration(5 * time.Second),
+			BodySizeLimit:   10 * units.MiB,
 			SampleLimit:     1000,

 			HTTPClientConfig: config.HTTPClientConfig{
@ -1200,6 +1202,10 @@ var expectedErrors = []struct {
 		filename: "scaleway_two_secrets.bad.yml",
 		errMsg:   "at most one of secret_key & secret_key_file must be configured",
 	},
+	{
+		filename: "scrape_body_size_limit.bad.yml",
+		errMsg:   "units: unknown unit  in 100",
+	},
 }

 func TestBadConfigs(t *testing.T) {
--- a/config/testdata/conf.good.yml
+++ b/config/testdata/conf.good.yml
@ -97,6 +97,7 @@ scrape_configs:
  scrape_interval: 50s
  scrape_timeout:  5s

+  body_size_limit: 10MB
  sample_limit: 1000

  metrics_path: /my_path
--- a/config/testdata/scrape_body_size_limit.bad.yml
+++ b/config/testdata/scrape_body_size_limit.bad.yml
@ -0,0 +1,3 @@
+scrape_configs:
+- job_name: prometheus
+  body_size_limit: 100
--- a/docs/configuration/configuration.md
+++ b/docs/configuration/configuration.md
@ -283,6 +283,9 @@ relabel_configs:
 metric_relabel_configs:
  [ - <relabel_config> ... ]

+# An uncompressed response body larger than this many bytes will cause the
+# scrape to fail. 0 means no limit. Example: 100MB.
+[ body_size_limit: <string> | default = 0 ]
 # Per-scrape limit on number of scraped samples that will be accepted.
 # If more than this number of samples are present after metric relabeling
 # the entire scrape will be treated as failed. 0 means no limit.
--- a/documentation/prometheus-mixin/dashboards.libsonnet
+++ b/documentation/prometheus-mixin/dashboards.libsonnet
@ -54,11 +54,13 @@ local template = grafana.template;
        .addPanel(
          g.panel('Scrape failures') +
          g.queryPanel([
+            'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
          ], [
+            'exceeded body size limit: {{job}}',
            'exceeded sample limit: {{job}}',
            'duplicate timestamp: {{job}}',
            'out of bounds: {{job}}',
--- a/scrape/scrape.go
+++ b/scrape/scrape.go
@ -134,6 +134,12 @@ var (
 		},
 		[]string{"scrape_job"},
 	)
+	targetScrapeExceededBodySizeLimit = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "prometheus_target_scrapes_exceeded_body_size_limit_total",
+			Help: "Total number of scrapes that hit the body size limit",
+		},
+	)
 	targetScrapeSampleLimit = prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Name: "prometheus_target_scrapes_exceeded_sample_limit_total",
@ -195,6 +201,7 @@ func init() {
 		targetScrapePoolReloadsFailed,
 		targetSyncIntervalLength,
 		targetScrapePoolSyncsCounter,
+		targetScrapeExceededBodySizeLimit,
 		targetScrapeSampleLimit,
 		targetScrapeSampleDuplicate,
 		targetScrapeSampleOutOfOrder,
@ -381,11 +388,12 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
 	targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit))

 	var (
-		wg          sync.WaitGroup
-		interval    = time.Duration(sp.config.ScrapeInterval)
-		timeout     = time.Duration(sp.config.ScrapeTimeout)
-		sampleLimit = int(sp.config.SampleLimit)
-		labelLimits = &labelLimits{
+		wg            sync.WaitGroup
+		interval      = time.Duration(sp.config.ScrapeInterval)
+		timeout       = time.Duration(sp.config.ScrapeTimeout)
+		bodySizeLimit = int64(sp.config.BodySizeLimit)
+		sampleLimit   = int(sp.config.SampleLimit)
+		labelLimits   = &labelLimits{
 			labelLimit:            int(sp.config.LabelLimit),
 			labelNameLengthLimit:  int(sp.config.LabelNameLengthLimit),
 			labelValueLengthLimit: int(sp.config.LabelValueLengthLimit),
@ -408,7 +416,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
 		}
 		var (
 			t       = sp.activeTargets[fp]
-			s       = &targetScraper{Target: t, client: sp.client, timeout: timeout}
+			s       = &targetScraper{Target: t, client: sp.client, timeout: timeout, bodySizeLimit: bodySizeLimit}
 			newLoop = sp.newLoop(scrapeLoopOptions{
 				target:          t,
 				scraper:         s,
@ -481,11 +489,12 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
 // It returns after all stopped scrape loops terminated.
 func (sp *scrapePool) sync(targets []*Target) {
 	var (
-		uniqueLoops = make(map[uint64]loop)
-		interval    = time.Duration(sp.config.ScrapeInterval)
-		timeout     = time.Duration(sp.config.ScrapeTimeout)
-		sampleLimit = int(sp.config.SampleLimit)
-		labelLimits = &labelLimits{
+		uniqueLoops   = make(map[uint64]loop)
+		interval      = time.Duration(sp.config.ScrapeInterval)
+		timeout       = time.Duration(sp.config.ScrapeTimeout)
+		bodySizeLimit = int64(sp.config.BodySizeLimit)
+		sampleLimit   = int(sp.config.SampleLimit)
+		labelLimits   = &labelLimits{
 			labelLimit:            int(sp.config.LabelLimit),
 			labelNameLengthLimit:  int(sp.config.LabelNameLengthLimit),
 			labelValueLengthLimit: int(sp.config.LabelValueLengthLimit),
@ -500,7 +509,7 @@ func (sp *scrapePool) sync(targets []*Target) {
 		hash := t.hash()

 		if _, ok := sp.activeTargets[hash]; !ok {
-			s := &targetScraper{Target: t, client: sp.client, timeout: timeout}
+			s := &targetScraper{Target: t, client: sp.client, timeout: timeout, bodySizeLimit: bodySizeLimit}
 			l := sp.newLoop(scrapeLoopOptions{
 				target:          t,
 				scraper:         s,
@ -690,8 +699,12 @@ type targetScraper struct {

 	gzipr *gzip.Reader
 	buf   *bufio.Reader
+
+	bodySizeLimit int64
 }

+var errBodySizeLimit = errors.New("body size limit exceeded")
+
 const acceptHeader = `application/openmetrics-text; version=0.0.1,text/plain;version=0.0.4;q=0.5,*/*;q=0.1`

 var userAgentHeader = fmt.Sprintf("Prometheus/%s", version.Version)
@ -723,11 +736,18 @@ func (s *targetScraper) scrape(ctx context.Context, w io.Writer) (string, error)
 		return "", errors.Errorf("server returned HTTP status %s", resp.Status)
 	}

+	if s.bodySizeLimit <= 0 {
+		s.bodySizeLimit = math.MaxInt64
+	}
 	if resp.Header.Get("Content-Encoding") != "gzip" {
-		_, err = io.Copy(w, resp.Body)
+		n, err := io.Copy(w, io.LimitReader(resp.Body, s.bodySizeLimit))
 		if err != nil {
 			return "", err
 		}
+		if n >= s.bodySizeLimit {
+			targetScrapeExceededBodySizeLimit.Inc()
+			return "", errBodySizeLimit
+		}
 		return resp.Header.Get("Content-Type"), nil
 	}

@ -744,11 +764,15 @@ func (s *targetScraper) scrape(ctx context.Context, w io.Writer) (string, error)
 		}
 	}

-	_, err = io.Copy(w, s.gzipr)
+	n, err := io.Copy(w, io.LimitReader(s.gzipr, s.bodySizeLimit))
 	s.gzipr.Close()
 	if err != nil {
 		return "", err
 	}
+	if n >= s.bodySizeLimit {
+		targetScrapeExceededBodySizeLimit.Inc()
+		return "", errBodySizeLimit
+	}
 	return resp.Header.Get("Content-Type"), nil
 }

--- a/scrape/scrape_test.go
+++ b/scrape/scrape_test.go
@ -15,6 +15,7 @@ package scrape

 import (
 	"bytes"
+	"compress/gzip"
 	"context"
 	"fmt"
 	"io"
@ -1950,6 +1951,69 @@ func TestTargetScrapeScrapeNotFound(t *testing.T) {
 	require.Contains(t, err.Error(), "404", "Expected \"404 NotFound\" error but got: %s", err)
 }

+func TestTargetScraperBodySizeLimit(t *testing.T) {
+	const (
+		bodySizeLimit = 15
+		responseBody  = "metric_a 1\nmetric_b 2\n"
+	)
+	var gzipResponse bool
+	server := httptest.NewServer(
+		http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.Header().Set("Content-Type", `text/plain; version=0.0.4`)
+			if gzipResponse {
+				w.Header().Set("Content-Encoding", "gzip")
+				gw := gzip.NewWriter(w)
+				defer gw.Close()
+				gw.Write([]byte(responseBody))
+				return
+			}
+			w.Write([]byte(responseBody))
+		}),
+	)
+	defer server.Close()
+
+	serverURL, err := url.Parse(server.URL)
+	if err != nil {
+		panic(err)
+	}
+
+	ts := &targetScraper{
+		Target: &Target{
+			labels: labels.FromStrings(
+				model.SchemeLabel, serverURL.Scheme,
+				model.AddressLabel, serverURL.Host,
+			),
+		},
+		client:        http.DefaultClient,
+		bodySizeLimit: bodySizeLimit,
+	}
+	var buf bytes.Buffer
+
+	// Target response uncompressed body, scrape with body size limit.
+	_, err = ts.scrape(context.Background(), &buf)
+	require.ErrorIs(t, err, errBodySizeLimit)
+	require.Equal(t, bodySizeLimit, buf.Len())
+	// Target response gzip compressed body, scrape with body size limit.
+	gzipResponse = true
+	buf.Reset()
+	_, err = ts.scrape(context.Background(), &buf)
+	require.ErrorIs(t, err, errBodySizeLimit)
+	require.Equal(t, bodySizeLimit, buf.Len())
+	// Target response uncompressed body, scrape without body size limit.
+	gzipResponse = false
+	buf.Reset()
+	ts.bodySizeLimit = 0
+	_, err = ts.scrape(context.Background(), &buf)
+	require.NoError(t, err)
+	require.Equal(t, len(responseBody), buf.Len())
+	// Target response gzip compressed body, scrape without body size limit.
+	gzipResponse = true
+	buf.Reset()
+	_, err = ts.scrape(context.Background(), &buf)
+	require.NoError(t, err)
+	require.Equal(t, len(responseBody), buf.Len())
+}
+
 // testScraper implements the scraper interface and allows setting values
 // returned by its methods. It also allows setting a custom scrape function.
 type testScraper struct {