Browse Source

Add body_size_limit to prevent bad targets response large body cause Prometheus server OOM (#8827)

Signed-off-by: hanjm <hanjinming@outlook.com>
pull/8833/head
hanjm 4 years ago
parent
commit
1df05bfd49
  1. 4
      config/config.go
  2. 6
      config/config_test.go
  3. 1
      config/testdata/conf.good.yml
  4. 3
      config/testdata/scrape_body_size_limit.bad.yml
  5. 3
      docs/configuration/configuration.md
  6. 2
      documentation/prometheus-mixin/dashboards.libsonnet
  7. 52
      scrape/scrape.go
  8. 64
      scrape/scrape_test.go

4
config/config.go

@ -23,6 +23,7 @@ import (
"strings"
"time"
"github.com/alecthomas/units"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/pkg/errors"
@ -382,6 +383,9 @@ type ScrapeConfig struct {
MetricsPath string `yaml:"metrics_path,omitempty"`
// The URL scheme with which to fetch metrics from targets.
Scheme string `yaml:"scheme,omitempty"`
// An uncompressed response body larger than this many bytes will cause the
// scrape to fail. 0 means no limit.
BodySizeLimit units.Base2Bytes `yaml:"body_size_limit,omitempty"`
// More than this many samples post metric-relabeling will cause the scrape to
// fail.
SampleLimit uint `yaml:"sample_limit,omitempty"`

6
config/config_test.go

@ -23,6 +23,7 @@ import (
"testing"
"time"
"github.com/alecthomas/units"
"github.com/go-kit/kit/log"
"github.com/prometheus/common/config"
"github.com/prometheus/common/model"
@ -223,6 +224,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(50 * time.Second),
ScrapeTimeout: model.Duration(5 * time.Second),
BodySizeLimit: 10 * units.MiB,
SampleLimit: 1000,
HTTPClientConfig: config.HTTPClientConfig{
@ -1200,6 +1202,10 @@ var expectedErrors = []struct {
filename: "scaleway_two_secrets.bad.yml",
errMsg: "at most one of secret_key & secret_key_file must be configured",
},
{
filename: "scrape_body_size_limit.bad.yml",
errMsg: "units: unknown unit in 100",
},
}
func TestBadConfigs(t *testing.T) {

1
config/testdata/conf.good.yml vendored

@ -97,6 +97,7 @@ scrape_configs:
scrape_interval: 50s
scrape_timeout: 5s
body_size_limit: 10MB
sample_limit: 1000
metrics_path: /my_path

3
config/testdata/scrape_body_size_limit.bad.yml vendored

@ -0,0 +1,3 @@
scrape_configs:
- job_name: prometheus
body_size_limit: 100

3
docs/configuration/configuration.md

@ -283,6 +283,9 @@ relabel_configs:
metric_relabel_configs:
[ - <relabel_config> ... ]
# An uncompressed response body larger than this many bytes will cause the
# scrape to fail. 0 means no limit. Example: 100MB.
[ body_size_limit: <string> | default = 0 ]
# Per-scrape limit on number of scraped samples that will be accepted.
# If more than this number of samples are present after metric relabeling
# the entire scrape will be treated as failed. 0 means no limit.

2
documentation/prometheus-mixin/dashboards.libsonnet

@ -54,11 +54,13 @@ local template = grafana.template;
.addPanel(
g.panel('Scrape failures') +
g.queryPanel([
'sum by (job) (rate(prometheus_target_scrapes_exceeded_body_size_limit_total[1m]))',
'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
], [
'exceeded body size limit: {{job}}',
'exceeded sample limit: {{job}}',
'duplicate timestamp: {{job}}',
'out of bounds: {{job}}',

52
scrape/scrape.go

@ -134,6 +134,12 @@ var (
},
[]string{"scrape_job"},
)
targetScrapeExceededBodySizeLimit = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_target_scrapes_exceeded_body_size_limit_total",
Help: "Total number of scrapes that hit the body size limit",
},
)
targetScrapeSampleLimit = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_target_scrapes_exceeded_sample_limit_total",
@ -195,6 +201,7 @@ func init() {
targetScrapePoolReloadsFailed,
targetSyncIntervalLength,
targetScrapePoolSyncsCounter,
targetScrapeExceededBodySizeLimit,
targetScrapeSampleLimit,
targetScrapeSampleDuplicate,
targetScrapeSampleOutOfOrder,
@ -381,11 +388,12 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit))
var (
wg sync.WaitGroup
interval = time.Duration(sp.config.ScrapeInterval)
timeout = time.Duration(sp.config.ScrapeTimeout)
sampleLimit = int(sp.config.SampleLimit)
labelLimits = &labelLimits{
wg sync.WaitGroup
interval = time.Duration(sp.config.ScrapeInterval)
timeout = time.Duration(sp.config.ScrapeTimeout)
bodySizeLimit = int64(sp.config.BodySizeLimit)
sampleLimit = int(sp.config.SampleLimit)
labelLimits = &labelLimits{
labelLimit: int(sp.config.LabelLimit),
labelNameLengthLimit: int(sp.config.LabelNameLengthLimit),
labelValueLengthLimit: int(sp.config.LabelValueLengthLimit),
@ -408,7 +416,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
}
var (
t = sp.activeTargets[fp]
s = &targetScraper{Target: t, client: sp.client, timeout: timeout}
s = &targetScraper{Target: t, client: sp.client, timeout: timeout, bodySizeLimit: bodySizeLimit}
newLoop = sp.newLoop(scrapeLoopOptions{
target: t,
scraper: s,
@ -481,11 +489,12 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
// It returns after all stopped scrape loops terminated.
func (sp *scrapePool) sync(targets []*Target) {
var (
uniqueLoops = make(map[uint64]loop)
interval = time.Duration(sp.config.ScrapeInterval)
timeout = time.Duration(sp.config.ScrapeTimeout)
sampleLimit = int(sp.config.SampleLimit)
labelLimits = &labelLimits{
uniqueLoops = make(map[uint64]loop)
interval = time.Duration(sp.config.ScrapeInterval)
timeout = time.Duration(sp.config.ScrapeTimeout)
bodySizeLimit = int64(sp.config.BodySizeLimit)
sampleLimit = int(sp.config.SampleLimit)
labelLimits = &labelLimits{
labelLimit: int(sp.config.LabelLimit),
labelNameLengthLimit: int(sp.config.LabelNameLengthLimit),
labelValueLengthLimit: int(sp.config.LabelValueLengthLimit),
@ -500,7 +509,7 @@ func (sp *scrapePool) sync(targets []*Target) {
hash := t.hash()
if _, ok := sp.activeTargets[hash]; !ok {
s := &targetScraper{Target: t, client: sp.client, timeout: timeout}
s := &targetScraper{Target: t, client: sp.client, timeout: timeout, bodySizeLimit: bodySizeLimit}
l := sp.newLoop(scrapeLoopOptions{
target: t,
scraper: s,
@ -690,8 +699,12 @@ type targetScraper struct {
gzipr *gzip.Reader
buf *bufio.Reader
bodySizeLimit int64
}
var errBodySizeLimit = errors.New("body size limit exceeded")
const acceptHeader = `application/openmetrics-text; version=0.0.1,text/plain;version=0.0.4;q=0.5,*/*;q=0.1`
var userAgentHeader = fmt.Sprintf("Prometheus/%s", version.Version)
@ -723,11 +736,18 @@ func (s *targetScraper) scrape(ctx context.Context, w io.Writer) (string, error)
return "", errors.Errorf("server returned HTTP status %s", resp.Status)
}
if s.bodySizeLimit <= 0 {
s.bodySizeLimit = math.MaxInt64
}
if resp.Header.Get("Content-Encoding") != "gzip" {
_, err = io.Copy(w, resp.Body)
n, err := io.Copy(w, io.LimitReader(resp.Body, s.bodySizeLimit))
if err != nil {
return "", err
}
if n >= s.bodySizeLimit {
targetScrapeExceededBodySizeLimit.Inc()
return "", errBodySizeLimit
}
return resp.Header.Get("Content-Type"), nil
}
@ -744,11 +764,15 @@ func (s *targetScraper) scrape(ctx context.Context, w io.Writer) (string, error)
}
}
_, err = io.Copy(w, s.gzipr)
n, err := io.Copy(w, io.LimitReader(s.gzipr, s.bodySizeLimit))
s.gzipr.Close()
if err != nil {
return "", err
}
if n >= s.bodySizeLimit {
targetScrapeExceededBodySizeLimit.Inc()
return "", errBodySizeLimit
}
return resp.Header.Get("Content-Type"), nil
}

64
scrape/scrape_test.go

@ -15,6 +15,7 @@ package scrape
import (
"bytes"
"compress/gzip"
"context"
"fmt"
"io"
@ -1950,6 +1951,69 @@ func TestTargetScrapeScrapeNotFound(t *testing.T) {
require.Contains(t, err.Error(), "404", "Expected \"404 NotFound\" error but got: %s", err)
}
func TestTargetScraperBodySizeLimit(t *testing.T) {
const (
bodySizeLimit = 15
responseBody = "metric_a 1\nmetric_b 2\n"
)
var gzipResponse bool
server := httptest.NewServer(
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", `text/plain; version=0.0.4`)
if gzipResponse {
w.Header().Set("Content-Encoding", "gzip")
gw := gzip.NewWriter(w)
defer gw.Close()
gw.Write([]byte(responseBody))
return
}
w.Write([]byte(responseBody))
}),
)
defer server.Close()
serverURL, err := url.Parse(server.URL)
if err != nil {
panic(err)
}
ts := &targetScraper{
Target: &Target{
labels: labels.FromStrings(
model.SchemeLabel, serverURL.Scheme,
model.AddressLabel, serverURL.Host,
),
},
client: http.DefaultClient,
bodySizeLimit: bodySizeLimit,
}
var buf bytes.Buffer
// Target response uncompressed body, scrape with body size limit.
_, err = ts.scrape(context.Background(), &buf)
require.ErrorIs(t, err, errBodySizeLimit)
require.Equal(t, bodySizeLimit, buf.Len())
// Target response gzip compressed body, scrape with body size limit.
gzipResponse = true
buf.Reset()
_, err = ts.scrape(context.Background(), &buf)
require.ErrorIs(t, err, errBodySizeLimit)
require.Equal(t, bodySizeLimit, buf.Len())
// Target response uncompressed body, scrape without body size limit.
gzipResponse = false
buf.Reset()
ts.bodySizeLimit = 0
_, err = ts.scrape(context.Background(), &buf)
require.NoError(t, err)
require.Equal(t, len(responseBody), buf.Len())
// Target response gzip compressed body, scrape without body size limit.
gzipResponse = true
buf.Reset()
_, err = ts.scrape(context.Background(), &buf)
require.NoError(t, err)
require.Equal(t, len(responseBody), buf.Len())
}
// testScraper implements the scraper interface and allows setting values
// returned by its methods. It also allows setting a custom scrape function.
type testScraper struct {

Loading…
Cancel
Save