Browse Source

Fix cache maintenance on changing metric representations

We were not properly maintaining the scrape cache when the same metric
was exposed with a different string representation.
This overall reduces the scraping cache's complexity, which fixes the
issue and saves about 10% of memory in a scraping-only Prometheus
instance.
pull/3192/head
Fabian Reinartz 7 years ago
parent
commit
437f51a85f
  1. 19
      retrieval/helpers_test.go
  2. 103
      retrieval/scrape.go
  3. 49
      retrieval/scrape_test.go

19
retrieval/helpers_test.go

@ -32,13 +32,27 @@ func (a nopAppender) AddFast(labels.Labels, uint64, int64, float64) error { retu
func (a nopAppender) Commit() error { return nil } func (a nopAppender) Commit() error { return nil }
func (a nopAppender) Rollback() error { return nil } func (a nopAppender) Rollback() error { return nil }
// collectResultAppender records all samples that were added through the appender.
// It can be used as its zero value or be backed by another appender it writes samples through.
type collectResultAppender struct { type collectResultAppender struct {
next storage.Appender
result []sample result []sample
} }
func (a *collectResultAppender) AddFast(m labels.Labels, ref uint64, t int64, v float64) error { func (a *collectResultAppender) AddFast(m labels.Labels, ref uint64, t int64, v float64) error {
// Not implemented. if a.next == nil {
return storage.ErrNotFound return storage.ErrNotFound
}
err := a.next.AddFast(m, ref, t, v)
if err != nil {
return err
}
a.result = append(a.result, sample{
metric: m,
t: t,
v: v,
})
return err
} }
func (a *collectResultAppender) Add(m labels.Labels, t int64, v float64) (uint64, error) { func (a *collectResultAppender) Add(m labels.Labels, t int64, v float64) (uint64, error) {
@ -47,7 +61,10 @@ func (a *collectResultAppender) Add(m labels.Labels, t int64, v float64) (uint64
t: t, t: t,
v: v, v: v,
}) })
if a.next == nil {
return 0, nil return 0, nil
}
return a.next.Add(m, t, v)
} }
func (a *collectResultAppender) Commit() error { return nil } func (a *collectResultAppender) Commit() error { return nil }

103
retrieval/scrape.go

@ -45,13 +45,6 @@ import (
"github.com/prometheus/prometheus/util/httputil" "github.com/prometheus/prometheus/util/httputil"
) )
const (
scrapeHealthMetricName = "up"
scrapeDurationMetricName = "scrape_duration_seconds"
scrapeSamplesMetricName = "scrape_samples_scraped"
samplesPostRelabelMetricName = "scrape_samples_post_metric_relabeling"
)
var ( var (
targetIntervalLength = prometheus.NewSummaryVec( targetIntervalLength = prometheus.NewSummaryVec(
prometheus.SummaryOpts{ prometheus.SummaryOpts{
@ -460,15 +453,11 @@ type loop interface {
stop() stop()
} }
type lsetCacheEntry struct { type cacheEntry struct {
metric string
lset labels.Labels
hash uint64
}
type refEntry struct {
ref uint64 ref uint64
lastIter uint64 lastIter uint64
hash uint64
lset labels.Labels
} }
type scrapeLoop struct { type scrapeLoop struct {
@ -494,8 +483,9 @@ type scrapeLoop struct {
type scrapeCache struct { type scrapeCache struct {
iter uint64 // Current scrape iteration. iter uint64 // Current scrape iteration.
refs map[string]*refEntry // Parsed string to ref. // Parsed string to an entry with information about the actual label set
lsets map[uint64]*lsetCacheEntry // Ref to labelset and string. // and its storage reference.
entries map[string]*cacheEntry
// Cache of dropped metric strings and their iteration. The iteration must // Cache of dropped metric strings and their iteration. The iteration must
// be a pointer so we can update it without setting a new entry with an unsafe // be a pointer so we can update it without setting a new entry with an unsafe
@ -511,8 +501,7 @@ type scrapeCache struct {
func newScrapeCache() *scrapeCache { func newScrapeCache() *scrapeCache {
return &scrapeCache{ return &scrapeCache{
refs: map[string]*refEntry{}, entries: map[string]*cacheEntry{},
lsets: map[uint64]*lsetCacheEntry{},
dropped: map[string]*uint64{}, dropped: map[string]*uint64{},
seriesCur: map[uint64]labels.Labels{}, seriesCur: map[uint64]labels.Labels{},
seriesPrev: map[uint64]labels.Labels{}, seriesPrev: map[uint64]labels.Labels{},
@ -523,14 +512,13 @@ func (c *scrapeCache) iterDone() {
// refCache and lsetCache may grow over time through series churn // refCache and lsetCache may grow over time through series churn
// or multiple string representations of the same metric. Clean up entries // or multiple string representations of the same metric. Clean up entries
// that haven't appeared in the last scrape. // that haven't appeared in the last scrape.
for s, e := range c.refs { for s, e := range c.entries {
if e.lastIter < c.iter { if c.iter-e.lastIter > 2 {
delete(c.refs, s) delete(c.entries, s)
delete(c.lsets, e.ref)
} }
} }
for s, iter := range c.dropped { for s, iter := range c.dropped {
if *iter < c.iter { if c.iter-*iter > 2 {
delete(c.dropped, s) delete(c.dropped, s)
} }
} }
@ -546,29 +534,20 @@ func (c *scrapeCache) iterDone() {
c.iter++ c.iter++
} }
func (c *scrapeCache) getRef(met string) (uint64, bool) { func (c *scrapeCache) get(met string) (*cacheEntry, bool) {
e, ok := c.refs[met] e, ok := c.entries[met]
if !ok { if !ok {
return 0, false return nil, false
} }
e.lastIter = c.iter e.lastIter = c.iter
return e.ref, true return e, true
} }
func (c *scrapeCache) addRef(met string, ref uint64, lset labels.Labels, hash uint64) { func (c *scrapeCache) addRef(met string, ref uint64, lset labels.Labels, hash uint64) {
if ref == 0 { if ref == 0 {
return return
} }
// Clean up the label set cache before overwriting the ref for a previously seen c.entries[met] = &cacheEntry{ref: ref, lastIter: c.iter, lset: lset, hash: hash}
// metric representation. It won't be caught by the cleanup in iterDone otherwise.
if e, ok := c.refs[met]; ok {
delete(c.lsets, e.ref)
}
c.refs[met] = &refEntry{ref: ref, lastIter: c.iter}
// met is the raw string the metric was ingested as. The label set is not ordered
// and thus it's not suitable to uniquely identify cache entries.
// We store a hash over the label set instead.
c.lsets[ref] = &lsetCacheEntry{metric: met, lset: lset, hash: hash}
} }
func (c *scrapeCache) addDropped(met string) { func (c *scrapeCache) addDropped(met string) {
@ -825,14 +804,12 @@ loop:
if sl.cache.getDropped(yoloString(met)) { if sl.cache.getDropped(yoloString(met)) {
continue continue
} }
ref, ok := sl.cache.getRef(yoloString(met)) ce, ok := sl.cache.get(yoloString(met))
if ok { if ok {
lset := sl.cache.lsets[ref].lset switch err = app.AddFast(ce.lset, ce.ref, t, v); err {
switch err = app.AddFast(lset, ref, t, v); err {
case nil: case nil:
if tp == nil { if tp == nil {
e := sl.cache.lsets[ref] sl.cache.trackStaleness(ce.hash, ce.lset)
sl.cache.trackStaleness(e.hash, e.lset)
} }
case storage.ErrNotFound: case storage.ErrNotFound:
ok = false ok = false
@ -862,18 +839,10 @@ loop:
} }
} }
if !ok { if !ok {
var ( var lset labels.Labels
lset labels.Labels
mets string mets := p.Metric(&lset)
hash uint64 hash := lset.Hash()
)
if e, ok := sl.cache.lsets[ref]; ok {
mets = e.metric
lset = e.lset
hash = e.hash
} else {
mets = p.Metric(&lset)
hash = lset.Hash()
// Hash label set as it is seen local to the target. Then add target labels // Hash label set as it is seen local to the target. Then add target labels
// and relabeling and store the final label set. // and relabeling and store the final label set.
@ -884,7 +853,6 @@ loop:
sl.cache.addDropped(mets) sl.cache.addDropped(mets)
continue continue
} }
}
var ref uint64 var ref uint64
ref, err = app.Add(lset, t, v) ref, err = app.Add(lset, t, v)
@ -970,6 +938,15 @@ func yoloString(b []byte) string {
return *((*string)(unsafe.Pointer(&b))) return *((*string)(unsafe.Pointer(&b)))
} }
// The constants are suffixed with the invalid \xff unicode rune to avoid collisions
// with scraped metrics in the cache.
const (
scrapeHealthMetricName = "up" + "\xff"
scrapeDurationMetricName = "scrape_duration_seconds" + "\xff"
scrapeSamplesMetricName = "scrape_samples_scraped" + "\xff"
samplesPostRelabelMetricName = "scrape_samples_post_metric_relabeling" + "\xff"
)
func (sl *scrapeLoop) report(start time.Time, duration time.Duration, scraped, appended int, err error) error { func (sl *scrapeLoop) report(start time.Time, duration time.Duration, scraped, appended int, err error) error {
sl.scraper.report(start, duration, err) sl.scraper.report(start, duration, err)
@ -1026,14 +1003,9 @@ func (sl *scrapeLoop) reportStale(start time.Time) error {
} }
func (sl *scrapeLoop) addReportSample(app storage.Appender, s string, t int64, v float64) error { func (sl *scrapeLoop) addReportSample(app storage.Appender, s string, t int64, v float64) error {
// Suffix s with the invalid \xff unicode rune to avoid collisions ce, ok := sl.cache.get(s)
// with scraped metrics.
s2 := s + "\xff"
ref, ok := sl.cache.getRef(s2)
if ok { if ok {
lset := sl.cache.lsets[ref].lset err := app.AddFast(ce.lset, ce.ref, t, v)
err := app.AddFast(lset, ref, t, v)
switch err { switch err {
case nil: case nil:
return nil return nil
@ -1048,7 +1020,10 @@ func (sl *scrapeLoop) addReportSample(app storage.Appender, s string, t int64, v
} }
} }
lset := labels.Labels{ lset := labels.Labels{
labels.Label{Name: labels.MetricName, Value: s}, // The constants are suffixed with the invalid \xff unicode rune to avoid collisions
// with scraped metrics in the cache.
// We have to drop it when building the actual metric.
labels.Label{Name: labels.MetricName, Value: s[:len(s)-1]},
} }
hash := lset.Hash() hash := lset.Hash()
@ -1057,7 +1032,7 @@ func (sl *scrapeLoop) addReportSample(app storage.Appender, s string, t int64, v
ref, err := app.Add(lset, t, v) ref, err := app.Add(lset, t, v)
switch err { switch err {
case nil: case nil:
sl.cache.addRef(s2, ref, lset, hash) sl.cache.addRef(s, ref, lset, hash)
return nil return nil
case storage.ErrOutOfOrderSample, storage.ErrDuplicateSampleForTimestamp: case storage.ErrOutOfOrderSample, storage.ErrDuplicateSampleForTimestamp:
return nil return nil

49
retrieval/scrape_test.go

@ -37,6 +37,7 @@ import (
"github.com/prometheus/prometheus/pkg/timestamp" "github.com/prometheus/prometheus/pkg/timestamp"
"github.com/prometheus/prometheus/pkg/value" "github.com/prometheus/prometheus/pkg/value"
"github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/util/testutil"
) )
func TestNewScrapePool(t *testing.T) { func TestNewScrapePool(t *testing.T) {
@ -625,6 +626,54 @@ func TestScrapeLoopAppend(t *testing.T) {
} }
} }
func TestScrapeLoop_ChangingMetricString(t *testing.T) {
// This is a regression test for the scrape loop cache not properly maintaining
// IDs when the string representation of a metric changes across a scrape. Thus
// we use a real storage appender here.
s := testutil.NewStorage(t)
defer s.Close()
app, err := s.Appender()
if err != nil {
t.Error(err)
}
capp := &collectResultAppender{next: app}
sl := newScrapeLoop(context.Background(),
nil, nil, nil,
nopMutator,
nopMutator,
func() storage.Appender { return capp },
)
now := time.Now()
_, _, err = sl.append([]byte(`metric_a{a="1",b="1"} 1`), now)
if err != nil {
t.Fatalf("Unexpected append error: %s", err)
}
_, _, err = sl.append([]byte(`metric_a{b="1",a="1"} 2`), now.Add(time.Minute))
if err != nil {
t.Fatalf("Unexpected append error: %s", err)
}
// DeepEqual will report NaNs as being different, so replace with a different value.
want := []sample{
{
metric: labels.FromStrings("__name__", "metric_a", "a", "1", "b", "1"),
t: timestamp.FromTime(now),
v: 1,
},
{
metric: labels.FromStrings("__name__", "metric_a", "a", "1", "b", "1"),
t: timestamp.FromTime(now.Add(time.Minute)),
v: 2,
},
}
if !reflect.DeepEqual(want, capp.result) {
t.Fatalf("Appended samples not as expected. Wanted: %+v Got: %+v", want, capp.result)
}
}
func TestScrapeLoopAppendStaleness(t *testing.T) { func TestScrapeLoopAppendStaleness(t *testing.T) {
app := &collectResultAppender{} app := &collectResultAppender{}

Loading…
Cancel
Save