From 220e78b9c3dc63b0c8b83190f042183852b42772 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Tue, 23 May 2017 17:36:35 +0100 Subject: [PATCH] Consider a series stale after 4.1 intervals with no data. To cover the cases where stale markers may not be available, we need to infer the interval and mark series stale based on that. As we're lacking stale markers this is less accurate, however it should be good enough for these cases. We need 4 intervals as if say we had data at t=0 and t=10, coming via federation. The next data point should be at t=20 however it could take up to t=30 for it actually to be ingested, t=40 for it to be scraped via federation and t=50 for it to be ingested. We then add 10% on to that for slack, as we do elsewhere. --- promql/engine.go | 19 ++++++++++++++++++- promql/testdata/staleness.test | 27 +++++++++++++++++++++++++++ web/federate.go | 2 +- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/promql/engine.go b/promql/engine.go index a5498bf56..f43015c74 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -751,8 +751,10 @@ func (ev *evaluator) vectorSelector(node *VectorSelector) Vector { } t, v := it.Values() + peek := 1 if !ok || t > refTime { - t, v, ok = it.PeekBack(1) + t, v, ok = it.PeekBack(peek) + peek += 1 if !ok || t < refTime-durationMilliseconds(StalenessDelta) { continue } @@ -760,6 +762,21 @@ func (ev *evaluator) vectorSelector(node *VectorSelector) Vector { if value.IsStaleNaN(v) { continue } + // Find timestamp before this point, within the staleness delta. + prevT, _, ok := it.PeekBack(peek) + if ok && prevT >= refTime-durationMilliseconds(StalenessDelta) { + interval := t - prevT + if interval*4+interval/10 < refTime-t { + // It is more than 4 (+10% for safety) intervals + // since the last data point, skip as stale. + // + // We need 4 to allow for federation, as with a 10s einterval an eval + // started at t=10 could be ingested at t=20, scraped for federation at + // t=30 and only ingested by federation at t=40. + continue + } + } + vec = append(vec, Sample{ Metric: node.series[i].Labels(), Point: Point{V: v, T: t}, diff --git a/promql/testdata/staleness.test b/promql/testdata/staleness.test index abb67ba0b..9c24fe31c 100644 --- a/promql/testdata/staleness.test +++ b/promql/testdata/staleness.test @@ -10,6 +10,15 @@ eval instant at 20s metric eval instant at 30s metric {__name__="metric"} 2 +eval instant at 40s metric + {__name__="metric"} 2 + +# It goes stale 4 intervals + 10% after the last sample. +eval instant at 71s metric + {__name__="metric"} 2 + +eval instant at 72s metric + # Range vector ignores stale sample. eval instant at 30s count_over_time(metric[1m]) @@ -22,3 +31,21 @@ eval instant at 20s count_over_time(metric[1s]) eval instant at 20s count_over_time(metric[10s]) {} 1 + + +clear + +load 10s + metric 0 + +# Series with single point goes stale after 5 minutes. +eval instant at 0s metric + {__name__="metric"} 0 + +eval instant at 150s metric + {__name__="metric"} 0 + +eval instant at 300s metric + {__name__="metric"} 0 + +eval instant at 301s metric diff --git a/web/federate.go b/web/federate.go index 80068e85e..bc79ffc91 100644 --- a/web/federate.go +++ b/web/federate.go @@ -94,7 +94,7 @@ func (h *Handler) federation(w http.ResponseWriter, req *http.Request) { if ok { t, v = it.Values() } else { - t, v, ok = it.PeekBack(0) + t, v, ok = it.PeekBack(1) if !ok { continue }