PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases. This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`. --------- Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
1 year ago · ccfe14d7e7
parent 2e205ee95c
commit ccfe14d7e7
7 changed files with 416 additions and 36 deletions
--- a/docs/querying/functions.md
+++ b/docs/querying/functions.md
@ -323,6 +323,24 @@ a histogram.
 You can use `histogram_quantile(1, v instant-vector)` to get the estimated maximum value stored in
 a histogram.

+Buckets of classic histograms are cumulative. Therefore, the following should always be the case:
+
+- The counts in the buckets are monotonically increasing (strictly non-decreasing).
+- A lack of observations between the upper limits of two consecutive buckets results in equal counts
+in those two buckets.
+
+However, floating point precision issues (e.g. small discrepancies introduced by computing of buckets
+with `sum(rate(...))`) or invalid data might violate these assumptions. In that case,
+`histogram_quantile` would be unable to return meaningful results. To mitigate the issue,
+`histogram_quantile` assumes that tiny relative differences between consecutive buckets are happening
+because of floating point precision errors and ignores them. (The threshold to ignore a difference
+between two buckets is a trillionth (1e-12) of the sum of both buckets.) Furthermore, if there are
+non-monotonic bucket counts even after this adjustment, they are increased to the value of the
+previous buckets to enforce monotonicity. The latter is evidence for an actual issue with the input
+data and is therefore flagged with an informational annotation reading `input to histogram_quantile
+needed to be fixed for monotonicity`. If you encounter this annotation, you should find and remove
+the source of the invalid data.
+
 ## `histogram_stddev()` and `histogram_stdvar()`

 _Both functions only act on native histograms, which are an experimental
--- a/promql/engine_test.go
+++ b/promql/engine_test.go
@ -3699,7 +3699,7 @@ func TestNativeHistogram_HistogramQuantile(t *testing.T) {

 						require.Len(t, vector, 1)
 						require.Nil(t, vector[0].H)
-						require.True(t, almostEqual(sc.value, vector[0].F))
+						require.True(t, almostEqual(sc.value, vector[0].F, defaultEpsilon))
 					})
 				}
 				idx++
--- a/promql/functions.go
+++ b/promql/functions.go
@ -1163,7 +1163,7 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev

 	for _, mb := range enh.signatureToMetricWithBuckets {
 		if len(mb.buckets) > 0 {
-			res, forcedMonotonicity := bucketQuantile(q, mb.buckets)
+			res, forcedMonotonicity, _ := bucketQuantile(q, mb.buckets)
 			enh.Out = append(enh.Out, Sample{
 				Metric: mb.metric,
 				F:      res,
--- a/promql/quantile.go
+++ b/promql/quantile.go
@ -23,6 +23,25 @@ import (
 	"github.com/prometheus/prometheus/model/labels"
 )

+// smallDeltaTolerance is the threshold for relative deltas between classic
+// histogram buckets that will be ignored by the histogram_quantile function
+// because they are most likely artifacts of floating point precision issues.
+// Testing on 2 sets of real data with bugs arising from small deltas,
+// the safe ranges were from:
+// - 1e-05 to 1e-15
+// - 1e-06 to 1e-15
+// Anything to the left of that would cause non-query-sharded data to have
+// small deltas ignored (unnecessary and we should avoid this), and anything
+// to the right of that would cause query-sharded data to not have its small
+// deltas ignored (so the problem won't be fixed).
+// For context, query sharding triggers these float precision errors in Mimir.
+// To illustrate, with a relative deviation of 1e-12, we need to have 1e12
+// observations in the bucket so that the change of one observation is small
+// enough to get ignored. With the usual observation rate even of very busy
+// services, this will hardly be reached in timeframes that matters for
+// monitoring.
+const smallDeltaTolerance = 1e-12
+
 // Helpers to calculate quantiles.

 // excludedLabels are the labels to exclude from signature calculation for
@ -72,16 +91,19 @@ type metricWithBuckets struct {
 //
 // If q>1, +Inf is returned.
 //
-// We also return a bool to indicate if monotonicity needed to be forced.
-func bucketQuantile(q float64, buckets buckets) (float64, bool) {
+// We also return a bool to indicate if monotonicity needed to be forced,
+// and another bool to indicate if small differences between buckets (that
+// are likely artifacts of floating point precision issues) have been
+// ignored.
+func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) {
 	if math.IsNaN(q) {
-		return math.NaN(), false
+		return math.NaN(), false, false
 	}
 	if q < 0 {
-		return math.Inf(-1), false
+		return math.Inf(-1), false, false
 	}
 	if q > 1 {
-		return math.Inf(+1), false
+		return math.Inf(+1), false, false
 	}
 	slices.SortFunc(buckets, func(a, b bucket) int {
 		// We don't expect the bucket boundary to be a NaN.
@ -94,27 +116,27 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) {
 		return 0
 	})
 	if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
-		return math.NaN(), false
+		return math.NaN(), false, false
 	}

 	buckets = coalesceBuckets(buckets)
-	forcedMonotonic := ensureMonotonic(buckets)
+	forcedMonotonic, fixedPrecision := ensureMonotonicAndIgnoreSmallDeltas(buckets, smallDeltaTolerance)

 	if len(buckets) < 2 {
-		return math.NaN(), false
+		return math.NaN(), false, false
 	}
 	observations := buckets[len(buckets)-1].count
 	if observations == 0 {
-		return math.NaN(), false
+		return math.NaN(), false, false
 	}
 	rank := q * observations
 	b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })

 	if b == len(buckets)-1 {
-		return buckets[len(buckets)-2].upperBound, forcedMonotonic
+		return buckets[len(buckets)-2].upperBound, forcedMonotonic, fixedPrecision
 	}
 	if b == 0 && buckets[0].upperBound <= 0 {
-		return buckets[0].upperBound, forcedMonotonic
+		return buckets[0].upperBound, forcedMonotonic, fixedPrecision
 	}
 	var (
 		bucketStart float64
@ -126,7 +148,7 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) {
 		count -= buckets[b-1].count
 		rank -= buckets[b-1].count
 	}
-	return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic
+	return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic, fixedPrecision
 }

 // histogramQuantile calculates the quantile 'q' based on the given histogram.
@ -348,6 +370,7 @@ func coalesceBuckets(buckets buckets) buckets {
 //   - Ingestion via the remote write receiver that Prometheus implements.
 //   - Optimisation of query execution where precision is sacrificed for other
 //     benefits, not by Prometheus but by systems built on top of it.
+//   - Circumstances where floating point precision errors accumulate.
 //
 // Monotonicity is usually guaranteed because if a bucket with upper bound
 // u1 has count c1, then any bucket with a higher upper bound u > u1 must
@ -357,22 +380,42 @@ func coalesceBuckets(buckets buckets) buckets {
 // bucket with the φ-quantile count, so breaking the monotonicity
 // guarantee causes bucketQuantile() to return undefined (nonsense) results.
 //
-// As a somewhat hacky solution, we calculate the "envelope" of the histogram
-// buckets, essentially removing any decreases in the count between successive
-// buckets. We return a bool to indicate if this monotonicity was forced or not.
-func ensureMonotonic(buckets buckets) bool {
-	forced := false
-	max := buckets[0].count
+// As a somewhat hacky solution, we first silently ignore any numerically
+// insignificant (relative delta below the requested tolerance and likely to
+// be from floating point precision errors) differences between successive
+// buckets regardless of the direction. Then we calculate the "envelope" of
+// the histogram buckets, essentially removing any decreases in the count
+// between successive buckets.
+//
+// We return a bool to indicate if this monotonicity was forced or not, and
+// another bool to indicate if small deltas were ignored or not.
+func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bool, bool) {
+	var forcedMonotonic, fixedPrecision bool
+	prev := buckets[0].count
 	for i := 1; i < len(buckets); i++ {
-		switch {
-		case buckets[i].count > max:
-			max = buckets[i].count
-		case buckets[i].count < max:
-			buckets[i].count = max
-			forced = true
-		}
-	}
-	return forced
+		curr := buckets[i].count // Assumed always positive.
+		if curr == prev {
+			// No correction needed if the counts are identical between buckets.
+			continue
+		}
+		if almostEqual(prev, curr, tolerance) {
+			// Silently correct numerically insignificant differences from floating
+			// point precision errors, regardless of direction.
+			// Do not update the 'prev' value as we are ignoring the difference.
+			buckets[i].count = prev
+			fixedPrecision = true
+			continue
+		}
+		if curr < prev {
+			// Force monotonicity by removing any decreases regardless of magnitude.
+			// Do not update the 'prev' value as we are ignoring the decrease.
+			buckets[i].count = prev
+			forcedMonotonic = true
+			continue
+		}
+		prev = curr
+	}
+	return forcedMonotonic, fixedPrecision
 }

 // quantile calculates the given quantile of a vector of samples.
--- a/promql/quantile_test.go
+++ b/promql/quantile_test.go
@ -0,0 +1,318 @@
+// Copyright 2023 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package promql
+
+import (
+	"math"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestBucketQuantile_ForcedMonotonicity(t *testing.T) {
+	eps := 1e-12
+
+	for name, tc := range map[string]struct {
+		getInput       func() buckets // The buckets can be modified in-place so return a new one each time.
+		expectedForced bool
+		expectedFixed  bool
+		expectedValues map[float64]float64
+	}{
+		"simple - monotonic": {
+			getInput: func() buckets {
+				return buckets{
+					{
+						upperBound: 10,
+						count:      10,
+					}, {
+						upperBound: 15,
+						count:      15,
+					}, {
+						upperBound: 20,
+						count:      15,
+					}, {
+						upperBound: 30,
+						count:      15,
+					}, {
+						upperBound: math.Inf(1),
+						count:      15,
+					},
+				}
+			},
+			expectedForced: false,
+			expectedFixed:  false,
+			expectedValues: map[float64]float64{
+				1:    15.,
+				0.99: 14.85,
+				0.9:  13.5,
+				0.5:  7.5,
+			},
+		},
+		"simple - non-monotonic middle": {
+			getInput: func() buckets {
+				return buckets{
+					{
+						upperBound: 10,
+						count:      10,
+					}, {
+						upperBound: 15,
+						count:      15,
+					}, {
+						upperBound: 20,
+						count:      15.00000000001, // Simulate the case there's a small imprecision in float64.
+					}, {
+						upperBound: 30,
+						count:      15,
+					}, {
+						upperBound: math.Inf(1),
+						count:      15,
+					},
+				}
+			},
+			expectedForced: false,
+			expectedFixed:  true,
+			expectedValues: map[float64]float64{
+				1:    15.,
+				0.99: 14.85,
+				0.9:  13.5,
+				0.5:  7.5,
+			},
+		},
+		"real example - monotonic": {
+			getInput: func() buckets {
+				return buckets{
+					{
+						upperBound: 1,
+						count:      6454661.3014166197,
+					}, {
+						upperBound: 5,
+						count:      8339611.2001912938,
+					}, {
+						upperBound: 10,
+						count:      14118319.2444762159,
+					}, {
+						upperBound: 25,
+						count:      14130031.5272856522,
+					}, {
+						upperBound: 50,
+						count:      46001270.3030008152,
+					}, {
+						upperBound: 64,
+						count:      46008473.8585563600,
+					}, {
+						upperBound: 80,
+						count:      46008473.8585563600,
+					}, {
+						upperBound: 100,
+						count:      46008473.8585563600,
+					}, {
+						upperBound: 250,
+						count:      46008473.8585563600,
+					}, {
+						upperBound: 1000,
+						count:      46008473.8585563600,
+					}, {
+						upperBound: math.Inf(1),
+						count:      46008473.8585563600,
+					},
+				}
+			},
+			expectedForced: false,
+			expectedFixed:  false,
+			expectedValues: map[float64]float64{
+				1:    64.,
+				0.99: 49.64475715376406,
+				0.9:  46.39671690938454,
+				0.5:  31.96098248992002,
+			},
+		},
+		"real example - non-monotonic": {
+			getInput: func() buckets {
+				return buckets{
+					{
+						upperBound: 1,
+						count:      6454661.3014166225,
+					}, {
+						upperBound: 5,
+						count:      8339611.2001912957,
+					}, {
+						upperBound: 10,
+						count:      14118319.2444762159,
+					}, {
+						upperBound: 25,
+						count:      14130031.5272856504,
+					}, {
+						upperBound: 50,
+						count:      46001270.3030008227,
+					}, {
+						upperBound: 64,
+						count:      46008473.8585563824,
+					}, {
+						upperBound: 80,
+						count:      46008473.8585563898,
+					}, {
+						upperBound: 100,
+						count:      46008473.8585563824,
+					}, {
+						upperBound: 250,
+						count:      46008473.8585563824,
+					}, {
+						upperBound: 1000,
+						count:      46008473.8585563898,
+					}, {
+						upperBound: math.Inf(1),
+						count:      46008473.8585563824,
+					},
+				}
+			},
+			expectedForced: false,
+			expectedFixed:  true,
+			expectedValues: map[float64]float64{
+				1:    64.,
+				0.99: 49.64475715376406,
+				0.9:  46.39671690938454,
+				0.5:  31.96098248992002,
+			},
+		},
+		"real example 2 - monotonic": {
+			getInput: func() buckets {
+				return buckets{
+					{
+						upperBound: 0.005,
+						count:      9.6,
+					}, {
+						upperBound: 0.01,
+						count:      9.688888889,
+					}, {
+						upperBound: 0.025,
+						count:      9.755555556,
+					}, {
+						upperBound: 0.05,
+						count:      9.844444444,
+					}, {
+						upperBound: 0.1,
+						count:      9.888888889,
+					}, {
+						upperBound: 0.25,
+						count:      9.888888889,
+					}, {
+						upperBound: 0.5,
+						count:      9.888888889,
+					}, {
+						upperBound: 1,
+						count:      9.888888889,
+					}, {
+						upperBound: 2.5,
+						count:      9.888888889,
+					}, {
+						upperBound: 5,
+						count:      9.888888889,
+					}, {
+						upperBound: 10,
+						count:      9.888888889,
+					}, {
+						upperBound: 25,
+						count:      9.888888889,
+					}, {
+						upperBound: 50,
+						count:      9.888888889,
+					}, {
+						upperBound: 100,
+						count:      9.888888889,
+					}, {
+						upperBound: math.Inf(1),
+						count:      9.888888889,
+					},
+				}
+			},
+			expectedForced: false,
+			expectedFixed:  false,
+			expectedValues: map[float64]float64{
+				1:    0.1,
+				0.99: 0.03468750000281261,
+				0.9:  0.00463541666671875,
+				0.5:  0.0025752314815104174,
+			},
+		},
+		"real example 2 - non-monotonic": {
+			getInput: func() buckets {
+				return buckets{
+					{
+						upperBound: 0.005,
+						count:      9.6,
+					}, {
+						upperBound: 0.01,
+						count:      9.688888889,
+					}, {
+						upperBound: 0.025,
+						count:      9.755555556,
+					}, {
+						upperBound: 0.05,
+						count:      9.844444444,
+					}, {
+						upperBound: 0.1,
+						count:      9.888888889,
+					}, {
+						upperBound: 0.25,
+						count:      9.888888889,
+					}, {
+						upperBound: 0.5,
+						count:      9.888888889,
+					}, {
+						upperBound: 1,
+						count:      9.888888889,
+					}, {
+						upperBound: 2.5,
+						count:      9.888888889,
+					}, {
+						upperBound: 5,
+						count:      9.888888889,
+					}, {
+						upperBound: 10,
+						count:      9.888888889001, // Simulate the case there's a small imprecision in float64.
+					}, {
+						upperBound: 25,
+						count:      9.888888889,
+					}, {
+						upperBound: 50,
+						count:      9.888888888999, // Simulate the case there's a small imprecision in float64.
+					}, {
+						upperBound: 100,
+						count:      9.888888889,
+					}, {
+						upperBound: math.Inf(1),
+						count:      9.888888889,
+					},
+				}
+			},
+			expectedForced: false,
+			expectedFixed:  true,
+			expectedValues: map[float64]float64{
+				1:    0.1,
+				0.99: 0.03468750000281261,
+				0.9:  0.00463541666671875,
+				0.5:  0.0025752314815104174,
+			},
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			for q, v := range tc.expectedValues {
+				res, forced, fixed := bucketQuantile(q, tc.getInput())
+				require.Equal(t, tc.expectedForced, forced)
+				require.Equal(t, tc.expectedFixed, fixed)
+				require.InEpsilon(t, v, res, eps)
+			}
+		})
+	}
+}
--- a/promql/test.go
+++ b/promql/test.go
@ -49,7 +49,7 @@ var (
 )

 const (
-	epsilon = 0.000001 // Relative error allowed for sample values.
+	defaultEpsilon = 0.000001 // Relative error allowed for sample values.
 )

 var testStartTime = time.Unix(0, 0).UTC()
@ -440,7 +440,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error {
 			if (expH == nil) != (v.H == nil) || (expH != nil && !expH.Equals(v.H)) {
 				return fmt.Errorf("expected %v for %s but got %s", HistogramTestExpression(expH), v.Metric, HistogramTestExpression(v.H))
 			}
-			if !almostEqual(exp0.Value, v.F) {
+			if !almostEqual(exp0.Value, v.F, defaultEpsilon) {
 				return fmt.Errorf("expected %v for %s but got %v", exp0.Value, v.Metric, v.F)
 			}

@ -464,7 +464,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error {
 		if exp0.Histogram != nil {
 			return fmt.Errorf("expected Histogram %v but got scalar %s", exp0.Histogram.TestExpression(), val.String())
 		}
-		if !almostEqual(exp0.Value, val.V) {
+		if !almostEqual(exp0.Value, val.V, defaultEpsilon) {
 			return fmt.Errorf("expected Scalar %v but got %v", val.V, exp0.Value)
 		}

@ -663,9 +663,9 @@ func (t *test) clear() {
 	t.context, t.cancelCtx = context.WithCancel(context.Background())
 }

-// samplesAlmostEqual returns true if the two sample lines only differ by a
-// small relative error in their sample value.
-func almostEqual(a, b float64) bool {
+// almostEqual returns true if a and b differ by less than their sum
+// multiplied by epsilon.
+func almostEqual(a, b, epsilon float64) bool {
 	// NaN has no equality but for testing we still want to know whether both values
 	// are NaN.
 	if math.IsNaN(a) && math.IsNaN(b) {
@ -677,12 +677,13 @@ func almostEqual(a, b float64) bool {
 		return true
 	}

+	absSum := math.Abs(a) + math.Abs(b)
 	diff := math.Abs(a - b)

-	if a == 0 || b == 0 || diff < minNormal {
+	if a == 0 || b == 0 || absSum < minNormal {
 		return diff < epsilon*minNormal
 	}
-	return diff/(math.Abs(a)+math.Abs(b)) < epsilon
+	return diff/math.Min(absSum, math.MaxFloat64) < epsilon
 }

 func parseNumber(s string) (float64, error) {
--- a/util/annotations/annotations.go
+++ b/util/annotations/annotations.go
@ -108,7 +108,7 @@ var (
 	MixedClassicNativeHistogramsWarning = fmt.Errorf("%w: vector contains a mix of classic and native histograms for metric name", PromQLWarning)

 	PossibleNonCounterInfo                  = fmt.Errorf("%w: metric might not be a counter, name does not end in _total/_sum/_count/_bucket:", PromQLInfo)
-	HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (and may give inaccurate results) for metric name", PromQLInfo)
+	HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (see https://prometheus.io/docs/prometheus/latest/querying/functions/#histogram_quantile) for metric name", PromQLInfo)
 )

 type annoErr struct {