diff --git a/docs/querying/functions.md b/docs/querying/functions.md index 6838bb72b..00afa1d22 100644 --- a/docs/querying/functions.md +++ b/docs/querying/functions.md @@ -323,6 +323,24 @@ a histogram. You can use `histogram_quantile(1, v instant-vector)` to get the estimated maximum value stored in a histogram. +Buckets of classic histograms are cumulative. Therefore, the following should always be the case: + +- The counts in the buckets are monotonically increasing (strictly non-decreasing). +- A lack of observations between the upper limits of two consecutive buckets results in equal counts +in those two buckets. + +However, floating point precision issues (e.g. small discrepancies introduced by computing of buckets +with `sum(rate(...))`) or invalid data might violate these assumptions. In that case, +`histogram_quantile` would be unable to return meaningful results. To mitigate the issue, +`histogram_quantile` assumes that tiny relative differences between consecutive buckets are happening +because of floating point precision errors and ignores them. (The threshold to ignore a difference +between two buckets is a trillionth (1e-12) of the sum of both buckets.) Furthermore, if there are +non-monotonic bucket counts even after this adjustment, they are increased to the value of the +previous buckets to enforce monotonicity. The latter is evidence for an actual issue with the input +data and is therefore flagged with an informational annotation reading `input to histogram_quantile +needed to be fixed for monotonicity`. If you encounter this annotation, you should find and remove +the source of the invalid data. + ## `histogram_stddev()` and `histogram_stdvar()` _Both functions only act on native histograms, which are an experimental diff --git a/promql/engine_test.go b/promql/engine_test.go index 7532a3294..d6a10455a 100644 --- a/promql/engine_test.go +++ b/promql/engine_test.go @@ -3699,7 +3699,7 @@ func TestNativeHistogram_HistogramQuantile(t *testing.T) { require.Len(t, vector, 1) require.Nil(t, vector[0].H) - require.True(t, almostEqual(sc.value, vector[0].F)) + require.True(t, almostEqual(sc.value, vector[0].F, defaultEpsilon)) }) } idx++ diff --git a/promql/functions.go b/promql/functions.go index e9a03406a..07e439cf2 100644 --- a/promql/functions.go +++ b/promql/functions.go @@ -1163,7 +1163,7 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev for _, mb := range enh.signatureToMetricWithBuckets { if len(mb.buckets) > 0 { - res, forcedMonotonicity := bucketQuantile(q, mb.buckets) + res, forcedMonotonicity, _ := bucketQuantile(q, mb.buckets) enh.Out = append(enh.Out, Sample{ Metric: mb.metric, F: res, diff --git a/promql/quantile.go b/promql/quantile.go index f28944868..f62519f5b 100644 --- a/promql/quantile.go +++ b/promql/quantile.go @@ -23,6 +23,25 @@ import ( "github.com/prometheus/prometheus/model/labels" ) +// smallDeltaTolerance is the threshold for relative deltas between classic +// histogram buckets that will be ignored by the histogram_quantile function +// because they are most likely artifacts of floating point precision issues. +// Testing on 2 sets of real data with bugs arising from small deltas, +// the safe ranges were from: +// - 1e-05 to 1e-15 +// - 1e-06 to 1e-15 +// Anything to the left of that would cause non-query-sharded data to have +// small deltas ignored (unnecessary and we should avoid this), and anything +// to the right of that would cause query-sharded data to not have its small +// deltas ignored (so the problem won't be fixed). +// For context, query sharding triggers these float precision errors in Mimir. +// To illustrate, with a relative deviation of 1e-12, we need to have 1e12 +// observations in the bucket so that the change of one observation is small +// enough to get ignored. With the usual observation rate even of very busy +// services, this will hardly be reached in timeframes that matters for +// monitoring. +const smallDeltaTolerance = 1e-12 + // Helpers to calculate quantiles. // excludedLabels are the labels to exclude from signature calculation for @@ -72,16 +91,19 @@ type metricWithBuckets struct { // // If q>1, +Inf is returned. // -// We also return a bool to indicate if monotonicity needed to be forced. -func bucketQuantile(q float64, buckets buckets) (float64, bool) { +// We also return a bool to indicate if monotonicity needed to be forced, +// and another bool to indicate if small differences between buckets (that +// are likely artifacts of floating point precision issues) have been +// ignored. +func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) { if math.IsNaN(q) { - return math.NaN(), false + return math.NaN(), false, false } if q < 0 { - return math.Inf(-1), false + return math.Inf(-1), false, false } if q > 1 { - return math.Inf(+1), false + return math.Inf(+1), false, false } slices.SortFunc(buckets, func(a, b bucket) int { // We don't expect the bucket boundary to be a NaN. @@ -94,27 +116,27 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) { return 0 }) if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) { - return math.NaN(), false + return math.NaN(), false, false } buckets = coalesceBuckets(buckets) - forcedMonotonic := ensureMonotonic(buckets) + forcedMonotonic, fixedPrecision := ensureMonotonicAndIgnoreSmallDeltas(buckets, smallDeltaTolerance) if len(buckets) < 2 { - return math.NaN(), false + return math.NaN(), false, false } observations := buckets[len(buckets)-1].count if observations == 0 { - return math.NaN(), false + return math.NaN(), false, false } rank := q * observations b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank }) if b == len(buckets)-1 { - return buckets[len(buckets)-2].upperBound, forcedMonotonic + return buckets[len(buckets)-2].upperBound, forcedMonotonic, fixedPrecision } if b == 0 && buckets[0].upperBound <= 0 { - return buckets[0].upperBound, forcedMonotonic + return buckets[0].upperBound, forcedMonotonic, fixedPrecision } var ( bucketStart float64 @@ -126,7 +148,7 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) { count -= buckets[b-1].count rank -= buckets[b-1].count } - return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic + return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic, fixedPrecision } // histogramQuantile calculates the quantile 'q' based on the given histogram. @@ -348,6 +370,7 @@ func coalesceBuckets(buckets buckets) buckets { // - Ingestion via the remote write receiver that Prometheus implements. // - Optimisation of query execution where precision is sacrificed for other // benefits, not by Prometheus but by systems built on top of it. +// - Circumstances where floating point precision errors accumulate. // // Monotonicity is usually guaranteed because if a bucket with upper bound // u1 has count c1, then any bucket with a higher upper bound u > u1 must @@ -357,22 +380,42 @@ func coalesceBuckets(buckets buckets) buckets { // bucket with the φ-quantile count, so breaking the monotonicity // guarantee causes bucketQuantile() to return undefined (nonsense) results. // -// As a somewhat hacky solution, we calculate the "envelope" of the histogram -// buckets, essentially removing any decreases in the count between successive -// buckets. We return a bool to indicate if this monotonicity was forced or not. -func ensureMonotonic(buckets buckets) bool { - forced := false - max := buckets[0].count +// As a somewhat hacky solution, we first silently ignore any numerically +// insignificant (relative delta below the requested tolerance and likely to +// be from floating point precision errors) differences between successive +// buckets regardless of the direction. Then we calculate the "envelope" of +// the histogram buckets, essentially removing any decreases in the count +// between successive buckets. +// +// We return a bool to indicate if this monotonicity was forced or not, and +// another bool to indicate if small deltas were ignored or not. +func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bool, bool) { + var forcedMonotonic, fixedPrecision bool + prev := buckets[0].count for i := 1; i < len(buckets); i++ { - switch { - case buckets[i].count > max: - max = buckets[i].count - case buckets[i].count < max: - buckets[i].count = max - forced = true + curr := buckets[i].count // Assumed always positive. + if curr == prev { + // No correction needed if the counts are identical between buckets. + continue + } + if almostEqual(prev, curr, tolerance) { + // Silently correct numerically insignificant differences from floating + // point precision errors, regardless of direction. + // Do not update the 'prev' value as we are ignoring the difference. + buckets[i].count = prev + fixedPrecision = true + continue + } + if curr < prev { + // Force monotonicity by removing any decreases regardless of magnitude. + // Do not update the 'prev' value as we are ignoring the decrease. + buckets[i].count = prev + forcedMonotonic = true + continue } + prev = curr } - return forced + return forcedMonotonic, fixedPrecision } // quantile calculates the given quantile of a vector of samples. diff --git a/promql/quantile_test.go b/promql/quantile_test.go new file mode 100644 index 000000000..84f4c0b06 --- /dev/null +++ b/promql/quantile_test.go @@ -0,0 +1,318 @@ +// Copyright 2023 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package promql + +import ( + "math" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestBucketQuantile_ForcedMonotonicity(t *testing.T) { + eps := 1e-12 + + for name, tc := range map[string]struct { + getInput func() buckets // The buckets can be modified in-place so return a new one each time. + expectedForced bool + expectedFixed bool + expectedValues map[float64]float64 + }{ + "simple - monotonic": { + getInput: func() buckets { + return buckets{ + { + upperBound: 10, + count: 10, + }, { + upperBound: 15, + count: 15, + }, { + upperBound: 20, + count: 15, + }, { + upperBound: 30, + count: 15, + }, { + upperBound: math.Inf(1), + count: 15, + }, + } + }, + expectedForced: false, + expectedFixed: false, + expectedValues: map[float64]float64{ + 1: 15., + 0.99: 14.85, + 0.9: 13.5, + 0.5: 7.5, + }, + }, + "simple - non-monotonic middle": { + getInput: func() buckets { + return buckets{ + { + upperBound: 10, + count: 10, + }, { + upperBound: 15, + count: 15, + }, { + upperBound: 20, + count: 15.00000000001, // Simulate the case there's a small imprecision in float64. + }, { + upperBound: 30, + count: 15, + }, { + upperBound: math.Inf(1), + count: 15, + }, + } + }, + expectedForced: false, + expectedFixed: true, + expectedValues: map[float64]float64{ + 1: 15., + 0.99: 14.85, + 0.9: 13.5, + 0.5: 7.5, + }, + }, + "real example - monotonic": { + getInput: func() buckets { + return buckets{ + { + upperBound: 1, + count: 6454661.3014166197, + }, { + upperBound: 5, + count: 8339611.2001912938, + }, { + upperBound: 10, + count: 14118319.2444762159, + }, { + upperBound: 25, + count: 14130031.5272856522, + }, { + upperBound: 50, + count: 46001270.3030008152, + }, { + upperBound: 64, + count: 46008473.8585563600, + }, { + upperBound: 80, + count: 46008473.8585563600, + }, { + upperBound: 100, + count: 46008473.8585563600, + }, { + upperBound: 250, + count: 46008473.8585563600, + }, { + upperBound: 1000, + count: 46008473.8585563600, + }, { + upperBound: math.Inf(1), + count: 46008473.8585563600, + }, + } + }, + expectedForced: false, + expectedFixed: false, + expectedValues: map[float64]float64{ + 1: 64., + 0.99: 49.64475715376406, + 0.9: 46.39671690938454, + 0.5: 31.96098248992002, + }, + }, + "real example - non-monotonic": { + getInput: func() buckets { + return buckets{ + { + upperBound: 1, + count: 6454661.3014166225, + }, { + upperBound: 5, + count: 8339611.2001912957, + }, { + upperBound: 10, + count: 14118319.2444762159, + }, { + upperBound: 25, + count: 14130031.5272856504, + }, { + upperBound: 50, + count: 46001270.3030008227, + }, { + upperBound: 64, + count: 46008473.8585563824, + }, { + upperBound: 80, + count: 46008473.8585563898, + }, { + upperBound: 100, + count: 46008473.8585563824, + }, { + upperBound: 250, + count: 46008473.8585563824, + }, { + upperBound: 1000, + count: 46008473.8585563898, + }, { + upperBound: math.Inf(1), + count: 46008473.8585563824, + }, + } + }, + expectedForced: false, + expectedFixed: true, + expectedValues: map[float64]float64{ + 1: 64., + 0.99: 49.64475715376406, + 0.9: 46.39671690938454, + 0.5: 31.96098248992002, + }, + }, + "real example 2 - monotonic": { + getInput: func() buckets { + return buckets{ + { + upperBound: 0.005, + count: 9.6, + }, { + upperBound: 0.01, + count: 9.688888889, + }, { + upperBound: 0.025, + count: 9.755555556, + }, { + upperBound: 0.05, + count: 9.844444444, + }, { + upperBound: 0.1, + count: 9.888888889, + }, { + upperBound: 0.25, + count: 9.888888889, + }, { + upperBound: 0.5, + count: 9.888888889, + }, { + upperBound: 1, + count: 9.888888889, + }, { + upperBound: 2.5, + count: 9.888888889, + }, { + upperBound: 5, + count: 9.888888889, + }, { + upperBound: 10, + count: 9.888888889, + }, { + upperBound: 25, + count: 9.888888889, + }, { + upperBound: 50, + count: 9.888888889, + }, { + upperBound: 100, + count: 9.888888889, + }, { + upperBound: math.Inf(1), + count: 9.888888889, + }, + } + }, + expectedForced: false, + expectedFixed: false, + expectedValues: map[float64]float64{ + 1: 0.1, + 0.99: 0.03468750000281261, + 0.9: 0.00463541666671875, + 0.5: 0.0025752314815104174, + }, + }, + "real example 2 - non-monotonic": { + getInput: func() buckets { + return buckets{ + { + upperBound: 0.005, + count: 9.6, + }, { + upperBound: 0.01, + count: 9.688888889, + }, { + upperBound: 0.025, + count: 9.755555556, + }, { + upperBound: 0.05, + count: 9.844444444, + }, { + upperBound: 0.1, + count: 9.888888889, + }, { + upperBound: 0.25, + count: 9.888888889, + }, { + upperBound: 0.5, + count: 9.888888889, + }, { + upperBound: 1, + count: 9.888888889, + }, { + upperBound: 2.5, + count: 9.888888889, + }, { + upperBound: 5, + count: 9.888888889, + }, { + upperBound: 10, + count: 9.888888889001, // Simulate the case there's a small imprecision in float64. + }, { + upperBound: 25, + count: 9.888888889, + }, { + upperBound: 50, + count: 9.888888888999, // Simulate the case there's a small imprecision in float64. + }, { + upperBound: 100, + count: 9.888888889, + }, { + upperBound: math.Inf(1), + count: 9.888888889, + }, + } + }, + expectedForced: false, + expectedFixed: true, + expectedValues: map[float64]float64{ + 1: 0.1, + 0.99: 0.03468750000281261, + 0.9: 0.00463541666671875, + 0.5: 0.0025752314815104174, + }, + }, + } { + t.Run(name, func(t *testing.T) { + for q, v := range tc.expectedValues { + res, forced, fixed := bucketQuantile(q, tc.getInput()) + require.Equal(t, tc.expectedForced, forced) + require.Equal(t, tc.expectedFixed, fixed) + require.InEpsilon(t, v, res, eps) + } + }) + } +} diff --git a/promql/test.go b/promql/test.go index f6a31ee43..7274a8f0d 100644 --- a/promql/test.go +++ b/promql/test.go @@ -49,7 +49,7 @@ var ( ) const ( - epsilon = 0.000001 // Relative error allowed for sample values. + defaultEpsilon = 0.000001 // Relative error allowed for sample values. ) var testStartTime = time.Unix(0, 0).UTC() @@ -440,7 +440,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error { if (expH == nil) != (v.H == nil) || (expH != nil && !expH.Equals(v.H)) { return fmt.Errorf("expected %v for %s but got %s", HistogramTestExpression(expH), v.Metric, HistogramTestExpression(v.H)) } - if !almostEqual(exp0.Value, v.F) { + if !almostEqual(exp0.Value, v.F, defaultEpsilon) { return fmt.Errorf("expected %v for %s but got %v", exp0.Value, v.Metric, v.F) } @@ -464,7 +464,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error { if exp0.Histogram != nil { return fmt.Errorf("expected Histogram %v but got scalar %s", exp0.Histogram.TestExpression(), val.String()) } - if !almostEqual(exp0.Value, val.V) { + if !almostEqual(exp0.Value, val.V, defaultEpsilon) { return fmt.Errorf("expected Scalar %v but got %v", val.V, exp0.Value) } @@ -663,9 +663,9 @@ func (t *test) clear() { t.context, t.cancelCtx = context.WithCancel(context.Background()) } -// samplesAlmostEqual returns true if the two sample lines only differ by a -// small relative error in their sample value. -func almostEqual(a, b float64) bool { +// almostEqual returns true if a and b differ by less than their sum +// multiplied by epsilon. +func almostEqual(a, b, epsilon float64) bool { // NaN has no equality but for testing we still want to know whether both values // are NaN. if math.IsNaN(a) && math.IsNaN(b) { @@ -677,12 +677,13 @@ func almostEqual(a, b float64) bool { return true } + absSum := math.Abs(a) + math.Abs(b) diff := math.Abs(a - b) - if a == 0 || b == 0 || diff < minNormal { + if a == 0 || b == 0 || absSum < minNormal { return diff < epsilon*minNormal } - return diff/(math.Abs(a)+math.Abs(b)) < epsilon + return diff/math.Min(absSum, math.MaxFloat64) < epsilon } func parseNumber(s string) (float64, error) { diff --git a/util/annotations/annotations.go b/util/annotations/annotations.go index fa4983fc9..2041d0217 100644 --- a/util/annotations/annotations.go +++ b/util/annotations/annotations.go @@ -108,7 +108,7 @@ var ( MixedClassicNativeHistogramsWarning = fmt.Errorf("%w: vector contains a mix of classic and native histograms for metric name", PromQLWarning) PossibleNonCounterInfo = fmt.Errorf("%w: metric might not be a counter, name does not end in _total/_sum/_count/_bucket:", PromQLInfo) - HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (and may give inaccurate results) for metric name", PromQLInfo) + HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (see https://prometheus.io/docs/prometheus/latest/querying/functions/#histogram_quantile) for metric name", PromQLInfo) ) type annoErr struct {