[BUGFIX] PromQL: Fix behaviour of some aggregations with histograms (#15432)

promql: fix some aggregations for histograms

This PR fixes the behaviour of `topk`, `bottomk`, `limitk` and `limit_ratio` with histograms. The fixed behaviours are as follows:
- For `topk` and `bottomk`, histograms are ignored and info annotations are added.
- For `limitk` and `limit_ratio`, histograms are included in the results (if applicable).

Signed-off-by: Neeraj Gartia <neerajgartia211002@gmail.com>
pull/15473/head
Neeraj Gartia 2024-11-26 23:42:36 +05:30 committed by GitHub
parent 634afbc130
commit 38bb6ece25
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 161 additions and 46 deletions

View File

@ -3187,18 +3187,19 @@ func (ev *evaluator) aggregationK(e *parser.AggregateExpr, k int, r float64, inp
seriesLoop:
for si := range inputMatrix {
f, _, ok := ev.nextValues(enh.Ts, &inputMatrix[si])
f, h, ok := ev.nextValues(enh.Ts, &inputMatrix[si])
if !ok {
continue
}
s = Sample{Metric: inputMatrix[si].Metric, F: f, DropName: inputMatrix[si].DropName}
s = Sample{Metric: inputMatrix[si].Metric, F: f, H: h, DropName: inputMatrix[si].DropName}
group := &groups[seriesToResult[si]]
// Initialize this group if it's the first time we've seen it.
if !group.seen {
// LIMIT_RATIO is a special case, as we may not add this very sample to the heap,
// while we also don't know the final size of it.
if op == parser.LIMIT_RATIO {
switch op {
case parser.LIMIT_RATIO:
*group = groupedAggregation{
seen: true,
heap: make(vectorByValueHeap, 0),
@ -3206,12 +3207,34 @@ seriesLoop:
if ratiosampler.AddRatioSample(r, &s) {
heap.Push(&group.heap, &s)
}
} else {
case parser.LIMITK:
*group = groupedAggregation{
seen: true,
heap: make(vectorByValueHeap, 1, k),
}
group.heap[0] = s
case parser.TOPK:
*group = groupedAggregation{
seen: true,
heap: make(vectorByValueHeap, 0, k),
}
if s.H != nil {
group.seen = false
annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("topk", e.PosRange))
} else {
heap.Push(&group.heap, &s)
}
case parser.BOTTOMK:
*group = groupedAggregation{
seen: true,
heap: make(vectorByValueHeap, 0, k),
}
if s.H != nil {
group.seen = false
annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("bottomk", e.PosRange))
} else {
heap.Push(&group.heap, &s)
}
}
continue
}
@ -3220,6 +3243,9 @@ seriesLoop:
case parser.TOPK:
// We build a heap of up to k elements, with the smallest element at heap[0].
switch {
case s.H != nil:
// Ignore histogram sample and add info annotation.
annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("topk", e.PosRange))
case len(group.heap) < k:
heap.Push(&group.heap, &s)
case group.heap[0].F < s.F || (math.IsNaN(group.heap[0].F) && !math.IsNaN(s.F)):
@ -3233,6 +3259,9 @@ seriesLoop:
case parser.BOTTOMK:
// We build a heap of up to k elements, with the biggest element at heap[0].
switch {
case s.H != nil:
// Ignore histogram sample and add info annotation.
annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("bottomk", e.PosRange))
case len(group.heap) < k:
heap.Push((*vectorByReverseValueHeap)(&group.heap), &s)
case group.heap[0].F > s.F || (math.IsNaN(group.heap[0].F) && !math.IsNaN(s.F)):
@ -3275,10 +3304,14 @@ seriesLoop:
mat = make(Matrix, 0, len(groups))
}
add := func(lbls labels.Labels, f float64, dropName bool) {
add := func(lbls labels.Labels, f float64, h *histogram.FloatHistogram, dropName bool) {
// If this could be an instant query, add directly to the matrix so the result is in consistent order.
if ev.endTimestamp == ev.startTimestamp {
mat = append(mat, Series{Metric: lbls, Floats: []FPoint{{T: enh.Ts, F: f}}, DropName: dropName})
if h != nil {
mat = append(mat, Series{Metric: lbls, Histograms: []HPoint{{T: enh.Ts, H: h}}, DropName: dropName})
} else {
mat = append(mat, Series{Metric: lbls, Floats: []FPoint{{T: enh.Ts, F: f}}, DropName: dropName})
}
} else {
// Otherwise the results are added into seriess elements.
hash := lbls.Hash()
@ -3286,7 +3319,7 @@ seriesLoop:
if !ok {
ss = Series{Metric: lbls, DropName: dropName}
}
addToSeries(&ss, enh.Ts, f, nil, numSteps)
addToSeries(&ss, enh.Ts, f, h, numSteps)
seriess[hash] = ss
}
}
@ -3301,7 +3334,7 @@ seriesLoop:
sort.Sort(sort.Reverse(aggr.heap))
}
for _, v := range aggr.heap {
add(v.Metric, v.F, v.DropName)
add(v.Metric, v.F, v.H, v.DropName)
}
case parser.BOTTOMK:
@ -3310,12 +3343,12 @@ seriesLoop:
sort.Sort(sort.Reverse((*vectorByReverseValueHeap)(&aggr.heap)))
}
for _, v := range aggr.heap {
add(v.Metric, v.F, v.DropName)
add(v.Metric, v.F, v.H, v.DropName)
}
case parser.LIMITK, parser.LIMIT_RATIO:
for _, v := range aggr.heap {
add(v.Metric, v.F, v.DropName)
add(v.Metric, v.F, v.H, v.DropName)
}
}
}

View File

@ -272,6 +272,8 @@ load 5m
http_requests{job="app-server", instance="1", group="production"} 0+60x10
http_requests{job="app-server", instance="0", group="canary"} 0+70x10
http_requests{job="app-server", instance="1", group="canary"} 0+80x10
http_requests_histogram{job="app-server", instance="2", group="canary"} {{schema:0 sum:10 count:10}}x11
http_requests_histogram{job="api-server", instance="3", group="production"} {{schema:0 sum:20 count:20}}x11
foo 3+0x10
eval_ordered instant at 50m topk(3, http_requests)
@ -338,6 +340,47 @@ eval_ordered instant at 50m topk(scalar(foo), http_requests)
http_requests{group="canary", instance="0", job="app-server"} 700
http_requests{group="production", instance="1", job="app-server"} 600
# Tests for histogram: should ignore histograms.
eval_info instant at 50m topk(100, http_requests_histogram)
#empty
eval_info range from 0 to 50m step 5m topk(100, http_requests_histogram)
#empty
eval_info instant at 50m topk(1, {__name__=~"http_requests(_histogram)?"})
{__name__="http_requests", group="canary", instance="1", job="app-server"} 800
eval_info instant at 50m count(topk(1000, {__name__=~"http_requests(_histogram)?"}))
{} 9
eval_info range from 0 to 50m step 5m count(topk(1000, {__name__=~"http_requests(_histogram)?"}))
{} 9x10
eval_info instant at 50m topk by (instance) (1, {__name__=~"http_requests(_histogram)?"})
{__name__="http_requests", group="canary", instance="0", job="app-server"} 700
{__name__="http_requests", group="canary", instance="1", job="app-server"} 800
{__name__="http_requests", group="production", instance="2", job="api-server"} NaN
eval_info instant at 50m bottomk(100, http_requests_histogram)
#empty
eval_info range from 0 to 50m step 5m bottomk(100, http_requests_histogram)
#empty
eval_info instant at 50m bottomk(1, {__name__=~"http_requests(_histogram)?"})
{__name__="http_requests", group="production", instance="0", job="api-server"} 100
eval_info instant at 50m count(bottomk(1000, {__name__=~"http_requests(_histogram)?"}))
{} 9
eval_info range from 0 to 50m step 5m count(bottomk(1000, {__name__=~"http_requests(_histogram)?"}))
{} 9x10
eval_info instant at 50m bottomk by (instance) (1, {__name__=~"http_requests(_histogram)?"})
{__name__="http_requests", group="production", instance="0", job="api-server"} 100
{__name__="http_requests", group="production", instance="1", job="api-server"} 200
{__name__="http_requests", group="production", instance="2", job="api-server"} NaN
clear
# Tests for count_values.

View File

@ -9,6 +9,8 @@ load 5m
http_requests{job="api-server", instance="1", group="canary"} 0+40x10
http_requests{job="api-server", instance="2", group="canary"} 0+50x10
http_requests{job="api-server", instance="3", group="canary"} 0+60x10
http_requests{job="api-server", instance="histogram_1", group="canary"} {{schema:0 sum:10 count:10}}x11
http_requests{job="api-server", instance="histogram_2", group="canary"} {{schema:0 sum:20 count:20}}x11
eval instant at 50m count(limitk by (group) (0, http_requests))
# empty
@ -16,25 +18,56 @@ eval instant at 50m count(limitk by (group) (0, http_requests))
eval instant at 50m count(limitk by (group) (-1, http_requests))
# empty
# Exercise k==1 special case (as sample is added before the main series loop
# Exercise k==1 special case (as sample is added before the main series loop).
eval instant at 50m count(limitk by (group) (1, http_requests) and http_requests)
{} 2
{} 2
eval instant at 50m count(limitk by (group) (2, http_requests) and http_requests)
{} 4
{} 4
eval instant at 50m count(limitk(100, http_requests) and http_requests)
{} 6
{} 8
# Exercise k==1 special case (as sample is added before the main series loop
# Exercise k==1 special case (as sample is added before the main series loop).
eval instant at 50m count(limitk by (group) (1, http_requests) and http_requests)
{} 2
{} 2
eval instant at 50m count(limitk by (group) (2, http_requests) and http_requests)
{} 4
{} 4
eval instant at 50m count(limitk(100, http_requests) and http_requests)
{} 6
{} 8
# Test for histograms.
# k==1: verify that histogram is included in the result.
eval instant at 50m limitk(1, http_requests{instance="histogram_1"})
{__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}
eval range from 0 to 50m step 5m limitk(1, http_requests{instance="histogram_1"})
{__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}x10
# Histogram is included with mix of floats as well.
eval instant at 50m limitk(8, http_requests{instance=~"(histogram_2|0)"})
{__name__="http_requests", group="canary", instance="histogram_2", job="api-server"} {{count:20 sum:20}}
{__name__="http_requests", group="production", instance="0", job="api-server"} 100
{__name__="http_requests", group="canary", instance="0", job="api-server"} 300
eval range from 0 to 50m step 5m limitk(8, http_requests{instance=~"(histogram_2|0)"})
{__name__="http_requests", group="canary", instance="histogram_2", job="api-server"} {{count:20 sum:20}}x10
{__name__="http_requests", group="production", instance="0", job="api-server"} 0+10x10
{__name__="http_requests", group="canary", instance="0", job="api-server"} 0+30x10
eval instant at 50m count(limitk(2, http_requests{instance=~"histogram_[0-9]"}))
{} 2
eval range from 0 to 50m step 5m count(limitk(2, http_requests{instance=~"histogram_[0-9]"}))
{} 2+0x10
eval instant at 50m count(limitk(1000, http_requests{instance=~"histogram_[0-9]"}))
{} 2
eval range from 0 to 50m step 5m count(limitk(1000, http_requests{instance=~"histogram_[0-9]"}))
{} 2+0x10
# limit_ratio
eval range from 0 to 50m step 5m count(limit_ratio(0.0, http_requests))
@ -42,9 +75,9 @@ eval range from 0 to 50m step 5m count(limit_ratio(0.0, http_requests))
# limitk(2, ...) should always return a 2-count subset of the timeseries (hence the AND'ing)
eval range from 0 to 50m step 5m count(limitk(2, http_requests) and http_requests)
{} 2+0x10
{} 2+0x10
# Tests for limit_ratio
# Tests for limit_ratio.
#
# NB: below 0.5 ratio will depend on some hashing "luck" (also there's no guarantee that
# an integer comes from: total number of series * ratio), as it depends on:
@ -56,50 +89,50 @@ eval range from 0 to 50m step 5m count(limitk(2, http_requests) and http_request
#
# See `AddRatioSample()` in promql/engine.go for more details.
# Half~ish samples: verify we get "near" 3 (of 0.5 * 6)
eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) <= bool (3+1)
{} 1+0x10
# Half~ish samples: verify we get "near" 3 (of 0.5 * 6).
eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) <= bool (4+1)
{} 1+0x10
eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) >= bool (3-1)
{} 1+0x10
eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) >= bool (4-1)
{} 1+0x10
# All samples
# All samples.
eval range from 0 to 50m step 5m count(limit_ratio(1.0, http_requests) and http_requests)
{} 6+0x10
{} 8+0x10
# All samples
# All samples.
eval range from 0 to 50m step 5m count(limit_ratio(-1.0, http_requests) and http_requests)
{} 6+0x10
{} 8+0x10
# Capped to 1.0 -> all samples
# Capped to 1.0 -> all samples.
eval_warn range from 0 to 50m step 5m count(limit_ratio(1.1, http_requests) and http_requests)
{} 6+0x10
{} 8+0x10
# Capped to -1.0 -> all samples
# Capped to -1.0 -> all samples.
eval_warn range from 0 to 50m step 5m count(limit_ratio(-1.1, http_requests) and http_requests)
{} 6+0x10
{} 8+0x10
# Verify that limit_ratio(value) and limit_ratio(1.0-value) return the "complement" of each other
# Complement below for [0.2, -0.8]
# Verify that limit_ratio(value) and limit_ratio(1.0-value) return the "complement" of each other.
# Complement below for [0.2, -0.8].
#
# Complement 1of2: `or` should return all samples
# Complement 1of2: `or` should return all samples.
eval range from 0 to 50m step 5m count(limit_ratio(0.2, http_requests) or limit_ratio(-0.8, http_requests))
{} 6+0x10
{} 8+0x10
# Complement 2of2: `and` should return no samples
# Complement 2of2: `and` should return no samples.
eval range from 0 to 50m step 5m count(limit_ratio(0.2, http_requests) and limit_ratio(-0.8, http_requests))
# empty
# Complement below for [0.5, -0.5]
# Complement below for [0.5, -0.5].
eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) or limit_ratio(-0.5, http_requests))
{} 6+0x10
{} 8+0x10
eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and limit_ratio(-0.5, http_requests))
# empty
# Complement below for [0.8, -0.2]
# Complement below for [0.8, -0.2].
eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) or limit_ratio(-0.2, http_requests))
{} 6+0x10
{} 8+0x10
eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) and limit_ratio(-0.2, http_requests))
# empty
@ -107,13 +140,19 @@ eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) and limit
# Complement below for [some_ratio, 1.0 - some_ratio], some_ratio derived from time(),
# using a small prime number to avoid rounded ratio values, and a small set of them.
eval range from 0 to 50m step 5m count(limit_ratio(time() % 17/17, http_requests) or limit_ratio(1.0 - (time() % 17/17), http_requests))
{} 6+0x10
{} 8+0x10
eval range from 0 to 50m step 5m count(limit_ratio(time() % 17/17, http_requests) and limit_ratio(1.0 - (time() % 17/17), http_requests))
# empty
# Poor man's normality check: ok (loaded samples follow a nice linearity over labels and time)
# The check giving: 1 (i.e. true)
eval range from 0 to 50m step 5m abs(avg(limit_ratio(0.5, http_requests)) - avg(limit_ratio(-0.5, http_requests))) <= bool stddev(http_requests)
# Poor man's normality check: ok (loaded samples follow a nice linearity over labels and time).
# The check giving: 1 (i.e. true).
eval range from 0 to 50m step 5m abs(avg(limit_ratio(0.5, http_requests{instance!~"histogram_[0-9]"})) - avg(limit_ratio(-0.5, http_requests{instance!~"histogram_[0-9]"}))) <= bool stddev(http_requests{instance!~"histogram_[0-9]"})
{} 1+0x10
# All specified histograms are included for r=1.
eval instant at 50m limit_ratio(1, http_requests{instance="histogram_1"})
{__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}
eval range from 0 to 50m step 5m limit_ratio(1, http_requests{instance="histogram_1"})
{__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}x10