Fix OOM when a large K is used in topk queries (#4087)

This attempts to close #3973.

Handles cases where the length of the input vector to an aggregate topk
/ bottomk function is less than the K parameter. The change updates
Prometheus to allocate a result vector the same length as the input
vector in these cases.

Previously Prometheus would out-of-memory panic for large K values. This
change makes that unlikely unless the size of the input vector is
equally large.

Signed-off-by: David King <dave@davbo.org>
pull/4091/head
David King 2018-04-16 09:03:04 +01:00 committed by Brian Brazil
parent e7584ee345
commit 6286c10df0
2 changed files with 17 additions and 2 deletions

View File

@ -1333,14 +1333,19 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
valuesSquaredSum: s.V * s.V,
groupCount: 1,
}
input_vec_len := int64(len(vec))
result_size := k
if k > input_vec_len {
result_size = input_vec_len
}
if op == itemTopK || op == itemQuantile {
result[groupingKey].heap = make(vectorByValueHeap, 0, k)
result[groupingKey].heap = make(vectorByValueHeap, 0, result_size)
heap.Push(&result[groupingKey].heap, &Sample{
Point: Point{V: s.V},
Metric: s.Metric,
})
} else if op == itemBottomK {
result[groupingKey].reverseHeap = make(vectorByReverseValueHeap, 0, k)
result[groupingKey].reverseHeap = make(vectorByReverseValueHeap, 0, result_size)
heap.Push(&result[groupingKey].reverseHeap, &Sample{
Point: Point{V: s.V},
Metric: s.Metric,

View File

@ -184,6 +184,16 @@ eval_ordered instant at 50m bottomk(3, http_requests{job="api-server",group="pro
http_requests{job="api-server", instance="1", group="production"} 200
http_requests{job="api-server", instance="2", group="production"} NaN
# Test that topk and bottomk allocate min(k, len(input_vector)) for the results vector
eval_ordered instant at 50m bottomk(9999999999, http_requests{job="app-server",group="canary"})
http_requests{group="canary", instance="0", job="app-server"} 700
http_requests{group="canary", instance="1", job="app-server"} 800
eval_ordered instant at 50m topk(9999999999, http_requests{job="api-server",group="production"})
http_requests{job="api-server", instance="1", group="production"} 200
http_requests{job="api-server", instance="0", group="production"} 100
http_requests{job="api-server", instance="2", group="production"} NaN
clear
# Tests for count_values.