From 9f4cb06a379c45bb6ec12c241d0492a2f2699887 Mon Sep 17 00:00:00 2001
From: Dan Cech <dan@aussiedan.com>
Date: Sun, 26 Aug 2018 18:28:47 +0900
Subject: [PATCH] use Welford/Knuth method to compute standard deviation and
 variance (#4533)

* use the Welford/Knuth method to compute standard deviation and variance; this avoids the float precision issues of the previous sum-of-squares formula
* use an incrementally updated running mean for avg and avg_over_time instead of dividing an accumulated sum by the count

Signed-off-by: Dan Cech <dcech@grafana.com>
---
 promql/engine.go                 | 39 ++++++++++++++++----------------
 promql/functions.go              | 27 +++++++++++-----------
 promql/testdata/aggregators.test | 13 +++++++++++
 promql/testdata/functions.test   | 19 ++++++++++++++++
 4 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/promql/engine.go b/promql/engine.go
index f22b585df..721e0d255 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -1464,12 +1464,12 @@ func intersection(ls1, ls2 labels.Labels) labels.Labels {
 }
 
 type groupedAggregation struct {
-	labels           labels.Labels
-	value            float64
-	valuesSquaredSum float64
-	groupCount       int
-	heap             vectorByValueHeap
-	reverseHeap      vectorByReverseValueHeap
+	labels      labels.Labels
+	value       float64
+	mean        float64
+	groupCount  int
+	heap        vectorByValueHeap
+	reverseHeap vectorByReverseValueHeap
 }
 
 // aggregation evaluates an aggregation operation on a Vector.
@@ -1540,17 +1540,19 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
 				sort.Sort(m)
 			}
 			result[groupingKey] = &groupedAggregation{
-				labels:           m,
-				value:            s.V,
-				valuesSquaredSum: s.V * s.V,
-				groupCount:       1,
+				labels:     m,
+				value:      s.V,
+				mean:       s.V,
+				groupCount: 1,
 			}
 			inputVecLen := int64(len(vec))
 			resultSize := k
 			if k > inputVecLen {
 				resultSize = inputVecLen
 			}
-			if op == itemTopK || op == itemQuantile {
+			if op == itemStdvar || op == itemStddev {
+				result[groupingKey].value = 0.0
+			} else if op == itemTopK || op == itemQuantile {
 				result[groupingKey].heap = make(vectorByValueHeap, 0, resultSize)
 				heap.Push(&result[groupingKey].heap, &Sample{
 					Point:  Point{V: s.V},
@@ -1571,8 +1573,8 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
 			group.value += s.V
 
 		case itemAvg:
-			group.value += s.V
 			group.groupCount++
+			group.mean += (s.V - group.mean) / float64(group.groupCount)
 
 		case itemMax:
 			if group.value < s.V || math.IsNaN(group.value) {
@@ -1588,9 +1590,10 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
 			group.groupCount++
 
 		case itemStdvar, itemStddev:
-			group.value += s.V
-			group.valuesSquaredSum += s.V * s.V
 			group.groupCount++
+			delta := s.V - group.mean
+			group.mean += delta / float64(group.groupCount)
+			group.value += delta * (s.V - group.mean)
 
 		case itemTopK:
 			if int64(len(group.heap)) < k || group.heap[0].V < s.V || math.IsNaN(group.heap[0].V) {
@@ -1626,18 +1629,16 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
 	for _, aggr := range result {
 		switch op {
 		case itemAvg:
-			aggr.value = aggr.value / float64(aggr.groupCount)
+			aggr.value = aggr.mean
 
 		case itemCount, itemCountValues:
 			aggr.value = float64(aggr.groupCount)
 
 		case itemStdvar:
-			avg := aggr.value / float64(aggr.groupCount)
-			aggr.value = aggr.valuesSquaredSum/float64(aggr.groupCount) - avg*avg
+			aggr.value = aggr.value / float64(aggr.groupCount)
 
 		case itemStddev:
-			avg := aggr.value / float64(aggr.groupCount)
-			aggr.value = math.Sqrt(aggr.valuesSquaredSum/float64(aggr.groupCount) - avg*avg)
+			aggr.value = math.Sqrt(aggr.value / float64(aggr.groupCount))
 
 		case itemTopK:
 			// The heap keeps the lowest value on top, so reverse it.
diff --git a/promql/functions.go b/promql/functions.go
index c40c5eae4..46fa47397 100644
--- a/promql/functions.go
+++ b/promql/functions.go
@@ -371,11 +371,12 @@ func aggrOverTime(vals []Value, enh *EvalNodeHelper, aggrFn func([]Point) float6
 // === avg_over_time(Matrix ValueTypeMatrix) Vector ===
 func funcAvgOverTime(vals []Value, args Expressions, enh *EvalNodeHelper) Vector {
 	return aggrOverTime(vals, enh, func(values []Point) float64 {
-		var sum float64
+		var mean, count float64
 		for _, v := range values {
-			sum += v.V
+			count++
+			mean += (v.V - mean) / count
 		}
-		return sum / float64(len(values))
+		return mean
 	})
 }
 
@@ -444,28 +445,28 @@ func funcQuantileOverTime(vals []Value, args Expressions, enh *EvalNodeHelper) V
 // === stddev_over_time(Matrix ValueTypeMatrix) Vector ===
 func funcStddevOverTime(vals []Value, args Expressions, enh *EvalNodeHelper) Vector {
 	return aggrOverTime(vals, enh, func(values []Point) float64 {
-		var sum, squaredSum, count float64
+		var aux, count, mean float64
 		for _, v := range values {
-			sum += v.V
-			squaredSum += v.V * v.V
 			count++
+			delta := v.V - mean
+			mean += delta / count
+			aux += delta * (v.V - mean)
 		}
-		avg := sum / count
-		return math.Sqrt(squaredSum/count - avg*avg)
+		return math.Sqrt(aux / count)
 	})
 }
 
 // === stdvar_over_time(Matrix ValueTypeMatrix) Vector ===
 func funcStdvarOverTime(vals []Value, args Expressions, enh *EvalNodeHelper) Vector {
 	return aggrOverTime(vals, enh, func(values []Point) float64 {
-		var sum, squaredSum, count float64
+		var aux, count, mean float64
 		for _, v := range values {
-			sum += v.V
-			squaredSum += v.V * v.V
 			count++
+			delta := v.V - mean
+			mean += delta / count
+			aux += delta * (v.V - mean)
 		}
-		avg := sum / count
-		return squaredSum/count - avg*avg
+		return aux / count
 	})
 }
 
diff --git a/promql/testdata/aggregators.test b/promql/testdata/aggregators.test
index 0cbcd5590..8f6e04712 100644
--- a/promql/testdata/aggregators.test
+++ b/promql/testdata/aggregators.test
@@ -90,6 +90,19 @@ eval instant at 50m stdvar by (instance)(http_requests)
   {instance="0"} 50000
   {instance="1"} 50000
 
+# Float precision test for standard deviation and variance
+clear
+load 5m
+  http_requests{job="api-server", instance="0", group="production"} 0+1.33x10
+  http_requests{job="api-server", instance="1", group="production"} 0+1.33x10
+  http_requests{job="api-server", instance="0", group="canary"} 0+1.33x10
+
+eval instant at 50m stddev(http_requests)
+  {} 0.0
+
+eval instant at 50m stdvar(http_requests)
+  {} 0.0
+
 
 
 # Regression test for missing separator byte in labelsToGroupingKey.
diff --git a/promql/testdata/functions.test b/promql/testdata/functions.test
index 28700f2d2..70baae573 100644
--- a/promql/testdata/functions.test
+++ b/promql/testdata/functions.test
@@ -374,6 +374,14 @@ eval instant at 8000s holt_winters(http_requests[1m], 0.01, 0.1)
 	{job="api-server", instance="0", group="canary"} 24000
 	{job="api-server", instance="1", group="canary"} -32000
 
+# Tests for avg_over_time
+clear
+load 10s
+  metric 1 2 3 4 5
+
+eval instant at 1m avg_over_time(metric[1m])
+  {} 3
+
 # Tests for stddev_over_time and stdvar_over_time.
 clear
 load 10s
@@ -385,6 +393,17 @@ eval instant at 1m stdvar_over_time(metric[1m])
 eval instant at 1m stddev_over_time(metric[1m])
   {} 3.249615
 
+# Tests for stddev_over_time and stdvar_over_time #4927.
+clear
+load 10s
+  metric 1.5990505637277868 1.5990505637277868 1.5990505637277868
+
+eval instant at 1m stdvar_over_time(metric[1m])
+  {} 0
+
+eval instant at 1m stddev_over_time(metric[1m])
+  {} 0
+
 # Tests for quantile_over_time
 clear