Improve predict_linear

Fixes https://github.com/prometheus/prometheus/issues/1401 This remove the last (and in fact bogus) use of BoundaryValues. Thus, a whole lot of unused (and arguably sub-optimal / ugly) code can be removed here, too.
2016-02-24 17:16:24 +01:00 · 2016-02-24 17:16:24 +01:00 · c740789ce3
parent 4b503ed9a5
commit c740789ce3
8 changed files with 66 additions and 196 deletions
--- a/promql/engine.go
+++ b/promql/engine.go
@ -575,15 +575,6 @@ func (ev *evaluator) evalMatrix(e Expr) matrix {
 	return mat
 }
 // evalMatrixBounds attempts to evaluate e to matrix boundaries and errors otherwise.
 func (ev *evaluator) evalMatrixBounds(e Expr) matrix {
 	ms, ok := e.(*MatrixSelector)
 	if !ok {
 		ev.errorf("matrix bounds can only be evaluated for matrix selectors, got %T", e)
 	}
 	return ev.matrixSelectorBounds(ms)
 }
 // evalString attempts to evaluate e to a string value and errors otherwise.
 func (ev *evaluator) evalString(e Expr) *model.String {
 	val := ev.eval(e)
@ -731,29 +722,6 @@ func (ev *evaluator) matrixSelector(node *MatrixSelector) matrix {
 	return matrix(sampleStreams)
 }
 // matrixSelectorBounds evaluates the boundaries of a *MatrixSelector.
 func (ev *evaluator) matrixSelectorBounds(node *MatrixSelector) matrix {
 	interval := metric.Interval{
 		OldestInclusive: ev.Timestamp.Add(-node.Range - node.Offset),
 		NewestInclusive: ev.Timestamp.Add(-node.Offset),
 	}
 	sampleStreams := make([]*sampleStream, 0, len(node.iterators))
 	for fp, it := range node.iterators {
 		samplePairs := it.BoundaryValues(interval)
 		if len(samplePairs) == 0 {
 			continue
 		}
 		ss := &sampleStream{
 			Metric: node.metrics[fp],
 			Values: samplePairs,
 		}
 		sampleStreams = append(sampleStreams, ss)
 	}
 	return matrix(sampleStreams)
 }
 func (ev *evaluator) vectorAnd(lhs, rhs vector, matching *VectorMatching) vector {
 	if matching.Card != CardManyToMany {
 		panic("logical operations must always be many-to-many matching")
--- a/promql/functions.go
+++ b/promql/functions.go
@ -524,10 +524,37 @@ func funcLog10(ev *evaluator, args Expressions) model.Value {
 	return vector
 }
 // linearRegression performs a least-square linear regression analysis on the
 // provided SamplePairs. It returns the slope, and the intercept value at the
 // provided time.
 func linearRegression(samples []model.SamplePair, interceptTime model.Time) (slope, intercept model.SampleValue) {
 	var (
 		n            model.SampleValue
 		sumX, sumY   model.SampleValue
 		sumXY, sumX2 model.SampleValue
 	)
 	for _, sample := range samples {
 		x := model.SampleValue(
 			model.Time(sample.Timestamp-interceptTime).UnixNano(),
 		) / 1e9
 		n += 1.0
 		sumY += sample.Value
 		sumX += x
 		sumXY += x * sample.Value
 		sumX2 += x * x
 	}
 	covXY := sumXY - sumX*sumY/n
 	varX := sumX2 - sumX*sumX/n
 	slope = covXY / varX
 	intercept = sumY/n - slope*sumX/n
 	return
 }
 // === deriv(node model.ValMatrix) Vector ===
 func funcDeriv(ev *evaluator, args Expressions) model.Value {
 	resultVector := vector{}
 	mat := ev.evalMatrix(args[0])
 	resultVector := make(vector, 0, len(mat))
 	for _, samples := range mat {
 		// No sense in trying to compute a derivative without at least two points.
@ -535,29 +562,10 @@ func funcDeriv(ev *evaluator, args Expressions) model.Value {
 		if len(samples.Values) < 2 {
 			continue
 		}
-
+		slope, _ := linearRegression(samples.Values, 0)
 		// Least squares.
 		var (
 			n            model.SampleValue
 			sumX, sumY   model.SampleValue
 			sumXY, sumX2 model.SampleValue
 		)
 		for _, sample := range samples.Values {
 			x := model.SampleValue(sample.Timestamp.UnixNano() / 1e9)
 			n += 1.0
 			sumY += sample.Value
 			sumX += x
 			sumXY += x * sample.Value
 			sumX2 += x * x
 		}
 		numerator := sumXY - sumX*sumY/n
 		denominator := sumX2 - (sumX*sumX)/n
 		resultValue := numerator / denominator
 		resultSample := &sample{
 			Metric:    samples.Metric,
-			Value:     resultValue,
+			Value:     slope,
 			Timestamp: ev.Timestamp,
 		}
 		resultSample.Metric.Del(model.MetricNameLabel)
@ -568,44 +576,26 @@ func funcDeriv(ev *evaluator, args Expressions) model.Value {
 // === predict_linear(node model.ValMatrix, k model.ValScalar) Vector ===
 func funcPredictLinear(ev *evaluator, args Expressions) model.Value {
-	vec := funcDeriv(ev, args[0:1]).(vector)
+	mat := ev.evalMatrix(args[0])
-	duration := model.SampleValue(model.SampleValue(ev.evalFloat(args[1])))
+	resultVector := make(vector, 0, len(mat))
 	duration := model.SampleValue(ev.evalFloat(args[1]))
-	excludedLabels := map[model.LabelName]struct{}{
+	for _, samples := range mat {
-		model.MetricNameLabel: {},
+		// No sense in trying to predict anything without at least two points.
-	}
+		// Drop this vector element.
 	// Calculate predicted delta over the duration.
 	signatureToDelta := map[uint64]model.SampleValue{}
 	for _, el := range vec {
 		signature := model.SignatureWithoutLabels(el.Metric.Metric, excludedLabels)
 		signatureToDelta[signature] = el.Value * duration
 	}
 	// add predicted delta to last value.
 	// TODO(beorn7): This is arguably suboptimal. The funcDeriv above has
 	// given us an estimate over the range. So we should add the delta to
 	// the value predicted for the end of the range. Also, once this has
 	// been rectified, we are not using BoundaryValues anywhere anymore, so
 	// we can kick out a whole lot of code.
 	matrixBounds := ev.evalMatrixBounds(args[0])
 	outVec := make(vector, 0, len(signatureToDelta))
 	for _, samples := range matrixBounds {
 		if len(samples.Values) < 2 {
 			continue
 		}
-		signature := model.SignatureWithoutLabels(samples.Metric.Metric, excludedLabels)
+		slope, intercept := linearRegression(samples.Values, ev.Timestamp)
-		delta, ok := signatureToDelta[signature]
+		resultSample := &sample{
-		if ok {
+			Metric:    samples.Metric,
-			samples.Metric.Del(model.MetricNameLabel)
+			Value:     slope*duration + intercept,
-			outVec = append(outVec, &sample{
+			Timestamp: ev.Timestamp,
 				Metric:    samples.Metric,
 				Value:     delta + samples.Values[1].Value,
 				Timestamp: ev.Timestamp,
 			})
 		}
 		resultSample.Metric.Del(model.MetricNameLabel)
 		resultVector = append(resultVector, resultSample)
 	}
-	return outVec
+	return resultVector
 }
 // === histogram_quantile(k model.ValScalar, vector model.ValVector) Vector ===
--- a/promql/testdata/functions.test
+++ b/promql/testdata/functions.test
@ -102,16 +102,28 @@ eval instant at 50m deriv(testcounter_reset_middle[100m])
 	{} 0.010606060606060607
 # predict_linear should return correct result.
 # X/s = [  0, 300, 600, 900,1200,1500,1800,2100,2400,2700,3000]
 # Y   = [  0,  10,  20,  30,  40,   0,  10,  20,  30,  40,  50]
 # sumX  = 16500
 # sumY  = 250
 # sumXY = 480000
 # sumX2 = 34650000
 # n     = 11
 # covXY = 105000
 # varX  = 9900000
 # slope = 0.010606060606060607
 # intercept at t=0: 6.818181818181818
 # intercept at t=3000: 38.63636363636364
 # intercept at t=3000+3600: 76.81818181818181
 eval instant at 50m predict_linear(testcounter_reset_middle[100m], 3600)
-	{} 88.181818181818185200
+	{} 76.81818181818181
-# predict_linear is syntactic sugar around deriv.
+# With http_requests, there is a sample value exactly at the end of
 # the range, and it has exactly the predicted value, so predict_linear
 # can be emulated with deriv.
 eval instant at 50m predict_linear(http_requests[50m], 3600) - (http_requests + deriv(http_requests[50m]) * 3600)
 	{group="canary", instance="1", job="app-server"} 0
 eval instant at 50m predict_linear(testcounter_reset_middle[100m], 3600) - (testcounter_reset_middle + deriv(testcounter_reset_middle[100m]) * 3600)
 	{} 0
 clear
 # Tests for label_replace.
--- a/storage/local/interface.go
+++ b/storage/local/interface.go
@ -79,9 +79,6 @@ type SeriesIterator interface {
 	// exist at precisely the given time, that value is returned. If no
 	// applicable value exists, ZeroSamplePair is returned.
 	ValueAtOrBeforeTime(model.Time) model.SamplePair
 	// Gets the boundary values of an interval: the first and last value
 	// within a given interval.
 	BoundaryValues(metric.Interval) []model.SamplePair
 	// Gets all values contained within a given interval.
 	RangeValues(metric.Interval) []model.SamplePair
 }
--- a/storage/local/series.go
+++ b/storage/local/series.go
@ -540,71 +540,6 @@ func (it *memorySeriesIterator) ValueAtOrBeforeTime(t model.Time) model.SamplePa
 	return it.chunkIt.valueAtOrBeforeTime(t)
 }
 // BoundaryValues implements SeriesIterator.
 func (it *memorySeriesIterator) BoundaryValues(in metric.Interval) []model.SamplePair {
 	// Find the first chunk for which the first sample is within the interval.
 	i := sort.Search(len(it.chunks), func(i int) bool {
 		return !it.chunks[i].firstTime().Before(in.OldestInclusive)
 	})
 	// Only now check the last timestamp of the previous chunk (which is
 	// fairly expensive).
 	if i > 0 && !it.chunkIterator(i-1).lastTimestamp().Before(in.OldestInclusive) {
 		i--
 	}
 	values := make([]model.SamplePair, 0, 2)
 	for j, c := range it.chunks[i:] {
 		if c.firstTime().After(in.NewestInclusive) {
 			if len(values) == 1 {
 				// We found the first value before but are now
 				// already past the last value. The value we
 				// want must be the last value of the previous
 				// chunk. So backtrack...
 				chunkIt := it.chunkIterator(i + j - 1)
 				values = append(values, model.SamplePair{
 					Timestamp: chunkIt.lastTimestamp(),
 					Value:     chunkIt.lastSampleValue(),
 				})
 			}
 			break
 		}
 		chunkIt := it.chunkIterator(i + j)
 		if len(values) == 0 {
 			for s := range chunkIt.values() {
 				if len(values) == 0 && !s.Timestamp.Before(in.OldestInclusive) {
 					values = append(values, *s)
 					// We cannot just break out here as we have to consume all
 					// the values to not leak a goroutine. This could obviously
 					// be made much neater with more suitable methods in the chunk
 					// interface. But currently, BoundaryValues is only used by
 					// `predict_linear` so we would pollute the chunk interface
 					// unduly just for one single corner case. Plus, even that use
 					// of BoundaryValues is suboptimal and should be replaced.
 				}
 			}
 		}
 		if chunkIt.lastTimestamp().After(in.NewestInclusive) {
 			s := chunkIt.valueAtOrBeforeTime(in.NewestInclusive)
 			if s.Timestamp != model.Earliest {
 				values = append(values, s)
 			}
 			break
 		}
 	}
 	if len(values) == 1 {
 		// We found exactly one value. In that case, add the most recent we know.
 		chunkIt := it.chunkIterator(len(it.chunks) - 1)
 		values = append(values, model.SamplePair{
 			Timestamp: chunkIt.lastTimestamp(),
 			Value:     chunkIt.lastSampleValue(),
 		})
 	}
 	if len(values) == 2 && values[0].Equal(&values[1]) {
 		return values[:1]
 	}
 	return values
 }
 // RangeValues implements SeriesIterator.
 func (it *memorySeriesIterator) RangeValues(in metric.Interval) []model.SamplePair {
 	// Find the first chunk for which the first sample is within the interval.
@ -653,11 +588,6 @@ func (it *singleSampleSeriesIterator) ValueAtOrBeforeTime(t model.Time) model.Sa
 	return it.samplePair
 }
 // BoundaryValues implements SeriesIterator.
 func (it *singleSampleSeriesIterator) BoundaryValues(in metric.Interval) []model.SamplePair {
 	return it.RangeValues(in)
 }
 // RangeValues implements SeriesIterator.
 func (it *singleSampleSeriesIterator) RangeValues(in metric.Interval) []model.SamplePair {
 	if it.samplePair.Timestamp.After(in.NewestInclusive) ||
@ -675,11 +605,6 @@ func (i nopSeriesIterator) ValueAtOrBeforeTime(t model.Time) model.SamplePair {
 	return ZeroSamplePair
 }
 // BoundaryValues implements SeriesIterator.
 func (i nopSeriesIterator) BoundaryValues(in metric.Interval) []model.SamplePair {
 	return []model.SamplePair{}
 }
 // RangeValues implements SeriesIterator.
 func (i nopSeriesIterator) RangeValues(in metric.Interval) []model.SamplePair {
 	return []model.SamplePair{}
--- a/storage/local/storage.go
+++ b/storage/local/storage.go
@ -374,17 +374,6 @@ func (bit *boundedIterator) ValueAtOrBeforeTime(ts model.Time) model.SamplePair
 	return bit.it.ValueAtOrBeforeTime(ts)
 }
 // BoundaryValues implements the SeriesIterator interface.
 func (bit *boundedIterator) BoundaryValues(interval metric.Interval) []model.SamplePair {
 	if interval.NewestInclusive < bit.start {
 		return []model.SamplePair{}
 	}
 	if interval.OldestInclusive < bit.start {
 		interval.OldestInclusive = bit.start
 	}
 	return bit.it.BoundaryValues(interval)
 }
 // RangeValues implements the SeriesIterator interface.
 func (bit *boundedIterator) RangeValues(interval metric.Interval) []model.SamplePair {
 	if interval.NewestInclusive < bit.start {
--- a/storage/local/storage_test.go
+++ b/storage/local/storage_test.go
@ -424,14 +424,6 @@ func TestRetentionCutoff(t *testing.T) {
 	if expt := now.Add(-1 * time.Hour).Add(time.Minute); vals[0].Timestamp != expt {
 		t.Errorf("unexpected timestamp for first sample: %v, expected %v", vals[0].Timestamp.Time(), expt.Time())
 	}
 	vals = it.BoundaryValues(metric.Interval{OldestInclusive: insertStart, NewestInclusive: now})
 	if len(vals) != 2 {
 		t.Errorf("expected 2 values but got %d", len(vals))
 	}
 	if expt := now.Add(-1 * time.Hour).Add(time.Minute); vals[0].Timestamp != expt {
 		t.Errorf("unexpected timestamp for first sample: %v, expected %v", vals[0].Timestamp.Time(), expt.Time())
 	}
 }
 func TestDropMetrics(t *testing.T) {
@ -1036,18 +1028,18 @@ func testEvictAndPurgeSeries(t *testing.T, encoding chunkEncoding) {
 	if err != nil {
 		t.Fatalf("Error preloading everything: %s", err)
 	}
-	actual := it.BoundaryValues(metric.Interval{
+	actual := it.RangeValues(metric.Interval{
 		OldestInclusive: 0,
 		NewestInclusive: 100000,
 	})
-	if len(actual) != 2 {
+	if len(actual) < 4000 {
-		t.Fatal("expected two results after purging half of series")
+		t.Fatalf("expected more than %d results after purging half of series, got %d", 4000, len(actual))
 	}
 	if actual[0].Timestamp < 6000 || actual[0].Timestamp > 10000 {
 		t.Errorf("1st timestamp out of expected range: %v", actual[0].Timestamp)
 	}
 	want := model.Time(19998)
-	if actual[1].Timestamp != want {
+	if actual[len(actual)-1].Timestamp != want {
 		t.Errorf("2nd timestamp: want %v, got %v", want, actual[1].Timestamp)
 	}
@ -1057,7 +1049,7 @@ func testEvictAndPurgeSeries(t *testing.T, encoding chunkEncoding) {
 	if err != nil {
 		t.Fatalf("Error preloading everything: %s", err)
 	}
-	actual = it.BoundaryValues(metric.Interval{
+	actual = it.RangeValues(metric.Interval{
 		OldestInclusive: 0,
 		NewestInclusive: 100000,
 	})
--- a/util/stats/query_stats.go
+++ b/util/stats/query_stats.go
@ -29,7 +29,6 @@ const (
 	ResultAppendTime
 	QueryAnalysisTime
 	GetValueAtTimeTime
 	GetBoundaryValuesTime
 	GetRangeValuesTime
 	ExecQueueTime
 	ViewDiskPreparationTime
@ -60,8 +59,6 @@ func (s QueryTiming) String() string {
 		return "Query analysis time"
 	case GetValueAtTimeTime:
 		return "GetValueAtTime() time"
 	case GetBoundaryValuesTime:
 		return "GetBoundaryValues() time"
 	case GetRangeValuesTime:
 		return "GetRangeValues() time"
 	case ExecQueueTime: