From 296f943ec4aad56409f35e3916ceabf9f8c0b61c Mon Sep 17 00:00:00 2001
From: Brian Brazil <brian.brazil@robustperception.io>
Date: Thu, 3 Jan 2019 16:59:52 +0000
Subject: [PATCH] More efficient Merge implementation. (#486)

Avoid a tree of merge objects, which can result in
what I suspect is n^2 calls to Seek when using Without.

With 100k metrics, and a regex of ^$ in BenchmarkHeadPostingForMatchers:

Before:
BenchmarkHeadPostingForMatchers-8              1        51633185216 ns/op      29745528 B/op      200357 allocs/op

After:
BenchmarkHeadPostingForMatchers-8             10         108924996 ns/op 25715025 B/op     101748 allocs/op

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
---
 index/postings.go      | 89 ++++++++----------------------------------
 index/postings_test.go |  6 +--
 2 files changed, 20 insertions(+), 75 deletions(-)

diff --git a/index/postings.go b/index/postings.go
index 1f2ee0d6f..13df1c69a 100644
--- a/index/postings.go
+++ b/index/postings.go
@@ -366,80 +366,25 @@ func Merge(its ...Postings) Postings {
 	if len(its) == 1 {
 		return its[0]
 	}
-	l := len(its) / 2
-	return newMergedPostings(Merge(its[:l]...), Merge(its[l:]...))
-}
-
-type mergedPostings struct {
-	a, b        Postings
-	initialized bool
-	aok, bok    bool
-	cur         uint64
-}
-
-func newMergedPostings(a, b Postings) *mergedPostings {
-	return &mergedPostings{a: a, b: b}
-}
-
-func (it *mergedPostings) At() uint64 {
-	return it.cur
-}
-
-func (it *mergedPostings) Next() bool {
-	if !it.initialized {
-		it.aok = it.a.Next()
-		it.bok = it.b.Next()
-		it.initialized = true
-	}
-
-	if !it.aok && !it.bok {
-		return false
-	}
-
-	if !it.aok {
-		it.cur = it.b.At()
-		it.bok = it.b.Next()
-		return true
-	}
-	if !it.bok {
-		it.cur = it.a.At()
-		it.aok = it.a.Next()
-		return true
-	}
-
-	acur, bcur := it.a.At(), it.b.At()
-
-	if acur < bcur {
-		it.cur = acur
-		it.aok = it.a.Next()
-	} else if acur > bcur {
-		it.cur = bcur
-		it.bok = it.b.Next()
-	} else {
-		it.cur = acur
-		it.aok = it.a.Next()
-		it.bok = it.b.Next()
-	}
-	return true
-}
-
-func (it *mergedPostings) Seek(id uint64) bool {
-	if it.cur >= id {
-		return true
+	// All the uses of this function immediately expand it, so
+	// collect everything in a map. This is more efficient
+	// when there's 100ks of postings, compared to
+	// having a tree of merge objects.
+	pm := make(map[uint64]struct{}, len(its))
+	for _, it := range its {
+		for it.Next() {
+			pm[it.At()] = struct{}{}
+		}
+		if it.Err() != nil {
+			return ErrPostings(it.Err())
+		}
 	}
-
-	it.aok = it.a.Seek(id)
-	it.bok = it.b.Seek(id)
-	it.initialized = true
-
-	return it.Next()
-}
-
-func (it *mergedPostings) Err() error {
-	if it.a.Err() != nil {
-		return it.a.Err()
+	pl := make([]uint64, 0, len(pm))
+	for p := range pm {
+		pl = append(pl, p)
 	}
-	return it.b.Err()
+	sort.Slice(pl, func(i, j int) bool { return pl[i] < pl[j] })
+	return newListPostings(pl)
 }
 
 // Without returns a new postings list that contains all elements from the full list that
diff --git a/index/postings_test.go b/index/postings_test.go
index 53a9d95fc..54c37f480 100644
--- a/index/postings_test.go
+++ b/index/postings_test.go
@@ -233,7 +233,7 @@ func TestMergedPostings(t *testing.T) {
 		a := newListPostings(c.a)
 		b := newListPostings(c.b)
 
-		res, err := ExpandPostings(newMergedPostings(a, b))
+		res, err := ExpandPostings(Merge(a, b))
 		testutil.Ok(t, err)
 		testutil.Equals(t, c.res, res)
 	}
@@ -286,7 +286,7 @@ func TestMergedPostingsSeek(t *testing.T) {
 		a := newListPostings(c.a)
 		b := newListPostings(c.b)
 
-		p := newMergedPostings(a, b)
+		p := Merge(a, b)
 
 		testutil.Equals(t, c.success, p.Seek(c.seek))
 
@@ -546,7 +546,7 @@ func TestIntersectWithMerge(t *testing.T) {
 	// https://github.com/prometheus/prometheus/issues/2616
 	a := newListPostings([]uint64{21, 22, 23, 24, 25, 30})
 
-	b := newMergedPostings(
+	b := Merge(
 		newListPostings([]uint64{10, 20, 30}),
 		newListPostings([]uint64{15, 26, 30}),
 	)