From b1e4052682c710526681155f4067eafc86f0809d Mon Sep 17 00:00:00 2001
From: Oleg Zaytsev
Date: Tue, 5 Nov 2024 12:59:57 +0100
Subject: [PATCH] MemPostings.Delete(): make pauses to unlock and let the
 readers read (#15242)

This introduces back some unlocking that was removed in #13286, but in a
more balanced way, as suggested by @pracucci.

For TSDBs with a lot of churn, Delete() can take a couple of seconds, and
while it's holding the mutex, reads and writes are blocked waiting for
that mutex, increasing the number of in-flight connections and the memory
usage.

This implementation pauses every 512 labels processed (note that, compared
to #13286, we're no longer processing all the label-values but only the
affected ones, because of #14307), makes sure that it's possible to get
the read lock, and then waits a millisecond more before re-acquiring the
write lock.

Signed-off-by: Oleg Zaytsev
Co-authored-by: Marco Pracucci
---
 tsdb/index/postings.go | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tsdb/index/postings.go b/tsdb/index/postings.go
index 5ed41f769..d7b497e61 100644
--- a/tsdb/index/postings.go
+++ b/tsdb/index/postings.go
@@ -24,6 +24,7 @@ import (
 	"sort"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/bboreham/go-loser"
 
@@ -312,8 +313,30 @@ func (p *MemPostings) Delete(deleted map[storage.SeriesRef]struct{}, affected map[labels.Label]struct{}) {
 		}
 	}
 
+	i := 0
 	for l := range affected {
+		i++
 		process(l)
+
+		// From time to time we want some readers to go through and read their postings.
+		// It takes around 50ms to process a 1K series batch, and 120ms to process a 10K series batch (local benchmarks on an M3).
+		// Note that a read query will most likely want to read multiple postings lists, say 5, 10 or 20 (depending on the number of matchers),
+		// and that read query will most likely evaluate only one of those matchers before we unpause here, so we want to pause often.
+		if i%512 == 0 {
+			p.mtx.Unlock()
+			// While it's tempting to just do a `time.Sleep(time.Millisecond)` here,
+			// it wouldn't ensure that readers were actually able to get the read lock,
+			// because if there are writes waiting on the same mutex, readers won't be able to get it.
+			// So we just grab one RLock ourselves.
+			p.mtx.RLock()
+			// We shouldn't wait here, because we would be blocking a potential write for no reason.
+			// Note that if there's a writer waiting for us to unlock, no reader will be able to get the read lock.
+			p.mtx.RUnlock() //nolint:staticcheck // SA2001: this is an intentionally empty critical section.
+			// Now we can wait a little bit just to increase the chance of a reader getting the lock.
+			// If we were deleting 100M series here, pausing every 512 labels with 1ms sleeps would add an extra ~200s, which is negligible.
+			time.Sleep(time.Millisecond)
+			p.mtx.Lock()
+		}
 	}
 	process(allPostingsKey)
 }
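
For readers who want to see the lock-yielding pattern outside the Prometheus codebase, the sketch below is a minimal, self-contained illustration of the same idea under simplified assumptions: a hypothetical store type guarded by a sync.RWMutex, with made-up names (store, deleteMany, get, yieldEvery) that are not part of the actual patch. It shows how a long write-locked loop can periodically release the write lock, take and release a read lock to confirm that queued readers could get through, sleep briefly, and then re-acquire the write lock.

package main

import (
	"fmt"
	"sync"
	"time"
)

// store is a hypothetical map guarded by an RWMutex, standing in for a structure
// like MemPostings. All names here are illustrative, not Prometheus APIs.
type store struct {
	mtx  sync.RWMutex
	data map[int]string
}

// deleteMany removes the given keys while periodically yielding the write lock,
// so concurrent readers are not starved for the whole duration of the deletion.
func (s *store) deleteMany(keys []int) {
	const yieldEvery = 512 // same batch size the patch uses

	s.mtx.Lock()
	defer s.mtx.Unlock()
	for i, k := range keys {
		delete(s.data, k)

		if (i+1)%yieldEvery == 0 {
			s.mtx.Unlock()
			// Acquiring (and immediately releasing) the read lock proves that readers
			// queued behind us had a chance to get it; a bare sleep would not, because
			// a waiting writer blocks new readers on a sync.RWMutex.
			s.mtx.RLock()
			s.mtx.RUnlock()
			// A short sleep increases the chance that real readers run before we re-lock.
			time.Sleep(time.Millisecond)
			s.mtx.Lock()
		}
	}
}

// get is a reader that only needs the read lock.
func (s *store) get(k int) (string, bool) {
	s.mtx.RLock()
	defer s.mtx.RUnlock()
	v, ok := s.data[k]
	return v, ok
}

func main() {
	s := &store{data: make(map[int]string)}
	keys := make([]int, 0, 10000)
	for i := 0; i < 10000; i++ {
		s.data[i] = "value"
		keys = append(keys, i)
	}

	done := make(chan struct{})
	go func() {
		// A concurrent reader: with the yielding pattern it is blocked for at most
		// one batch of deletions plus the short sleep, not the whole loop.
		for i := 0; i < 100; i++ {
			s.get(i)
		}
		close(done)
	}()

	s.deleteMany(keys)
	<-done
	fmt.Println("remaining entries:", len(s.data))
}

In this sketch, main runs a reader goroutine concurrently with deleteMany; without the yield block, that reader could be blocked for the entire deletion, which is the behavior the patch is trying to avoid in MemPostings.Delete().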