From acc114fe553b660cefc71a0311792ef8be4a186a Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Sun, 12 Nov 2023 15:51:37 +0100 Subject: [PATCH] Fix panic during tsdb Commit (#13092) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix panic during tsdb Commit Fixes the following panic: runtime error: invalid memory address or nil pointer dereference [signal SIGSEGV: segmentation violation code=0x1 addr=0x20 pc=0x19deb45] goroutine 651118930 [running]: github.com/prometheus/prometheus/tsdb.(*headAppender).Commit(0xc19100f7c0) /drone/src/vendor/github.com/prometheus/prometheus/tsdb/head_append.go:855 +0x245 github.com/prometheus/prometheus/tsdb.dbAppender.Commit({{0x35bd6f0?, 0xc19100f7c0?}, 0xc000fa4c00?}) /drone/src/vendor/github.com/prometheus/prometheus/tsdb/db.go:1159 +0x2f We theorize that the panic happened due to the series referenced by the exemplar being removed between AppendExemplar and Commit due to being idle. Signed-off-by: György Krajcsovits --- tsdb/head_append.go | 6 ++++++ tsdb/head_test.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tsdb/head_append.go b/tsdb/head_append.go index 785e99db0..be53a4f3f 100644 --- a/tsdb/head_append.go +++ b/tsdb/head_append.go @@ -751,6 +751,12 @@ func (a *headAppender) Commit() (err error) { // No errors logging to WAL, so pass the exemplars along to the in memory storage. for _, e := range a.exemplars { s := a.head.series.getByID(chunks.HeadSeriesRef(e.ref)) + if s == nil { + // This is very unlikely to happen, but we have seen it in the wild. + // It means that the series was truncated between AppendExemplar and Commit. + // See TestHeadCompactionWhileAppendAndCommitExemplar. + continue + } // We don't instrument exemplar appends here, all is instrumented by storage.
if err := a.head.exemplars.AddExemplar(s.lset, e.exemplar); err != nil { if err == storage.ErrOutOfOrderExemplar {
diff --git a/tsdb/head_test.go b/tsdb/head_test.go
index 253f92d61..f2325039a 100644
--- a/tsdb/head_test.go
+++ b/tsdb/head_test.go
@@ -5514,3 +5514,31 @@ func TestWALSampleAndExemplarOrder(t *testing.T) {
 		})
 	}
 }
+
+// TestHeadCompactionWhileAppendAndCommitExemplar simulates a series being
+// removed from the head between AppendExemplar and Commit. In theory this can
+// happen when the head is compacted at the right moment because the series
+// has been idle.
+// The test cheats a little by not appending a sample together with the exemplar.
+// If you also add a sample and run Truncate in a concurrent goroutine and run
+// the test around a million(!) times, you can get the
+// `unknown HeadSeriesRef when trying to add exemplar: 1` error on push.
+// It is likely that running the test for much longer and with more timing
+// variations would trigger the
+// `signal SIGSEGV: segmentation violation code=0x1 addr=0x20 pc=0xbb03d1`
+// panic that we have seen in the wild once.
+func TestHeadCompactionWhileAppendAndCommitExemplar(t *testing.T) {
+	h, _ := newTestHead(t, DefaultBlockDuration, wlog.CompressionNone, false)
+	t.Cleanup(func() { require.NoError(t, h.Close()) }) // Close the head even if an assertion below aborts the test.
+	app := h.Appender(context.Background())
+	lbls := labels.FromStrings("foo", "bar")
+	ref, err := app.Append(0, lbls, 1, 1)
+	require.NoError(t, err)
+	require.NoError(t, app.Commit())
+	// Not adding a sample here to trigger the fault.
+	app = h.Appender(context.Background())
+	_, err = app.AppendExemplar(ref, lbls, exemplar.Exemplar{Value: 1, Ts: 20})
+	require.NoError(t, err)
+	require.NoError(t, h.Truncate(10)) // Truncate between AppendExemplar and Commit, the window the fix guards.
+	require.NoError(t, app.Commit())   // Must neither panic nor fail for the pending exemplar.
+}