prometheus/tsdb/tombstones/tombstones.go

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tombstones

import (
	"encoding/binary"
	"errors"
	"fmt"
	"hash"
	"hash/crc32"
	"log/slog"
	"math"
	"os"
	"path/filepath"
	"sort"
	"sync"

	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb/encoding"
	tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
	"github.com/prometheus/prometheus/tsdb/fileutil"
)

const TombstonesFilename = "tombstones"

const (
	// MagicTombstone is 4 bytes at the head of a tombstone file.
	MagicTombstone = 0x0130BA30

	tombstoneFormatV1          = 1
	tombstoneFormatVersionSize = 1
	tombstonesHeaderSize       = 5
	tombstonesCRCSize          = 4
)

// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// before.
var castagnoliTable *crc32.Table

func init() {
	castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
}

// newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
// polynomial may be easily changed in one location at a later time, if necessary.
func newCRC32() hash.Hash32 {
	return crc32.New(castagnoliTable)
}

// Reader gives access to tombstone intervals by series reference.
type Reader interface {
	// Get returns deletion intervals for the series with the given reference.
	Get(ref storage.SeriesRef) (Intervals, error)

	// Iter calls the given function for each encountered interval.
	Iter(func(storage.SeriesRef, Intervals) error) error

	// Total returns the total count of tombstones.
	Total() uint64

	// Close any underlying resources
	Close() error
}

func WriteFile(logger *slog.Logger, dir string, tr Reader) (int64, error) {
	path := filepath.Join(dir, TombstonesFilename)
	tmp := path + ".tmp"
	hash := newCRC32()
	var size int

	f, err := os.Create(tmp)
	if err != nil {
		return 0, err
	}
	defer func() {
		if f != nil {
			if err := f.Close(); err != nil {
				logger.Error("close tmp file", "err", err.Error())
			}
		}
		if err := os.RemoveAll(tmp); err != nil {
			logger.Error("remove tmp file", "err", err.Error())
		}
	}()

	buf := encoding.Encbuf{B: make([]byte, 3*binary.MaxVarintLen64)}
	buf.Reset()
	// Write the meta.
	buf.PutBE32(MagicTombstone)
	n, err := f.Write(buf.Get())
	if err != nil {
		return 0, err
	}
	size += n

	bytes, err := Encode(tr)
	if err != nil {
		return 0, fmt.Errorf("encoding tombstones: %w", err)
	}

	// Ignore first byte which is the format type. We do this for compatibility.
	if _, err := hash.Write(bytes[tombstoneFormatVersionSize:]); err != nil {
		return 0, fmt.Errorf("calculating hash for tombstones: %w", err)
	}

	n, err = f.Write(bytes)
	if err != nil {
		return 0, fmt.Errorf("writing tombstones: %w", err)
	}
	size += n

	n, err = f.Write(hash.Sum(nil))
	if err != nil {
		return 0, err
	}
	size += n

	if err := f.Sync(); err != nil {
		return 0, tsdb_errors.NewMulti(err, f.Close()).Err()
	}

	if err = f.Close(); err != nil {
		return 0, err
	}
	f = nil
	return int64(size), fileutil.Replace(tmp, path)
}

// Encode encodes the tombstones from the reader.
// It does not attach any magic number or checksum.
func Encode(tr Reader) ([]byte, error) {
	buf := encoding.Encbuf{}
	buf.PutByte(tombstoneFormatV1)
	err := tr.Iter(func(ref storage.SeriesRef, ivs Intervals) error {
		for _, iv := range ivs {
			buf.PutUvarint64(uint64(ref))
			buf.PutVarint64(iv.Mint)
			buf.PutVarint64(iv.Maxt)
		}
		return nil
	})
	return buf.Get(), err
}

// Decode decodes the tombstones from the bytes
// which was encoded using the Encode method.
func Decode(b []byte) (Reader, error) {
	d := &encoding.Decbuf{B: b}
	if flag := d.Byte(); flag != tombstoneFormatV1 {
		return nil, fmt.Errorf("invalid tombstone format %x", flag)
	}

	if d.Err() != nil {
		return nil, d.Err()
	}

	stonesMap := NewMemTombstones()
	for d.Len() > 0 {
		k := storage.SeriesRef(d.Uvarint64())
		mint := d.Varint64()
		maxt := d.Varint64()
		if d.Err() != nil {
			return nil, d.Err()
		}

		stonesMap.AddInterval(k, Interval{mint, maxt})
	}
	return stonesMap, nil
}

// Stone holds the information on the posting and time-range
// that is deleted.
type Stone struct {
	Ref       storage.SeriesRef
	Intervals Intervals
}

func ReadTombstones(dir string) (Reader, int64, error) {
	b, err := os.ReadFile(filepath.Join(dir, TombstonesFilename))
	switch {
	case os.IsNotExist(err):
		return NewMemTombstones(), 0, nil
	case err != nil:
		return nil, 0, err
	}

	if len(b) < tombstonesHeaderSize {
		return nil, 0, fmt.Errorf("tombstones header: %w", encoding.ErrInvalidSize)
	}

	d := &encoding.Decbuf{B: b[:len(b)-tombstonesCRCSize]}
	if mg := d.Be32(); mg != MagicTombstone {
		return nil, 0, fmt.Errorf("invalid magic number %x", mg)
	}

	// Verify checksum.
	hash := newCRC32()
	// Ignore first byte which is the format type.
	if _, err := hash.Write(d.Get()[tombstoneFormatVersionSize:]); err != nil {
		return nil, 0, fmt.Errorf("write to hash: %w", err)
	}
	if binary.BigEndian.Uint32(b[len(b)-tombstonesCRCSize:]) != hash.Sum32() {
		return nil, 0, errors.New("checksum did not match")
	}

	if d.Err() != nil {
		return nil, 0, d.Err()
	}

	stonesMap, err := Decode(d.Get())
	if err != nil {
		return nil, 0, err
	}

	return stonesMap, int64(len(b)), nil
}

type MemTombstones struct {
	intvlGroups map[storage.SeriesRef]Intervals
	mtx         sync.RWMutex
}

// NewMemTombstones creates new in memory Tombstone Reader
// that allows adding new intervals.
func NewMemTombstones() *MemTombstones {
	return &MemTombstones{intvlGroups: make(map[storage.SeriesRef]Intervals)}
}

func NewTestMemTombstones(intervals []Intervals) *MemTombstones {
	ret := NewMemTombstones()
	for i, intervalsGroup := range intervals {
		for _, interval := range intervalsGroup {
			ret.AddInterval(storage.SeriesRef(i+1), interval)
		}
	}
	return ret
}

func (t *MemTombstones) Get(ref storage.SeriesRef) (Intervals, error) {
	t.mtx.RLock()
	defer t.mtx.RUnlock()
	intervals, ok := t.intvlGroups[ref]
	if !ok {
		return nil, nil
	}
	// Make a copy to avoid race.
	res := make(Intervals, len(intervals))
	copy(res, intervals)
	return res, nil
}

func (t *MemTombstones) DeleteTombstones(refs map[storage.SeriesRef]struct{}) {
	t.mtx.Lock()
	defer t.mtx.Unlock()
	for ref := range refs {
		delete(t.intvlGroups, ref)
	}
}

func (t *MemTombstones) TruncateBefore(beforeT int64) {
	t.mtx.Lock()
	defer t.mtx.Unlock()
	for ref, ivs := range t.intvlGroups {
		i := len(ivs) - 1
		for ; i >= 0; i-- {
			if beforeT > ivs[i].Maxt {
				break
			}
		}
		if len(ivs[i+1:]) == 0 {
			delete(t.intvlGroups, ref)
		} else {
			newIvs := make(Intervals, len(ivs[i+1:]))
			copy(newIvs, ivs[i+1:])
			t.intvlGroups[ref] = newIvs
		}
	}
}

func (t *MemTombstones) Iter(f func(storage.SeriesRef, Intervals) error) error {
	t.mtx.RLock()
	defer t.mtx.RUnlock()
	for ref, ivs := range t.intvlGroups {
		if err := f(ref, ivs); err != nil {
			return err
		}
	}
	return nil
}

func (t *MemTombstones) Total() uint64 {
	t.mtx.RLock()
	defer t.mtx.RUnlock()

	total := uint64(0)
	for _, ivs := range t.intvlGroups {
		total += uint64(len(ivs))
	}
	return total
}

// AddInterval to an existing memTombstones.
func (t *MemTombstones) AddInterval(ref storage.SeriesRef, itvs ...Interval) {
	t.mtx.Lock()
	defer t.mtx.Unlock()
	for _, itv := range itvs {
		t.intvlGroups[ref] = t.intvlGroups[ref].Add(itv)
	}
}

func (*MemTombstones) Close() error {
	return nil
}

// Interval represents a single time-interval.
type Interval struct {
	Mint, Maxt int64
}

func (tr Interval) InBounds(t int64) bool {
	return t >= tr.Mint && t <= tr.Maxt
}

func (tr Interval) IsSubrange(dranges Intervals) bool {
	for _, r := range dranges {
		if r.InBounds(tr.Mint) && r.InBounds(tr.Maxt) {
			return true
		}
	}

	return false
}

// Intervals represents	a set of increasing and non-overlapping time-intervals.
type Intervals []Interval

// Add the new time-range to the existing ones.
// The existing ones must be sorted.
func (in Intervals) Add(n Interval) Intervals {
	if len(in) == 0 {
		return append(in, n)
	}
	// Find min and max indexes of intervals that overlap with the new interval.
	// Intervals are closed [t1, t2] and t is discreet, so if neighbour intervals are 1 step difference
	// to the new one, we can merge those together.
	mini := 0
	if n.Mint != math.MinInt64 { // Avoid overflow.
		mini = sort.Search(len(in), func(i int) bool { return in[i].Maxt >= n.Mint-1 })
		if mini == len(in) {
			return append(in, n)
		}
	}

	maxi := len(in)
	if n.Maxt != math.MaxInt64 { // Avoid overflow.
		maxi = sort.Search(len(in)-mini, func(i int) bool { return in[mini+i].Mint > n.Maxt+1 })
		if maxi == 0 {
			if mini == 0 {
				return append(Intervals{n}, in...)
			}
			return append(in[:mini], append(Intervals{n}, in[mini:]...)...)
		}
	}

	if n.Mint < in[mini].Mint {
		in[mini].Mint = n.Mint
	}
	in[mini].Maxt = in[maxi+mini-1].Maxt
	if n.Maxt > in[mini].Maxt {
		in[mini].Maxt = n.Maxt
	}
	return append(in[:mini+1], in[maxi+mini:]...)
}