prometheus/tsdb/wlog/reader.go

// Copyright 2019 The Prometheus Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package wlog

import (
	"encoding/binary"
	"hash/crc32"
	"io"

	"github.com/golang/snappy"
	"github.com/klauspost/compress/zstd"
	"github.com/pkg/errors"
)

// Reader reads WAL records from an io.Reader.
type Reader struct {
	rdr         io.Reader
	err         error
	rec         []byte
	compressBuf []byte
	zstdReader  *zstd.Decoder
	buf         [pageSize]byte
	total       int64   // Total bytes processed.
	curRecTyp   recType // Used for checking that the last record is not torn.
}

// NewReader returns a new reader.
func NewReader(r io.Reader) *Reader {
	// Calling zstd.NewReader with a nil io.Reader and no options cannot return an error.
	zstdReader, _ := zstd.NewReader(nil)
	return &Reader{rdr: r, zstdReader: zstdReader}
}

// Next advances the reader to the next records and returns true if it exists.
// It must not be called again after it returned false.
func (r *Reader) Next() bool {
	err := r.next()
	if errors.Is(err, io.EOF) {
		// The last WAL segment record shouldn't be torn(should be full or last).
		// The last record would be torn after a crash just before
		// the last record part could be persisted to disk.
		if r.curRecTyp == recFirst || r.curRecTyp == recMiddle {
			r.err = errors.New("last record is torn")
		}
		return false
	}
	r.err = err
	return r.err == nil
}

func (r *Reader) next() (err error) {
	// We have to use r.buf since allocating byte arrays here fails escape
	// analysis and ends up on the heap, even though it seemingly should not.
	hdr := r.buf[:recordHeaderSize]
	buf := r.buf[recordHeaderSize:]

	r.rec = r.rec[:0]
	r.compressBuf = r.compressBuf[:0]

	i := 0
	for {
		if _, err = io.ReadFull(r.rdr, hdr[:1]); err != nil {
			return errors.Wrap(err, "read first header byte")
		}
		r.total++
		r.curRecTyp = recTypeFromHeader(hdr[0])
		isSnappyCompressed := hdr[0]&snappyMask == snappyMask
		isZstdCompressed := hdr[0]&zstdMask == zstdMask

		// Gobble up zero bytes.
		if r.curRecTyp == recPageTerm {
			// recPageTerm is a single byte that indicates the rest of the page is padded.
			// If it's the first byte in a page, buf is too small and
			// needs to be resized to fit pageSize-1 bytes.
			buf = r.buf[1:]

			// We are pedantic and check whether the zeros are actually up
			// to a page boundary.
			// It's not strictly necessary but may catch sketchy state early.
			k := pageSize - (r.total % pageSize)
			if k == pageSize {
				continue // Initial 0 byte was last page byte.
			}
			n, err := io.ReadFull(r.rdr, buf[:k])
			if err != nil {
				return errors.Wrap(err, "read remaining zeros")
			}
			r.total += int64(n)

			for _, c := range buf[:k] {
				if c != 0 {
					return errors.New("unexpected non-zero byte in padded page")
				}
			}
			continue
		}
		n, err := io.ReadFull(r.rdr, hdr[1:])
		if err != nil {
			return errors.Wrap(err, "read remaining header")
		}
		r.total += int64(n)

		var (
			length = binary.BigEndian.Uint16(hdr[1:])
			crc    = binary.BigEndian.Uint32(hdr[3:])
		)

		if length > pageSize-recordHeaderSize {
			return errors.Errorf("invalid record size %d", length)
		}
		n, err = io.ReadFull(r.rdr, buf[:length])
		if err != nil {
			return err
		}
		r.total += int64(n)

		if n != int(length) {
			return errors.Errorf("invalid size: expected %d, got %d", length, n)
		}
		if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc {
			return errors.Errorf("unexpected checksum %x, expected %x", c, crc)
		}

		if isSnappyCompressed || isZstdCompressed {
			r.compressBuf = append(r.compressBuf, buf[:length]...)
		} else {
			r.rec = append(r.rec, buf[:length]...)
		}

		if err := validateRecord(r.curRecTyp, i); err != nil {
			return err
		}
		if r.curRecTyp == recLast || r.curRecTyp == recFull {
			if isSnappyCompressed && len(r.compressBuf) > 0 {
				// The snappy library uses `len` to calculate if we need a new buffer.
				// In order to allocate as few buffers as possible make the length
				// equal to the capacity.
				r.rec = r.rec[:cap(r.rec)]
				r.rec, err = snappy.Decode(r.rec, r.compressBuf)
				return err
			} else if isZstdCompressed && len(r.compressBuf) > 0 {
				r.rec, err = r.zstdReader.DecodeAll(r.compressBuf, r.rec[:0])
				return err
			}
			return nil
		}

		// Only increment i for non-zero records since we use it
		// to determine valid content record sequences.
		i++
	}
}

// Err returns the last encountered error wrapped in a corruption error.
// If the reader does not allow to infer a segment index and offset, a total
// offset in the reader stream will be provided.
func (r *Reader) Err() error {
	if r.err == nil {
		return nil
	}
	if b, ok := r.rdr.(*segmentBufReader); ok {
		return &CorruptionErr{
			Err:     r.err,
			Dir:     b.segs[b.cur].Dir(),
			Segment: b.segs[b.cur].Index(),
			Offset:  int64(b.off),
		}
	}
	return &CorruptionErr{
		Err:     r.err,
		Segment: -1,
		Offset:  r.total,
	}
}

// Record returns the current record. The returned byte slice is only
// valid until the next call to Next.
func (r *Reader) Record() []byte {
	return r.rec
}

// Segment returns the current segment being read.
func (r *Reader) Segment() int {
	if b, ok := r.rdr.(*segmentBufReader); ok {
		return b.segs[b.cur].Index()
	}
	return -1
}

// Offset returns the current position of the segment being read.
func (r *Reader) Offset() int64 {
	if b, ok := r.rdr.(*segmentBufReader); ok {
		return int64(b.off)
	}
	return r.total
}
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00			`// Copyright 2019 The Prometheus Authors`
			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

TSDB: Rename wal package to wlog (#11352) The wlog.WL type can now be used to create a Write Ahead Log or a Write Behind Log. Before the prefix for wbl metrics was 'prometheus_tsdb_out_of_order_wal_' and has been replaced with 'prometheus_tsdb_out_of_order_wbl_'. Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Jesus Vazquez <jesusvazquez@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> 2022-10-10 15:08:46 +00:00			`package wlog`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00
			`import (`
			`"encoding/binary"`
			`"hash/crc32"`
			`"io"`

Provide option to compress WAL records (#609) In running Prometheus instances, compressing the records was shown to reduce disk usage by half while incurring a negligible CPU cost. Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com> 2019-06-19 13:46:24 +00:00			`"github.com/golang/snappy"`
Add Zstandard compression option for wlog (#11666) Snappy remains as the default compression but there is now a flag to switch the compression algorithm. Signed-off-by: Justin Lei <justin.lei@grafana.com> 2023-07-11 12:57:57 +00:00			`"github.com/klauspost/compress/zstd"`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00			`"github.com/pkg/errors"`
			`)`

			`// Reader reads WAL records from an io.Reader.`
			`type Reader struct {`
Add Zstandard compression option for wlog (#11666) Snappy remains as the default compression but there is now a flag to switch the compression algorithm. Signed-off-by: Justin Lei <justin.lei@grafana.com> 2023-07-11 12:57:57 +00:00			`rdr io.Reader`
			`err error`
			`rec []byte`
			`compressBuf []byte`
			`zstdReader *zstd.Decoder`
			`buf [pageSize]byte`
			`total int64 // Total bytes processed.`
			`curRecTyp recType // Used for checking that the last record is not torn.`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00			`}`

			`// NewReader returns a new reader.`
			`func NewReader(r io.Reader) *Reader {`
Add Zstandard compression option for wlog (#11666) Snappy remains as the default compression but there is now a flag to switch the compression algorithm. Signed-off-by: Justin Lei <justin.lei@grafana.com> 2023-07-11 12:57:57 +00:00			`// Calling zstd.NewReader with a nil io.Reader and no options cannot return an error.`
			`zstdReader, _ := zstd.NewReader(nil)`
			`return &Reader{rdr: r, zstdReader: zstdReader}`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00			`}`

			`// Next advances the reader to the next records and returns true if it exists.`
			`// It must not be called again after it returned false.`
			`func (r *Reader) Next() bool {`
			`err := r.next()`
Use errors.Is to check for a specific error Signed-off-by: Fish-pro <zechun.chen@daocloud.io> 2022-12-29 15:23:07 +00:00			`if errors.Is(err, io.EOF) {`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00			`// The last WAL segment record shouldn't be torn(should be full or last).`
			`// The last record would be torn after a crash just before`
			`// the last record part could be persisted to disk.`
Provide option to compress WAL records (#609) In running Prometheus instances, compressing the records was shown to reduce disk usage by half while incurring a negligible CPU cost. Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com> 2019-06-19 13:46:24 +00:00			`if r.curRecTyp == recFirst \|\| r.curRecTyp == recMiddle {`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00			`r.err = errors.New("last record is torn")`
			`}`
			`return false`
			`}`
			`r.err = err`
			`return r.err == nil`
			`}`

			`func (r *Reader) next() (err error) {`
			`// We have to use r.buf since allocating byte arrays here fails escape`
			`// analysis and ends up on the heap, even though it seemingly should not.`
			`hdr := r.buf[:recordHeaderSize]`
			`buf := r.buf[recordHeaderSize:]`

			`r.rec = r.rec[:0]`
Add Zstandard compression option for wlog (#11666) Snappy remains as the default compression but there is now a flag to switch the compression algorithm. Signed-off-by: Justin Lei <justin.lei@grafana.com> 2023-07-11 12:57:57 +00:00			`r.compressBuf = r.compressBuf[:0]`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00
			`i := 0`
			`for {`
			`if _, err = io.ReadFull(r.rdr, hdr[:1]); err != nil {`
			`return errors.Wrap(err, "read first header byte")`
			`}`
			`r.total++`
Provide option to compress WAL records (#609) In running Prometheus instances, compressing the records was shown to reduce disk usage by half while incurring a negligible CPU cost. Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com> 2019-06-19 13:46:24 +00:00			`r.curRecTyp = recTypeFromHeader(hdr[0])`
Add Zstandard compression option for wlog (#11666) Snappy remains as the default compression but there is now a flag to switch the compression algorithm. Signed-off-by: Justin Lei <justin.lei@grafana.com> 2023-07-11 12:57:57 +00:00			`isSnappyCompressed := hdr[0]&snappyMask == snappyMask`
			`isZstdCompressed := hdr[0]&zstdMask == zstdMask`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00
			`// Gobble up zero bytes.`
			`if r.curRecTyp == recPageTerm {`
			`// recPageTerm is a single byte that indicates the rest of the page is padded.`
			`// If it's the first byte in a page, buf is too small and`
			`// needs to be resized to fit pageSize-1 bytes.`
			`buf = r.buf[1:]`

			`// We are pedantic and check whether the zeros are actually up`
			`// to a page boundary.`
			`// It's not strictly necessary but may catch sketchy state early.`
			`k := pageSize - (r.total % pageSize)`
			`if k == pageSize {`
			`continue // Initial 0 byte was last page byte.`
			`}`
			`n, err := io.ReadFull(r.rdr, buf[:k])`
			`if err != nil {`
			`return errors.Wrap(err, "read remaining zeros")`
			`}`
			`r.total += int64(n)`

			`for _, c := range buf[:k] {`
			`if c != 0 {`
			`return errors.New("unexpected non-zero byte in padded page")`
			`}`
			`}`
			`continue`
			`}`
			`n, err := io.ReadFull(r.rdr, hdr[1:])`
			`if err != nil {`
			`return errors.Wrap(err, "read remaining header")`
			`}`
			`r.total += int64(n)`

			`var (`
			`length = binary.BigEndian.Uint16(hdr[1:])`
			`crc = binary.BigEndian.Uint32(hdr[3:])`
			`)`

			`if length > pageSize-recordHeaderSize {`
			`return errors.Errorf("invalid record size %d", length)`
			`}`
			`n, err = io.ReadFull(r.rdr, buf[:length])`
			`if err != nil {`
			`return err`
			`}`
			`r.total += int64(n)`

			`if n != int(length) {`
			`return errors.Errorf("invalid size: expected %d, got %d", length, n)`
			`}`
			`if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc {`
			`return errors.Errorf("unexpected checksum %x, expected %x", c, crc)`
			`}`
Provide option to compress WAL records (#609) In running Prometheus instances, compressing the records was shown to reduce disk usage by half while incurring a negligible CPU cost. Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com> 2019-06-19 13:46:24 +00:00
Add Zstandard compression option for wlog (#11666) Snappy remains as the default compression but there is now a flag to switch the compression algorithm. Signed-off-by: Justin Lei <justin.lei@grafana.com> 2023-07-11 12:57:57 +00:00			`if isSnappyCompressed \|\| isZstdCompressed {`
			`r.compressBuf = append(r.compressBuf, buf[:length]...)`
Provide option to compress WAL records (#609) In running Prometheus instances, compressing the records was shown to reduce disk usage by half while incurring a negligible CPU cost. Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com> 2019-06-19 13:46:24 +00:00			`} else {`
			`r.rec = append(r.rec, buf[:length]...)`
			`}`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00
			`if err := validateRecord(r.curRecTyp, i); err != nil {`
			`return err`
			`}`
			`if r.curRecTyp == recLast \|\| r.curRecTyp == recFull {`
Add Zstandard compression option for wlog (#11666) Snappy remains as the default compression but there is now a flag to switch the compression algorithm. Signed-off-by: Justin Lei <justin.lei@grafana.com> 2023-07-11 12:57:57 +00:00			`if isSnappyCompressed && len(r.compressBuf) > 0 {`
Provide option to compress WAL records (#609) In running Prometheus instances, compressing the records was shown to reduce disk usage by half while incurring a negligible CPU cost. Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com> 2019-06-19 13:46:24 +00:00			// The snappy library uses `len` to calculate if we need a new buffer.
			`// In order to allocate as few buffers as possible make the length`
			`// equal to the capacity.`
			`r.rec = r.rec[:cap(r.rec)]`
Add Zstandard compression option for wlog (#11666) Snappy remains as the default compression but there is now a flag to switch the compression algorithm. Signed-off-by: Justin Lei <justin.lei@grafana.com> 2023-07-11 12:57:57 +00:00			`r.rec, err = snappy.Decode(r.rec, r.compressBuf)`
			`return err`
			`} else if isZstdCompressed && len(r.compressBuf) > 0 {`
			`r.rec, err = r.zstdReader.DecodeAll(r.compressBuf, r.rec[:0])`
Provide option to compress WAL records (#609) In running Prometheus instances, compressing the records was shown to reduce disk usage by half while incurring a negligible CPU cost. Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com> 2019-06-19 13:46:24 +00:00			`return err`
			`}`
LiveReader can get into an infinite loop on corrupt WALs. (#524) Make WAL live tailer return EOF when the there is a half-written record at the end of the file. Previously, this would cause an infinite loop as we ignored EOFs when filling the buffer. We now differentiate between EOFs that read >0 bytes, and EOFs that didn't. Add some more unit tests for tailing a corrupt WAL, and unify interfaces Reader and LiveReader for the purposes of testing. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com> 2019-02-19 14:33:57 +00:00			`return nil`
			`}`

			`// Only increment i for non-zero records since we use it`
			`// to determine valid content record sequences.`
			`i++`
			`}`
			`}`

			`// Err returns the last encountered error wrapped in a corruption error.`
			`// If the reader does not allow to infer a segment index and offset, a total`
			`// offset in the reader stream will be provided.`
			`func (r *Reader) Err() error {`
			`if r.err == nil {`
			`return nil`
			`}`
			`if b, ok := r.rdr.(*segmentBufReader); ok {`
			`return &CorruptionErr{`
			`Err: r.err,`
			`Dir: b.segs[b.cur].Dir(),`
			`Segment: b.segs[b.cur].Index(),`
			`Offset: int64(b.off),`
			`}`
			`}`
			`return &CorruptionErr{`
			`Err: r.err,`
			`Segment: -1,`
			`Offset: r.total,`
			`}`
			`}`

			`// Record returns the current record. The returned byte slice is only`
			`// valid until the next call to Next.`
			`func (r *Reader) Record() []byte {`
			`return r.rec`
			`}`

			`// Segment returns the current segment being read.`
			`func (r *Reader) Segment() int {`
			`if b, ok := r.rdr.(*segmentBufReader); ok {`
			`return b.segs[b.cur].Index()`
			`}`
			`return -1`
			`}`

			`// Offset returns the current position of the segment being read.`
			`func (r *Reader) Offset() int64 {`
			`if b, ok := r.rdr.(*segmentBufReader); ok {`
			`return int64(b.off)`
			`}`
			`return r.total`
			`}`