// Copyright 2018 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package wlog import ( "errors" "fmt" "io" "log/slog" "math" "os" "path/filepath" "slices" "strconv" "strings" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/tsdb/chunks" tsdb_errors "github.com/prometheus/prometheus/tsdb/errors" "github.com/prometheus/prometheus/tsdb/fileutil" "github.com/prometheus/prometheus/tsdb/record" "github.com/prometheus/prometheus/tsdb/tombstones" ) // CheckpointStats returns stats about a created checkpoint. type CheckpointStats struct { DroppedSeries int DroppedSamples int // Includes histograms. DroppedTombstones int DroppedExemplars int DroppedMetadata int TotalSeries int // Processed series including dropped ones. TotalSamples int // Processed float and histogram samples including dropped ones. TotalTombstones int // Processed tombstones including dropped ones. TotalExemplars int // Processed exemplars including dropped ones. TotalMetadata int // Processed metadata including dropped ones. } // LastCheckpoint returns the directory name and index of the most recent checkpoint. // If dir does not contain any checkpoints, ErrNotFound is returned. func LastCheckpoint(dir string) (string, int, error) { checkpoints, err := listCheckpoints(dir) if err != nil { return "", 0, err } if len(checkpoints) == 0 { return "", 0, record.ErrNotFound } checkpoint := checkpoints[len(checkpoints)-1] return filepath.Join(dir, checkpoint.name), checkpoint.index, nil } // DeleteCheckpoints deletes all checkpoints in a directory below a given index. func DeleteCheckpoints(dir string, maxIndex int) error { checkpoints, err := listCheckpoints(dir) if err != nil { return err } errs := tsdb_errors.NewMulti() for _, checkpoint := range checkpoints { if checkpoint.index >= maxIndex { break } errs.Add(os.RemoveAll(filepath.Join(dir, checkpoint.name))) } return errs.Err() } const checkpointPrefix = "checkpoint." // Checkpoint creates a compacted checkpoint of segments in range [from, to] in the given WAL. // It includes the most recent checkpoint if it exists. // All series not satisfying keep, samples/tombstones/exemplars below mint and // metadata that are not the latest are dropped. // // The checkpoint is stored in a directory named checkpoint.N in the same // segmented format as the original WAL itself. // This makes it easy to read it through the WAL package and concatenate // it with the original WAL. func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.HeadSeriesRef) bool, mint int64) (*CheckpointStats, error) { stats := &CheckpointStats{} var sgmReader io.ReadCloser logger.Info("Creating checkpoint", "from_segment", from, "to_segment", to, "mint", mint) { var sgmRange []SegmentRange dir, idx, err := LastCheckpoint(w.Dir()) if err != nil && !errors.Is(err, record.ErrNotFound) { return nil, fmt.Errorf("find last checkpoint: %w", err) } last := idx + 1 if err == nil { if from > last { return nil, fmt.Errorf("unexpected gap to last checkpoint. expected:%v, requested:%v", last, from) } // Ignore WAL files below the checkpoint. They shouldn't exist to begin with. from = last sgmRange = append(sgmRange, SegmentRange{Dir: dir, Last: math.MaxInt32}) } sgmRange = append(sgmRange, SegmentRange{Dir: w.Dir(), First: from, Last: to}) sgmReader, err = NewSegmentsRangeReader(sgmRange...) if err != nil { return nil, fmt.Errorf("create segment reader: %w", err) } defer sgmReader.Close() } cpdir := checkpointDir(w.Dir(), to) cpdirtmp := cpdir + ".tmp" if err := os.RemoveAll(cpdirtmp); err != nil { return nil, fmt.Errorf("remove previous temporary checkpoint dir: %w", err) } if err := os.MkdirAll(cpdirtmp, 0o777); err != nil { return nil, fmt.Errorf("create checkpoint dir: %w", err) } cp, err := New(nil, nil, cpdirtmp, w.CompressionType()) if err != nil { return nil, fmt.Errorf("open checkpoint: %w", err) } // Ensures that an early return caused by an error doesn't leave any tmp files. defer func() { cp.Close() os.RemoveAll(cpdirtmp) }() r := NewReader(sgmReader) var ( series []record.RefSeries samples []record.RefSample histogramSamples []record.RefHistogramSample floatHistogramSamples []record.RefFloatHistogramSample tstones []tombstones.Stone exemplars []record.RefExemplar metadata []record.RefMetadata st = labels.NewSymbolTable() // Needed for decoding; labels do not outlive this function. dec = record.NewDecoder(st) enc record.Encoder buf []byte recs [][]byte latestMetadataMap = make(map[chunks.HeadSeriesRef]record.RefMetadata) ) for r.Next() { series, samples, histogramSamples, floatHistogramSamples, tstones, exemplars, metadata = series[:0], samples[:0], histogramSamples[:0], floatHistogramSamples[:0], tstones[:0], exemplars[:0], metadata[:0] // We don't reset the buffer since we batch up multiple records // before writing them to the checkpoint. // Remember where the record for this iteration starts. start := len(buf) rec := r.Record() switch dec.Type(rec) { case record.Series: series, err = dec.Series(rec, series) if err != nil { return nil, fmt.Errorf("decode series: %w", err) } // Drop irrelevant series in place. repl := series[:0] for _, s := range series { if keep(s.Ref) { repl = append(repl, s) } } if len(repl) > 0 { buf = enc.Series(repl, buf) } stats.TotalSeries += len(series) stats.DroppedSeries += len(series) - len(repl) case record.Samples: samples, err = dec.Samples(rec, samples) if err != nil { return nil, fmt.Errorf("decode samples: %w", err) } // Drop irrelevant samples in place. repl := samples[:0] for _, s := range samples { if s.T >= mint { repl = append(repl, s) } } if len(repl) > 0 { buf = enc.Samples(repl, buf) } stats.TotalSamples += len(samples) stats.DroppedSamples += len(samples) - len(repl) case record.HistogramSamples: histogramSamples, err = dec.HistogramSamples(rec, histogramSamples) if err != nil { return nil, fmt.Errorf("decode histogram samples: %w", err) } // Drop irrelevant histogramSamples in place. repl := histogramSamples[:0] for _, h := range histogramSamples { if h.T >= mint { repl = append(repl, h) } } if len(repl) > 0 { buf = enc.HistogramSamples(repl, buf) } stats.TotalSamples += len(histogramSamples) stats.DroppedSamples += len(histogramSamples) - len(repl) case record.FloatHistogramSamples: floatHistogramSamples, err = dec.FloatHistogramSamples(rec, floatHistogramSamples) if err != nil { return nil, fmt.Errorf("decode float histogram samples: %w", err) } // Drop irrelevant floatHistogramSamples in place. repl := floatHistogramSamples[:0] for _, fh := range floatHistogramSamples { if fh.T >= mint { repl = append(repl, fh) } } if len(repl) > 0 { buf = enc.FloatHistogramSamples(repl, buf) } stats.TotalSamples += len(floatHistogramSamples) stats.DroppedSamples += len(floatHistogramSamples) - len(repl) case record.Tombstones: tstones, err = dec.Tombstones(rec, tstones) if err != nil { return nil, fmt.Errorf("decode deletes: %w", err) } // Drop irrelevant tombstones in place. repl := tstones[:0] for _, s := range tstones { for _, iv := range s.Intervals { if iv.Maxt >= mint { repl = append(repl, s) break } } } if len(repl) > 0 { buf = enc.Tombstones(repl, buf) } stats.TotalTombstones += len(tstones) stats.DroppedTombstones += len(tstones) - len(repl) case record.Exemplars: exemplars, err = dec.Exemplars(rec, exemplars) if err != nil { return nil, fmt.Errorf("decode exemplars: %w", err) } // Drop irrelevant exemplars in place. repl := exemplars[:0] for _, e := range exemplars { if e.T >= mint { repl = append(repl, e) } } if len(repl) > 0 { buf = enc.Exemplars(repl, buf) } stats.TotalExemplars += len(exemplars) stats.DroppedExemplars += len(exemplars) - len(repl) case record.Metadata: metadata, err := dec.Metadata(rec, metadata) if err != nil { return nil, fmt.Errorf("decode metadata: %w", err) } // Only keep reference to the latest found metadata for each refID. repl := 0 for _, m := range metadata { if keep(m.Ref) { if _, ok := latestMetadataMap[m.Ref]; !ok { repl++ } latestMetadataMap[m.Ref] = m } } stats.TotalMetadata += len(metadata) stats.DroppedMetadata += len(metadata) - repl default: // Unknown record type, probably from a future Prometheus version. continue } if len(buf[start:]) == 0 { continue // All contents discarded. } recs = append(recs, buf[start:]) // Flush records in 1 MB increments. if len(buf) > 1*1024*1024 { if err := cp.Log(recs...); err != nil { return nil, fmt.Errorf("flush records: %w", err) } buf, recs = buf[:0], recs[:0] } } // If we hit any corruption during checkpointing, repairing is not an option. // The head won't know which series records are lost. if r.Err() != nil { return nil, fmt.Errorf("read segments: %w", r.Err()) } // Flush remaining records. if err := cp.Log(recs...); err != nil { return nil, fmt.Errorf("flush records: %w", err) } // Flush latest metadata records for each series. if len(latestMetadataMap) > 0 { latestMetadata := make([]record.RefMetadata, 0, len(latestMetadataMap)) for _, m := range latestMetadataMap { latestMetadata = append(latestMetadata, m) } if err := cp.Log(enc.Metadata(latestMetadata, buf[:0])); err != nil { return nil, fmt.Errorf("flush metadata records: %w", err) } } if err := cp.Close(); err != nil { return nil, fmt.Errorf("close checkpoint: %w", err) } // Sync temporary directory before rename. df, err := fileutil.OpenDir(cpdirtmp) if err != nil { return nil, fmt.Errorf("open temporary checkpoint directory: %w", err) } if err := df.Sync(); err != nil { df.Close() return nil, fmt.Errorf("sync temporary checkpoint directory: %w", err) } if err = df.Close(); err != nil { return nil, fmt.Errorf("close temporary checkpoint directory: %w", err) } if err := fileutil.Replace(cpdirtmp, cpdir); err != nil { return nil, fmt.Errorf("rename checkpoint directory: %w", err) } return stats, nil } func checkpointDir(dir string, i int) string { return filepath.Join(dir, fmt.Sprintf(checkpointPrefix+"%08d", i)) } type checkpointRef struct { name string index int } func listCheckpoints(dir string) (refs []checkpointRef, err error) { files, err := os.ReadDir(dir) if err != nil { return nil, err } for i := 0; i < len(files); i++ { fi := files[i] if !strings.HasPrefix(fi.Name(), checkpointPrefix) { continue } if !fi.IsDir() { return nil, fmt.Errorf("checkpoint %s is not a directory", fi.Name()) } idx, err := strconv.Atoi(fi.Name()[len(checkpointPrefix):]) if err != nil { continue } refs = append(refs, checkpointRef{name: fi.Name(), index: idx}) } slices.SortFunc(refs, func(a, b checkpointRef) int { return a.index - b.index }) return refs, nil }