mirror of https://github.com/prometheus/prometheus
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
699 lines
16 KiB
699 lines
16 KiB
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
package tsdb
|
|
|
|
import (
|
|
"fmt"
|
|
"math/rand"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"time"
|
|
|
|
"github.com/coreos/etcd/pkg/fileutil"
|
|
"github.com/go-kit/kit/log"
|
|
"github.com/oklog/ulid"
|
|
"github.com/pkg/errors"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/tsdb/chunks"
|
|
"github.com/prometheus/tsdb/labels"
|
|
)
|
|
|
|
// ExponentialBlockRanges returns steps time ranges that grow exponentially:
// the first range is minSize and every following range is the previous one
// multiplied by stepSize.
//
// A non-positive steps yields an empty, non-nil slice. The multiplication is
// plain int64 arithmetic, so sufficiently large inputs wrap around.
func ExponentialBlockRanges(minSize int64, steps, stepSize int) []int64 {
	// A negative capacity would make the allocation below panic.
	if steps <= 0 {
		return []int64{}
	}

	ranges := make([]int64, 0, steps)
	curRange := minSize
	for i := 0; i < steps; i++ {
		ranges = append(ranges, curRange)
		curRange *= int64(stepSize)
	}

	return ranges
}
|
|
|
|
// Compactor provides compaction against an underlying storage
|
|
// of time series data.
|
|
type Compactor interface {
|
|
// Plan returns a set of non-overlapping directories that can
|
|
// be compacted concurrently.
|
|
// Results returned when compactions are in progress are undefined.
|
|
Plan() ([][]string, error)
|
|
|
|
// Write persists a Block into a directory.
|
|
Write(b Block) error
|
|
|
|
// Compact runs compaction against the provided directories. Must
|
|
// only be called concurrently with results of Plan().
|
|
Compact(dirs ...string) error
|
|
}
|
|
|
|
// compactor implements the Compactor interface.
|
|
type compactor struct {
|
|
dir string
|
|
metrics *compactorMetrics
|
|
logger log.Logger
|
|
opts *compactorOptions
|
|
}
|
|
|
|
type compactorMetrics struct {
|
|
ran prometheus.Counter
|
|
failed prometheus.Counter
|
|
duration prometheus.Histogram
|
|
}
|
|
|
|
func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
|
|
m := &compactorMetrics{}
|
|
|
|
m.ran = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "tsdb_compactions_total",
|
|
Help: "Total number of compactions that were executed for the partition.",
|
|
})
|
|
m.failed = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "tsdb_compactions_failed_total",
|
|
Help: "Total number of compactions that failed for the partition.",
|
|
})
|
|
m.duration = prometheus.NewSummary(prometheus.SummaryOpts{
|
|
Name: "tsdb_compaction_duration",
|
|
Help: "Duration of compaction runs.",
|
|
})
|
|
|
|
if r != nil {
|
|
r.MustRegister(
|
|
m.ran,
|
|
m.failed,
|
|
m.duration,
|
|
)
|
|
}
|
|
return m
|
|
}
|
|
|
|
// compactorOptions carries the tunables of a compactor.
type compactorOptions struct {
	// blockRanges lists the time ranges used to group blocks when
	// selecting directories to compact; the last entry is treated as the
	// largest interval.
	blockRanges []int64
}
|
|
|
|
func newCompactor(dir string, r prometheus.Registerer, l log.Logger, opts *compactorOptions) *compactor {
|
|
return &compactor{
|
|
dir: dir,
|
|
opts: opts,
|
|
logger: l,
|
|
metrics: newCompactorMetrics(r),
|
|
}
|
|
}
|
|
|
|
// compactionInfo is not referenced in the visible part of this file.
// NOTE(review): presumably a summary of one compaction (sequence number,
// generation and covered time range) — confirm against its users.
type compactionInfo struct {
	seq        int
	generation int
	mint       int64
	maxt       int64
}
|
|
|
|
// compactionBlocksLen is not referenced in the visible part of this file.
// NOTE(review): presumably the number of blocks merged per compaction —
// confirm against its users.
const compactionBlocksLen = 3
|
|
|
|
type dirMeta struct {
|
|
dir string
|
|
meta *BlockMeta
|
|
}
|
|
|
|
func (c *compactor) Plan() ([][]string, error) {
|
|
dirs, err := blockDirs(c.dir)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var dms []dirMeta
|
|
|
|
for _, dir := range dirs {
|
|
meta, err := readMetaFile(dir)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if meta.Compaction.Generation > 0 {
|
|
dms = append(dms, dirMeta{dir, meta})
|
|
}
|
|
}
|
|
sort.Slice(dms, func(i, j int) bool {
|
|
return dms[i].meta.MinTime < dms[j].meta.MinTime
|
|
})
|
|
|
|
if len(dms) <= 1 {
|
|
return nil, nil
|
|
}
|
|
|
|
sliceDirs := func(dms []dirMeta) [][]string {
|
|
if len(dms) == 0 {
|
|
return nil
|
|
}
|
|
var res []string
|
|
for _, dm := range dms {
|
|
res = append(res, dm.dir)
|
|
}
|
|
return [][]string{res}
|
|
}
|
|
|
|
planDirs := sliceDirs(c.selectDirs(dms))
|
|
if len(dirs) > 1 {
|
|
return planDirs, nil
|
|
}
|
|
|
|
// Compact any blocks that have >5% tombstones.
|
|
for i := len(dms) - 1; i >= 0; i-- {
|
|
meta := dms[i].meta
|
|
if meta.MaxTime-meta.MinTime < c.opts.blockRanges[len(c.opts.blockRanges)/2] {
|
|
break
|
|
}
|
|
|
|
if meta.Stats.NumSeries/meta.Stats.NumTombstones <= 20 { // 5%
|
|
return [][]string{{dms[i].dir}}, nil
|
|
}
|
|
}
|
|
|
|
return nil, nil
|
|
}
|
|
|
|
func (c *compactor) selectDirs(ds []dirMeta) []dirMeta {
|
|
// The way to skip compaction is to not have blockRanges.
|
|
if len(c.opts.blockRanges) == 1 {
|
|
return nil
|
|
}
|
|
|
|
return selectRecurse(ds, c.opts.blockRanges)
|
|
}
|
|
|
|
func selectRecurse(dms []dirMeta, intervals []int64) []dirMeta {
|
|
if len(intervals) == 0 {
|
|
return dms
|
|
}
|
|
|
|
// Get the blocks by the max interval
|
|
blocks := splitByRange(dms, intervals[len(intervals)-1])
|
|
dirs := []dirMeta{}
|
|
for i := len(blocks) - 1; i >= 0; i-- {
|
|
// We need to choose the oldest blocks to compact. If there are a couple of blocks in
|
|
// the largest interval, we should compact those first.
|
|
if len(blocks[i]) > 1 {
|
|
dirs = blocks[i]
|
|
break
|
|
}
|
|
}
|
|
|
|
// If there are too many blocks, see if a smaller interval will catch them.
|
|
// i.e, if we have 0-20, 60-80, 80-100; all fall under 0-240, but we'd rather compact 60-100
|
|
// than all at once.
|
|
// Again if have 0-1d, 1d-2d, 3-6d we compact 0-1d, 1d-2d to compact it into the 0-3d block instead of compacting all three
|
|
// This is to honor the boundaries as much as possible.
|
|
if len(dirs) > 2 {
|
|
smallerDirs := selectRecurse(dirs, intervals[:len(intervals)-1])
|
|
if len(smallerDirs) > 1 {
|
|
return smallerDirs
|
|
}
|
|
}
|
|
|
|
return dirs
|
|
}
|
|
|
|
// splitByRange splits the directories by the time range.
|
|
// for example if we have blocks 0-10, 10-20, 50-60, 90-100 and want to split them into 30 interval ranges
|
|
// splitByRange returns [0-10, 10-20], [50-60], [90-100].
|
|
func splitByRange(ds []dirMeta, tr int64) [][]dirMeta {
|
|
var splitDirs [][]dirMeta
|
|
|
|
for i := 0; i < len(ds); {
|
|
var group []dirMeta
|
|
var t0 int64
|
|
m := ds[i].meta
|
|
// Compute start of aligned time range of size tr closest to the current block's start.
|
|
if m.MinTime >= 0 {
|
|
t0 = tr * (m.MinTime / tr)
|
|
} else {
|
|
t0 = tr * ((m.MinTime - tr + 1) / tr)
|
|
}
|
|
// Skip blocks that don't fall into the range. This can happen via mis-alignment or
|
|
// by being the multiple of the intended range.
|
|
if ds[i].meta.MinTime < t0 || ds[i].meta.MaxTime > t0+tr {
|
|
i++
|
|
continue
|
|
}
|
|
|
|
// Add all dirs to the current group that are within [t0, t0+tr].
|
|
for ; i < len(ds); i++ {
|
|
// Either the block falls into the next range or doesn't fit at all (checked above).
|
|
if ds[i].meta.MinTime < t0 || ds[i].meta.MaxTime > t0+tr {
|
|
break
|
|
}
|
|
group = append(group, ds[i])
|
|
}
|
|
|
|
if len(group) > 0 {
|
|
splitDirs = append(splitDirs, group)
|
|
}
|
|
}
|
|
|
|
return splitDirs
|
|
}
|
|
|
|
func compactBlockMetas(blocks ...BlockMeta) (res BlockMeta) {
|
|
res.MinTime = blocks[0].MinTime
|
|
res.MaxTime = blocks[len(blocks)-1].MaxTime
|
|
|
|
sources := map[ulid.ULID]struct{}{}
|
|
|
|
for _, b := range blocks {
|
|
if b.Compaction.Generation > res.Compaction.Generation {
|
|
res.Compaction.Generation = b.Compaction.Generation
|
|
}
|
|
for _, s := range b.Compaction.Sources {
|
|
sources[s] = struct{}{}
|
|
}
|
|
// If it's an in memory block, its ULID goes into the sources.
|
|
if b.Compaction.Generation == 0 {
|
|
sources[b.ULID] = struct{}{}
|
|
}
|
|
}
|
|
res.Compaction.Generation++
|
|
|
|
for s := range sources {
|
|
res.Compaction.Sources = append(res.Compaction.Sources, s)
|
|
}
|
|
sort.Slice(res.Compaction.Sources, func(i, j int) bool {
|
|
return res.Compaction.Sources[i].Compare(res.Compaction.Sources[j]) < 0
|
|
})
|
|
|
|
return res
|
|
}
|
|
|
|
func (c *compactor) Compact(dirs ...string) (err error) {
|
|
var blocks []Block
|
|
|
|
for _, d := range dirs {
|
|
b, err := newPersistedBlock(d)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer b.Close()
|
|
|
|
blocks = append(blocks, b)
|
|
}
|
|
|
|
entropy := rand.New(rand.NewSource(time.Now().UnixNano()))
|
|
uid := ulid.MustNew(ulid.Now(), entropy)
|
|
|
|
return c.write(uid, blocks...)
|
|
}
|
|
|
|
func (c *compactor) Write(b Block) error {
|
|
// Buffering blocks might have been created that often have no data.
|
|
if b.Meta().Stats.NumSeries == 0 {
|
|
return errors.Wrap(os.RemoveAll(b.Dir()), "remove empty block")
|
|
}
|
|
|
|
entropy := rand.New(rand.NewSource(time.Now().UnixNano()))
|
|
uid := ulid.MustNew(ulid.Now(), entropy)
|
|
|
|
return c.write(uid, b)
|
|
}
|
|
|
|
// write creates a new block that is the union of the provided blocks into dir.
|
|
// It cleans up all files of the old blocks after completing successfully.
|
|
func (c *compactor) write(uid ulid.ULID, blocks ...Block) (err error) {
|
|
c.logger.Log("msg", "compact blocks", "blocks", fmt.Sprintf("%v", blocks))
|
|
|
|
defer func(t time.Time) {
|
|
if err != nil {
|
|
c.metrics.failed.Inc()
|
|
}
|
|
c.metrics.ran.Inc()
|
|
c.metrics.duration.Observe(time.Since(t).Seconds())
|
|
}(time.Now())
|
|
|
|
dir := filepath.Join(c.dir, uid.String())
|
|
tmp := dir + ".tmp"
|
|
|
|
if err = os.RemoveAll(tmp); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err = os.MkdirAll(tmp, 0777); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Populate chunk and index files into temporary directory with
|
|
// data of all blocks.
|
|
chunkw, err := newChunkWriter(chunkDir(tmp))
|
|
if err != nil {
|
|
return errors.Wrap(err, "open chunk writer")
|
|
}
|
|
indexw, err := newIndexWriter(tmp)
|
|
if err != nil {
|
|
return errors.Wrap(err, "open index writer")
|
|
}
|
|
|
|
meta, err := populateBlock(blocks, indexw, chunkw)
|
|
if err != nil {
|
|
return errors.Wrap(err, "write compaction")
|
|
}
|
|
meta.ULID = uid
|
|
|
|
if err = writeMetaFile(tmp, meta); err != nil {
|
|
return errors.Wrap(err, "write merged meta")
|
|
}
|
|
|
|
if err = chunkw.Close(); err != nil {
|
|
return errors.Wrap(err, "close chunk writer")
|
|
}
|
|
if err = indexw.Close(); err != nil {
|
|
return errors.Wrap(err, "close index writer")
|
|
}
|
|
|
|
// Create an empty tombstones file.
|
|
if err := writeTombstoneFile(tmp, newEmptyTombstoneReader()); err != nil {
|
|
return errors.Wrap(err, "write new tombstones file")
|
|
}
|
|
|
|
// Block successfully written, make visible and remove old ones.
|
|
if err := renameFile(tmp, dir); err != nil {
|
|
return errors.Wrap(err, "rename block dir")
|
|
}
|
|
for _, b := range blocks {
|
|
if err := os.RemoveAll(b.Dir()); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
// Properly sync parent dir to ensure changes are visible.
|
|
df, err := fileutil.OpenDir(dir)
|
|
if err != nil {
|
|
return errors.Wrap(err, "sync block dir")
|
|
}
|
|
defer df.Close()
|
|
|
|
if err := fileutil.Fsync(df); err != nil {
|
|
return errors.Wrap(err, "sync block dir")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// populateBlock fills the index and chunk writers with new data gathered as the union
|
|
// of the provided blocks. It returns meta information for the new block.
|
|
func populateBlock(blocks []Block, indexw IndexWriter, chunkw ChunkWriter) (*BlockMeta, error) {
|
|
var set compactionSet
|
|
var metas []BlockMeta
|
|
|
|
for i, b := range blocks {
|
|
metas = append(metas, b.Meta())
|
|
|
|
all, err := b.Index().Postings("", "")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
s := newCompactionSeriesSet(b.Index(), b.Chunks(), b.Tombstones(), all)
|
|
|
|
if i == 0 {
|
|
set = s
|
|
continue
|
|
}
|
|
set, err = newCompactionMerger(set, s)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// We fully rebuild the postings list index from merged series.
|
|
var (
|
|
postings = &memPostings{m: make(map[term][]uint32, 512)}
|
|
values = map[string]stringset{}
|
|
i = uint32(0)
|
|
meta = compactBlockMetas(metas...)
|
|
)
|
|
|
|
for set.Next() {
|
|
lset, chks, dranges := set.At() // The chunks here are not fully deleted.
|
|
|
|
// Skip the series with all deleted chunks.
|
|
if len(chks) == 0 {
|
|
continue
|
|
}
|
|
|
|
if len(dranges) > 0 {
|
|
// Re-encode the chunk to not have deleted values.
|
|
for _, chk := range chks {
|
|
if intervalOverlap(dranges[0].mint, dranges[len(dranges)-1].maxt, chk.MinTime, chk.MaxTime) {
|
|
newChunk := chunks.NewXORChunk()
|
|
app, err := newChunk.Appender()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
it := &deletedIterator{it: chk.Chunk.Iterator(), intervals: dranges}
|
|
for it.Next() {
|
|
ts, v := it.At()
|
|
app.Append(ts, v)
|
|
}
|
|
|
|
chk.Chunk = newChunk
|
|
}
|
|
}
|
|
}
|
|
|
|
if err := chunkw.WriteChunks(chks...); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
indexw.AddSeries(i, lset, chks...)
|
|
|
|
meta.Stats.NumChunks += uint64(len(chks))
|
|
meta.Stats.NumSeries++
|
|
for _, chk := range chks {
|
|
meta.Stats.NumSamples += uint64(chk.Chunk.NumSamples())
|
|
}
|
|
|
|
for _, l := range lset {
|
|
valset, ok := values[l.Name]
|
|
if !ok {
|
|
valset = stringset{}
|
|
values[l.Name] = valset
|
|
}
|
|
valset.set(l.Value)
|
|
|
|
postings.add(i, term{name: l.Name, value: l.Value})
|
|
}
|
|
i++
|
|
}
|
|
if set.Err() != nil {
|
|
return nil, set.Err()
|
|
}
|
|
|
|
s := make([]string, 0, 256)
|
|
for n, v := range values {
|
|
s = s[:0]
|
|
|
|
for x := range v {
|
|
s = append(s, x)
|
|
}
|
|
if err := indexw.WriteLabelIndex([]string{n}, s); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
for t := range postings.m {
|
|
if err := indexw.WritePostings(t.name, t.value, postings.get(t)); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
// Write a postings list containing all series.
|
|
all := make([]uint32, i)
|
|
for i := range all {
|
|
all[i] = uint32(i)
|
|
}
|
|
if err := indexw.WritePostings("", "", newListPostings(all)); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &meta, nil
|
|
}
|
|
|
|
type compactionSet interface {
|
|
Next() bool
|
|
At() (labels.Labels, []*ChunkMeta, intervals)
|
|
Err() error
|
|
}
|
|
|
|
type compactionSeriesSet struct {
|
|
p Postings
|
|
index IndexReader
|
|
chunks ChunkReader
|
|
tombstones TombstoneReader
|
|
|
|
l labels.Labels
|
|
c []*ChunkMeta
|
|
intervals intervals
|
|
err error
|
|
}
|
|
|
|
func newCompactionSeriesSet(i IndexReader, c ChunkReader, t TombstoneReader, p Postings) *compactionSeriesSet {
|
|
return &compactionSeriesSet{
|
|
index: i,
|
|
chunks: c,
|
|
tombstones: t,
|
|
p: p,
|
|
}
|
|
}
|
|
|
|
func (c *compactionSeriesSet) Next() bool {
|
|
if !c.p.Next() {
|
|
return false
|
|
}
|
|
|
|
c.intervals = c.tombstones.Get(c.p.At())
|
|
|
|
c.l, c.c, c.err = c.index.Series(c.p.At())
|
|
if c.err != nil {
|
|
return false
|
|
}
|
|
|
|
// Remove completely deleted chunks.
|
|
if len(c.intervals) > 0 {
|
|
chks := make([]*ChunkMeta, 0, len(c.c))
|
|
for _, chk := range c.c {
|
|
if !(interval{chk.MinTime, chk.MaxTime}.isSubrange(c.intervals)) {
|
|
chks = append(chks, chk)
|
|
}
|
|
}
|
|
|
|
c.c = chks
|
|
}
|
|
|
|
for _, chk := range c.c {
|
|
chk.Chunk, c.err = c.chunks.Chunk(chk.Ref)
|
|
if c.err != nil {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func (c *compactionSeriesSet) Err() error {
|
|
if c.err != nil {
|
|
return c.err
|
|
}
|
|
return c.p.Err()
|
|
}
|
|
|
|
func (c *compactionSeriesSet) At() (labels.Labels, []*ChunkMeta, intervals) {
|
|
return c.l, c.c, c.intervals
|
|
}
|
|
|
|
type compactionMerger struct {
|
|
a, b compactionSet
|
|
|
|
aok, bok bool
|
|
l labels.Labels
|
|
c []*ChunkMeta
|
|
intervals intervals
|
|
}
|
|
|
|
type compactionSeries struct {
|
|
labels labels.Labels
|
|
chunks []*ChunkMeta
|
|
}
|
|
|
|
func newCompactionMerger(a, b compactionSet) (*compactionMerger, error) {
|
|
c := &compactionMerger{
|
|
a: a,
|
|
b: b,
|
|
}
|
|
// Initialize first elements of both sets as Next() needs
|
|
// one element look-ahead.
|
|
c.aok = c.a.Next()
|
|
c.bok = c.b.Next()
|
|
|
|
return c, c.Err()
|
|
}
|
|
|
|
func (c *compactionMerger) compare() int {
|
|
if !c.aok {
|
|
return 1
|
|
}
|
|
if !c.bok {
|
|
return -1
|
|
}
|
|
a, _, _ := c.a.At()
|
|
b, _, _ := c.b.At()
|
|
return labels.Compare(a, b)
|
|
}
|
|
|
|
func (c *compactionMerger) Next() bool {
|
|
if !c.aok && !c.bok || c.Err() != nil {
|
|
return false
|
|
}
|
|
|
|
d := c.compare()
|
|
// Both sets contain the current series. Chain them into a single one.
|
|
if d > 0 {
|
|
c.l, c.c, c.intervals = c.b.At()
|
|
c.bok = c.b.Next()
|
|
} else if d < 0 {
|
|
c.l, c.c, c.intervals = c.a.At()
|
|
c.aok = c.a.Next()
|
|
} else {
|
|
l, ca, ra := c.a.At()
|
|
_, cb, rb := c.b.At()
|
|
for _, r := range rb {
|
|
ra = ra.add(r)
|
|
}
|
|
|
|
c.l = l
|
|
c.c = append(ca, cb...)
|
|
c.intervals = ra
|
|
|
|
c.aok = c.a.Next()
|
|
c.bok = c.b.Next()
|
|
}
|
|
return true
|
|
}
|
|
|
|
func (c *compactionMerger) Err() error {
|
|
if c.a.Err() != nil {
|
|
return c.a.Err()
|
|
}
|
|
return c.b.Err()
|
|
}
|
|
|
|
func (c *compactionMerger) At() (labels.Labels, []*ChunkMeta, intervals) {
|
|
return c.l, c.c, c.intervals
|
|
}
|
|
|
|
func renameFile(from, to string) error {
|
|
if err := os.RemoveAll(to); err != nil {
|
|
return err
|
|
}
|
|
if err := os.Rename(from, to); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Directory was renamed; sync parent dir to persist rename.
|
|
pdir, err := fileutil.OpenDir(filepath.Dir(to))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer pdir.Close()
|
|
|
|
if err = fileutil.Fsync(pdir); err != nil {
|
|
return err
|
|
}
|
|
if err = pdir.Close(); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|