The Prometheus monitoring system and time series database.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

622 lines
19 KiB

// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"context"
"errors"
"fmt"
"math"
"slices"
"sync"
"github.com/go-kit/log/level"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/index"
)
func (h *Head) ExemplarQuerier(ctx context.Context) (storage.ExemplarQuerier, error) {
return h.exemplars.ExemplarQuerier(ctx)
}
// Index returns an IndexReader against the block.
func (h *Head) Index() (IndexReader, error) {
return h.indexRange(math.MinInt64, math.MaxInt64), nil
}
func (h *Head) indexRange(mint, maxt int64) *headIndexReader {
if hmin := h.MinTime(); hmin > mint {
mint = hmin
}
return &headIndexReader{head: h, mint: mint, maxt: maxt}
}
type headIndexReader struct {
head *Head
mint, maxt int64
}
func (h *headIndexReader) Close() error {
return nil
}
func (h *headIndexReader) Symbols() index.StringIter {
return h.head.postings.Symbols()
}
// SortedLabelValues returns label values present in the head for the
// specific label name that are within the time range mint to maxt.
// If matchers are specified the returned result set is reduced
// to label values of metrics matching the matchers.
func (h *headIndexReader) SortedLabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
values, err := h.LabelValues(ctx, name, matchers...)
if err == nil {
slices.Sort(values)
}
return values, err
}
// LabelValues returns label values present in the head for the
// specific label name that are within the time range mint to maxt.
// If matchers are specified the returned result set is reduced
// to label values of metrics matching the matchers.
func (h *headIndexReader) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
if h.maxt < h.head.MinTime() || h.mint > h.head.MaxTime() {
return []string{}, nil
}
if len(matchers) == 0 {
return h.head.postings.LabelValues(ctx, name), nil
}
return labelValuesWithMatchers(ctx, h, name, matchers...)
}
// LabelNames returns all the unique label names present in the head
// that are within the time range mint to maxt.
func (h *headIndexReader) LabelNames(ctx context.Context, matchers ...*labels.Matcher) ([]string, error) {
if h.maxt < h.head.MinTime() || h.mint > h.head.MaxTime() {
return []string{}, nil
}
if len(matchers) == 0 {
labelNames := h.head.postings.LabelNames()
slices.Sort(labelNames)
return labelNames, nil
}
return labelNamesWithMatchers(ctx, h, matchers...)
}
// Postings returns the postings list iterator for the label pairs.
func (h *headIndexReader) Postings(ctx context.Context, name string, values ...string) (index.Postings, error) {
Label values with matchers by intersecting postings (#9907) * LabelValues w/matchers by intersecting postings Instead of iterating all matched series to find the values, this checks if each one of the label values is present in the matched series (postings). Pending to be benchmarked. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Benchmark labelValuesWithMatchers name old time/op new time/op Querier/Head/labelValuesWithMatchers/i_with_n="1" 157ms ± 0% 48ms ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 1.80s ± 0% 0.46s ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 144ms ± 0% 57ms ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 304ms ± 0% 111ms ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 761ms ± 0% 164ms ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 6.11µs ± 0% 6.62µs ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 117ms ± 0% 62ms ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 1.44s ± 0% 0.24s ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 92.1ms ± 0% 70.3ms ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 196ms ± 0% 115ms ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 1.23s ± 0% 0.21s ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 1.06ms ± 0% 0.88ms ± 0% name old alloc/op new alloc/op Querier/Head/labelValuesWithMatchers/i_with_n="1" 29.5MB ± 0% 26.9MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 46.8MB ± 0% 251.5MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 29.5MB ± 0% 22.3MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 46.8MB ± 0% 23.9MB ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 10.3kB ± 0% 138535.2kB ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 5.54kB ± 0% 7.09kB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 39.1MB ± 0% 28.5MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 287MB ± 0% 253MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 34.3MB ± 0% 23.9MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 51.6MB ± 0% 25.5MB ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 144MB ± 0% 139MB ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 6.43kB ± 0% 8.66kB ± 0% name old allocs/op new allocs/op Querier/Head/labelValuesWithMatchers/i_with_n="1" 104k ± 0% 500k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 204k ± 0% 600k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 104k ± 0% 500k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 204k ± 0% 500k ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 66.0 ± 0% 255.0 ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 61.0 ± 0% 205.0 ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 304k ± 0% 600k ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 5.20M ± 0% 0.70M ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 204k ± 0% 600k ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 304k ± 0% 600k ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 3.00M ± 0% 0.00M ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 61.0 ± 0% 247.0 ± 0% Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Don't expand postings to intersect them Using a min heap we can check whether matched postings intersect with each one of the label values postings. This avoid expanding postings (and thus having all of them in memory at any point). Slightly slower than the expanding postings version for some cases, but definitely pays the price once the cardinality grows. Still offers 10x latency improvement where previous latencies were reaching 1s. Benchmark results: name \ time/op old.txt intersect.txt intersect_noexpand.txt Querier/Head/labelValuesWithMatchers/i_with_n="1" 157ms ± 0% 48ms ± 0% 110ms ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 1.80s ± 0% 0.46s ± 0% 0.18s ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 144ms ± 0% 57ms ± 0% 125ms ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 304ms ± 0% 111ms ± 0% 177ms ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 761ms ± 0% 164ms ± 0% 134ms ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 6.11µs ± 0% 6.62µs ± 0% 4.29µs ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 117ms ± 0% 62ms ± 0% 120ms ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 1.44s ± 0% 0.24s ± 0% 0.15s ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 92.1ms ± 0% 70.3ms ± 0% 125.4ms ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 196ms ± 0% 115ms ± 0% 170ms ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 1.23s ± 0% 0.21s ± 0% 0.14s ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 1.06ms ± 0% 0.88ms ± 0% 0.92ms ± 0% name \ alloc/op old.txt intersect.txt intersect_noexpand.txt Querier/Head/labelValuesWithMatchers/i_with_n="1" 29.5MB ± 0% 26.9MB ± 0% 19.1MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 46.8MB ± 0% 251.5MB ± 0% 36.3MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 29.5MB ± 0% 22.3MB ± 0% 19.1MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 46.8MB ± 0% 23.9MB ± 0% 20.7MB ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 10.3kB ± 0% 138535.2kB ± 0% 6.4kB ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 5.54kB ± 0% 7.09kB ± 0% 4.30kB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 39.1MB ± 0% 28.5MB ± 0% 20.7MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 287MB ± 0% 253MB ± 0% 38MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 34.3MB ± 0% 23.9MB ± 0% 20.7MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 51.6MB ± 0% 25.5MB ± 0% 22.3MB ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 144MB ± 0% 139MB ± 0% 0MB ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 6.43kB ± 0% 8.66kB ± 0% 5.86kB ± 0% name \ allocs/op old.txt intersect.txt intersect_noexpand.txt Querier/Head/labelValuesWithMatchers/i_with_n="1" 104k ± 0% 500k ± 0% 300k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 204k ± 0% 600k ± 0% 400k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 104k ± 0% 500k ± 0% 300k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 204k ± 0% 500k ± 0% 300k ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 66.0 ± 0% 255.0 ± 0% 139.0 ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 61.0 ± 0% 205.0 ± 0% 87.0 ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 304k ± 0% 600k ± 0% 400k ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 5.20M ± 0% 0.70M ± 0% 0.50M ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 204k ± 0% 600k ± 0% 400k ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 304k ± 0% 600k ± 0% 400k ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 3.00M ± 0% 0.00M ± 0% 0.00M ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 61.0 ± 0% 247.0 ± 0% 129.0 ± 0% Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Apply comment suggestions from the code review Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> * Change else { if } to else if Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Remove sorting of label values We were not sorting them before, so no need to sort them now Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
3 years ago
switch len(values) {
case 0:
return index.EmptyPostings(), nil
case 1:
return h.head.postings.Get(name, values[0]), nil
default:
res := make([]index.Postings, 0, len(values))
for _, value := range values {
if p := h.head.postings.Get(name, value); !index.IsEmptyPostingsType(p) {
res = append(res, p)
}
Label values with matchers by intersecting postings (#9907) * LabelValues w/matchers by intersecting postings Instead of iterating all matched series to find the values, this checks if each one of the label values is present in the matched series (postings). Pending to be benchmarked. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Benchmark labelValuesWithMatchers name old time/op new time/op Querier/Head/labelValuesWithMatchers/i_with_n="1" 157ms ± 0% 48ms ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 1.80s ± 0% 0.46s ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 144ms ± 0% 57ms ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 304ms ± 0% 111ms ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 761ms ± 0% 164ms ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 6.11µs ± 0% 6.62µs ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 117ms ± 0% 62ms ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 1.44s ± 0% 0.24s ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 92.1ms ± 0% 70.3ms ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 196ms ± 0% 115ms ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 1.23s ± 0% 0.21s ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 1.06ms ± 0% 0.88ms ± 0% name old alloc/op new alloc/op Querier/Head/labelValuesWithMatchers/i_with_n="1" 29.5MB ± 0% 26.9MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 46.8MB ± 0% 251.5MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 29.5MB ± 0% 22.3MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 46.8MB ± 0% 23.9MB ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 10.3kB ± 0% 138535.2kB ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 5.54kB ± 0% 7.09kB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 39.1MB ± 0% 28.5MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 287MB ± 0% 253MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 34.3MB ± 0% 23.9MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 51.6MB ± 0% 25.5MB ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 144MB ± 0% 139MB ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 6.43kB ± 0% 8.66kB ± 0% name old allocs/op new allocs/op Querier/Head/labelValuesWithMatchers/i_with_n="1" 104k ± 0% 500k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 204k ± 0% 600k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 104k ± 0% 500k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 204k ± 0% 500k ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 66.0 ± 0% 255.0 ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 61.0 ± 0% 205.0 ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 304k ± 0% 600k ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 5.20M ± 0% 0.70M ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 204k ± 0% 600k ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 304k ± 0% 600k ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 3.00M ± 0% 0.00M ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 61.0 ± 0% 247.0 ± 0% Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Don't expand postings to intersect them Using a min heap we can check whether matched postings intersect with each one of the label values postings. This avoid expanding postings (and thus having all of them in memory at any point). Slightly slower than the expanding postings version for some cases, but definitely pays the price once the cardinality grows. Still offers 10x latency improvement where previous latencies were reaching 1s. Benchmark results: name \ time/op old.txt intersect.txt intersect_noexpand.txt Querier/Head/labelValuesWithMatchers/i_with_n="1" 157ms ± 0% 48ms ± 0% 110ms ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 1.80s ± 0% 0.46s ± 0% 0.18s ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 144ms ± 0% 57ms ± 0% 125ms ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 304ms ± 0% 111ms ± 0% 177ms ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 761ms ± 0% 164ms ± 0% 134ms ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 6.11µs ± 0% 6.62µs ± 0% 4.29µs ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 117ms ± 0% 62ms ± 0% 120ms ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 1.44s ± 0% 0.24s ± 0% 0.15s ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 92.1ms ± 0% 70.3ms ± 0% 125.4ms ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 196ms ± 0% 115ms ± 0% 170ms ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 1.23s ± 0% 0.21s ± 0% 0.14s ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 1.06ms ± 0% 0.88ms ± 0% 0.92ms ± 0% name \ alloc/op old.txt intersect.txt intersect_noexpand.txt Querier/Head/labelValuesWithMatchers/i_with_n="1" 29.5MB ± 0% 26.9MB ± 0% 19.1MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 46.8MB ± 0% 251.5MB ± 0% 36.3MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 29.5MB ± 0% 22.3MB ± 0% 19.1MB ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 46.8MB ± 0% 23.9MB ± 0% 20.7MB ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 10.3kB ± 0% 138535.2kB ± 0% 6.4kB ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 5.54kB ± 0% 7.09kB ± 0% 4.30kB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 39.1MB ± 0% 28.5MB ± 0% 20.7MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 287MB ± 0% 253MB ± 0% 38MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 34.3MB ± 0% 23.9MB ± 0% 20.7MB ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 51.6MB ± 0% 25.5MB ± 0% 22.3MB ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 144MB ± 0% 139MB ± 0% 0MB ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 6.43kB ± 0% 8.66kB ± 0% 5.86kB ± 0% name \ allocs/op old.txt intersect.txt intersect_noexpand.txt Querier/Head/labelValuesWithMatchers/i_with_n="1" 104k ± 0% 500k ± 0% 300k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="^.+$" 204k ± 0% 600k ± 0% 400k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",j!="foo" 104k ± 0% 500k ± 0% 300k ± 0% Querier/Head/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 204k ± 0% 500k ± 0% 300k ± 0% Querier/Head/labelValuesWithMatchers/n_with_j!="foo" 66.0 ± 0% 255.0 ± 0% 139.0 ± 0% Querier/Head/labelValuesWithMatchers/n_with_i="1" 61.0 ± 0% 205.0 ± 0% 87.0 ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1" 304k ± 0% 600k ± 0% 400k ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="^.+$" 5.20M ± 0% 0.70M ± 0% 0.50M ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",j!="foo" 204k ± 0% 600k ± 0% 400k ± 0% Querier/Block/labelValuesWithMatchers/i_with_n="1",i=~"^.*$",j!="foo" 304k ± 0% 600k ± 0% 400k ± 0% Querier/Block/labelValuesWithMatchers/n_with_j!="foo" 3.00M ± 0% 0.00M ± 0% 0.00M ± 0% Querier/Block/labelValuesWithMatchers/n_with_i="1" 61.0 ± 0% 247.0 ± 0% 129.0 ± 0% Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Apply comment suggestions from the code review Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> * Change else { if } to else if Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Remove sorting of label values We were not sorting them before, so no need to sort them now Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
3 years ago
}
return index.Merge(ctx, res...), nil
}
}
func (h *headIndexReader) PostingsForLabelMatching(ctx context.Context, name string, match func(string) bool) index.Postings {
return h.head.postings.PostingsForLabelMatching(ctx, name, match)
}
func (h *headIndexReader) SortedPostings(p index.Postings) index.Postings {
series := make([]*memSeries, 0, 128)
// Fetch all the series only once.
for p.Next() {
s := h.head.series.getByID(chunks.HeadSeriesRef(p.At()))
if s == nil {
level.Debug(h.head.logger).Log("msg", "Looked up series not found")
} else {
series = append(series, s)
}
}
if err := p.Err(); err != nil {
return index.ErrPostings(fmt.Errorf("expand postings: %w", err))
}
slices.SortFunc(series, func(a, b *memSeries) int {
return labels.Compare(a.labels(), b.labels())
})
// Convert back to list.
ep := make([]storage.SeriesRef, 0, len(series))
for _, p := range series {
ep = append(ep, storage.SeriesRef(p.ref))
}
return index.NewListPostings(ep)
}
// ShardedPostings implements IndexReader. This function returns an failing postings list if sharding
// has not been enabled in the Head.
func (h *headIndexReader) ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings {
if !h.head.opts.EnableSharding {
return index.ErrPostings(errors.New("sharding is disabled"))
}
out := make([]storage.SeriesRef, 0, 128)
for p.Next() {
s := h.head.series.getByID(chunks.HeadSeriesRef(p.At()))
if s == nil {
level.Debug(h.head.logger).Log("msg", "Looked up series not found")
continue
}
// Check if the series belong to the shard.
if s.shardHash%shardCount != shardIndex {
continue
}
out = append(out, storage.SeriesRef(s.ref))
}
return index.NewListPostings(out)
}
// Series returns the series for the given reference.
// Chunks are skipped if chks is nil.
func (h *headIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
s := h.head.series.getByID(chunks.HeadSeriesRef(ref))
if s == nil {
h.head.metrics.seriesNotFound.Inc()
return storage.ErrNotFound
}
builder.Assign(s.labels())
if chks == nil {
return nil
}
s.Lock()
defer s.Unlock()
*chks = (*chks)[:0]
*chks = appendSeriesChunks(s, h.mint, h.maxt, *chks)
return nil
}
func appendSeriesChunks(s *memSeries, mint, maxt int64, chks []chunks.Meta) []chunks.Meta {
for i, c := range s.mmappedChunks {
// Do not expose chunks that are outside of the specified range.
if !c.OverlapsClosedInterval(mint, maxt) {
continue
}
chks = append(chks, chunks.Meta{
MinTime: c.minTime,
MaxTime: c.maxTime,
Ref: chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.headChunkID(i))),
})
}
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
if s.headChunks != nil {
var maxTime int64
var i, j int
for i = s.headChunks.len() - 1; i >= 0; i-- {
chk := s.headChunks.atOffset(i)
if i == 0 {
// Set the head chunk as open (being appended to) for the first headChunk.
maxTime = math.MaxInt64
} else {
maxTime = chk.maxTime
}
if chk.OverlapsClosedInterval(mint, maxt) {
chks = append(chks, chunks.Meta{
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
MinTime: chk.minTime,
MaxTime: maxTime,
Ref: chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.headChunkID(len(s.mmappedChunks)+j))),
})
}
j++
}
}
return chks
}
Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2 years ago
// headChunkID returns the HeadChunkID referred to by the given position.
// * 0 <= pos < len(s.mmappedChunks) refer to s.mmappedChunks[pos]
// * pos >= len(s.mmappedChunks) refers to s.headChunks linked list.
func (s *memSeries) headChunkID(pos int) chunks.HeadChunkID {
return chunks.HeadChunkID(pos) + s.firstChunkID
}
const oooChunkIDMask = 1 << 23
Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2 years ago
// oooHeadChunkID returns the HeadChunkID referred to by the given position.
// Only the bottom 24 bits are used. Bit 23 is always 1 for an OOO chunk; for the rest:
Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2 years ago
// * 0 <= pos < len(s.oooMmappedChunks) refer to s.oooMmappedChunks[pos]
// * pos == len(s.oooMmappedChunks) refers to s.oooHeadChunk
// The caller must ensure that s.ooo is not nil.
Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2 years ago
func (s *memSeries) oooHeadChunkID(pos int) chunks.HeadChunkID {
return (chunks.HeadChunkID(pos) + s.ooo.firstOOOChunkID) | oooChunkIDMask
}
func unpackHeadChunkRef(ref chunks.ChunkRef) (seriesID chunks.HeadSeriesRef, chunkID chunks.HeadChunkID, isOOO bool) {
sid, cid := chunks.HeadChunkRef(ref).Unpack()
return sid, (cid & (oooChunkIDMask - 1)), (cid & oooChunkIDMask) != 0
Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2 years ago
}
// LabelValueFor returns label value for the given label name in the series referred to by ID.
func (h *headIndexReader) LabelValueFor(_ context.Context, id storage.SeriesRef, label string) (string, error) {
memSeries := h.head.series.getByID(chunks.HeadSeriesRef(id))
if memSeries == nil {
return "", storage.ErrNotFound
}
value := memSeries.labels().Get(label)
if value == "" {
return "", storage.ErrNotFound
}
return value, nil
}
// LabelNamesFor returns all the label names for the series referred to by the postings.
// The names returned are sorted.
func (h *headIndexReader) LabelNamesFor(ctx context.Context, series index.Postings) ([]string, error) {
namesMap := make(map[string]struct{})
i := 0
for series.Next() {
i++
if i%checkContextEveryNIterations == 0 && ctx.Err() != nil {
return nil, ctx.Err()
}
memSeries := h.head.series.getByID(chunks.HeadSeriesRef(series.At()))
if memSeries == nil {
// Series not found, this happens during compaction,
// when series was garbage collected after the caller got the series IDs.
continue
}
memSeries.labels().Range(func(lbl labels.Label) {
namesMap[lbl.Name] = struct{}{}
})
}
if err := series.Err(); err != nil {
return nil, err
}
names := make([]string, 0, len(namesMap))
for name := range namesMap {
names = append(names, name)
}
slices.Sort(names)
return names, nil
}
// Chunks returns a ChunkReader against the block.
func (h *Head) Chunks() (ChunkReader, error) {
return h.chunksRange(math.MinInt64, math.MaxInt64, h.iso.State(math.MinInt64, math.MaxInt64))
}
func (h *Head) chunksRange(mint, maxt int64, is *isolationState) (*headChunkReader, error) {
h.closedMtx.Lock()
defer h.closedMtx.Unlock()
if h.closed {
return nil, errors.New("can't read from a closed head")
}
if hmin := h.MinTime(); hmin > mint {
mint = hmin
}
return &headChunkReader{
head: h,
mint: mint,
maxt: maxt,
isoState: is,
}, nil
}
type headChunkReader struct {
head *Head
mint, maxt int64
isoState *isolationState
}
func (h *headChunkReader) Close() error {
if h.isoState != nil {
h.isoState.Close()
}
return nil
}
// ChunkOrIterable returns the chunk for the reference number.
func (h *headChunkReader) ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
chk, _, err := h.chunk(meta, false)
return chk, nil, err
}
type ChunkReaderWithCopy interface {
ChunkOrIterableWithCopy(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, int64, error)
}
// ChunkOrIterableWithCopy returns the chunk for the reference number.
// If the chunk is the in-memory chunk, then it makes a copy and returns the copied chunk, plus the max time of the chunk.
func (h *headChunkReader) ChunkOrIterableWithCopy(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, int64, error) {
chk, maxTime, err := h.chunk(meta, true)
return chk, nil, maxTime, err
}
// chunk returns the chunk for the reference number.
// If copyLastChunk is true, then it makes a copy of the head chunk if asked for it.
// Also returns max time of the chunk.
func (h *headChunkReader) chunk(meta chunks.Meta, copyLastChunk bool) (chunkenc.Chunk, int64, error) {
sid, cid, isOOO := unpackHeadChunkRef(meta.Ref)
s := h.head.series.getByID(sid)
// This means that the series has been garbage collected.
if s == nil {
return nil, 0, storage.ErrNotFound
}
s.Lock()
defer s.Unlock()
return h.head.chunkFromSeries(s, cid, isOOO, h.mint, h.maxt, h.isoState, copyLastChunk)
}
// Dumb thing to defeat chunk pool.
type wrapOOOHeadChunk struct {
chunkenc.Chunk
}
// Call with s locked.
func (h *Head) chunkFromSeries(s *memSeries, cid chunks.HeadChunkID, isOOO bool, mint, maxt int64, isoState *isolationState, copyLastChunk bool) (chunkenc.Chunk, int64, error) {
if isOOO {
chk, maxTime, err := s.oooChunk(cid, h.chunkDiskMapper, &h.memChunkPool)
return wrapOOOHeadChunk{chk}, maxTime, err
}
c, headChunk, isOpen, err := s.chunk(cid, h.chunkDiskMapper, &h.memChunkPool)
if err != nil {
return nil, 0, err
}
defer func() {
if !headChunk {
// Set this to nil so that Go GC can collect it after it has been used.
c.chunk = nil
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
c.prev = nil
h.memChunkPool.Put(c)
}
}()
// This means that the chunk is outside the specified range.
if !c.OverlapsClosedInterval(mint, maxt) {
return nil, 0, storage.ErrNotFound
}
chk, maxTime := c.chunk, c.maxTime
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
if headChunk && isOpen && copyLastChunk {
// The caller may ask to copy the head chunk in order to take the
// bytes of the chunk without causing the race between read and append.
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
b := s.headChunks.chunk.Bytes()
newB := make([]byte, len(b))
copy(newB, b) // TODO(codesome): Use bytes.Clone() when we upgrade to Go 1.20.
// TODO(codesome): Put back in the pool (non-trivial).
chk, err = h.opts.ChunkPool.Get(s.headChunks.chunk.Encoding(), newB)
if err != nil {
return nil, 0, err
}
}
return &safeHeadChunk{
Chunk: chk,
s: s,
cid: cid,
isoState: isoState,
}, maxTime, nil
}
// chunk returns the chunk for the HeadChunkID from memory or by m-mapping it from the disk.
// If headChunk is false, it means that the returned *memChunk
Add basic initial developer docs for TSDB (#9451) * Add basic initial developer docs for TSDB There's a decent amount of content already out there (blog posts, conference talks, etc), but: * when they get stale, they don't tend to get updated * they still leave me with questions that I'ld like to answer for developers (like me) who want to use, or work with, TSDB What I propose is developer docs inside the prometheus repository. Easy to find and harness the power of the community to expand it and keep it up to date. * perfect is the enemy of good. Let's have a base and incrementally improve * Markdown docs should be broad but not too deep. Source code comments can complement them, and are the ideal place for implementation details. Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * use example code that works out of the box Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Apply suggestions from code review Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * PR feedback Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * more docs Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * PR feedback Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Apply suggestions from code review Signed-off-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> * Apply suggestions from code review Signed-off-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> * feedback Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Update tsdb/docs/usage.md Signed-off-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> * final tweaks Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * workaround docs versioning issue Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Move example code to real executable, testable example. Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * cleanup example test and make sure it always reproduces Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * obtain temp dir in a way that works with older Go versions Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Fix Ganesh's comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
3 years ago
// (and not the chunkenc.Chunk inside it) can be garbage collected after its usage.
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
// if isOpen is true, it means that the returned *memChunk is used for appends.
func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDiskMapper, memChunkPool *sync.Pool) (chunk *memChunk, headChunk, isOpen bool, err error) {
// ix represents the index of chunk in the s.mmappedChunks slice. The chunk id's are
// incremented by 1 when new chunk is created, hence (id - firstChunkID) gives the slice index.
// The max index for the s.mmappedChunks slice can be len(s.mmappedChunks)-1, hence if the ix
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
// is >= len(s.mmappedChunks), it represents one of the chunks on s.headChunks linked list.
// The order of elements is different for slice and linked list.
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
// For s.mmappedChunks slice newer chunks are appended to it.
// For s.headChunks list newer chunks are prepended to it.
//
// memSeries {
// mmappedChunks: [t0, t1, t2]
// headChunk: {t5}->{t4}->{t3}
// }
ix := int(id) - int(s.firstChunkID)
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
var headChunksLen int
if s.headChunks != nil {
headChunksLen = s.headChunks.len()
}
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
if ix < 0 || ix > len(s.mmappedChunks)+headChunksLen-1 {
return nil, false, false, storage.ErrNotFound
}
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
if ix < len(s.mmappedChunks) {
chk, err := chunkDiskMapper.Chunk(s.mmappedChunks[ix].ref)
if err != nil {
var cerr *chunks.CorruptionErr
if errors.As(err, &cerr) {
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
panic(err)
}
return nil, false, false, err
}
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
mc := memChunkPool.Get().(*memChunk)
mc.chunk = chk
mc.minTime = s.mmappedChunks[ix].minTime
mc.maxTime = s.mmappedChunks[ix].maxTime
return mc, false, false, nil
}
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
ix -= len(s.mmappedChunks)
offset := headChunksLen - ix - 1
// headChunks is a linked list where first element is the most recent one and the last one is the oldest.
// This order is reversed when compared with mmappedChunks, since mmappedChunks[0] is the oldest chunk,
// while headChunk.atOffset(0) would give us the most recent chunk.
// So when calling headChunk.atOffset() we need to reverse the value of ix.
elem := s.headChunks.atOffset(offset)
if elem == nil {
// This should never really happen and would mean that headChunksLen value is NOT equal
// to the length of the headChunks list.
return nil, false, false, storage.ErrNotFound
}
return elem, true, offset == 0, nil
}
// oooChunk returns the chunk for the HeadChunkID by m-mapping it from the disk.
// It never returns the head OOO chunk.
func (s *memSeries) oooChunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDiskMapper, memChunkPool *sync.Pool) (chunk chunkenc.Chunk, maxTime int64, err error) {
// ix represents the index of chunk in the s.ooo.oooMmappedChunks slice. The chunk id's are
// incremented by 1 when new chunk is created, hence (id - firstOOOChunkID) gives the slice index.
ix := int(id) - int(s.ooo.firstOOOChunkID)
Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2 years ago
if ix < 0 || ix >= len(s.ooo.oooMmappedChunks) {
return nil, 0, storage.ErrNotFound
Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2 years ago
}
chk, err := chunkDiskMapper.Chunk(s.ooo.oooMmappedChunks[ix].ref)
return chk, s.ooo.oooMmappedChunks[ix].maxTime, err
Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2 years ago
}
// safeHeadChunk makes sure that the chunk can be accessed without a race condition.
type safeHeadChunk struct {
chunkenc.Chunk
s *memSeries
cid chunks.HeadChunkID
isoState *isolationState
}
func (c *safeHeadChunk) Iterator(reuseIter chunkenc.Iterator) chunkenc.Iterator {
c.s.Lock()
it := c.s.iterator(c.cid, c.Chunk, c.isoState, reuseIter)
c.s.Unlock()
return it
}
// iterator returns a chunk iterator for the requested chunkID, or a NopIterator if the requested ID is out of range.
// It is unsafe to call this concurrently with s.append(...) without holding the series lock.
func (s *memSeries) iterator(id chunks.HeadChunkID, c chunkenc.Chunk, isoState *isolationState, it chunkenc.Iterator) chunkenc.Iterator {
ix := int(id) - int(s.firstChunkID)
numSamples := c.NumSamples()
stopAfter := numSamples
if isoState != nil && !isoState.IsolationDisabled() {
totalSamples := 0 // Total samples in this series.
previousSamples := 0 // Samples before this chunk.
for j, d := range s.mmappedChunks {
totalSamples += int(d.numSamples)
if j < ix {
previousSamples += int(d.numSamples)
}
}
Use a linked list for memSeries.headChunk (#11818) Currently memSeries holds a single head chunk in-memory and a slice of mmapped chunks. When append() is called on memSeries it might decide that a new headChunk is needed to use for given append() call. If that happens it will first mmap existing head chunk and only after that happens it will create a new empty headChunk and continue appending our sample to it. Since appending samples uses write lock on memSeries no other read or write can happen until any append is completed. When we have an append() that must create a new head chunk the whole memSeries is blocked until mmapping of existing head chunk finishes. Mmapping itself uses a lock as it needs to be serialised, which means that the more chunks to mmap we have the longer each chunk might wait for it to be mmapped. If there's enough chunks that require mmapping some memSeries will be locked for long enough that it will start affecting queries and scrapes. Queries might timeout, since by default they have a 2 minute timeout set. Scrapes will be blocked inside append() call, which means there will be a gap between samples. This will first affect range queries or calls using rate() and such, since the time range requested in the query might have too few samples to calculate anything. To avoid this we need to remove mmapping from append path, since mmapping is blocking. But this means that when we cut a new head chunk we need to keep the old one around, so we can mmap it later. This change makes memSeries.headChunk a linked list, memSeries.headChunk still points to the 'open' head chunk that receives new samples, while older, yet to be mmapped, chunks are linked to it. Mmapping is done on a schedule by iterating all memSeries one by one. Thanks to this we control when mmapping is done, since we trigger it manually, which reduces the risk that it will have to compete for mmap locks with other chunks. Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
1 year ago
ix -= len(s.mmappedChunks)
if s.headChunks != nil {
// Iterate all head chunks from the oldest to the newest.
headChunksLen := s.headChunks.len()
for j := headChunksLen - 1; j >= 0; j-- {
chk := s.headChunks.atOffset(j)
chkSamples := chk.chunk.NumSamples()
totalSamples += chkSamples
// Chunk ID is len(s.mmappedChunks) + $(headChunks list position).
// Where $(headChunks list position) is zero for the oldest chunk and $(s.headChunks.len() - 1)
// for the newest (open) chunk.
if headChunksLen-1-j < ix {
previousSamples += chkSamples
}
}
}
// Removing the extra transactionIDs that are relevant for samples that
// come after this chunk, from the total transactionIDs.
appendIDsToConsider := int(s.txs.txIDCount) - (totalSamples - (previousSamples + numSamples))
// Iterate over the appendIDs, find the first one that the isolation state says not
// to return.
it := s.txs.iterator()
for index := 0; index < appendIDsToConsider; index++ {
appendID := it.At()
if appendID <= isoState.maxAppendID { // Easy check first.
if _, ok := isoState.incompleteAppends[appendID]; !ok {
it.Next()
continue
}
}
stopAfter = numSamples - (appendIDsToConsider - index)
if stopAfter < 0 {
stopAfter = 0 // Stopped in a previous chunk.
}
break
}
}
if stopAfter == 0 {
return chunkenc.NewNopIterator()
}
tsdb: stop saving a copy of last 4 samples in memSeries (#11296) * TSDB chunks: remove race between writing and reading Because the data is stored as a bit-stream, the last byte in the stream could change if the stream is appended to after an Iterator is obtained. Copy the last byte when the Iterator is created, so we don't have to read it later. Clarify in comments that concurrent Iterator and Appender are allowed, but the chunk must not be modified while an Iterator is created. (This was already the case, in order to copy the bstream slice header.) * TSDB: stop saving last 4 samples in memSeries This extra copy of the last 4 samples was introduced to avoid a race condition between reading the last byte of the chunk and writing to it. But now we have fixed that by having `bstreamReader` copy the last byte, we don't need to copy the last 4 samples. This change saves 56 bytes per series, which is very worthwhile when you have millions or tens of millions of series. * TSDB: tidy up stopIterator re-use Previous changes have left this code duplicating some lines; pull them out to a separate function and tidy up. * TSDB head_test: stop checking when iterators are wrapped The behaviour has changed so chunk iterators are only wrapped when transaction isolation requires them to stop short of the end. This makes tests fail which are checking the type. Tests should check the observable behaviour, not the type. Signed-off-by: Bryan Boreham <bjboreham@gmail.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
2 years ago
if stopAfter == numSamples {
return c.Iterator(it)
}
return makeStopIterator(c, it, stopAfter)
}
Add basic initial developer docs for TSDB (#9451) * Add basic initial developer docs for TSDB There's a decent amount of content already out there (blog posts, conference talks, etc), but: * when they get stale, they don't tend to get updated * they still leave me with questions that I'ld like to answer for developers (like me) who want to use, or work with, TSDB What I propose is developer docs inside the prometheus repository. Easy to find and harness the power of the community to expand it and keep it up to date. * perfect is the enemy of good. Let's have a base and incrementally improve * Markdown docs should be broad but not too deep. Source code comments can complement them, and are the ideal place for implementation details. Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * use example code that works out of the box Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Apply suggestions from code review Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * PR feedback Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * more docs Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * PR feedback Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Apply suggestions from code review Signed-off-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> * Apply suggestions from code review Signed-off-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> * feedback Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Update tsdb/docs/usage.md Signed-off-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> * final tweaks Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * workaround docs versioning issue Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Move example code to real executable, testable example. Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * cleanup example test and make sure it always reproduces Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * obtain temp dir in a way that works with older Go versions Signed-off-by: Dieter Plaetinck <dieter@grafana.com> * Fix Ganesh's comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
3 years ago
// stopIterator wraps an Iterator, but only returns the first
// stopAfter values, if initialized with i=-1.
type stopIterator struct {
chunkenc.Iterator
i, stopAfter int
}
func (it *stopIterator) Next() chunkenc.ValueType {
if it.i+1 >= it.stopAfter {
return chunkenc.ValNone
}
it.i++
return it.Iterator.Next()
}
func makeStopIterator(c chunkenc.Chunk, it chunkenc.Iterator, stopAfter int) chunkenc.Iterator {
// Re-use the Iterator object if it is a stopIterator.
if stopIter, ok := it.(*stopIterator); ok {
stopIter.Iterator = c.Iterator(stopIter.Iterator)
stopIter.i = -1
stopIter.stopAfter = stopAfter
return stopIter
}
return &stopIterator{
Iterator: c.Iterator(it),
i: -1,
stopAfter: stopAfter,
}
}