rules: Add new `RuleEvaluationTimeSum` field to groups (#15672)

* feat(ruler): Add new `RuleEvaluationTimeSum` field to groups
Coupled with a metric: `rule_group_last_rule_duration_sum_seconds`

This will give us more observability into how fast a group runs with or without concurrency

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Update rules/group.go

Co-authored-by: gotjosh <josue.abreu@gmail.com>
Signed-off-by: Julien Duchesne <julienduchesne@live.com>
Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Apply suggestions from code review

Co-authored-by: gotjosh <josue.abreu@gmail.com>
Signed-off-by: Julien Duchesne <julienduchesne@live.com>
Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Remove `in seconds`. A duration is a duration

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

---------

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>
Signed-off-by: Julien Duchesne <julienduchesne@live.com>
Co-authored-by: gotjosh <josue.abreu@gmail.com>
pull/15679/head
Julien Duchesne 2024-12-13 16:42:46 -05:00 committed by GitHub
parent 7802ca263d
commit e2f037e554
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 56 additions and 17 deletions

View File

@ -44,19 +44,20 @@ import (
// Group is a set of rules that have a logical relation. // Group is a set of rules that have a logical relation.
type Group struct { type Group struct {
name string name string
file string file string
interval time.Duration interval time.Duration
queryOffset *time.Duration queryOffset *time.Duration
limit int limit int
rules []Rule rules []Rule
seriesInPreviousEval []map[string]labels.Labels // One per Rule. seriesInPreviousEval []map[string]labels.Labels // One per Rule.
staleSeries []labels.Labels staleSeries []labels.Labels
opts *ManagerOptions opts *ManagerOptions
mtx sync.Mutex mtx sync.Mutex
evaluationTime time.Duration evaluationTime time.Duration // Time it took to evaluate the group.
lastEvaluation time.Time // Wall-clock time of most recent evaluation. evaluationRuleTimeSum time.Duration // Sum of time it took to evaluate each rule in the group.
lastEvalTimestamp time.Time // Time slot used for most recent evaluation. lastEvaluation time.Time // Wall-clock time of most recent evaluation.
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.
shouldRestore bool shouldRestore bool
@ -115,6 +116,7 @@ func NewGroup(o GroupOptions) *Group {
metrics.EvalFailures.WithLabelValues(key) metrics.EvalFailures.WithLabelValues(key)
metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastEvalTime.WithLabelValues(key)
metrics.GroupLastDuration.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key)
metrics.GroupLastRuleDurationSum.WithLabelValues(key)
metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules))) metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
metrics.GroupSamples.WithLabelValues(key) metrics.GroupSamples.WithLabelValues(key)
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
@ -370,6 +372,28 @@ func (g *Group) setEvaluationTime(dur time.Duration) {
g.evaluationTime = dur g.evaluationTime = dur
} }
// GetRuleEvaluationTimeSum reports the stored sum of the individual rule
// evaluation durations for this group. Since it is a plain sum of per-rule
// durations, it does not account for rules being evaluated concurrently.
// The value is read while holding the group mutex.
func (g *Group) GetRuleEvaluationTimeSum() time.Duration {
	g.mtx.Lock()
	sum := g.evaluationRuleTimeSum
	g.mtx.Unlock()
	return sum
}
// updateRuleEvaluationTimeSum recomputes evaluationRuleTimeSum by adding up the
// evaluation duration reported by each rule in the group (a plain sum, so
// concurrency between rules is not taken into account). The total is published
// in seconds on the GroupLastRuleDurationSum gauge under this group's key, and
// then stored on the group while holding the mutex.
func (g *Group) updateRuleEvaluationTimeSum() {
	var total time.Duration
	for i := range g.rules {
		total += g.rules[i].GetEvaluationDuration()
	}

	key := GroupKey(g.file, g.name)
	g.metrics.GroupLastRuleDurationSum.WithLabelValues(key).Set(total.Seconds())

	// Only the field write needs the lock; the sum was computed above.
	g.mtx.Lock()
	g.evaluationRuleTimeSum = total
	g.mtx.Unlock()
}
// GetLastEvaluation returns the time the last evaluation of the rule group took place. // GetLastEvaluation returns the time the last evaluation of the rule group took place.
func (g *Group) GetLastEvaluation() time.Time { func (g *Group) GetLastEvaluation() time.Time {
g.mtx.Lock() g.mtx.Lock()
@ -874,6 +898,7 @@ type Metrics struct {
GroupInterval *prometheus.GaugeVec GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec GroupLastDuration *prometheus.GaugeVec
GroupLastRuleDurationSum *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec GroupSamples *prometheus.GaugeVec
@ -952,6 +977,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
GroupLastRuleDurationSum: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "rule_group_last_rule_duration_sum_seconds",
Help: "The sum of time in seconds it took to evaluate each rule in the group regardless of concurrency. This should be higher than the group duration if rules are evaluated concurrently.",
},
[]string{"rule_group"},
),
GroupLastRestoreDuration: prometheus.NewGaugeVec( GroupLastRestoreDuration: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
@ -989,6 +1022,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
m.GroupInterval, m.GroupInterval,
m.GroupLastEvalTime, m.GroupLastEvalTime,
m.GroupLastDuration, m.GroupLastDuration,
m.GroupLastRuleDurationSum,
m.GroupLastRestoreDuration, m.GroupLastRestoreDuration,
m.GroupRules, m.GroupRules,
m.GroupSamples, m.GroupSamples,

View File

@ -82,6 +82,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
timeSinceStart := time.Since(start) timeSinceStart := time.Since(start)
g.metrics.IterationDuration.Observe(timeSinceStart.Seconds()) g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
g.updateRuleEvaluationTimeSum()
g.setEvaluationTime(timeSinceStart) g.setEvaluationTime(timeSinceStart)
g.setLastEvaluation(start) g.setLastEvaluation(start)
g.setLastEvalTimestamp(evalTimestamp) g.setLastEvalTimestamp(evalTimestamp)

View File

@ -1985,7 +1985,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount) require.Len(t, group.rules, ruleCount)
start := time.Now() start := time.Now()
group.Eval(ctx, start) DefaultEvalIterationFunc(ctx, group, start)
// Never expect more than 1 inflight query at a time. // Never expect more than 1 inflight query at a time.
require.EqualValues(t, 1, maxInflight.Load()) require.EqualValues(t, 1, maxInflight.Load())
@ -1993,6 +1993,8 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.GreaterOrEqual(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds()) require.GreaterOrEqual(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
// Each rule produces one vector. // Each rule produces one vector.
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples)) require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
// Group duration is at least the sum of rule durations (rules run one at a time, plus group overhead).
require.GreaterOrEqual(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
} }
}) })
@ -2023,7 +2025,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount) require.Len(t, group.rules, ruleCount)
start := time.Now() start := time.Now()
group.Eval(ctx, start) DefaultEvalIterationFunc(ctx, group, start)
// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals. // Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load()) require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
@ -2061,7 +2063,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount) require.Len(t, group.rules, ruleCount)
start := time.Now() start := time.Now()
group.Eval(ctx, start) DefaultEvalIterationFunc(ctx, group, start)
// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals. // Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load()) require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
@ -2100,7 +2102,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
start := time.Now() start := time.Now()
group.Eval(ctx, start) DefaultEvalIterationFunc(ctx, group, start)
// Max inflight can be up to MaxConcurrentEvals concurrent evals, since there is sufficient concurrency to run all rules at once. // Max inflight can be up to MaxConcurrentEvals concurrent evals, since there is sufficient concurrency to run all rules at once.
require.LessOrEqual(t, int64(maxInflight.Load()), opts.MaxConcurrentEvals) require.LessOrEqual(t, int64(maxInflight.Load()), opts.MaxConcurrentEvals)
@ -2108,6 +2110,8 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Less(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds()) require.Less(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
// Each rule produces one vector. // Each rule produces one vector.
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples)) require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
// Group duration is less than the sum of rule durations, since the rules are evaluated concurrently.
require.Less(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
} }
}) })