Merge pull request #13974 from prometheus/measure-restore-time-rules

Rule Manager: Add `rule_group_last_restore_duration_seconds` to measure restore time per rule group
pull/13990/head
gotjosh 7 months ago committed by GitHub
commit 4ac78063ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -3,6 +3,7 @@
## unreleased ## unreleased
* [CHANGE] TSDB: Fix the predicate checking for blocks which are beyond the retention period to include the ones right at the retention boundary. #9633 * [CHANGE] TSDB: Fix the predicate checking for blocks which are beyond the retention period to include the ones right at the retention boundary. #9633
* [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` to measure the time it takes to restore a rule group. #13974
## 2.51.2 / 2024-04-09 ## 2.51.2 / 2024-04-09

@@ -230,7 +230,11 @@ func (g *Group) run(ctx context.Context) {
g.evalIterationFunc(ctx, g, evalTimestamp) g.evalIterationFunc(ctx, g, evalTimestamp)
} }
g.RestoreForState(time.Now()) restoreStartTime := time.Now()
g.RestoreForState(restoreStartTime)
totalRestoreTimeSeconds := time.Since(restoreStartTime).Seconds()
g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(totalRestoreTimeSeconds)
level.Debug(g.logger).Log("msg", "'for' state restoration completed", "duration_seconds", totalRestoreTimeSeconds)
g.shouldRestore = false g.shouldRestore = false
} }
@@ -779,17 +783,18 @@ const namespace = "prometheus"
// Metrics for rule evaluation. // Metrics for rule evaluation.
type Metrics struct { type Metrics struct {
EvalDuration prometheus.Summary EvalDuration prometheus.Summary
IterationDuration prometheus.Summary IterationDuration prometheus.Summary
IterationsMissed *prometheus.CounterVec IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec GroupLastDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec GroupLastRestoreDuration *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
} }
// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer, // NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
@@ -865,6 +870,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
GroupLastRestoreDuration: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "rule_group_last_restore_duration_seconds",
Help: "The duration of the last alert rules alerts restoration using the `ALERTS_FOR_STATE` series.",
},
[]string{"rule_group"},
),
GroupRules: prometheus.NewGaugeVec( GroupRules: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
@@ -894,6 +907,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
m.GroupInterval, m.GroupInterval,
m.GroupLastEvalTime, m.GroupLastEvalTime,
m.GroupLastDuration, m.GroupLastDuration,
m.GroupLastRestoreDuration,
m.GroupRules, m.GroupRules,
m.GroupSamples, m.GroupSamples,
) )

Loading…
Cancel
Save