Merge pull request #5787 from cstyan/reshard-max-logging

Add metrics for max/min/desired shards to queue manager.
pull/6002/head
Björn Rabenstein 2019-09-09 22:32:54 +02:00 committed by GitHub
commit 3b3eaf3496
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 60 additions and 0 deletions

View File

@ -211,6 +211,26 @@
description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config,
},
},
{
  // Warns when the remote-write shard calculation has wanted more shards
  // than the configured maximum for 15 minutes.
  alert: 'PrometheusRemoteWriteDesiredShards',
  // Both gauges carry the same label set (including the per-queue label), so
  // the comparison uses default one-to-one matching. Matching only
  // on(job, instance) with group_right makes the left-hand side non-unique
  // as soon as an instance runs more than one remote-write queue.
  expr: |||
    # Without max_over_time, failed scrapes could create false negatives, see
    # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    (
      max_over_time(prometheus_remote_storage_shards_desired{%(prometheusSelector)s}[5m])
    >
      max_over_time(prometheus_remote_storage_shards_max{%(prometheusSelector)s}[5m])
    )
  ||| % $._config,
  'for': '15m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    summary: 'Prometheus remote write desired shards calculation wants to run more than configured max shards.',
    // $value is rendered directly: the template printf needs a format string
    // as its first argument, so `printf $value` fails to render. The max
    // lookup is restricted to the alerting queue so `first` cannot pick
    // another queue's limit.
    description: 'Prometheus %(prometheusName)s remote write desired shards calculation wants to run {{ $value }} shards for queue {{$labels.queue}}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%%s",queue="%%s",%(prometheusSelector)s}` $labels.instance $labels.queue | query | first | value }}.' % $._config,
  },
},
{
alert: 'PrometheusRuleFailures',
expr: |||

View File

@ -143,6 +143,33 @@ var (
},
[]string{queue},
)
// maxNumShards is a per-queue gauge holding the upper bound on the number of
// shards; QueueManager.Start sets it from cfg.MaxShards.
maxNumShards = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name:      "shards_max",
		Help:      "The maximum number of shards that the queue is allowed to run.",
		Namespace: namespace,
		Subsystem: subsystem,
	},
	[]string{queue},
)
// minNumShards is a per-queue gauge holding the lower bound on the number of
// shards; QueueManager.Start sets it from cfg.MinShards.
minNumShards = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name:      "shards_min",
		Help:      "The minimum number of shards that the queue is allowed to run.",
		Namespace: namespace,
		Subsystem: subsystem,
	},
	[]string{queue},
)
// desiredNumShards is a per-queue gauge holding the shard count produced by
// calculateDesiredShards before it is clamped to the configured max/min.
// QueueManager.Start seeds it with cfg.MinShards.
desiredNumShards = promauto.NewGaugeVec(
	prometheus.GaugeOpts{
		Name:      "shards_desired",
		Help:      "The number of shards that the queues shard calculation wants to run based on the rate of samples in vs. samples out.",
		Namespace: namespace,
		Subsystem: subsystem,
	},
	[]string{queue},
)
)
// StorageClient defines an interface for sending a batch of samples to an
@ -190,6 +217,9 @@ type QueueManager struct {
succeededSamplesTotal prometheus.Counter
retriedSamplesTotal prometheus.Counter
shardCapacity prometheus.Gauge
maxNumShards prometheus.Gauge
minNumShards prometheus.Gauge
desiredNumShards prometheus.Gauge
}
// NewQueueManager builds a new QueueManager.
@ -291,10 +321,16 @@ func (t *QueueManager) Start() {
t.succeededSamplesTotal = succeededSamplesTotal.WithLabelValues(name)
t.retriedSamplesTotal = retriedSamplesTotal.WithLabelValues(name)
t.shardCapacity = shardCapacity.WithLabelValues(name)
t.maxNumShards = maxNumShards.WithLabelValues(name)
t.minNumShards = minNumShards.WithLabelValues(name)
t.desiredNumShards = desiredNumShards.WithLabelValues(name)
// Initialise some metrics.
t.shardCapacity.Set(float64(t.cfg.Capacity))
t.pendingSamplesMetric.Set(0)
t.maxNumShards.Set(float64(t.cfg.MaxShards))
t.minNumShards.Set(float64(t.cfg.MinShards))
t.desiredNumShards.Set(float64(t.cfg.MinShards))
t.shards.start(t.numShards)
t.watcher.Start()
@ -334,6 +370,9 @@ func (t *QueueManager) Stop() {
succeededSamplesTotal.DeleteLabelValues(name)
retriedSamplesTotal.DeleteLabelValues(name)
shardCapacity.DeleteLabelValues(name)
maxNumShards.DeleteLabelValues(name)
minNumShards.DeleteLabelValues(name)
desiredNumShards.DeleteLabelValues(name)
}
// StoreSeries keeps track of which series we know about for lookups when sending samples to remote.
@ -502,6 +541,7 @@ func (t *QueueManager) calculateDesiredShards() {
}
numShards := int(math.Ceil(desiredShards))
t.desiredNumShards.Set(float64(numShards))
if numShards > t.cfg.MaxShards {
numShards = t.cfg.MaxShards
} else if numShards < t.cfg.MinShards {