diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b36d3c38..358284c98 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## unreleased
 
+* [CHANGE] Notifier: Increment the prometheus_notifications_errors_total metric by the number of affected alerts rather than by one per batch of affected alerts. #15428
 * [ENHANCEMENT] OTLP receiver: Convert also metric metadata. #15416
 
 ## 3.0.0 / 2024-11-14
diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet
index 563daab80..9a6de90d8 100644
--- a/documentation/prometheus-mixin/alerts.libsonnet
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@@ -84,8 +84,8 @@
           severity: 'warning',
         },
         annotations: {
-          summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.',
-          description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
+          summary: 'More than 1% of alerts sent by Prometheus to a specific Alertmanager were affected by errors.',
+          description: '{{ printf "%%.1f" $value }}%% of alerts sent by Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}} were affected by errors.' % $._config,
         },
       },
       {
diff --git a/notifier/notifier.go b/notifier/notifier.go
index 09a2005a3..956fd4652 100644
--- a/notifier/notifier.go
+++ b/notifier/notifier.go
@@ -160,7 +160,7 @@ func newAlertMetrics(r prometheus.Registerer, queueCap int, queueLen, alertmanag
 				Namespace: namespace,
 				Subsystem: subsystem,
 				Name:      "errors_total",
-				Help:      "Total number of errors sending alert notifications.",
+				Help:      "Total number of sent alerts affected by errors.",
 			},
 			[]string{alertmanagerLabel},
 		),
@@ -619,13 +619,13 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {
 
 			go func(ctx context.Context, client *http.Client, url string, payload []byte, count int) {
 				if err := n.sendOne(ctx, client, url, payload); err != nil {
-					n.logger.Error("Error sending alert", "alertmanager", url, "count", count, "err", err)
-					n.metrics.errors.WithLabelValues(url).Inc()
+					n.logger.Error("Error sending alerts", "alertmanager", url, "count", count, "err", err)
+					n.metrics.errors.WithLabelValues(url).Add(float64(count))
 				} else {
 					numSuccess.Inc()
 				}
 				n.metrics.latency.WithLabelValues(url).Observe(time.Since(begin).Seconds())
-				n.metrics.sent.WithLabelValues(url).Add(float64(len(amAlerts)))
+				n.metrics.sent.WithLabelValues(url).Add(float64(count))
 				wg.Done()
 			}(ctx, ams.client, am.url().String(), payload, len(amAlerts))
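
Context: the point of counting alerts rather than batches is that `prometheus_notifications_errors_total` and `prometheus_notifications_sent_total` now share a unit, so dividing one by the other yields a true error percentage. Below is a minimal PromQL sketch of that ratio, assuming a 5m rate window; the mixin's actual alert expression in `alerts.libsonnet` carries its own instance selectors and `for` clause, so treat this as an illustration rather than the verbatim rule:

```promql
# Percentage of sent alerts affected by errors, per Alertmanager.
# Both counters now count individual alerts, so the division is unit-consistent.
(
    rate(prometheus_notifications_errors_total[5m])
  /
    rate(prometheus_notifications_sent_total[5m])
) * 100 > 1
```

Before this change the numerator counted one error per failed batch while the denominator counted individual alerts, so an expression of this shape understated the share of affected alerts whenever a batch contained more than one alert.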