diff --git a/notification/notification.go b/notification/notification.go
index e9a472590..d27c2feeb 100644
--- a/notification/notification.go
+++ b/notification/notification.go
@@ -39,11 +39,6 @@ const (
 const (
 	namespace = "prometheus"
 	subsystem = "notifications"
-
-	result  = "result"
-	success = "success"
-	failure = "failure"
-	dropped = "dropped"
 )
 
 var (
@@ -88,7 +83,9 @@ type NotificationHandler struct {
 	// HTTP client with custom timeout settings.
 	httpClient httpPoster
 
-	notificationLatency        *prometheus.SummaryVec
+	notificationLatency        prometheus.Summary
+	notificationErrors         prometheus.Counter
+	notificationDropped        prometheus.Counter
 	notificationsQueueLength   prometheus.Gauge
 	notificationsQueueCapacity prometheus.Metric
 
@@ -103,15 +100,24 @@ func NewNotificationHandler(alertmanagerURL string, notificationQueueCapacity in
 
 		httpClient: utility.NewDeadlineClient(*deadline),
 
-		notificationLatency: prometheus.NewSummaryVec(
-			prometheus.SummaryOpts{
-				Namespace: namespace,
-				Subsystem: subsystem,
-				Name:      "latency_milliseconds",
-				Help:      "Latency quantiles for sending alert notifications.",
-			},
-			[]string{result},
-		),
+		notificationLatency: prometheus.NewSummary(prometheus.SummaryOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "latency_milliseconds",
+			Help:      "Latency quantiles for sending alert notifications (not including dropped notifications).",
+		}),
+		notificationErrors: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "errors_total",
+			Help:      "Total number of errors sending alert notifications.",
+		}),
+		notificationDropped: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "dropped_total",
+			Help:      "Total number of alert notifications dropped due to alert manager missing in configuration.",
+		}),
 		notificationsQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
@@ -175,22 +181,19 @@ func (n *NotificationHandler) Run() {
 	for reqs := range n.pendingNotifications {
 		if n.alertmanagerURL == "" {
 			glog.Warning("No alert manager configured, not dispatching notification")
-			n.notificationLatency.WithLabelValues(dropped).Observe(0)
+			n.notificationDropped.Inc()
 			continue
 		}
 
 		begin := time.Now()
 		err := n.sendNotifications(reqs)
 
-		labelValue := success
 		if err != nil {
 			glog.Error("Error sending notification: ", err)
-			labelValue = failure
+			n.notificationErrors.Inc()
 		}
 
-		n.notificationLatency.WithLabelValues(labelValue).Observe(
-			float64(time.Since(begin) / time.Millisecond),
-		)
+		n.notificationLatency.Observe(float64(time.Since(begin) / time.Millisecond))
 	}
 	close(n.stopped)
 }
diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go
index f1bbdf571..0deff6c1c 100644
--- a/storage/remote/queue_manager.go
+++ b/storage/remote/queue_manager.go
@@ -59,7 +59,8 @@ type TSDBQueueManager struct {
 	drained chan bool
 
 	samplesCount  *prometheus.CounterVec
-	sendLatency   *prometheus.SummaryVec
+	sendLatency   prometheus.Summary
+	sendErrors    prometheus.Counter
 	queueLength   prometheus.Gauge
 	queueCapacity prometheus.Metric
 }
@@ -81,15 +82,18 @@ func NewTSDBQueueManager(tsdb TSDBClient, queueCapacity int) *TSDBQueueManager {
 			},
 			[]string{result},
 		),
-		sendLatency: prometheus.NewSummaryVec(
-			prometheus.SummaryOpts{
-				Namespace: namespace,
-				Subsystem: subsystem,
-				Name:      "sent_latency_milliseconds",
-				Help:      "Latency quantiles for sending samples to the remote TSDB.",
-			},
-			[]string{result},
-		),
+		sendLatency: prometheus.NewSummary(prometheus.SummaryOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "sent_latency_milliseconds",
+			Help:      "Latency quantiles for sending sample batches to the remote TSDB.",
+		}),
+		sendErrors: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "sent_errors_total",
+			Help:      "Total number of errors sending sample batches to the remote TSDB.",
+		}),
 		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
@@ -164,9 +168,10 @@ func (t *TSDBQueueManager) sendSamples(s clientmodel.Samples) {
 	if err != nil {
 		glog.Warningf("error sending %d samples to TSDB: %s", len(s), err)
 		labelValue = failure
+		t.sendErrors.Inc()
 	}
 	t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
-	t.sendLatency.WithLabelValues(labelValue).Observe(float64(duration))
+	t.sendLatency.Observe(float64(duration))
 }
 
 // Run continuously sends samples to the TSDB.
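
Reviewer note: for reference, the metric style this change moves to (a plain prometheus.Summary for latency plus a dedicated prometheus.Counter per failure mode, instead of a result-labeled SummaryVec) is sketched below as a small, self-contained example. The names sendMetrics, newSendMetrics, observeSend, doSend, and the metric names are illustrative only and are not part of this patch; registration through the default registry is shown purely for brevity.

package main

import (
	"errors"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// sendMetrics is a hypothetical holder mirroring the new style used in this
// patch: one Summary for latency and one Counter for failures, rather than a
// single SummaryVec keyed by a "result" label.
type sendMetrics struct {
	latency  prometheus.Summary
	failures prometheus.Counter
}

func newSendMetrics() *sendMetrics {
	return &sendMetrics{
		latency: prometheus.NewSummary(prometheus.SummaryOpts{
			Namespace: "prometheus",
			Subsystem: "example",
			Name:      "send_latency_milliseconds",
			Help:      "Latency quantiles for send attempts (successful or failed).",
		}),
		failures: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: "prometheus",
			Subsystem: "example",
			Name:      "send_errors_total",
			Help:      "Total number of failed send attempts.",
		}),
	}
}

// observeSend records one attempt: latency is always observed, while the
// failure counter is incremented only when the attempt returned an error.
func (m *sendMetrics) observeSend(begin time.Time, err error) {
	if err != nil {
		m.failures.Inc()
	}
	m.latency.Observe(float64(time.Since(begin) / time.Millisecond))
}

// doSend is a stand-in for the real send operation.
func doSend() error { return errors.New("example failure") }

func main() {
	m := newSendMetrics()
	// Register with the default registry for this sketch; the real code may
	// expose its metrics differently.
	prometheus.MustRegister(m.latency)
	prometheus.MustRegister(m.failures)

	begin := time.Now()
	m.observeSend(begin, doSend())
}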