From 5f77fce5787f2138e1b762ca85bab4657bac337b Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Wed, 16 Sep 2015 16:22:18 +0200 Subject: [PATCH] Improve remote storage queue manager metrics. --- storage/remote/queue_manager.go | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index fa6b65b67..6e33ee734 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -62,7 +62,8 @@ type StorageQueueManager struct { samplesCount *prometheus.CounterVec sendLatency prometheus.Summary - sendErrors prometheus.Counter + failedBatches prometheus.Counter + failedSamples prometheus.Counter queueLength prometheus.Gauge queueCapacity prometheus.Metric } @@ -92,15 +93,22 @@ func NewStorageQueueManager(tsdb StorageClient, queueCapacity int) *StorageQueue sendLatency: prometheus.NewSummary(prometheus.SummaryOpts{ Namespace: namespace, Subsystem: subsystem, - Name: "sent_latency_milliseconds", + Name: "send_latency_seconds", Help: "Latency quantiles for sending sample batches to the remote storage.", ConstLabels: constLabels, }), - sendErrors: prometheus.NewCounter(prometheus.CounterOpts{ + failedBatches: prometheus.NewCounter(prometheus.CounterOpts{ Namespace: namespace, Subsystem: subsystem, - Name: "sent_errors_total", - Help: "Total number of errors sending sample batches to the remote storage.", + Name: "failed_batches_total", + Help: "Total number of sample batches that encountered an error while being sent to the remote storage.", + ConstLabels: constLabels, + }), + failedSamples: prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "failed_samples_total", + Help: "Total number of samples that encountered an error while being sent to the remote storage.", ConstLabels: constLabels, }), queueLength: prometheus.NewGauge(prometheus.GaugeOpts{ @@ -151,6 +159,8 @@ func (t *StorageQueueManager) Stop() { 
func (t *StorageQueueManager) Describe(ch chan<- *prometheus.Desc) { t.samplesCount.Describe(ch) t.sendLatency.Describe(ch) + ch <- t.failedBatches.Desc() + ch <- t.failedSamples.Desc() ch <- t.queueLength.Desc() ch <- t.queueCapacity.Desc() } @@ -160,6 +170,8 @@ func (t *StorageQueueManager) Collect(ch chan<- prometheus.Metric) { t.samplesCount.Collect(ch) t.sendLatency.Collect(ch) t.queueLength.Set(float64(len(t.queue))) + ch <- t.failedBatches + ch <- t.failedSamples ch <- t.queueLength ch <- t.queueCapacity } @@ -175,13 +187,14 @@ func (t *StorageQueueManager) sendSamples(s model.Samples) { // floor. begin := time.Now() err := t.tsdb.Store(s) - duration := time.Since(begin) / time.Millisecond + duration := time.Since(begin).Seconds() labelValue := success if err != nil { log.Warnf("error sending %d samples to remote storage: %s", len(s), err) labelValue = failure - t.sendErrors.Inc() + t.failedBatches.Inc() + t.failedSamples.Add(float64(len(s))) } t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s))) t.sendLatency.Observe(float64(duration))