Make metrics exported by the Prometheus server itself more consistent.

- Always spell out the time unit (e.g. milliseconds instead of ms).
- Remove "_total" from the names of metrics that are not counters.
- Make use of the "Namespace" and "Subsystem" fields in the options.
- Remove the "capacity" facet from all metrics about channels/queues. The capacities are all fixed via command-line flags and will never change during the runtime of a process. Also, they should not be part of the same metric family as the queue lengths. As a convenience, I have added separate metrics for the capacity of queues. (They will never change and are only set once.)
- I kept the word "disk" in "metric_disk_latency_milliseconds", although that metric measures the latency of the storage device, even if it is not a spinning disk. "SSD" is read by many as "solid state disk", so it's not too far off. (It should be "solid state drive", of course, but "metric_drive_latency_milliseconds" is probably confusing.)
- Brian suggested not mixing "failure" and "success" outcomes in the same metric family (distinguished by labels). For now, I left it as it is. We are touching on a bigger issue here, especially as other parts of the Prometheus ecosystem follow the same principle. We still need to come to terms here and then change things consistently everywhere.

Change-Id: If799458b450d18f78500f05990301c12525197d3
10 years ago · 24ece38f7c
8 changed files with 137 additions and 101 deletions
--- a/notification/notification.go
+++ b/notification/notification.go
@ -37,14 +37,13 @@ const (

 // String constants for instrumentation.
 const (
+	namespace = "prometheus"
+	subsystem = "notifications"
+
 	result  = "result"
 	success = "success"
 	failure = "failure"
 	dropped = "dropped"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )

 var (
@ -86,8 +85,9 @@ type NotificationHandler struct {
 	// HTTP client with custom timeout settings.
 	httpClient httpPoster

-	notificationLatency    *prometheus.SummaryVec
-	notificationsQueueSize *prometheus.GaugeVec
+	notificationLatency        *prometheus.SummaryVec
+	notificationsQueueLength   prometheus.Gauge
+	notificationsQueueCapacity prometheus.Metric
 }

 // Construct a new NotificationHandler.
@ -99,17 +99,27 @@ func NewNotificationHandler(alertmanagerUrl string, notificationReqs <-chan Noti

 		notificationLatency: prometheus.NewSummaryVec(
 			prometheus.SummaryOpts{
-				Name: "prometheus_notifications_latency_ms",
-				Help: "Latency quantiles for sending alert notifications in milliseconds.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "latency_milliseconds",
+				Help:      "Latency quantiles for sending alert notifications.",
 			},
 			[]string{result},
 		),
-		notificationsQueueSize: prometheus.NewGaugeVec(
-			prometheus.GaugeOpts{
-				Name: "prometheus_notifications_queue_size_total",
-				Help: "The size and capacity of the alert notification queue.",
-			},
-			[]string{facet},
+		notificationsQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "queue_length",
+			Help:      "The number of alert notifications in the queue.",
+		}),
+		notificationsQueueCapacity: prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
+				"The capacity of the alert notifications queue.",
+				nil, nil,
+			),
+			prometheus.GaugeValue,
+			float64(cap(notificationReqs)),
 		),
 	}
 }
@ -180,13 +190,14 @@ func (n *NotificationHandler) Run() {
 // Describe implements prometheus.Collector.
 func (n *NotificationHandler) Describe(ch chan<- *prometheus.Desc) {
 	n.notificationLatency.Describe(ch)
-	n.notificationsQueueSize.Describe(ch)
+	ch <- n.notificationsQueueLength.Desc()
+	ch <- n.notificationsQueueCapacity.Desc()
 }

 // Collect implements prometheus.Collector.
 func (n *NotificationHandler) Collect(ch chan<- prometheus.Metric) {
 	n.notificationLatency.Collect(ch)
-	n.notificationsQueueSize.WithLabelValues(occupancy).Set(float64(len(n.pendingNotifications)))
-	n.notificationsQueueSize.WithLabelValues(capacity).Set(float64(cap(n.pendingNotifications)))
-	n.notificationsQueueSize.Collect(ch)
+	n.notificationsQueueLength.Set(float64(len(n.pendingNotifications)))
+	ch <- n.notificationsQueueLength
+	ch <- n.notificationsQueueCapacity
 }
--- a/retrieval/target.go
+++ b/retrieval/target.go
@ -35,12 +35,12 @@ const (
 	ScrapeHealthMetricName clientmodel.LabelValue = "up"

 	// Constants for instrumentation.
-	address = "instance"
-	alive   = "alive"
-	failure = "failure"
-	outcome = "outcome"
-	state   = "state"
-	success = "success"
+	namespace = "prometheus"
+	job       = "target_job"
+	instance  = "target_instance"
+	failure   = "failure"
+	outcome   = "outcome"
+	success   = "success"
 )

 var (
@ -48,11 +48,12 @@ var (

 	targetOperationLatencies = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_target_operation_latency_ms",
-			Help:       "The latencies for various target operations.",
+			Namespace:  namespace,
+			Name:       "target_operation_latency_milliseconds",
+			Help:       "The latencies for target operations.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
-		[]string{address, outcome},
+		[]string{job, instance, outcome},
 	)
 )

@ -196,7 +197,11 @@ const acceptHeader = `application/vnd.google.protobuf;proto=io.prometheus.client
 func (t *target) scrape(timestamp clientmodel.Timestamp, ingester extraction.Ingester) (err error) {
 	defer func(start time.Time) {
 		ms := float64(time.Since(start)) / float64(time.Millisecond)
-		labels := prometheus.Labels{address: t.Address(), outcome: success}
+		labels := prometheus.Labels{
+			job:      string(t.baseLabels[clientmodel.JobLabel]),
+			instance: t.Address(),
+			outcome:  success,
+		}
 		if err != nil {
 			labels[outcome] = failure
 		}
--- a/retrieval/target_provider.go
+++ b/retrieval/target_provider.go
@ -35,8 +35,9 @@ const resolvConf = "/etc/resolv.conf"
 var (
 	dnsSDLookupsCount = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "prometheus_dns_sd_lookups_total",
-			Help: "The number of DNS-SD lookup successes/failures per pool.",
+			Namespace: namespace,
+			Name:      "dns_sd_lookups_total",
+			Help:      "The number of DNS-SD lookup successes/failures per pool.",
 		},
 		[]string{outcome},
 	)
--- a/retrieval/targetpool.go
+++ b/retrieval/targetpool.go
@ -32,8 +32,9 @@ const (
 var (
 	retrievalDurations = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_targetpool_duration_ms",
-			Help:       "The durations for each TargetPool to retrieve state from all included entities.",
+			Namespace:  namespace,
+			Name:       "targetpool_retrieve_time_milliseconds",
+			Help:       "The time needed for each TargetPool to retrieve state from all included entities.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{intervalKey},
--- a/rules/manager/manager.go
+++ b/rules/manager/manager.go
@ -33,7 +33,8 @@ import (

 // Constants for instrumentation.
 const (
-	intervalLabel     = "interval"
+	namespace = "prometheus"
+
 	ruleTypeLabel     = "rule_type"
 	alertingRuleType  = "alerting"
 	recordingRuleType = "recording"
@ -42,19 +43,18 @@ const (
 var (
 	evalDuration = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name: "prometheus_rule_evaluation_duration_ms",
-			Help: "The duration for a rule to execute.",
+			Namespace: namespace,
+			Name:      "rule_evaluation_duration_milliseconds",
+			Help:      "The duration for a rule to execute.",
 		},
 		[]string{ruleTypeLabel},
 	)
-	iterationDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
-			Name:       "prometheus_evaluator_duration_ms",
-			Help:       "The duration for each evaluation pool to execute.",
-			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
-		},
-		[]string{intervalLabel},
-	)
+	iterationDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  namespace,
+		Name:       "evaluator_duration_milliseconds",
+		Help:       "The duration for all evaluations to execute.",
+		Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
+	})
 )

 func init() {
@ -124,7 +124,7 @@ func (m *ruleManager) Run() {
 		case <-ticker.C:
 			start := time.Now()
 			m.runIteration(m.results)
-			iterationDuration.WithLabelValues(m.interval.String()).Observe(float64(time.Since(start) / time.Millisecond))
+			iterationDuration.Observe(float64(time.Since(start) / time.Millisecond))
 		case <-m.done:
 			glog.Info("rules.Rule manager exiting...")
 			return
--- a/storage/metric/tiered/curator.go
+++ b/storage/metric/tiered/curator.go
@ -47,16 +47,18 @@ const (
 var (
 	curationDurations = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_curation_durations_ms",
-			Help:       "Histogram of time spent in curation (ms).",
+			Namespace:  namespace,
+			Name:       "curation_durations_milliseconds",
+			Help:       "Histogram of time spent in curation.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{cutOff, processorName, result},
 	)
 	curationFilterOperations = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "prometheus_curation_filter_operations_total",
-			Help: "The number of curation filter operations completed.",
+			Namespace: namespace,
+			Name:      "curation_filter_operations_total",
+			Help:      "The number of curation filter operations completed.",
 		},
 		[]string{cutOff, processorName, result},
 	)
--- a/storage/metric/tiered/tiered.go
+++ b/storage/metric/tiered/tiered.go
@ -33,6 +33,8 @@ import (

 // Constants for instrumentation.
 const (
+	namespace = "prometheus"
+
 	operation = "operation"
 	success   = "success"
 	failure   = "failure"
@ -51,24 +53,22 @@ const (
 	queue          = "queue"
 	appendToDisk   = "append_to_disk"
 	viewGeneration = "view_generation"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )

 var (
 	storageLatency = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_metric_disk_latency_microseconds",
-			Help:       "Latency for metric disk operations in microseconds.",
+			Namespace:  namespace,
+			Name:       "metric_disk_latency_milliseconds",
+			Help:       "Latency for metric disk operations (includes any storage drive even if it is not strictly a disk, e.g. SSD).",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{operation, result},
 	)
 	storedSamplesCount = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_stored_samples_total",
-		Help: "The number of samples that have been stored.",
+		Namespace: namespace,
+		Name:      "stored_samples_total",
+		Help:      "The number of samples that have been stored.",
 	})
 )

@ -145,7 +145,8 @@ type TieredStorage struct {
 	dtoSampleKeys *dtoSampleKeyList
 	sampleKeys    *sampleKeyList

-	queueSizes *prometheus.GaugeVec
+	queueLength   *prometheus.GaugeVec
+	queueCapacity *prometheus.GaugeVec
 }

 // viewJob encapsulates a request to extract sample values from the datastore.
@ -159,10 +160,9 @@ type viewJob struct {

 const (
 	tieredMemorySemaphores = 5
+	watermarkCacheLimit    = 1024 * 1024
 )

-const watermarkCacheLimit = 1024 * 1024
-
 // NewTieredStorage returns a TieredStorage object ready to use.
 func NewTieredStorage(
 	appendToDiskQueueDepth,
@ -208,14 +208,25 @@ func NewTieredStorage(
 		dtoSampleKeys: newDtoSampleKeyList(10),
 		sampleKeys:    newSampleKeyList(10),

-		queueSizes: prometheus.NewGaugeVec(
+		queueLength: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
-				Name: "prometheus_storage_queue_sizes_total",
-				Help: "The various sizes and capacities of the storage queues.",
+				Namespace: namespace,
+				Name:      "storage_queue_length",
+				Help:      "The number of items in the storage queues.",
 			},
-			[]string{queue, facet},
+			[]string{queue},
+		),
+		queueCapacity: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "storage_queue_capacity",
+				Help:      "The capacity of the storage queues.",
+			},
+			[]string{queue},
 		),
 	}
+	s.queueCapacity.WithLabelValues(appendToDisk).Set(float64(appendToDiskQueueDepth))
+	s.queueCapacity.WithLabelValues(viewGeneration).Set(float64(viewQueueDepth))

 	for i := 0; i < tieredMemorySemaphores; i++ {
 		s.memorySemaphore <- true
@ -444,13 +455,13 @@ func (t *TieredStorage) renderView(viewJob viewJob) {
 			storageLatency.With(
 				prometheus.Labels{operation: renderView, result: success},
 			).Observe(
-				float64(time.Since(begin) / time.Microsecond),
+				float64(time.Since(begin) / time.Millisecond),
 			)
 		} else {
 			storageLatency.With(
 				prometheus.Labels{operation: renderView, result: failure},
 			).Observe(
-				float64(time.Since(begin) / time.Microsecond),
+				float64(time.Since(begin) / time.Millisecond),
 			)
 		}
 	}()
@ -788,23 +799,15 @@ func (t *TieredStorage) GetMetricForFingerprint(f *clientmodel.Fingerprint) (cli

 // Describe implements prometheus.Collector.
 func (t *TieredStorage) Describe(ch chan<- *prometheus.Desc) {
-	t.queueSizes.Describe(ch)
+	t.queueLength.Describe(ch)
+	t.queueCapacity.Describe(ch)
 }

 // Collect implements prometheus.Collector.
 func (t *TieredStorage) Collect(ch chan<- prometheus.Metric) {
-	t.queueSizes.With(prometheus.Labels{
-		queue: appendToDisk, facet: occupancy,
-	}).Set(float64(len(t.appendToDiskQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: appendToDisk, facet: capacity,
-	}).Set(float64(cap(t.appendToDiskQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: viewGeneration, facet: occupancy,
-	}).Set(float64(len(t.ViewQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: viewGeneration, facet: capacity,
-	}).Set(float64(cap(t.ViewQueue)))
-
-	t.queueSizes.Collect(ch)
+	t.queueLength.WithLabelValues(appendToDisk).Set(float64(len(t.appendToDiskQueue)))
+	t.queueLength.WithLabelValues(viewGeneration).Set(float64(len(t.ViewQueue)))
+
+	t.queueLength.Collect(ch)
+	t.queueCapacity.Collect(ch)
 }
--- a/storage/remote/queue_manager.go
+++ b/storage/remote/queue_manager.go
@ -34,14 +34,13 @@ const (

 // String constants for instrumentation.
 const (
+	namespace = "prometheus"
+	subsystem = "remote_tsdb"
+
 	result  = "result"
 	success = "success"
 	failure = "failure"
 	dropped = "dropped"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )

 // TSDBClient defines an interface for sending a batch of samples to an
@ -59,9 +58,10 @@ type TSDBQueueManager struct {
 	sendSemaphore  chan bool
 	drained        chan bool

-	samplesCount *prometheus.CounterVec
-	sendLatency  *prometheus.SummaryVec
-	queueSize    *prometheus.GaugeVec
+	samplesCount  *prometheus.CounterVec
+	sendLatency   *prometheus.SummaryVec
+	queueLength   prometheus.Gauge
+	queueCapacity prometheus.Metric
 }

 // NewTSDBQueueManager builds a new TSDBQueueManager.
@ -74,24 +74,36 @@ func NewTSDBQueueManager(tsdb TSDBClient, queueCapacity int) *TSDBQueueManager {

 		samplesCount: prometheus.NewCounterVec(
 			prometheus.CounterOpts{
-				Name: "prometheus_remote_tsdb_sent_samples_total",
-				Help: "Total number of samples processed to be sent to remote TSDB.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "sent_samples_total",
+				Help:      "Total number of processed samples to be sent to remote TSDB.",
 			},
 			[]string{result},
 		),
 		sendLatency: prometheus.NewSummaryVec(
 			prometheus.SummaryOpts{
-				Name: "prometheus_remote_tsdb_latency_ms",
-				Help: "Latency quantiles for sending samples to the remote TSDB in milliseconds.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "sent_latency_milliseconds",
+				Help:      "Latency quantiles for sending samples to the remote TSDB.",
 			},
 			[]string{result},
 		),
-		queueSize: prometheus.NewGaugeVec(
-			prometheus.GaugeOpts{
-				Name: "prometheus_remote_tsdb_queue_size_total",
-				Help: "The size and capacity of the queue of samples to be sent to the remote TSDB.",
-			},
-			[]string{facet},
+		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "queue_length",
+			Help:      "The number of processed samples queued to be sent to the remote TSDB.",
+		}),
+		queueCapacity: prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
+				"The capacity of the queue of samples to be sent to the remote TSDB.",
+				nil, nil,
+			),
+			prometheus.GaugeValue,
+			float64(queueCapacity),
 		),
 	}
 }
@ -122,16 +134,17 @@ func (t *TSDBQueueManager) Close() {
 func (t *TSDBQueueManager) Describe(ch chan<- *prometheus.Desc) {
 	t.samplesCount.Describe(ch)
 	t.sendLatency.Describe(ch)
-	t.queueSize.Describe(ch)
+	ch <- t.queueLength.Desc()
+	ch <- t.queueCapacity.Desc()
 }

 // Collect implements prometheus.Collector.
 func (t *TSDBQueueManager) Collect(ch chan<- prometheus.Metric) {
 	t.samplesCount.Collect(ch)
 	t.sendLatency.Collect(ch)
-	t.queueSize.WithLabelValues(occupancy).Set(float64(len(t.queue)))
-	t.queueSize.WithLabelValues(capacity).Set(float64(cap(t.queue)))
-	t.queueSize.Collect(ch)
+	t.queueLength.Set(float64(len(t.queue)))
+	ch <- t.queueLength
+	ch <- t.queueCapacity
 }

 func (t *TSDBQueueManager) sendSamples(s clientmodel.Samples) {