diff --git a/notification/notification.go b/notification/notification.go
index 21012ba95..cf9d285ef 100644
--- a/notification/notification.go
+++ b/notification/notification.go
@@ -37,14 +37,13 @@ const (
 
 // String constants for instrumentation.
 const (
+	namespace = "prometheus"
+	subsystem = "notifications"
+
 	result  = "result"
 	success = "success"
 	failure = "failure"
 	dropped = "dropped"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )
 
 var (
@@ -86,8 +85,9 @@ type NotificationHandler struct {
 	// HTTP client with custom timeout settings.
 	httpClient httpPoster
 
-	notificationLatency    *prometheus.SummaryVec
-	notificationsQueueSize *prometheus.GaugeVec
+	notificationLatency        *prometheus.SummaryVec
+	notificationsQueueLength   prometheus.Gauge
+	notificationsQueueCapacity prometheus.Metric
 }
 
 // Construct a new NotificationHandler.
@@ -99,17 +99,27 @@ func NewNotificationHandler(alertmanagerUrl string, notificationReqs <-chan Noti
 		notificationLatency: prometheus.NewSummaryVec(
 			prometheus.SummaryOpts{
-				Name: "prometheus_notifications_latency_ms",
-				Help: "Latency quantiles for sending alert notifications in milliseconds.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "latency_milliseconds",
+				Help:      "Latency quantiles for sending alert notifications.",
 			},
 			[]string{result},
 		),
-		notificationsQueueSize: prometheus.NewGaugeVec(
-			prometheus.GaugeOpts{
-				Name: "prometheus_notifications_queue_size_total",
-				Help: "The size and capacity of the alert notification queue.",
-			},
-			[]string{facet},
+		notificationsQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "queue_length",
+			Help:      "The number of alert notifications in the queue.",
+		}),
+		notificationsQueueCapacity: prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
+				"The capacity of the alert notifications queue.",
+				nil, nil,
+			),
+			prometheus.GaugeValue,
+			float64(cap(notificationReqs)),
 		),
 	}
 }
@@ -180,13 +190,14 @@ func (n *NotificationHandler) Run() {
 // Describe implements prometheus.Collector.
 func (n *NotificationHandler) Describe(ch chan<- *prometheus.Desc) {
 	n.notificationLatency.Describe(ch)
-	n.notificationsQueueSize.Describe(ch)
+	ch <- n.notificationsQueueLength.Desc()
+	ch <- n.notificationsQueueCapacity.Desc()
 }
 
 // Collect implements prometheus.Collector.
 func (n *NotificationHandler) Collect(ch chan<- prometheus.Metric) {
 	n.notificationLatency.Collect(ch)
-	n.notificationsQueueSize.WithLabelValues(occupancy).Set(float64(len(n.pendingNotifications)))
-	n.notificationsQueueSize.WithLabelValues(capacity).Set(float64(cap(n.pendingNotifications)))
-	n.notificationsQueueSize.Collect(ch)
+	n.notificationsQueueLength.Set(float64(len(n.pendingNotifications)))
+	ch <- n.notificationsQueueLength
+	ch <- n.notificationsQueueCapacity
 }
diff --git a/retrieval/target.go b/retrieval/target.go
index 7a761593d..976f8f871 100644
--- a/retrieval/target.go
+++ b/retrieval/target.go
@@ -35,12 +35,12 @@ const (
 	ScrapeHealthMetricName clientmodel.LabelValue = "up"
 
 	// Constants for instrumentation.
-	address = "instance"
-	alive   = "alive"
-	failure = "failure"
-	outcome = "outcome"
-	state   = "state"
-	success = "success"
+	namespace = "prometheus"
+	job       = "target_job"
+	instance  = "target_instance"
+	failure   = "failure"
+	outcome   = "outcome"
+	success   = "success"
 )
 
 var (
@@ -48,11 +48,12 @@
 	targetOperationLatencies = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_target_operation_latency_ms",
-			Help:       "The latencies for various target operations.",
+			Namespace:  namespace,
+			Name:       "target_operation_latency_milliseconds",
+			Help:       "The latencies for target operations.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
-		[]string{address, outcome},
+		[]string{job, instance, outcome},
 	)
 )
@@ -196,7 +197,11 @@ const acceptHeader = `application/vnd.google.protobuf;proto=io.prometheus.client
 func (t *target) scrape(timestamp clientmodel.Timestamp, ingester extraction.Ingester) (err error) {
 	defer func(start time.Time) {
 		ms := float64(time.Since(start)) / float64(time.Millisecond)
-		labels := prometheus.Labels{address: t.Address(), outcome: success}
+		labels := prometheus.Labels{
+			job:      string(t.baseLabels[clientmodel.JobLabel]),
+			instance: t.Address(),
+			outcome:  success,
+		}
 		if err != nil {
 			labels[outcome] = failure
 		}
diff --git a/retrieval/target_provider.go b/retrieval/target_provider.go
index 0b19b39c8..15cc282ae 100644
--- a/retrieval/target_provider.go
+++ b/retrieval/target_provider.go
@@ -35,8 +35,9 @@ const resolvConf = "/etc/resolv.conf"
 var (
 	dnsSDLookupsCount = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "prometheus_dns_sd_lookups_total",
-			Help: "The number of DNS-SD lookup successes/failures per pool.",
+			Namespace: namespace,
+			Name:      "dns_sd_lookups_total",
+			Help:      "The number of DNS-SD lookup successes/failures per pool.",
 		},
 		[]string{outcome},
 	)
diff --git a/retrieval/targetpool.go b/retrieval/targetpool.go
index f1a87d0e1..bce49be4c 100644
--- a/retrieval/targetpool.go
+++ b/retrieval/targetpool.go
@@ -32,8 +32,9 @@ const (
 var (
 	retrievalDurations = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_targetpool_duration_ms",
-			Help:       "The durations for each TargetPool to retrieve state from all included entities.",
+			Namespace:  namespace,
+			Name:       "targetpool_retrieve_time_milliseconds",
+			Help:       "The time needed for each TargetPool to retrieve state from all included entities.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{intervalKey},
diff --git a/rules/manager/manager.go b/rules/manager/manager.go
index b69546d8a..e1917454d 100644
--- a/rules/manager/manager.go
+++ b/rules/manager/manager.go
@@ -33,7 +33,8 @@ import (
 
 // Constants for instrumentation.
 const (
-	intervalLabel = "interval"
+	namespace = "prometheus"
+
 	ruleTypeLabel     = "rule_type"
 	alertingRuleType  = "alerting"
 	recordingRuleType = "recording"
@@ -42,19 +43,18 @@ const (
 var (
 	evalDuration = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name: "prometheus_rule_evaluation_duration_ms",
-			Help: "The duration for a rule to execute.",
+			Namespace: namespace,
+			Name:      "rule_evaluation_duration_milliseconds",
+			Help:      "The duration for a rule to execute.",
 		},
 		[]string{ruleTypeLabel},
 	)
-	iterationDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
-			Name:       "prometheus_evaluator_duration_ms",
-			Help:       "The duration for each evaluation pool to execute.",
-			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
-		},
-		[]string{intervalLabel},
-	)
+	iterationDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+		Namespace:  namespace,
+		Name:       "evaluator_duration_milliseconds",
+		Help:       "The duration for all evaluations to execute.",
+		Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
+	})
 )
 
 func init() {
@@ -124,7 +124,7 @@ func (m *ruleManager) Run() {
 		case <-ticker.C:
 			start := time.Now()
 			m.runIteration(m.results)
-			iterationDuration.WithLabelValues(m.interval.String()).Observe(float64(time.Since(start) / time.Millisecond))
+			iterationDuration.Observe(float64(time.Since(start) / time.Millisecond))
 		case <-m.done:
 			glog.Info("rules.Rule manager exiting...")
 			return
diff --git a/storage/metric/tiered/curator.go b/storage/metric/tiered/curator.go
index ba2c63b3a..ace1d58eb 100644
--- a/storage/metric/tiered/curator.go
+++ b/storage/metric/tiered/curator.go
@@ -47,16 +47,18 @@ const (
 var (
 	curationDurations = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_curation_durations_ms",
-			Help:       "Histogram of time spent in curation (ms).",
+			Namespace:  namespace,
+			Name:       "curation_durations_milliseconds",
+			Help:       "Histogram of time spent in curation.",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{cutOff, processorName, result},
 	)
 	curationFilterOperations = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "prometheus_curation_filter_operations_total",
-			Help: "The number of curation filter operations completed.",
+			Namespace: namespace,
+			Name:      "curation_filter_operations_total",
+			Help:      "The number of curation filter operations completed.",
 		},
 		[]string{cutOff, processorName, result},
 	)
diff --git a/storage/metric/tiered/tiered.go b/storage/metric/tiered/tiered.go
index cc02e8480..cc5e4fc58 100644
--- a/storage/metric/tiered/tiered.go
+++ b/storage/metric/tiered/tiered.go
@@ -33,6 +33,8 @@ import (
 
 // Constants for instrumentation.
 const (
+	namespace = "prometheus"
+
 	operation = "operation"
 	success   = "success"
 	failure   = "failure"
@@ -51,24 +53,22 @@ const (
 	queue          = "queue"
 	appendToDisk   = "append_to_disk"
 	viewGeneration = "view_generation"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )
 
 var (
 	storageLatency = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
-			Name:       "prometheus_metric_disk_latency_microseconds",
-			Help:       "Latency for metric disk operations in microseconds.",
+			Namespace:  namespace,
+			Name:       "metric_disk_latency_milliseconds",
+			Help:       "Latency for metric disk operations (includes any storage drive even if it is not strictly a disk, e.g. SSD).",
 			Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
 		},
 		[]string{operation, result},
 	)
 	storedSamplesCount = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_stored_samples_total",
-		Help: "The number of samples that have been stored.",
+		Namespace: namespace,
+		Name:      "stored_samples_total",
+		Help:      "The number of samples that have been stored.",
 	})
 )
@@ -145,7 +145,8 @@ type TieredStorage struct {
 	dtoSampleKeys *dtoSampleKeyList
 	sampleKeys    *sampleKeyList
 
-	queueSizes *prometheus.GaugeVec
+	queueLength   *prometheus.GaugeVec
+	queueCapacity *prometheus.GaugeVec
 }
 
 // viewJob encapsulates a request to extract sample values from the datastore.
@@ -159,10 +160,9 @@ type viewJob struct {
 
 const (
 	tieredMemorySemaphores = 5
+	watermarkCacheLimit    = 1024 * 1024
 )
 
-const watermarkCacheLimit = 1024 * 1024
-
 // NewTieredStorage returns a TieredStorage object ready to use.
 func NewTieredStorage(
 	appendToDiskQueueDepth,
@@ -208,14 +208,25 @@ func NewTieredStorage(
 		dtoSampleKeys: newDtoSampleKeyList(10),
 		sampleKeys:    newSampleKeyList(10),
 
-		queueSizes: prometheus.NewGaugeVec(
+		queueLength: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
-				Name: "prometheus_storage_queue_sizes_total",
-				Help: "The various sizes and capacities of the storage queues.",
+				Namespace: namespace,
+				Name:      "storage_queue_length",
+				Help:      "The number of items in the storage queues.",
 			},
-			[]string{queue, facet},
+			[]string{queue},
+		),
+		queueCapacity: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "storage_queue_capacity",
+				Help:      "The capacity of the storage queues.",
+			},
+			[]string{queue},
 		),
 	}
+	s.queueCapacity.WithLabelValues(appendToDisk).Set(float64(appendToDiskQueueDepth))
+	s.queueCapacity.WithLabelValues(viewGeneration).Set(float64(viewQueueDepth))
 
 	for i := 0; i < tieredMemorySemaphores; i++ {
 		s.memorySemaphore <- true
@@ -444,13 +455,13 @@ func (t *TieredStorage) renderView(viewJob viewJob) {
 			storageLatency.With(
 				prometheus.Labels{operation: renderView, result: success},
 			).Observe(
-				float64(time.Since(begin) / time.Microsecond),
+				float64(time.Since(begin) / time.Millisecond),
 			)
 		} else {
 			storageLatency.With(
 				prometheus.Labels{operation: renderView, result: failure},
 			).Observe(
-				float64(time.Since(begin) / time.Microsecond),
+				float64(time.Since(begin) / time.Millisecond),
 			)
 		}
 	}()
@@ -788,23 +799,15 @@ func (t *TieredStorage) GetMetricForFingerprint(f *clientmodel.Fingerprint) (cli
 
 // Describe implements prometheus.Collector.
 func (t *TieredStorage) Describe(ch chan<- *prometheus.Desc) {
-	t.queueSizes.Describe(ch)
+	t.queueLength.Describe(ch)
+	t.queueCapacity.Describe(ch)
 }
 
 // Collect implements prometheus.Collector.
 func (t *TieredStorage) Collect(ch chan<- prometheus.Metric) {
-	t.queueSizes.With(prometheus.Labels{
-		queue: appendToDisk, facet: occupancy,
-	}).Set(float64(len(t.appendToDiskQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: appendToDisk, facet: capacity,
-	}).Set(float64(cap(t.appendToDiskQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: viewGeneration, facet: occupancy,
-	}).Set(float64(len(t.ViewQueue)))
-	t.queueSizes.With(prometheus.Labels{
-		queue: viewGeneration, facet: capacity,
-	}).Set(float64(cap(t.ViewQueue)))
-
-	t.queueSizes.Collect(ch)
+	t.queueLength.WithLabelValues(appendToDisk).Set(float64(len(t.appendToDiskQueue)))
+	t.queueLength.WithLabelValues(viewGeneration).Set(float64(len(t.ViewQueue)))
+
+	t.queueLength.Collect(ch)
+	t.queueCapacity.Collect(ch)
 }
diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go
index 6ef4f3c04..50e083765 100644
--- a/storage/remote/queue_manager.go
+++ b/storage/remote/queue_manager.go
@@ -34,14 +34,13 @@ const (
 
 // String constants for instrumentation.
 const (
+	namespace = "prometheus"
+	subsystem = "remote_tsdb"
+
 	result  = "result"
 	success = "success"
 	failure = "failure"
 	dropped = "dropped"
-
-	facet     = "facet"
-	occupancy = "occupancy"
-	capacity  = "capacity"
 )
 
 // TSDBClient defines an interface for sending a batch of samples to an
@@ -59,9 +58,10 @@ type TSDBQueueManager struct {
 	sendSemaphore chan bool
 	drained       chan bool
 
-	samplesCount *prometheus.CounterVec
-	sendLatency  *prometheus.SummaryVec
-	queueSize    *prometheus.GaugeVec
+	samplesCount  *prometheus.CounterVec
+	sendLatency   *prometheus.SummaryVec
+	queueLength   prometheus.Gauge
+	queueCapacity prometheus.Metric
 }
 
 // NewTSDBQueueManager builds a new TSDBQueueManager.
@@ -74,24 +74,36 @@ func NewTSDBQueueManager(tsdb TSDBClient, queueCapacity int) *TSDBQueueManager {
 		samplesCount: prometheus.NewCounterVec(
 			prometheus.CounterOpts{
-				Name: "prometheus_remote_tsdb_sent_samples_total",
-				Help: "Total number of samples processed to be sent to remote TSDB.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "sent_samples_total",
+				Help:      "Total number of processed samples to be sent to remote TSDB.",
 			},
 			[]string{result},
 		),
 		sendLatency: prometheus.NewSummaryVec(
 			prometheus.SummaryOpts{
-				Name: "prometheus_remote_tsdb_latency_ms",
-				Help: "Latency quantiles for sending samples to the remote TSDB in milliseconds.",
+				Namespace: namespace,
+				Subsystem: subsystem,
+				Name:      "sent_latency_milliseconds",
+				Help:      "Latency quantiles for sending samples to the remote TSDB.",
 			},
 			[]string{result},
 		),
-		queueSize: prometheus.NewGaugeVec(
-			prometheus.GaugeOpts{
-				Name: "prometheus_remote_tsdb_queue_size_total",
-				Help: "The size and capacity of the queue of samples to be sent to the remote TSDB.",
-			},
-			[]string{facet},
+		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "queue_length",
+			Help:      "The number of processed samples queued to be sent to the remote TSDB.",
+		}),
+		queueCapacity: prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
+				"The capacity of the queue of samples to be sent to the remote TSDB.",
+				nil, nil,
+			),
+			prometheus.GaugeValue,
+			float64(queueCapacity),
 		),
 	}
 }
@@ -122,16 +134,17 @@ func (t *TSDBQueueManager) Close() {
 func (t *TSDBQueueManager) Describe(ch chan<- *prometheus.Desc) {
 	t.samplesCount.Describe(ch)
 	t.sendLatency.Describe(ch)
-	t.queueSize.Describe(ch)
+	ch <- t.queueLength.Desc()
+	ch <- t.queueCapacity.Desc()
 }
 
 // Collect implements prometheus.Collector.
 func (t *TSDBQueueManager) Collect(ch chan<- prometheus.Metric) {
 	t.samplesCount.Collect(ch)
 	t.sendLatency.Collect(ch)
-	t.queueSize.WithLabelValues(occupancy).Set(float64(len(t.queue)))
-	t.queueSize.WithLabelValues(capacity).Set(float64(cap(t.queue)))
-	t.queueSize.Collect(ch)
+	t.queueLength.Set(float64(len(t.queue)))
+	ch <- t.queueLength
+	ch <- t.queueCapacity
 }
 
 func (t *TSDBQueueManager) sendSamples(s clientmodel.Samples) {
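A minimal, self-contained sketch of the queue length/capacity pattern this patch applies in `notification`, `storage/metric/tiered`, and `storage/remote`: the capacity is fixed at construction time, so it is exported once as a constant metric via `MustNewConstMetric`, while the current length is an ordinary gauge set at scrape time. The `workQueue` collector, the metric names, and the `example` subsystem below are hypothetical illustrations; only the client_golang calls mirror the ones used in the diff.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

const (
	namespace = "prometheus"
	subsystem = "example" // hypothetical subsystem, not part of the patch
)

// workQueue is a hypothetical queue that exposes its length and capacity.
type workQueue struct {
	queue chan int

	queueLength   prometheus.Gauge
	queueCapacity prometheus.Metric
}

func newWorkQueue(capacity int) *workQueue {
	return &workQueue{
		queue: make(chan int, capacity),
		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "queue_length",
			Help:      "The number of items in the queue.",
		}),
		// The capacity never changes, so it is built once as a const metric.
		queueCapacity: prometheus.MustNewConstMetric(
			prometheus.NewDesc(
				prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
				"The capacity of the queue.",
				nil, nil,
			),
			prometheus.GaugeValue,
			float64(capacity),
		),
	}
}

// Describe implements prometheus.Collector.
func (q *workQueue) Describe(ch chan<- *prometheus.Desc) {
	ch <- q.queueLength.Desc()
	ch <- q.queueCapacity.Desc()
}

// Collect implements prometheus.Collector.
func (q *workQueue) Collect(ch chan<- prometheus.Metric) {
	// The length is sampled at scrape time rather than updated on every
	// enqueue/dequeue, matching how the patch sets gauges in Collect.
	q.queueLength.Set(float64(len(q.queue)))
	ch <- q.queueLength
	ch <- q.queueCapacity
}

func main() {
	q := newWorkQueue(1024)
	prometheus.MustRegister(q)
	fmt.Println("collector registered")
}
```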