Browse Source

Make metrics exported by the Prometheus server itself more consistent.

- Always spell out the time unit (e.g. milliseconds instead of ms).

- Remove "_total" from the names of metrics that are not counters.

- Make use of the "Namespace" and "Subsystem" fields in the options.

- Remove the "capacity" facet from all metrics about channels/queues.
  The capacities are all fixed via command line flags and will never
  change during the runtime of a process. Also, they should not be
  part of the same metric family as the queue lengths. I have added
  separate metrics for the capacity of queues as a convenience. (They
  will never change and are only set once.)

- I left the "disk" part of "metric_disk_latency_microseconds"
  unchanged (only the unit changes; the metric is now called
  "metric_disk_latency_milliseconds"), although that metric measures
  the latency of the storage device, even if it is not a spinning
  disk. "SSD" is read by many as "solid state disk", so it's not too
  far off. (It should be "solid state drive", of course, but
  "metric_drive_latency_milliseconds" would probably be confusing.)

- Brian suggested not mixing "failure" and "success" outcomes in the
  same metric family (distinguished by labels). For now, I left it as
  it is. We are touching on a bigger issue here, especially as other
  parts of the Prometheus ecosystem follow the same principle. We
  still need to reach an agreement on this and then change things
  consistently everywhere.

Change-Id: If799458b450d18f78500f05990301c12525197d3
changes/13/313/5
Bjoern Rabenstein 10 years ago
parent
commit
24ece38f7c
  1. 47
      notification/notification.go
  2. 25
      retrieval/target.go
  3. 5
      retrieval/target_provider.go
  4. 5
      retrieval/targetpool.go
  5. 24
      rules/manager/manager.go
  6. 10
      storage/metric/tiered/curator.go
  7. 67
      storage/metric/tiered/tiered.go
  8. 55
      storage/remote/queue_manager.go

47
notification/notification.go

@ -37,14 +37,13 @@ const (
// String constants for instrumentation.
const (
namespace = "prometheus"
subsystem = "notifications"
result = "result"
success = "success"
failure = "failure"
dropped = "dropped"
facet = "facet"
occupancy = "occupancy"
capacity = "capacity"
)
var (
@ -86,8 +85,9 @@ type NotificationHandler struct {
// HTTP client with custom timeout settings.
httpClient httpPoster
notificationLatency *prometheus.SummaryVec
notificationsQueueSize *prometheus.GaugeVec
notificationLatency *prometheus.SummaryVec
notificationsQueueLength prometheus.Gauge
notificationsQueueCapacity prometheus.Metric
}
// Construct a new NotificationHandler.
@ -99,17 +99,27 @@ func NewNotificationHandler(alertmanagerUrl string, notificationReqs <-chan Noti
notificationLatency: prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_notifications_latency_ms",
Help: "Latency quantiles for sending alert notifications in milliseconds.",
Namespace: namespace,
Subsystem: subsystem,
Name: "latency_milliseconds",
Help: "Latency quantiles for sending alert notifications.",
},
[]string{result},
),
notificationsQueueSize: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "prometheus_notifications_queue_size_total",
Help: "The size and capacity of the alert notification queue.",
},
[]string{facet},
notificationsQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "queue_length",
Help: "The number of alert notifications in the queue.",
}),
notificationsQueueCapacity: prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
"The capacity of the alert notifications queue.",
nil, nil,
),
prometheus.GaugeValue,
float64(cap(notificationReqs)),
),
}
}
@ -180,13 +190,14 @@ func (n *NotificationHandler) Run() {
// Describe implements prometheus.Collector.
func (n *NotificationHandler) Describe(ch chan<- *prometheus.Desc) {
n.notificationLatency.Describe(ch)
n.notificationsQueueSize.Describe(ch)
ch <- n.notificationsQueueLength.Desc()
ch <- n.notificationsQueueCapacity.Desc()
}
// Collect implements prometheus.Collector.
func (n *NotificationHandler) Collect(ch chan<- prometheus.Metric) {
n.notificationLatency.Collect(ch)
n.notificationsQueueSize.WithLabelValues(occupancy).Set(float64(len(n.pendingNotifications)))
n.notificationsQueueSize.WithLabelValues(capacity).Set(float64(cap(n.pendingNotifications)))
n.notificationsQueueSize.Collect(ch)
n.notificationsQueueLength.Set(float64(len(n.pendingNotifications)))
ch <- n.notificationsQueueLength
ch <- n.notificationsQueueCapacity
}

25
retrieval/target.go

@ -35,12 +35,12 @@ const (
ScrapeHealthMetricName clientmodel.LabelValue = "up"
// Constants for instrumentation.
address = "instance"
alive = "alive"
failure = "failure"
outcome = "outcome"
state = "state"
success = "success"
namespace = "prometheus"
job = "target_job"
instance = "target_instance"
failure = "failure"
outcome = "outcome"
success = "success"
)
var (
@ -48,11 +48,12 @@ var (
targetOperationLatencies = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_target_operation_latency_ms",
Help: "The latencies for various target operations.",
Namespace: namespace,
Name: "target_operation_latency_milliseconds",
Help: "The latencies for target operations.",
Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
},
[]string{address, outcome},
[]string{job, instance, outcome},
)
)
@ -196,7 +197,11 @@ const acceptHeader = `application/vnd.google.protobuf;proto=io.prometheus.client
func (t *target) scrape(timestamp clientmodel.Timestamp, ingester extraction.Ingester) (err error) {
defer func(start time.Time) {
ms := float64(time.Since(start)) / float64(time.Millisecond)
labels := prometheus.Labels{address: t.Address(), outcome: success}
labels := prometheus.Labels{
job: string(t.baseLabels[clientmodel.JobLabel]),
instance: t.Address(),
outcome: success,
}
if err != nil {
labels[outcome] = failure
}

5
retrieval/target_provider.go

@ -35,8 +35,9 @@ const resolvConf = "/etc/resolv.conf"
var (
dnsSDLookupsCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "prometheus_dns_sd_lookups_total",
Help: "The number of DNS-SD lookup successes/failures per pool.",
Namespace: namespace,
Name: "dns_sd_lookups_total",
Help: "The number of DNS-SD lookup successes/failures per pool.",
},
[]string{outcome},
)

5
retrieval/targetpool.go

@ -32,8 +32,9 @@ const (
var (
retrievalDurations = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_targetpool_duration_ms",
Help: "The durations for each TargetPool to retrieve state from all included entities.",
Namespace: namespace,
Name: "targetpool_retrieve_time_milliseconds",
Help: "The time needed for each TargetPool to retrieve state from all included entities.",
Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
},
[]string{intervalKey},

24
rules/manager/manager.go

@ -33,7 +33,8 @@ import (
// Constants for instrumentation.
const (
intervalLabel = "interval"
namespace = "prometheus"
ruleTypeLabel = "rule_type"
alertingRuleType = "alerting"
recordingRuleType = "recording"
@ -42,19 +43,18 @@ const (
var (
evalDuration = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_rule_evaluation_duration_ms",
Help: "The duration for a rule to execute.",
Namespace: namespace,
Name: "rule_evaluation_duration_milliseconds",
Help: "The duration for a rule to execute.",
},
[]string{ruleTypeLabel},
)
iterationDuration = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_evaluator_duration_ms",
Help: "The duration for each evaluation pool to execute.",
Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
},
[]string{intervalLabel},
)
iterationDuration = prometheus.NewSummary(prometheus.SummaryOpts{
Namespace: namespace,
Name: "evaluator_duration_milliseconds",
Help: "The duration for all evaluations to execute.",
Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
})
)
func init() {
@ -124,7 +124,7 @@ func (m *ruleManager) Run() {
case <-ticker.C:
start := time.Now()
m.runIteration(m.results)
iterationDuration.WithLabelValues(m.interval.String()).Observe(float64(time.Since(start) / time.Millisecond))
iterationDuration.Observe(float64(time.Since(start) / time.Millisecond))
case <-m.done:
glog.Info("rules.Rule manager exiting...")
return

10
storage/metric/tiered/curator.go

@ -47,16 +47,18 @@ const (
var (
curationDurations = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_curation_durations_ms",
Help: "Histogram of time spent in curation (ms).",
Namespace: namespace,
Name: "curation_durations_milliseconds",
Help: "Histogram of time spent in curation.",
Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
},
[]string{cutOff, processorName, result},
)
curationFilterOperations = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "prometheus_curation_filter_operations_total",
Help: "The number of curation filter operations completed.",
Namespace: namespace,
Name: "curation_filter_operations_total",
Help: "The number of curation filter operations completed.",
},
[]string{cutOff, processorName, result},
)

67
storage/metric/tiered/tiered.go

@ -33,6 +33,8 @@ import (
// Constants for instrumentation.
const (
namespace = "prometheus"
operation = "operation"
success = "success"
failure = "failure"
@ -51,24 +53,22 @@ const (
queue = "queue"
appendToDisk = "append_to_disk"
viewGeneration = "view_generation"
facet = "facet"
occupancy = "occupancy"
capacity = "capacity"
)
var (
storageLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_metric_disk_latency_microseconds",
Help: "Latency for metric disk operations in microseconds.",
Namespace: namespace,
Name: "metric_disk_latency_milliseconds",
Help: "Latency for metric disk operations (includes any storage drive even if it is not strictly a disk, e.g. SSD).",
Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
},
[]string{operation, result},
)
storedSamplesCount = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_stored_samples_total",
Help: "The number of samples that have been stored.",
Namespace: namespace,
Name: "stored_samples_total",
Help: "The number of samples that have been stored.",
})
)
@ -145,7 +145,8 @@ type TieredStorage struct {
dtoSampleKeys *dtoSampleKeyList
sampleKeys *sampleKeyList
queueSizes *prometheus.GaugeVec
queueLength *prometheus.GaugeVec
queueCapacity *prometheus.GaugeVec
}
// viewJob encapsulates a request to extract sample values from the datastore.
@ -159,10 +160,9 @@ type viewJob struct {
const (
tieredMemorySemaphores = 5
watermarkCacheLimit = 1024 * 1024
)
const watermarkCacheLimit = 1024 * 1024
// NewTieredStorage returns a TieredStorage object ready to use.
func NewTieredStorage(
appendToDiskQueueDepth,
@ -208,14 +208,25 @@ func NewTieredStorage(
dtoSampleKeys: newDtoSampleKeyList(10),
sampleKeys: newSampleKeyList(10),
queueSizes: prometheus.NewGaugeVec(
queueLength: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "prometheus_storage_queue_sizes_total",
Help: "The various sizes and capacities of the storage queues.",
Namespace: namespace,
Name: "storage_queue_length",
Help: "The number of items in the storage queues.",
},
[]string{queue, facet},
[]string{queue},
),
queueCapacity: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "storage_queue_capacity",
Help: "The capacity of the storage queues.",
},
[]string{queue},
),
}
s.queueCapacity.WithLabelValues(appendToDisk).Set(float64(appendToDiskQueueDepth))
s.queueCapacity.WithLabelValues(viewGeneration).Set(float64(viewQueueDepth))
for i := 0; i < tieredMemorySemaphores; i++ {
s.memorySemaphore <- true
@ -444,13 +455,13 @@ func (t *TieredStorage) renderView(viewJob viewJob) {
storageLatency.With(
prometheus.Labels{operation: renderView, result: success},
).Observe(
float64(time.Since(begin) / time.Microsecond),
float64(time.Since(begin) / time.Millisecond),
)
} else {
storageLatency.With(
prometheus.Labels{operation: renderView, result: failure},
).Observe(
float64(time.Since(begin) / time.Microsecond),
float64(time.Since(begin) / time.Millisecond),
)
}
}()
@ -788,23 +799,15 @@ func (t *TieredStorage) GetMetricForFingerprint(f *clientmodel.Fingerprint) (cli
// Describe implements prometheus.Collector.
func (t *TieredStorage) Describe(ch chan<- *prometheus.Desc) {
t.queueSizes.Describe(ch)
t.queueLength.Describe(ch)
t.queueCapacity.Describe(ch)
}
// Collect implements prometheus.Collector.
func (t *TieredStorage) Collect(ch chan<- prometheus.Metric) {
t.queueSizes.With(prometheus.Labels{
queue: appendToDisk, facet: occupancy,
}).Set(float64(len(t.appendToDiskQueue)))
t.queueSizes.With(prometheus.Labels{
queue: appendToDisk, facet: capacity,
}).Set(float64(cap(t.appendToDiskQueue)))
t.queueSizes.With(prometheus.Labels{
queue: viewGeneration, facet: occupancy,
}).Set(float64(len(t.ViewQueue)))
t.queueSizes.With(prometheus.Labels{
queue: viewGeneration, facet: capacity,
}).Set(float64(cap(t.ViewQueue)))
t.queueSizes.Collect(ch)
t.queueLength.WithLabelValues(appendToDisk).Set(float64(len(t.appendToDiskQueue)))
t.queueLength.WithLabelValues(viewGeneration).Set(float64(len(t.ViewQueue)))
t.queueLength.Collect(ch)
t.queueCapacity.Collect(ch)
}

55
storage/remote/queue_manager.go

@ -34,14 +34,13 @@ const (
// String constants for instrumentation.
const (
namespace = "prometheus"
subsystem = "remote_tsdb"
result = "result"
success = "success"
failure = "failure"
dropped = "dropped"
facet = "facet"
occupancy = "occupancy"
capacity = "capacity"
)
// TSDBClient defines an interface for sending a batch of samples to an
@ -59,9 +58,10 @@ type TSDBQueueManager struct {
sendSemaphore chan bool
drained chan bool
samplesCount *prometheus.CounterVec
sendLatency *prometheus.SummaryVec
queueSize *prometheus.GaugeVec
samplesCount *prometheus.CounterVec
sendLatency *prometheus.SummaryVec
queueLength prometheus.Gauge
queueCapacity prometheus.Metric
}
// NewTSDBQueueManager builds a new TSDBQueueManager.
@ -74,24 +74,36 @@ func NewTSDBQueueManager(tsdb TSDBClient, queueCapacity int) *TSDBQueueManager {
samplesCount: prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "prometheus_remote_tsdb_sent_samples_total",
Help: "Total number of samples processed to be sent to remote TSDB.",
Namespace: namespace,
Subsystem: subsystem,
Name: "sent_samples_total",
Help: "Total number of processed samples to be sent to remote TSDB.",
},
[]string{result},
),
sendLatency: prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "prometheus_remote_tsdb_latency_ms",
Help: "Latency quantiles for sending samples to the remote TSDB in milliseconds.",
Namespace: namespace,
Subsystem: subsystem,
Name: "sent_latency_milliseconds",
Help: "Latency quantiles for sending samples to the remote TSDB.",
},
[]string{result},
),
queueSize: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "prometheus_remote_tsdb_queue_size_total",
Help: "The size and capacity of the queue of samples to be sent to the remote TSDB.",
},
[]string{facet},
queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "queue_length",
Help: "The number of processed samples queued to be sent to the remote TSDB.",
}),
queueCapacity: prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
"The capacity of the queue of samples to be sent to the remote TSDB.",
nil, nil,
),
prometheus.GaugeValue,
float64(queueCapacity),
),
}
}
@ -122,16 +134,17 @@ func (t *TSDBQueueManager) Close() {
func (t *TSDBQueueManager) Describe(ch chan<- *prometheus.Desc) {
t.samplesCount.Describe(ch)
t.sendLatency.Describe(ch)
t.queueSize.Describe(ch)
ch <- t.queueLength.Desc()
ch <- t.queueCapacity.Desc()
}
// Collect implements prometheus.Collector.
func (t *TSDBQueueManager) Collect(ch chan<- prometheus.Metric) {
t.samplesCount.Collect(ch)
t.sendLatency.Collect(ch)
t.queueSize.WithLabelValues(occupancy).Set(float64(len(t.queue)))
t.queueSize.WithLabelValues(capacity).Set(float64(cap(t.queue)))
t.queueSize.Collect(ch)
t.queueLength.Set(float64(len(t.queue)))
ch <- t.queueLength
ch <- t.queueCapacity
}
func (t *TSDBQueueManager) sendSamples(s clientmodel.Samples) {

Loading…
Cancel
Save