Merge pull request #72336 from danielqsj/apimetrics

Change apiserver metrics to conform metrics guidelines
2019-01-17 21:40:03 -08:00 · 2019-01-17 21:40:03 -08:00 · 24643fd116
parent 7f0e04a089 8b418631c0
commit 24643fd116
6 changed files with 178 additions and 54 deletions
--- a/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go
+++ b/staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go
@ -49,11 +49,18 @@ var (
 	// the upstream library supports it.
 	requestCounter = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "apiserver_request_count",
+			Name: "apiserver_request_total",
 			Help: "Counter of apiserver requests broken out for each verb, group, version, resource, scope, component, client, and HTTP response contentType and code.",
 		},
 		[]string{"verb", "group", "version", "resource", "subresource", "scope", "component", "client", "contentType", "code"},
 	)
+	deprecatedRequestCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "apiserver_request_count",
+			Help: "(Deprecated) Counter of apiserver requests broken out for each verb, group, version, resource, scope, component, client, and HTTP response contentType and code.",
+		},
+		[]string{"verb", "group", "version", "resource", "subresource", "scope", "component", "client", "contentType", "code"},
+	)
 	longRunningRequestGauge = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Name: "apiserver_longrunning_gauge",
@ -62,18 +69,27 @@ var (
 		[]string{"verb", "group", "version", "resource", "subresource", "scope", "component"},
 	)
 	requestLatencies = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "apiserver_request_latency_seconds",
+			Help: "Response latency distribution in seconds for each verb, group, version, resource, subresource, scope and component.",
+			// Use buckets ranging from 125 ms to 8 seconds.
+			Buckets: prometheus.ExponentialBuckets(0.125, 2.0, 7),
+		},
+		[]string{"verb", "group", "version", "resource", "subresource", "scope", "component"},
+	)
+	deprecatedRequestLatencies = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Name: "apiserver_request_latencies",
-			Help: "Response latency distribution in microseconds for each verb, group, version, resource, subresource, scope and component.",
+			Help: "(Deprecated) Response latency distribution in microseconds for each verb, group, version, resource, subresource, scope and component.",
 			// Use buckets ranging from 125 ms to 8 seconds.
 			Buckets: prometheus.ExponentialBuckets(125000, 2.0, 7),
 		},
 		[]string{"verb", "group", "version", "resource", "subresource", "scope", "component"},
 	)
-	requestLatenciesSummary = prometheus.NewSummaryVec(
+	deprecatedRequestLatenciesSummary = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
 			Name: "apiserver_request_latencies_summary",
-			Help: "Response latency summary in microseconds for each verb, group, version, resource, subresource, scope and component.",
+			Help: "(Deprecated) Response latency summary in microseconds for each verb, group, version, resource, subresource, scope and component.",
 			// Make the sliding window of 5h.
 			// TODO: The value for this should be based on our SLI definition (medium term).
 			MaxAge: 5 * time.Hour,
@ -92,11 +108,18 @@ var (
 	// DroppedRequests is a number of requests dropped with 'Try again later' response"
 	DroppedRequests = prometheus.NewCounterVec(
 		prometheus.CounterOpts{
-			Name: "apiserver_dropped_requests",
+			Name: "apiserver_dropped_requests_total",
 			Help: "Number of requests dropped with 'Try again later' response",
 		},
 		[]string{"requestKind"},
 	)
+	DeprecatedDroppedRequests = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "apiserver_dropped_requests",
+			Help: "(Deprecated) Number of requests dropped with 'Try again later' response",
+		},
+		[]string{"requestKind"},
+	)
 	// RegisteredWatchers is a number of currently registered watchers splitted by resource.
 	RegisteredWatchers = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
@ -118,11 +141,14 @@ var (

 	metrics = []resettableCollector{
 		requestCounter,
+		deprecatedRequestCounter,
 		longRunningRequestGauge,
 		requestLatencies,
-		requestLatenciesSummary,
+		deprecatedRequestLatencies,
+		deprecatedRequestLatenciesSummary,
 		responseSizes,
 		DroppedRequests,
+		DeprecatedDroppedRequests,
 		RegisteredWatchers,
 		currentInflightRequests,
 	}
@ -198,9 +224,12 @@ func MonitorRequest(req *http.Request, verb, group, version, resource, subresour
 	reportedVerb := cleanVerb(verb, req)
 	client := cleanUserAgent(utilnet.GetHTTPClient(req))
 	elapsedMicroseconds := float64(elapsed / time.Microsecond)
+	elapsedSeconds := elapsed.Seconds()
 	requestCounter.WithLabelValues(reportedVerb, group, version, resource, subresource, scope, component, client, contentType, codeToString(httpCode)).Inc()
-	requestLatencies.WithLabelValues(reportedVerb, group, version, resource, subresource, scope, component).Observe(elapsedMicroseconds)
-	requestLatenciesSummary.WithLabelValues(reportedVerb, group, version, resource, subresource, scope, component).Observe(elapsedMicroseconds)
+	deprecatedRequestCounter.WithLabelValues(reportedVerb, group, version, resource, subresource, scope, component, client, contentType, codeToString(httpCode)).Inc()
+	requestLatencies.WithLabelValues(reportedVerb, group, version, resource, subresource, scope, component).Observe(elapsedSeconds)
+	deprecatedRequestLatencies.WithLabelValues(reportedVerb, group, version, resource, subresource, scope, component).Observe(elapsedMicroseconds)
+	deprecatedRequestLatenciesSummary.WithLabelValues(reportedVerb, group, version, resource, subresource, scope, component).Observe(elapsedMicroseconds)
 	// We are only interested in response sizes of read requests.
 	if verb == "GET" || verb == "LIST" {
 		responseSizes.WithLabelValues(reportedVerb, group, version, resource, subresource, scope, component).Observe(float64(respSize))
--- a/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go
+++ b/staging/src/k8s.io/apiserver/pkg/server/filters/maxinflight.go
@ -163,8 +163,10 @@ func WithMaxInFlightLimit(
 				// We need to split this data between buckets used for throttling.
 				if isMutatingRequest {
 					metrics.DroppedRequests.WithLabelValues(metrics.MutatingKind).Inc()
+					metrics.DeprecatedDroppedRequests.WithLabelValues(metrics.MutatingKind).Inc()
 				} else {
 					metrics.DroppedRequests.WithLabelValues(metrics.ReadOnlyKind).Inc()
+					metrics.DeprecatedDroppedRequests.WithLabelValues(metrics.ReadOnlyKind).Inc()
 				}
 				// at this point we're about to return a 429, BUT not all actors should be rate limited.  A system:master is so powerful
 				// that they should always get an answer.  It's a super-admin or a loopback connection.
--- a/staging/src/k8s.io/apiserver/pkg/storage/etcd/metrics/metrics.go
+++ b/staging/src/k8s.io/apiserver/pkg/storage/etcd/metrics/metrics.go
@ -25,37 +25,37 @@ import (

 var (
 	cacheHitCounterOpts = prometheus.CounterOpts{
-		Name: "etcd_helper_cache_hit_count",
+		Name: "etcd_helper_cache_hit_total",
 		Help: "Counter of etcd helper cache hits.",
 	}
 	cacheHitCounter      = prometheus.NewCounter(cacheHitCounterOpts)
 	cacheMissCounterOpts = prometheus.CounterOpts{
-		Name: "etcd_helper_cache_miss_count",
+		Name: "etcd_helper_cache_miss_total",
 		Help: "Counter of etcd helper cache miss.",
 	}
 	cacheMissCounter      = prometheus.NewCounter(cacheMissCounterOpts)
 	cacheEntryCounterOpts = prometheus.CounterOpts{
-		Name: "etcd_helper_cache_entry_count",
+		Name: "etcd_helper_cache_entry_total",
 		Help: "Counter of etcd helper cache entries. This can be different from etcd_helper_cache_miss_count " +
 			"because two concurrent threads can miss the cache and generate the same entry twice.",
 	}
 	cacheEntryCounter = prometheus.NewCounter(cacheEntryCounterOpts)
-	cacheGetLatency   = prometheus.NewSummary(
-		prometheus.SummaryOpts{
-			Name: "etcd_request_cache_get_latencies_summary",
-			Help: "Latency in microseconds of getting an object from etcd cache",
+	cacheGetLatency   = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Name: "etcd_request_cache_get_latency_seconds",
+			Help: "Latency in seconds of getting an object from etcd cache",
 		},
 	)
-	cacheAddLatency = prometheus.NewSummary(
-		prometheus.SummaryOpts{
-			Name: "etcd_request_cache_add_latencies_summary",
-			Help: "Latency in microseconds of adding an object to etcd cache",
+	cacheAddLatency = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Name: "etcd_request_cache_add_latency_seconds",
+			Help: "Latency in seconds of adding an object to etcd cache",
 		},
 	)
-	etcdRequestLatenciesSummary = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
-			Name: "etcd_request_latencies_summary",
-			Help: "Etcd request latency summary in microseconds for each operation and object type.",
+	etcdRequestLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "etcd_request_latency_seconds",
+			Help: "Etcd request latency in seconds for each operation and object type.",
 		},
 		[]string{"operation", "type"},
 	)
@ -66,6 +66,42 @@ var (
 		},
 		[]string{"resource"},
 	)
+
+	deprecatedCacheHitCounterOpts = prometheus.CounterOpts{
+		Name: "etcd_helper_cache_hit_count",
+		Help: "(Deprecated) Counter of etcd helper cache hits.",
+	}
+	deprecatedCacheHitCounter      = prometheus.NewCounter(deprecatedCacheHitCounterOpts)
+	deprecatedCacheMissCounterOpts = prometheus.CounterOpts{
+		Name: "etcd_helper_cache_miss_count",
+		Help: "(Deprecated) Counter of etcd helper cache miss.",
+	}
+	deprecatedCacheMissCounter      = prometheus.NewCounter(deprecatedCacheMissCounterOpts)
+	deprecatedCacheEntryCounterOpts = prometheus.CounterOpts{
+		Name: "etcd_helper_cache_entry_count",
+		Help: "(Deprecated) Counter of etcd helper cache entries. This can be different from etcd_helper_cache_miss_count " +
+			"because two concurrent threads can miss the cache and generate the same entry twice.",
+	}
+	deprecatedCacheEntryCounter = prometheus.NewCounter(deprecatedCacheEntryCounterOpts)
+	deprecatedCacheGetLatency   = prometheus.NewSummary(
+		prometheus.SummaryOpts{
+			Name: "etcd_request_cache_get_latencies_summary",
+			Help: "(Deprecated) Latency in microseconds of getting an object from etcd cache",
+		},
+	)
+	deprecatedCacheAddLatency = prometheus.NewSummary(
+		prometheus.SummaryOpts{
+			Name: "etcd_request_cache_add_latencies_summary",
+			Help: "(Deprecated) Latency in microseconds of adding an object to etcd cache",
+		},
+	)
+	deprecatedEtcdRequestLatenciesSummary = prometheus.NewSummaryVec(
+		prometheus.SummaryOpts{
+			Name: "etcd_request_latencies_summary",
+			Help: "(Deprecated) Etcd request latency summary in microseconds for each operation and object type.",
+		},
+		[]string{"operation", "type"},
+	)
 )

 var registerMetrics sync.Once
@ -79,8 +115,16 @@ func Register() {
 		prometheus.MustRegister(cacheEntryCounter)
 		prometheus.MustRegister(cacheAddLatency)
 		prometheus.MustRegister(cacheGetLatency)
-		prometheus.MustRegister(etcdRequestLatenciesSummary)
+		prometheus.MustRegister(etcdRequestLatency)
 		prometheus.MustRegister(objectCounts)
+
+		// TODO(danielqsj): Remove the following metrics, they are deprecated
+		prometheus.MustRegister(deprecatedCacheHitCounter)
+		prometheus.MustRegister(deprecatedCacheMissCounter)
+		prometheus.MustRegister(deprecatedCacheEntryCounter)
+		prometheus.MustRegister(deprecatedCacheAddLatency)
+		prometheus.MustRegister(deprecatedCacheGetLatency)
+		prometheus.MustRegister(deprecatedEtcdRequestLatenciesSummary)
 	})
 }

@ -89,27 +133,33 @@ func UpdateObjectCount(resourcePrefix string, count int64) {
 }

 func RecordEtcdRequestLatency(verb, resource string, startTime time.Time) {
-	etcdRequestLatenciesSummary.WithLabelValues(verb, resource).Observe(float64(time.Since(startTime) / time.Microsecond))
+	etcdRequestLatency.WithLabelValues(verb, resource).Observe(sinceInSeconds(startTime))
+	deprecatedEtcdRequestLatenciesSummary.WithLabelValues(verb, resource).Observe(sinceInMicroseconds(startTime))
 }

 func ObserveGetCache(startTime time.Time) {
-	cacheGetLatency.Observe(float64(time.Since(startTime) / time.Microsecond))
+	cacheGetLatency.Observe(sinceInSeconds(startTime))
+	deprecatedCacheGetLatency.Observe(sinceInMicroseconds(startTime))
 }

 func ObserveAddCache(startTime time.Time) {
-	cacheAddLatency.Observe(float64(time.Since(startTime) / time.Microsecond))
+	cacheAddLatency.Observe(sinceInSeconds(startTime))
+	deprecatedCacheAddLatency.Observe(sinceInMicroseconds(startTime))
 }

 func ObserveCacheHit() {
 	cacheHitCounter.Inc()
+	deprecatedCacheHitCounter.Inc()
 }

 func ObserveCacheMiss() {
 	cacheMissCounter.Inc()
+	deprecatedCacheMissCounter.Inc()
 }

 func ObserveNewEntry() {
 	cacheEntryCounter.Inc()
+	deprecatedCacheEntryCounter.Inc()
 }

 func Reset() {
@ -118,5 +168,20 @@ func Reset() {
 	cacheEntryCounter = prometheus.NewCounter(cacheEntryCounterOpts)
 	// TODO: Reset cacheAddLatency.
 	// TODO: Reset cacheGetLatency.
-	etcdRequestLatenciesSummary.Reset()
+	etcdRequestLatency.Reset()
+
+	deprecatedCacheHitCounter = prometheus.NewCounter(deprecatedCacheHitCounterOpts)
+	deprecatedCacheMissCounter = prometheus.NewCounter(deprecatedCacheMissCounterOpts)
+	deprecatedCacheEntryCounter = prometheus.NewCounter(deprecatedCacheEntryCounterOpts)
+	deprecatedEtcdRequestLatenciesSummary.Reset()
+}
+
+// sinceInMicroseconds gets the time since the specified start in microseconds.
+func sinceInMicroseconds(start time.Time) float64 {
+	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
+}
+
+// sinceInSeconds gets the time since the specified start in seconds.
+func sinceInSeconds(start time.Time) float64 {
+	return time.Since(start).Seconds()
 }
--- a/staging/src/k8s.io/apiserver/pkg/storage/value/metrics.go
+++ b/staging/src/k8s.io/apiserver/pkg/storage/value/metrics.go
@ -30,11 +30,23 @@ const (

 var (
 	transformerLatencies = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "transformation_latencies_seconds",
+			Help:      "Latencies in seconds of value transformation operations.",
+			// In-process transformations (ex. AES CBC) complete on the order of 20 microseconds. However, when
+			// external KMS is involved latencies may climb into milliseconds.
+			Buckets: prometheus.ExponentialBuckets(5e-6, 2, 14),
+		},
+		[]string{"transformation_type"},
+	)
+	deprecatedTransformerLatencies = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
 			Name:      "transformation_latencies_microseconds",
-			Help:      "Latencies in microseconds of value transformation operations.",
+			Help:      "(Deprecated) Latencies in microseconds of value transformation operations.",
 			// In-process transformations (ex. AES CBC) complete on the order of 20 microseconds. However, when
 			// external KMS is involved latencies may climb into milliseconds.
 			Buckets: prometheus.ExponentialBuckets(5, 2, 14),
@ -61,11 +73,20 @@ var (
 	)

 	dataKeyGenerationLatencies = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "data_key_generation_latencies_seconds",
+			Help:      "Latencies in seconds of data encryption key(DEK) generation operations.",
+			Buckets:   prometheus.ExponentialBuckets(5e-6, 2, 14),
+		},
+	)
+	deprecatedDataKeyGenerationLatencies = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
 			Name:      "data_key_generation_latencies_microseconds",
-			Help:      "Latencies in microseconds of data encryption key(DEK) generation operations.",
+			Help:      "(Deprecated) Latencies in microseconds of data encryption key(DEK) generation operations.",
 			Buckets:   prometheus.ExponentialBuckets(5, 2, 14),
 		},
 	)
@ -84,9 +105,11 @@ var registerMetrics sync.Once
 func RegisterMetrics() {
 	registerMetrics.Do(func() {
 		prometheus.MustRegister(transformerLatencies)
+		prometheus.MustRegister(deprecatedTransformerLatencies)
 		prometheus.MustRegister(transformerFailuresTotal)
 		prometheus.MustRegister(envelopeTransformationCacheMissTotal)
 		prometheus.MustRegister(dataKeyGenerationLatencies)
+		prometheus.MustRegister(deprecatedDataKeyGenerationLatencies)
 		prometheus.MustRegister(dataKeyGenerationFailuresTotal)
 	})
 }
@ -98,8 +121,8 @@ func RecordTransformation(transformationType string, start time.Time, err error)
 		return
 	}

-	since := sinceInMicroseconds(start)
-	transformerLatencies.WithLabelValues(transformationType).Observe(float64(since))
+	transformerLatencies.WithLabelValues(transformationType).Observe(sinceInSeconds(start))
+	deprecatedTransformerLatencies.WithLabelValues(transformationType).Observe(sinceInMicroseconds(start))
 }

 // RecordCacheMiss records a miss on Key Encryption Key(KEK) - call to KMS was required to decrypt KEK.
@ -114,11 +137,16 @@ func RecordDataKeyGeneration(start time.Time, err error) {
 		return
 	}

-	since := sinceInMicroseconds(start)
-	dataKeyGenerationLatencies.Observe(float64(since))
+	dataKeyGenerationLatencies.Observe(sinceInSeconds(start))
+	deprecatedDataKeyGenerationLatencies.Observe(sinceInMicroseconds(start))
 }

-func sinceInMicroseconds(start time.Time) int64 {
-	elapsedNanoseconds := time.Since(start).Nanoseconds()
-	return elapsedNanoseconds / int64(time.Microsecond)
+// sinceInMicroseconds gets the time since the specified start in microseconds.
+func sinceInMicroseconds(start time.Time) float64 {
+	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
+}
+
+// sinceInSeconds gets the time since the specified start in seconds.
+func sinceInSeconds(start time.Time) float64 {
+	return time.Since(start).Seconds()
 }
--- a/test/e2e/framework/metrics_util.go
+++ b/test/e2e/framework/metrics_util.go
@ -136,14 +136,14 @@ func (m *MetricsForE2E) SummaryKind() string {
 var SchedulingLatencyMetricName = model.LabelValue(schedulermetric.SchedulerSubsystem + "_" + schedulermetric.SchedulingLatencyName)

 var InterestingApiServerMetrics = []string{
-	"apiserver_request_count",
-	"apiserver_request_latencies_summary",
-	"etcd_helper_cache_entry_count",
-	"etcd_helper_cache_hit_count",
-	"etcd_helper_cache_miss_count",
-	"etcd_request_cache_add_latencies_summary",
-	"etcd_request_cache_get_latencies_summary",
-	"etcd_request_latencies_summary",
+	"apiserver_request_total",
+	"apiserver_request_latency_seconds_summary",
+	"etcd_helper_cache_entry_total",
+	"etcd_helper_cache_hit_total",
+	"etcd_helper_cache_miss_total",
+	"etcd_request_cache_add_latency_seconds",
+	"etcd_request_cache_get_latency_seconds",
+	"etcd_request_latency_seconds",
 }

 var InterestingControllerManagerMetrics = []string{
@ -475,10 +475,10 @@ func readLatencyMetrics(c clientset.Interface) (*APIResponsiveness, error) {

 	for _, sample := range samples {
 		// Example line:
-		// apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908
-		// apiserver_request_count{resource="pods",verb="LIST",client="kubectl",code="200",contentType="json"} 233
-		if sample.Metric[model.MetricNameLabel] != "apiserver_request_latencies_summary" &&
-			sample.Metric[model.MetricNameLabel] != "apiserver_request_count" {
+		// apiserver_request_latency_seconds_summary{resource="namespaces",verb="LIST",quantile="0.99"} 0.000908
+		// apiserver_request_total{resource="pods",verb="LIST",client="kubectl",code="200",contentType="json"} 233
+		if sample.Metric[model.MetricNameLabel] != "apiserver_request_latency_seconds_summary" &&
+			sample.Metric[model.MetricNameLabel] != "apiserver_request_total" {
 			continue
 		}

@ -491,14 +491,14 @@ func readLatencyMetrics(c clientset.Interface) (*APIResponsiveness, error) {
 		}

 		switch sample.Metric[model.MetricNameLabel] {
-		case "apiserver_request_latencies_summary":
+		case "apiserver_request_latency_seconds_summary":
 			latency := sample.Value
 			quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
 			if err != nil {
 				return nil, err
 			}
-			a.addMetricRequestLatency(resource, subresource, verb, scope, quantile, time.Duration(int64(latency))*time.Microsecond)
-		case "apiserver_request_count":
+			a.addMetricRequestLatency(resource, subresource, verb, scope, quantile, time.Duration(int64(latency))*time.Second)
+		case "apiserver_request_total":
 			count := sample.Value
 			a.addMetricRequestCount(resource, subresource, verb, scope, int(count))

--- a/test/integration/metrics/metrics_test.go
+++ b/test/integration/metrics/metrics_test.go
@ -120,7 +120,7 @@ func TestApiserverMetrics(t *testing.T) {
 		t.Fatal(err)
 	}
 	checkForExpectedMetrics(t, metrics, []string{
-		"apiserver_request_count",
-		"apiserver_request_latencies",
+		"apiserver_request_total",
+		"apiserver_request_latency_seconds",
 	})
 }