Merge pull request #63260 from misterikkit/ecache-metrics

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

scheduler: add metrics to equivalence cache

This adds counters to equivalence cache reads and writes. Reads are labeled by hit/miss, while writes are labeled to indicate whether the write was discarded. This will give us visibility into:
- hit rate of cache reads
- ratio of reads to writes
- rate of discarded writes

**What this PR does / why we need it**:

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes https://github.com/kubernetes/kubernetes/issues/63259

**Special notes for your reviewer**:

**Release note**:
```release-note
NONE
```
2018-08-17 01:10:51 -07:00 · 2018-08-17 01:10:51 -07:00 · eeb3389f3b
parent 7ff2feea9a b874d2789b
commit eeb3389f3b
3 changed files with 29 additions and 0 deletions
--- a/pkg/scheduler/core/equivalence/BUILD
+++ b/pkg/scheduler/core/equivalence/BUILD
@@ -9,6 +9,7 @@ go_library(
        "//pkg/scheduler/algorithm:go_default_library",
        "//pkg/scheduler/algorithm/predicates:go_default_library",
        "//pkg/scheduler/cache:go_default_library",
+        "//pkg/scheduler/metrics:go_default_library",
        "//pkg/util/hash:go_default_library",
        "//staging/src/k8s.io/api/core/v1:go_default_library",
        "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
--- a/pkg/scheduler/core/equivalence/eqivalence.go
+++ b/pkg/scheduler/core/equivalence/eqivalence.go
@@ -23,6 +23,8 @@ import (
 	"hash/fnv"
 	"sync"

+	"k8s.io/kubernetes/pkg/scheduler/metrics"
+
 	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/kubernetes/pkg/scheduler/algorithm"
@@ -244,10 +246,12 @@ func (n *NodeCache) updateResult(
 ) {
 	if nodeInfo == nil || nodeInfo.Node() == nil {
 		// This may happen during tests.
+		metrics.EquivalenceCacheWrites.WithLabelValues("discarded_bad_node").Inc()
 		return
 	}
 	// Skip update if NodeInfo is stale.
 	if !cache.IsUpToDate(nodeInfo) {
+		metrics.EquivalenceCacheWrites.WithLabelValues("discarded_stale").Inc()
 		return
 	}

@@ -282,6 +286,11 @@ func (n *NodeCache) lookupResult(
 	n.mu.RLock()
 	defer n.mu.RUnlock()
 	value, ok = n.cache[predicateKey][equivalenceHash]
+	if ok {
+		metrics.EquivalenceCacheHits.Inc()
+	} else {
+		metrics.EquivalenceCacheMisses.Inc()
+	}
 	return value, ok
 }

--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -117,6 +117,23 @@ var (
 			Name:      "total_preemption_attempts",
 			Help:      "Total preemption attempts in the cluster till now",
 		})
+
+	equivalenceCacheLookups = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "equiv_cache_lookups_total",
+			Help:      "Total number of equivalence cache lookups, by whether or not a cache entry was found",
+		}, []string{"result"})
+	EquivalenceCacheHits   = equivalenceCacheLookups.With(prometheus.Labels{"result": "hit"})
+	EquivalenceCacheMisses = equivalenceCacheLookups.With(prometheus.Labels{"result": "miss"})
+
+	EquivalenceCacheWrites = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "equiv_cache_writes",
+			Help:      "Total number of equivalence cache writes, by result",
+		}, []string{"result"})
+
 	metricsList = []prometheus.Collector{
 		SchedulingLatency,
 		E2eSchedulingLatency,
@@ -127,6 +144,8 @@ var (
 		SchedulingAlgorithmPremptionEvaluationDuration,
 		PreemptionVictims,
 		PreemptionAttempts,
+		equivalenceCacheLookups,
+		EquivalenceCacheWrites,
 	}
 )