From e32910a5441a30b25dbe04aaafb48e154b2e3953 Mon Sep 17 00:00:00 2001
From: Krzysztof Siedlecki <ksiedlecki@google.com>
Date: Thu, 7 Jun 2018 13:20:26 +0200
Subject: [PATCH] Readding summary metrics

---
 cmd/kube-scheduler/app/BUILD       |  1 +
 cmd/kube-scheduler/app/server.go   | 21 +++++++--
 pkg/scheduler/metrics/metrics.go   | 75 ++++++++++++++++++++++-------
 pkg/scheduler/scheduler.go         |  2 +
 test/e2e/framework/BUILD           |  1 +
 test/e2e/framework/metrics_util.go | 76 +++++++++++++++++++-----------
 test/e2e/scalability/density.go    |  3 +-
 7 files changed, 128 insertions(+), 51 deletions(-)

diff --git a/cmd/kube-scheduler/app/BUILD b/cmd/kube-scheduler/app/BUILD
index d6de2aaef8..a98dc30914 100644
--- a/cmd/kube-scheduler/app/BUILD
+++ b/cmd/kube-scheduler/app/BUILD
@@ -21,6 +21,7 @@ go_library(
         "//pkg/scheduler/api:go_default_library",
         "//pkg/scheduler/api/latest:go_default_library",
         "//pkg/scheduler/factory:go_default_library",
+        "//pkg/scheduler/metrics:go_default_library",
         "//pkg/util/configz:go_default_library",
         "//pkg/util/flag:go_default_library",
         "//pkg/version:go_default_library",
diff --git a/cmd/kube-scheduler/app/server.go b/cmd/kube-scheduler/app/server.go
index 11ace1a983..ee776c89b6 100644
--- a/cmd/kube-scheduler/app/server.go
+++ b/cmd/kube-scheduler/app/server.go
@@ -19,6 +19,7 @@ package app
 
 import (
 	"fmt"
+	"io"
 	"io/ioutil"
 	"net/http"
 	"os"
@@ -51,6 +52,7 @@ import (
 	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
 	latestschedulerapi "k8s.io/kubernetes/pkg/scheduler/api/latest"
 	"k8s.io/kubernetes/pkg/scheduler/factory"
+	"k8s.io/kubernetes/pkg/scheduler/metrics"
 	"k8s.io/kubernetes/pkg/util/configz"
 	utilflag "k8s.io/kubernetes/pkg/util/flag"
 	"k8s.io/kubernetes/pkg/version"
@@ -221,11 +223,23 @@ func buildHandlerChain(handler http.Handler, authn authenticator.Request, authz
 	return handler
 }
 
+func installMetricHandler(pathRecorderMux *mux.PathRecorderMux) {
+	configz.InstallHandler(pathRecorderMux)
+	defaultMetricsHandler := prometheus.Handler().ServeHTTP
+	pathRecorderMux.HandleFunc("/metrics", func(w http.ResponseWriter, req *http.Request) {
+		if req.Method == "DELETE" {
+			metrics.Reset()
+			io.WriteString(w, "metrics reset\n")
+			return
+		}
+		defaultMetricsHandler(w, req)
+	})
+}
+
 // newMetricsHandler builds a metrics server from the config.
 func newMetricsHandler(config *componentconfig.KubeSchedulerConfiguration) http.Handler {
 	pathRecorderMux := mux.NewPathRecorderMux("kube-scheduler")
-	configz.InstallHandler(pathRecorderMux)
-	pathRecorderMux.Handle("/metrics", prometheus.Handler())
+	installMetricHandler(pathRecorderMux)
 	if config.EnableProfiling {
 		routes.Profiling{}.Install(pathRecorderMux)
 		if config.EnableContentionProfiling {
@@ -242,8 +256,7 @@ func newHealthzHandler(config *componentconfig.KubeSchedulerConfiguration, separ
 	pathRecorderMux := mux.NewPathRecorderMux("kube-scheduler")
 	healthz.InstallHandler(pathRecorderMux)
 	if !separateMetrics {
-		configz.InstallHandler(pathRecorderMux)
-		pathRecorderMux.Handle("/metrics", prometheus.Handler())
+		installMetricHandler(pathRecorderMux)
 	}
 	if config.EnableProfiling {
 		routes.Profiling{}.Install(pathRecorderMux)
diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go
index 4836ac4b93..2fbba9980c 100644
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -23,13 +23,37 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 )
 
-const schedulerSubsystem = "scheduler"
+const (
+	// SchedulerSubsystem - subsystem name used by scheduler
+	SchedulerSubsystem = "scheduler"
+	// SchedulingLatencyName - scheduler latency metric name
+	SchedulingLatencyName = "scheduling_latency_seconds"
+
+	// OperationLabel - operation label name
+	OperationLabel = "operation"
+	// Binding - binding operation label value
+	Binding = "binding"
+	// SelectingNode - selecting node operation label value
+	SelectingNode = "selecting_node"
+	// E2eScheduling - e2e scheduling operation label value
+)
 
 // All the histogram based metrics have 1ms as size for the smallest bucket.
 var (
+	SchedulingLatency = prometheus.NewSummaryVec(
+		prometheus.SummaryOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      SchedulingLatencyName,
+			Help:      "Scheduling latency in seconds split by sub-parts of the scheduling operation",
+			// Make the sliding window of 5h.
+			// TODO: The value for this should be based on some SLI definition (long term).
+			MaxAge: 5 * time.Hour,
+		},
+		[]string{OperationLabel},
+	)
 	E2eSchedulingLatency = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
-			Subsystem: schedulerSubsystem,
+			Subsystem: SchedulerSubsystem,
 			Name:      "e2e_scheduling_latency_microseconds",
 			Help:      "E2e scheduling latency (scheduling algorithm + binding)",
 			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
@@ -37,7 +61,7 @@ var (
 	)
 	SchedulingAlgorithmLatency = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
-			Subsystem: schedulerSubsystem,
+			Subsystem: SchedulerSubsystem,
 			Name:      "scheduling_algorithm_latency_microseconds",
 			Help:      "Scheduling algorithm latency",
 			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
@@ -45,7 +69,7 @@ var (
 	)
 	SchedulingAlgorithmPredicateEvaluationDuration = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
-			Subsystem: schedulerSubsystem,
+			Subsystem: SchedulerSubsystem,
 			Name:      "scheduling_algorithm_predicate_evaluation",
 			Help:      "Scheduling algorithm predicate evaluation duration",
 			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
@@ -53,7 +77,7 @@ var (
 	)
 	SchedulingAlgorithmPriorityEvaluationDuration = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
-			Subsystem: schedulerSubsystem,
+			Subsystem: SchedulerSubsystem,
 			Name:      "scheduling_algorithm_priority_evaluation",
 			Help:      "Scheduling algorithm priority evaluation duration",
 			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
@@ -61,7 +85,7 @@ var (
 	)
 	SchedulingAlgorithmPremptionEvaluationDuration = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
-			Subsystem: schedulerSubsystem,
+			Subsystem: SchedulerSubsystem,
 			Name:      "scheduling_algorithm_preemption_evaluation",
 			Help:      "Scheduling algorithm preemption evaluation duration",
 			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
@@ -69,7 +93,7 @@ var (
 	)
 	BindingLatency = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
-			Subsystem: schedulerSubsystem,
+			Subsystem: SchedulerSubsystem,
 			Name:      "binding_latency_microseconds",
 			Help:      "Binding latency",
 			Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
@@ -77,16 +101,27 @@ var (
 	)
 	PreemptionVictims = prometheus.NewGauge(
 		prometheus.GaugeOpts{
-			Subsystem: schedulerSubsystem,
+			Subsystem: SchedulerSubsystem,
 			Name:      "pod_preemption_victims",
 			Help:      "Number of selected preemption victims",
 		})
 	PreemptionAttempts = prometheus.NewCounter(
 		prometheus.CounterOpts{
-			Subsystem: schedulerSubsystem,
+			Subsystem: SchedulerSubsystem,
 			Name:      "total_preemption_attempts",
 			Help:      "Total preemption attempts in the cluster till now",
 		})
+	metricsList = []prometheus.Collector{
+		SchedulingLatency,
+		E2eSchedulingLatency,
+		SchedulingAlgorithmLatency,
+		BindingLatency,
+		SchedulingAlgorithmPredicateEvaluationDuration,
+		SchedulingAlgorithmPriorityEvaluationDuration,
+		SchedulingAlgorithmPremptionEvaluationDuration,
+		PreemptionVictims,
+		PreemptionAttempts,
+	}
 )
 
 var registerMetrics sync.Once
@@ -95,19 +130,23 @@ var registerMetrics sync.Once
 func Register() {
 	// Register the metrics.
 	registerMetrics.Do(func() {
-		prometheus.MustRegister(E2eSchedulingLatency)
-		prometheus.MustRegister(SchedulingAlgorithmLatency)
-		prometheus.MustRegister(BindingLatency)
-
-		prometheus.MustRegister(SchedulingAlgorithmPredicateEvaluationDuration)
-		prometheus.MustRegister(SchedulingAlgorithmPriorityEvaluationDuration)
-		prometheus.MustRegister(SchedulingAlgorithmPremptionEvaluationDuration)
-		prometheus.MustRegister(PreemptionVictims)
-		prometheus.MustRegister(PreemptionAttempts)
+		for _, metric := range metricsList {
+			prometheus.MustRegister(metric)
+		}
 	})
 }
 
+// Reset resets metrics
+func Reset() {
+	SchedulingLatency.Reset()
+}
+
 // SinceInMicroseconds gets the time since the specified start in microseconds.
 func SinceInMicroseconds(start time.Time) float64 {
 	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
 }
+
+// SinceInSeconds gets the time since the specified start in seconds.
+func SinceInSeconds(start time.Time) float64 {
+	return time.Since(start).Seconds()
+}
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index eb8d948af1..bc646548b4 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -430,6 +430,7 @@ func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
 	}
 
 	metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
+	metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInSeconds(bindingStart))
 	sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, b.Target.Name)
 	return nil
 }
@@ -462,6 +463,7 @@ func (sched *Scheduler) scheduleOne() {
 		return
 	}
 	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start))
+	metrics.SchedulingLatency.WithLabelValues(metrics.SelectingNode).Observe(metrics.SinceInSeconds(start))
 	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
 	// This allows us to keep scheduling without waiting on binding to occur.
 	assumedPod := pod.DeepCopy()
diff --git a/test/e2e/framework/BUILD b/test/e2e/framework/BUILD
index 022339ae39..fc650ed9fc 100644
--- a/test/e2e/framework/BUILD
+++ b/test/e2e/framework/BUILD
@@ -72,6 +72,7 @@ go_library(
         "//pkg/master/ports:go_default_library",
         "//pkg/scheduler/algorithm/predicates:go_default_library",
         "//pkg/scheduler/cache:go_default_library",
+        "//pkg/scheduler/metrics:go_default_library",
         "//pkg/security/podsecuritypolicy/seccomp:go_default_library",
         "//pkg/ssh:go_default_library",
         "//pkg/util/file:go_default_library",
diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go
index e8ae03d375..2562dc4720 100644
--- a/test/e2e/framework/metrics_util.go
+++ b/test/e2e/framework/metrics_util.go
@@ -32,6 +32,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/sets"
 	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/kubernetes/pkg/master/ports"
+	schedulermetric "k8s.io/kubernetes/pkg/scheduler/metrics"
 	"k8s.io/kubernetes/pkg/util/system"
 	"k8s.io/kubernetes/test/e2e/framework/metrics"
 
@@ -130,6 +131,8 @@ func (m *MetricsForE2E) SummaryKind() string {
 	return "MetricsForE2E"
 }
 
+var SchedulingLatencyMetricName = model.LabelValue(schedulermetric.SchedulerSubsystem + "_" + schedulermetric.SchedulingLatencyName)
+
 var InterestingApiServerMetrics = []string{
 	"apiserver_request_count",
 	"apiserver_request_latencies_summary",
@@ -207,13 +210,12 @@ func (l *PodStartupLatency) PrintJSON() string {
 }
 
 type SchedulingMetrics struct {
-	SchedulingLatency LatencyMetric `json:"schedulingLatency"`
-	BindingLatency    LatencyMetric `json:"bindingLatency"`
-	E2ELatency        LatencyMetric `json:"e2eLatency"`
-	ThroughputAverage float64       `json:"throughputAverage"`
-	ThroughputPerc50  float64       `json:"throughputPerc50"`
-	ThroughputPerc90  float64       `json:"throughputPerc90"`
-	ThroughputPerc99  float64       `json:"throughputPerc99"`
+	SelectingNodeLatency LatencyMetric `json:"selectingNodeLatency"`
+	BindingLatency       LatencyMetric `json:"bindingLatency"`
+	ThroughputAverage    float64       `json:"throughputAverage"`
+	ThroughputPerc50     float64       `json:"throughputPerc50"`
+	ThroughputPerc90     float64       `json:"throughputPerc90"`
+	ThroughputPerc99     float64       `json:"throughputPerc99"`
 }
 
 func (l *SchedulingMetrics) SummaryKind() string {
@@ -442,27 +444,29 @@ func getMetrics(c clientset.Interface) (string, error) {
 	return string(body), nil
 }
 
-// Retrieves scheduler latency metrics.
-func getSchedulingLatency(c clientset.Interface) (*SchedulingMetrics, error) {
-	result := SchedulingMetrics{}
+// Sends REST request to kube scheduler metrics
+func sendRestRequestToScheduler(c clientset.Interface, op string) (string, error) {
+	opUpper := strings.ToUpper(op)
+	if opUpper != "GET" && opUpper != "DELETE" {
+		return "", fmt.Errorf("Unknown REST request")
+	}
 
-	// Check if master Node is registered
 	nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
 	ExpectNoError(err)
 
-	var data string
 	var masterRegistered = false
 	for _, node := range nodes.Items {
 		if system.IsMasterNode(node.Name) {
 			masterRegistered = true
 		}
 	}
+
+	var responseText string
 	if masterRegistered {
 		ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
 		defer cancel()
 
-		var rawData []byte
-		rawData, err = c.CoreV1().RESTClient().Get().
+		body, err := c.CoreV1().RESTClient().Verb(opUpper).
 			Context(ctx).
 			Namespace(metav1.NamespaceSystem).
 			Resource("pods").
@@ -472,45 +476,55 @@ func getSchedulingLatency(c clientset.Interface) (*SchedulingMetrics, error) {
 			Do().Raw()
 
 		ExpectNoError(err)
-		data = string(rawData)
+		responseText = string(body)
 	} else {
 		// If master is not registered fall back to old method of using SSH.
 		if TestContext.Provider == "gke" {
 			Logf("Not grabbing scheduler metrics through master SSH: unsupported for gke")
-			return nil, nil
+			return "", nil
 		}
-		cmd := "curl http://localhost:10251/metrics"
+
+		cmd := "curl -X " + opUpper + " http://localhost:10251/metrics"
 		sshResult, err := SSH(cmd, GetMasterHost()+":22", TestContext.Provider)
 		if err != nil || sshResult.Code != 0 {
-			return &result, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
+			return "", fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
 		}
-		data = sshResult.Stdout
+		responseText = sshResult.Stdout
 	}
+	return responseText, nil
+}
+
+// Retrieves scheduler latency metrics.
+func getSchedulingLatency(c clientset.Interface) (*SchedulingMetrics, error) {
+	result := SchedulingMetrics{}
+	data, err := sendRestRequestToScheduler(c, "GET")
+
 	samples, err := extractMetricSamples(data)
 	if err != nil {
 		return nil, err
 	}
 
 	for _, sample := range samples {
+		if sample.Metric[model.MetricNameLabel] != SchedulingLatencyMetricName {
+			continue
+		}
+
 		var metric *LatencyMetric = nil
-		switch sample.Metric[model.MetricNameLabel] {
-		case "scheduler_scheduling_algorithm_latency_microseconds":
-			metric = &result.SchedulingLatency
-		case "scheduler_binding_latency_microseconds":
+		switch sample.Metric[schedulermetric.OperationLabel] {
+		case schedulermetric.SelectingNode:
+			metric = &result.SelectingNodeLatency
+		case schedulermetric.Binding:
 			metric = &result.BindingLatency
-		case "scheduler_e2e_scheduling_latency_microseconds":
-			metric = &result.E2ELatency
 		}
 		if metric == nil {
 			continue
 		}
 
-		latency := sample.Value
 		quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
 		if err != nil {
 			return nil, err
 		}
-		setQuantile(metric, quantile, time.Duration(int64(latency))*time.Microsecond)
+		setQuantile(metric, quantile, time.Duration(int64(float64(sample.Value)*float64(time.Second))))
 	}
 	return &result, nil
 }
@@ -524,6 +538,14 @@ func VerifySchedulerLatency(c clientset.Interface) (*SchedulingMetrics, error) {
 	return latency, nil
 }
 
+func ResetSchedulerMetrics(c clientset.Interface) error {
+	responseText, err := sendRestRequestToScheduler(c, "DELETE")
+	if err != nil || responseText != "metrics reset\n" {
+		return fmt.Errorf("Unexpected response: %q", responseText)
+	}
+	return nil
+}
+
 func PrettyPrintJSON(metrics interface{}) string {
 	output := &bytes.Buffer{}
 	if err := json.NewEncoder(output).Encode(metrics); err != nil {
diff --git a/test/e2e/scalability/density.go b/test/e2e/scalability/density.go
index 92d4f8b991..30df63e7af 100644
--- a/test/e2e/scalability/density.go
+++ b/test/e2e/scalability/density.go
@@ -413,8 +413,6 @@ var _ = SIGDescribe("Density", func() {
 		}
 
 		// Verify scheduler metrics.
-		// TODO: Reset metrics at the beginning of the test.
-		// We should do something similar to how we do it for APIserver.
 		latency, err := framework.VerifySchedulerLatency(c)
 		framework.ExpectNoError(err)
 		if err == nil {
@@ -469,6 +467,7 @@ var _ = SIGDescribe("Density", func() {
 
 		uuid = string(utiluuid.NewUUID())
 
+		framework.ExpectNoError(framework.ResetSchedulerMetrics(c))
 		framework.ExpectNoError(framework.ResetMetrics(c))
 		framework.ExpectNoError(os.Mkdir(fmt.Sprintf(framework.TestContext.OutputDir+"/%s", uuid), 0777))