Merge pull request #64526 from misterikkit/metrics

Add scheduler throughput metric
k8s-ci-robot 2018-09-25 00:17:40 -07:00 committed by GitHub
commit e1989af060
2 changed files with 31 additions and 2 deletions

@@ -46,6 +46,18 @@ const (
// All the histogram based metrics have 1ms as size for the smallest bucket.
var (
scheduleAttempts = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
}, []string{"result"})
// PodScheduleSuccesses counts how many pods were scheduled.
PodScheduleSuccesses = scheduleAttempts.With(prometheus.Labels{"result": "scheduled"})
// PodScheduleFailures counts how many pods could not be scheduled.
PodScheduleFailures = scheduleAttempts.With(prometheus.Labels{"result": "unschedulable"})
// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
PodScheduleErrors = scheduleAttempts.With(prometheus.Labels{"result": "error"})
SchedulingLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Subsystem: SchedulerSubsystem,
@@ -135,6 +147,7 @@ var (
}, []string{"result"})
metricsList = []prometheus.Collector{
scheduleAttempts,
SchedulingLatency,
E2eSchedulingLatency,
SchedulingAlgorithmLatency,
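
The hunk above follows the standard client_golang pattern: declare a single CounterVec partitioned by a `result` label, curry it once with `With`, and hand plain Counters to the call sites. The sketch below reproduces that pattern in a standalone program so the label partitioning can be seen end to end; the `demo` subsystem and the variable names are illustrative, not part of this change.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// attempts mirrors scheduleAttempts above: one counter vector partitioned
// by a "result" label. (The "demo" subsystem name is illustrative only.)
var attempts = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: "demo",
		Name:      "schedule_attempts_total",
		Help:      "Number of attempts to schedule pods, by result.",
	}, []string{"result"})

// Currying with With() pins the label once, so call sites just Inc().
var (
	scheduled     = attempts.With(prometheus.Labels{"result": "scheduled"})
	unschedulable = attempts.With(prometheus.Labels{"result": "unschedulable"})
	errored       = attempts.With(prometheus.Labels{"result": "error"})
)

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(attempts) // registering the vector registers every child series

	scheduled.Inc()
	scheduled.Inc()
	unschedulable.Inc()
	errored.Inc()

	// Gather and print the three child series to show the label partitioning.
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			fmt.Printf("%s{%s=%q} %v\n",
				mf.GetName(), m.GetLabel()[0].GetName(), m.GetLabel()[0].GetValue(),
				m.GetCounter().GetValue())
		}
	}
}
```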

@@ -416,6 +416,13 @@ func (sched *Scheduler) scheduleOne() {
metrics.PreemptionAttempts.Inc()
metrics.SchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime))
metrics.SchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime))
// Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully)
metrics.PodScheduleFailures.Inc()
} else {
glog.Errorf("error selecting node for pod: %v", err)
metrics.PodScheduleErrors.Inc()
}
return
}
@@ -433,20 +440,26 @@ func (sched *Scheduler) scheduleOne() {
// This function modifies 'assumedPod' if volume binding is required.
allBound, err := sched.assumeVolumes(assumedPod, suggestedHost)
if err != nil {
glog.Errorf("error assuming volumes: %v", err)
metrics.PodScheduleErrors.Inc()
return
}
// assume modifies `assumedPod` by setting NodeName=suggestedHost
err = sched.assume(assumedPod, suggestedHost)
if err != nil {
glog.Errorf("error assuming pod: %v", err)
metrics.PodScheduleErrors.Inc()
return
}
// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
go func() {
// Bind volumes first before Pod
if !allBound {
- err = sched.bindVolumes(assumedPod)
+ err := sched.bindVolumes(assumedPod)
if err != nil {
glog.Errorf("error binding volumes: %v", err)
metrics.PodScheduleErrors.Inc()
return
}
}
@@ -460,7 +473,10 @@ func (sched *Scheduler) scheduleOne() {
})
metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
if err != nil {
glog.Errorf("Internal error binding pod: (%v)", err)
glog.Errorf("error binding pod: %v", err)
metrics.PodScheduleErrors.Inc()
} else {
metrics.PodScheduleSuccesses.Inc()
}
}()
}
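
Taken together, the instrumentation above ends every pass through `scheduleOne` with exactly one `Inc()` on `schedule_attempts_total`, so throughput falls out as the rate of the `result="scheduled"` series (roughly `rate(scheduler_schedule_attempts_total{result="scheduled"}[1m])` in PromQL, assuming `SchedulerSubsystem` is `"scheduler"`). The following sketch is a hypothetical harness, not scheduler code: it shows how such a curried counter is typically registered and exposed for scraping, with the port and the simulated scheduling loop invented for illustration.

```go
package main

import (
	"log"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// A stand-in for the scheduler's scheduleAttempts vector; the names mirror
// the diff above, but everything wired around them is illustrative.
var scheduleAttempts = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: "scheduler",
		Name:      "schedule_attempts_total",
		Help:      "Number of attempts to schedule pods, by the result.",
	}, []string{"result"})

var podScheduleSuccesses = scheduleAttempts.With(prometheus.Labels{"result": "scheduled"})

func main() {
	prometheus.MustRegister(scheduleAttempts)

	// Simulate a scheduling loop: each successful attempt bumps the counter,
	// so rate(scheduler_schedule_attempts_total{result="scheduled"}[1m])
	// reads as pods scheduled per second, i.e. scheduler throughput.
	go func() {
		for {
			podScheduleSuccesses.Inc()
			time.Sleep(100 * time.Millisecond)
		}
	}()

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":9090", nil))
}
```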