Add scheduler throughput metric.

This adds a counter to the scheduler that can be used to calculate
throughput and error ratio. Pods which fail to schedule are not counted
as errors, but can still be tracked separately from successes.

We already measure scheduler latency, but throughput was missing. This
should be considered a key metric for the scheduler.
pull/58/head
Jonathan Basseri 2018-05-30 11:59:55 -07:00
parent 170dcc2ea0
commit b0a8dbbc9d
2 changed files with 31 additions and 2 deletions

View File

@ -46,6 +46,18 @@ const (
// All the histogram based metrics have 1ms as size for the smallest bucket.
var (
scheduleAttempts = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
}, []string{"result"})
// PodScheduleSuccesses counts how many pods were scheduled.
PodScheduleSuccesses = scheduleAttempts.With(prometheus.Labels{"result": "scheduled"})
// PodScheduleFailures counts how many pods could not be scheduled.
PodScheduleFailures = scheduleAttempts.With(prometheus.Labels{"result": "unschedulable"})
// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
PodScheduleErrors = scheduleAttempts.With(prometheus.Labels{"result": "error"})
SchedulingLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Subsystem: SchedulerSubsystem,
@ -135,6 +147,7 @@ var (
}, []string{"result"})
metricsList = []prometheus.Collector{
scheduleAttempts,
SchedulingLatency,
E2eSchedulingLatency,
SchedulingAlgorithmLatency,

View File

@ -416,6 +416,13 @@ func (sched *Scheduler) scheduleOne() {
metrics.PreemptionAttempts.Inc()
metrics.SchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime))
metrics.SchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime))
// Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully)
metrics.PodScheduleFailures.Inc()
} else {
glog.Errorf("error selecting node for pod: %v", err)
metrics.PodScheduleErrors.Inc()
}
return
}
@ -433,20 +440,26 @@ func (sched *Scheduler) scheduleOne() {
// This function modifies 'assumedPod' if volume binding is required.
allBound, err := sched.assumeVolumes(assumedPod, suggestedHost)
if err != nil {
glog.Errorf("error assuming volumes: %v", err)
metrics.PodScheduleErrors.Inc()
return
}
// assume modifies `assumedPod` by setting NodeName=suggestedHost
err = sched.assume(assumedPod, suggestedHost)
if err != nil {
glog.Errorf("error assuming pod: %v", err)
metrics.PodScheduleErrors.Inc()
return
}
// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
go func() {
// Bind volumes first before Pod
if !allBound {
err = sched.bindVolumes(assumedPod)
err := sched.bindVolumes(assumedPod)
if err != nil {
glog.Errorf("error binding volumes: %v", err)
metrics.PodScheduleErrors.Inc()
return
}
}
@ -460,7 +473,10 @@ func (sched *Scheduler) scheduleOne() {
})
metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
if err != nil {
glog.Errorf("Internal error binding pod: (%v)", err)
glog.Errorf("error binding pod: %v", err)
metrics.PodScheduleErrors.Inc()
} else {
metrics.PodScheduleSuccesses.Inc()
}
}()
}