diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go
index bd02b23fb4..81e047e0c8 100644
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -46,6 +46,18 @@ const (
 // All the histogram based metrics have 1ms as size for the smallest bucket.
 var (
+	scheduleAttempts = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "schedule_attempts_total",
+			Help:      "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
+		}, []string{"result"})
+	// PodScheduleSuccesses counts how many pods were scheduled.
+	PodScheduleSuccesses = scheduleAttempts.With(prometheus.Labels{"result": "scheduled"})
+	// PodScheduleFailures counts how many pods could not be scheduled.
+	PodScheduleFailures = scheduleAttempts.With(prometheus.Labels{"result": "unschedulable"})
+	// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
+	PodScheduleErrors = scheduleAttempts.With(prometheus.Labels{"result": "error"})
 	SchedulingLatency = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
 			Subsystem: SchedulerSubsystem,
@@ -135,6 +147,7 @@ var (
 		}, []string{"result"})

 	metricsList = []prometheus.Collector{
+		scheduleAttempts,
 		SchedulingLatency,
 		E2eSchedulingLatency,
 		SchedulingAlgorithmLatency,
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 3d836b6ee5..85c9383ff2 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -416,6 +416,13 @@ func (sched *Scheduler) scheduleOne() {
 			metrics.PreemptionAttempts.Inc()
 			metrics.SchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime))
 			metrics.SchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime))
+			// Pod did not fit anywhere, so it is counted as a failure. If preemption
+			// succeeds, the pod should get counted as a success the next time we try to
+			// schedule it. (hopefully)
+			metrics.PodScheduleFailures.Inc()
+		} else {
+			glog.Errorf("error selecting node for pod: %v", err)
+			metrics.PodScheduleErrors.Inc()
 		}
 		return
 	}
@@ -433,20 +440,26 @@ func (sched *Scheduler) scheduleOne() {
 	// This function modifies 'assumedPod' if volume binding is required.
 	allBound, err := sched.assumeVolumes(assumedPod, suggestedHost)
 	if err != nil {
+		glog.Errorf("error assuming volumes: %v", err)
+		metrics.PodScheduleErrors.Inc()
 		return
 	}

 	// assume modifies `assumedPod` by setting NodeName=suggestedHost
 	err = sched.assume(assumedPod, suggestedHost)
 	if err != nil {
+		glog.Errorf("error assuming pod: %v", err)
+		metrics.PodScheduleErrors.Inc()
 		return
 	}

 	// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
 	go func() {
 		// Bind volumes first before Pod
 		if !allBound {
-			err = sched.bindVolumes(assumedPod)
+			err := sched.bindVolumes(assumedPod)
 			if err != nil {
+				glog.Errorf("error binding volumes: %v", err)
+				metrics.PodScheduleErrors.Inc()
 				return
 			}
 		}
@@ -460,7 +473,10 @@ func (sched *Scheduler) scheduleOne() {
 		})
 		metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
 		if err != nil {
-			glog.Errorf("Internal error binding pod: (%v)", err)
+			glog.Errorf("error binding pod: %v", err)
+			metrics.PodScheduleErrors.Inc()
+		} else {
+			metrics.PodScheduleSuccesses.Inc()
 		}
 	}()
 }
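
For context on the metrics.go hunk: the three exported counters are children of a single CounterVec keyed by a "result" label, resolved once at package init so the hot paths in scheduleOne() only ever call Inc(). Below is a minimal, standalone sketch of that pattern using the Prometheus Go client; names like attempts and the package-main wiring are illustrative only, not the scheduler's actual registration path (which goes through metricsList and Register()).

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// One counter vector keyed by "result", analogous to schedule_attempts_total above.
	attempts := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: "scheduler",
			Name:      "schedule_attempts_total",
			Help:      "Number of attempts to schedule pods, by result.",
		}, []string{"result"})
	prometheus.MustRegister(attempts)

	// Resolve each label value once up front; the increment sites then work with
	// plain Counters and need no label-map lookup per scheduling attempt.
	successes := attempts.With(prometheus.Labels{"result": "scheduled"})
	failures := attempts.With(prometheus.Labels{"result": "unschedulable"})
	schedErrors := attempts.With(prometheus.Labels{"result": "error"})

	successes.Inc()
	failures.Inc()
	schedErrors.Inc()

	// Gather from the default registry and print just this metric family.
	families, err := prometheus.DefaultGatherer.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		if mf.GetName() == "scheduler_schedule_attempts_total" {
			fmt.Println(mf.String())
		}
	}
}

Pre-binding the children with With() keeps the full set of "result" values declared in one place and avoids repeated label resolution on every scheduling attempt, which is why the diff increments PodScheduleSuccesses/PodScheduleFailures/PodScheduleErrors directly rather than calling scheduleAttempts.WithLabelValues(...) at each site.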