mirror of https://github.com/k3s-io/k3s
Add scheduler throughput metric.
This adds a counter to the scheduler that can be used to calculate throughput and error ratio. Pods which fail to schedule are not counted as errors, but can still be tracked separately from successes. We already measure scheduler latency, but throughput was missing. This should be considered a key metric for the scheduler.

pull/58/head
parent 170dcc2ea0
commit b0a8dbbc9d
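The counter is meant to be consumed as a rate: throughput is the rate of all schedule_attempts_total increments, and the error ratio is the rate of result="error" divided by that total (with SchedulerSubsystem = "scheduler", the scraped metric name would be scheduler_schedule_attempts_total, typically queried with Prometheus rate() expressions). Below is a minimal Go sketch of the same arithmetic over two hypothetical snapshots of the per-result counts; the names are illustrative and not part of the commit.

package main

import (
	"fmt"
	"time"
)

// attemptCounts is a hypothetical snapshot of the counter, keyed by the
// "result" label ("scheduled", "unschedulable", "error").
type attemptCounts map[string]float64

func total(c attemptCounts) float64 {
	sum := 0.0
	for _, v := range c {
		sum += v
	}
	return sum
}

// throughputAndErrorRatio derives attempts/second and the share of attempts
// that ended in an internal error between two snapshots taken 'window' apart.
func throughputAndErrorRatio(prev, cur attemptCounts, window time.Duration) (float64, float64) {
	attempts := total(cur) - total(prev)
	throughput := attempts / window.Seconds()
	errorRatio := 0.0
	if attempts > 0 {
		errorRatio = (cur["error"] - prev["error"]) / attempts
	}
	return throughput, errorRatio
}

func main() {
	prev := attemptCounts{"scheduled": 100, "unschedulable": 8, "error": 2}
	cur := attemptCounts{"scheduled": 220, "unschedulable": 12, "error": 3}
	tp, er := throughputAndErrorRatio(prev, cur, time.Minute)
	fmt.Printf("throughput=%.2f pods/sec errorRatio=%.3f\n", tp, er)
}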
@@ -46,6 +46,18 @@ const (
 // All the histogram based metrics have 1ms as size for the smallest bucket.
 var (
+	scheduleAttempts = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: SchedulerSubsystem,
+			Name:      "schedule_attempts_total",
+			Help:      "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
+		}, []string{"result"})
+	// PodScheduleSuccesses counts how many pods were scheduled.
+	PodScheduleSuccesses = scheduleAttempts.With(prometheus.Labels{"result": "scheduled"})
+	// PodScheduleFailures counts how many pods could not be scheduled.
+	PodScheduleFailures = scheduleAttempts.With(prometheus.Labels{"result": "unschedulable"})
+	// PodScheduleErrors counts how many pods could not be scheduled due to a scheduler error.
+	PodScheduleErrors = scheduleAttempts.With(prometheus.Labels{"result": "error"})
 	SchedulingLatency = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
 			Subsystem: SchedulerSubsystem,
@@ -135,6 +147,7 @@ var (
 		}, []string{"result"})

 	metricsList = []prometheus.Collector{
+		scheduleAttempts,
 		SchedulingLatency,
 		E2eSchedulingLatency,
 		SchedulingAlgorithmLatency,
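The block above follows the usual client_golang pattern: one CounterVec partitioned by a "result" label, child counters pre-bound with With() so the scheduling hot path only calls Inc(), and the vec registered once through metricsList. Here is a self-contained sketch of that pattern; the subsystem name and HTTP wiring are illustrative and not part of this commit.

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var (
	// attempts mirrors the schedule_attempts_total pattern: one counter
	// family, partitioned by a "result" label.
	attempts = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: "example_scheduler", // illustrative; the real code uses SchedulerSubsystem
			Name:      "schedule_attempts_total",
			Help:      "Number of attempts to schedule pods, by result.",
		}, []string{"result"})

	// Pre-binding the label values does the label lookup once, so each
	// scheduling outcome only costs an Inc() call.
	successes     = attempts.With(prometheus.Labels{"result": "scheduled"})
	unschedulable = attempts.With(prometheus.Labels{"result": "unschedulable"})
	internalErrs  = attempts.With(prometheus.Labels{"result": "error"})
)

func main() {
	// Registering the vec once exposes all of its children.
	prometheus.MustRegister(attempts)

	// Simulate a few scheduling outcomes.
	successes.Inc()
	successes.Inc()
	unschedulable.Inc()
	internalErrs.Inc()

	// Serve /metrics so Prometheus can scrape the counter.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":9090", nil))
}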
@@ -416,6 +416,13 @@ func (sched *Scheduler) scheduleOne() {
 			metrics.PreemptionAttempts.Inc()
 			metrics.SchedulingAlgorithmPremptionEvaluationDuration.Observe(metrics.SinceInMicroseconds(preemptionStartTime))
 			metrics.SchedulingLatency.WithLabelValues(metrics.PreemptionEvaluation).Observe(metrics.SinceInSeconds(preemptionStartTime))
+			// Pod did not fit anywhere, so it is counted as a failure. If preemption
+			// succeeds, the pod should get counted as a success the next time we try to
+			// schedule it. (hopefully)
+			metrics.PodScheduleFailures.Inc()
+		} else {
+			glog.Errorf("error selecting node for pod: %v", err)
+			metrics.PodScheduleErrors.Inc()
 		}
 		return
 	}
@@ -433,20 +440,26 @@ func (sched *Scheduler) scheduleOne() {
 	// This function modifies 'assumedPod' if volume binding is required.
 	allBound, err := sched.assumeVolumes(assumedPod, suggestedHost)
 	if err != nil {
+		glog.Errorf("error assuming volumes: %v", err)
+		metrics.PodScheduleErrors.Inc()
 		return
 	}

 	// assume modifies `assumedPod` by setting NodeName=suggestedHost
 	err = sched.assume(assumedPod, suggestedHost)
 	if err != nil {
+		glog.Errorf("error assuming pod: %v", err)
+		metrics.PodScheduleErrors.Inc()
 		return
 	}
 	// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
 	go func() {
 		// Bind volumes first before Pod
 		if !allBound {
-			err = sched.bindVolumes(assumedPod)
+			err := sched.bindVolumes(assumedPod)
 			if err != nil {
+				glog.Errorf("error binding volumes: %v", err)
+				metrics.PodScheduleErrors.Inc()
 				return
 			}
 		}
@@ -460,7 +473,10 @@ func (sched *Scheduler) scheduleOne() {
 		})
 		metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
 		if err != nil {
-			glog.Errorf("Internal error binding pod: (%v)", err)
+			glog.Errorf("error binding pod: %v", err)
+			metrics.PodScheduleErrors.Inc()
+		} else {
+			metrics.PodScheduleSuccesses.Inc()
 		}
 	}()
 }
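The paths shown above each record exactly one result: unschedulable when the pod did not fit anywhere, error for internal failures (assuming the pod, binding volumes, binding the pod), and scheduled once the asynchronous bind succeeds. A hypothetical test-style sketch of checking such per-result counters with client_golang's testutil follows; the CounterVec here is a local stand-in, not a test from this commit.

package example

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// TestScheduleAttemptsByResult uses a local CounterVec as a stand-in for
// scheduleAttempts and checks the per-result counts after a few increments.
func TestScheduleAttemptsByResult(t *testing.T) {
	attempts := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "schedule_attempts_total"},
		[]string{"result"})
	scheduled := attempts.With(prometheus.Labels{"result": "scheduled"})
	unschedulable := attempts.With(prometheus.Labels{"result": "unschedulable"})

	// Two pods placed successfully, one pod that did not fit anywhere.
	scheduled.Inc()
	scheduled.Inc()
	unschedulable.Inc()

	if got := testutil.ToFloat64(scheduled); got != 2 {
		t.Errorf("result=scheduled count = %v, want 2", got)
	}
	if got := testutil.ToFloat64(unschedulable); got != 1 {
		t.Errorf("result=unschedulable count = %v, want 1", got)
	}
}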