2014-08-03 07:01:28 +00:00
/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
2018-11-09 02:08:38 +00:00
"errors"
2018-10-03 23:44:44 +00:00
"fmt"
"io/ioutil"
"os"
2015-04-02 17:24:21 +00:00
"time"
2018-11-09 02:08:38 +00:00
"k8s.io/klog"
2017-06-22 18:24:23 +00:00
"k8s.io/api/core/v1"
2017-01-17 03:38:19 +00:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2018-10-03 23:44:44 +00:00
"k8s.io/apimachinery/pkg/runtime"
2017-01-11 14:09:48 +00:00
"k8s.io/apimachinery/pkg/util/wait"
2018-10-03 23:44:44 +00:00
appsinformers "k8s.io/client-go/informers/apps/v1"
coreinformers "k8s.io/client-go/informers/core/v1"
policyinformers "k8s.io/client-go/informers/policy/v1beta1"
storageinformers "k8s.io/client-go/informers/storage/v1"
2017-06-23 20:56:37 +00:00
clientset "k8s.io/client-go/kubernetes"
2017-01-30 18:39:54 +00:00
"k8s.io/client-go/tools/record"
2018-01-04 02:12:18 +00:00
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
2018-10-03 23:44:44 +00:00
latestschedulerapi "k8s.io/kubernetes/pkg/scheduler/api/latest"
kubeschedulerconfig "k8s.io/kubernetes/pkg/scheduler/apis/config"
2018-01-04 02:12:18 +00:00
"k8s.io/kubernetes/pkg/scheduler/core"
2018-10-03 23:44:44 +00:00
"k8s.io/kubernetes/pkg/scheduler/factory"
2019-03-22 01:12:33 +00:00
internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache"
2018-01-04 02:12:18 +00:00
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/util"
2014-08-03 07:01:28 +00:00
)
2018-10-03 23:44:44 +00:00
// Defaults and event reasons used by the scheduler.
const (
	// BindTimeoutSeconds defines the default bind timeout, in seconds. It is
	// left untyped so it can seed the int64 bindTimeoutSeconds option.
	BindTimeoutSeconds = 100

	// SchedulerError is the reason recorded for events when an error occurs during scheduling a pod.
	SchedulerError = "SchedulerError"
)
2017-08-10 01:15:40 +00:00
2014-08-03 07:01:28 +00:00
// Scheduler watches for new unscheduled pods. It attempts to find
2015-09-10 08:40:22 +00:00
// nodes that they fit on and writes bindings back to the api server.
2014-08-03 07:01:28 +00:00
type Scheduler struct {
2018-10-03 23:44:44 +00:00
config * factory . Config
2014-08-03 07:01:28 +00:00
}
2018-08-31 11:42:10 +00:00
// Cache returns the cache in scheduler for test to check the data in scheduler.
2019-03-22 01:12:33 +00:00
func ( sched * Scheduler ) Cache ( ) internalcache . Cache {
2018-08-31 11:42:10 +00:00
return sched . config . SchedulerCache
}
2018-10-03 23:44:44 +00:00
// schedulerOptions gathers the tunables that New applies when building a
// Scheduler. Callers change them through Option values.
type schedulerOptions struct {
	// schedulerName is passed to the config factory and to the event handlers.
	schedulerName string
	// hardPodAffinitySymmetricWeight is passed through to the config factory.
	hardPodAffinitySymmetricWeight int32
	// disablePreemption, when true, turns preemption off.
	disablePreemption bool
	// percentageOfNodesToScore is passed through to the config factory.
	percentageOfNodesToScore int32
	// bindTimeoutSeconds is the bind timeout, expressed in seconds.
	bindTimeoutSeconds int64
}
// Option configures a Scheduler by mutating the option set that New builds
// the Scheduler from.
type Option func(opts *schedulerOptions)
// WithName returns an Option that sets the scheduler's name. When the option
// is not supplied the name stays at its default, default-scheduler.
func WithName(schedulerName string) Option {
	return Option(func(opts *schedulerOptions) {
		opts.schedulerName = schedulerName
	})
}
// WithHardPodAffinitySymmetricWeight returns an Option that sets the
// hardPodAffinitySymmetricWeight of the Scheduler; the default value is 1.
func WithHardPodAffinitySymmetricWeight(hardPodAffinitySymmetricWeight int32) Option {
	return Option(func(opts *schedulerOptions) {
		opts.hardPodAffinitySymmetricWeight = hardPodAffinitySymmetricWeight
	})
}
// WithPreemptionDisabled returns an Option that sets whether preemption is
// disabled for the Scheduler; the default value is false.
func WithPreemptionDisabled(disablePreemption bool) Option {
	return Option(func(opts *schedulerOptions) {
		opts.disablePreemption = disablePreemption
	})
}
// WithPercentageOfNodesToScore returns an Option that sets the
// percentageOfNodesToScore of the Scheduler; the default value is 50.
func WithPercentageOfNodesToScore(percentageOfNodesToScore int32) Option {
	return Option(func(opts *schedulerOptions) {
		opts.percentageOfNodesToScore = percentageOfNodesToScore
	})
}
// WithBindTimeoutSeconds returns an Option that sets the bind timeout of the
// Scheduler, in seconds; the default value is 100.
func WithBindTimeoutSeconds(bindTimeoutSeconds int64) Option {
	return Option(func(opts *schedulerOptions) {
		opts.bindTimeoutSeconds = bindTimeoutSeconds
	})
}
// defaultSchedulerOptions is the baseline option set. New copies it and then
// applies the caller-supplied options on top of the copy.
var defaultSchedulerOptions = schedulerOptions{
	schedulerName:                  v1.DefaultSchedulerName,
	hardPodAffinitySymmetricWeight: v1.DefaultHardPodAffinitySymmetricWeight,
	disablePreemption:              false,
	percentageOfNodesToScore:       schedulerapi.DefaultPercentageOfNodesToScore,
	bindTimeoutSeconds:             BindTimeoutSeconds,
}
// New returns a Scheduler
func New ( client clientset . Interface ,
nodeInformer coreinformers . NodeInformer ,
podInformer coreinformers . PodInformer ,
pvInformer coreinformers . PersistentVolumeInformer ,
pvcInformer coreinformers . PersistentVolumeClaimInformer ,
replicationControllerInformer coreinformers . ReplicationControllerInformer ,
replicaSetInformer appsinformers . ReplicaSetInformer ,
statefulSetInformer appsinformers . StatefulSetInformer ,
serviceInformer coreinformers . ServiceInformer ,
pdbInformer policyinformers . PodDisruptionBudgetInformer ,
storageClassInformer storageinformers . StorageClassInformer ,
recorder record . EventRecorder ,
schedulerAlgorithmSource kubeschedulerconfig . SchedulerAlgorithmSource ,
2018-10-11 08:49:31 +00:00
stopCh <- chan struct { } ,
2018-10-03 23:44:44 +00:00
opts ... func ( o * schedulerOptions ) ) ( * Scheduler , error ) {
options := defaultSchedulerOptions
for _ , opt := range opts {
opt ( & options )
}
// Set up the configurator which can create schedulers from configs.
configurator := factory . NewConfigFactory ( & factory . ConfigFactoryArgs {
SchedulerName : options . schedulerName ,
Client : client ,
NodeInformer : nodeInformer ,
PodInformer : podInformer ,
PvInformer : pvInformer ,
PvcInformer : pvcInformer ,
ReplicationControllerInformer : replicationControllerInformer ,
ReplicaSetInformer : replicaSetInformer ,
StatefulSetInformer : statefulSetInformer ,
ServiceInformer : serviceInformer ,
PdbInformer : pdbInformer ,
StorageClassInformer : storageClassInformer ,
HardPodAffinitySymmetricWeight : options . hardPodAffinitySymmetricWeight ,
DisablePreemption : options . disablePreemption ,
PercentageOfNodesToScore : options . percentageOfNodesToScore ,
BindTimeoutSeconds : options . bindTimeoutSeconds ,
} )
var config * factory . Config
source := schedulerAlgorithmSource
switch {
case source . Provider != nil :
// Create the config from a named algorithm provider.
sc , err := configurator . CreateFromProvider ( * source . Provider )
if err != nil {
return nil , fmt . Errorf ( "couldn't create scheduler using provider %q: %v" , * source . Provider , err )
}
config = sc
case source . Policy != nil :
// Create the config from a user specified policy source.
policy := & schedulerapi . Policy { }
switch {
case source . Policy . File != nil :
2018-11-13 03:22:47 +00:00
if err := initPolicyFromFile ( source . Policy . File . Path , policy ) ; err != nil {
return nil , err
2018-10-03 23:44:44 +00:00
}
case source . Policy . ConfigMap != nil :
2018-11-13 03:22:47 +00:00
if err := initPolicyFromConfigMap ( client , source . Policy . ConfigMap , policy ) ; err != nil {
return nil , err
2018-10-03 23:44:44 +00:00
}
}
sc , err := configurator . CreateFromConfig ( * policy )
if err != nil {
return nil , fmt . Errorf ( "couldn't create scheduler from policy: %v" , err )
}
config = sc
default :
return nil , fmt . Errorf ( "unsupported algorithm source: %v" , source )
}
// Additional tweaks to the config produced by the configurator.
config . Recorder = recorder
config . DisablePreemption = options . disablePreemption
2018-10-11 08:49:31 +00:00
config . StopEverything = stopCh
2018-10-01 21:57:17 +00:00
2018-10-03 23:44:44 +00:00
// Create the scheduler.
sched := NewFromConfig ( config )
2018-10-01 21:57:17 +00:00
AddAllEventHandlers ( sched , options . schedulerName , nodeInformer , podInformer , pvInformer , pvcInformer , replicationControllerInformer , replicaSetInformer , statefulSetInformer , serviceInformer , pdbInformer , storageClassInformer )
2018-10-03 23:44:44 +00:00
return sched , nil
2014-08-03 07:01:28 +00:00
}
2018-11-13 03:22:47 +00:00
// initPolicyFromFile initializes policy from the serialized policy stored in
// the file at policyFile.
//
// It returns an error when the file cannot be stat'ed or read, or when its
// contents cannot be decoded into policy with latestschedulerapi.Codec.
func initPolicyFromFile(policyFile string, policy *schedulerapi.Policy) error {
	// Use a policy serialized in a file.
	if _, err := os.Stat(policyFile); err != nil {
		// Keep the underlying cause: Stat can fail for reasons other than the
		// file being absent (e.g. permission denied).
		return fmt.Errorf("missing policy config file %s: %v", policyFile, err)
	}
	data, err := ioutil.ReadFile(policyFile)
	if err != nil {
		return fmt.Errorf("couldn't read policy config: %v", err)
	}
	if err := runtime.DecodeInto(latestschedulerapi.Codec, data, policy); err != nil {
		return fmt.Errorf("invalid policy: %v", err)
	}
	return nil
}
// initPolicyFromConfigMap initializes policy from the serialized policy stored
// in the config map referenced by policyRef.
//
// The config map is fetched through client, and the value stored under
// kubeschedulerconfig.SchedulerPolicyConfigMapKey is decoded into policy. An
// error is returned when the config map cannot be fetched, when the key is
// absent, or when the value does not decode.
func initPolicyFromConfigMap(client clientset.Interface, policyRef *kubeschedulerconfig.SchedulerPolicyConfigMapSource, policy *schedulerapi.Policy) error {
	// Use a policy serialized in a config map value.
	cm, getErr := client.CoreV1().ConfigMaps(policyRef.Namespace).Get(policyRef.Name, metav1.GetOptions{})
	if getErr != nil {
		return fmt.Errorf("couldn't get policy config map %s/%s: %v", policyRef.Namespace, policyRef.Name, getErr)
	}

	serialized, ok := cm.Data[kubeschedulerconfig.SchedulerPolicyConfigMapKey]
	if !ok {
		return fmt.Errorf("missing policy config map value at key %q", kubeschedulerconfig.SchedulerPolicyConfigMapKey)
	}

	if decodeErr := runtime.DecodeInto(latestschedulerapi.Codec, []byte(serialized), policy); decodeErr != nil {
		return fmt.Errorf("invalid policy: %v", decodeErr)
	}
	return nil
}
2017-11-07 14:41:39 +00:00
// NewFromConfig returns a new scheduler using the provided Config.
2018-10-03 23:44:44 +00:00
func NewFromConfig ( config * factory . Config ) * Scheduler {
2017-11-07 14:41:39 +00:00
metrics . Register ( )
return & Scheduler {
config : config ,
}
}
2017-05-07 05:45:02 +00:00
// Run begins watching and scheduling. It waits for cache to be synced, then starts a goroutine and returns immediately.
2017-03-07 20:21:16 +00:00
func ( sched * Scheduler ) Run ( ) {
2017-05-07 05:45:02 +00:00
if ! sched . config . WaitForCacheSync ( ) {
return
}
2017-03-07 20:21:16 +00:00
go wait . Until ( sched . scheduleOne , 0 , sched . config . StopEverything )
2014-08-03 07:01:28 +00:00
}
2018-09-21 02:19:35 +00:00
// Config returns scheduler's config pointer. It is exposed for testing purposes.
2018-10-03 23:44:44 +00:00
func ( sched * Scheduler ) Config ( ) * factory . Config {
2017-04-05 23:59:24 +00:00
return sched . config
}
2018-11-09 02:08:38 +00:00
// recordFailedSchedulingEvent records an event for the pod that indicates the
// pod has failed to schedule.
// NOTE: This function modifies "pod". "pod" should be copied before being passed.
func ( sched * Scheduler ) recordSchedulingFailure ( pod * v1 . Pod , err error , reason string , message string ) {
sched . config . Error ( pod , err )
sched . config . Recorder . Event ( pod , v1 . EventTypeWarning , "FailedScheduling" , message )
sched . config . PodConditionUpdater . Update ( pod , & v1 . PodCondition {
2019-02-22 01:46:18 +00:00
Type : v1 . PodScheduled ,
Status : v1 . ConditionFalse ,
Reason : reason ,
Message : err . Error ( ) ,
2018-11-09 02:08:38 +00:00
} )
}
2018-12-07 03:40:45 +00:00
// schedule implements the scheduling algorithm and returns the suggested result(host,
// evaluated nodes number,feasible nodes number).
func ( sched * Scheduler ) schedule ( pod * v1 . Pod ) ( core . ScheduleResult , error ) {
result , err := sched . config . Algorithm . Schedule ( pod , sched . config . NodeLister )
2014-08-03 07:01:28 +00:00
if err != nil {
2017-08-15 12:15:24 +00:00
pod = pod . DeepCopy ( )
2018-11-09 02:08:38 +00:00
sched . recordSchedulingFailure ( pod , err , v1 . PodReasonUnschedulable , err . Error ( ) )
2018-12-07 03:40:45 +00:00
return core . ScheduleResult { } , err
2014-08-03 07:01:28 +00:00
}
2018-12-07 03:40:45 +00:00
return result , err
2017-05-18 15:50:29 +00:00
}
2016-01-04 22:50:37 +00:00
2017-10-30 08:23:42 +00:00
// preempt tries to create room for a pod that has failed to schedule, by preempting lower priority pods if possible.
// If it succeeds, it adds the name of the node where preemption has happened to the pod annotations.
// It returns the node name and an error if any.
2017-08-10 01:15:40 +00:00
func ( sched * Scheduler ) preempt ( preemptor * v1 . Pod , scheduleErr error ) ( string , error ) {
preemptor , err := sched . config . PodPreemptor . GetUpdatedPod ( preemptor )
if err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "Error getting the updated preemptor pod object: %v" , err )
2017-08-10 01:15:40 +00:00
return "" , err
}
2018-01-13 04:51:06 +00:00
2017-11-08 01:09:21 +00:00
node , victims , nominatedPodsToClear , err := sched . config . Algorithm . Preempt ( preemptor , sched . config . NodeLister , scheduleErr )
2017-08-10 01:15:40 +00:00
if err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "Error preempting victims to make room for %v/%v." , preemptor . Namespace , preemptor . Name )
2017-08-10 01:15:40 +00:00
return "" , err
}
2017-11-08 01:09:21 +00:00
var nodeName = ""
if node != nil {
nodeName = node . Name
2018-12-18 07:41:53 +00:00
// Update the scheduling queue with the nominated pod information. Without
// this, there would be a race condition between the next scheduling cycle
// and the time the scheduler receives a Pod Update for the nominated pod.
sched . config . SchedulingQueue . UpdateNominatedPodForNode ( preemptor , nodeName )
// Make a call to update nominated node name of the pod on the API server.
2018-02-02 19:24:20 +00:00
err = sched . config . PodPreemptor . SetNominatedNodeName ( preemptor , nodeName )
2017-11-08 01:09:21 +00:00
if err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "Error in preemption process. Cannot update pod %v/%v annotations: %v" , preemptor . Namespace , preemptor . Name , err )
2018-12-18 07:41:53 +00:00
sched . config . SchedulingQueue . DeleteNominatedPodIfExists ( preemptor )
2017-08-10 01:15:40 +00:00
return "" , err
}
2018-12-18 07:41:53 +00:00
2017-11-08 01:09:21 +00:00
for _ , victim := range victims {
if err := sched . config . PodPreemptor . DeletePod ( victim ) ; err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "Error preempting pod %v/%v: %v" , victim . Namespace , victim . Name , err )
2017-11-08 01:09:21 +00:00
return "" , err
}
sched . config . Recorder . Eventf ( victim , v1 . EventTypeNormal , "Preempted" , "by %v/%v on node %v" , preemptor . Namespace , preemptor . Name , nodeName )
}
2018-11-16 08:33:57 +00:00
metrics . PreemptionVictims . Set ( float64 ( len ( victims ) ) )
2017-11-08 01:09:21 +00:00
}
2017-11-21 06:15:00 +00:00
// Clearing nominated pods should happen outside of "if node != nil". Node could
// be nil when a pod with nominated node name is eligible to preempt again,
// but preemption logic does not find any node for it. In that case Preempt()
// function of generic_scheduler.go returns the pod itself for removal of the annotation.
2017-11-08 01:09:21 +00:00
for _ , p := range nominatedPodsToClear {
2018-02-02 19:24:20 +00:00
rErr := sched . config . PodPreemptor . RemoveNominatedNodeName ( p )
2017-11-08 01:09:21 +00:00
if rErr != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "Cannot remove nominated node annotation of pod: %v" , rErr )
2017-11-08 01:09:21 +00:00
// We do not return as this error is not critical.
}
2017-08-10 01:15:40 +00:00
}
2017-11-08 01:09:21 +00:00
return nodeName , err
2017-08-10 01:15:40 +00:00
}
2018-08-18 00:45:51 +00:00
// assumeVolumes will update the volume cache with the chosen bindings
2017-11-08 21:09:58 +00:00
//
// This function modifies assumed if volume binding is required.
2018-08-18 00:45:51 +00:00
func ( sched * Scheduler ) assumeVolumes ( assumed * v1 . Pod , host string ) ( allBound bool , err error ) {
2018-12-27 22:45:04 +00:00
allBound , err = sched . config . VolumeBinder . Binder . AssumePodVolumes ( assumed , host )
if err != nil {
sched . recordSchedulingFailure ( assumed , err , SchedulerError ,
fmt . Sprintf ( "AssumePodVolumes failed: %v" , err ) )
2017-11-08 21:09:58 +00:00
}
2018-08-18 00:45:51 +00:00
return
2017-11-08 21:09:58 +00:00
}
2018-08-18 00:45:51 +00:00
// bindVolumes will make the API update with the assumed bindings and wait until
// the PV controller has completely finished the binding operation.
//
// If binding errors, times out or gets undone, then an error will be returned to
// retry scheduling.
func ( sched * Scheduler ) bindVolumes ( assumed * v1 . Pod ) error {
2018-11-09 18:49:10 +00:00
klog . V ( 5 ) . Infof ( "Trying to bind volumes for pod \"%v/%v\"" , assumed . Namespace , assumed . Name )
2018-08-18 00:45:51 +00:00
err := sched . config . VolumeBinder . Binder . BindPodVolumes ( assumed )
if err != nil {
2018-11-09 18:49:10 +00:00
klog . V ( 1 ) . Infof ( "Failed to bind volumes for pod \"%v/%v\": %v" , assumed . Namespace , assumed . Name , err )
2017-11-08 21:09:58 +00:00
2018-08-18 00:45:51 +00:00
// Unassume the Pod and retry scheduling
if forgetErr := sched . config . SchedulerCache . ForgetPod ( assumed ) ; forgetErr != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "scheduler cache ForgetPod failed: %v" , forgetErr )
2017-11-08 21:09:58 +00:00
}
2018-11-09 02:08:38 +00:00
sched . recordSchedulingFailure ( assumed , err , "VolumeBindingFailed" , err . Error ( ) )
2018-08-18 00:45:51 +00:00
return err
2017-11-08 21:09:58 +00:00
}
2018-11-09 18:49:10 +00:00
klog . V ( 5 ) . Infof ( "Success binding volumes for pod \"%v/%v\"" , assumed . Namespace , assumed . Name )
2018-08-18 00:45:51 +00:00
return nil
2017-11-08 21:09:58 +00:00
}
2017-09-06 15:04:51 +00:00
// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
2017-08-02 17:09:24 +00:00
// assume modifies `assumed`.
func ( sched * Scheduler ) assume ( assumed * v1 . Pod , host string ) error {
2016-04-13 13:44:06 +00:00
// Optimistically assume that the binding will succeed and send it to apiserver
// in the background.
2016-08-15 04:10:42 +00:00
// If the binding fails, scheduler will release resources allocated to assumed pod
// immediately.
2017-05-18 15:50:29 +00:00
assumed . Spec . NodeName = host
2018-09-29 11:30:47 +00:00
2017-08-02 17:09:24 +00:00
if err := sched . config . SchedulerCache . AssumePod ( assumed ) ; err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "scheduler cache AssumePod failed: %v" , err )
2017-08-03 19:40:29 +00:00
// This is most probably result of a BUG in retrying logic.
// We report an error here so that pod scheduling can be retried.
// This relies on the fact that Error will check if the pod has been bound
// to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop).
2018-11-09 02:08:38 +00:00
sched . recordSchedulingFailure ( assumed , err , SchedulerError ,
fmt . Sprintf ( "AssumePod failed: %v" , err ) )
2017-05-18 15:50:29 +00:00
return err
2016-05-29 01:39:57 +00:00
}
2018-11-16 22:22:15 +00:00
// if "assumed" is a nominated pod, we should remove it from internal cache
if sched . config . SchedulingQueue != nil {
sched . config . SchedulingQueue . DeleteNominatedPodIfExists ( assumed )
}
2016-04-13 13:44:06 +00:00
2017-05-18 15:50:29 +00:00
return nil
}
2017-02-15 08:59:30 +00:00
2017-05-18 15:50:29 +00:00
// bind binds a pod to a given node defined in a binding object. We expect this to run asynchronously, so we
// handle binding metrics internally.
func ( sched * Scheduler ) bind ( assumed * v1 . Pod , b * v1 . Binding ) error {
bindingStart := time . Now ( )
// If binding succeeded then PodScheduled condition will be updated in apiserver so that
// it's atomic with setting host.
2018-02-08 08:40:56 +00:00
err := sched . config . GetBinder ( assumed ) . Bind ( b )
2018-09-17 06:49:15 +00:00
if finErr := sched . config . SchedulerCache . FinishBinding ( assumed ) ; finErr != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "scheduler cache FinishBinding failed: %v" , finErr )
2017-08-03 19:40:29 +00:00
}
2017-05-18 15:50:29 +00:00
if err != nil {
2018-11-09 18:49:10 +00:00
klog . V ( 1 ) . Infof ( "Failed to bind pod: %v/%v" , assumed . Namespace , assumed . Name )
2017-05-18 15:50:29 +00:00
if err := sched . config . SchedulerCache . ForgetPod ( assumed ) ; err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "scheduler cache ForgetPod failed: %v" , err )
2017-05-18 15:50:29 +00:00
}
2018-11-09 02:08:38 +00:00
sched . recordSchedulingFailure ( assumed , err , SchedulerError ,
fmt . Sprintf ( "Binding rejected: %v" , err ) )
2017-05-18 15:50:29 +00:00
return err
}
2017-07-04 03:04:45 +00:00
2018-12-26 08:31:29 +00:00
metrics . BindingLatency . Observe ( metrics . SinceInSeconds ( bindingStart ) )
metrics . DeprecatedBindingLatency . Observe ( metrics . SinceInMicroseconds ( bindingStart ) )
2018-06-07 11:20:26 +00:00
metrics . SchedulingLatency . WithLabelValues ( metrics . Binding ) . Observe ( metrics . SinceInSeconds ( bindingStart ) )
2019-02-22 13:40:13 +00:00
metrics . DeprecatedSchedulingLatency . WithLabelValues ( metrics . Binding ) . Observe ( metrics . SinceInSeconds ( bindingStart ) )
2018-03-13 08:39:48 +00:00
sched . config . Recorder . Eventf ( assumed , v1 . EventTypeNormal , "Scheduled" , "Successfully assigned %v/%v to %v" , assumed . Namespace , assumed . Name , b . Target . Name )
2017-05-18 15:50:29 +00:00
return nil
}
2016-04-13 13:44:06 +00:00
2017-05-18 15:50:29 +00:00
// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
func ( sched * Scheduler ) scheduleOne ( ) {
2018-11-09 02:08:38 +00:00
plugins := sched . config . PluginSet
// Remove all plugin context data at the beginning of a scheduling cycle.
if plugins . Data ( ) . Ctx != nil {
plugins . Data ( ) . Ctx . Reset ( )
}
2017-05-18 15:50:29 +00:00
pod := sched . config . NextPod ( )
2018-09-14 23:59:54 +00:00
// pod could be nil when schedulerQueue is closed
if pod == nil {
return
}
2017-05-18 15:50:29 +00:00
if pod . DeletionTimestamp != nil {
sched . config . Recorder . Eventf ( pod , v1 . EventTypeWarning , "FailedScheduling" , "skip schedule deleting pod: %v/%v" , pod . Namespace , pod . Name )
2018-11-09 18:49:10 +00:00
klog . V ( 3 ) . Infof ( "Skip schedule deleting pod: %v/%v" , pod . Namespace , pod . Name )
2017-05-18 15:50:29 +00:00
return
}
2018-11-09 18:49:10 +00:00
klog . V ( 3 ) . Infof ( "Attempting to schedule pod: %v/%v" , pod . Namespace , pod . Name )
2017-05-18 15:50:29 +00:00
// Synchronously attempt to find a fit for the pod.
start := time . Now ( )
2018-12-07 03:40:45 +00:00
scheduleResult , err := sched . schedule ( pod )
2017-05-18 15:50:29 +00:00
if err != nil {
2017-08-10 01:15:40 +00:00
// schedule() may have failed because the pod would not fit on any host, so we try to
// preempt, with the expectation that the next time the pod is tried for scheduling it
// will fit due to the preemption. It is also possible that a different pod will schedule
// into the resources that were preempted, but this is harmless.
if fitError , ok := err . ( * core . FitError ) ; ok {
2018-11-16 08:33:57 +00:00
if ! util . PodPriorityEnabled ( ) || sched . config . DisablePreemption {
klog . V ( 3 ) . Infof ( "Pod priority feature is not enabled or preemption is disabled by scheduler configuration." +
" No preemption is performed." )
} else {
preemptionStartTime := time . Now ( )
sched . preempt ( pod , fitError )
metrics . PreemptionAttempts . Inc ( )
2018-12-26 08:31:29 +00:00
metrics . SchedulingAlgorithmPremptionEvaluationDuration . Observe ( metrics . SinceInSeconds ( preemptionStartTime ) )
metrics . DeprecatedSchedulingAlgorithmPremptionEvaluationDuration . Observe ( metrics . SinceInMicroseconds ( preemptionStartTime ) )
2018-11-16 08:33:57 +00:00
metrics . SchedulingLatency . WithLabelValues ( metrics . PreemptionEvaluation ) . Observe ( metrics . SinceInSeconds ( preemptionStartTime ) )
2019-02-22 13:40:13 +00:00
metrics . DeprecatedSchedulingLatency . WithLabelValues ( metrics . PreemptionEvaluation ) . Observe ( metrics . SinceInSeconds ( preemptionStartTime ) )
2018-11-16 08:33:57 +00:00
}
2018-05-30 18:59:55 +00:00
// Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully)
metrics . PodScheduleFailures . Inc ( )
} else {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "error selecting node for pod: %v" , err )
2018-05-30 18:59:55 +00:00
metrics . PodScheduleErrors . Inc ( )
2017-08-10 01:15:40 +00:00
}
2017-05-18 15:50:29 +00:00
return
}
2018-12-26 08:31:29 +00:00
metrics . SchedulingAlgorithmLatency . Observe ( metrics . SinceInSeconds ( start ) )
metrics . DeprecatedSchedulingAlgorithmLatency . Observe ( metrics . SinceInMicroseconds ( start ) )
2017-05-18 15:50:29 +00:00
// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
// This allows us to keep scheduling without waiting on binding to occur.
2017-11-08 21:09:58 +00:00
assumedPod := pod . DeepCopy ( )
// Assume volumes first before assuming the pod.
//
2018-08-18 00:45:51 +00:00
// If all volumes are completely bound, then allBound is true and binding will be skipped.
2017-11-08 21:09:58 +00:00
//
2018-08-18 00:45:51 +00:00
// Otherwise, binding of volumes is started after the pod is assumed, but before pod binding.
2017-11-08 21:09:58 +00:00
//
// This function modifies 'assumedPod' if volume binding is required.
2018-12-07 03:40:45 +00:00
allBound , err := sched . assumeVolumes ( assumedPod , scheduleResult . SuggestedHost )
2017-11-08 21:09:58 +00:00
if err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "error assuming volumes: %v" , err )
2018-05-30 18:59:55 +00:00
metrics . PodScheduleErrors . Inc ( )
2017-11-08 21:09:58 +00:00
return
}
2018-11-09 02:08:38 +00:00
// Run "reserve" plugins.
for _ , pl := range plugins . ReservePlugins ( ) {
2018-12-07 03:40:45 +00:00
if err := pl . Reserve ( plugins , assumedPod , scheduleResult . SuggestedHost ) ; err != nil {
2018-11-09 02:08:38 +00:00
klog . Errorf ( "error while running %v reserve plugin for pod %v: %v" , pl . Name ( ) , assumedPod . Name , err )
sched . recordSchedulingFailure ( assumedPod , err , SchedulerError ,
fmt . Sprintf ( "reserve plugin %v failed" , pl . Name ( ) ) )
metrics . PodScheduleErrors . Inc ( )
return
}
}
2018-12-07 03:40:45 +00:00
// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost
err = sched . assume ( assumedPod , scheduleResult . SuggestedHost )
2017-05-18 15:50:29 +00:00
if err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "error assuming pod: %v" , err )
2018-05-30 18:59:55 +00:00
metrics . PodScheduleErrors . Inc ( )
2017-05-18 15:50:29 +00:00
return
}
// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
go func ( ) {
2018-08-18 00:45:51 +00:00
// Bind volumes first before Pod
if ! allBound {
2018-05-30 18:59:55 +00:00
err := sched . bindVolumes ( assumedPod )
2018-08-18 00:45:51 +00:00
if err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "error binding volumes: %v" , err )
2018-05-30 18:59:55 +00:00
metrics . PodScheduleErrors . Inc ( )
2018-08-18 00:45:51 +00:00
return
}
}
2018-11-09 02:08:38 +00:00
// Run "prebind" plugins.
for _ , pl := range plugins . PrebindPlugins ( ) {
2018-12-07 03:40:45 +00:00
approved , err := pl . Prebind ( plugins , assumedPod , scheduleResult . SuggestedHost )
2018-11-09 02:08:38 +00:00
if err != nil {
approved = false
klog . Errorf ( "error while running %v prebind plugin for pod %v: %v" , pl . Name ( ) , assumedPod . Name , err )
metrics . PodScheduleErrors . Inc ( )
}
if ! approved {
sched . Cache ( ) . ForgetPod ( assumedPod )
var reason string
if err == nil {
msg := fmt . Sprintf ( "prebind plugin %v rejected pod %v." , pl . Name ( ) , assumedPod . Name )
klog . V ( 4 ) . Infof ( msg )
err = errors . New ( msg )
reason = v1 . PodReasonUnschedulable
} else {
reason = SchedulerError
}
sched . recordSchedulingFailure ( assumedPod , err , reason , err . Error ( ) )
return
}
}
2017-11-08 21:09:58 +00:00
err := sched . bind ( assumedPod , & v1 . Binding {
2017-08-02 17:09:24 +00:00
ObjectMeta : metav1 . ObjectMeta { Namespace : assumedPod . Namespace , Name : assumedPod . Name , UID : assumedPod . UID } ,
2016-11-18 20:52:35 +00:00
Target : v1 . ObjectReference {
2016-04-13 13:44:06 +00:00
Kind : "Node" ,
2018-12-07 03:40:45 +00:00
Name : scheduleResult . SuggestedHost ,
2016-04-13 13:44:06 +00:00
} ,
2017-05-18 15:50:29 +00:00
} )
2018-12-26 08:31:29 +00:00
metrics . E2eSchedulingLatency . Observe ( metrics . SinceInSeconds ( start ) )
metrics . DeprecatedE2eSchedulingLatency . Observe ( metrics . SinceInMicroseconds ( start ) )
2015-04-02 17:24:21 +00:00
if err != nil {
2018-11-09 18:49:10 +00:00
klog . Errorf ( "error binding pod: %v" , err )
2018-05-30 18:59:55 +00:00
metrics . PodScheduleErrors . Inc ( )
} else {
2018-12-07 03:40:45 +00:00
klog . V ( 2 ) . Infof ( "pod %v/%v is bound successfully on node %v, %d nodes evaluated, %d nodes were found feasible" , assumedPod . Namespace , assumedPod . Name , scheduleResult . SuggestedHost , scheduleResult . EvaluatedNodes , scheduleResult . FeasibleNodes )
2018-05-30 18:59:55 +00:00
metrics . PodScheduleSuccesses . Inc ( )
2015-03-27 22:33:03 +00:00
}
2016-04-13 13:44:06 +00:00
} ( )
2014-08-03 07:01:28 +00:00
}