2019-01-12 04:58:27 +00:00
/ *
Copyright 2014 The Kubernetes Authors .
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package scheduler
import (
2019-12-12 01:27:03 +00:00
"context"
2019-01-12 04:58:27 +00:00
"fmt"
"io/ioutil"
2019-12-12 01:27:03 +00:00
"math/rand"
2019-01-12 04:58:27 +00:00
"os"
"time"
2019-04-07 17:07:55 +00:00
"k8s.io/klog"
2019-09-27 21:51:53 +00:00
v1 "k8s.io/api/core/v1"
2019-01-12 04:58:27 +00:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/wait"
2019-12-12 01:27:03 +00:00
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/informers"
2019-01-12 04:58:27 +00:00
coreinformers "k8s.io/client-go/informers/core/v1"
clientset "k8s.io/client-go/kubernetes"
2019-12-12 01:27:03 +00:00
"k8s.io/client-go/tools/cache"
2019-09-27 21:51:53 +00:00
"k8s.io/client-go/tools/events"
2019-12-12 01:27:03 +00:00
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
kubefeatures "k8s.io/kubernetes/pkg/features"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
2019-01-12 04:58:27 +00:00
"k8s.io/kubernetes/pkg/scheduler/core"
2019-12-12 01:27:03 +00:00
frameworkplugins "k8s.io/kubernetes/pkg/scheduler/framework/plugins"
2019-08-30 18:33:25 +00:00
framework "k8s.io/kubernetes/pkg/scheduler/framework/v1alpha1"
internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache"
2019-09-27 21:51:53 +00:00
internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
2019-01-12 04:58:27 +00:00
"k8s.io/kubernetes/pkg/scheduler/metrics"
2019-12-12 01:27:03 +00:00
nodeinfosnapshot "k8s.io/kubernetes/pkg/scheduler/nodeinfo/snapshot"
2019-09-27 21:51:53 +00:00
"k8s.io/kubernetes/pkg/scheduler/volumebinder"
2019-01-12 04:58:27 +00:00
)
const (
	// BindTimeoutSeconds is the default bind timeout, in seconds.
	BindTimeoutSeconds = 100
	// SchedulerError is the reason recorded for events when an error occurs during scheduling a pod.
	SchedulerError = "SchedulerError"
	// frameworkMetricsSamplePercent is the percentage of scheduling cycles
	// for which framework metrics are recorded.
	frameworkMetricsSamplePercent = 10
)
2019-12-12 01:27:03 +00:00
// podConditionUpdater applies the given PodCondition to the given pod and
// reports any error encountered while doing so.
// TODO (ahmad-diaa): Remove type and replace it with scheduler methods
type podConditionUpdater interface {
	update(p *v1.Pod, cond *v1.PodCondition) error
}
// podPreemptor groups the pod operations preemption relies on: fetching the
// latest version of a pod, deleting a victim pod, and setting or removing the
// nominated node name of a preemptor pod.
// TODO (ahmad-diaa): Remove type and replace it with scheduler methods
type podPreemptor interface {
	getUpdatedPod(p *v1.Pod) (*v1.Pod, error)
	deletePod(p *v1.Pod) error
	setNominatedNodeName(p *v1.Pod, nodeName string) error
	removeNominatedNodeName(p *v1.Pod) error
}
2019-01-12 04:58:27 +00:00
// Scheduler watches for new unscheduled pods. It attempts to find
// nodes that they fit on and writes bindings back to the api server.
type Scheduler struct {
2019-09-27 21:51:53 +00:00
// It is expected that changes made via SchedulerCache will be observed
// by NodeLister and Algorithm.
SchedulerCache internalcache . Cache
Algorithm core . ScheduleAlgorithm
2019-12-12 01:27:03 +00:00
GetBinder func ( pod * v1 . Pod ) Binder
2019-09-27 21:51:53 +00:00
// PodConditionUpdater is used only in case of scheduling errors. If we succeed
// with scheduling, PodScheduled condition will be updated in apiserver in /bind
// handler so that binding and setting PodCondition it is atomic.
2019-12-12 01:27:03 +00:00
podConditionUpdater podConditionUpdater
2019-09-27 21:51:53 +00:00
// PodPreemptor is used to evict pods and update 'NominatedNode' field of
// the preemptor pod.
2019-12-12 01:27:03 +00:00
podPreemptor podPreemptor
2019-09-27 21:51:53 +00:00
// Framework runs scheduler plugins at configured extension points.
Framework framework . Framework
// NextPod should be a function that blocks until the next pod
// is available. We don't use a channel for this, because scheduling
// a pod may take some amount of time and we don't want pods to get
// stale while they sit in a channel.
2019-12-12 01:27:03 +00:00
NextPod func ( ) * framework . PodInfo
2019-09-27 21:51:53 +00:00
// Error is called if there is an error. It is passed the pod in
// question, and the error
2019-12-12 01:27:03 +00:00
Error func ( * framework . PodInfo , error )
2019-09-27 21:51:53 +00:00
// Recorder is the EventRecorder to use
Recorder events . EventRecorder
// Close this to shut down the scheduler.
StopEverything <- chan struct { }
// VolumeBinder handles PVC/PV binding for the pod.
VolumeBinder * volumebinder . VolumeBinder
// Disable pod preemption or not.
DisablePreemption bool
// SchedulingQueue holds pods to be scheduled
SchedulingQueue internalqueue . SchedulingQueue
2019-12-12 01:27:03 +00:00
scheduledPodsHasSynced func ( ) bool
// The final configuration of the framework.
Plugins schedulerapi . Plugins
PluginConfig [ ] schedulerapi . PluginConfig
2019-01-12 04:58:27 +00:00
}
// Cache returns the cache in scheduler for test to check the data in scheduler.
2019-08-30 18:33:25 +00:00
func ( sched * Scheduler ) Cache ( ) internalcache . Cache {
2019-09-27 21:51:53 +00:00
return sched . SchedulerCache
2019-01-12 04:58:27 +00:00
}
type schedulerOptions struct {
schedulerName string
2019-12-12 01:27:03 +00:00
schedulerAlgorithmSource schedulerapi . SchedulerAlgorithmSource
2019-01-12 04:58:27 +00:00
hardPodAffinitySymmetricWeight int32
disablePreemption bool
percentageOfNodesToScore int32
bindTimeoutSeconds int64
2019-12-12 01:27:03 +00:00
podInitialBackoffSeconds int64
podMaxBackoffSeconds int64
// Default registry contains all in-tree plugins
frameworkDefaultRegistry framework . Registry
// This registry contains out of tree plugins to be merged with default registry.
frameworkOutOfTreeRegistry framework . Registry
frameworkConfigProducerRegistry * frameworkplugins . ConfigProducerRegistry
frameworkPlugins * schedulerapi . Plugins
frameworkPluginConfig [ ] schedulerapi . PluginConfig
2019-01-12 04:58:27 +00:00
}
// Option is a functional option for New: each Option mutates the
// schedulerOptions that the Scheduler is built from.
type Option func(opts *schedulerOptions)
// WithName returns an Option that overrides the scheduler name. When the
// option is not supplied the name is default-scheduler.
func WithName(schedulerName string) Option {
	set := func(so *schedulerOptions) {
		so.schedulerName = schedulerName
	}
	return set
}
2019-12-12 01:27:03 +00:00
// WithAlgorithmSource returns an Option that overrides the scheduler's
// algorithm source. When the option is not supplied the source is the default
// provider.
func WithAlgorithmSource(source schedulerapi.SchedulerAlgorithmSource) Option {
	set := func(so *schedulerOptions) {
		so.schedulerAlgorithmSource = source
	}
	return set
}
2019-01-12 04:58:27 +00:00
// WithHardPodAffinitySymmetricWeight returns an Option that overrides
// hardPodAffinitySymmetricWeight. When the option is not supplied the value is
// v1.DefaultHardPodAffinitySymmetricWeight.
func WithHardPodAffinitySymmetricWeight(hardPodAffinitySymmetricWeight int32) Option {
	set := func(so *schedulerOptions) {
		so.hardPodAffinitySymmetricWeight = hardPodAffinitySymmetricWeight
	}
	return set
}
// WithPreemptionDisabled returns an Option that overrides disablePreemption.
// When the option is not supplied preemption stays enabled (false).
func WithPreemptionDisabled(disablePreemption bool) Option {
	set := func(so *schedulerOptions) {
		so.disablePreemption = disablePreemption
	}
	return set
}
// WithPercentageOfNodesToScore returns an Option that overrides
// percentageOfNodesToScore. When the option is not supplied the value is
// schedulerapi.DefaultPercentageOfNodesToScore.
func WithPercentageOfNodesToScore(percentageOfNodesToScore int32) Option {
	set := func(so *schedulerOptions) {
		so.percentageOfNodesToScore = percentageOfNodesToScore
	}
	return set
}
// WithBindTimeoutSeconds returns an Option that overrides bindTimeoutSeconds.
// When the option is not supplied the value is BindTimeoutSeconds (100).
func WithBindTimeoutSeconds(bindTimeoutSeconds int64) Option {
	set := func(so *schedulerOptions) {
		so.bindTimeoutSeconds = bindTimeoutSeconds
	}
	return set
}
2019-12-12 01:27:03 +00:00
// WithFrameworkDefaultRegistry returns an Option that replaces the framework's
// default plugin registry. This is only used in integration tests.
func WithFrameworkDefaultRegistry(registry framework.Registry) Option {
	set := func(so *schedulerOptions) {
		so.frameworkDefaultRegistry = registry
	}
	return set
}
// WithFrameworkOutOfTreeRegistry returns an Option that sets the registry of
// out-of-tree plugins. Those plugins are merged into the default registry.
func WithFrameworkOutOfTreeRegistry(registry framework.Registry) Option {
	set := func(so *schedulerOptions) {
		so.frameworkOutOfTreeRegistry = registry
	}
	return set
}
// WithFrameworkConfigProducerRegistry returns an Option that sets the
// framework plugin producer registry.
func WithFrameworkConfigProducerRegistry(registry *frameworkplugins.ConfigProducerRegistry) Option {
	set := func(so *schedulerOptions) {
		so.frameworkConfigProducerRegistry = registry
	}
	return set
}
// WithFrameworkPlugins returns an Option that sets the plugins the framework
// should be configured with.
func WithFrameworkPlugins(plugins *schedulerapi.Plugins) Option {
	set := func(so *schedulerOptions) {
		so.frameworkPlugins = plugins
	}
	return set
}
// WithFrameworkPluginConfig returns an Option that sets the PluginConfig slice
// the framework should be configured with.
func WithFrameworkPluginConfig(pluginConfig []schedulerapi.PluginConfig) Option {
	set := func(so *schedulerOptions) {
		so.frameworkPluginConfig = pluginConfig
	}
	return set
}
// WithPodInitialBackoffSeconds returns an Option that overrides
// podInitialBackoffSeconds. When the option is not supplied the value is
// derived from internalqueue.DefaultPodInitialBackoffDuration.
func WithPodInitialBackoffSeconds(podInitialBackoffSeconds int64) Option {
	set := func(so *schedulerOptions) {
		so.podInitialBackoffSeconds = podInitialBackoffSeconds
	}
	return set
}
// WithPodMaxBackoffSeconds returns an Option that overrides
// podMaxBackoffSeconds. When the option is not supplied the value is derived
// from internalqueue.DefaultPodMaxBackoffDuration.
func WithPodMaxBackoffSeconds(podMaxBackoffSeconds int64) Option {
	set := func(so *schedulerOptions) {
		so.podMaxBackoffSeconds = podMaxBackoffSeconds
	}
	return set
}
2019-01-12 04:58:27 +00:00
var defaultSchedulerOptions = schedulerOptions {
2019-12-12 01:27:03 +00:00
schedulerName : v1 . DefaultSchedulerName ,
schedulerAlgorithmSource : schedulerapi . SchedulerAlgorithmSource {
Provider : defaultAlgorithmSourceProviderName ( ) ,
} ,
hardPodAffinitySymmetricWeight : v1 . DefaultHardPodAffinitySymmetricWeight ,
disablePreemption : false ,
percentageOfNodesToScore : schedulerapi . DefaultPercentageOfNodesToScore ,
bindTimeoutSeconds : BindTimeoutSeconds ,
podInitialBackoffSeconds : int64 ( internalqueue . DefaultPodInitialBackoffDuration . Seconds ( ) ) ,
podMaxBackoffSeconds : int64 ( internalqueue . DefaultPodMaxBackoffDuration . Seconds ( ) ) ,
frameworkConfigProducerRegistry : frameworkplugins . NewDefaultConfigProducerRegistry ( ) ,
// The plugins and pluginConfig options are currently nil because we currently don't have
// "default" plugins. All plugins that we run through the framework currently come from two
// sources: 1) specified in component config, in which case those two options should be
// set using their corresponding With* functions, 2) predicate/priority-mapped plugins, which
// pluginConfigProducerRegistry contains a mapping for and produces their configurations.
// TODO(ahg-g) Once predicates and priorities are migrated to natively run as plugins, the
// below two parameters will be populated accordingly.
frameworkPlugins : nil ,
frameworkPluginConfig : nil ,
2019-01-12 04:58:27 +00:00
}
// New returns a Scheduler
func New ( client clientset . Interface ,
2019-12-12 01:27:03 +00:00
informerFactory informers . SharedInformerFactory ,
2019-01-12 04:58:27 +00:00
podInformer coreinformers . PodInformer ,
2019-09-27 21:51:53 +00:00
recorder events . EventRecorder ,
2019-01-12 04:58:27 +00:00
stopCh <- chan struct { } ,
2019-12-12 01:27:03 +00:00
opts ... Option ) ( * Scheduler , error ) {
stopEverything := stopCh
if stopEverything == nil {
stopEverything = wait . NeverStop
}
2019-01-12 04:58:27 +00:00
options := defaultSchedulerOptions
for _ , opt := range opts {
opt ( & options )
}
2019-12-12 01:27:03 +00:00
schedulerCache := internalcache . New ( 30 * time . Second , stopEverything )
volumeBinder := volumebinder . NewVolumeBinder (
client ,
informerFactory . Core ( ) . V1 ( ) . Nodes ( ) ,
informerFactory . Storage ( ) . V1 ( ) . CSINodes ( ) ,
informerFactory . Core ( ) . V1 ( ) . PersistentVolumeClaims ( ) ,
informerFactory . Core ( ) . V1 ( ) . PersistentVolumes ( ) ,
informerFactory . Storage ( ) . V1 ( ) . StorageClasses ( ) ,
time . Duration ( options . bindTimeoutSeconds ) * time . Second ,
)
registry := options . frameworkDefaultRegistry
if registry == nil {
registry = frameworkplugins . NewDefaultRegistry ( & frameworkplugins . RegistryArgs {
VolumeBinder : volumeBinder ,
} )
}
registry . Merge ( options . frameworkOutOfTreeRegistry )
snapshot := nodeinfosnapshot . NewEmptySnapshot ( )
configurator := & Configurator {
client : client ,
informerFactory : informerFactory ,
podInformer : podInformer ,
volumeBinder : volumeBinder ,
schedulerCache : schedulerCache ,
StopEverything : stopEverything ,
hardPodAffinitySymmetricWeight : options . hardPodAffinitySymmetricWeight ,
disablePreemption : options . disablePreemption ,
percentageOfNodesToScore : options . percentageOfNodesToScore ,
bindTimeoutSeconds : options . bindTimeoutSeconds ,
podInitialBackoffSeconds : options . podInitialBackoffSeconds ,
podMaxBackoffSeconds : options . podMaxBackoffSeconds ,
enableNonPreempting : utilfeature . DefaultFeatureGate . Enabled ( kubefeatures . NonPreemptingPriority ) ,
registry : registry ,
plugins : options . frameworkPlugins ,
pluginConfig : options . frameworkPluginConfig ,
pluginConfigProducerRegistry : options . frameworkConfigProducerRegistry ,
nodeInfoSnapshot : snapshot ,
algorithmFactoryArgs : AlgorithmFactoryArgs {
SharedLister : snapshot ,
InformerFactory : informerFactory ,
VolumeBinder : volumeBinder ,
HardPodAffinitySymmetricWeight : options . hardPodAffinitySymmetricWeight ,
} ,
configProducerArgs : & frameworkplugins . ConfigProducerArgs { } ,
}
var sched * Scheduler
source := options . schedulerAlgorithmSource
2019-01-12 04:58:27 +00:00
switch {
case source . Provider != nil :
// Create the config from a named algorithm provider.
sc , err := configurator . CreateFromProvider ( * source . Provider )
if err != nil {
return nil , fmt . Errorf ( "couldn't create scheduler using provider %q: %v" , * source . Provider , err )
}
2019-12-12 01:27:03 +00:00
sched = sc
2019-01-12 04:58:27 +00:00
case source . Policy != nil :
// Create the config from a user specified policy source.
policy := & schedulerapi . Policy { }
switch {
case source . Policy . File != nil :
if err := initPolicyFromFile ( source . Policy . File . Path , policy ) ; err != nil {
return nil , err
}
case source . Policy . ConfigMap != nil :
if err := initPolicyFromConfigMap ( client , source . Policy . ConfigMap , policy ) ; err != nil {
return nil , err
}
}
sc , err := configurator . CreateFromConfig ( * policy )
if err != nil {
return nil , fmt . Errorf ( "couldn't create scheduler from policy: %v" , err )
}
2019-12-12 01:27:03 +00:00
sched = sc
2019-01-12 04:58:27 +00:00
default :
return nil , fmt . Errorf ( "unsupported algorithm source: %v" , source )
}
2019-12-12 01:27:03 +00:00
metrics . Register ( )
2019-01-12 04:58:27 +00:00
// Additional tweaks to the config produced by the configurator.
2019-12-12 01:27:03 +00:00
sched . Recorder = recorder
sched . DisablePreemption = options . disablePreemption
sched . StopEverything = stopEverything
sched . podConditionUpdater = & podConditionUpdaterImpl { client }
sched . podPreemptor = & podPreemptorImpl { client }
sched . scheduledPodsHasSynced = podInformer . Informer ( ) . HasSynced
AddAllEventHandlers ( sched , options . schedulerName , informerFactory , podInformer )
2019-01-12 04:58:27 +00:00
return sched , nil
}
// initPolicyFromFile initialize policy from file
func initPolicyFromFile ( policyFile string , policy * schedulerapi . Policy ) error {
// Use a policy serialized in a file.
_ , err := os . Stat ( policyFile )
if err != nil {
return fmt . Errorf ( "missing policy config file %s" , policyFile )
}
data , err := ioutil . ReadFile ( policyFile )
if err != nil {
return fmt . Errorf ( "couldn't read policy config: %v" , err )
}
2019-12-12 01:27:03 +00:00
err = runtime . DecodeInto ( scheme . Codecs . UniversalDecoder ( ) , [ ] byte ( data ) , policy )
2019-01-12 04:58:27 +00:00
if err != nil {
return fmt . Errorf ( "invalid policy: %v" , err )
}
return nil
}
// initPolicyFromConfigMap initialize policy from configMap
2019-12-12 01:27:03 +00:00
func initPolicyFromConfigMap ( client clientset . Interface , policyRef * schedulerapi . SchedulerPolicyConfigMapSource , policy * schedulerapi . Policy ) error {
2019-01-12 04:58:27 +00:00
// Use a policy serialized in a config map value.
policyConfigMap , err := client . CoreV1 ( ) . ConfigMaps ( policyRef . Namespace ) . Get ( policyRef . Name , metav1 . GetOptions { } )
if err != nil {
return fmt . Errorf ( "couldn't get policy config map %s/%s: %v" , policyRef . Namespace , policyRef . Name , err )
}
2019-12-12 01:27:03 +00:00
data , found := policyConfigMap . Data [ schedulerapi . SchedulerPolicyConfigMapKey ]
2019-01-12 04:58:27 +00:00
if ! found {
2019-12-12 01:27:03 +00:00
return fmt . Errorf ( "missing policy config map value at key %q" , schedulerapi . SchedulerPolicyConfigMapKey )
2019-01-12 04:58:27 +00:00
}
2019-12-12 01:27:03 +00:00
err = runtime . DecodeInto ( scheme . Codecs . UniversalDecoder ( ) , [ ] byte ( data ) , policy )
2019-01-12 04:58:27 +00:00
if err != nil {
return fmt . Errorf ( "invalid policy: %v" , err )
}
return nil
}
2019-12-12 01:27:03 +00:00
// Run begins watching and scheduling. It waits for cache to be synced, then starts scheduling and blocked until the context is done.
func ( sched * Scheduler ) Run ( ctx context . Context ) {
if ! cache . WaitForCacheSync ( ctx . Done ( ) , sched . scheduledPodsHasSynced ) {
2019-01-12 04:58:27 +00:00
return
}
2019-12-12 01:27:03 +00:00
wait . UntilWithContext ( ctx , sched . scheduleOne , 0 )
2019-01-12 04:58:27 +00:00
}
2019-04-07 17:07:55 +00:00
// recordFailedSchedulingEvent records an event for the pod that indicates the
// pod has failed to schedule.
// NOTE: This function modifies "pod". "pod" should be copied before being passed.
2019-12-12 01:27:03 +00:00
func ( sched * Scheduler ) recordSchedulingFailure ( podInfo * framework . PodInfo , err error , reason string , message string ) {
sched . Error ( podInfo , err )
pod := podInfo . Pod
2019-09-27 21:51:53 +00:00
sched . Recorder . Eventf ( pod , nil , v1 . EventTypeWarning , "FailedScheduling" , "Scheduling" , message )
2019-12-12 01:27:03 +00:00
if err := sched . podConditionUpdater . update ( pod , & v1 . PodCondition {
2019-04-07 17:07:55 +00:00
Type : v1 . PodScheduled ,
Status : v1 . ConditionFalse ,
Reason : reason ,
Message : err . Error ( ) ,
2019-09-27 21:51:53 +00:00
} ) ; err != nil {
klog . Errorf ( "Error updating the condition of the pod %s/%s: %v" , pod . Namespace , pod . Name , err )
}
2019-04-07 17:07:55 +00:00
}
2019-01-12 04:58:27 +00:00
// preempt tries to create room for a pod that has failed to schedule, by preempting lower priority pods if possible.
2019-08-30 18:33:25 +00:00
// If it succeeds, it adds the name of the node where preemption has happened to the pod spec.
2019-01-12 04:58:27 +00:00
// It returns the node name and an error if any.
2019-12-12 01:27:03 +00:00
func ( sched * Scheduler ) preempt ( ctx context . Context , state * framework . CycleState , fwk framework . Framework , preemptor * v1 . Pod , scheduleErr error ) ( string , error ) {
preemptor , err := sched . podPreemptor . getUpdatedPod ( preemptor )
2019-01-12 04:58:27 +00:00
if err != nil {
klog . Errorf ( "Error getting the updated preemptor pod object: %v" , err )
return "" , err
}
2019-12-12 01:27:03 +00:00
node , victims , nominatedPodsToClear , err := sched . Algorithm . Preempt ( ctx , state , preemptor , scheduleErr )
2019-01-12 04:58:27 +00:00
if err != nil {
2019-09-27 21:51:53 +00:00
klog . Errorf ( "Error preempting victims to make room for %v/%v: %v" , preemptor . Namespace , preemptor . Name , err )
2019-01-12 04:58:27 +00:00
return "" , err
}
var nodeName = ""
if node != nil {
nodeName = node . Name
2019-01-22 20:53:35 +00:00
// Update the scheduling queue with the nominated pod information. Without
// this, there would be a race condition between the next scheduling cycle
// and the time the scheduler receives a Pod Update for the nominated pod.
2019-09-27 21:51:53 +00:00
sched . SchedulingQueue . UpdateNominatedPodForNode ( preemptor , nodeName )
2019-01-22 20:53:35 +00:00
// Make a call to update nominated node name of the pod on the API server.
2019-12-12 01:27:03 +00:00
err = sched . podPreemptor . setNominatedNodeName ( preemptor , nodeName )
2019-01-12 04:58:27 +00:00
if err != nil {
2019-08-30 18:33:25 +00:00
klog . Errorf ( "Error in preemption process. Cannot set 'NominatedPod' on pod %v/%v: %v" , preemptor . Namespace , preemptor . Name , err )
2019-09-27 21:51:53 +00:00
sched . SchedulingQueue . DeleteNominatedPodIfExists ( preemptor )
2019-01-12 04:58:27 +00:00
return "" , err
}
2019-01-22 20:53:35 +00:00
2019-01-12 04:58:27 +00:00
for _ , victim := range victims {
2019-12-12 01:27:03 +00:00
if err := sched . podPreemptor . deletePod ( victim ) ; err != nil {
2019-01-12 04:58:27 +00:00
klog . Errorf ( "Error preempting pod %v/%v: %v" , victim . Namespace , victim . Name , err )
return "" , err
}
2019-09-27 21:51:53 +00:00
// If the victim is a WaitingPod, send a reject message to the PermitPlugin
if waitingPod := fwk . GetWaitingPod ( victim . UID ) ; waitingPod != nil {
waitingPod . Reject ( "preempted" )
}
sched . Recorder . Eventf ( victim , preemptor , v1 . EventTypeNormal , "Preempted" , "Preempting" , "Preempted by %v/%v on node %v" , preemptor . Namespace , preemptor . Name , nodeName )
2019-01-12 04:58:27 +00:00
}
2019-12-12 01:27:03 +00:00
metrics . PreemptionVictims . Observe ( float64 ( len ( victims ) ) )
2019-01-12 04:58:27 +00:00
}
// Clearing nominated pods should happen outside of "if node != nil". Node could
// be nil when a pod with nominated node name is eligible to preempt again,
// but preemption logic does not find any node for it. In that case Preempt()
2019-08-30 18:33:25 +00:00
// function of generic_scheduler.go returns the pod itself for removal of
// the 'NominatedPod' field.
2019-01-12 04:58:27 +00:00
for _ , p := range nominatedPodsToClear {
2019-12-12 01:27:03 +00:00
rErr := sched . podPreemptor . removeNominatedNodeName ( p )
2019-01-12 04:58:27 +00:00
if rErr != nil {
2019-08-30 18:33:25 +00:00
klog . Errorf ( "Cannot remove 'NominatedPod' field of pod: %v" , rErr )
2019-01-12 04:58:27 +00:00
// We do not return as this error is not critical.
}
}
return nodeName , err
}
// bindVolumes will make the API update with the assumed bindings and wait until
// the PV controller has completely finished the binding operation.
//
// If binding errors, times out or gets undone, then an error will be returned to
// retry scheduling.
func ( sched * Scheduler ) bindVolumes ( assumed * v1 . Pod ) error {
klog . V ( 5 ) . Infof ( "Trying to bind volumes for pod \"%v/%v\"" , assumed . Namespace , assumed . Name )
2019-09-27 21:51:53 +00:00
err := sched . VolumeBinder . Binder . BindPodVolumes ( assumed )
2019-01-12 04:58:27 +00:00
if err != nil {
klog . V ( 1 ) . Infof ( "Failed to bind volumes for pod \"%v/%v\": %v" , assumed . Namespace , assumed . Name , err )
// Unassume the Pod and retry scheduling
2019-09-27 21:51:53 +00:00
if forgetErr := sched . SchedulerCache . ForgetPod ( assumed ) ; forgetErr != nil {
2019-01-12 04:58:27 +00:00
klog . Errorf ( "scheduler cache ForgetPod failed: %v" , forgetErr )
}
return err
}
klog . V ( 5 ) . Infof ( "Success binding volumes for pod \"%v/%v\"" , assumed . Namespace , assumed . Name )
return nil
}
// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
// assume modifies `assumed`.
func ( sched * Scheduler ) assume ( assumed * v1 . Pod , host string ) error {
// Optimistically assume that the binding will succeed and send it to apiserver
// in the background.
// If the binding fails, scheduler will release resources allocated to assumed pod
// immediately.
assumed . Spec . NodeName = host
2019-04-07 17:07:55 +00:00
2019-09-27 21:51:53 +00:00
if err := sched . SchedulerCache . AssumePod ( assumed ) ; err != nil {
2019-01-12 04:58:27 +00:00
klog . Errorf ( "scheduler cache AssumePod failed: %v" , err )
return err
}
// if "assumed" is a nominated pod, we should remove it from internal cache
2019-09-27 21:51:53 +00:00
if sched . SchedulingQueue != nil {
sched . SchedulingQueue . DeleteNominatedPodIfExists ( assumed )
2019-01-12 04:58:27 +00:00
}
return nil
}
// bind binds a pod to a given node defined in a binding object. We expect this to run asynchronously, so we
// handle binding metrics internally.
2019-12-12 01:27:03 +00:00
func ( sched * Scheduler ) bind ( ctx context . Context , assumed * v1 . Pod , targetNode string , state * framework . CycleState ) error {
2019-01-12 04:58:27 +00:00
bindingStart := time . Now ( )
2019-12-12 01:27:03 +00:00
bindStatus := sched . Framework . RunBindPlugins ( ctx , state , assumed , targetNode )
2019-09-27 21:51:53 +00:00
var err error
if ! bindStatus . IsSuccess ( ) {
if bindStatus . Code ( ) == framework . Skip {
// All bind plugins chose to skip binding of this pod, call original binding function.
// If binding succeeds then PodScheduled condition will be updated in apiserver so that
// it's atomic with setting host.
err = sched . GetBinder ( assumed ) . Bind ( & v1 . Binding {
ObjectMeta : metav1 . ObjectMeta { Namespace : assumed . Namespace , Name : assumed . Name , UID : assumed . UID } ,
Target : v1 . ObjectReference {
Kind : "Node" ,
Name : targetNode ,
} ,
} )
} else {
err = fmt . Errorf ( "Bind failure, code: %d: %v" , bindStatus . Code ( ) , bindStatus . Message ( ) )
}
}
if finErr := sched . SchedulerCache . FinishBinding ( assumed ) ; finErr != nil {
2019-01-12 04:58:27 +00:00
klog . Errorf ( "scheduler cache FinishBinding failed: %v" , finErr )
}
if err != nil {
klog . V ( 1 ) . Infof ( "Failed to bind pod: %v/%v" , assumed . Namespace , assumed . Name )
2019-09-27 21:51:53 +00:00
if err := sched . SchedulerCache . ForgetPod ( assumed ) ; err != nil {
2019-01-12 04:58:27 +00:00
klog . Errorf ( "scheduler cache ForgetPod failed: %v" , err )
}
return err
}
2019-04-07 17:07:55 +00:00
metrics . BindingLatency . Observe ( metrics . SinceInSeconds ( bindingStart ) )
metrics . DeprecatedBindingLatency . Observe ( metrics . SinceInMicroseconds ( bindingStart ) )
2019-01-12 04:58:27 +00:00
metrics . SchedulingLatency . WithLabelValues ( metrics . Binding ) . Observe ( metrics . SinceInSeconds ( bindingStart ) )
2019-04-07 17:07:55 +00:00
metrics . DeprecatedSchedulingLatency . WithLabelValues ( metrics . Binding ) . Observe ( metrics . SinceInSeconds ( bindingStart ) )
2019-09-27 21:51:53 +00:00
sched . Recorder . Eventf ( assumed , nil , v1 . EventTypeNormal , "Scheduled" , "Binding" , "Successfully assigned %v/%v to %v" , assumed . Namespace , assumed . Name , targetNode )
2019-01-12 04:58:27 +00:00
return nil
}
// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
2019-12-12 01:27:03 +00:00
func ( sched * Scheduler ) scheduleOne ( ctx context . Context ) {
2019-09-27 21:51:53 +00:00
fwk := sched . Framework
2019-04-07 17:07:55 +00:00
2019-12-12 01:27:03 +00:00
podInfo := sched . NextPod ( )
2019-01-12 04:58:27 +00:00
// pod could be nil when schedulerQueue is closed
2019-12-12 01:27:03 +00:00
if podInfo == nil || podInfo . Pod == nil {
2019-01-12 04:58:27 +00:00
return
}
2019-12-12 01:27:03 +00:00
pod := podInfo . Pod
2019-01-12 04:58:27 +00:00
if pod . DeletionTimestamp != nil {
2019-09-27 21:51:53 +00:00
sched . Recorder . Eventf ( pod , nil , v1 . EventTypeWarning , "FailedScheduling" , "Scheduling" , "skip schedule deleting pod: %v/%v" , pod . Namespace , pod . Name )
2019-01-12 04:58:27 +00:00
klog . V ( 3 ) . Infof ( "Skip schedule deleting pod: %v/%v" , pod . Namespace , pod . Name )
return
}
klog . V ( 3 ) . Infof ( "Attempting to schedule pod: %v/%v" , pod . Namespace , pod . Name )
// Synchronously attempt to find a fit for the pod.
start := time . Now ( )
2019-12-12 01:27:03 +00:00
state := framework . NewCycleState ( )
state . SetRecordFrameworkMetrics ( rand . Intn ( 100 ) < frameworkMetricsSamplePercent )
schedulingCycleCtx , cancel := context . WithCancel ( ctx )
defer cancel ( )
scheduleResult , err := sched . Algorithm . Schedule ( schedulingCycleCtx , state , pod )
2019-01-12 04:58:27 +00:00
if err != nil {
2019-12-12 01:27:03 +00:00
sched . recordSchedulingFailure ( podInfo . DeepCopy ( ) , err , v1 . PodReasonUnschedulable , err . Error ( ) )
// Schedule() may have failed because the pod would not fit on any host, so we try to
2019-01-12 04:58:27 +00:00
// preempt, with the expectation that the next time the pod is tried for scheduling it
// will fit due to the preemption. It is also possible that a different pod will schedule
// into the resources that were preempted, but this is harmless.
if fitError , ok := err . ( * core . FitError ) ; ok {
2019-09-27 21:51:53 +00:00
if sched . DisablePreemption {
2019-04-07 17:07:55 +00:00
klog . V ( 3 ) . Infof ( "Pod priority feature is not enabled or preemption is disabled by scheduler configuration." +
" No preemption is performed." )
} else {
preemptionStartTime := time . Now ( )
2019-12-12 01:27:03 +00:00
sched . preempt ( schedulingCycleCtx , state , fwk , pod , fitError )
2019-04-07 17:07:55 +00:00
metrics . PreemptionAttempts . Inc ( )
2019-12-12 01:27:03 +00:00
metrics . SchedulingAlgorithmPreemptionEvaluationDuration . Observe ( metrics . SinceInSeconds ( preemptionStartTime ) )
metrics . DeprecatedSchedulingAlgorithmPreemptionEvaluationDuration . Observe ( metrics . SinceInMicroseconds ( preemptionStartTime ) )
2019-04-07 17:07:55 +00:00
metrics . SchedulingLatency . WithLabelValues ( metrics . PreemptionEvaluation ) . Observe ( metrics . SinceInSeconds ( preemptionStartTime ) )
metrics . DeprecatedSchedulingLatency . WithLabelValues ( metrics . PreemptionEvaluation ) . Observe ( metrics . SinceInSeconds ( preemptionStartTime ) )
}
2019-01-12 04:58:27 +00:00
// Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully)
metrics . PodScheduleFailures . Inc ( )
} else {
klog . Errorf ( "error selecting node for pod: %v" , err )
metrics . PodScheduleErrors . Inc ( )
}
return
}
2019-04-07 17:07:55 +00:00
metrics . SchedulingAlgorithmLatency . Observe ( metrics . SinceInSeconds ( start ) )
metrics . DeprecatedSchedulingAlgorithmLatency . Observe ( metrics . SinceInMicroseconds ( start ) )
2019-01-12 04:58:27 +00:00
// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
// This allows us to keep scheduling without waiting on binding to occur.
2019-12-12 01:27:03 +00:00
assumedPodInfo := podInfo . DeepCopy ( )
assumedPod := assumedPodInfo . Pod
2019-01-12 04:58:27 +00:00
// Assume volumes first before assuming the pod.
//
// If all volumes are completely bound, then allBound is true and binding will be skipped.
//
// Otherwise, binding of volumes is started after the pod is assumed, but before pod binding.
//
// This function modifies 'assumedPod' if volume binding is required.
2019-12-12 01:27:03 +00:00
allBound , err := sched . VolumeBinder . Binder . AssumePodVolumes ( assumedPod , scheduleResult . SuggestedHost )
2019-01-12 04:58:27 +00:00
if err != nil {
2019-12-12 01:27:03 +00:00
sched . recordSchedulingFailure ( assumedPodInfo , err , SchedulerError ,
fmt . Sprintf ( "AssumePodVolumes failed: %v" , err ) )
2019-01-12 04:58:27 +00:00
metrics . PodScheduleErrors . Inc ( )
return
}
2019-04-07 17:07:55 +00:00
// Run "reserve" plugins.
2019-12-12 01:27:03 +00:00
if sts := fwk . RunReservePlugins ( schedulingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost ) ; ! sts . IsSuccess ( ) {
sched . recordSchedulingFailure ( assumedPodInfo , sts . AsError ( ) , SchedulerError , sts . Message ( ) )
2019-08-30 18:33:25 +00:00
metrics . PodScheduleErrors . Inc ( )
return
2019-04-07 17:07:55 +00:00
}
2019-08-30 18:33:25 +00:00
2019-04-07 17:07:55 +00:00
// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost
err = sched . assume ( assumedPod , scheduleResult . SuggestedHost )
2019-01-12 04:58:27 +00:00
if err != nil {
2019-12-12 01:27:03 +00:00
// This is most probably result of a BUG in retrying logic.
// We report an error here so that pod scheduling can be retried.
// This relies on the fact that Error will check if the pod has been bound
// to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop).
sched . recordSchedulingFailure ( assumedPodInfo , err , SchedulerError , fmt . Sprintf ( "AssumePod failed: %v" , err ) )
2019-01-12 04:58:27 +00:00
metrics . PodScheduleErrors . Inc ( )
2019-08-30 18:33:25 +00:00
// trigger un-reserve plugins to clean up state associated with the reserved Pod
2019-12-12 01:27:03 +00:00
fwk . RunUnreservePlugins ( schedulingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost )
2019-01-12 04:58:27 +00:00
return
}
// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
go func ( ) {
2019-12-12 01:27:03 +00:00
bindingCycleCtx , cancel := context . WithCancel ( ctx )
defer cancel ( )
metrics . SchedulerGoroutines . WithLabelValues ( "binding" ) . Inc ( )
defer metrics . SchedulerGoroutines . WithLabelValues ( "binding" ) . Dec ( )
2019-01-12 04:58:27 +00:00
2019-08-30 18:33:25 +00:00
// Run "permit" plugins.
2019-12-12 01:27:03 +00:00
permitStatus := fwk . RunPermitPlugins ( bindingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost )
2019-08-30 18:33:25 +00:00
if ! permitStatus . IsSuccess ( ) {
var reason string
2019-09-27 21:51:53 +00:00
if permitStatus . IsUnschedulable ( ) {
metrics . PodScheduleFailures . Inc ( )
2019-08-30 18:33:25 +00:00
reason = v1 . PodReasonUnschedulable
} else {
metrics . PodScheduleErrors . Inc ( )
reason = SchedulerError
}
if forgetErr := sched . Cache ( ) . ForgetPod ( assumedPod ) ; forgetErr != nil {
klog . Errorf ( "scheduler cache ForgetPod failed: %v" , forgetErr )
}
// trigger un-reserve plugins to clean up state associated with the reserved Pod
2019-12-12 01:27:03 +00:00
fwk . RunUnreservePlugins ( bindingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost )
sched . recordSchedulingFailure ( assumedPodInfo , permitStatus . AsError ( ) , reason , permitStatus . Message ( ) )
2019-08-30 18:33:25 +00:00
return
}
2019-12-12 01:27:03 +00:00
// Bind volumes first before Pod
if ! allBound {
err := sched . bindVolumes ( assumedPod )
if err != nil {
sched . recordSchedulingFailure ( assumedPodInfo , err , "VolumeBindingFailed" , err . Error ( ) )
metrics . PodScheduleErrors . Inc ( )
// trigger un-reserve plugins to clean up state associated with the reserved Pod
fwk . RunUnreservePlugins ( bindingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost )
return
}
}
2019-04-07 17:07:55 +00:00
// Run "prebind" plugins.
2019-12-12 01:27:03 +00:00
preBindStatus := fwk . RunPreBindPlugins ( bindingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost )
2019-09-27 21:51:53 +00:00
if ! preBindStatus . IsSuccess ( ) {
2019-08-30 18:33:25 +00:00
var reason string
2019-12-12 01:27:03 +00:00
metrics . PodScheduleErrors . Inc ( )
reason = SchedulerError
2019-08-30 18:33:25 +00:00
if forgetErr := sched . Cache ( ) . ForgetPod ( assumedPod ) ; forgetErr != nil {
klog . Errorf ( "scheduler cache ForgetPod failed: %v" , forgetErr )
2019-04-07 17:07:55 +00:00
}
2019-08-30 18:33:25 +00:00
// trigger un-reserve plugins to clean up state associated with the reserved Pod
2019-12-12 01:27:03 +00:00
fwk . RunUnreservePlugins ( bindingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost )
sched . recordSchedulingFailure ( assumedPodInfo , preBindStatus . AsError ( ) , reason , preBindStatus . Message ( ) )
2019-08-30 18:33:25 +00:00
return
2019-04-07 17:07:55 +00:00
}
2019-12-12 01:27:03 +00:00
err := sched . bind ( bindingCycleCtx , assumedPod , scheduleResult . SuggestedHost , state )
2019-04-07 17:07:55 +00:00
metrics . E2eSchedulingLatency . Observe ( metrics . SinceInSeconds ( start ) )
metrics . DeprecatedE2eSchedulingLatency . Observe ( metrics . SinceInMicroseconds ( start ) )
2019-01-12 04:58:27 +00:00
if err != nil {
metrics . PodScheduleErrors . Inc ( )
2019-08-30 18:33:25 +00:00
// trigger un-reserve plugins to clean up state associated with the reserved Pod
2019-12-12 01:27:03 +00:00
fwk . RunUnreservePlugins ( bindingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost )
sched . recordSchedulingFailure ( assumedPodInfo , err , SchedulerError , fmt . Sprintf ( "Binding rejected: %v" , err ) )
2019-01-12 04:58:27 +00:00
} else {
2019-09-27 21:51:53 +00:00
// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
if klog . V ( 2 ) {
2019-12-12 01:27:03 +00:00
klog . Infof ( "pod %v/%v is bound successfully on node %q, %d nodes evaluated, %d nodes were found feasible." , assumedPod . Namespace , assumedPod . Name , scheduleResult . SuggestedHost , scheduleResult . EvaluatedNodes , scheduleResult . FeasibleNodes )
2019-09-27 21:51:53 +00:00
}
2019-01-12 04:58:27 +00:00
metrics . PodScheduleSuccesses . Inc ( )
2019-12-12 01:27:03 +00:00
metrics . PodSchedulingAttempts . Observe ( float64 ( podInfo . Attempts ) )
metrics . PodSchedulingDuration . Observe ( metrics . SinceInSeconds ( podInfo . InitialAttemptTimestamp ) )
2019-08-30 18:33:25 +00:00
// Run "postbind" plugins.
2019-12-12 01:27:03 +00:00
fwk . RunPostBindPlugins ( bindingCycleCtx , state , assumedPod , scheduleResult . SuggestedHost )
2019-01-12 04:58:27 +00:00
}
} ( )
}
2019-09-27 21:51:53 +00:00
2019-12-12 01:27:03 +00:00
// podConditionUpdaterImpl writes pod condition changes to the API server
// through its Client.
type podConditionUpdaterImpl struct {
// Client is the clientset used to issue pod status updates.
Client clientset . Interface
}
// update logs the requested condition at verbosity 3, applies it to the pod's
// status with podutil.UpdatePodCondition and, only when that call returns
// true, writes the pod status back through the API client.
//
// It returns nil when no write is issued; otherwise it returns the error (if
// any) produced by the UpdateStatus call.
func (p *podConditionUpdaterImpl) update(pod *v1.Pod, condition *v1.PodCondition) error {
	klog.V(3).Infof("Updating pod condition for %s/%s to (%s==%s, Reason=%s)", pod.Namespace, pod.Name, condition.Type, condition.Status, condition.Reason)

	// Nothing to persist when UpdatePodCondition reports false.
	if needsWrite := podutil.UpdatePodCondition(&pod.Status, condition); !needsWrite {
		return nil
	}

	_, updateErr := p.Client.CoreV1().Pods(pod.Namespace).UpdateStatus(pod)
	return updateErr
}
// podPreemptorImpl performs the pod operations needed by preemption (get,
// delete, and nominated-node status updates) through its Client.
type podPreemptorImpl struct {
// Client is the clientset used to get and delete pods and to update their status.
Client clientset . Interface
}
// getUpdatedPod looks up the pod identified by pod's namespace and name via
// the API client and returns the result of that Get call unchanged.
func (p *podPreemptorImpl) getUpdatedPod(pod *v1.Pod) (*v1.Pod, error) {
	podsInNamespace := p.Client.CoreV1().Pods(pod.Namespace)
	return podsInNamespace.Get(pod.Name, metav1.GetOptions{})
}
// deletePod asks the API client to delete the pod identified by pod's
// namespace and name, using empty DeleteOptions, and returns the resulting
// error (if any).
func (p *podPreemptorImpl) deletePod(pod *v1.Pod) error {
	podsInNamespace := p.Client.CoreV1().Pods(pod.Namespace)
	deleteOpts := &metav1.DeleteOptions{}
	return podsInNamespace.Delete(pod.Name, deleteOpts)
}
// setNominatedNodeName sets Status.NominatedNodeName to nominatedNodeName on a
// deep copy of pod and sends that copy to the API client via UpdateStatus.
// The pod passed in is not modified. It returns the error (if any) from the
// UpdateStatus call.
func (p *podPreemptorImpl) setNominatedNodeName(pod *v1.Pod, nominatedNodeName string) error {
	// Work on a copy so the caller's object is left untouched.
	updated := pod.DeepCopy()
	updated.Status.NominatedNodeName = nominatedNodeName

	if _, updateErr := p.Client.CoreV1().Pods(pod.Namespace).UpdateStatus(updated); updateErr != nil {
		return updateErr
	}
	return nil
}
func ( p * podPreemptorImpl ) removeNominatedNodeName ( pod * v1 . Pod ) error {
if len ( pod . Status . NominatedNodeName ) == 0 {
return nil
2019-09-27 21:51:53 +00:00
}
2019-12-12 01:27:03 +00:00
return p . setNominatedNodeName ( pod , "" )
2019-09-27 21:51:53 +00:00
}
2019-12-12 01:27:03 +00:00
func defaultAlgorithmSourceProviderName ( ) * string {
provider := schedulerapi . SchedulerDefaultProviderName
return & provider
2019-09-27 21:51:53 +00:00
}