/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package core

import (
	"fmt"
	"math"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/golang/glog"

	"k8s.io/api/core/v1"
	policy "k8s.io/api/policy/v1beta1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/errors"
	utiltrace "k8s.io/apiserver/pkg/util/trace"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/kubernetes/pkg/scheduler/algorithm"
	"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
	schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
	schedulercache "k8s.io/kubernetes/pkg/scheduler/cache"
	"k8s.io/kubernetes/pkg/scheduler/core/equivalence"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
	"k8s.io/kubernetes/pkg/scheduler/volumebinder"
)

// FailedPredicateMap declares a map[string][]algorithm.PredicateFailureReason type.
type FailedPredicateMap map[string][]algorithm.PredicateFailureReason

// FitError describes a fit error of a pod.
type FitError struct {
	Pod              *v1.Pod
	NumAllNodes      int
	FailedPredicates FailedPredicateMap
}

// ErrNoNodesAvailable is used to describe the error that no nodes are available to schedule pods.
var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")

const (
	// NoNodeAvailableMsg is used to format the message when no nodes are available.
	NoNodeAvailableMsg = "0/%v nodes are available"
)

// Error returns detailed information of why the pod failed to fit on each node
func (f *FitError) Error() string {
	reasons := make(map[string]int)
	for _, predicates := range f.FailedPredicates {
		for _, pred := range predicates {
			reasons[pred.GetReason()]++
		}
	}

	sortReasonsHistogram := func() []string {
		reasonStrings := []string{}
		for k, v := range reasons {
			reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
		}
		sort.Strings(reasonStrings)
		return reasonStrings
	}
	reasonMsg := fmt.Sprintf(NoNodeAvailableMsg+": %v.", f.NumAllNodes, strings.Join(sortReasonsHistogram(), ", "))
	return reasonMsg
}
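
// The resulting message aggregates failure reasons into a sorted histogram. As
// an illustration (node counts and reason strings below are hypothetical, not
// taken from this file), a FitError over three nodes might render as:
//
//	0/3 nodes are available: 1 Insufficient cpu, 2 node(s) didn't match node selector.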

type genericScheduler struct {
	cache                    schedulercache.Cache
	equivalenceCache         *equivalence.Cache
	schedulingQueue          SchedulingQueue
	predicates               map[string]algorithm.FitPredicate
	priorityMetaProducer     algorithm.PriorityMetadataProducer
	predicateMetaProducer    algorithm.PredicateMetadataProducer
	prioritizers             []algorithm.PriorityConfig
	extenders                []algorithm.SchedulerExtender
	lastNodeIndex            uint64
	alwaysCheckAllPredicates bool
	cachedNodeInfoMap        map[string]*schedulercache.NodeInfo
	volumeBinder             *volumebinder.VolumeBinder
	pvcLister                corelisters.PersistentVolumeClaimLister
	disablePreemption        bool
}

// Schedule tries to schedule the given pod to one of the nodes in the node list.
// If it succeeds, it will return the name of the node.
// If it fails, it will return a FitError error with reasons.
func (g *genericScheduler) Schedule(pod *v1.Pod, nodeLister algorithm.NodeLister) (string, error) {
	trace := utiltrace.New(fmt.Sprintf("Scheduling %s/%s", pod.Namespace, pod.Name))
	defer trace.LogIfLong(100 * time.Millisecond)

	if err := podPassesBasicChecks(pod, g.pvcLister); err != nil {
		return "", err
	}

	nodes, err := nodeLister.List()
	if err != nil {
		return "", err
	}
	if len(nodes) == 0 {
		return "", ErrNoNodesAvailable
	}

	// Used for all fit and priority funcs.
	err = g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap)
	if err != nil {
		return "", err
	}

	trace.Step("Computing predicates")
	startPredicateEvalTime := time.Now()
	filteredNodes, failedPredicateMap, err := g.findNodesThatFit(pod, nodes)
	if err != nil {
		return "", err
	}

	if len(filteredNodes) == 0 {
		return "", &FitError{
			Pod:              pod,
			NumAllNodes:      len(nodes),
			FailedPredicates: failedPredicateMap,
		}
	}
	metrics.SchedulingAlgorithmPredicateEvaluationDuration.Observe(metrics.SinceInMicroseconds(startPredicateEvalTime))
	metrics.SchedulingLatency.WithLabelValues(metrics.PredicateEvaluation).Observe(metrics.SinceInSeconds(startPredicateEvalTime))

	trace.Step("Prioritizing")
	startPriorityEvalTime := time.Now()
	// When only one node remains after the predicates, just use it.
	if len(filteredNodes) == 1 {
		metrics.SchedulingAlgorithmPriorityEvaluationDuration.Observe(metrics.SinceInMicroseconds(startPriorityEvalTime))
		return filteredNodes[0].Name, nil
	}

	metaPrioritiesInterface := g.priorityMetaProducer(pod, g.cachedNodeInfoMap)
	priorityList, err := PrioritizeNodes(pod, g.cachedNodeInfoMap, metaPrioritiesInterface, g.prioritizers, filteredNodes, g.extenders)
	if err != nil {
		return "", err
	}
	metrics.SchedulingAlgorithmPriorityEvaluationDuration.Observe(metrics.SinceInMicroseconds(startPriorityEvalTime))
	metrics.SchedulingLatency.WithLabelValues(metrics.PriorityEvaluation).Observe(metrics.SinceInSeconds(startPriorityEvalTime))

	trace.Step("Selecting host")
	return g.selectHost(priorityList)
}

// Prioritizers returns a slice containing all the scheduler's priority
// functions and their config. It is exposed for testing only.
func (g *genericScheduler) Prioritizers() []algorithm.PriorityConfig {
	return g.prioritizers
}

// Predicates returns a map containing all the scheduler's predicate
// functions. It is exposed for testing only.
func (g *genericScheduler) Predicates() map[string]algorithm.FitPredicate {
	return g.predicates
}

// findMaxScores returns the indexes of nodes in the "priorityList" that have the highest "Score".
func findMaxScores(priorityList schedulerapi.HostPriorityList) []int {
	maxScoreIndexes := make([]int, 0, len(priorityList)/2)
	maxScore := priorityList[0].Score
	for i, hp := range priorityList {
		if hp.Score > maxScore {
			maxScore = hp.Score
			maxScoreIndexes = maxScoreIndexes[:0]
			maxScoreIndexes = append(maxScoreIndexes, i)
		} else if hp.Score == maxScore {
			maxScoreIndexes = append(maxScoreIndexes, i)
		}
	}
	return maxScoreIndexes
}

// selectHost takes a prioritized list of nodes and then picks one
// in a round-robin manner from the nodes that had the highest score.
func (g *genericScheduler) selectHost(priorityList schedulerapi.HostPriorityList) (string, error) {
	if len(priorityList) == 0 {
		return "", fmt.Errorf("empty priorityList")
	}

	maxScores := findMaxScores(priorityList)
	ix := int(g.lastNodeIndex % uint64(len(maxScores)))
	g.lastNodeIndex++

	return priorityList[maxScores[ix]].Host, nil
}
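
// As an illustration of the round-robin tie-break (host names and scores below
// are hypothetical, not taken from this file): given a priorityList of
// {node-a: 8, node-b: 10, node-c: 10}, findMaxScores returns the indexes of
// node-b and node-c. Successive calls to selectHost then alternate between
// those two as lastNodeIndex increments, spreading placements across the
// equally scored nodes.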

// Preempt finds nodes with pods that can be preempted to make room for "pod" to
// schedule. It chooses one of the nodes and preempts the pods on the node and
// returns 1) the node, 2) the list of preempted pods if such a node is found,
// 3) A list of pods whose nominated node name should be cleared, and 4) any
// possible error.
func (g *genericScheduler) Preempt(pod *v1.Pod, nodeLister algorithm.NodeLister, scheduleErr error) (*v1.Node, []*v1.Pod, []*v1.Pod, error) {
	// Scheduler may return various types of errors. Consider preemption only if
	// the error is of type FitError.
	fitError, ok := scheduleErr.(*FitError)
	if !ok || fitError == nil {
		return nil, nil, nil, nil
	}
	err := g.cache.UpdateNodeNameToInfoMap(g.cachedNodeInfoMap)
	if err != nil {
		return nil, nil, nil, err
	}
	if !podEligibleToPreemptOthers(pod, g.cachedNodeInfoMap) {
		glog.V(5).Infof("Pod %v is not eligible for more preemption.", pod.Name)
		return nil, nil, nil, nil
	}
	allNodes, err := nodeLister.List()
	if err != nil {
		return nil, nil, nil, err
	}
	if len(allNodes) == 0 {
		return nil, nil, nil, ErrNoNodesAvailable
	}
	potentialNodes := nodesWherePreemptionMightHelp(allNodes, fitError.FailedPredicates)
	if len(potentialNodes) == 0 {
		glog.V(3).Infof("Preemption will not help schedule pod %v on any node.", pod.Name)
		// In this case, we should clean-up any existing nominated node name of the pod.
		return nil, nil, []*v1.Pod{pod}, nil
	}
	pdbs, err := g.cache.ListPDBs(labels.Everything())
	if err != nil {
		return nil, nil, nil, err
	}
	nodeToVictims, err := selectNodesForPreemption(pod, g.cachedNodeInfoMap, potentialNodes, g.predicates,
		g.predicateMetaProducer, g.schedulingQueue, pdbs)
	if err != nil {
		return nil, nil, nil, err
	}

	// We will only check nodeToVictims with extenders that support preemption.
	// Extenders which do not support preemption may later prevent preemptor from being scheduled on the nominated
	// node. In that case, scheduler will find a different host for the preemptor in subsequent scheduling cycles.
	nodeToVictims, err = g.processPreemptionWithExtenders(pod, nodeToVictims)
	if err != nil {
		return nil, nil, nil, err
	}

	candidateNode := pickOneNodeForPreemption(nodeToVictims)
	if candidateNode == nil {
		return nil, nil, nil, err
	}

	// Lower priority pods nominated to run on this node, may no longer fit on
	// this node. So, we should remove their nomination. Removing their
	// nomination updates these pods and moves them to the active queue. It
	// lets scheduler find another place for them.
	nominatedPods := g.getLowerPriorityNominatedPods(pod, candidateNode.Name)
	if nodeInfo, ok := g.cachedNodeInfoMap[candidateNode.Name]; ok {
		return nodeInfo.Node(), nodeToVictims[candidateNode].Pods, nominatedPods, err
	}

	return nil, nil, nil, fmt.Errorf(
		"preemption failed: the target node %s has been deleted from scheduler cache",
		candidateNode.Name)
}

// processPreemptionWithExtenders processes preemption with extenders
func (g *genericScheduler) processPreemptionWithExtenders(
	pod *v1.Pod,
	nodeToVictims map[*v1.Node]*schedulerapi.Victims,
) (map[*v1.Node]*schedulerapi.Victims, error) {
	if len(nodeToVictims) > 0 {
		for _, extender := range g.extenders {
			if extender.SupportsPreemption() {
				newNodeToVictims, err := extender.ProcessPreemption(
					pod,
					nodeToVictims,
					g.cachedNodeInfoMap,
				)
				if err != nil {
					if extender.IsIgnorable() {
						glog.Warningf("Skipping extender %v as it returned error %v and has ignorable flag set",
							extender, err)
						continue
					}
					return nil, err
				}

				// Replace nodeToVictims with the new result after preemption, so that
				// the rest of the extenders can continue to use it as a parameter.
				nodeToVictims = newNodeToVictims

				// If node list becomes empty, no preemption can happen regardless of other extenders.
				if len(nodeToVictims) == 0 {
					break
				}
			}
		}
	}

	return nodeToVictims, nil
}

// getLowerPriorityNominatedPods returns pods whose priority is smaller than the
// priority of the given "pod" and are nominated to run on the given node.
// Note: We could possibly check if the nominated lower priority pods still fit
// and return those that no longer fit, but that would require lots of
// manipulation of NodeInfo and PredicateMeta per nominated pod. It may not be
// worth the complexity, especially because we generally expect to have a very
// small number of nominated pods per node.
func (g *genericScheduler) getLowerPriorityNominatedPods(pod *v1.Pod, nodeName string) []*v1.Pod {
	pods := g.schedulingQueue.WaitingPodsForNode(nodeName)

	if len(pods) == 0 {
		return nil
	}

	var lowerPriorityPods []*v1.Pod
	podPriority := util.GetPodPriority(pod)
	for _, p := range pods {
		if util.GetPodPriority(p) < podPriority {
			lowerPriorityPods = append(lowerPriorityPods, p)
		}
	}
	return lowerPriorityPods
}

// findNodesThatFit filters the nodes to find the ones that fit based on the given predicate functions.
// Each node is passed through the predicate functions to determine if it is a fit.
func (g *genericScheduler) findNodesThatFit(pod *v1.Pod, nodes []*v1.Node) ([]*v1.Node, FailedPredicateMap, error) {
	var filtered []*v1.Node
	failedPredicateMap := FailedPredicateMap{}

	if len(g.predicates) == 0 {
		filtered = nodes
	} else {
		// Create filtered list with enough space to avoid growing it
		// and allow assigning.
		filtered = make([]*v1.Node, len(nodes))
		errs := errors.MessageCountMap{}
		var predicateResultLock sync.Mutex
		var filteredLen int32

		// We can use the same metadata producer for all nodes.
		meta := g.predicateMetaProducer(pod, g.cachedNodeInfoMap)

		var equivClass *equivalence.Class
		if g.equivalenceCache != nil {
			// equivalence.NewClass returns nil immediately if no equivalence pod is found.
			equivClass = equivalence.NewClass(pod)
		}

		checkNode := func(i int) {
			nodeName := nodes[i].Name
			fits, failedPredicates, err := podFitsOnNode(
				pod,
				meta,
				g.cachedNodeInfoMap[nodeName],
				g.predicates,
				g.cache,
				g.equivalenceCache,
				g.schedulingQueue,
				g.alwaysCheckAllPredicates,
				equivClass,
			)
			if err != nil {
				predicateResultLock.Lock()
				errs[err.Error()]++
				predicateResultLock.Unlock()
				return
			}
			if fits {
				filtered[atomic.AddInt32(&filteredLen, 1)-1] = nodes[i]
			} else {
				predicateResultLock.Lock()
				failedPredicateMap[nodeName] = failedPredicates
				predicateResultLock.Unlock()
			}
		}
		workqueue.Parallelize(16, len(nodes), checkNode)
		filtered = filtered[:filteredLen]
		if len(errs) > 0 {
			return []*v1.Node{}, FailedPredicateMap{}, errors.CreateAggregateFromMessageCountMap(errs)
		}
	}

	if len(filtered) > 0 && len(g.extenders) != 0 {
		for _, extender := range g.extenders {
			if !extender.IsInterested(pod) {
				continue
			}
			filteredList, failedMap, err := extender.Filter(pod, filtered, g.cachedNodeInfoMap)
			if err != nil {
				if extender.IsIgnorable() {
					glog.Warningf("Skipping extender %v as it returned error %v and has ignorable flag set",
						extender, err)
					continue
				} else {
					return []*v1.Node{}, FailedPredicateMap{}, err
				}
			}

			for failedNodeName, failedMsg := range failedMap {
				if _, found := failedPredicateMap[failedNodeName]; !found {
					failedPredicateMap[failedNodeName] = []algorithm.PredicateFailureReason{}
				}
				failedPredicateMap[failedNodeName] = append(failedPredicateMap[failedNodeName], predicates.NewFailureReason(failedMsg))
			}
			filtered = filteredList
			if len(filtered) == 0 {
				break
			}
		}
	}
	return filtered, failedPredicateMap, nil
}

// addNominatedPods adds pods with equal or greater priority which are nominated
// to run on the node given in nodeInfo to meta and nodeInfo. It returns 1) whether
// any pod was found, 2) augmented meta data, 3) augmented nodeInfo.
func addNominatedPods(podPriority int32, meta algorithm.PredicateMetadata,
	nodeInfo *schedulercache.NodeInfo, queue SchedulingQueue) (bool, algorithm.PredicateMetadata,
	*schedulercache.NodeInfo) {
	if queue == nil || nodeInfo == nil || nodeInfo.Node() == nil {
		// This may happen only in tests.
		return false, meta, nodeInfo
	}
	nominatedPods := queue.WaitingPodsForNode(nodeInfo.Node().Name)
	if nominatedPods == nil || len(nominatedPods) == 0 {
		return false, meta, nodeInfo
	}
	var metaOut algorithm.PredicateMetadata
	if meta != nil {
		metaOut = meta.ShallowCopy()
	}
	nodeInfoOut := nodeInfo.Clone()
	for _, p := range nominatedPods {
		if util.GetPodPriority(p) >= podPriority {
			nodeInfoOut.AddPod(p)
			if metaOut != nil {
				metaOut.AddPod(p, nodeInfoOut)
			}
		}
	}
	return true, metaOut, nodeInfoOut
}

// podFitsOnNode checks whether a node given by NodeInfo satisfies the given predicate functions.
// For the given pod, podFitsOnNode will check if any equivalent pod exists and try to reuse its
// cached predicate results where possible.
// This function is called from two different places: Schedule and Preempt.
// When it is called from Schedule, we want to test whether the pod is schedulable
// on the node with all the existing pods on the node plus higher and equal priority
// pods nominated to run on the node.
// When it is called from Preempt, we should remove the victims of preemption and
// add the nominated pods. Removal of the victims is done by SelectVictimsOnNode().
// It removes victims from meta and NodeInfo before calling this function.
func podFitsOnNode(
	pod *v1.Pod,
	meta algorithm.PredicateMetadata,
	info *schedulercache.NodeInfo,
	predicateFuncs map[string]algorithm.FitPredicate,
	cache schedulercache.Cache,
	ecache *equivalence.Cache,
	queue SchedulingQueue,
	alwaysCheckAllPredicates bool,
	equivClass *equivalence.Class,
) (bool, []algorithm.PredicateFailureReason, error) {
	var (
		eCacheAvailable  bool
		failedPredicates []algorithm.PredicateFailureReason
	)

	podsAdded := false
	// We run predicates twice in some cases. If the node has greater or equal priority
	// nominated pods, we run them when those pods are added to meta and nodeInfo.
	// If all predicates succeed in this pass, we run them again when these
	// nominated pods are not added. This second pass is necessary because some
	// predicates such as inter-pod affinity may not pass without the nominated pods.
	// If there are no nominated pods for the node or if the first run of the
	// predicates fails, we don't run the second pass.
	// We consider only equal or higher priority pods in the first pass, because
	// the current "pod" must yield to those pods and not take a space opened
	// for running them. It is ok if the current "pod" takes resources freed for
	// lower priority pods.
	// Requiring that the new pod is schedulable in both circumstances ensures that
	// we are making a conservative decision: predicates like resources and inter-pod
	// anti-affinity are more likely to fail when the nominated pods are treated
	// as running, while predicates like pod affinity are more likely to fail when
	// the nominated pods are treated as not running. We can't just assume the
	// nominated pods are running because they are not running right now and in fact,
	// they may end up getting scheduled to a different node.
	for i := 0; i < 2; i++ {
		metaToUse := meta
		nodeInfoToUse := info
		if i == 0 {
			podsAdded, metaToUse, nodeInfoToUse = addNominatedPods(util.GetPodPriority(pod), meta, info, queue)
		} else if !podsAdded || len(failedPredicates) != 0 {
			break
		}
		// Bypass eCache if node has any nominated pods.
		// TODO(bsalamat): consider using eCache and adding proper eCache invalidations
		// when pods are nominated or their nominations change.
		eCacheAvailable = equivClass != nil && !podsAdded
		for _, predicateKey := range predicates.Ordering() {
			var (
				fit     bool
				reasons []algorithm.PredicateFailureReason
				err     error
			)
			//TODO (yastij) : compute average predicate restrictiveness to export it as Prometheus metric
			if predicate, exist := predicateFuncs[predicateKey]; exist {
				if eCacheAvailable {
					fit, reasons, err = ecache.RunPredicate(predicate, predicateKey, pod, metaToUse, nodeInfoToUse, equivClass, cache)
				} else {
					fit, reasons, err = predicate(pod, metaToUse, nodeInfoToUse)
				}
				if err != nil {
					return false, []algorithm.PredicateFailureReason{}, err
				}

				if !fit {
					// The predicate result is unfit; record the failure reasons.
					failedPredicates = append(failedPredicates, reasons...)
					// If alwaysCheckAllPredicates is false, short circuit all predicates when one predicate fails.
					if !alwaysCheckAllPredicates {
						glog.V(5).Infoln("since alwaysCheckAllPredicates has not been set, the predicate " +
							"evaluation is short circuited and there are chances " +
							"of other predicates failing as well.")
						break
					}
				}
			}
		}
	}

	return len(failedPredicates) == 0, failedPredicates, nil
}
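
// A small illustration of the two-pass check above (pod names, node names, and
// resource numbers are hypothetical, not taken from this file): suppose node-1
// has 4 CPUs free and a nominated pod N requesting 2 CPUs. For an incoming pod
// P requesting 3 CPUs, the first pass counts N as running, so the resource
// predicate fails and node-1 is rejected, which keeps P from taking the space
// N is about to claim. Conversely, if P's only inter-pod affinity match is N,
// the second pass (run without N) fails, because N might still land on a
// different node. A node is reported as a fit only when both passes succeed.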

// PrioritizeNodes prioritizes the nodes by running the individual priority functions in parallel.
// Each priority function is expected to set a score of 0-10, where 0 is the lowest priority score
// (least preferred node) and 10 is the highest. Each priority function can also have its own weight.
// The node scores returned by the priority function are multiplied by the weights to get weighted scores.
// All scores are finally combined (added) to get the total weighted scores of all nodes.
func PrioritizeNodes(
	pod *v1.Pod,
	nodeNameToInfo map[string]*schedulercache.NodeInfo,
	meta interface{},
	priorityConfigs []algorithm.PriorityConfig,
	nodes []*v1.Node,
	extenders []algorithm.SchedulerExtender,
) (schedulerapi.HostPriorityList, error) {
	// If no priority configs are provided, then the EqualPriority function is applied.
	// This is required to generate the priority list in the required format.
	if len(priorityConfigs) == 0 && len(extenders) == 0 {
		result := make(schedulerapi.HostPriorityList, 0, len(nodes))
		for i := range nodes {
			hostPriority, err := EqualPriorityMap(pod, meta, nodeNameToInfo[nodes[i].Name])
			if err != nil {
				return nil, err
			}
			result = append(result, hostPriority)
		}
		return result, nil
	}

	var (
		mu   = sync.Mutex{}
		wg   = sync.WaitGroup{}
		errs []error
	)
	appendError := func(err error) {
		mu.Lock()
		defer mu.Unlock()
		errs = append(errs, err)
	}

	results := make([]schedulerapi.HostPriorityList, len(priorityConfigs), len(priorityConfigs))

	for i, priorityConfig := range priorityConfigs {
		if priorityConfig.Function != nil {
			// DEPRECATED
			wg.Add(1)
			go func(index int, config algorithm.PriorityConfig) {
				defer wg.Done()
				var err error
				results[index], err = config.Function(pod, nodeNameToInfo, nodes)
				if err != nil {
					appendError(err)
				}
			}(i, priorityConfig)
		} else {
			results[i] = make(schedulerapi.HostPriorityList, len(nodes))
		}
	}
	processNode := func(index int) {
		nodeInfo := nodeNameToInfo[nodes[index].Name]
		var err error
		for i := range priorityConfigs {
			if priorityConfigs[i].Function != nil {
				continue
			}
			results[i][index], err = priorityConfigs[i].Map(pod, meta, nodeInfo)
			if err != nil {
				appendError(err)
				return
			}
		}
	}
	workqueue.Parallelize(16, len(nodes), processNode)
	for i, priorityConfig := range priorityConfigs {
		if priorityConfig.Reduce == nil {
			continue
		}
		wg.Add(1)
		go func(index int, config algorithm.PriorityConfig) {
			defer wg.Done()
			if err := config.Reduce(pod, meta, nodeNameToInfo, results[index]); err != nil {
				appendError(err)
			}
			if glog.V(10) {
				for _, hostPriority := range results[index] {
					glog.Infof("%v -> %v: %v, Score: (%d)", pod.Name, hostPriority.Host, config.Name, hostPriority.Score)
				}
			}
		}(i, priorityConfig)
	}
	// Wait for all computations to be finished.
	wg.Wait()
	if len(errs) != 0 {
		return schedulerapi.HostPriorityList{}, errors.NewAggregate(errs)
	}

	// Summarize all scores.
	result := make(schedulerapi.HostPriorityList, 0, len(nodes))

	for i := range nodes {
		result = append(result, schedulerapi.HostPriority{Host: nodes[i].Name, Score: 0})
		for j := range priorityConfigs {
			result[i].Score += results[j][i].Score * priorityConfigs[j].Weight
		}
	}

	if len(extenders) != 0 && nodes != nil {
		combinedScores := make(map[string]int, len(nodeNameToInfo))
		for _, extender := range extenders {
			if !extender.IsInterested(pod) {
				continue
			}
			wg.Add(1)
			go func(ext algorithm.SchedulerExtender) {
				defer wg.Done()
				prioritizedList, weight, err := ext.Prioritize(pod, nodes)
				if err != nil {
					// Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
					return
				}
				mu.Lock()
				for i := range *prioritizedList {
					host, score := (*prioritizedList)[i].Host, (*prioritizedList)[i].Score
					combinedScores[host] += score * weight
				}
				mu.Unlock()
			}(extender)
		}
		// wait for all go routines to finish
		wg.Wait()
		for i := range result {
			result[i].Score += combinedScores[result[i].Host]
		}
	}

	if glog.V(10) {
		for i := range result {
			glog.V(10).Infof("Host %s => Score %d", result[i].Host, result[i].Score)
		}
	}
	return result, nil
}
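
// A worked example of the score combination above (priority names, weights,
// and scores are hypothetical, not taken from this file): with two priority
// configs, one of weight 1 scoring node-a at 6 and another of weight 2 scoring
// node-a at 4, the summarized score is 6*1 + 4*2 = 14; an interested extender
// returning 3 with weight 1 would then raise node-a's total to 17.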

// EqualPriorityMap is a prioritizer function that gives an equal weight of one to all nodes
func EqualPriorityMap(_ *v1.Pod, _ interface{}, nodeInfo *schedulercache.NodeInfo) (schedulerapi.HostPriority, error) {
	node := nodeInfo.Node()
	if node == nil {
		return schedulerapi.HostPriority{}, fmt.Errorf("node not found")
	}
	return schedulerapi.HostPriority{
		Host:  node.Name,
		Score: 1,
	}, nil
}

// pickOneNodeForPreemption chooses one node among the given nodes. It assumes
// pods in each map entry are ordered by decreasing priority.
// It picks a node based on the following criteria:
// 1. A node with minimum number of PDB violations.
// 2. A node with minimum highest priority victim is picked.
// 3. Ties are broken by sum of priorities of all victims.
// 4. If there are still ties, node with the minimum number of victims is picked.
// 5. If there are still ties, the first such node is picked (sort of randomly).
// The 'minNodes1' and 'minNodes2' are being reused here to save the memory
// allocation and garbage collection time.
func pickOneNodeForPreemption(nodesToVictims map[*v1.Node]*schedulerapi.Victims) *v1.Node {
	if len(nodesToVictims) == 0 {
		return nil
	}
	minNumPDBViolatingPods := math.MaxInt32
	var minNodes1 []*v1.Node
	lenNodes1 := 0
	for node, victims := range nodesToVictims {
		if len(victims.Pods) == 0 {
			// We found a node that doesn't need any preemption. Return it!
			// This should happen rarely when one or more pods are terminated between
			// the time that scheduler tries to schedule the pod and the time that
			// preemption logic tries to find nodes for preemption.
			return node
		}
		numPDBViolatingPods := victims.NumPDBViolations
		if numPDBViolatingPods < minNumPDBViolatingPods {
			minNumPDBViolatingPods = numPDBViolatingPods
			minNodes1 = nil
			lenNodes1 = 0
		}
		if numPDBViolatingPods == minNumPDBViolatingPods {
			minNodes1 = append(minNodes1, node)
			lenNodes1++
		}
	}
	if lenNodes1 == 1 {
		return minNodes1[0]
	}

	// There is more than one node with the minimum number of PDB violating pods. Find
	// the one with the minimum highest priority victim.
	minHighestPriority := int32(math.MaxInt32)
	var minNodes2 = make([]*v1.Node, lenNodes1)
	lenNodes2 := 0
	for i := 0; i < lenNodes1; i++ {
		node := minNodes1[i]
		victims := nodesToVictims[node]
		// highestPodPriority is the highest priority among the victims on this node.
		highestPodPriority := util.GetPodPriority(victims.Pods[0])
		if highestPodPriority < minHighestPriority {
			minHighestPriority = highestPodPriority
			lenNodes2 = 0
		}
		if highestPodPriority == minHighestPriority {
			minNodes2[lenNodes2] = node
			lenNodes2++
		}
	}
	if lenNodes2 == 1 {
		return minNodes2[0]
	}

	// There are a few nodes with minimum highest priority victim. Find the
	// smallest sum of priorities.
	minSumPriorities := int64(math.MaxInt64)
	lenNodes1 = 0
	for i := 0; i < lenNodes2; i++ {
		var sumPriorities int64
		node := minNodes2[i]
		for _, pod := range nodesToVictims[node].Pods {
			// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
			// needed so that a node with a few pods with negative priority is not
			// picked over a node with a smaller number of pods with the same negative
			// priority (and similar scenarios).
			sumPriorities += int64(util.GetPodPriority(pod)) + int64(math.MaxInt32+1)
		}
		if sumPriorities < minSumPriorities {
			minSumPriorities = sumPriorities
			lenNodes1 = 0
		}
		if sumPriorities == minSumPriorities {
			minNodes1[lenNodes1] = node
			lenNodes1++
		}
	}
	if lenNodes1 == 1 {
		return minNodes1[0]
	}

	// There are a few nodes with minimum highest priority victim and sum of priorities.
	// Find one with the minimum number of pods.
	minNumPods := math.MaxInt32
	lenNodes2 = 0
	for i := 0; i < lenNodes1; i++ {
		node := minNodes1[i]
		numPods := len(nodesToVictims[node].Pods)
		if numPods < minNumPods {
			minNumPods = numPods
			lenNodes2 = 0
		}
		if numPods == minNumPods {
			minNodes2[lenNodes2] = node
			lenNodes2++
		}
	}
	// At this point, even if there are more than one node with the same score,
	// return the first one.
	if lenNodes2 > 0 {
		return minNodes2[0]
	}
	glog.Errorf("Error in logic of node scoring for preemption. We should never reach here!")
	return nil
}
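
// A brief walk-through of the tie-breaking above (node names and priorities
// are hypothetical, not taken from this file): suppose node-a requires
// evicting victims with priorities {100, 50} and node-b requires {80, 80}.
// Neither eviction violates a PDB, so criterion 1 does not separate them.
// Criterion 2 prefers node-b, because its highest-priority victim (80) is
// lower than node-a's (100), so preemption disturbs less important workloads.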

// selectNodesForPreemption finds all the nodes with possible victims for
// preemption in parallel.
func selectNodesForPreemption(pod *v1.Pod,
	nodeNameToInfo map[string]*schedulercache.NodeInfo,
	potentialNodes []*v1.Node,
	predicates map[string]algorithm.FitPredicate,
	metadataProducer algorithm.PredicateMetadataProducer,
	queue SchedulingQueue,
	pdbs []*policy.PodDisruptionBudget,
) (map[*v1.Node]*schedulerapi.Victims, error) {

	nodeToVictims := map[*v1.Node]*schedulerapi.Victims{}
	var resultLock sync.Mutex

	// We can use the same metadata producer for all nodes.
	meta := metadataProducer(pod, nodeNameToInfo)
	checkNode := func(i int) {
		nodeName := potentialNodes[i].Name
		var metaCopy algorithm.PredicateMetadata
		if meta != nil {
			metaCopy = meta.ShallowCopy()
		}
		pods, numPDBViolations, fits := selectVictimsOnNode(pod, metaCopy, nodeNameToInfo[nodeName], predicates, queue, pdbs)
		if fits {
			resultLock.Lock()
			victims := schedulerapi.Victims{
				Pods:             pods,
				NumPDBViolations: numPDBViolations,
			}
			nodeToVictims[potentialNodes[i]] = &victims
			resultLock.Unlock()
		}
	}
	workqueue.Parallelize(16, len(potentialNodes), checkNode)
	return nodeToVictims, nil
}

// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods"
// and "nonViolatingPods" based on whether their PDBs will be violated if they are
// preempted.
// This function is stable and does not change the order of received pods. So, if it
// receives a sorted list, grouping will preserve the order of the input list.
func filterPodsWithPDBViolation(pods []interface{}, pdbs []*policy.PodDisruptionBudget) (violatingPods, nonViolatingPods []*v1.Pod) {
	for _, obj := range pods {
		pod := obj.(*v1.Pod)
		pdbForPodIsViolated := false
		// A pod with no labels will not match any PDB. So, no need to check.
		if len(pod.Labels) != 0 {
			for _, pdb := range pdbs {
				if pdb.Namespace != pod.Namespace {
					continue
				}
				selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector)
				if err != nil {
					continue
				}
				// A PDB with a nil or empty selector matches nothing.
				if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
					continue
				}
				// We have found a matching PDB.
				if pdb.Status.PodDisruptionsAllowed <= 0 {
					pdbForPodIsViolated = true
					break
				}
			}
		}
		if pdbForPodIsViolated {
			violatingPods = append(violatingPods, pod)
		} else {
			nonViolatingPods = append(nonViolatingPods, pod)
		}
	}
	return violatingPods, nonViolatingPods
}

// selectVictimsOnNode finds a minimum set of pods on the given node that should
// be preempted in order to make enough room for "pod" to be scheduled. The
// minimum set selected is subject to the constraint that a higher-priority pod
// is never preempted when a lower-priority pod could be (higher/lower relative
// to one another, not relative to the preemptor "pod").
// The algorithm first checks if the pod can be scheduled on the node when all the
// lower priority pods are gone. If so, it sorts all the lower priority pods by
// their priority and then puts them into two groups: those whose PodDisruptionBudget
// will be violated if preempted, and other non-violating pods. Both groups are
// sorted by priority. It first tries to reprieve as many PDB violating pods as
// possible and then does the same for non-PDB-violating pods while checking
// that the "pod" can still fit on the node.
// NOTE: This function assumes that it is never called if "pod" cannot be scheduled
// due to pod affinity, node affinity, or node anti-affinity reasons. None of
// these predicates can be satisfied by removing more pods from the node.
func selectVictimsOnNode(
	pod *v1.Pod,
	meta algorithm.PredicateMetadata,
	nodeInfo *schedulercache.NodeInfo,
	fitPredicates map[string]algorithm.FitPredicate,
	queue SchedulingQueue,
	pdbs []*policy.PodDisruptionBudget,
) ([]*v1.Pod, int, bool) {
	potentialVictims := util.SortableList{CompFunc: util.HigherPriorityPod}
	nodeInfoCopy := nodeInfo.Clone()

	removePod := func(rp *v1.Pod) {
		nodeInfoCopy.RemovePod(rp)
		if meta != nil {
			meta.RemovePod(rp)
		}
	}
	addPod := func(ap *v1.Pod) {
		nodeInfoCopy.AddPod(ap)
		if meta != nil {
			meta.AddPod(ap, nodeInfoCopy)
		}
	}
	// As the first step, remove all the lower priority pods from the node and
	// check if the given pod can be scheduled.
	podPriority := util.GetPodPriority(pod)
	for _, p := range nodeInfoCopy.Pods() {
		if util.GetPodPriority(p) < podPriority {
			potentialVictims.Items = append(potentialVictims.Items, p)
			removePod(p)
		}
	}
	potentialVictims.Sort()
	// If the new pod does not fit after removing all the lower priority pods,
	// we are almost done and this node is not suitable for preemption. The only condition
	// that we should check is if the "pod" is failing to schedule due to pod affinity
	// failure.
	// TODO(bsalamat): Consider checking affinity to lower priority pods if feasible with reasonable performance.
	if fits, _, err := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil, nil, queue, false, nil); !fits {
		if err != nil {
			glog.Warningf("Encountered error while selecting victims on node %v: %v", nodeInfo.Node().Name, err)
		}
		return nil, 0, false
	}
	var victims []*v1.Pod
	numViolatingVictim := 0
	// Try to reprieve as many pods as possible. We first try to reprieve the PDB
	// violating victims and then other non-violating ones. In both cases, we start
	// from the highest priority victims.
	violatingVictims, nonViolatingVictims := filterPodsWithPDBViolation(potentialVictims.Items, pdbs)
	reprievePod := func(p *v1.Pod) bool {
		addPod(p)
		fits, _, _ := podFitsOnNode(pod, meta, nodeInfoCopy, fitPredicates, nil, nil, queue, false, nil)
		if !fits {
			removePod(p)
			victims = append(victims, p)
			glog.V(5).Infof("Pod %v is a potential preemption victim on node %v.", p.Name, nodeInfo.Node().Name)
		}
		return fits
	}
	for _, p := range violatingVictims {
		if !reprievePod(p) {
			numViolatingVictim++
		}
	}
	// Now we try to reprieve non-violating victims.
	for _, p := range nonViolatingVictims {
		reprievePod(p)
	}
	return victims, numViolatingVictim, true
}
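
// A short illustration of the reprieve loop above (pod names and priorities
// are hypothetical, not taken from this file): if removing victims {A(pri 10),
// B(pri 5), C(pri 1)} lets the preemptor fit, the loop adds them back one at a
// time, starting from the highest priority. If the preemptor still fits with A
// restored, A is reprieved and only B and C remain victims; if restoring B
// breaks the fit, B is removed again and stays on the victim list.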

// nodesWherePreemptionMightHelp returns a list of nodes with failed predicates
// that may be satisfied by removing pods from the node.
func nodesWherePreemptionMightHelp(nodes []*v1.Node, failedPredicatesMap FailedPredicateMap) []*v1.Node {
	potentialNodes := []*v1.Node{}
	for _, node := range nodes {
		unresolvableReasonExist := false
		failedPredicates, found := failedPredicatesMap[node.Name]
		// If we assume that scheduler looks at all nodes and populates the failedPredicateMap
		// (which is the case today), the !found case should never happen, but we'd prefer
		// to rely less on such assumptions in the code when checking does not impose
		// significant overhead.
		// Also, we currently assume all failures returned by extender as resolvable.
		for _, failedPredicate := range failedPredicates {
			switch failedPredicate {
			case
				predicates.ErrNodeSelectorNotMatch,
				predicates.ErrPodAffinityRulesNotMatch,
				predicates.ErrPodNotMatchHostName,
				predicates.ErrTaintsTolerationsNotMatch,
				predicates.ErrNodeLabelPresenceViolated,
				// Node conditions won't change when scheduler simulates removal of preemption victims.
				// So, it is pointless to try nodes that have not been able to host the pod due to node
				// conditions. These include ErrNodeNotReady, ErrNodeUnderPIDPressure, ErrNodeUnderMemoryPressure, ....
				predicates.ErrNodeNotReady,
				predicates.ErrNodeNetworkUnavailable,
				predicates.ErrNodeUnderDiskPressure,
				predicates.ErrNodeUnderPIDPressure,
				predicates.ErrNodeUnderMemoryPressure,
				predicates.ErrNodeOutOfDisk,
				predicates.ErrNodeUnschedulable,
				predicates.ErrNodeUnknownCondition,
				predicates.ErrVolumeZoneConflict,
				predicates.ErrVolumeNodeConflict,
				predicates.ErrVolumeBindConflict:
				unresolvableReasonExist = true
				break
			}
		}
		if !found || !unresolvableReasonExist {
			glog.V(3).Infof("Node %v is a potential node for preemption.", node.Name)
			potentialNodes = append(potentialNodes, node)
		}
	}
	return potentialNodes
}

// podEligibleToPreemptOthers determines whether this pod should be considered
// for preempting other pods or not. If this pod has already preempted other
// pods and those are in their graceful termination period, it shouldn't be
// considered for preemption.
// We look at the node that is nominated for this pod and as long as there are
// terminating pods on the node, we don't consider this for preempting more pods.
func podEligibleToPreemptOthers(pod *v1.Pod, nodeNameToInfo map[string]*schedulercache.NodeInfo) bool {
	nomNodeName := pod.Status.NominatedNodeName
	if len(nomNodeName) > 0 {
		if nodeInfo, found := nodeNameToInfo[nomNodeName]; found {
			for _, p := range nodeInfo.Pods() {
				if p.DeletionTimestamp != nil && util.GetPodPriority(p) < util.GetPodPriority(pod) {
					// There is a terminating pod on the nominated node.
					return false
				}
			}
		}
	}
	return true
}

// podPassesBasicChecks performs sanity checks on the pod to determine whether it can be scheduled.
func podPassesBasicChecks(pod *v1.Pod, pvcLister corelisters.PersistentVolumeClaimLister) error {
	// Check PVCs used by the pod
	namespace := pod.Namespace
	manifest := &(pod.Spec)
	for i := range manifest.Volumes {
		volume := &manifest.Volumes[i]
		if volume.PersistentVolumeClaim == nil {
			// Volume is not a PVC, ignore
			continue
		}
		pvcName := volume.PersistentVolumeClaim.ClaimName
		pvc, err := pvcLister.PersistentVolumeClaims(namespace).Get(pvcName)
		if err != nil {
			// The error already has enough context ("persistentvolumeclaim "myclaim" not found")
			return err
		}

		if pvc.DeletionTimestamp != nil {
			return fmt.Errorf("persistentvolumeclaim %q is being deleted", pvc.Name)
		}
	}

	return nil
}

// NewGenericScheduler creates a genericScheduler object.
func NewGenericScheduler(
	cache schedulercache.Cache,
	eCache *equivalence.Cache,
	podQueue SchedulingQueue,
	predicates map[string]algorithm.FitPredicate,
	predicateMetaProducer algorithm.PredicateMetadataProducer,
	prioritizers []algorithm.PriorityConfig,
	priorityMetaProducer algorithm.PriorityMetadataProducer,
	extenders []algorithm.SchedulerExtender,
	volumeBinder *volumebinder.VolumeBinder,
	pvcLister corelisters.PersistentVolumeClaimLister,
	alwaysCheckAllPredicates bool,
	disablePreemption bool,
) algorithm.ScheduleAlgorithm {
	return &genericScheduler{
		cache:                    cache,
		equivalenceCache:         eCache,
		schedulingQueue:          podQueue,
		predicates:               predicates,
		predicateMetaProducer:    predicateMetaProducer,
		prioritizers:             prioritizers,
		priorityMetaProducer:     priorityMetaProducer,
		extenders:                extenders,
		cachedNodeInfoMap:        make(map[string]*schedulercache.NodeInfo),
		volumeBinder:             volumeBinder,
		pvcLister:                pvcLister,
		alwaysCheckAllPredicates: alwaysCheckAllPredicates,
		disablePreemption:        disablePreemption,
	}
}