diff --git a/plugin/pkg/scheduler/algorithm/predicates/error.go b/plugin/pkg/scheduler/algorithm/predicates/error.go
index 71e0794810..8cc85bce2c 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/error.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/error.go
@@ -38,6 +38,10 @@ var (
 	ErrNodeUnderMemoryPressure = newPredicateFailureError("NodeUnderMemoryPressure")
 	ErrNodeUnderDiskPressure = newPredicateFailureError("NodeUnderDiskPressure")
 	ErrNodeOutOfDisk = newPredicateFailureError("NodeOutOfDisk")
+	ErrNodeNotReady = newPredicateFailureError("NodeNotReady")
+	ErrNodeNetworkUnavailable = newPredicateFailureError("NodeNetworkUnavailable")
+	ErrNodeUnschedulable = newPredicateFailureError("NodeUnschedulable")
+	ErrNodeUnknownCondition = newPredicateFailureError("NodeUnknownCondition")
 	ErrVolumeNodeConflict = newPredicateFailureError("NoVolumeNodeConflict")
 	// ErrFakePredicate is used for test only. The fake predicates returning false also returns error
 	// as ErrFakePredicate.
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
index e76a2e2017..bb12a45781 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -1301,6 +1301,37 @@ func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta interface{}, nodeInfo *sch
 	return true, nil, nil
 }

+// CheckNodeConditionPredicate checks if a pod can be scheduled on a node reporting out of disk,
+// network unavailable and not ready condition. Only node conditions are accounted in this predicate.
+func CheckNodeConditionPredicate(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
+	reasons := []algorithm.PredicateFailureReason{}
+
+	if nodeInfo == nil || nodeInfo.Node() == nil {
+		return false, []algorithm.PredicateFailureReason{ErrNodeUnknownCondition}, nil
+	}
+
+	node := nodeInfo.Node()
+	for _, cond := range node.Status.Conditions {
+		// We consider the node for scheduling only when its:
+		// - NodeReady condition status is ConditionTrue,
+		// - NodeOutOfDisk condition status is ConditionFalse,
+		// - NodeNetworkUnavailable condition status is ConditionFalse.
+		if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue {
+			reasons = append(reasons, ErrNodeNotReady)
+		} else if cond.Type == v1.NodeOutOfDisk && cond.Status != v1.ConditionFalse {
+			reasons = append(reasons, ErrNodeOutOfDisk)
+		} else if cond.Type == v1.NodeNetworkUnavailable && cond.Status != v1.ConditionFalse {
+			reasons = append(reasons, ErrNodeNetworkUnavailable)
+		}
+	}
+
+	if node.Spec.Unschedulable {
+		reasons = append(reasons, ErrNodeUnschedulable)
+	}
+
+	return len(reasons) == 0, reasons, nil
+}
+
 type VolumeNodeChecker struct {
 	pvInfo PersistentVolumeInfo
 	pvcInfo PersistentVolumeClaimInfo
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
index 5d7b9e2994..d379f9c5ff 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
@@ -3442,6 +3442,78 @@ func TestPodSchedulesOnNodeWithDiskPressureCondition(t *testing.T) {
 	}
 }

+func TestNodeConditionPredicate(t *testing.T) {
+	tests := []struct {
+		node        *v1.Node
+		schedulable bool
+	}{
+		// node1 considered
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}}}},
+			schedulable: true,
+		},
+		// node2 ignored - node not Ready
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node2"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}}}},
+			schedulable: false,
+		},
+		// node3 ignored - node out of disk
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node3"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeOutOfDisk, Status: v1.ConditionTrue}}}},
+			schedulable: false,
+		},
+
+		// node4 considered
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node4"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeOutOfDisk, Status: v1.ConditionFalse}}}},
+			schedulable: true,
+		},
+		// node5 ignored - node out of disk
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node5"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}, {Type: v1.NodeOutOfDisk, Status: v1.ConditionTrue}}}},
+			schedulable: false,
+		},
+		// node6 considered
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node6"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}, {Type: v1.NodeOutOfDisk, Status: v1.ConditionFalse}}}},
+			schedulable: true,
+		},
+		// node7 ignored - node out of disk, node not Ready
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node7"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}, {Type: v1.NodeOutOfDisk, Status: v1.ConditionTrue}}}},
+			schedulable: false,
+		},
+		// node8 ignored - node not Ready
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node8"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}, {Type: v1.NodeOutOfDisk, Status: v1.ConditionFalse}}}},
+			schedulable: false,
+		},
+		// node9 ignored - node unschedulable
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node9"}, Spec: v1.NodeSpec{Unschedulable: true}},
+			schedulable: false,
+		},
+		// node10 considered
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node10"}, Spec: v1.NodeSpec{Unschedulable: false}},
+			schedulable: true,
+		},
+		// node11 considered
+		{
+			node:        &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node11"}},
+			schedulable: true,
+		},
+	}
+
+	for _, test := range tests {
+		nodeInfo := makeEmptyNodeInfo(test.node)
+		if fit, reasons, err := CheckNodeConditionPredicate(nil, nil, nodeInfo); fit != test.schedulable {
+			t.Errorf("%s: expected: %t, got %t; %+v, %v",
+				test.node.Name, test.schedulable, fit, reasons, err)
+		}
+	}
+}
+
 func createPodWithVolume(pod, pv, pvc string) *v1.Pod {
 	return &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{Name: pod, Namespace: "default"},
diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go b/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go
index fce476b15b..42ccee1ab0 100644
--- a/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go
+++ b/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go
@@ -387,6 +387,79 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
 				},
 			},
 		},
+		// Do not change this JSON after the corresponding release has been tagged.
+		// A failure indicates backwards compatibility with the specified release was broken.
+		"1.8": {
+			JSON: `{
+				"kind": "Policy",
+				"apiVersion": "v1",
+				"predicates": [
+					{"name": "MatchNodeSelector"},
+					{"name": "PodFitsResources"},
+					{"name": "PodFitsHostPorts"},
+					{"name": "HostName"},
+					{"name": "NoDiskConflict"},
+					{"name": "NoVolumeZoneConflict"},
+					{"name": "PodToleratesNodeTaints"},
+					{"name": "CheckNodeMemoryPressure"},
+					{"name": "CheckNodeDiskPressure"},
+					{"name": "CheckNodeCondition"},
+					{"name": "MaxEBSVolumeCount"},
+					{"name": "MaxGCEPDVolumeCount"},
+					{"name": "MaxAzureDiskVolumeCount"},
+					{"name": "MatchInterPodAffinity"},
+					{"name": "GeneralPredicates"},
+					{"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}},
+					{"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}},
+					{"name": "NoVolumeNodeConflict"}
+				],"priorities": [
+					{"name": "EqualPriority", "weight": 2},
+					{"name": "ImageLocalityPriority", "weight": 2},
+					{"name": "LeastRequestedPriority", "weight": 2},
+					{"name": "BalancedResourceAllocation", "weight": 2},
+					{"name": "SelectorSpreadPriority", "weight": 2},
+					{"name": "NodePreferAvoidPodsPriority", "weight": 2},
+					{"name": "NodeAffinityPriority", "weight": 2},
+					{"name": "TaintTolerationPriority", "weight": 2},
+					{"name": "InterPodAffinityPriority", "weight": 2},
+					{"name": "MostRequestedPriority", "weight": 2}
+				]
+			}`,
+			ExpectedPolicy: schedulerapi.Policy{
+				Predicates: []schedulerapi.PredicatePolicy{
+					{Name: "MatchNodeSelector"},
+					{Name: "PodFitsResources"},
+					{Name: "PodFitsHostPorts"},
+					{Name: "HostName"},
+					{Name: "NoDiskConflict"},
+					{Name: "NoVolumeZoneConflict"},
+					{Name: "PodToleratesNodeTaints"},
+					{Name: "CheckNodeMemoryPressure"},
+					{Name: "CheckNodeDiskPressure"},
+					{Name: "CheckNodeCondition"},
+					{Name: "MaxEBSVolumeCount"},
+					{Name: "MaxGCEPDVolumeCount"},
+					{Name: "MaxAzureDiskVolumeCount"},
+					{Name: "MatchInterPodAffinity"},
+					{Name: "GeneralPredicates"},
+					{Name: "TestServiceAffinity", Argument: &schedulerapi.PredicateArgument{ServiceAffinity: &schedulerapi.ServiceAffinity{Labels: []string{"region"}}}},
+					{Name: "TestLabelsPresence", Argument: &schedulerapi.PredicateArgument{LabelsPresence: &schedulerapi.LabelsPresence{Labels: []string{"foo"}, Presence: true}}},
+					{Name: "NoVolumeNodeConflict"},
+				},
+				Priorities: []schedulerapi.PriorityPolicy{
+					{Name: "EqualPriority", Weight: 2},
+					{Name: "ImageLocalityPriority", Weight: 2},
+					{Name: "LeastRequestedPriority", Weight: 2},
"LeastRequestedPriority", Weight: 2}, + {Name: "BalancedResourceAllocation", Weight: 2}, + {Name: "SelectorSpreadPriority", Weight: 2}, + {Name: "NodePreferAvoidPodsPriority", Weight: 2}, + {Name: "NodeAffinityPriority", Weight: 2}, + {Name: "TaintTolerationPriority", Weight: 2}, + {Name: "InterPodAffinityPriority", Weight: 2}, + {Name: "MostRequestedPriority", Weight: 2}, + }, + }, + }, } registeredPredicates := sets.NewString(factory.ListRegisteredFitPredicates()...) diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go index 46fa0e60bf..7e25b0405c 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go @@ -175,6 +175,10 @@ func defaultPredicates() sets.String { // Fit is determined by node disk pressure condition. factory.RegisterFitPredicate("CheckNodeDiskPressure", predicates.CheckNodeDiskPressurePredicate), + // Fit is determied by node condtions: not ready, network unavailable and out of disk. + factory.RegisterFitPredicate("CheckNodeCondition", predicates.CheckNodeConditionPredicate), + factory.RegisterMandatoryFitPredicate("CheckNodeCondition", predicates.CheckNodeConditionPredicate), + // Fit is determined by volume zone requirements. factory.RegisterFitPredicateFactory( "NoVolumeNodeConflict", diff --git a/plugin/pkg/scheduler/factory/factory.go b/plugin/pkg/scheduler/factory/factory.go index 03348f3ee6..1aae703f4d 100644 --- a/plugin/pkg/scheduler/factory/factory.go +++ b/plugin/pkg/scheduler/factory/factory.go @@ -705,7 +705,7 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String, SchedulerCache: f.schedulerCache, Ecache: f.equivalencePodCache, // The scheduler only needs to consider schedulable nodes. - NodeLister: &nodePredicateLister{f.nodeLister}, + NodeLister: &nodeLister{f.nodeLister}, Algorithm: algo, Binder: f.getBinder(extenders), PodConditionUpdater: &podConditionUpdater{f.client}, @@ -720,12 +720,12 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String, }, nil } -type nodePredicateLister struct { +type nodeLister struct { corelisters.NodeLister } -func (n *nodePredicateLister) List() ([]*v1.Node, error) { - return n.ListWithPredicate(getNodeConditionPredicate()) +func (n *nodeLister) List() ([]*v1.Node, error) { + return n.NodeLister.List(labels.Everything()) } func (f *ConfigFactory) GetPriorityFunctionConfigs(priorityKeys sets.String) ([]algorithm.PriorityConfig, error) { @@ -770,11 +770,10 @@ func (f *ConfigFactory) getPluginArgs() (*PluginFactoryArgs, error) { ControllerLister: f.controllerLister, ReplicaSetLister: f.replicaSetLister, StatefulSetLister: f.statefulSetLister, - // All fit predicates only need to consider schedulable nodes. 
-		NodeLister: &nodePredicateLister{f.nodeLister},
-		NodeInfo: &predicates.CachedNodeInfo{NodeLister: f.nodeLister},
-		PVInfo: &predicates.CachedPersistentVolumeInfo{PersistentVolumeLister: f.pVLister},
-		PVCInfo: &predicates.CachedPersistentVolumeClaimInfo{PersistentVolumeClaimLister: f.pVCLister},
+		NodeLister: &nodeLister{f.nodeLister},
+		NodeInfo: &predicates.CachedNodeInfo{NodeLister: f.nodeLister},
+		PVInfo: &predicates.CachedPersistentVolumeInfo{PersistentVolumeLister: f.pVLister},
+		PVCInfo: &predicates.CachedPersistentVolumeClaimInfo{PersistentVolumeClaimLister: f.pVCLister},
 		HardPodAffinitySymmetricWeight: f.hardPodAffinitySymmetricWeight,
 	}, nil
 }
@@ -793,34 +792,6 @@ func (f *ConfigFactory) ResponsibleForPod(pod *v1.Pod) bool {
 	return f.schedulerName == pod.Spec.SchedulerName
 }

-func getNodeConditionPredicate() corelisters.NodeConditionPredicate {
-	return func(node *v1.Node) bool {
-		for i := range node.Status.Conditions {
-			cond := &node.Status.Conditions[i]
-			// We consider the node for scheduling only when its:
-			// - NodeReady condition status is ConditionTrue,
-			// - NodeOutOfDisk condition status is ConditionFalse,
-			// - NodeNetworkUnavailable condition status is ConditionFalse.
-			if cond.Type == v1.NodeReady && cond.Status != v1.ConditionTrue {
-				glog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
-				return false
-			} else if cond.Type == v1.NodeOutOfDisk && cond.Status != v1.ConditionFalse {
-				glog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
-				return false
-			} else if cond.Type == v1.NodeNetworkUnavailable && cond.Status != v1.ConditionFalse {
-				glog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
-				return false
-			}
-		}
-		// Ignore nodes that are marked unschedulable
-		if node.Spec.Unschedulable {
-			glog.V(4).Infof("Ignoring node %v since it is unschedulable", node.Name)
-			return false
-		}
-		return true
-	}
-}
-
 // unassignedNonTerminatedPod selects pods that are unassigned and non-terminal.
 func unassignedNonTerminatedPod(pod *v1.Pod) bool {
 	if len(pod.Spec.NodeName) != 0 {
diff --git a/plugin/pkg/scheduler/factory/factory_test.go b/plugin/pkg/scheduler/factory/factory_test.go
index 730e1bb035..28cfaa56a6 100644
--- a/plugin/pkg/scheduler/factory/factory_test.go
+++ b/plugin/pkg/scheduler/factory/factory_test.go
@@ -525,46 +525,3 @@ func TestInvalidFactoryArgs(t *testing.T) {
 	}

 }
-
-func TestNodeConditionPredicate(t *testing.T) {
-	nodeFunc := getNodeConditionPredicate()
-	nodeList := &v1.NodeList{
-		Items: []v1.Node{
-			// node1 considered
-			{ObjectMeta: metav1.ObjectMeta{Name: "node1"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}}}},
-			// node2 ignored - node not Ready
-			{ObjectMeta: metav1.ObjectMeta{Name: "node2"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}}}},
-			// node3 ignored - node out of disk
-			{ObjectMeta: metav1.ObjectMeta{Name: "node3"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeOutOfDisk, Status: v1.ConditionTrue}}}},
-			// node4 considered
-			{ObjectMeta: metav1.ObjectMeta{Name: "node4"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeOutOfDisk, Status: v1.ConditionFalse}}}},
-
-			// node5 ignored - node out of disk
-			{ObjectMeta: metav1.ObjectMeta{Name: "node5"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}, {Type: v1.NodeOutOfDisk, Status: v1.ConditionTrue}}}},
-			// node6 considered
-			{ObjectMeta: metav1.ObjectMeta{Name: "node6"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionTrue}, {Type: v1.NodeOutOfDisk, Status: v1.ConditionFalse}}}},
-			// node7 ignored - node out of disk, node not Ready
-			{ObjectMeta: metav1.ObjectMeta{Name: "node7"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}, {Type: v1.NodeOutOfDisk, Status: v1.ConditionTrue}}}},
-			// node8 ignored - node not Ready
-			{ObjectMeta: metav1.ObjectMeta{Name: "node8"}, Status: v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}, {Type: v1.NodeOutOfDisk, Status: v1.ConditionFalse}}}},
-
-			// node9 ignored - node unschedulable
-			{ObjectMeta: metav1.ObjectMeta{Name: "node9"}, Spec: v1.NodeSpec{Unschedulable: true}},
-			// node10 considered
-			{ObjectMeta: metav1.ObjectMeta{Name: "node10"}, Spec: v1.NodeSpec{Unschedulable: false}},
-			// node11 considered
-			{ObjectMeta: metav1.ObjectMeta{Name: "node11"}},
-		},
-	}
-
-	nodeNames := []string{}
-	for _, node := range nodeList.Items {
-		if nodeFunc(&node) {
-			nodeNames = append(nodeNames, node.Name)
-		}
-	}
-	expectedNodes := []string{"node1", "node4", "node6", "node10", "node11"}
-	if !reflect.DeepEqual(expectedNodes, nodeNames) {
-		t.Errorf("expected: %v, got %v", expectedNodes, nodeNames)
-	}
-}
diff --git a/plugin/pkg/scheduler/factory/plugins.go b/plugin/pkg/scheduler/factory/plugins.go
index df1c6febfc..ad245c7d9c 100644
--- a/plugin/pkg/scheduler/factory/plugins.go
+++ b/plugin/pkg/scheduler/factory/plugins.go
@@ -72,9 +72,10 @@ var (
 	schedulerFactoryMutex sync.Mutex

 	// maps that hold registered algorithm types
-	fitPredicateMap      = make(map[string]FitPredicateFactory)
-	priorityFunctionMap  = make(map[string]PriorityConfigFactory)
-	algorithmProviderMap = make(map[string]AlgorithmProviderConfig)
+	fitPredicateMap          = make(map[string]FitPredicateFactory)
+	mandatoryFitPredicateMap = make(map[string]FitPredicateFactory)
+	priorityFunctionMap      = make(map[string]PriorityConfigFactory)
+	algorithmProviderMap     = make(map[string]AlgorithmProviderConfig)

 	// Registered metadata producers
 	priorityMetadataProducer MetadataProducerFactory
@@ -99,6 +100,17 @@ func RegisterFitPredicate(name string, predicate algorithm.FitPredicate) string {
 	return RegisterFitPredicateFactory(name, func(PluginFactoryArgs) algorithm.FitPredicate { return predicate })
 }

+// RegisterMandatoryFitPredicate registers a fit predicate with the algorithm registry. The predicate is used by
+// the kubelet and by DaemonSet, and it is always included in the scheduler configuration. Returns the name with
+// which the predicate was registered.
+func RegisterMandatoryFitPredicate(name string, predicate algorithm.FitPredicate) string {
+	schedulerFactoryMutex.Lock()
+	defer schedulerFactoryMutex.Unlock()
+	validateAlgorithmNameOrDie(name)
+	mandatoryFitPredicateMap[name] = func(PluginFactoryArgs) algorithm.FitPredicate { return predicate }
+	return name
+}
+
 // RegisterFitPredicateFactory registers a fit predicate factory with the
 // algorithm registry. Returns the name with which the predicate was registered.
 func RegisterFitPredicateFactory(name string, predicateFactory FitPredicateFactory) string {
@@ -308,6 +320,14 @@ func getFitPredicateFunctions(names sets.String, args PluginFactoryArgs) (map[st
 		}
 		predicates[name] = factory(args)
 	}
+
+	// Always include required fit predicates.
+	for name, factory := range mandatoryFitPredicateMap {
+		if _, found := predicates[name]; !found {
+			predicates[name] = factory(args)
+		}
+	}
+
 	return predicates, nil
 }

diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go
index cd109648a9..6322e55163 100644
--- a/test/integration/scheduler/scheduler_test.go
+++ b/test/integration/scheduler/scheduler_test.go
@@ -139,7 +139,8 @@ func TestSchedulerCreationFromConfigMap(t *testing.T) {
 	// Verify that the config is applied correctly.
 	schedPredicates := sched.Config().Algorithm.Predicates()
 	schedPrioritizers := sched.Config().Algorithm.Prioritizers()
-	if len(schedPredicates) != 2 || len(schedPrioritizers) != 2 {
+	// Includes one mandatory predicate.
+	if len(schedPredicates) != 3 || len(schedPrioritizers) != 2 {
 		t.Errorf("Unexpected number of predicates or priority functions. Number of predicates: %v, number of prioritizers: %v", len(schedPredicates), len(schedPrioritizers))
 	}
 	// Check a predicate and a priority function.
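
Note on the mandatory-predicate mechanism: CheckNodeCondition is registered twice in defaults.go above, once as an ordinary fit predicate and once through the new RegisterMandatoryFitPredicate, so that getFitPredicateFunctions merges it into every resolved configuration, including custom policies that omit it. The standalone Go sketch below illustrates that merge behavior with simplified stand-in types; FitPredicate, getFitPredicates and the two maps here are illustrative placeholders written for this note, not the real scheduler factory API.

package main

import "fmt"

// FitPredicate is a simplified stand-in for algorithm.FitPredicate.
type FitPredicate func(nodeName string) bool

var (
	// fitPredicateMap mirrors the registry of ordinary predicates.
	fitPredicateMap = map[string]FitPredicate{}
	// mandatoryFitPredicateMap mirrors the new registry added by this patch;
	// its entries are merged into every resolved configuration.
	mandatoryFitPredicateMap = map[string]FitPredicate{}
)

// getFitPredicates mimics getFitPredicateFunctions: it resolves the requested
// names, then unconditionally adds any mandatory predicates that are missing.
func getFitPredicates(names []string) map[string]FitPredicate {
	predicates := map[string]FitPredicate{}
	for _, name := range names {
		if p, ok := fitPredicateMap[name]; ok {
			predicates[name] = p
		}
	}
	// Always include required fit predicates.
	for name, p := range mandatoryFitPredicateMap {
		if _, found := predicates[name]; !found {
			predicates[name] = p
		}
	}
	return predicates
}

func main() {
	fitPredicateMap["PodFitsResources"] = func(string) bool { return true }
	mandatoryFitPredicateMap["CheckNodeCondition"] = func(string) bool { return true }

	// A policy that only names PodFitsResources still ends up with two
	// predicates; this is the same effect that makes the integration test
	// above expect 3 predicates (two from its policy plus one mandatory).
	got := getFitPredicates([]string{"PodFitsResources"})
	fmt.Println(len(got)) // prints 2
}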