2014-06-28 22:35:51 +00:00
|
|
|
/*
|
2015-05-01 16:19:44 +00:00
|
|
|
Copyright 2014 The Kubernetes Authors All rights reserved.
|
2014-06-28 22:35:51 +00:00
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
2015-05-08 11:01:09 +00:00
|
|
|
package predicates
|
2014-06-28 22:35:51 +00:00
|
|
|
|
|
|
|
import (
|
2014-09-26 23:28:30 +00:00
|
|
|
"fmt"
|
|
|
|
|
2015-08-05 22:03:47 +00:00
|
|
|
"k8s.io/kubernetes/pkg/api"
|
2015-11-26 08:57:26 +00:00
|
|
|
"k8s.io/kubernetes/pkg/client/cache"
|
2015-08-05 22:03:47 +00:00
|
|
|
"k8s.io/kubernetes/pkg/labels"
|
|
|
|
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
|
2016-01-28 20:14:45 +00:00
|
|
|
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
|
2015-07-08 07:48:49 +00:00
|
|
|
|
|
|
|
"github.com/golang/glog"
|
2015-11-29 19:00:49 +00:00
|
|
|
"k8s.io/kubernetes/pkg/api/unversioned"
|
2014-06-28 22:35:51 +00:00
|
|
|
)
|
|
|
|
|
2014-09-25 20:55:42 +00:00
|
|
|
type NodeInfo interface {
|
2014-12-08 03:44:27 +00:00
|
|
|
GetNodeInfo(nodeID string) (*api.Node, error)
|
2014-09-26 23:28:30 +00:00
|
|
|
}
|
|
|
|
|
2015-11-29 19:00:49 +00:00
|
|
|
type PersistentVolumeInfo interface {
|
|
|
|
GetPersistentVolumeInfo(pvID string) (*api.PersistentVolume, error)
|
|
|
|
}
|
|
|
|
|
|
|
|
type PersistentVolumeClaimInfo interface {
|
|
|
|
GetPersistentVolumeClaimInfo(namespace string, pvcID string) (*api.PersistentVolumeClaim, error)
|
|
|
|
}
|
|
|
|
|
2015-11-26 08:57:26 +00:00
|
|
|
type CachedNodeInfo struct {
|
|
|
|
*cache.StoreToNodeLister
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetNodeInfo returns cached data for the node 'id'.
|
|
|
|
func (c *CachedNodeInfo) GetNodeInfo(id string) (*api.Node, error) {
|
|
|
|
node, exists, err := c.Get(&api.Node{ObjectMeta: api.ObjectMeta{Name: id}})
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error retrieving node '%v' from cache: %v", id, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if !exists {
|
|
|
|
return nil, fmt.Errorf("node '%v' is not in cache", id)
|
|
|
|
}
|
|
|
|
|
|
|
|
return node.(*api.Node), nil
|
|
|
|
}
|
|
|
|
|
2014-10-13 04:34:23 +00:00
|
|
|
func isVolumeConflict(volume api.Volume, pod *api.Pod) bool {
|
2015-12-09 19:45:56 +00:00
|
|
|
// fast path if there is no conflict checking targets.
|
|
|
|
if volume.GCEPersistentDisk == nil && volume.AWSElasticBlockStore == nil && volume.RBD == nil {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2015-12-09 19:19:57 +00:00
|
|
|
for _, existingVolume := range pod.Spec.Volumes {
|
2015-12-09 19:45:56 +00:00
|
|
|
// Same GCE disk mounted by multiple pods conflicts unless all pods mount it read-only.
|
2015-12-09 19:19:57 +00:00
|
|
|
if volume.GCEPersistentDisk != nil && existingVolume.GCEPersistentDisk != nil {
|
|
|
|
disk, existingDisk := volume.GCEPersistentDisk, existingVolume.GCEPersistentDisk
|
|
|
|
if disk.PDName == existingDisk.PDName && !(disk.ReadOnly && existingDisk.ReadOnly) {
|
2015-03-06 14:26:39 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
2014-10-13 04:34:23 +00:00
|
|
|
|
2015-12-09 19:19:57 +00:00
|
|
|
if volume.AWSElasticBlockStore != nil && existingVolume.AWSElasticBlockStore != nil {
|
|
|
|
if volume.AWSElasticBlockStore.VolumeID == existingVolume.AWSElasticBlockStore.VolumeID {
|
2015-03-06 14:26:39 +00:00
|
|
|
return true
|
|
|
|
}
|
2014-10-13 04:34:23 +00:00
|
|
|
}
|
2015-12-09 19:19:57 +00:00
|
|
|
|
|
|
|
if volume.RBD != nil && existingVolume.RBD != nil {
|
|
|
|
mon, pool, image := volume.RBD.CephMonitors, volume.RBD.RBDPool, volume.RBD.RBDImage
|
|
|
|
emon, epool, eimage := existingVolume.RBD.CephMonitors, existingVolume.RBD.RBDPool, existingVolume.RBD.RBDImage
|
|
|
|
if haveSame(mon, emon) && pool == epool && image == eimage {
|
|
|
|
return true
|
2015-10-20 17:24:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-12-09 19:19:57 +00:00
|
|
|
|
2014-10-13 04:34:23 +00:00
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// NoDiskConflict evaluates if a pod can fit due to the volumes it requests, and those that
|
2015-10-22 13:28:30 +00:00
|
|
|
// are already mounted. If there is already a volume mounted on that node, another pod that uses the same volume
|
2015-11-02 15:18:39 +00:00
|
|
|
// can't be scheduled there.
|
|
|
|
// This is GCE, Amazon EBS, and Ceph RBD specific for now:
|
|
|
|
// - GCE PD allows multiple mounts as long as they're all read-only
|
|
|
|
// - AWS EBS forbids any two pods mounting the same volume ID
|
|
|
|
// - Ceph RBD forbids if any two pods share at least same monitor, and match pool and image.
|
2014-10-13 04:34:23 +00:00
|
|
|
// TODO: migrate this into some per-volume specific code?
|
2016-01-28 20:14:45 +00:00
|
|
|
func NoDiskConflict(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
2015-12-09 19:19:57 +00:00
|
|
|
for _, v := range pod.Spec.Volumes {
|
2016-01-28 20:14:45 +00:00
|
|
|
for _, ev := range nodeInfo.Pods() {
|
2015-12-09 19:19:57 +00:00
|
|
|
if isVolumeConflict(v, ev) {
|
2016-01-06 01:10:59 +00:00
|
|
|
return false, ErrDiskConflict
|
2014-10-13 04:34:23 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
2016-01-14 20:45:08 +00:00
|
|
|
type MaxPDVolumeCountChecker struct {
|
|
|
|
filter VolumeFilter
|
|
|
|
maxVolumes int
|
|
|
|
pvInfo PersistentVolumeInfo
|
|
|
|
pvcInfo PersistentVolumeClaimInfo
|
|
|
|
}
|
|
|
|
|
|
|
|
// VolumeFilter contains information on how to filter PD Volumes when checking PD Volume caps
|
|
|
|
type VolumeFilter struct {
|
|
|
|
// Filter normal volumes
|
|
|
|
FilterVolume func(vol *api.Volume) (id string, relevant bool)
|
|
|
|
FilterPersistentVolume func(pv *api.PersistentVolume) (id string, relevant bool)
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewMaxPDVolumeCountPredicate creates a predicate which evaluates whether a pod can fit based on the
|
|
|
|
// number of volumes which match a filter that it requests, and those that are already present. The
|
|
|
|
// maximum number is configurable to accommodate different systems.
|
|
|
|
//
|
|
|
|
// The predicate looks for both volumes used directly, as well as PVC volumes that are backed by relevant volume
|
|
|
|
// types, counts the number of unique volumes, and rejects the new pod if it would place the total count over
|
|
|
|
// the maximum.
|
|
|
|
func NewMaxPDVolumeCountPredicate(filter VolumeFilter, maxVolumes int, pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo) algorithm.FitPredicate {
|
|
|
|
c := &MaxPDVolumeCountChecker{
|
|
|
|
filter: filter,
|
|
|
|
maxVolumes: maxVolumes,
|
|
|
|
pvInfo: pvInfo,
|
|
|
|
pvcInfo: pvcInfo,
|
|
|
|
}
|
|
|
|
|
|
|
|
return c.predicate
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *MaxPDVolumeCountChecker) filterVolumes(volumes []api.Volume, namespace string, filteredVolumes map[string]bool) error {
|
|
|
|
for _, vol := range volumes {
|
|
|
|
if id, ok := c.filter.FilterVolume(&vol); ok {
|
|
|
|
filteredVolumes[id] = true
|
|
|
|
} else if vol.PersistentVolumeClaim != nil {
|
|
|
|
pvcName := vol.PersistentVolumeClaim.ClaimName
|
|
|
|
if pvcName == "" {
|
|
|
|
return fmt.Errorf("PersistentVolumeClaim had no name: %q", pvcName)
|
|
|
|
}
|
|
|
|
pvc, err := c.pvcInfo.GetPersistentVolumeClaimInfo(namespace, pvcName)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
pvName := pvc.Spec.VolumeName
|
|
|
|
if pvName == "" {
|
|
|
|
return fmt.Errorf("PersistentVolumeClaim is not bound: %q", pvcName)
|
|
|
|
}
|
|
|
|
|
|
|
|
pv, err := c.pvInfo.GetPersistentVolumeInfo(pvName)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if id, ok := c.filter.FilterPersistentVolume(pv); ok {
|
|
|
|
filteredVolumes[id] = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-01-28 20:14:45 +00:00
|
|
|
func (c *MaxPDVolumeCountChecker) predicate(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
2016-01-14 20:45:08 +00:00
|
|
|
newVolumes := make(map[string]bool)
|
|
|
|
if err := c.filterVolumes(pod.Spec.Volumes, pod.Namespace, newVolumes); err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// quick return
|
|
|
|
if len(newVolumes) == 0 {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// count unique volumes
|
|
|
|
existingVolumes := make(map[string]bool)
|
2016-01-28 20:14:45 +00:00
|
|
|
for _, existingPod := range nodeInfo.Pods() {
|
2016-01-14 20:45:08 +00:00
|
|
|
if err := c.filterVolumes(existingPod.Spec.Volumes, existingPod.Namespace, existingVolumes); err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
numExistingVolumes := len(existingVolumes)
|
|
|
|
|
|
|
|
// filter out already-mounted volumes
|
|
|
|
for k := range existingVolumes {
|
|
|
|
if _, ok := newVolumes[k]; ok {
|
|
|
|
delete(newVolumes, k)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
numNewVolumes := len(newVolumes)
|
|
|
|
|
|
|
|
if numExistingVolumes+numNewVolumes > c.maxVolumes {
|
2016-01-06 01:10:59 +00:00
|
|
|
// violates MaxEBSVolumeCount or MaxGCEPDVolumeCount
|
|
|
|
return false, ErrMaxVolumeCountExceeded
|
2016-01-14 20:45:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// EBSVolumeFilter is a VolumeFilter for filtering AWS ElasticBlockStore Volumes
|
|
|
|
var EBSVolumeFilter VolumeFilter = VolumeFilter{
|
|
|
|
FilterVolume: func(vol *api.Volume) (string, bool) {
|
|
|
|
if vol.AWSElasticBlockStore != nil {
|
|
|
|
return vol.AWSElasticBlockStore.VolumeID, true
|
|
|
|
}
|
|
|
|
return "", false
|
|
|
|
},
|
|
|
|
|
|
|
|
FilterPersistentVolume: func(pv *api.PersistentVolume) (string, bool) {
|
|
|
|
if pv.Spec.AWSElasticBlockStore != nil {
|
|
|
|
return pv.Spec.AWSElasticBlockStore.VolumeID, true
|
|
|
|
}
|
|
|
|
return "", false
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
// GCEPDVolumeFilter is a VolumeFilter for filtering GCE PersistentDisk Volumes
|
|
|
|
var GCEPDVolumeFilter VolumeFilter = VolumeFilter{
|
|
|
|
FilterVolume: func(vol *api.Volume) (string, bool) {
|
|
|
|
if vol.GCEPersistentDisk != nil {
|
|
|
|
return vol.GCEPersistentDisk.PDName, true
|
|
|
|
}
|
|
|
|
return "", false
|
|
|
|
},
|
|
|
|
|
|
|
|
FilterPersistentVolume: func(pv *api.PersistentVolume) (string, bool) {
|
|
|
|
if pv.Spec.GCEPersistentDisk != nil {
|
|
|
|
return pv.Spec.GCEPersistentDisk.PDName, true
|
|
|
|
}
|
|
|
|
return "", false
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
2015-11-29 19:00:49 +00:00
|
|
|
type VolumeZoneChecker struct {
|
2016-04-21 08:24:12 +00:00
|
|
|
pvInfo PersistentVolumeInfo
|
|
|
|
pvcInfo PersistentVolumeClaimInfo
|
2015-11-29 19:00:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// VolumeZonePredicate evaluates if a pod can fit due to the volumes it requests, given
|
|
|
|
// that some volumes may have zone scheduling constraints. The requirement is that any
|
|
|
|
// volume zone-labels must match the equivalent zone-labels on the node. It is OK for
|
|
|
|
// the node to have more zone-label constraints (for example, a hypothetical replicated
|
|
|
|
// volume might allow region-wide access)
|
|
|
|
//
|
|
|
|
// Currently this is only supported with PersistentVolumeClaims, and looks to the labels
|
|
|
|
// only on the bound PersistentVolume.
|
|
|
|
//
|
|
|
|
// Working with volumes declared inline in the pod specification (i.e. not
|
|
|
|
// using a PersistentVolume) is likely to be harder, as it would require
|
|
|
|
// determining the zone of a volume during scheduling, and that is likely to
|
|
|
|
// require calling out to the cloud provider. It seems that we are moving away
|
|
|
|
// from inline volume declarations anyway.
|
2016-04-21 08:24:12 +00:00
|
|
|
func NewVolumeZonePredicate(pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo) algorithm.FitPredicate {
|
2015-11-29 19:00:49 +00:00
|
|
|
c := &VolumeZoneChecker{
|
2016-04-21 08:24:12 +00:00
|
|
|
pvInfo: pvInfo,
|
|
|
|
pvcInfo: pvcInfo,
|
2015-11-29 19:00:49 +00:00
|
|
|
}
|
|
|
|
return c.predicate
|
|
|
|
}
|
|
|
|
|
2016-01-28 20:14:45 +00:00
|
|
|
func (c *VolumeZoneChecker) predicate(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
2016-04-21 08:24:12 +00:00
|
|
|
node := nodeInfo.Node()
|
2015-11-29 19:00:49 +00:00
|
|
|
if node == nil {
|
2016-01-28 20:14:45 +00:00
|
|
|
return false, fmt.Errorf("node not found: %q", nodeName)
|
2015-11-29 19:00:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
nodeConstraints := make(map[string]string)
|
|
|
|
for k, v := range node.ObjectMeta.Labels {
|
|
|
|
if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodeConstraints[k] = v
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(nodeConstraints) == 0 {
|
|
|
|
// The node has no zone constraints, so we're OK to schedule.
|
|
|
|
// In practice, when using zones, all nodes must be labeled with zone labels.
|
|
|
|
// We want to fast-path this case though.
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace := pod.Namespace
|
|
|
|
|
|
|
|
manifest := &(pod.Spec)
|
|
|
|
for i := range manifest.Volumes {
|
|
|
|
volume := &manifest.Volumes[i]
|
|
|
|
if volume.PersistentVolumeClaim != nil {
|
|
|
|
pvcName := volume.PersistentVolumeClaim.ClaimName
|
|
|
|
if pvcName == "" {
|
|
|
|
return false, fmt.Errorf("PersistentVolumeClaim had no name: %q", pvcName)
|
|
|
|
}
|
|
|
|
pvc, err := c.pvcInfo.GetPersistentVolumeClaimInfo(namespace, pvcName)
|
|
|
|
if err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if pvc == nil {
|
|
|
|
return false, fmt.Errorf("PersistentVolumeClaim was not found: %q", pvcName)
|
|
|
|
}
|
|
|
|
|
|
|
|
pvName := pvc.Spec.VolumeName
|
|
|
|
if pvName == "" {
|
|
|
|
return false, fmt.Errorf("PersistentVolumeClaim is not bound: %q", pvcName)
|
|
|
|
}
|
|
|
|
|
|
|
|
pv, err := c.pvInfo.GetPersistentVolumeInfo(pvName)
|
|
|
|
if err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if pv == nil {
|
|
|
|
return false, fmt.Errorf("PersistentVolume not found: %q", pvName)
|
|
|
|
}
|
|
|
|
|
|
|
|
for k, v := range pv.ObjectMeta.Labels {
|
|
|
|
if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
nodeV, _ := nodeConstraints[k]
|
|
|
|
if v != nodeV {
|
2016-01-28 20:14:45 +00:00
|
|
|
glog.V(2).Infof("Won't schedule pod %q onto node %q due to volume %q (mismatch on %q)", pod.Name, nodeName, pvName, k)
|
2016-01-06 01:10:59 +00:00
|
|
|
return false, ErrVolumeZoneConflict
|
2015-11-29 19:00:49 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
2014-09-25 20:55:42 +00:00
|
|
|
// resourceRequest is the CPU and memory total that getResourceRequest
// accumulates over a pod's containers.
type resourceRequest struct {
	// milliCPU is the summed CPU request, taken from MilliValue().
	milliCPU int64
	// memory is the summed memory request, taken from Value().
	memory int64
}
|
|
|
|
|
|
|
|
func getResourceRequest(pod *api.Pod) resourceRequest {
|
|
|
|
result := resourceRequest{}
|
2015-07-30 19:59:22 +00:00
|
|
|
for _, container := range pod.Spec.Containers {
|
|
|
|
requests := container.Resources.Requests
|
|
|
|
result.memory += requests.Memory().Value()
|
|
|
|
result.milliCPU += requests.Cpu().MilliValue()
|
2014-09-25 20:55:42 +00:00
|
|
|
}
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
2015-12-24 08:29:18 +00:00
|
|
|
func CheckPodsExceedingFreeResources(pods []*api.Pod, allocatable api.ResourceList) (fitting []*api.Pod, notFittingCPU, notFittingMemory []*api.Pod) {
|
|
|
|
totalMilliCPU := allocatable.Cpu().MilliValue()
|
|
|
|
totalMemory := allocatable.Memory().Value()
|
2015-03-16 12:50:00 +00:00
|
|
|
milliCPURequested := int64(0)
|
|
|
|
memoryRequested := int64(0)
|
2015-04-03 22:51:50 +00:00
|
|
|
for _, pod := range pods {
|
|
|
|
podRequest := getResourceRequest(pod)
|
2016-01-20 23:34:58 +00:00
|
|
|
fitsCPU := (totalMilliCPU - milliCPURequested) >= podRequest.milliCPU
|
|
|
|
fitsMemory := (totalMemory - memoryRequested) >= podRequest.memory
|
2015-07-24 01:27:29 +00:00
|
|
|
if !fitsCPU {
|
2015-08-12 03:16:40 +00:00
|
|
|
// the pod doesn't fit due to CPU request
|
2015-07-24 01:27:29 +00:00
|
|
|
notFittingCPU = append(notFittingCPU, pod)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if !fitsMemory {
|
2015-08-12 03:16:40 +00:00
|
|
|
// the pod doesn't fit due to Memory request
|
2015-07-24 01:27:29 +00:00
|
|
|
notFittingMemory = append(notFittingMemory, pod)
|
2015-03-20 16:52:32 +00:00
|
|
|
continue
|
2015-03-16 12:50:00 +00:00
|
|
|
}
|
2015-03-20 16:52:32 +00:00
|
|
|
// the pod fits
|
|
|
|
milliCPURequested += podRequest.milliCPU
|
|
|
|
memoryRequested += podRequest.memory
|
2015-04-03 22:51:50 +00:00
|
|
|
fitting = append(fitting, pod)
|
2015-03-16 12:50:00 +00:00
|
|
|
}
|
2015-03-20 16:52:32 +00:00
|
|
|
return
|
2015-03-16 12:50:00 +00:00
|
|
|
}
|
|
|
|
|
2015-10-19 22:00:41 +00:00
|
|
|
func podName(pod *api.Pod) string {
|
|
|
|
return pod.Namespace + "/" + pod.Name
|
|
|
|
}
|
|
|
|
|
2016-04-21 08:24:12 +00:00
|
|
|
func podFitsResourcesInternal(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
|
|
|
node := nodeInfo.Node()
|
|
|
|
if node == nil {
|
|
|
|
return false, fmt.Errorf("node not found: %q", nodeName)
|
|
|
|
}
|
|
|
|
allocatable := node.Status.Allocatable
|
2016-01-28 20:14:45 +00:00
|
|
|
allowedPodNumber := allocatable.Pods().Value()
|
2016-04-22 16:58:49 +00:00
|
|
|
if int64(len(nodeInfo.Pods()))+1 > allowedPodNumber {
|
|
|
|
return false,
|
|
|
|
newInsufficientResourceError(podCountResourceName, 1, int64(len(nodeInfo.Pods())), allowedPodNumber)
|
|
|
|
}
|
2015-12-09 21:24:54 +00:00
|
|
|
podRequest := getResourceRequest(pod)
|
|
|
|
if podRequest.milliCPU == 0 && podRequest.memory == 0 {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
2016-01-28 20:14:45 +00:00
|
|
|
totalMilliCPU := allocatable.Cpu().MilliValue()
|
|
|
|
totalMemory := allocatable.Memory().Value()
|
2016-01-06 01:10:59 +00:00
|
|
|
|
2016-01-28 20:14:45 +00:00
|
|
|
if totalMilliCPU < podRequest.milliCPU+nodeInfo.RequestedResource().MilliCPU {
|
|
|
|
return false,
|
|
|
|
newInsufficientResourceError(cpuResourceName, podRequest.milliCPU, nodeInfo.RequestedResource().MilliCPU, totalMilliCPU)
|
2015-07-24 01:27:29 +00:00
|
|
|
}
|
2016-01-28 20:14:45 +00:00
|
|
|
if totalMemory < podRequest.memory+nodeInfo.RequestedResource().Memory {
|
|
|
|
return false,
|
|
|
|
newInsufficientResourceError(memoryResoureceName, podRequest.memory, nodeInfo.RequestedResource().Memory, totalMemory)
|
2014-09-25 20:55:42 +00:00
|
|
|
}
|
2016-01-28 20:14:45 +00:00
|
|
|
glog.V(10).Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.",
|
|
|
|
podName(pod), nodeName, len(nodeInfo.Pods()), allowedPodNumber)
|
2015-03-16 12:50:00 +00:00
|
|
|
return true, nil
|
2014-09-25 20:55:42 +00:00
|
|
|
}
|
|
|
|
|
2016-04-21 08:24:12 +00:00
|
|
|
func PodFitsResources(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
|
|
|
return podFitsResourcesInternal(pod, nodeName, nodeInfo)
|
2014-10-22 00:13:52 +00:00
|
|
|
}
|
|
|
|
|
2016-02-23 03:53:10 +00:00
|
|
|
// nodeMatchesNodeSelectorTerms checks if a node's labels satisfy a list of node selector terms,
|
2016-01-26 23:03:18 +00:00
|
|
|
// terms are ORed, and an emtpy a list of terms will match nothing.
|
2016-02-23 03:53:10 +00:00
|
|
|
func nodeMatchesNodeSelectorTerms(node *api.Node, nodeSelectorTerms []api.NodeSelectorTerm) bool {
|
2016-01-26 23:03:18 +00:00
|
|
|
for _, req := range nodeSelectorTerms {
|
|
|
|
nodeSelector, err := api.NodeSelectorRequirementsAsSelector(req.MatchExpressions)
|
|
|
|
if err != nil {
|
|
|
|
glog.V(10).Infof("Failed to parse MatchExpressions: %+v, regarding as not match.", req.MatchExpressions)
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if nodeSelector.Matches(labels.Set(node.Labels)) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// The pod can only schedule onto nodes that satisfy requirements in both NodeAffinity and nodeSelector.
|
2015-03-20 16:52:32 +00:00
|
|
|
func PodMatchesNodeLabels(pod *api.Pod, node *api.Node) bool {
|
2016-01-26 23:03:18 +00:00
|
|
|
// Check if node.Labels match pod.Spec.NodeSelector.
|
|
|
|
if len(pod.Spec.NodeSelector) > 0 {
|
|
|
|
selector := labels.SelectorFromSet(pod.Spec.NodeSelector)
|
|
|
|
if !selector.Matches(labels.Set(node.Labels)) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parse required node affinity scheduling requirements
|
|
|
|
// and check if the current node match the requirements.
|
|
|
|
affinity, err := api.GetAffinityFromPodAnnotations(pod.Annotations)
|
|
|
|
if err != nil {
|
|
|
|
glog.V(10).Infof("Failed to get Affinity from Pod %+v, err: %+v", podName(pod), err)
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// 1. nil NodeSelector matches all nodes (i.e. does not filter out any nodes)
|
|
|
|
// 2. nil []NodeSelectorTerm (equivalent to non-nil empty NodeSelector) matches no nodes
|
|
|
|
// 3. zero-length non-nil []NodeSelectorTerm matches no nodes also, just for simplicity
|
|
|
|
// 4. nil []NodeSelectorRequirement (equivalent to non-nil empty NodeSelectorTerm) matches no nodes
|
|
|
|
// 5. zero-length non-nil []NodeSelectorRequirement matches no nodes also, just for simplicity
|
|
|
|
// 6. non-nil empty NodeSelectorRequirement is not allowed
|
|
|
|
nodeAffinityMatches := true
|
|
|
|
if affinity.NodeAffinity != nil {
|
|
|
|
nodeAffinity := affinity.NodeAffinity
|
|
|
|
// if no required NodeAffinity requirements, will do no-op, means select all nodes.
|
2016-02-11 07:06:33 +00:00
|
|
|
// TODO: Replace next line with subsequent commented-out line when implement RequiredDuringSchedulingRequiredDuringExecution.
|
|
|
|
if nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
|
|
|
|
// if nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution == nil && nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
|
2016-01-26 23:03:18 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
// Match node selector for requiredDuringSchedulingRequiredDuringExecution.
|
2016-02-11 07:06:33 +00:00
|
|
|
// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
|
|
|
|
// if nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution != nil {
|
|
|
|
// nodeSelectorTerms := nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution.NodeSelectorTerms
|
|
|
|
// glog.V(10).Infof("Match for RequiredDuringSchedulingRequiredDuringExecution node selector terms %+v", nodeSelectorTerms)
|
2016-02-23 03:53:10 +00:00
|
|
|
// nodeAffinityMatches = nodeMatchesNodeSelectorTerms(node, nodeSelectorTerms)
|
2016-02-11 07:06:33 +00:00
|
|
|
// }
|
|
|
|
|
|
|
|
// Match node selector for requiredDuringSchedulingIgnoredDuringExecution.
|
2016-01-26 23:03:18 +00:00
|
|
|
if nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
|
|
|
|
nodeSelectorTerms := nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
|
|
|
|
glog.V(10).Infof("Match for RequiredDuringSchedulingIgnoredDuringExecution node selector terms %+v", nodeSelectorTerms)
|
2016-02-23 03:53:10 +00:00
|
|
|
nodeAffinityMatches = nodeAffinityMatches && nodeMatchesNodeSelectorTerms(node, nodeSelectorTerms)
|
2016-01-26 23:03:18 +00:00
|
|
|
}
|
|
|
|
|
2015-03-20 16:52:32 +00:00
|
|
|
}
|
2016-01-26 23:03:18 +00:00
|
|
|
return nodeAffinityMatches
|
2015-03-20 16:52:32 +00:00
|
|
|
}
|
|
|
|
|
2016-04-21 08:24:12 +00:00
|
|
|
func PodSelectorMatches(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
|
|
|
node := nodeInfo.Node()
|
|
|
|
if node == nil {
|
|
|
|
return false, fmt.Errorf("node not found: %q", nodeName)
|
2014-10-22 00:13:52 +00:00
|
|
|
}
|
2016-01-06 01:10:59 +00:00
|
|
|
if PodMatchesNodeLabels(pod, node) {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
return false, ErrNodeSelectorNotMatch
|
2014-10-22 00:13:52 +00:00
|
|
|
}
|
|
|
|
|
2016-01-28 20:14:45 +00:00
|
|
|
func PodFitsHost(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
2015-05-22 23:40:57 +00:00
|
|
|
if len(pod.Spec.NodeName) == 0 {
|
2014-12-18 22:12:58 +00:00
|
|
|
return true, nil
|
|
|
|
}
|
2016-01-06 01:10:59 +00:00
|
|
|
if pod.Spec.NodeName == nodeName {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
return false, ErrPodNotMatchHostName
|
2014-12-18 22:12:58 +00:00
|
|
|
}
|
|
|
|
|
2014-12-22 21:54:41 +00:00
|
|
|
// NodeLabelChecker holds the configuration behind the predicate returned by
// NewNodeLabelPredicate.
type NodeLabelChecker struct {
	// labels lists the label keys looked for on the node.
	labels []string
	// presence selects whether the listed keys must all be present (true)
	// or must all be absent (false).
	presence bool
}
|
|
|
|
|
2016-04-21 08:24:12 +00:00
|
|
|
func NewNodeLabelPredicate(labels []string, presence bool) algorithm.FitPredicate {
|
2014-12-22 21:54:41 +00:00
|
|
|
labelChecker := &NodeLabelChecker{
|
|
|
|
labels: labels,
|
|
|
|
presence: presence,
|
|
|
|
}
|
|
|
|
return labelChecker.CheckNodeLabelPresence
|
|
|
|
}
|
|
|
|
|
2015-09-10 08:40:22 +00:00
|
|
|
// CheckNodeLabelPresence checks whether all of the specified labels exists on a node or not, regardless of their value
|
|
|
|
// If "presence" is false, then returns false if any of the requested labels matches any of the node's labels,
|
2015-01-05 22:51:22 +00:00
|
|
|
// otherwise returns true.
|
2015-09-10 08:40:22 +00:00
|
|
|
// If "presence" is true, then returns false if any of the requested labels does not match any of the node's labels,
|
2015-01-05 22:51:22 +00:00
|
|
|
// otherwise returns true.
|
|
|
|
//
|
2015-09-10 08:40:22 +00:00
|
|
|
// Consider the cases where the nodes are placed in regions/zones/racks and these are identified by labels
|
|
|
|
// In some cases, it is required that only nodes that are part of ANY of the defined regions/zones/racks be selected
|
2014-12-22 21:54:41 +00:00
|
|
|
//
|
2015-09-10 08:40:22 +00:00
|
|
|
// Alternately, eliminating nodes that have a certain label, regardless of value, is also useful
|
|
|
|
// A node may have a label with "retiring" as key and the date as the value
|
|
|
|
// and it may be desirable to avoid scheduling new pods on this node
|
2016-01-28 20:14:45 +00:00
|
|
|
func (n *NodeLabelChecker) CheckNodeLabelPresence(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
2016-04-21 08:24:12 +00:00
|
|
|
node := nodeInfo.Node()
|
|
|
|
if node == nil {
|
|
|
|
return false, fmt.Errorf("node not found: %q", nodeName)
|
2014-12-22 21:54:41 +00:00
|
|
|
}
|
2016-04-21 08:24:12 +00:00
|
|
|
|
|
|
|
var exists bool
|
2015-09-10 08:40:22 +00:00
|
|
|
nodeLabels := labels.Set(node.Labels)
|
2014-12-22 21:54:41 +00:00
|
|
|
for _, label := range n.labels {
|
2015-09-10 08:40:22 +00:00
|
|
|
exists = nodeLabels.Has(label)
|
2014-12-22 21:54:41 +00:00
|
|
|
if (exists && !n.presence) || (!exists && n.presence) {
|
2016-01-06 01:10:59 +00:00
|
|
|
return false, ErrNodeLabelPresenceViolated
|
2014-12-22 21:54:41 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
|
2014-12-22 23:55:31 +00:00
|
|
|
type ServiceAffinity struct {
|
2015-05-08 11:01:09 +00:00
|
|
|
podLister algorithm.PodLister
|
|
|
|
serviceLister algorithm.ServiceLister
|
2014-12-22 23:55:31 +00:00
|
|
|
nodeInfo NodeInfo
|
|
|
|
labels []string
|
|
|
|
}
|
|
|
|
|
2015-05-08 11:01:09 +00:00
|
|
|
func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister algorithm.ServiceLister, nodeInfo NodeInfo, labels []string) algorithm.FitPredicate {
|
2014-12-22 23:55:31 +00:00
|
|
|
affinity := &ServiceAffinity{
|
|
|
|
podLister: podLister,
|
|
|
|
serviceLister: serviceLister,
|
|
|
|
nodeInfo: nodeInfo,
|
|
|
|
labels: labels,
|
|
|
|
}
|
|
|
|
return affinity.CheckServiceAffinity
|
|
|
|
}
|
|
|
|
|
2015-09-10 08:40:22 +00:00
|
|
|
// CheckServiceAffinity ensures that only the nodes that match the specified labels are considered for scheduling.
|
2015-01-05 22:51:22 +00:00
|
|
|
// The set of labels to be considered are provided to the struct (ServiceAffinity).
|
2015-09-10 08:40:22 +00:00
|
|
|
// The pod is checked for the labels and any missing labels are then checked in the node
|
2015-01-05 22:51:22 +00:00
|
|
|
// that hosts the service pods (peers) for the given pod.
|
2015-01-08 06:18:22 +00:00
|
|
|
//
|
|
|
|
// We add an implicit selector requiring some particular value V for label L to a pod, if:
|
|
|
|
// - L is listed in the ServiceAffinity object that is passed into the function
|
|
|
|
// - the pod does not have any NodeSelector for L
|
2015-09-10 08:40:22 +00:00
|
|
|
// - some other pod from the same service is already scheduled onto a node that has value V for label L
|
2016-01-28 20:14:45 +00:00
|
|
|
func (s *ServiceAffinity) CheckServiceAffinity(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
2014-12-22 23:55:31 +00:00
|
|
|
var affinitySelector labels.Selector
|
|
|
|
|
2015-01-05 22:51:22 +00:00
|
|
|
// check if the pod being scheduled has the affinity labels specified in its NodeSelector
|
2014-12-22 23:55:31 +00:00
|
|
|
affinityLabels := map[string]string{}
|
2015-01-05 22:51:22 +00:00
|
|
|
nodeSelector := labels.Set(pod.Spec.NodeSelector)
|
2014-12-22 23:55:31 +00:00
|
|
|
labelsExist := true
|
|
|
|
for _, l := range s.labels {
|
2015-01-05 22:51:22 +00:00
|
|
|
if nodeSelector.Has(l) {
|
|
|
|
affinityLabels[l] = nodeSelector.Get(l)
|
2014-12-22 23:55:31 +00:00
|
|
|
} else {
|
|
|
|
// the current pod does not specify all the labels, look in the existing service pods
|
|
|
|
labelsExist = false
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// skip looking at other pods in the service if the current pod defines all the required affinity labels
|
|
|
|
if !labelsExist {
|
2015-01-05 22:51:22 +00:00
|
|
|
services, err := s.serviceLister.GetPodServices(pod)
|
2014-12-22 23:55:31 +00:00
|
|
|
if err == nil {
|
2015-01-05 22:51:22 +00:00
|
|
|
// just use the first service and get the other pods within the service
|
|
|
|
// TODO: a separate predicate can be created that tries to handle all services for the pod
|
|
|
|
selector := labels.SelectorFromSet(services[0].Spec.Selector)
|
2015-01-13 17:52:37 +00:00
|
|
|
servicePods, err := s.podLister.List(selector)
|
2014-12-22 23:55:31 +00:00
|
|
|
if err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
2015-03-06 22:29:44 +00:00
|
|
|
// consider only the pods that belong to the same namespace
|
2015-04-03 22:51:50 +00:00
|
|
|
nsServicePods := []*api.Pod{}
|
2015-03-06 22:29:44 +00:00
|
|
|
for _, nsPod := range servicePods {
|
|
|
|
if nsPod.Namespace == pod.Namespace {
|
|
|
|
nsServicePods = append(nsServicePods, nsPod)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if len(nsServicePods) > 0 {
|
2015-09-10 08:40:22 +00:00
|
|
|
// consider any service pod and fetch the node its hosted on
|
|
|
|
otherNode, err := s.nodeInfo.GetNodeInfo(nsServicePods[0].Spec.NodeName)
|
2014-12-22 23:55:31 +00:00
|
|
|
if err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
for _, l := range s.labels {
|
|
|
|
// If the pod being scheduled has the label value specified, do not override it
|
|
|
|
if _, exists := affinityLabels[l]; exists {
|
|
|
|
continue
|
|
|
|
}
|
2015-09-10 08:40:22 +00:00
|
|
|
if labels.Set(otherNode.Labels).Has(l) {
|
|
|
|
affinityLabels[l] = labels.Set(otherNode.Labels).Get(l)
|
2014-12-22 23:55:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-10 08:40:22 +00:00
|
|
|
// if there are no existing pods in the service, consider all nodes
|
2014-12-22 23:55:31 +00:00
|
|
|
if len(affinityLabels) == 0 {
|
|
|
|
affinitySelector = labels.Everything()
|
|
|
|
} else {
|
|
|
|
affinitySelector = labels.Set(affinityLabels).AsSelector()
|
|
|
|
}
|
|
|
|
|
2016-01-28 20:14:45 +00:00
|
|
|
node, err := s.nodeInfo.GetNodeInfo(nodeName)
|
2014-12-22 23:55:31 +00:00
|
|
|
if err != nil {
|
|
|
|
return false, err
|
|
|
|
}
|
|
|
|
|
2015-09-10 08:40:22 +00:00
|
|
|
// check if the node matches the selector
|
2016-01-06 01:10:59 +00:00
|
|
|
if affinitySelector.Matches(labels.Set(node.Labels)) {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
return false, ErrServiceAffinityViolated
|
2014-12-22 23:55:31 +00:00
|
|
|
}
|
|
|
|
|
2016-01-28 20:14:45 +00:00
|
|
|
func PodFitsHostPorts(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
2014-11-05 05:21:26 +00:00
|
|
|
wantPorts := getUsedPorts(pod)
|
2016-01-10 04:03:52 +00:00
|
|
|
if len(wantPorts) == 0 {
|
|
|
|
return true, nil
|
|
|
|
}
|
2016-01-28 20:14:45 +00:00
|
|
|
existingPorts := getUsedPorts(nodeInfo.Pods()...)
|
2014-11-05 05:21:26 +00:00
|
|
|
for wport := range wantPorts {
|
|
|
|
if wport == 0 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if existingPorts[wport] {
|
2016-01-06 01:10:59 +00:00
|
|
|
return false, ErrPodNotFitsHostPorts
|
2014-09-23 23:14:54 +00:00
|
|
|
}
|
2014-06-28 22:35:51 +00:00
|
|
|
}
|
2014-09-23 23:14:54 +00:00
|
|
|
return true, nil
|
2014-06-28 22:35:51 +00:00
|
|
|
}
|
|
|
|
|
2015-04-03 22:51:50 +00:00
|
|
|
func getUsedPorts(pods ...*api.Pod) map[int]bool {
|
2016-04-21 08:24:12 +00:00
|
|
|
// TODO: Aggregate it at the NodeInfo level.
|
2014-11-05 05:21:26 +00:00
|
|
|
ports := make(map[int]bool)
|
|
|
|
for _, pod := range pods {
|
2014-11-13 15:52:13 +00:00
|
|
|
for _, container := range pod.Spec.Containers {
|
2014-11-05 05:21:26 +00:00
|
|
|
for _, podPort := range container.Ports {
|
2016-04-21 08:24:12 +00:00
|
|
|
// "0" is explicitly ignored in PodFitsHostPorts,
|
|
|
|
// which is the only function that uses this value.
|
|
|
|
if podPort.HostPort != 0 {
|
|
|
|
ports[podPort.HostPort] = true
|
|
|
|
}
|
2014-06-28 22:35:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2014-11-05 05:21:26 +00:00
|
|
|
return ports
|
2014-06-28 22:35:51 +00:00
|
|
|
}
|
|
|
|
|
2015-10-20 17:24:23 +00:00
|
|
|
// haveSame reports whether a1 and a2 share at least one element. It returns
// false when either slice is empty or nil.
func haveSame(a1, a2 []string) bool {
	// contains reports whether needle occurs in haystack.
	contains := func(haystack []string, needle string) bool {
		for i := range haystack {
			if haystack[i] == needle {
				return true
			}
		}
		return false
	}

	for _, candidate := range a1 {
		if contains(a2, candidate) {
			return true
		}
	}
	return false
}
|
2016-01-06 01:10:59 +00:00
|
|
|
|
2016-04-21 08:24:12 +00:00
|
|
|
func GeneralPredicates(pod *api.Pod, nodeName string, nodeInfo *schedulercache.NodeInfo) (bool, error) {
|
|
|
|
fit, err := podFitsResourcesInternal(pod, nodeName, nodeInfo)
|
2016-01-06 01:10:59 +00:00
|
|
|
if !fit {
|
|
|
|
return fit, err
|
|
|
|
}
|
2016-04-22 16:58:49 +00:00
|
|
|
|
2016-01-06 01:10:59 +00:00
|
|
|
fit, err = PodFitsHost(pod, nodeName, nodeInfo)
|
|
|
|
if !fit {
|
|
|
|
return fit, err
|
|
|
|
}
|
|
|
|
fit, err = PodFitsHostPorts(pod, nodeName, nodeInfo)
|
|
|
|
if !fit {
|
|
|
|
return fit, err
|
|
|
|
}
|
2016-04-21 08:24:12 +00:00
|
|
|
fit, err = PodSelectorMatches(pod, nodeName, nodeInfo)
|
|
|
|
if !fit {
|
|
|
|
return fit, err
|
2016-01-06 01:10:59 +00:00
|
|
|
}
|
|
|
|
return true, nil
|
|
|
|
}
|