k3s/plugin/pkg/scheduler/algorithm/predicates/predicates.go


/*
Copyright 2014 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package predicates
import (
"fmt"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/cache"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
"github.com/golang/glog"
)
type NodeInfo interface {
GetNodeInfo(nodeID string) (*api.Node, error)
}
type StaticNodeInfo struct {
*api.NodeList
}
func (nodes StaticNodeInfo) GetNodeInfo(nodeID string) (*api.Node, error) {
for ix := range nodes.Items {
if nodes.Items[ix].Name == nodeID {
return &nodes.Items[ix], nil
}
}
return nil, fmt.Errorf("failed to find node: %s, %#v", nodeID, nodes)
}
type ClientNodeInfo struct {
*client.Client
}
func (nodes ClientNodeInfo) GetNodeInfo(nodeID string) (*api.Node, error) {
return nodes.Nodes().Get(nodeID)
}
type CachedNodeInfo struct {
*cache.StoreToNodeLister
}
// GetNodeInfo returns cached data for the node 'id'.
func (c *CachedNodeInfo) GetNodeInfo(id string) (*api.Node, error) {
node, exists, err := c.Get(&api.Node{ObjectMeta: api.ObjectMeta{Name: id}})
if err != nil {
return nil, fmt.Errorf("error retrieving node '%v' from cache: %v", id, err)
}
if !exists {
return nil, fmt.Errorf("node '%v' is not in cache", id)
}
return node.(*api.Node), nil
}
func isVolumeConflict(volume api.Volume, pod *api.Pod) bool {
if volume.GCEPersistentDisk != nil {
disk := volume.GCEPersistentDisk
existingPod := &(pod.Spec)
for ix := range existingPod.Volumes {
if existingPod.Volumes[ix].GCEPersistentDisk != nil &&
existingPod.Volumes[ix].GCEPersistentDisk.PDName == disk.PDName &&
!(existingPod.Volumes[ix].GCEPersistentDisk.ReadOnly && disk.ReadOnly) {
return true
}
}
}
if volume.AWSElasticBlockStore != nil {
volumeID := volume.AWSElasticBlockStore.VolumeID
existingPod := &(pod.Spec)
for ix := range existingPod.Volumes {
if existingPod.Volumes[ix].AWSElasticBlockStore != nil &&
existingPod.Volumes[ix].AWSElasticBlockStore.VolumeID == volumeID {
return true
}
}
}
if volume.RBD != nil {
mon := volume.RBD.CephMonitors
pool := volume.RBD.RBDPool
image := volume.RBD.RBDImage
existingPod := &(pod.Spec)
for ix := range existingPod.Volumes {
if existingPod.Volumes[ix].RBD != nil {
monM := existingPod.Volumes[ix].RBD.CephMonitors
poolM := existingPod.Volumes[ix].RBD.RBDPool
imageM := existingPod.Volumes[ix].RBD.RBDImage
if haveSame(mon, monM) && poolM == pool && imageM == image {
return true
}
}
}
}
return false
}
// NoDiskConflict evaluates whether a pod can fit on a node given the volumes it requests
// and the volumes already mounted there. If a volume that conflicts with one of the pod's
// volumes is already mounted on the node, the pod can't be scheduled there.
// This is GCE, Amazon EBS, and Ceph RBD specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods from mounting the same volume ID
// - Ceph RBD forbids two pods that share at least one monitor and use the same pool and image
// TODO: migrate this into some per-volume specific code?
func NoDiskConflict(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
podSpec := &(pod.Spec)
for ix := range podSpec.Volumes {
for podIx := range existingPods {
if isVolumeConflict(podSpec.Volumes[ix], existingPods[podIx]) {
return false, nil
}
}
}
return true, nil
}
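// Illustrative sketch (not part of the original file): how the GCE PD rule above plays out.
// Both pods below mount the hypothetical disk "shared-data"; because one of the mounts is
// read-write, NoDiskConflict reports a conflict. If both mounts were read-only it would not.
// The composite literals assume the api.Volume/VolumeSource layout of this era.
func examplePDConflict() {
	newPod := &api.Pod{
		Spec: api.PodSpec{
			Volumes: []api.Volume{{
				Name: "data",
				VolumeSource: api.VolumeSource{
					GCEPersistentDisk: &api.GCEPersistentDiskVolumeSource{PDName: "shared-data", ReadOnly: false},
				},
			}},
		},
	}
	existing := &api.Pod{
		Spec: api.PodSpec{
			Volumes: []api.Volume{{
				Name: "data",
				VolumeSource: api.VolumeSource{
					GCEPersistentDisk: &api.GCEPersistentDiskVolumeSource{PDName: "shared-data", ReadOnly: true},
				},
			}},
		},
	}
	fits, _ := NoDiskConflict(newPod, []*api.Pod{existing}, "node-1")
	glog.Infof("pod fits: %v", fits) // expected: false (read-write vs. read-only mount of the same PD)
}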
type ResourceFit struct {
info NodeInfo
}
type resourceRequest struct {
milliCPU int64
memory int64
}
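// FailedResourceType records, for the last pod rejected by PodFitsResources, which check
// failed ("PodExceedsMaxPodNumber", "PodExceedsFreeCPU" or "PodExceedsFreeMemory").
// Being package-level state, it reflects only the most recent failure.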
var FailedResourceType string
func getResourceRequest(pod *api.Pod) resourceRequest {
result := resourceRequest{}
for _, container := range pod.Spec.Containers {
requests := container.Resources.Requests
result.memory += requests.Memory().Value()
result.milliCPU += requests.Cpu().MilliValue()
}
return result
}
func CheckPodsExceedingFreeResources(pods []*api.Pod, capacity api.ResourceList) (fitting []*api.Pod, notFittingCPU, notFittingMemory []*api.Pod) {
totalMilliCPU := capacity.Cpu().MilliValue()
totalMemory := capacity.Memory().Value()
milliCPURequested := int64(0)
memoryRequested := int64(0)
for _, pod := range pods {
podRequest := getResourceRequest(pod)
fitsCPU := totalMilliCPU == 0 || (totalMilliCPU-milliCPURequested) >= podRequest.milliCPU
fitsMemory := totalMemory == 0 || (totalMemory-memoryRequested) >= podRequest.memory
if !fitsCPU {
// the pod doesn't fit due to CPU request
notFittingCPU = append(notFittingCPU, pod)
continue
}
if !fitsMemory {
// the pod doesn't fit due to Memory request
notFittingMemory = append(notFittingMemory, pod)
continue
}
// the pod fits
milliCPURequested += podRequest.milliCPU
memoryRequested += podRequest.memory
fitting = append(fitting, pod)
}
return
}
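// Worked example for CheckPodsExceedingFreeResources (numbers are hypothetical): with a
// capacity of 2000m CPU / 4Gi memory and pods requesting (1500m, 1Gi), (800m, 1Gi) and
// (200m, 1Gi) in that order, the first pod fits (2000m-0 >= 1500m), the second lands in
// notFittingCPU (2000m-1500m = 500m < 800m) and does not add to the running totals, and
// the third fits again (500m >= 200m). A capacity of 0 for a resource disables that
// resource's check entirely.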
func podName(pod *api.Pod) string {
return pod.Namespace + "/" + pod.Name
}
// PodFitsResources calculates fit based on requested, rather than used resources
func (r *ResourceFit) PodFitsResources(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
podRequest := getResourceRequest(pod)
info, err := r.info.GetNodeInfo(node)
if err != nil {
return false, err
}
if podRequest.milliCPU == 0 && podRequest.memory == 0 {
return int64(len(existingPods)) < info.Status.Capacity.Pods().Value(), nil
}
// copy existingPods before appending so the caller's slice and its backing array are not mutated
pods := append([]*api.Pod{}, existingPods...)
pods = append(pods, pod)
_, exceedingCPU, exceedingMemory := CheckPodsExceedingFreeResources(pods, info.Status.Capacity)
if int64(len(pods)) > info.Status.Capacity.Pods().Value() {
glog.V(10).Infof("Cannot schedule Pod %+v, because Node %+v is full, running %v out of %v Pods.", podName(pod), node, len(pods)-1, info.Status.Capacity.Pods().Value())
FailedResourceType = "PodExceedsMaxPodNumber"
return false, nil
}
if len(exceedingCPU) > 0 {
glog.V(10).Infof("Cannot schedule Pod %+v, because Node %v does not have sufficient CPU", podName(pod), node)
FailedResourceType = "PodExceedsFreeCPU"
return false, nil
}
if len(exceedingMemory) > 0 {
glog.V(10).Infof("Cannot schedule Pod %+v, because Node %v does not have sufficient Memory", podName(pod), node)
FailedResourceType = "PodExceedsFreeMemory"
return false, nil
}
glog.V(10).Infof("Schedule Pod %+v on Node %+v is allowed, Node is running only %v out of %v Pods.", podName(pod), node, len(pods)-1, info.Status.Capacity.Pods().Value())
return true, nil
}
func NewResourceFitPredicate(info NodeInfo) algorithm.FitPredicate {
fit := &ResourceFit{
info: info,
}
return fit.PodFitsResources
}
func NewSelectorMatchPredicate(info NodeInfo) algorithm.FitPredicate {
selector := &NodeSelector{
info: info,
}
return selector.PodSelectorMatches
}
func PodMatchesNodeLabels(pod *api.Pod, node *api.Node) bool {
if len(pod.Spec.NodeSelector) == 0 {
return true
}
selector := labels.SelectorFromSet(pod.Spec.NodeSelector)
return selector.Matches(labels.Set(node.Labels))
}
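// Illustrative sketch (hypothetical pod and node, not part of the original file): a pod whose
// NodeSelector asks for zone=us-east-1a matches only nodes carrying that label.
func exampleNodeSelectorMatch() {
	pod := &api.Pod{
		Spec: api.PodSpec{NodeSelector: map[string]string{"zone": "us-east-1a"}},
	}
	node := &api.Node{
		ObjectMeta: api.ObjectMeta{
			Name:   "node-1",
			Labels: map[string]string{"zone": "us-east-1a", "disk": "ssd"},
		},
	}
	glog.Infof("matches: %v", PodMatchesNodeLabels(pod, node)) // expected: true
}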
type NodeSelector struct {
info NodeInfo
}
func (n *NodeSelector) PodSelectorMatches(pod *api.Pod, existingPods []*api.Pod, nodeID string) (bool, error) {
node, err := n.info.GetNodeInfo(nodeID)
if err != nil {
return false, err
}
return PodMatchesNodeLabels(pod, node), nil
}
func PodFitsHost(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
if len(pod.Spec.NodeName) == 0 {
return true, nil
}
return pod.Spec.NodeName == node, nil
}
type NodeLabelChecker struct {
info NodeInfo
labels []string
presence bool
}
func NewNodeLabelPredicate(info NodeInfo, labels []string, presence bool) algorithm.FitPredicate {
labelChecker := &NodeLabelChecker{
info: info,
labels: labels,
presence: presence,
}
return labelChecker.CheckNodeLabelPresence
}
// CheckNodeLabelPresence checks whether all of the specified labels exist on a node, regardless of their value.
// If "presence" is false, it returns false if any of the requested labels matches any of the node's labels,
// otherwise it returns true.
// If "presence" is true, it returns false if any of the requested labels does not match any of the node's labels,
// otherwise it returns true.
//
// Consider the cases where the nodes are placed in regions/zones/racks and these are identified by labels
// In some cases, it is required that only nodes that are part of ANY of the defined regions/zones/racks be selected
//
// Alternatively, eliminating nodes that have a certain label, regardless of value, is also useful.
// A node may have a label with "retiring" as the key and the date as the value,
// and it may be desirable to avoid scheduling new pods on such a node.
func (n *NodeLabelChecker) CheckNodeLabelPresence(pod *api.Pod, existingPods []*api.Pod, nodeID string) (bool, error) {
var exists bool
node, err := n.info.GetNodeInfo(nodeID)
if err != nil {
return false, err
}
nodeLabels := labels.Set(node.Labels)
for _, label := range n.labels {
exists = nodeLabels.Has(label)
if (exists && !n.presence) || (!exists && n.presence) {
return false, nil
}
}
return true, nil
}
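// Illustrative sketch (hypothetical node and labels, not part of the original file): with
// presence=true the predicate keeps only nodes that carry every listed label key, e.g.
// requiring "region" below; with presence=false it would instead reject such nodes.
func exampleNodeLabelPresence() {
	nodes := StaticNodeInfo{&api.NodeList{Items: []api.Node{
		{ObjectMeta: api.ObjectMeta{Name: "node-1", Labels: map[string]string{"region": "us-east"}}},
	}}}
	predicate := NewNodeLabelPredicate(nodes, []string{"region"}, true)
	fits, _ := predicate(&api.Pod{}, nil, "node-1")
	glog.Infof("node-1 fits: %v", fits) // expected: true, since node-1 has the "region" label
}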
type ServiceAffinity struct {
podLister algorithm.PodLister
serviceLister algorithm.ServiceLister
nodeInfo NodeInfo
labels []string
}
func NewServiceAffinityPredicate(podLister algorithm.PodLister, serviceLister algorithm.ServiceLister, nodeInfo NodeInfo, labels []string) algorithm.FitPredicate {
affinity := &ServiceAffinity{
podLister: podLister,
serviceLister: serviceLister,
nodeInfo: nodeInfo,
labels: labels,
}
return affinity.CheckServiceAffinity
}
// CheckServiceAffinity ensures that only nodes that match the specified labels are considered for scheduling.
// The set of labels to be considered is provided to the struct (ServiceAffinity).
// The pod is checked for the labels; any missing labels are then looked up on the node
// that hosts the service pods (peers) of the given pod.
//
// We add an implicit selector requiring some particular value V for label L to a pod, if:
// - L is listed in the ServiceAffinity object that is passed into the function
// - the pod does not have any NodeSelector for L
// - some other pod from the same service is already scheduled onto a node that has value V for label L
func (s *ServiceAffinity) CheckServiceAffinity(pod *api.Pod, existingPods []*api.Pod, nodeID string) (bool, error) {
var affinitySelector labels.Selector
// check if the pod being scheduled has the affinity labels specified in its NodeSelector
affinityLabels := map[string]string{}
nodeSelector := labels.Set(pod.Spec.NodeSelector)
labelsExist := true
for _, l := range s.labels {
if nodeSelector.Has(l) {
affinityLabels[l] = nodeSelector.Get(l)
} else {
// the current pod does not specify all the labels, look in the existing service pods
labelsExist = false
}
}
// skip looking at other pods in the service if the current pod defines all the required affinity labels
if !labelsExist {
services, err := s.serviceLister.GetPodServices(pod)
if err == nil {
// just use the first service and get the other pods within the service
// TODO: a separate predicate can be created that tries to handle all services for the pod
selector := labels.SelectorFromSet(services[0].Spec.Selector)
servicePods, err := s.podLister.List(selector)
if err != nil {
return false, err
}
// consider only the pods that belong to the same namespace
nsServicePods := []*api.Pod{}
for _, nsPod := range servicePods {
if nsPod.Namespace == pod.Namespace {
nsServicePods = append(nsServicePods, nsPod)
}
}
if len(nsServicePods) > 0 {
// consider any service pod and fetch the node its hosted on
otherNode, err := s.nodeInfo.GetNodeInfo(nsServicePods[0].Spec.NodeName)
if err != nil {
return false, err
}
for _, l := range s.labels {
// If the pod being scheduled has the label value specified, do not override it
if _, exists := affinityLabels[l]; exists {
continue
}
if labels.Set(otherNode.Labels).Has(l) {
affinityLabels[l] = labels.Set(otherNode.Labels).Get(l)
}
}
}
}
}
// if there are no existing pods in the service, consider all nodes
if len(affinityLabels) == 0 {
affinitySelector = labels.Everything()
} else {
affinitySelector = labels.Set(affinityLabels).AsSelector()
}
node, err := s.nodeInfo.GetNodeInfo(nodeID)
if err != nil {
return false, err
}
// check if the node matches the selector
return affinitySelector.Matches(labels.Set(node.Labels)), nil
}
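// Worked example for CheckServiceAffinity (hypothetical labels): with ServiceAffinity
// labels = ["region"], a new pod of service S whose NodeSelector has no "region" entry
// inherits region=us-east if some already-scheduled pod of S runs on a node labeled
// region=us-east; the predicate then admits only nodes carrying region=us-east. If no
// peer pod exists and the pod specifies nothing, every node matches (labels.Everything()).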
func PodFitsHostPorts(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) {
existingPorts := getUsedPorts(existingPods...)
wantPorts := getUsedPorts(pod)
for wport := range wantPorts {
if wport == 0 {
continue
}
if existingPorts[wport] {
return false, nil
}
}
return true, nil
}
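// Illustrative sketch (hypothetical pods, not part of the original file): two pods asking
// for the same HostPort cannot land on the same node; HostPort 0 (no host port requested)
// never conflicts.
func exampleHostPortConflict() {
	newPod := &api.Pod{Spec: api.PodSpec{Containers: []api.Container{
		{Ports: []api.ContainerPort{{HostPort: 8080}}},
	}}}
	existing := &api.Pod{Spec: api.PodSpec{Containers: []api.Container{
		{Ports: []api.ContainerPort{{HostPort: 8080}}},
	}}}
	fits, _ := PodFitsHostPorts(newPod, []*api.Pod{existing}, "node-1")
	glog.Infof("pod fits: %v", fits) // expected: false, both pods want host port 8080
}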
func getUsedPorts(pods ...*api.Pod) map[int]bool {
ports := make(map[int]bool)
for _, pod := range pods {
for _, container := range pod.Spec.Containers {
for _, podPort := range container.Ports {
ports[podPort.HostPort] = true
}
}
}
return ports
}
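// filterNonRunningPods drops pods that have already terminated (Succeeded or Failed),
// since they no longer hold resources or host ports on their node.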
func filterNonRunningPods(pods []*api.Pod) []*api.Pod {
if len(pods) == 0 {
return pods
}
result := []*api.Pod{}
for _, pod := range pods {
if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed {
continue
}
result = append(result, pod)
}
return result
}
// MapPodsToMachines obtains a list of pods and pivots that list into a map where the keys are host names
// and the values are the lists of pods running on those hosts.
func MapPodsToMachines(lister algorithm.PodLister) (map[string][]*api.Pod, error) {
machineToPods := map[string][]*api.Pod{}
// TODO: perform more targeted query...
pods, err := lister.List(labels.Everything())
if err != nil {
return map[string][]*api.Pod{}, err
}
pods = filterNonRunningPods(pods)
for _, scheduledPod := range pods {
host := scheduledPod.Spec.NodeName
machineToPods[host] = append(machineToPods[host], scheduledPod)
}
return machineToPods, nil
}
// haveSame returns true if the two string slices have at least one element in common, false otherwise.
func haveSame(a1, a2 []string) bool {
for _, val1 := range a1 {
for _, val2 := range a2 {
if val1 == val2 {
return true
}
}
}
return false
}