/*
Copyright 2014 Google Inc. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"fmt"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
	"github.com/golang/glog"
)

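// NodeInfo describes anything that can retrieve a node (minion) by its ID.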
type NodeInfo interface {
	GetNodeInfo(nodeID string) (*api.Node, error)
}

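// StaticNodeInfo implements NodeInfo over a fixed, in-memory list of nodes.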
type StaticNodeInfo struct {
	*api.NodeList
}

func (nodes StaticNodeInfo) GetNodeInfo(nodeID string) (*api.Node, error) {
	for ix := range nodes.Items {
		if nodes.Items[ix].Name == nodeID {
			return &nodes.Items[ix], nil
		}
	}
	return nil, fmt.Errorf("failed to find node: %s, %#v", nodeID, nodes)
}

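// ClientNodeInfo implements NodeInfo by querying the API server through the embedded client.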
type ClientNodeInfo struct {
	*client.Client
}

func (nodes ClientNodeInfo) GetNodeInfo(nodeID string) (*api.Node, error) {
	return nodes.Nodes().Get(nodeID)
}

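// isVolumeConflict reports whether the given volume refers to a GCE persistent disk
// that is already mounted by the given pod. Volumes that are not GCE persistent disks
// never conflict.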
func isVolumeConflict(volume api.Volume, pod *api.Pod) bool {
	if volume.GCEPersistentDisk == nil {
		return false
	}
	pdName := volume.GCEPersistentDisk.PDName

	manifest := &(pod.Spec)
	for ix := range manifest.Volumes {
		if manifest.Volumes[ix].GCEPersistentDisk != nil &&
			manifest.Volumes[ix].GCEPersistentDisk.PDName == pdName {
			return true
		}
	}
	return false
}

// NoDiskConflict evaluates if a pod can fit based on the volumes it requests and those that
// are already mounted. Some types of volumes are mounted onto node machines. For now, these mounts
// are exclusive, so if a volume is already mounted on a node, another pod that uses the same
// volume can't be scheduled there. This is GCE specific for now.
// TODO: migrate this into some per-volume specific code?
func NoDiskConflict(pod api.Pod, existingPods []api.Pod, node string) (bool, error) {
	manifest := &(pod.Spec)
	for ix := range manifest.Volumes {
		for podIx := range existingPods {
			if isVolumeConflict(manifest.Volumes[ix], &existingPods[podIx]) {
				return false, nil
			}
		}
	}
	return true, nil
}

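// ResourceFit checks whether a pod's resource requests fit within a node's remaining capacity.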
type ResourceFit struct {
	info NodeInfo
}

type resourceRequest struct {
	milliCPU int64
	memory   int64
}

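// getResourceRequest sums the CPU (in millicores) and memory limits across all containers in the pod.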
func getResourceRequest(pod *api.Pod) resourceRequest {
	result := resourceRequest{}
	for ix := range pod.Spec.Containers {
		limits := pod.Spec.Containers[ix].Resources.Limits
		result.memory += limits.Memory().Value()
		result.milliCPU += limits.Cpu().MilliValue()
	}
	return result
}

// PodFitsResources calculates fit based on requested, rather than used, resources.
func (r *ResourceFit) PodFitsResources(pod api.Pod, existingPods []api.Pod, node string) (bool, error) {
	podRequest := getResourceRequest(&pod)
	if podRequest.milliCPU == 0 && podRequest.memory == 0 {
		// no resources requested always fits.
		return true, nil
	}
	info, err := r.info.GetNodeInfo(node)
	if err != nil {
		return false, err
	}
	milliCPURequested := int64(0)
	memoryRequested := int64(0)
	for ix := range existingPods {
		existingRequest := getResourceRequest(&existingPods[ix])
		milliCPURequested += existingRequest.milliCPU
		memoryRequested += existingRequest.memory
	}

	totalMilliCPU := info.Spec.Capacity.Cpu().MilliValue()
	totalMemory := info.Spec.Capacity.Memory().Value()

	fitsCPU := totalMilliCPU == 0 || (totalMilliCPU-milliCPURequested) >= podRequest.milliCPU
	fitsMemory := totalMemory == 0 || (totalMemory-memoryRequested) >= podRequest.memory
	glog.V(3).Infof("Calculated fit: cpu: %v, memory %v", fitsCPU, fitsMemory)

	return fitsCPU && fitsMemory, nil
}

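// NewResourceFitPredicate returns a FitPredicate that checks resource fit using the given NodeInfo.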
func NewResourceFitPredicate(info NodeInfo) FitPredicate {
	fit := &ResourceFit{
		info: info,
	}
	return fit.PodFitsResources
}

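// NewSelectorMatchPredicate returns a FitPredicate that checks a pod's node selector against node labels.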
func NewSelectorMatchPredicate(info NodeInfo) FitPredicate {
	selector := &NodeSelector{
		info: info,
	}
	return selector.PodSelectorMatches
}

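// NodeSelector matches a pod's NodeSelector against the labels of the candidate node.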
type NodeSelector struct {
	info NodeInfo
}

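// PodSelectorMatches returns true if the pod has no node selector, or if the candidate
// minion's labels satisfy the pod's node selector.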
func (n *NodeSelector) PodSelectorMatches(pod api.Pod, existingPods []api.Pod, node string) (bool, error) {
	if len(pod.Spec.NodeSelector) == 0 {
		return true, nil
	}
	selector := labels.SelectorFromSet(pod.Spec.NodeSelector)
	minion, err := n.info.GetNodeInfo(node)
	if err != nil {
		return false, err
	}
	return selector.Matches(labels.Set(minion.Labels)), nil
}

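// PodFitsHost returns true if the pod does not request a specific host, or if the
// requested host matches the candidate node.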
func PodFitsHost(pod api.Pod, existingPods []api.Pod, node string) (bool, error) {
	if len(pod.Spec.Host) == 0 {
		return true, nil
	}
	return pod.Spec.Host == node, nil
}

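// NodeLabelChecker checks for the presence (or absence) of a set of labels on a node.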
type NodeLabelChecker struct {
	info     NodeInfo
	labels   []string
	presence bool
}

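// NewNodeLabelPredicate returns a FitPredicate that requires the given labels to be
// present (presence=true) or absent (presence=false) on the candidate node.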
func NewNodeLabelPredicate(info NodeInfo, labels []string, presence bool) FitPredicate {
	labelChecker := &NodeLabelChecker{
		info:     info,
		labels:   labels,
		presence: presence,
	}
	return labelChecker.CheckNodeLabelPresence
}

// CheckNodeLabelPresence checks whether all of the specified labels exist on a minion, regardless of their value.
// If "presence" is false, then it returns false if any of the requested labels matches any of the minion's labels,
// otherwise it returns true.
// If "presence" is true, then it returns false if any of the requested labels does not match any of the minion's labels,
// otherwise it returns true.
//
// Consider the cases where the minions are placed in regions/zones/racks and these are identified by labels.
// In some cases, it is required that only minions that are part of ANY of the defined regions/zones/racks be selected.
//
// Alternately, eliminating minions that have a certain label, regardless of value, is also useful.
// A minion may have a label with "retiring" as key and the date as the value,
// and it may be desirable to avoid scheduling new pods on this minion.
func (n *NodeLabelChecker) CheckNodeLabelPresence(pod api.Pod, existingPods []api.Pod, node string) (bool, error) {
	var exists bool
	minion, err := n.info.GetNodeInfo(node)
	if err != nil {
		return false, err
	}
	minionLabels := labels.Set(minion.Labels)
	for _, label := range n.labels {
		exists = minionLabels.Has(label)
		if (exists && !n.presence) || (!exists && n.presence) {
			return false, nil
		}
	}
	return true, nil
}

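// ServiceAffinity co-locates pods of the same service on minions that share the values
// of the configured labels.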
type ServiceAffinity struct {
	podLister     PodLister
	serviceLister ServiceLister
	nodeInfo      NodeInfo
	labels        []string
}

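// NewServiceAffinityPredicate returns a FitPredicate that enforces service affinity for the given labels.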
func NewServiceAffinityPredicate(podLister PodLister, serviceLister ServiceLister, nodeInfo NodeInfo, labels []string) FitPredicate {
	affinity := &ServiceAffinity{
		podLister:     podLister,
		serviceLister: serviceLister,
		nodeInfo:      nodeInfo,
		labels:        labels,
	}
	return affinity.CheckServiceAffinity
}

// CheckServiceAffinity ensures that only the minions that match the specified labels are considered for scheduling.
// The set of labels to be considered is provided to the struct (ServiceAffinity).
// The pod is checked for the labels, and any missing labels are then checked on the minion
// that hosts the service pods (peers) for the given pod.
//
// We add an implicit selector requiring some particular value V for label L to a pod, if:
// - L is listed in the ServiceAffinity object that is passed into the function
// - the pod does not have any NodeSelector for L
// - some other pod from the same service is already scheduled onto a minion that has value V for label L
func (s *ServiceAffinity) CheckServiceAffinity(pod api.Pod, existingPods []api.Pod, node string) (bool, error) {
	var affinitySelector labels.Selector

	// check if the pod being scheduled has the affinity labels specified in its NodeSelector
	affinityLabels := map[string]string{}
	nodeSelector := labels.Set(pod.Spec.NodeSelector)
	labelsExist := true
	for _, l := range s.labels {
		if nodeSelector.Has(l) {
			affinityLabels[l] = nodeSelector.Get(l)
		} else {
			// the current pod does not specify all the labels, look in the existing service pods
			labelsExist = false
		}
	}

	// skip looking at other pods in the service if the current pod defines all the required affinity labels
	if !labelsExist {
		services, err := s.serviceLister.GetPodServices(pod)
		if err == nil {
			// just use the first service and get the other pods within the service
			// TODO: a separate predicate can be created that tries to handle all services for the pod
			selector := labels.SelectorFromSet(services[0].Spec.Selector)
			servicePods, err := s.podLister.List(selector)
			if err != nil {
				return false, err
			}
			// consider only the pods that belong to the same namespace
			nsServicePods := []api.Pod{}
			for _, nsPod := range servicePods {
				if nsPod.Namespace == pod.Namespace {
					nsServicePods = append(nsServicePods, nsPod)
				}
			}
			if len(nsServicePods) > 0 {
				// consider any service pod and fetch the minion it is hosted on
				otherMinion, err := s.nodeInfo.GetNodeInfo(nsServicePods[0].Status.Host)
				if err != nil {
					return false, err
				}
				for _, l := range s.labels {
					// If the pod being scheduled has the label value specified, do not override it
					if _, exists := affinityLabels[l]; exists {
						continue
					}
					if labels.Set(otherMinion.Labels).Has(l) {
						affinityLabels[l] = labels.Set(otherMinion.Labels).Get(l)
					}
				}
			}
		}
	}

	// if there are no existing pods in the service, consider all minions
	if len(affinityLabels) == 0 {
		affinitySelector = labels.Everything()
	} else {
		affinitySelector = labels.Set(affinityLabels).AsSelector()
	}

	minion, err := s.nodeInfo.GetNodeInfo(node)
	if err != nil {
		return false, err
	}

	// check if the minion matches the selector
	return affinitySelector.Matches(labels.Set(minion.Labels)), nil
}

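// PodFitsPorts returns true if none of the host ports requested by the pod are already
// in use by pods running on the node.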
func PodFitsPorts(pod api.Pod, existingPods []api.Pod, node string) (bool, error) {
	existingPorts := getUsedPorts(existingPods...)
	wantPorts := getUsedPorts(pod)
	for wport := range wantPorts {
		if wport == 0 {
			continue
		}
		if existingPorts[wport] {
			return false, nil
		}
	}
	return true, nil
}

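// getUsedPorts returns the set of host ports declared by the containers of the given pods.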
func getUsedPorts(pods ...api.Pod) map[int]bool {
	ports := make(map[int]bool)
	for _, pod := range pods {
		for _, container := range pod.Spec.Containers {
			for _, podPort := range container.Ports {
				ports[podPort.HostPort] = true
			}
		}
	}
	return ports
}

// MapPodsToMachines obtains a list of pods and pivots that list into a map where the keys are host names
// and the values are the lists of pods running on those hosts.
func MapPodsToMachines(lister PodLister) (map[string][]api.Pod, error) {
	machineToPods := map[string][]api.Pod{}
	// TODO: perform more targeted query...
	pods, err := lister.List(labels.Everything())
	if err != nil {
		return map[string][]api.Pod{}, err
	}
	for _, scheduledPod := range pods {
		// TODO: switch to Spec.Host! There was some confusion previously
		// about whether components should judge a pod's location
		// based on spec.Host or status.Host. It has been decided that
		// spec.Host is the canonical location of the pod. Status.Host
		// will either be removed, be a copy, or in theory it could be
		// used as a signal that kubelet has agreed to run the pod.
		//
		// This could be fixed now, but just requires someone to try it
		// and verify that e2e still passes.
		host := scheduledPod.Status.Host
		machineToPods[host] = append(machineToPods[host], scheduledPod)
	}
	return machineToPods, nil
}