Ubernetes Lite: Volumes can dictate zone scheduling

For AWS EBS, a volume can only be attached to a node in the same AZ.
The scheduler must therefore detect if a volume is being attached to a
pod, and ensure that the pod is scheduled on a node in the same AZ as
the volume.

So that the scheduler need not query the cloud provider every time, and
to support decoupled operation (e.g. bare metal) we tag the volume with
our placement labels.  This is done automatically by means of an
admission controller on AWS when a PersistentVolume is created backed by
an EBS volume.

Support for tagging GCE PVs will follow.

Pods that specify a volume directly (i.e. without using a
PersistentVolumeClaim) will not currently be scheduled correctly (i.e.
they will be scheduled without zone-awareness).
pull/6/head
Justin Santa Barbara 2015-11-29 14:00:49 -05:00
parent 7743a4ca89
commit f9a6ac077e
15 changed files with 536 additions and 4 deletions

View File

@ -126,7 +126,7 @@ if [[ "${ENABLE_NODE_AUTOSCALER}" == "true" ]]; then
fi
# Admission Controllers to invoke prior to persisting objects in cluster
ADMISSION_CONTROL=NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota
ADMISSION_CONTROL=NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota,PersistentVolumeLabel
# Optional: Enable/disable public IP assignment for minions.
# Important Note: disable only if you have setup a NAT instance for internet access and configured appropriate routes!

View File

@ -122,7 +122,7 @@ if [[ "${ENABLE_NODE_AUTOSCALER}" == "true" ]]; then
fi
# Admission Controllers to invoke prior to persisting objects in cluster
ADMISSION_CONTROL=NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota
ADMISSION_CONTROL=NamespaceLifecycle,LimitRanger,SecurityContextDeny,ServiceAccount,ResourceQuota,PersistentVolumeLabel
# Optional: Enable/disable public IP assignment for minions.
# Important Note: disable only if you have setup a NAT instance for internet access and configured appropriate routes!

View File

@ -33,6 +33,7 @@ import (
_ "k8s.io/kubernetes/plugin/pkg/admission/namespace/autoprovision"
_ "k8s.io/kubernetes/plugin/pkg/admission/namespace/exists"
_ "k8s.io/kubernetes/plugin/pkg/admission/namespace/lifecycle"
_ "k8s.io/kubernetes/plugin/pkg/admission/persistentvolume/label"
_ "k8s.io/kubernetes/plugin/pkg/admission/resourcequota"
_ "k8s.io/kubernetes/plugin/pkg/admission/securitycontext/scdeny"
_ "k8s.io/kubernetes/plugin/pkg/admission/serviceaccount"

View File

@ -51,7 +51,7 @@ kube-apiserver
### Options
```
--admission-control="AlwaysAdmit": Ordered list of plug-ins to do admission control of resources into cluster. Comma-delimited list of: AlwaysAdmit, AlwaysDeny, AlwaysPullImages, DenyEscalatingExec, DenyExecOnPrivileged, InitialResources, LimitRanger, NamespaceAutoProvision, NamespaceExists, NamespaceLifecycle, ResourceQuota, SecurityContextDeny, ServiceAccount
--admission-control="AlwaysAdmit": Ordered list of plug-ins to do admission control of resources into cluster. Comma-delimited list of: AlwaysAdmit, AlwaysDeny, AlwaysPullImages, DenyEscalatingExec, DenyExecOnPrivileged, InitialResources, LimitRanger, NamespaceAutoProvision, NamespaceExists, NamespaceLifecycle, PersistentVolumeLabel, ResourceQuota, SecurityContextDeny, ServiceAccount
--admission-control-config-file="": File with admission control configuration.
--advertise-address=<nil>: The IP address on which to advertise the apiserver to members of the cluster. This address must be reachable by the rest of the cluster. If blank, the --bind-address will be used. If --bind-address is unspecified, the host's default interface will be used.
--allow-privileged[=false]: If true, allow privileged containers.

View File

@ -41,6 +41,7 @@ For each unscheduled Pod, the Kubernetes scheduler tries to find a node across t
The purpose of filtering the nodes is to filter out the nodes that do not meet certain requirements of the Pod. For example, if the free resource on a node (measured by the capacity minus the sum of the resource requests of all the Pods that already run on the node) is less than the Pod's required resource, the node should not be considered in the ranking phase so it is filtered out. Currently, there are several "predicates" implementing different filtering policies, including:
- `NoDiskConflict`: Evaluate if a pod can fit due to the volumes it requests, and those that are already mounted.
- `NoVolumeZoneConflict`: Evaluate if the volumes a pod requests are available on the node, given the Zone restrictions.
- `PodFitsResources`: Check if the free resource (CPU and Memory) meets the requirement of the Pod. The free resource is measured by the capacity minus the sum of requests of all Pods on the node. To learn more about the resource QoS in Kubernetes, please check [QoS proposal](../proposals/resource-qos.md).
- `PodFitsHostPorts`: Check if any HostPort required by the Pod is already occupied on the node.
- `PodFitsHost`: Filter out all nodes except the one specified in the PodSpec's NodeName field.

View File

@ -5,6 +5,7 @@
{"name" : "PodFitsPorts"},
{"name" : "PodFitsResources"},
{"name" : "NoDiskConflict"},
{"name" : "NoVolumeZoneConflict"},
{"name" : "MatchNodeSelector"},
{"name" : "HostName"}
],

View File

@ -417,3 +417,42 @@ func (s *StoreToJobLister) GetPodJobs(pod *api.Pod) (jobs []extensions.Job, err
}
return
}
// Typed wrapper around a store of PersistentVolumes
type StoreToPVFetcher struct {
Store
}
// GetPersistentVolumeInfo returns cached data for the PersistentVolume 'id'.
func (s *StoreToPVFetcher) GetPersistentVolumeInfo(id string) (*api.PersistentVolume, error) {
o, exists, err := s.Get(&api.PersistentVolume{ObjectMeta: api.ObjectMeta{Name: id}})
if err != nil {
return nil, fmt.Errorf("error retrieving PersistentVolume '%v' from cache: %v", id, err)
}
if !exists {
return nil, fmt.Errorf("PersistentVolume '%v' is not in cache", id)
}
return o.(*api.PersistentVolume), nil
}
// Typed wrapper around a store of PersistentVolumeClaims
type StoreToPVCFetcher struct {
Store
}
// GetPersistentVolumeClaimInfo returns cached data for the PersistentVolumeClaim 'id'.
func (s *StoreToPVCFetcher) GetPersistentVolumeClaimInfo(namespace string, id string) (*api.PersistentVolumeClaim, error) {
o, exists, err := s.Get(&api.PersistentVolumeClaim{ObjectMeta: api.ObjectMeta{Namespace: namespace, Name: id}})
if err != nil {
return nil, fmt.Errorf("error retrieving PersistentVolumeClaim '%s/%s' from cache: %v", namespace, id, err)
}
if !exists {
return nil, fmt.Errorf("PersistentVolumeClaim '%s/%s' is not in cache", namespace, id)
}
return o.(*api.PersistentVolumeClaim), nil
}

View File

@ -45,6 +45,7 @@ import (
"k8s.io/kubernetes/pkg/util/sets"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api/unversioned"
)
const ProviderName = "aws"
@ -158,6 +159,9 @@ type Volumes interface {
// Create a volume with the specified options
CreateVolume(volumeOptions *VolumeOptions) (volumeName string, err error)
DeleteVolume(volumeName string) error
// Get labels to apply to volume on creation
GetVolumeLabels(volumeName string) (map[string]string, error)
}
// InstanceGroups is an interface for managing cloud-managed instance groups / autoscaling instance groups
@ -510,6 +514,16 @@ func isRegionValid(region string) bool {
return false
}
// Derives the region from a valid az name.
// Returns an error if the az is known invalid (empty)
func azToRegion(az string) (string, error) {
if len(az) < 1 {
return "", fmt.Errorf("invalid (empty) AZ")
}
region := az[:len(az)-1]
return region, nil
}
// newAWSCloud creates a new instance of AWSCloud.
// AWSProvider and instanceId are primarily for tests
func newAWSCloud(config io.Reader, awsServices AWSServices) (*AWSCloud, error) {
@ -527,7 +541,10 @@ func newAWSCloud(config io.Reader, awsServices AWSServices) (*AWSCloud, error) {
if len(zone) <= 1 {
return nil, fmt.Errorf("invalid AWS zone in config file: %s", zone)
}
regionName := zone[:len(zone)-1]
regionName, err := azToRegion(zone)
if err != nil {
return nil, err
}
valid := isRegionValid(regionName)
if !valid {
@ -1274,6 +1291,32 @@ func (aws *AWSCloud) DeleteVolume(volumeName string) error {
return awsDisk.deleteVolume()
}
// Implements Volumes.GetVolumeLabels
func (c *AWSCloud) GetVolumeLabels(volumeName string) (map[string]string, error) {
awsDisk, err := newAWSDisk(c, volumeName)
if err != nil {
return nil, err
}
info, err := awsDisk.getInfo()
if err != nil {
return nil, err
}
labels := make(map[string]string)
az := aws.StringValue(info.AvailabilityZone)
if az == "" {
return nil, fmt.Errorf("volume did not have AZ information: %q", info.VolumeId)
}
labels[unversioned.LabelZoneFailureDomain] = az
region, err := azToRegion(az)
if err != nil {
return nil, err
}
labels[unversioned.LabelZoneRegion] = region
return labels, nil
}
func (v *AWSCloud) Configure(name string, spec *api.NodeSpec) error {
return nil
}

View File

@ -0,0 +1,131 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package label
import (
"fmt"
"io"
"sync"
"k8s.io/kubernetes/pkg/admission"
"k8s.io/kubernetes/pkg/api"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/cloudprovider/providers/aws"
)
func init() {
admission.RegisterPlugin("PersistentVolumeLabel", func(client client.Interface, config io.Reader) (admission.Interface, error) {
persistentVolumeLabelAdmission := NewPersistentVolumeLabel()
return persistentVolumeLabelAdmission, nil
})
}
var _ = admission.Interface(&persistentVolumeLabel{})
type persistentVolumeLabel struct {
*admission.Handler
mutex sync.Mutex
ebsVolumes aws.Volumes
}
// NewPersistentVolumeLabel returns an admission.Interface implementation which adds labels to PersistentVolume CREATE requests,
// based on the labels provided by the underlying cloud provider.
//
// As a side effect, the cloud provider may block invalid or non-existent volumes.
func NewPersistentVolumeLabel() *persistentVolumeLabel {
return &persistentVolumeLabel{
Handler: admission.NewHandler(admission.Create),
}
}
func (l *persistentVolumeLabel) Admit(a admission.Attributes) (err error) {
if a.GetResource() != api.Resource("persistentvolumes") {
return nil
}
obj := a.GetObject()
if obj == nil {
return nil
}
volume, ok := obj.(*api.PersistentVolume)
if !ok {
return nil
}
var volumeLabels map[string]string
if volume.Spec.AWSElasticBlockStore != nil {
labels, err := l.findAWSEBSLabels(volume)
if err != nil {
return admission.NewForbidden(a, fmt.Errorf("error querying AWS EBS volume %s: %v", volume.Spec.AWSElasticBlockStore.VolumeID, err))
}
volumeLabels = labels
}
if len(volumeLabels) != 0 {
if volume.Labels == nil {
volume.Labels = make(map[string]string)
}
for k, v := range volumeLabels {
// We (silently) replace labels if they are provided.
// This should be OK because they are in the kubernetes.io namespace
// i.e. we own them
volume.Labels[k] = v
}
}
return nil
}
func (l *persistentVolumeLabel) findAWSEBSLabels(volume *api.PersistentVolume) (map[string]string, error) {
ebsVolumes, err := l.getEBSVolumes()
if err != nil {
return nil, err
}
if ebsVolumes == nil {
return nil, fmt.Errorf("unable to build AWS cloud provider for EBS")
}
// TODO: GetVolumeLabels is actually a method on the Volumes interface
// If that gets standardized we can refactor to reduce code duplication
labels, err := ebsVolumes.GetVolumeLabels(volume.Spec.AWSElasticBlockStore.VolumeID)
if err != nil {
return nil, err
}
return labels, err
}
// getEBSVolumes returns the AWS Volumes interface for ebs
func (l *persistentVolumeLabel) getEBSVolumes() (aws.Volumes, error) {
l.mutex.Lock()
defer l.mutex.Unlock()
if l.ebsVolumes == nil {
cloudProvider, err := cloudprovider.GetCloudProvider("aws", nil)
if err != nil || cloudProvider == nil {
return nil, err
}
awsCloudProvider, ok := cloudProvider.(*aws.AWSCloud)
if !ok {
// GetCloudProvider has gone very wrong
return nil, fmt.Errorf("error retrieving AWS cloud provider")
}
l.ebsVolumes = awsCloudProvider
}
return l.ebsVolumes, nil
}

View File

@ -0,0 +1,154 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package label
import (
"testing"
"fmt"
"k8s.io/kubernetes/pkg/admission"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/cloudprovider/providers/aws"
)
type mockVolumes struct {
volumeLabels map[string]string
volumeLabelsError error
}
var _ aws.Volumes = &mockVolumes{}
func (v *mockVolumes) AttachDisk(instanceName string, volumeName string, readOnly bool) (string, error) {
return "", fmt.Errorf("not implemented")
}
func (v *mockVolumes) DetachDisk(instanceName string, volumeName string) error {
return fmt.Errorf("not implemented")
}
func (v *mockVolumes) CreateVolume(volumeOptions *aws.VolumeOptions) (volumeName string, err error) {
return "", fmt.Errorf("not implemented")
}
func (v *mockVolumes) DeleteVolume(volumeName string) error {
return fmt.Errorf("not implemented")
}
func (v *mockVolumes) GetVolumeLabels(volumeName string) (map[string]string, error) {
return v.volumeLabels, v.volumeLabelsError
}
func mockVolumeFailure(err error) *mockVolumes {
return &mockVolumes{volumeLabelsError: err}
}
func mockVolumeLabels(labels map[string]string) *mockVolumes {
return &mockVolumes{volumeLabels: labels}
}
// TestAdmission
func TestAdmission(t *testing.T) {
pvHandler := NewPersistentVolumeLabel()
handler := admission.NewChainHandler(pvHandler)
ignoredPV := api.PersistentVolume{
ObjectMeta: api.ObjectMeta{Name: "noncloud", Namespace: "myns"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
HostPath: &api.HostPathVolumeSource{
Path: "/",
},
},
},
}
awsPV := api.PersistentVolume{
ObjectMeta: api.ObjectMeta{Name: "noncloud", Namespace: "myns"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
AWSElasticBlockStore: &api.AWSElasticBlockStoreVolumeSource{
VolumeID: "123",
},
},
},
}
// Non-cloud PVs are ignored
err := handler.Admit(admission.NewAttributesRecord(&ignoredPV, api.Kind("PersistentVolume"), ignoredPV.Namespace, ignoredPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Unexpected error returned from admission handler (on ignored pv): %v", err)
}
// We only add labels on creation
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Delete, nil))
if err != nil {
t.Errorf("Unexpected error returned from admission handler (when deleting aws pv): %v", err)
}
// Errors from the cloudprovider block creation of the volume
pvHandler.ebsVolumes = mockVolumeFailure(fmt.Errorf("invalid volume"))
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err == nil {
t.Errorf("Expected error when aws pv info fails")
}
// Don't add labels if the cloudprovider doesn't return any
labels := make(map[string]string)
pvHandler.ebsVolumes = mockVolumeLabels(labels)
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Expected no error when creating aws pv")
}
if len(awsPV.ObjectMeta.Labels) != 0 {
t.Errorf("Unexpected number of labels")
}
// Don't panic if the cloudprovider returns nil, nil
pvHandler.ebsVolumes = mockVolumeFailure(nil)
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Expected no error when cloud provider returns empty labels")
}
// Labels from the cloudprovider should be applied to the volume
labels = make(map[string]string)
labels["a"] = "1"
labels["b"] = "2"
pvHandler.ebsVolumes = mockVolumeLabels(labels)
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Expected no error when creating aws pv")
}
if awsPV.Labels["a"] != "1" || awsPV.Labels["b"] != "2" {
t.Errorf("Expected label a to be added when creating aws pv")
}
// User-provided labels should be honored, but cloudprovider labels replace them when they overlap
awsPV.ObjectMeta.Labels = make(map[string]string)
awsPV.ObjectMeta.Labels["a"] = "not1"
awsPV.ObjectMeta.Labels["c"] = "3"
err = handler.Admit(admission.NewAttributesRecord(&awsPV, api.Kind("PersistentVolume"), awsPV.Namespace, awsPV.Name, api.Resource("persistentvolumes"), "", admission.Create, nil))
if err != nil {
t.Errorf("Expected no error when creating aws pv")
}
if awsPV.Labels["a"] != "1" || awsPV.Labels["b"] != "2" {
t.Errorf("Expected cloudprovider labels to replace user labels when creating aws pv")
}
if awsPV.Labels["c"] != "3" {
t.Errorf("Expected (non-conflicting) user provided labels to be honored when creating aws pv")
}
}

View File

@ -0,0 +1,19 @@
/*
Copyright 2014 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// labels created persistent volumes with zone information
// as provided by the cloud provider
package label

View File

@ -26,12 +26,21 @@ import (
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api/unversioned"
)
type NodeInfo interface {
GetNodeInfo(nodeID string) (*api.Node, error)
}
type PersistentVolumeInfo interface {
GetPersistentVolumeInfo(pvID string) (*api.PersistentVolume, error)
}
type PersistentVolumeClaimInfo interface {
GetPersistentVolumeClaimInfo(namespace string, pvcID string) (*api.PersistentVolumeClaim, error)
}
type StaticNodeInfo struct {
*api.NodeList
}
@ -136,6 +145,108 @@ func NoDiskConflict(pod *api.Pod, existingPods []*api.Pod, node string) (bool, e
return true, nil
}
type VolumeZoneChecker struct {
nodeInfo NodeInfo
pvInfo PersistentVolumeInfo
pvcInfo PersistentVolumeClaimInfo
}
// VolumeZonePredicate evaluates if a pod can fit due to the volumes it requests, given
// that some volumes may have zone scheduling constraints. The requirement is that any
// volume zone-labels must match the equivalent zone-labels on the node. It is OK for
// the node to have more zone-label constraints (for example, a hypothetical replicated
// volume might allow region-wide access)
//
// Currently this is only supported with PersistentVolumeClaims, and looks to the labels
// only on the bound PersistentVolume.
//
// Working with volumes declared inline in the pod specification (i.e. not
// using a PersistentVolume) is likely to be harder, as it would require
// determining the zone of a volume during scheduling, and that is likely to
// require calling out to the cloud provider. It seems that we are moving away
// from inline volume declarations anyway.
func NewVolumeZonePredicate(nodeInfo NodeInfo, pvInfo PersistentVolumeInfo, pvcInfo PersistentVolumeClaimInfo) algorithm.FitPredicate {
c := &VolumeZoneChecker{
nodeInfo: nodeInfo,
pvInfo: pvInfo,
pvcInfo: pvcInfo,
}
return c.predicate
}
func (c *VolumeZoneChecker) predicate(pod *api.Pod, existingPods []*api.Pod, nodeID string) (bool, error) {
node, err := c.nodeInfo.GetNodeInfo(nodeID)
if err != nil {
return false, err
}
if node == nil {
return false, fmt.Errorf("node not found: %q", nodeID)
}
nodeConstraints := make(map[string]string)
for k, v := range node.ObjectMeta.Labels {
if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
continue
}
nodeConstraints[k] = v
}
if len(nodeConstraints) == 0 {
// The node has no zone constraints, so we're OK to schedule.
// In practice, when using zones, all nodes must be labeled with zone labels.
// We want to fast-path this case though.
return true, nil
}
namespace := pod.Namespace
manifest := &(pod.Spec)
for i := range manifest.Volumes {
volume := &manifest.Volumes[i]
if volume.PersistentVolumeClaim != nil {
pvcName := volume.PersistentVolumeClaim.ClaimName
if pvcName == "" {
return false, fmt.Errorf("PersistentVolumeClaim had no name: %q", pvcName)
}
pvc, err := c.pvcInfo.GetPersistentVolumeClaimInfo(namespace, pvcName)
if err != nil {
return false, err
}
if pvc == nil {
return false, fmt.Errorf("PersistentVolumeClaim was not found: %q", pvcName)
}
pvName := pvc.Spec.VolumeName
if pvName == "" {
return false, fmt.Errorf("PersistentVolumeClaim is not bound: %q", pvcName)
}
pv, err := c.pvInfo.GetPersistentVolumeInfo(pvName)
if err != nil {
return false, err
}
if pv == nil {
return false, fmt.Errorf("PersistentVolume not found: %q", pvName)
}
for k, v := range pv.ObjectMeta.Labels {
if k != unversioned.LabelZoneFailureDomain && k != unversioned.LabelZoneRegion {
continue
}
nodeV, _ := nodeConstraints[k]
if v != nodeV {
glog.V(2).Infof("Won't schedule pod %q onto node %q due to volume %q (mismatch on %q)", pod.Name, nodeID, pvName, k)
return false, nil
}
}
}
}
return true, nil
}
type ResourceFit struct {
info NodeInfo
}

View File

@ -64,6 +64,13 @@ func defaultPredicates() sets.String {
),
// Fit is determined by non-conflicting disk volumes.
factory.RegisterFitPredicate("NoDiskConflict", predicates.NoDiskConflict),
// Fit is determined by volume zone requirements.
factory.RegisterFitPredicateFactory(
"NoVolumeZoneConflict",
func(args factory.PluginFactoryArgs) algorithm.FitPredicate {
return predicates.NewVolumeZonePredicate(args.NodeInfo, args.PVInfo, args.PVCInfo)
},
),
// Fit is determined by node selector query.
factory.RegisterFitPredicateFactory(
"MatchNodeSelector",

View File

@ -58,6 +58,10 @@ type ConfigFactory struct {
PodLister algorithm.PodLister
// a means to list all nodes
NodeLister *cache.StoreToNodeLister
// a means to list all PersistentVolumes
PVLister *cache.StoreToPVFetcher
// a means to list all PersistentVolumeClaims
PVCLister *cache.StoreToPVCFetcher
// a means to list all services
ServiceLister *cache.StoreToServiceLister
// a means to list all controllers
@ -85,6 +89,8 @@ func NewConfigFactory(client *client.Client, rateLimiter util.RateLimiter, sched
ScheduledPodLister: &cache.StoreToPodLister{},
// Only nodes in the "Ready" condition with status == "True" are schedulable
NodeLister: &cache.StoreToNodeLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
PVLister: &cache.StoreToPVFetcher{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
PVCLister: &cache.StoreToPVCFetcher{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
ServiceLister: &cache.StoreToServiceLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
ControllerLister: &cache.StoreToReplicationControllerLister{Store: cache.NewStore(cache.MetaNamespaceKeyFunc)},
StopEverything: make(chan struct{}),
@ -188,6 +194,8 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
// All fit predicates only need to consider schedulable nodes.
NodeLister: f.NodeLister.NodeCondition(getNodeConditionPredicate()),
NodeInfo: &predicates.CachedNodeInfo{f.NodeLister},
PVInfo: f.PVLister,
PVCInfo: f.PVCLister,
}
predicateFuncs, err := getFitPredicateFunctions(predicateKeys, pluginArgs)
if err != nil {
@ -209,6 +217,11 @@ func (f *ConfigFactory) CreateFromKeys(predicateKeys, priorityKeys sets.String,
// Nodes may be listed frequently, so provide a local up-to-date cache.
cache.NewReflector(f.createNodeLW(), &api.Node{}, f.NodeLister.Store, 0).RunUntil(f.StopEverything)
// Watch PVs & PVCs
// They may be listed frequently for scheduling constraints, so provide a local up-to-date cache.
cache.NewReflector(f.createPersistentVolumeLW(), &api.PersistentVolume{}, f.PVLister.Store, 0).RunUntil(f.StopEverything)
cache.NewReflector(f.createPersistentVolumeClaimLW(), &api.PersistentVolumeClaim{}, f.PVCLister.Store, 0).RunUntil(f.StopEverything)
// Watch and cache all service objects. Scheduler needs to find all pods
// created by the same services or ReplicationControllers, so that it can spread them correctly.
// Cache this locally.
@ -303,6 +316,16 @@ func (factory *ConfigFactory) createNodeLW() *cache.ListWatch {
return cache.NewListWatchFromClient(factory.Client, "nodes", api.NamespaceAll, fields)
}
// createPersistentVolumeLW returns a cache.ListWatch that gets all changes to persistentVolumes.
func (factory *ConfigFactory) createPersistentVolumeLW() *cache.ListWatch {
return cache.NewListWatchFromClient(factory.Client, "persistentVolumes", api.NamespaceAll, fields.ParseSelectorOrDie(""))
}
// createPersistentVolumeClaimLW returns a cache.ListWatch that gets all changes to persistentVolumeClaims.
func (factory *ConfigFactory) createPersistentVolumeClaimLW() *cache.ListWatch {
return cache.NewListWatchFromClient(factory.Client, "persistentVolumeClaims", api.NamespaceAll, fields.ParseSelectorOrDie(""))
}
// Returns a cache.ListWatch that gets all changes to services.
func (factory *ConfigFactory) createServiceLW() *cache.ListWatch {
return cache.NewListWatchFromClient(factory.Client, "services", api.NamespaceAll, fields.ParseSelectorOrDie(""))

View File

@ -38,6 +38,8 @@ type PluginFactoryArgs struct {
algorithm.ControllerLister
NodeLister algorithm.NodeLister
NodeInfo predicates.NodeInfo
PVInfo predicates.PersistentVolumeInfo
PVCInfo predicates.PersistentVolumeClaimInfo
}
// A FitPredicateFactory produces a FitPredicate from the given args.