mirror of https://github.com/k3s-io/k3s
381 lines
12 KiB
Go
381 lines
12 KiB
Go
/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
|
|
|
|
package persistentvolume
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
|
|
"k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/api/resource"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/labels"
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
|
"k8s.io/client-go/tools/cache"
|
|
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
|
"k8s.io/kubernetes/pkg/features"
|
|
volumeutil "k8s.io/kubernetes/pkg/volume/util"
|
|
)
|
|
|
|
// persistentVolumeOrderedIndex is a cache.Store that keeps persistent volumes
|
|
// indexed by AccessModes and ordered by storage capacity.
|
|
type persistentVolumeOrderedIndex struct {
|
|
store cache.Indexer
|
|
}
|
|
|
|
func newPersistentVolumeOrderedIndex() persistentVolumeOrderedIndex {
|
|
return persistentVolumeOrderedIndex{cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{"accessmodes": accessModesIndexFunc})}
|
|
}
|
|
|
|
// accessModesIndexFunc is an indexing function that returns a persistent
|
|
// volume's AccessModes as a string
|
|
func accessModesIndexFunc(obj interface{}) ([]string, error) {
|
|
if pv, ok := obj.(*v1.PersistentVolume); ok {
|
|
modes := v1helper.GetAccessModesAsString(pv.Spec.AccessModes)
|
|
return []string{modes}, nil
|
|
}
|
|
return []string{""}, fmt.Errorf("object is not a persistent volume: %v", obj)
|
|
}
|
|
|
|
// listByAccessModes returns all volumes with the given set of
|
|
// AccessModeTypes. The list is unsorted!
|
|
func (pvIndex *persistentVolumeOrderedIndex) listByAccessModes(modes []v1.PersistentVolumeAccessMode) ([]*v1.PersistentVolume, error) {
|
|
pv := &v1.PersistentVolume{
|
|
Spec: v1.PersistentVolumeSpec{
|
|
AccessModes: modes,
|
|
},
|
|
}
|
|
|
|
objs, err := pvIndex.store.Index("accessmodes", pv)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
volumes := make([]*v1.PersistentVolume, len(objs))
|
|
for i, obj := range objs {
|
|
volumes[i] = obj.(*v1.PersistentVolume)
|
|
}
|
|
|
|
return volumes, nil
|
|
}
|
|
|
|
// find returns the nearest PV from the ordered list or nil if a match is not found
|
|
func (pvIndex *persistentVolumeOrderedIndex) findByClaim(claim *v1.PersistentVolumeClaim, delayBinding bool) (*v1.PersistentVolume, error) {
|
|
// PVs are indexed by their access modes to allow easier searching. Each
|
|
// index is the string representation of a set of access modes. There is a
|
|
// finite number of possible sets and PVs will only be indexed in one of
|
|
// them (whichever index matches the PV's modes).
|
|
//
|
|
// A request for resources will always specify its desired access modes.
|
|
// Any matching PV must have at least that number of access modes, but it
|
|
// can have more. For example, a user asks for ReadWriteOnce but a GCEPD
|
|
// is available, which is ReadWriteOnce+ReadOnlyMany.
|
|
//
|
|
// Searches are performed against a set of access modes, so we can attempt
|
|
// not only the exact matching modes but also potential matches (the GCEPD
|
|
// example above).
|
|
allPossibleModes := pvIndex.allPossibleMatchingAccessModes(claim.Spec.AccessModes)
|
|
|
|
for _, modes := range allPossibleModes {
|
|
volumes, err := pvIndex.listByAccessModes(modes)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
bestVol, err := findMatchingVolume(claim, volumes, nil /* node for topology binding*/, nil /* exclusion map */, delayBinding)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if bestVol != nil {
|
|
return bestVol, nil
|
|
}
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
// findMatchingVolume goes through the list of volumes to find the best matching volume
|
|
// for the claim.
|
|
//
|
|
// This function is used by both the PV controller and scheduler.
|
|
//
|
|
// delayBinding is true only in the PV controller path. When set, prebound PVs are still returned
|
|
// as a match for the claim, but unbound PVs are skipped.
|
|
//
|
|
// node is set only in the scheduler path. When set, the PV node affinity is checked against
|
|
// the node's labels.
|
|
//
|
|
// excludedVolumes is only used in the scheduler path, and is needed for evaluating multiple
|
|
// unbound PVCs for a single Pod at one time. As each PVC finds a matching PV, the chosen
|
|
// PV needs to be excluded from future matching.
|
|
func findMatchingVolume(
|
|
claim *v1.PersistentVolumeClaim,
|
|
volumes []*v1.PersistentVolume,
|
|
node *v1.Node,
|
|
excludedVolumes map[string]*v1.PersistentVolume,
|
|
delayBinding bool) (*v1.PersistentVolume, error) {
|
|
|
|
var smallestVolume *v1.PersistentVolume
|
|
var smallestVolumeQty resource.Quantity
|
|
requestedQty := claim.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
|
|
requestedClass := v1helper.GetPersistentVolumeClaimClass(claim)
|
|
|
|
var selector labels.Selector
|
|
if claim.Spec.Selector != nil {
|
|
internalSelector, err := metav1.LabelSelectorAsSelector(claim.Spec.Selector)
|
|
if err != nil {
|
|
// should be unreachable code due to validation
|
|
return nil, fmt.Errorf("error creating internal label selector for claim: %v: %v", claimToClaimKey(claim), err)
|
|
}
|
|
selector = internalSelector
|
|
}
|
|
|
|
// Go through all available volumes with two goals:
|
|
// - find a volume that is either pre-bound by user or dynamically
|
|
// provisioned for this claim. Because of this we need to loop through
|
|
// all volumes.
|
|
// - find the smallest matching one if there is no volume pre-bound to
|
|
// the claim.
|
|
for _, volume := range volumes {
|
|
if _, ok := excludedVolumes[volume.Name]; ok {
|
|
// Skip volumes in the excluded list
|
|
continue
|
|
}
|
|
|
|
volumeQty := volume.Spec.Capacity[v1.ResourceStorage]
|
|
|
|
// check if volumeModes do not match (feature gate protected)
|
|
isMismatch, err := checkVolumeModeMismatches(&claim.Spec, &volume.Spec)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error checking if volumeMode was a mismatch: %v", err)
|
|
}
|
|
// filter out mismatching volumeModes
|
|
if isMismatch {
|
|
continue
|
|
}
|
|
|
|
// check if PV's DeletionTimeStamp is set, if so, skip this volume.
|
|
if utilfeature.DefaultFeatureGate.Enabled(features.StorageObjectInUseProtection) {
|
|
if volume.ObjectMeta.DeletionTimestamp != nil {
|
|
continue
|
|
}
|
|
}
|
|
|
|
nodeAffinityValid := true
|
|
if node != nil {
|
|
// Scheduler path, check that the PV NodeAffinity
|
|
// is satisfied by the node
|
|
err := volumeutil.CheckNodeAffinity(volume, node.Labels)
|
|
if err != nil {
|
|
nodeAffinityValid = false
|
|
}
|
|
}
|
|
|
|
if isVolumeBoundToClaim(volume, claim) {
|
|
// this claim and volume are pre-bound; return
|
|
// the volume if the size request is satisfied,
|
|
// otherwise continue searching for a match
|
|
if volumeQty.Cmp(requestedQty) < 0 {
|
|
continue
|
|
}
|
|
|
|
// If PV node affinity is invalid, return no match.
|
|
// This means the prebound PV (and therefore PVC)
|
|
// is not suitable for this node.
|
|
if !nodeAffinityValid {
|
|
return nil, nil
|
|
}
|
|
|
|
return volume, nil
|
|
}
|
|
|
|
if node == nil && delayBinding {
|
|
// PV controller does not bind this claim.
|
|
// Scheduler will handle binding unbound volumes
|
|
// Scheduler path will have node != nil
|
|
continue
|
|
}
|
|
|
|
// filter out:
|
|
// - volumes in non-available phase
|
|
// - volumes bound to another claim
|
|
// - volumes whose labels don't match the claim's selector, if specified
|
|
// - volumes in Class that is not requested
|
|
// - volumes whose NodeAffinity does not match the node
|
|
if volume.Status.Phase != v1.VolumeAvailable {
|
|
// We ignore volumes in non-available phase, because volumes that
|
|
// satisfies matching criteria will be updated to available, binding
|
|
// them now has high chance of encountering unnecessary failures
|
|
// due to API conflicts.
|
|
continue
|
|
} else if volume.Spec.ClaimRef != nil {
|
|
continue
|
|
} else if selector != nil && !selector.Matches(labels.Set(volume.Labels)) {
|
|
continue
|
|
}
|
|
if v1helper.GetPersistentVolumeClass(volume) != requestedClass {
|
|
continue
|
|
}
|
|
if !nodeAffinityValid {
|
|
continue
|
|
}
|
|
|
|
if node != nil {
|
|
// Scheduler path
|
|
// Check that the access modes match
|
|
if !checkAccessModes(claim, volume) {
|
|
continue
|
|
}
|
|
}
|
|
|
|
if volumeQty.Cmp(requestedQty) >= 0 {
|
|
if smallestVolume == nil || smallestVolumeQty.Cmp(volumeQty) > 0 {
|
|
smallestVolume = volume
|
|
smallestVolumeQty = volumeQty
|
|
}
|
|
}
|
|
}
|
|
|
|
if smallestVolume != nil {
|
|
// Found a matching volume
|
|
return smallestVolume, nil
|
|
}
|
|
|
|
return nil, nil
|
|
}
|
|
|
|
// checkVolumeModeMismatches is a convenience method that checks volumeMode for PersistentVolume
|
|
// and PersistentVolumeClaims
|
|
func checkVolumeModeMismatches(pvcSpec *v1.PersistentVolumeClaimSpec, pvSpec *v1.PersistentVolumeSpec) (bool, error) {
|
|
if !utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {
|
|
return false, nil
|
|
}
|
|
|
|
// In HA upgrades, we cannot guarantee that the apiserver is on a version >= controller-manager.
|
|
// So we default a nil volumeMode to filesystem
|
|
requestedVolumeMode := v1.PersistentVolumeFilesystem
|
|
if pvcSpec.VolumeMode != nil {
|
|
requestedVolumeMode = *pvcSpec.VolumeMode
|
|
}
|
|
pvVolumeMode := v1.PersistentVolumeFilesystem
|
|
if pvSpec.VolumeMode != nil {
|
|
pvVolumeMode = *pvSpec.VolumeMode
|
|
}
|
|
return requestedVolumeMode != pvVolumeMode, nil
|
|
}
|
|
|
|
// findBestMatchForClaim is a convenience method that finds a volume by the claim's AccessModes and requests for Storage
|
|
func (pvIndex *persistentVolumeOrderedIndex) findBestMatchForClaim(claim *v1.PersistentVolumeClaim, delayBinding bool) (*v1.PersistentVolume, error) {
|
|
return pvIndex.findByClaim(claim, delayBinding)
|
|
}
|
|
|
|
// allPossibleMatchingAccessModes returns an array of AccessMode arrays that
|
|
// can satisfy a user's requested modes.
|
|
//
|
|
// see comments in the Find func above regarding indexing.
|
|
//
|
|
// allPossibleMatchingAccessModes gets all stringified accessmodes from the
|
|
// index and returns all those that contain at least all of the requested
|
|
// mode.
|
|
//
|
|
// For example, assume the index contains 2 types of PVs where the stringified
|
|
// accessmodes are:
|
|
//
|
|
// "RWO,ROX" -- some number of GCEPDs
|
|
// "RWO,ROX,RWX" -- some number of NFS volumes
|
|
//
|
|
// A request for RWO could be satisfied by both sets of indexed volumes, so
|
|
// allPossibleMatchingAccessModes returns:
|
|
//
|
|
// [][]v1.PersistentVolumeAccessMode {
|
|
// []v1.PersistentVolumeAccessMode {
|
|
// v1.ReadWriteOnce, v1.ReadOnlyMany,
|
|
// },
|
|
// []v1.PersistentVolumeAccessMode {
|
|
// v1.ReadWriteOnce, v1.ReadOnlyMany, v1.ReadWriteMany,
|
|
// },
|
|
// }
|
|
//
|
|
// A request for RWX can be satisfied by only one set of indexed volumes, so
|
|
// the return is:
|
|
//
|
|
// [][]v1.PersistentVolumeAccessMode {
|
|
// []v1.PersistentVolumeAccessMode {
|
|
// v1.ReadWriteOnce, v1.ReadOnlyMany, v1.ReadWriteMany,
|
|
// },
|
|
// }
|
|
//
|
|
// This func returns modes with ascending levels of modes to give the user
|
|
// what is closest to what they actually asked for.
|
|
func (pvIndex *persistentVolumeOrderedIndex) allPossibleMatchingAccessModes(requestedModes []v1.PersistentVolumeAccessMode) [][]v1.PersistentVolumeAccessMode {
|
|
matchedModes := [][]v1.PersistentVolumeAccessMode{}
|
|
keys := pvIndex.store.ListIndexFuncValues("accessmodes")
|
|
for _, key := range keys {
|
|
indexedModes := v1helper.GetAccessModesFromString(key)
|
|
if volumeutil.AccessModesContainedInAll(indexedModes, requestedModes) {
|
|
matchedModes = append(matchedModes, indexedModes)
|
|
}
|
|
}
|
|
|
|
// sort by the number of modes in each array with the fewest number of
|
|
// modes coming first. this allows searching for volumes by the minimum
|
|
// number of modes required of the possible matches.
|
|
sort.Sort(byAccessModes{matchedModes})
|
|
return matchedModes
|
|
}
|
|
|
|
// byAccessModes is used to order access modes by size, with the fewest modes first
|
|
type byAccessModes struct {
|
|
modes [][]v1.PersistentVolumeAccessMode
|
|
}
|
|
|
|
func (c byAccessModes) Less(i, j int) bool {
|
|
return len(c.modes[i]) < len(c.modes[j])
|
|
}
|
|
|
|
func (c byAccessModes) Swap(i, j int) {
|
|
c.modes[i], c.modes[j] = c.modes[j], c.modes[i]
|
|
}
|
|
|
|
func (c byAccessModes) Len() int {
|
|
return len(c.modes)
|
|
}
|
|
|
|
func claimToClaimKey(claim *v1.PersistentVolumeClaim) string {
|
|
return fmt.Sprintf("%s/%s", claim.Namespace, claim.Name)
|
|
}
|
|
|
|
func claimrefToClaimKey(claimref *v1.ObjectReference) string {
|
|
return fmt.Sprintf("%s/%s", claimref.Namespace, claimref.Name)
|
|
}
|
|
|
|
// Returns true if PV satisfies all the PVC's requested AccessModes
|
|
func checkAccessModes(claim *v1.PersistentVolumeClaim, volume *v1.PersistentVolume) bool {
|
|
pvModesMap := map[v1.PersistentVolumeAccessMode]bool{}
|
|
for _, mode := range volume.Spec.AccessModes {
|
|
pvModesMap[mode] = true
|
|
}
|
|
|
|
for _, mode := range claim.Spec.AccessModes {
|
|
_, ok := pvModesMap[mode]
|
|
if !ok {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|