mirror of https://github.com/k3s-io/k3s
955 lines
35 KiB
Go
955 lines
35 KiB
Go
/*
|
|
Copyright 2015 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package util
|
|
|
|
import (
|
|
"fmt"
|
|
"io/ioutil"
|
|
"os"
|
|
"path"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
v1 "k8s.io/api/core/v1"
|
|
storage "k8s.io/api/storage/v1"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/labels"
|
|
"k8s.io/apimachinery/pkg/runtime"
|
|
"k8s.io/apimachinery/pkg/util/sets"
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
|
clientset "k8s.io/client-go/kubernetes"
|
|
"k8s.io/klog"
|
|
"k8s.io/kubernetes/pkg/api/legacyscheme"
|
|
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
|
"k8s.io/kubernetes/pkg/features"
|
|
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
|
|
"k8s.io/kubernetes/pkg/util/mount"
|
|
utilstrings "k8s.io/kubernetes/pkg/util/strings"
|
|
"k8s.io/kubernetes/pkg/volume"
|
|
|
|
"reflect"
|
|
|
|
"hash/fnv"
|
|
"math/rand"
|
|
"strconv"
|
|
|
|
"k8s.io/apimachinery/pkg/api/resource"
|
|
utypes "k8s.io/apimachinery/pkg/types"
|
|
"k8s.io/kubernetes/pkg/volume/util/types"
|
|
"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
|
|
)
|
|
|
|
const (
|
|
// GB - GigaByte size
|
|
GB = 1000 * 1000 * 1000
|
|
// GIB - GibiByte size
|
|
GIB = 1024 * 1024 * 1024
|
|
|
|
readyFileName = "ready"
|
|
|
|
// ControllerManagedAttachAnnotation is the key of the annotation on Node
|
|
// objects that indicates attach/detach operations for the node should be
|
|
// managed by the attach/detach controller
|
|
ControllerManagedAttachAnnotation string = "volumes.kubernetes.io/controller-managed-attach-detach"
|
|
|
|
// KeepTerminatedPodVolumesAnnotation is the key of the annotation on Node
|
|
// that decides if pod volumes are unmounted when pod is terminated
|
|
KeepTerminatedPodVolumesAnnotation string = "volumes.kubernetes.io/keep-terminated-pod-volumes"
|
|
|
|
// VolumeGidAnnotationKey is the of the annotation on the PersistentVolume
|
|
// object that specifies a supplemental GID.
|
|
VolumeGidAnnotationKey = "pv.beta.kubernetes.io/gid"
|
|
|
|
// VolumeDynamicallyCreatedByKey is the key of the annotation on PersistentVolume
|
|
// object created dynamically
|
|
VolumeDynamicallyCreatedByKey = "kubernetes.io/createdby"
|
|
)
|
|
|
|
// VolumeZoneConfig contains config information about zonal volume.
|
|
type VolumeZoneConfig struct {
|
|
ZonePresent bool
|
|
ZonesPresent bool
|
|
ReplicaZoneFromNodePresent bool
|
|
Zone string
|
|
Zones string
|
|
ReplicaZoneFromNode string
|
|
}
|
|
|
|
// IsReady checks for the existence of a regular file
|
|
// called 'ready' in the given directory and returns
|
|
// true if that file exists.
|
|
func IsReady(dir string) bool {
|
|
readyFile := path.Join(dir, readyFileName)
|
|
s, err := os.Stat(readyFile)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
if !s.Mode().IsRegular() {
|
|
klog.Errorf("ready-file is not a file: %s", readyFile)
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// SetReady creates a file called 'ready' in the given
|
|
// directory. It logs an error if the file cannot be
|
|
// created.
|
|
func SetReady(dir string) {
|
|
if err := os.MkdirAll(dir, 0750); err != nil && !os.IsExist(err) {
|
|
klog.Errorf("Can't mkdir %s: %v", dir, err)
|
|
return
|
|
}
|
|
|
|
readyFile := path.Join(dir, readyFileName)
|
|
file, err := os.Create(readyFile)
|
|
if err != nil {
|
|
klog.Errorf("Can't touch %s: %v", readyFile, err)
|
|
return
|
|
}
|
|
file.Close()
|
|
}
|
|
|
|
// UnmountPath is a common unmount routine that unmounts the given path and
|
|
// deletes the remaining directory if successful.
|
|
// TODO: Remove this function and change callers to call mount pkg directly
|
|
func UnmountPath(mountPath string, mounter mount.Interface) error {
|
|
return mount.CleanupMountPoint(mountPath, mounter, false /* extensiveMountPointCheck */)
|
|
}
|
|
|
|
// UnmountMountPoint is a common unmount routine that unmounts the given path and
|
|
// deletes the remaining directory if successful.
|
|
// if extensiveMountPointCheck is true
|
|
// IsNotMountPoint will be called instead of IsLikelyNotMountPoint.
|
|
// IsNotMountPoint is more expensive but properly handles bind mounts.
|
|
// TODO: Change callers to call mount pkg directly
|
|
func UnmountMountPoint(mountPath string, mounter mount.Interface, extensiveMountPointCheck bool) error {
|
|
return mount.CleanupMountPoint(mountPath, mounter, extensiveMountPointCheck)
|
|
}
|
|
|
|
// PathExists returns true if the specified path exists.
|
|
// TODO: Change callers to call mount pkg directly
|
|
func PathExists(path string) (bool, error) {
|
|
return mount.PathExists(path)
|
|
}
|
|
|
|
// IsCorruptedMnt return true if err is about corrupted mount point
|
|
// TODO: Change callers to call mount pkg directly
|
|
func IsCorruptedMnt(err error) bool {
|
|
return mount.IsCorruptedMnt(err)
|
|
}
|
|
|
|
// GetSecretForPod locates secret by name in the pod's namespace and returns secret map
|
|
func GetSecretForPod(pod *v1.Pod, secretName string, kubeClient clientset.Interface) (map[string]string, error) {
|
|
secret := make(map[string]string)
|
|
if kubeClient == nil {
|
|
return secret, fmt.Errorf("Cannot get kube client")
|
|
}
|
|
secrets, err := kubeClient.CoreV1().Secrets(pod.Namespace).Get(secretName, metav1.GetOptions{})
|
|
if err != nil {
|
|
return secret, err
|
|
}
|
|
for name, data := range secrets.Data {
|
|
secret[name] = string(data)
|
|
}
|
|
return secret, nil
|
|
}
|
|
|
|
// GetSecretForPV locates secret by name and namespace, verifies the secret type, and returns secret map
|
|
func GetSecretForPV(secretNamespace, secretName, volumePluginName string, kubeClient clientset.Interface) (map[string]string, error) {
|
|
secret := make(map[string]string)
|
|
if kubeClient == nil {
|
|
return secret, fmt.Errorf("Cannot get kube client")
|
|
}
|
|
secrets, err := kubeClient.CoreV1().Secrets(secretNamespace).Get(secretName, metav1.GetOptions{})
|
|
if err != nil {
|
|
return secret, err
|
|
}
|
|
if secrets.Type != v1.SecretType(volumePluginName) {
|
|
return secret, fmt.Errorf("Cannot get secret of type %s", volumePluginName)
|
|
}
|
|
for name, data := range secrets.Data {
|
|
secret[name] = string(data)
|
|
}
|
|
return secret, nil
|
|
}
|
|
|
|
// GetClassForVolume locates storage class by persistent volume
|
|
func GetClassForVolume(kubeClient clientset.Interface, pv *v1.PersistentVolume) (*storage.StorageClass, error) {
|
|
if kubeClient == nil {
|
|
return nil, fmt.Errorf("Cannot get kube client")
|
|
}
|
|
className := v1helper.GetPersistentVolumeClass(pv)
|
|
if className == "" {
|
|
return nil, fmt.Errorf("Volume has no storage class")
|
|
}
|
|
|
|
class, err := kubeClient.StorageV1().StorageClasses().Get(className, metav1.GetOptions{})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return class, nil
|
|
}
|
|
|
|
// CheckNodeAffinity looks at the PV node affinity, and checks if the node has the same corresponding labels
|
|
// This ensures that we don't mount a volume that doesn't belong to this node
|
|
func CheckNodeAffinity(pv *v1.PersistentVolume, nodeLabels map[string]string) error {
|
|
return checkVolumeNodeAffinity(pv, nodeLabels)
|
|
}
|
|
|
|
func checkVolumeNodeAffinity(pv *v1.PersistentVolume, nodeLabels map[string]string) error {
|
|
if pv.Spec.NodeAffinity == nil {
|
|
return nil
|
|
}
|
|
|
|
if pv.Spec.NodeAffinity.Required != nil {
|
|
terms := pv.Spec.NodeAffinity.Required.NodeSelectorTerms
|
|
klog.V(10).Infof("Match for Required node selector terms %+v", terms)
|
|
if !v1helper.MatchNodeSelectorTerms(terms, labels.Set(nodeLabels), nil) {
|
|
return fmt.Errorf("No matching NodeSelectorTerms")
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// LoadPodFromFile will read, decode, and return a Pod from a file.
|
|
func LoadPodFromFile(filePath string) (*v1.Pod, error) {
|
|
if filePath == "" {
|
|
return nil, fmt.Errorf("file path not specified")
|
|
}
|
|
podDef, err := ioutil.ReadFile(filePath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read file path %s: %+v", filePath, err)
|
|
}
|
|
if len(podDef) == 0 {
|
|
return nil, fmt.Errorf("file was empty: %s", filePath)
|
|
}
|
|
pod := &v1.Pod{}
|
|
|
|
codec := legacyscheme.Codecs.UniversalDecoder()
|
|
if err := runtime.DecodeInto(codec, podDef, pod); err != nil {
|
|
return nil, fmt.Errorf("failed decoding file: %v", err)
|
|
}
|
|
return pod, nil
|
|
}
|
|
|
|
// SelectZoneForVolume is a wrapper around SelectZonesForVolume
|
|
// to select a single zone for a volume based on parameters
|
|
func SelectZoneForVolume(zoneParameterPresent, zonesParameterPresent bool, zoneParameter string, zonesParameter, zonesWithNodes sets.String, node *v1.Node, allowedTopologies []v1.TopologySelectorTerm, pvcName string) (string, error) {
|
|
zones, err := SelectZonesForVolume(zoneParameterPresent, zonesParameterPresent, zoneParameter, zonesParameter, zonesWithNodes, node, allowedTopologies, pvcName, 1)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
zone, ok := zones.PopAny()
|
|
if !ok {
|
|
return "", fmt.Errorf("could not determine a zone to provision volume in")
|
|
}
|
|
return zone, nil
|
|
}
|
|
|
|
// SelectZonesForVolume selects zones for a volume based on several factors:
|
|
// node.zone, allowedTopologies, zone/zones parameters from storageclass,
|
|
// zones with active nodes from the cluster. The number of zones = replicas.
|
|
func SelectZonesForVolume(zoneParameterPresent, zonesParameterPresent bool, zoneParameter string, zonesParameter, zonesWithNodes sets.String, node *v1.Node, allowedTopologies []v1.TopologySelectorTerm, pvcName string, numReplicas uint32) (sets.String, error) {
|
|
if zoneParameterPresent && zonesParameterPresent {
|
|
return nil, fmt.Errorf("both zone and zones StorageClass parameters must not be used at the same time")
|
|
}
|
|
|
|
var zoneFromNode string
|
|
// pick one zone from node if present
|
|
if node != nil {
|
|
// VolumeScheduling implicit since node is not nil
|
|
if zoneParameterPresent || zonesParameterPresent {
|
|
return nil, fmt.Errorf("zone[s] cannot be specified in StorageClass if VolumeBindingMode is set to WaitForFirstConsumer. Please specify allowedTopologies in StorageClass for constraining zones")
|
|
}
|
|
|
|
// pick node's zone for one of the replicas
|
|
var ok bool
|
|
zoneFromNode, ok = node.ObjectMeta.Labels[kubeletapis.LabelZoneFailureDomain]
|
|
if !ok {
|
|
return nil, fmt.Errorf("%s Label for node missing", kubeletapis.LabelZoneFailureDomain)
|
|
}
|
|
// if single replica volume and node with zone found, return immediately
|
|
if numReplicas == 1 {
|
|
return sets.NewString(zoneFromNode), nil
|
|
}
|
|
}
|
|
|
|
// pick zone from allowedZones if specified
|
|
allowedZones, err := ZonesFromAllowedTopologies(allowedTopologies)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if (len(allowedTopologies) > 0) && (allowedZones.Len() == 0) {
|
|
return nil, fmt.Errorf("no matchLabelExpressions with %s key found in allowedTopologies. Please specify matchLabelExpressions with %s key", kubeletapis.LabelZoneFailureDomain, kubeletapis.LabelZoneFailureDomain)
|
|
}
|
|
|
|
if allowedZones.Len() > 0 {
|
|
// VolumeScheduling implicit since allowedZones present
|
|
if zoneParameterPresent || zonesParameterPresent {
|
|
return nil, fmt.Errorf("zone[s] cannot be specified in StorageClass if allowedTopologies specified")
|
|
}
|
|
// scheduler will guarantee if node != null above, zoneFromNode is member of allowedZones.
|
|
// so if zoneFromNode != "", we can safely assume it is part of allowedZones.
|
|
zones, err := chooseZonesForVolumeIncludingZone(allowedZones, pvcName, zoneFromNode, numReplicas)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("cannot process zones in allowedTopologies: %v", err)
|
|
}
|
|
return zones, nil
|
|
}
|
|
|
|
// pick zone from parameters if present
|
|
if zoneParameterPresent {
|
|
if numReplicas > 1 {
|
|
return nil, fmt.Errorf("zone cannot be specified if desired number of replicas for pv is greather than 1. Please specify zones or allowedTopologies to specify desired zones")
|
|
}
|
|
return sets.NewString(zoneParameter), nil
|
|
}
|
|
|
|
if zonesParameterPresent {
|
|
if uint32(zonesParameter.Len()) < numReplicas {
|
|
return nil, fmt.Errorf("not enough zones found in zones parameter to provision a volume with %d replicas. Found %d zones, need %d zones", numReplicas, zonesParameter.Len(), numReplicas)
|
|
}
|
|
// directly choose from zones parameter; no zone from node need to be considered
|
|
return ChooseZonesForVolume(zonesParameter, pvcName, numReplicas), nil
|
|
}
|
|
|
|
// pick zone from zones with nodes
|
|
if zonesWithNodes.Len() > 0 {
|
|
// If node != null (and thus zoneFromNode != ""), zoneFromNode will be member of zonesWithNodes
|
|
zones, err := chooseZonesForVolumeIncludingZone(zonesWithNodes, pvcName, zoneFromNode, numReplicas)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("cannot process zones where nodes exist in the cluster: %v", err)
|
|
}
|
|
return zones, nil
|
|
}
|
|
return nil, fmt.Errorf("cannot determine zones to provision volume in")
|
|
}
|
|
|
|
// ZonesFromAllowedTopologies returns a list of zones specified in allowedTopologies
|
|
func ZonesFromAllowedTopologies(allowedTopologies []v1.TopologySelectorTerm) (sets.String, error) {
|
|
zones := make(sets.String)
|
|
for _, term := range allowedTopologies {
|
|
for _, exp := range term.MatchLabelExpressions {
|
|
if exp.Key == kubeletapis.LabelZoneFailureDomain {
|
|
for _, value := range exp.Values {
|
|
zones.Insert(value)
|
|
}
|
|
} else {
|
|
return nil, fmt.Errorf("unsupported key found in matchLabelExpressions: %s", exp.Key)
|
|
}
|
|
}
|
|
}
|
|
return zones, nil
|
|
}
|
|
|
|
// ZonesSetToLabelValue converts zones set to label value
|
|
func ZonesSetToLabelValue(strSet sets.String) string {
|
|
return strings.Join(strSet.UnsortedList(), kubeletapis.LabelMultiZoneDelimiter)
|
|
}
|
|
|
|
// ZonesToSet converts a string containing a comma separated list of zones to set
|
|
func ZonesToSet(zonesString string) (sets.String, error) {
|
|
zones, err := stringToSet(zonesString, ",")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error parsing zones %s, must be strings separated by commas: %v", zonesString, err)
|
|
}
|
|
return zones, nil
|
|
}
|
|
|
|
// LabelZonesToSet converts a PV label value from string containing a delimited list of zones to set
|
|
func LabelZonesToSet(labelZonesValue string) (sets.String, error) {
|
|
return stringToSet(labelZonesValue, kubeletapis.LabelMultiZoneDelimiter)
|
|
}
|
|
|
|
// StringToSet converts a string containing list separated by specified delimiter to a set
|
|
func stringToSet(str, delimiter string) (sets.String, error) {
|
|
zonesSlice := strings.Split(str, delimiter)
|
|
zonesSet := make(sets.String)
|
|
for _, zone := range zonesSlice {
|
|
trimmedZone := strings.TrimSpace(zone)
|
|
if trimmedZone == "" {
|
|
return make(sets.String), fmt.Errorf(
|
|
"%q separated list (%q) must not contain an empty string",
|
|
delimiter,
|
|
str)
|
|
}
|
|
zonesSet.Insert(trimmedZone)
|
|
}
|
|
return zonesSet, nil
|
|
}
|
|
|
|
// LabelZonesToList converts a PV label value from string containing a delimited list of zones to list
|
|
func LabelZonesToList(labelZonesValue string) ([]string, error) {
|
|
return stringToList(labelZonesValue, kubeletapis.LabelMultiZoneDelimiter)
|
|
}
|
|
|
|
// StringToList converts a string containing list separated by specified delimiter to a list
|
|
func stringToList(str, delimiter string) ([]string, error) {
|
|
zonesSlice := make([]string, 0)
|
|
for _, zone := range strings.Split(str, delimiter) {
|
|
trimmedZone := strings.TrimSpace(zone)
|
|
if trimmedZone == "" {
|
|
return nil, fmt.Errorf(
|
|
"%q separated list (%q) must not contain an empty string",
|
|
delimiter,
|
|
str)
|
|
}
|
|
zonesSlice = append(zonesSlice, trimmedZone)
|
|
}
|
|
return zonesSlice, nil
|
|
}
|
|
|
|
// CalculateTimeoutForVolume calculates time for a Recycler pod to complete a
|
|
// recycle operation. The calculation and return value is either the
|
|
// minimumTimeout or the timeoutIncrement per Gi of storage size, whichever is
|
|
// greater.
|
|
func CalculateTimeoutForVolume(minimumTimeout, timeoutIncrement int, pv *v1.PersistentVolume) int64 {
|
|
giQty := resource.MustParse("1Gi")
|
|
pvQty := pv.Spec.Capacity[v1.ResourceStorage]
|
|
giSize := giQty.Value()
|
|
pvSize := pvQty.Value()
|
|
timeout := (pvSize / giSize) * int64(timeoutIncrement)
|
|
if timeout < int64(minimumTimeout) {
|
|
return int64(minimumTimeout)
|
|
}
|
|
return timeout
|
|
}
|
|
|
|
// RoundUpSize calculates how many allocation units are needed to accommodate
|
|
// a volume of given size. E.g. when user wants 1500MiB volume, while AWS EBS
|
|
// allocates volumes in gibibyte-sized chunks,
|
|
// RoundUpSize(1500 * 1024*1024, 1024*1024*1024) returns '2'
|
|
// (2 GiB is the smallest allocatable volume that can hold 1500MiB)
|
|
func RoundUpSize(volumeSizeBytes int64, allocationUnitBytes int64) int64 {
|
|
roundedUp := volumeSizeBytes / allocationUnitBytes
|
|
if volumeSizeBytes%allocationUnitBytes > 0 {
|
|
roundedUp++
|
|
}
|
|
return roundedUp
|
|
}
|
|
|
|
// RoundUpToGB rounds up given quantity to chunks of GB
|
|
func RoundUpToGB(size resource.Quantity) int64 {
|
|
requestBytes := size.Value()
|
|
return RoundUpSize(requestBytes, GB)
|
|
}
|
|
|
|
// RoundUpToGiB rounds up given quantity upto chunks of GiB
|
|
func RoundUpToGiB(size resource.Quantity) int64 {
|
|
requestBytes := size.Value()
|
|
return RoundUpSize(requestBytes, GIB)
|
|
}
|
|
|
|
// RoundUpSizeInt calculates how many allocation units are needed to accommodate
|
|
// a volume of given size. It returns an int instead of an int64 and an error if
|
|
// there's overflow
|
|
func RoundUpSizeInt(volumeSizeBytes int64, allocationUnitBytes int64) (int, error) {
|
|
roundedUp := RoundUpSize(volumeSizeBytes, allocationUnitBytes)
|
|
roundedUpInt := int(roundedUp)
|
|
if int64(roundedUpInt) != roundedUp {
|
|
return 0, fmt.Errorf("capacity %v is too great, casting results in integer overflow", roundedUp)
|
|
}
|
|
return roundedUpInt, nil
|
|
}
|
|
|
|
// RoundUpToGBInt rounds up given quantity to chunks of GB. It returns an
|
|
// int instead of an int64 and an error if there's overflow
|
|
func RoundUpToGBInt(size resource.Quantity) (int, error) {
|
|
requestBytes := size.Value()
|
|
return RoundUpSizeInt(requestBytes, GB)
|
|
}
|
|
|
|
// RoundUpToGiBInt rounds up given quantity upto chunks of GiB. It returns an
|
|
// int instead of an int64 and an error if there's overflow
|
|
func RoundUpToGiBInt(size resource.Quantity) (int, error) {
|
|
requestBytes := size.Value()
|
|
return RoundUpSizeInt(requestBytes, GIB)
|
|
}
|
|
|
|
// GenerateVolumeName returns a PV name with clusterName prefix. The function
|
|
// should be used to generate a name of GCE PD or Cinder volume. It basically
|
|
// adds "<clusterName>-dynamic-" before the PV name, making sure the resulting
|
|
// string fits given length and cuts "dynamic" if not.
|
|
func GenerateVolumeName(clusterName, pvName string, maxLength int) string {
|
|
prefix := clusterName + "-dynamic"
|
|
pvLen := len(pvName)
|
|
|
|
// cut the "<clusterName>-dynamic" to fit full pvName into maxLength
|
|
// +1 for the '-' dash
|
|
if pvLen+1+len(prefix) > maxLength {
|
|
prefix = prefix[:maxLength-pvLen-1]
|
|
}
|
|
return prefix + "-" + pvName
|
|
}
|
|
|
|
// GetPath checks if the path from the mounter is empty.
|
|
func GetPath(mounter volume.Mounter) (string, error) {
|
|
path := mounter.GetPath()
|
|
if path == "" {
|
|
return "", fmt.Errorf("Path is empty %s", reflect.TypeOf(mounter).String())
|
|
}
|
|
return path, nil
|
|
}
|
|
|
|
// ChooseZoneForVolume implements our heuristics for choosing a zone for volume creation based on the volume name
|
|
// Volumes are generally round-robin-ed across all active zones, using the hash of the PVC Name.
|
|
// However, if the PVCName ends with `-<integer>`, we will hash the prefix, and then add the integer to the hash.
|
|
// This means that a StatefulSet's volumes (`claimname-statefulsetname-id`) will spread across available zones,
|
|
// assuming the id values are consecutive.
|
|
func ChooseZoneForVolume(zones sets.String, pvcName string) string {
|
|
// No zones available, return empty string.
|
|
if zones.Len() == 0 {
|
|
return ""
|
|
}
|
|
|
|
// We create the volume in a zone determined by the name
|
|
// Eventually the scheduler will coordinate placement into an available zone
|
|
hash, index := getPVCNameHashAndIndexOffset(pvcName)
|
|
|
|
// Zones.List returns zones in a consistent order (sorted)
|
|
// We do have a potential failure case where volumes will not be properly spread,
|
|
// if the set of zones changes during StatefulSet volume creation. However, this is
|
|
// probably relatively unlikely because we expect the set of zones to be essentially
|
|
// static for clusters.
|
|
// Hopefully we can address this problem if/when we do full scheduler integration of
|
|
// PVC placement (which could also e.g. avoid putting volumes in overloaded or
|
|
// unhealthy zones)
|
|
zoneSlice := zones.List()
|
|
zone := zoneSlice[(hash+index)%uint32(len(zoneSlice))]
|
|
|
|
klog.V(2).Infof("Creating volume for PVC %q; chose zone=%q from zones=%q", pvcName, zone, zoneSlice)
|
|
return zone
|
|
}
|
|
|
|
// chooseZonesForVolumeIncludingZone is a wrapper around ChooseZonesForVolume that ensures zoneToInclude is chosen
|
|
// zoneToInclude can either be empty in which case it is ignored. If non-empty, zoneToInclude is expected to be member of zones.
|
|
// numReplicas is expected to be > 0 and <= zones.Len()
|
|
func chooseZonesForVolumeIncludingZone(zones sets.String, pvcName, zoneToInclude string, numReplicas uint32) (sets.String, error) {
|
|
if numReplicas == 0 {
|
|
return nil, fmt.Errorf("invalid number of replicas passed")
|
|
}
|
|
if uint32(zones.Len()) < numReplicas {
|
|
return nil, fmt.Errorf("not enough zones found to provision a volume with %d replicas. Need at least %d distinct zones for a volume with %d replicas", numReplicas, numReplicas, numReplicas)
|
|
}
|
|
if zoneToInclude != "" && !zones.Has(zoneToInclude) {
|
|
return nil, fmt.Errorf("zone to be included: %s needs to be member of set: %v", zoneToInclude, zones)
|
|
}
|
|
if uint32(zones.Len()) == numReplicas {
|
|
return zones, nil
|
|
}
|
|
if zoneToInclude != "" {
|
|
zones.Delete(zoneToInclude)
|
|
numReplicas = numReplicas - 1
|
|
}
|
|
zonesChosen := ChooseZonesForVolume(zones, pvcName, numReplicas)
|
|
if zoneToInclude != "" {
|
|
zonesChosen.Insert(zoneToInclude)
|
|
}
|
|
return zonesChosen, nil
|
|
}
|
|
|
|
// ChooseZonesForVolume is identical to ChooseZoneForVolume, but selects a multiple zones, for multi-zone disks.
|
|
func ChooseZonesForVolume(zones sets.String, pvcName string, numZones uint32) sets.String {
|
|
// No zones available, return empty set.
|
|
replicaZones := sets.NewString()
|
|
if zones.Len() == 0 {
|
|
return replicaZones
|
|
}
|
|
|
|
// We create the volume in a zone determined by the name
|
|
// Eventually the scheduler will coordinate placement into an available zone
|
|
hash, index := getPVCNameHashAndIndexOffset(pvcName)
|
|
|
|
// Zones.List returns zones in a consistent order (sorted)
|
|
// We do have a potential failure case where volumes will not be properly spread,
|
|
// if the set of zones changes during StatefulSet volume creation. However, this is
|
|
// probably relatively unlikely because we expect the set of zones to be essentially
|
|
// static for clusters.
|
|
// Hopefully we can address this problem if/when we do full scheduler integration of
|
|
// PVC placement (which could also e.g. avoid putting volumes in overloaded or
|
|
// unhealthy zones)
|
|
zoneSlice := zones.List()
|
|
|
|
startingIndex := index * numZones
|
|
for index = startingIndex; index < startingIndex+numZones; index++ {
|
|
zone := zoneSlice[(hash+index)%uint32(len(zoneSlice))]
|
|
replicaZones.Insert(zone)
|
|
}
|
|
|
|
klog.V(2).Infof("Creating volume for replicated PVC %q; chosen zones=%q from zones=%q",
|
|
pvcName, replicaZones.UnsortedList(), zoneSlice)
|
|
return replicaZones
|
|
}
|
|
|
|
func getPVCNameHashAndIndexOffset(pvcName string) (hash uint32, index uint32) {
|
|
if pvcName == "" {
|
|
// We should always be called with a name; this shouldn't happen
|
|
klog.Warningf("No name defined during volume create; choosing random zone")
|
|
|
|
hash = rand.Uint32()
|
|
} else {
|
|
hashString := pvcName
|
|
|
|
// Heuristic to make sure that volumes in a StatefulSet are spread across zones
|
|
// StatefulSet PVCs are (currently) named ClaimName-StatefulSetName-Id,
|
|
// where Id is an integer index.
|
|
// Note though that if a StatefulSet pod has multiple claims, we need them to be
|
|
// in the same zone, because otherwise the pod will be unable to mount both volumes,
|
|
// and will be unschedulable. So we hash _only_ the "StatefulSetName" portion when
|
|
// it looks like `ClaimName-StatefulSetName-Id`.
|
|
// We continue to round-robin volume names that look like `Name-Id` also; this is a useful
|
|
// feature for users that are creating statefulset-like functionality without using statefulsets.
|
|
lastDash := strings.LastIndexByte(pvcName, '-')
|
|
if lastDash != -1 {
|
|
statefulsetIDString := pvcName[lastDash+1:]
|
|
statefulsetID, err := strconv.ParseUint(statefulsetIDString, 10, 32)
|
|
if err == nil {
|
|
// Offset by the statefulsetID, so we round-robin across zones
|
|
index = uint32(statefulsetID)
|
|
// We still hash the volume name, but only the prefix
|
|
hashString = pvcName[:lastDash]
|
|
|
|
// In the special case where it looks like `ClaimName-StatefulSetName-Id`,
|
|
// hash only the StatefulSetName, so that different claims on the same StatefulSet
|
|
// member end up in the same zone.
|
|
// Note that StatefulSetName (and ClaimName) might themselves both have dashes.
|
|
// We actually just take the portion after the final - of ClaimName-StatefulSetName.
|
|
// For our purposes it doesn't much matter (just suboptimal spreading).
|
|
lastDash := strings.LastIndexByte(hashString, '-')
|
|
if lastDash != -1 {
|
|
hashString = hashString[lastDash+1:]
|
|
}
|
|
|
|
klog.V(2).Infof("Detected StatefulSet-style volume name %q; index=%d", pvcName, index)
|
|
}
|
|
}
|
|
|
|
// We hash the (base) volume name, so we don't bias towards the first N zones
|
|
h := fnv.New32()
|
|
h.Write([]byte(hashString))
|
|
hash = h.Sum32()
|
|
}
|
|
|
|
return hash, index
|
|
}
|
|
|
|
// UnmountViaEmptyDir delegates the tear down operation for secret, configmap, git_repo and downwardapi
|
|
// to empty_dir
|
|
func UnmountViaEmptyDir(dir string, host volume.VolumeHost, volName string, volSpec volume.Spec, podUID utypes.UID) error {
|
|
klog.V(3).Infof("Tearing down volume %v for pod %v at %v", volName, podUID, dir)
|
|
|
|
// Wrap EmptyDir, let it do the teardown.
|
|
wrapped, err := host.NewWrapperUnmounter(volName, volSpec, podUID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return wrapped.TearDownAt(dir)
|
|
}
|
|
|
|
// MountOptionFromSpec extracts and joins mount options from volume spec with supplied options
|
|
func MountOptionFromSpec(spec *volume.Spec, options ...string) []string {
|
|
pv := spec.PersistentVolume
|
|
|
|
if pv != nil {
|
|
// Use beta annotation first
|
|
if mo, ok := pv.Annotations[v1.MountOptionAnnotation]; ok {
|
|
moList := strings.Split(mo, ",")
|
|
return JoinMountOptions(moList, options)
|
|
}
|
|
|
|
if len(pv.Spec.MountOptions) > 0 {
|
|
return JoinMountOptions(pv.Spec.MountOptions, options)
|
|
}
|
|
}
|
|
|
|
return options
|
|
}
|
|
|
|
// JoinMountOptions joins mount options eliminating duplicates
|
|
func JoinMountOptions(userOptions []string, systemOptions []string) []string {
|
|
allMountOptions := sets.NewString()
|
|
|
|
for _, mountOption := range userOptions {
|
|
if len(mountOption) > 0 {
|
|
allMountOptions.Insert(mountOption)
|
|
}
|
|
}
|
|
|
|
for _, mountOption := range systemOptions {
|
|
allMountOptions.Insert(mountOption)
|
|
}
|
|
return allMountOptions.List()
|
|
}
|
|
|
|
// ValidateZone returns:
|
|
// - an error in case zone is an empty string or contains only any combination of spaces and tab characters
|
|
// - nil otherwise
|
|
func ValidateZone(zone string) error {
|
|
if strings.TrimSpace(zone) == "" {
|
|
return fmt.Errorf("the provided %q zone is not valid, it's an empty string or contains only spaces and tab characters", zone)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// AccessModesContains returns whether the requested mode is contained by modes
|
|
func AccessModesContains(modes []v1.PersistentVolumeAccessMode, mode v1.PersistentVolumeAccessMode) bool {
|
|
for _, m := range modes {
|
|
if m == mode {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// AccessModesContainedInAll returns whether all of the requested modes are contained by modes
|
|
func AccessModesContainedInAll(indexedModes []v1.PersistentVolumeAccessMode, requestedModes []v1.PersistentVolumeAccessMode) bool {
|
|
for _, mode := range requestedModes {
|
|
if !AccessModesContains(indexedModes, mode) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// GetWindowsPath get a windows path
|
|
func GetWindowsPath(path string) string {
|
|
windowsPath := strings.Replace(path, "/", "\\", -1)
|
|
if strings.HasPrefix(windowsPath, "\\") {
|
|
windowsPath = "c:" + windowsPath
|
|
}
|
|
return windowsPath
|
|
}
|
|
|
|
// GetUniquePodName returns a unique identifier to reference a pod by
|
|
func GetUniquePodName(pod *v1.Pod) types.UniquePodName {
|
|
return types.UniquePodName(pod.UID)
|
|
}
|
|
|
|
// GetUniqueVolumeName returns a unique name representing the volume/plugin.
|
|
// Caller should ensure that volumeName is a name/ID uniquely identifying the
|
|
// actual backing device, directory, path, etc. for a particular volume.
|
|
// The returned name can be used to uniquely reference the volume, for example,
|
|
// to prevent operations (attach/detach or mount/unmount) from being triggered
|
|
// on the same volume.
|
|
func GetUniqueVolumeName(pluginName, volumeName string) v1.UniqueVolumeName {
|
|
return v1.UniqueVolumeName(fmt.Sprintf("%s/%s", pluginName, volumeName))
|
|
}
|
|
|
|
// GetUniqueVolumeNameFromSpecWithPod returns a unique volume name with pod
|
|
// name included. This is useful to generate different names for different pods
|
|
// on same volume.
|
|
func GetUniqueVolumeNameFromSpecWithPod(
|
|
podName types.UniquePodName, volumePlugin volume.VolumePlugin, volumeSpec *volume.Spec) v1.UniqueVolumeName {
|
|
return v1.UniqueVolumeName(
|
|
fmt.Sprintf("%s/%v-%s", volumePlugin.GetPluginName(), podName, volumeSpec.Name()))
|
|
}
|
|
|
|
// GetUniqueVolumeNameFromSpec uses the given VolumePlugin to generate a unique
|
|
// name representing the volume defined in the specified volume spec.
|
|
// This returned name can be used to uniquely reference the actual backing
|
|
// device, directory, path, etc. referenced by the given volumeSpec.
|
|
// If the given plugin does not support the volume spec, this returns an error.
|
|
func GetUniqueVolumeNameFromSpec(
|
|
volumePlugin volume.VolumePlugin,
|
|
volumeSpec *volume.Spec) (v1.UniqueVolumeName, error) {
|
|
if volumePlugin == nil {
|
|
return "", fmt.Errorf(
|
|
"volumePlugin should not be nil. volumeSpec.Name=%q",
|
|
volumeSpec.Name())
|
|
}
|
|
|
|
volumeName, err := volumePlugin.GetVolumeName(volumeSpec)
|
|
if err != nil || volumeName == "" {
|
|
return "", fmt.Errorf(
|
|
"failed to GetVolumeName from volumePlugin for volumeSpec %q err=%v",
|
|
volumeSpec.Name(),
|
|
err)
|
|
}
|
|
|
|
return GetUniqueVolumeName(
|
|
volumePlugin.GetPluginName(),
|
|
volumeName),
|
|
nil
|
|
}
|
|
|
|
// IsPodTerminated checks if pod is terminated
|
|
func IsPodTerminated(pod *v1.Pod, podStatus v1.PodStatus) bool {
|
|
return podStatus.Phase == v1.PodFailed || podStatus.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(podStatus.ContainerStatuses))
|
|
}
|
|
|
|
// notRunning returns true if every status is terminated or waiting, or the status list
|
|
// is empty.
|
|
func notRunning(statuses []v1.ContainerStatus) bool {
|
|
for _, status := range statuses {
|
|
if status.State.Terminated == nil && status.State.Waiting == nil {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// SplitUniqueName splits the unique name to plugin name and volume name strings. It expects the uniqueName to follow
|
|
// the format plugin_name/volume_name and the plugin name must be namespaced as described by the plugin interface,
|
|
// i.e. namespace/plugin containing exactly one '/'. This means the unique name will always be in the form of
|
|
// plugin_namespace/plugin/volume_name, see k8s.io/kubernetes/pkg/volume/plugins.go VolumePlugin interface
|
|
// description and pkg/volume/util/volumehelper/volumehelper.go GetUniqueVolumeNameFromSpec that constructs
|
|
// the unique volume names.
|
|
func SplitUniqueName(uniqueName v1.UniqueVolumeName) (string, string, error) {
|
|
components := strings.SplitN(string(uniqueName), "/", 3)
|
|
if len(components) != 3 {
|
|
return "", "", fmt.Errorf("cannot split volume unique name %s to plugin/volume components", uniqueName)
|
|
}
|
|
pluginName := fmt.Sprintf("%s/%s", components[0], components[1])
|
|
return pluginName, components[2], nil
|
|
}
|
|
|
|
// NewSafeFormatAndMountFromHost creates a new SafeFormatAndMount with Mounter
|
|
// and Exec taken from given VolumeHost.
|
|
func NewSafeFormatAndMountFromHost(pluginName string, host volume.VolumeHost) *mount.SafeFormatAndMount {
|
|
mounter := host.GetMounter(pluginName)
|
|
exec := host.GetExec(pluginName)
|
|
return &mount.SafeFormatAndMount{Interface: mounter, Exec: exec}
|
|
}
|
|
|
|
// GetVolumeMode retrieves VolumeMode from pv.
|
|
// If the volume doesn't have PersistentVolume, it's an inline volume,
|
|
// should return volumeMode as filesystem to keep existing behavior.
|
|
func GetVolumeMode(volumeSpec *volume.Spec) (v1.PersistentVolumeMode, error) {
|
|
if volumeSpec == nil || volumeSpec.PersistentVolume == nil {
|
|
return v1.PersistentVolumeFilesystem, nil
|
|
}
|
|
if volumeSpec.PersistentVolume.Spec.VolumeMode != nil {
|
|
return *volumeSpec.PersistentVolume.Spec.VolumeMode, nil
|
|
}
|
|
return "", fmt.Errorf("cannot get volumeMode for volume: %v", volumeSpec.Name())
|
|
}
|
|
|
|
// GetPersistentVolumeClaimVolumeMode retrieves VolumeMode from pvc.
|
|
func GetPersistentVolumeClaimVolumeMode(claim *v1.PersistentVolumeClaim) (v1.PersistentVolumeMode, error) {
|
|
if claim.Spec.VolumeMode != nil {
|
|
return *claim.Spec.VolumeMode, nil
|
|
}
|
|
return "", fmt.Errorf("cannot get volumeMode from pvc: %v", claim.Name)
|
|
}
|
|
|
|
// GetPersistentVolumeClaimQualifiedName returns a qualified name for pvc.
|
|
func GetPersistentVolumeClaimQualifiedName(claim *v1.PersistentVolumeClaim) string {
|
|
return utilstrings.JoinQualifiedName(claim.GetNamespace(), claim.GetName())
|
|
}
|
|
|
|
// CheckVolumeModeFilesystem checks VolumeMode.
|
|
// If the mode is Filesystem, return true otherwise return false.
|
|
func CheckVolumeModeFilesystem(volumeSpec *volume.Spec) (bool, error) {
|
|
if utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {
|
|
volumeMode, err := GetVolumeMode(volumeSpec)
|
|
if err != nil {
|
|
return true, err
|
|
}
|
|
if volumeMode == v1.PersistentVolumeBlock {
|
|
return false, nil
|
|
}
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
// CheckPersistentVolumeClaimModeBlock checks VolumeMode.
|
|
// If the mode is Block, return true otherwise return false.
|
|
func CheckPersistentVolumeClaimModeBlock(pvc *v1.PersistentVolumeClaim) bool {
|
|
return utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) && pvc.Spec.VolumeMode != nil && *pvc.Spec.VolumeMode == v1.PersistentVolumeBlock
|
|
}
|
|
|
|
// IsWindowsUNCPath checks if path is prefixed with \\
|
|
// This can be used to skip any processing of paths
|
|
// that point to SMB shares, local named pipes and local UNC path
|
|
func IsWindowsUNCPath(goos, path string) bool {
|
|
if goos != "windows" {
|
|
return false
|
|
}
|
|
// Check for UNC prefix \\
|
|
if strings.HasPrefix(path, `\\`) {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// IsWindowsLocalPath checks if path is a local path
|
|
// prefixed with "/" or "\" like "/foo/bar" or "\foo\bar"
|
|
func IsWindowsLocalPath(goos, path string) bool {
|
|
if goos != "windows" {
|
|
return false
|
|
}
|
|
if IsWindowsUNCPath(goos, path) {
|
|
return false
|
|
}
|
|
if strings.Contains(path, ":") {
|
|
return false
|
|
}
|
|
if !(strings.HasPrefix(path, `/`) || strings.HasPrefix(path, `\`)) {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// MakeAbsolutePath convert path to absolute path according to GOOS
|
|
func MakeAbsolutePath(goos, path string) string {
|
|
if goos != "windows" {
|
|
return filepath.Clean("/" + path)
|
|
}
|
|
// These are all for windows
|
|
// If there is a colon, give up.
|
|
if strings.Contains(path, ":") {
|
|
return path
|
|
}
|
|
// If there is a slash, but no drive, add 'c:'
|
|
if strings.HasPrefix(path, "/") || strings.HasPrefix(path, "\\") {
|
|
return "c:" + path
|
|
}
|
|
// Otherwise, add 'c:\'
|
|
return "c:\\" + path
|
|
}
|
|
|
|
// MapBlockVolume is a utility function to provide a common way of mounting
|
|
// block device path for a specified volume and pod. This function should be
|
|
// called by volume plugins that implements volume.BlockVolumeMapper.Map() method.
|
|
func MapBlockVolume(
|
|
devicePath,
|
|
globalMapPath,
|
|
podVolumeMapPath,
|
|
volumeMapName string,
|
|
podUID utypes.UID,
|
|
) error {
|
|
blkUtil := volumepathhandler.NewBlockVolumePathHandler()
|
|
|
|
// map devicePath to global node path
|
|
mapErr := blkUtil.MapDevice(devicePath, globalMapPath, string(podUID))
|
|
if mapErr != nil {
|
|
return mapErr
|
|
}
|
|
|
|
// map devicePath to pod volume path
|
|
mapErr = blkUtil.MapDevice(devicePath, podVolumeMapPath, volumeMapName)
|
|
if mapErr != nil {
|
|
return mapErr
|
|
}
|
|
|
|
return nil
|
|
}
|