/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package core

import (
	"fmt"
	"strings"
	"time"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"

	"k8s.io/apimachinery/pkg/util/clock"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apiserver/pkg/admission"
	api "k8s.io/kubernetes/pkg/apis/core"
	"k8s.io/kubernetes/pkg/apis/core/helper"
	"k8s.io/kubernetes/pkg/apis/core/helper/qos"
	k8s_api_v1 "k8s.io/kubernetes/pkg/apis/core/v1"
	"k8s.io/kubernetes/pkg/kubeapiserver/admission/util"
	"k8s.io/kubernetes/pkg/quota"
	"k8s.io/kubernetes/pkg/quota/generic"
)

// the name used for object count quota
var podObjectCountName = generic.ObjectCountQuotaResourceNameFor(v1.SchemeGroupVersion.WithResource("pods").GroupResource())

// podResources are the set of resources managed by quota associated with pods.
var podResources = []api.ResourceName{
	podObjectCountName,
	api.ResourceCPU,
	api.ResourceMemory,
	api.ResourceEphemeralStorage,
	api.ResourceRequestsCPU,
	api.ResourceRequestsMemory,
	api.ResourceRequestsEphemeralStorage,
	api.ResourceLimitsCPU,
	api.ResourceLimitsMemory,
	api.ResourceLimitsEphemeralStorage,
	api.ResourcePods,
}

// podResourcePrefixes are the set of prefixes for resources (hugepages, and other
// potential extended resources with a specific prefix) managed by quota associated with pods.
var podResourcePrefixes = []string{
	api.ResourceHugePagesPrefix,
	api.ResourceRequestsHugePagesPrefix,
}

// requestedResourcePrefixes are the set of prefixes for resources
// that might be declared in a pod's Resources.Requests/Limits
var requestedResourcePrefixes = []string{
	api.ResourceHugePagesPrefix,
}

// maskResourceWithPrefix masks a resource with a certain prefix
// e.g. hugepages-XXX -> requests.hugepages-XXX
func maskResourceWithPrefix(resource api.ResourceName, prefix string) api.ResourceName {
	return api.ResourceName(fmt.Sprintf("%s%s", prefix, string(resource)))
}

// isExtendedResourceNameForQuota returns true if the extended resource name
// has the quota related resource prefix.
func isExtendedResourceNameForQuota(name api.ResourceName) bool {
	// As overcommit is not supported by extended resources for now,
	// only quota objects in the format "requests.resourceName" are allowed.
	return !helper.IsNativeResource(name) && strings.HasPrefix(string(name), api.DefaultResourceRequestsPrefix)
}
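
// Illustrative sketch (not part of the upstream file, hypothetical resource
// names): only the "requests."-prefixed form of an extended resource is
// considered quotable, since overcommit is not supported for extended resources.
func exampleExtendedResourceQuotaNames() (bool, bool) {
	allowed := isExtendedResourceNameForQuota("requests.example.com/gpu") // true: has the requests. prefix
	rejected := isExtendedResourceNameForQuota("example.com/gpu")         // false: bare extended resource name
	return allowed, rejected
}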

// NOTE: it was a mistake, but if a quota tracks cpu or memory related resources,
// the incoming pod is required to have those values set. we should not repeat
// this mistake for other future resources (gpus, ephemeral-storage, etc.).
// do not add more resources to this list!
var validationSet = sets.NewString(
	string(api.ResourceCPU),
	string(api.ResourceMemory),
	string(api.ResourceRequestsCPU),
	string(api.ResourceRequestsMemory),
	string(api.ResourceLimitsCPU),
	string(api.ResourceLimitsMemory),
)

// NewPodEvaluator returns an evaluator that can evaluate pods
func NewPodEvaluator(f quota.ListerForResourceFunc, clock clock.Clock) quota.Evaluator {
	listFuncByNamespace := generic.ListResourceUsingListerFunc(f, v1.SchemeGroupVersion.WithResource("pods"))
	podEvaluator := &podEvaluator{listFuncByNamespace: listFuncByNamespace, clock: clock}
	return podEvaluator
}
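
// Illustrative sketch (not part of the upstream file): how a caller might
// construct the evaluator with a real clock. The lister function f is a
// placeholder that would normally be supplied by the quota machinery.
func exampleNewPodEvaluator(f quota.ListerForResourceFunc) quota.Evaluator {
	return NewPodEvaluator(f, clock.RealClock{})
}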

// podEvaluator knows how to measure usage of pods.
type podEvaluator struct {
	// knows how to list pods
	listFuncByNamespace generic.ListFuncByNamespace
	// used to track time
	clock clock.Clock
}

// Constraints verifies that all required resources are present on the pod
// In addition, it validates that the resources are valid (i.e. requests < limits)
func (p *podEvaluator) Constraints(required []api.ResourceName, item runtime.Object) error {
	pod, ok := item.(*api.Pod)
	if !ok {
		return fmt.Errorf("unexpected input object %v", item)
	}

	// BACKWARD COMPATIBILITY REQUIREMENT: if we quota cpu or memory, then each container
	// must make an explicit request for the resource. this was a mistake. it coupled
	// validation with resource counting, but we did this before QoS was even defined.
	// let's not make that mistake again with other resources now that QoS is defined.
	requiredSet := quota.ToSet(required).Intersection(validationSet)
	missingSet := sets.NewString()
	for i := range pod.Spec.Containers {
		enforcePodContainerConstraints(&pod.Spec.Containers[i], requiredSet, missingSet)
	}
	for i := range pod.Spec.InitContainers {
		enforcePodContainerConstraints(&pod.Spec.InitContainers[i], requiredSet, missingSet)
	}
	if len(missingSet) == 0 {
		return nil
	}
	return fmt.Errorf("must specify %s", strings.Join(missingSet.List(), ","))
}
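
// Illustrative sketch (not part of the upstream file, hypothetical pod): when a
// quota tracks cpu, a container without an explicit cpu request or limit fails
// the Constraints check with an error such as "must specify cpu".
func exampleConstraintsViolation() error {
	e := &podEvaluator{}
	pod := &api.Pod{
		Spec: api.PodSpec{
			Containers: []api.Container{
				{Name: "app"}, // no cpu request or limit set
			},
		},
	}
	return e.Constraints([]api.ResourceName{api.ResourceCPU}, pod)
}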

// GroupResource that this evaluator tracks
func (p *podEvaluator) GroupResource() schema.GroupResource {
	return v1.SchemeGroupVersion.WithResource("pods").GroupResource()
}

// Handles returns true if the evaluator should handle the specified attributes.
func (p *podEvaluator) Handles(a admission.Attributes) bool {
	op := a.GetOperation()
	if op == admission.Create {
		return true
	}
	initializationCompletion, err := util.IsInitializationCompletion(a)
	if err != nil {
		// fail closed, will try to give an evaluation.
		utilruntime.HandleError(err)
		return true
	}
	// only uninitialized pods might be updated.
	return initializationCompletion
}

// Matches returns true if the evaluator matches the specified quota with the provided input item
func (p *podEvaluator) Matches(resourceQuota *api.ResourceQuota, item runtime.Object) (bool, error) {
	return generic.Matches(resourceQuota, item, p.MatchingResources, podMatchesScopeFunc)
}

// MatchingResources takes the input specified list of resources and returns the set of resources it matches.
func (p *podEvaluator) MatchingResources(input []api.ResourceName) []api.ResourceName {
	result := quota.Intersection(input, podResources)
	for _, resource := range input {
		// for resources with certain prefix, e.g. hugepages
		if quota.ContainsPrefix(podResourcePrefixes, resource) {
			result = append(result, resource)
		}
		// for extended resources
		if isExtendedResourceNameForQuota(resource) {
			result = append(result, resource)
		}
	}

	return result
}
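
// Illustrative sketch (not part of the upstream file, hypothetical resource
// names): MatchingResources keeps pod compute resources, hugepages-prefixed
// resources, and "requests."-prefixed extended resources, and drops the rest.
func exampleMatchingResources() []api.ResourceName {
	e := &podEvaluator{}
	input := []api.ResourceName{
		api.ResourceCPU,            // kept: listed in podResources
		"requests.hugepages-2Mi",   // kept: matches a pod resource prefix
		"requests.example.com/gpu", // kept: quotable extended resource
		api.ResourceServices,       // dropped: not a pod-related resource
	}
	return e.MatchingResources(input)
}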

// MatchingScopes takes the input specified list of scopes and pod object. Returns the set of scope selectors the pod matches.
func (p *podEvaluator) MatchingScopes(item runtime.Object, scopeSelectors []api.ScopedResourceSelectorRequirement) ([]api.ScopedResourceSelectorRequirement, error) {
	matchedScopes := []api.ScopedResourceSelectorRequirement{}
	for _, selector := range scopeSelectors {
		match, err := podMatchesScopeFunc(selector, item)
		if err != nil {
			return []api.ScopedResourceSelectorRequirement{}, fmt.Errorf("error on matching scope %v: %v", selector, err)
		}
		if match {
			matchedScopes = append(matchedScopes, selector)
		}
	}
	return matchedScopes, nil
}

// UncoveredQuotaScopes takes the input matched scopes which are limited by configuration and the matched quota scopes.
// It returns the scopes which are in limited scopes but don't have a corresponding covering quota scope
func (p *podEvaluator) UncoveredQuotaScopes(limitedScopes []api.ScopedResourceSelectorRequirement, matchedQuotaScopes []api.ScopedResourceSelectorRequirement) ([]api.ScopedResourceSelectorRequirement, error) {
	uncoveredScopes := []api.ScopedResourceSelectorRequirement{}
	for _, selector := range limitedScopes {
		isCovered := false
		for _, matchedScopeSelector := range matchedQuotaScopes {
			if matchedScopeSelector.ScopeName == selector.ScopeName {
				isCovered = true
				break
			}
		}

		if !isCovered {
			uncoveredScopes = append(uncoveredScopes, selector)
		}
	}
	return uncoveredScopes, nil
}

// Usage knows how to measure usage associated with pods
func (p *podEvaluator) Usage(item runtime.Object) (api.ResourceList, error) {
	// delegate to normal usage
	return PodUsageFunc(item, p.clock)
}

// UsageStats calculates aggregate usage for the object.
func (p *podEvaluator) UsageStats(options quota.UsageStatsOptions) (quota.UsageStats, error) {
	return generic.CalculateUsageStats(options, p.listFuncByNamespace, podMatchesScopeFunc, p.Usage)
}

// verifies we implement the required interface.
var _ quota.Evaluator = &podEvaluator{}

// enforcePodContainerConstraints checks for required resources that are not set on this container and
// adds them to missingSet.
func enforcePodContainerConstraints(container *api.Container, requiredSet, missingSet sets.String) {
	requests := container.Resources.Requests
	limits := container.Resources.Limits
	containerUsage := podComputeUsageHelper(requests, limits)
	containerSet := quota.ToSet(quota.ResourceNames(containerUsage))
	if !containerSet.Equal(requiredSet) {
		difference := requiredSet.Difference(containerSet)
		missingSet.Insert(difference.List()...)
	}
}

// podComputeUsageHelper can summarize the pod compute quota usage based on requests and limits
func podComputeUsageHelper(requests api.ResourceList, limits api.ResourceList) api.ResourceList {
	result := api.ResourceList{}
	result[api.ResourcePods] = resource.MustParse("1")
	if request, found := requests[api.ResourceCPU]; found {
		result[api.ResourceCPU] = request
		result[api.ResourceRequestsCPU] = request
	}
	if limit, found := limits[api.ResourceCPU]; found {
		result[api.ResourceLimitsCPU] = limit
	}
	if request, found := requests[api.ResourceMemory]; found {
		result[api.ResourceMemory] = request
		result[api.ResourceRequestsMemory] = request
	}
	if limit, found := limits[api.ResourceMemory]; found {
		result[api.ResourceLimitsMemory] = limit
	}
	if request, found := requests[api.ResourceEphemeralStorage]; found {
		result[api.ResourceEphemeralStorage] = request
		result[api.ResourceRequestsEphemeralStorage] = request
	}
	if limit, found := limits[api.ResourceEphemeralStorage]; found {
		result[api.ResourceLimitsEphemeralStorage] = limit
	}
	for resource, request := range requests {
		// for resources with certain prefix, e.g. hugepages
		if quota.ContainsPrefix(requestedResourcePrefixes, resource) {
			result[resource] = request
			result[maskResourceWithPrefix(resource, api.DefaultResourceRequestsPrefix)] = request
		}
		// for extended resources
		if helper.IsExtendedResourceName(resource) {
			// only quota objects in the format "requests.resourceName" are allowed for extended resources.
			result[maskResourceWithPrefix(resource, api.DefaultResourceRequestsPrefix)] = request
		}
	}

	return result
}
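
// Illustrative sketch (not part of the upstream file, hypothetical quantities):
// a container requesting 100m cpu with a 200m limit and 64Mi of memory yields
// pods=1, cpu=100m, requests.cpu=100m, limits.cpu=200m, memory=64Mi and
// requests.memory=64Mi.
func examplePodComputeUsage() api.ResourceList {
	requests := api.ResourceList{
		api.ResourceCPU:    resource.MustParse("100m"),
		api.ResourceMemory: resource.MustParse("64Mi"),
	}
	limits := api.ResourceList{
		api.ResourceCPU: resource.MustParse("200m"),
	}
	return podComputeUsageHelper(requests, limits)
}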

func toInternalPodOrError(obj runtime.Object) (*api.Pod, error) {
	pod := &api.Pod{}
	switch t := obj.(type) {
	case *v1.Pod:
		if err := k8s_api_v1.Convert_v1_Pod_To_core_Pod(t, pod, nil); err != nil {
			return nil, err
		}
	case *api.Pod:
		pod = t
	default:
		return nil, fmt.Errorf("expect *api.Pod or *v1.Pod, got %v", t)
	}
	return pod, nil
}

// podMatchesScopeFunc is a function that knows how to evaluate if a pod matches a scope
func podMatchesScopeFunc(selector api.ScopedResourceSelectorRequirement, object runtime.Object) (bool, error) {
	pod, err := toInternalPodOrError(object)
	if err != nil {
		return false, err
	}
	switch selector.ScopeName {
	case api.ResourceQuotaScopeTerminating:
		return isTerminating(pod), nil
	case api.ResourceQuotaScopeNotTerminating:
		return !isTerminating(pod), nil
	case api.ResourceQuotaScopeBestEffort:
		return isBestEffort(pod), nil
	case api.ResourceQuotaScopeNotBestEffort:
		return !isBestEffort(pod), nil
	case api.ResourceQuotaScopePriorityClass:
		return podMatchesSelector(pod, selector)
	}
	return false, nil
}
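
// Illustrative sketch (not part of the upstream file, hypothetical deadline):
// the Terminating scope matches pods that set activeDeadlineSeconds.
func exampleTerminatingScopeMatch() (bool, error) {
	deadline := int64(3600)
	pod := &api.Pod{Spec: api.PodSpec{ActiveDeadlineSeconds: &deadline}}
	return podMatchesScopeFunc(api.ScopedResourceSelectorRequirement{
		ScopeName: api.ResourceQuotaScopeTerminating,
	}, pod)
}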

// PodUsageFunc returns the quota usage for a pod.
// A pod is charged for quota unless any of the following are true:
// - pod has a terminal phase (failed or succeeded)
// - pod has been marked for deletion and grace period has expired
func PodUsageFunc(obj runtime.Object, clock clock.Clock) (api.ResourceList, error) {
	pod, err := toInternalPodOrError(obj)
	if err != nil {
		return api.ResourceList{}, err
	}

	// always quota the object count (even if the pod is end of life)
	// object count quotas track all objects that are in storage.
	// where "pods" tracks all pods that have not reached a terminal state,
	// count/pods tracks all pods independent of state.
	result := api.ResourceList{
		podObjectCountName: *(resource.NewQuantity(1, resource.DecimalSI)),
	}

	// by convention, we do not quota compute resources that have reached end-of-life
	// note: the "pods" resource is considered a compute resource since it is tied to life-cycle.
	if !QuotaPod(pod, clock) {
		return result, nil
	}

	requests := api.ResourceList{}
	limits := api.ResourceList{}
	// TODO: ideally, we have pod level requests and limits in the future.
	for i := range pod.Spec.Containers {
		requests = quota.Add(requests, pod.Spec.Containers[i].Resources.Requests)
		limits = quota.Add(limits, pod.Spec.Containers[i].Resources.Limits)
	}
	// InitContainers are run sequentially before other containers start, so the highest
	// init container resource is compared against the sum of app containers to determine
	// the effective usage for both requests and limits.
	for i := range pod.Spec.InitContainers {
		requests = quota.Max(requests, pod.Spec.InitContainers[i].Resources.Requests)
		limits = quota.Max(limits, pod.Spec.InitContainers[i].Resources.Limits)
	}

	result = quota.Add(result, podComputeUsageHelper(requests, limits))
	return result, nil
}
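
// Illustrative sketch (not part of the upstream file, hypothetical quantities):
// init containers are folded in via max, so a 200m init container outweighs a
// single 100m app container, and usage reports cpu=200m and requests.cpu=200m
// alongside pods=1 and count/pods=1.
func exampleInitContainerUsage() (api.ResourceList, error) {
	pod := &api.Pod{
		Spec: api.PodSpec{
			InitContainers: []api.Container{{
				Resources: api.ResourceRequirements{
					Requests: api.ResourceList{api.ResourceCPU: resource.MustParse("200m")},
				},
			}},
			Containers: []api.Container{{
				Resources: api.ResourceRequirements{
					Requests: api.ResourceList{api.ResourceCPU: resource.MustParse("100m")},
				},
			}},
		},
	}
	return PodUsageFunc(pod, clock.RealClock{})
}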

func isBestEffort(pod *api.Pod) bool {
	return qos.GetPodQOS(pod) == api.PodQOSBestEffort
}

func isTerminating(pod *api.Pod) bool {
	if pod.Spec.ActiveDeadlineSeconds != nil && *pod.Spec.ActiveDeadlineSeconds >= int64(0) {
		return true
	}
	return false
}

func podMatchesSelector(pod *api.Pod, selector api.ScopedResourceSelectorRequirement) (bool, error) {
	labelSelector, err := helper.ScopedResourceSelectorRequirementsAsSelector(selector)
	if err != nil {
		return false, fmt.Errorf("failed to parse and convert selector: %v", err)
	}
	var m map[string]string
	if len(pod.Spec.PriorityClassName) != 0 {
		m = map[string]string{string(api.ResourceQuotaScopePriorityClass): pod.Spec.PriorityClassName}
	}
	if labelSelector.Matches(labels.Set(m)) {
		return true, nil
	}
	return false, nil
}
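
// Illustrative sketch (not part of the upstream file, hypothetical class name):
// the PriorityClass scope treats the pod's priorityClassName as a pseudo-label
// when evaluating the scope selector.
func examplePriorityClassScopeMatch() (bool, error) {
	pod := &api.Pod{Spec: api.PodSpec{PriorityClassName: "high"}}
	return podMatchesSelector(pod, api.ScopedResourceSelectorRequirement{
		ScopeName: api.ResourceQuotaScopePriorityClass,
		Operator:  api.ScopeSelectorOpIn,
		Values:    []string{"high"},
	})
}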

// QuotaPod returns true if the pod is eligible to track against a quota
// A pod is eligible for quota, unless any of the following are true:
// - pod has a terminal phase (failed or succeeded)
// - pod has been marked for deletion and grace period has expired.
func QuotaPod(pod *api.Pod, clock clock.Clock) bool {
	// if pod is terminal, ignore it for quota
	if api.PodFailed == pod.Status.Phase || api.PodSucceeded == pod.Status.Phase {
		return false
	}
	// deleted pods that should be gone should not be charged to user quota.
	// this can happen if a node is lost, and the kubelet is never able to confirm deletion.
	// even though the cluster may have drifting clocks, quota makes a reasonable effort
	// to balance cluster needs against user needs. users do not control clocks,
	// but at worst a small drift in clocks will only slightly impact quota.
	if pod.DeletionTimestamp != nil && pod.DeletionGracePeriodSeconds != nil {
		now := clock.Now()
		deletionTime := pod.DeletionTimestamp.Time
		gracePeriod := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second
		if now.After(deletionTime.Add(gracePeriod)) {
			return false
		}
	}
	return true
}
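
// Illustrative sketch (not part of the upstream file, hypothetical pod): a pod
// in a terminal phase is no longer charged for compute quota, although the
// object count quota above still tracks it while it exists in storage.
func exampleQuotaPodTerminal() bool {
	pod := &api.Pod{Status: api.PodStatus{Phase: api.PodSucceeded}}
	return QuotaPod(pod, clock.RealClock{}) // false: terminal phase
}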

// QuotaV1Pod returns true if the pod is eligible to track against a quota
// if it's not in a terminal state according to its phase.
func QuotaV1Pod(pod *v1.Pod, clock clock.Clock) bool {
	// if pod is terminal, ignore it for quota
	if v1.PodFailed == pod.Status.Phase || v1.PodSucceeded == pod.Status.Phase {
		return false
	}
	// if pods are stuck terminating (for example, a node is lost), we do not want
	// to charge the user for that pod in quota because it could prevent them from
	// scaling up new pods to service their application.
	if pod.DeletionTimestamp != nil && pod.DeletionGracePeriodSeconds != nil {
		now := clock.Now()
		deletionTime := pod.DeletionTimestamp.Time
		gracePeriod := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second
		if now.After(deletionTime.Add(gracePeriod)) {
			return false
		}
	}
	return true
}