/*
Copyright 2014 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package resourcequota

import (
	"fmt"
	"time"

	"github.com/golang/glog"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/resource"
	"k8s.io/kubernetes/pkg/client/cache"
	clientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/framework"
	"k8s.io/kubernetes/pkg/runtime"
	utilruntime "k8s.io/kubernetes/pkg/util/runtime"
	"k8s.io/kubernetes/pkg/util/wait"
	"k8s.io/kubernetes/pkg/util/workqueue"
	"k8s.io/kubernetes/pkg/watch"
)

// ResourceQuotaController is responsible for tracking quota usage status in the system
type ResourceQuotaController struct {
	// Must have authority to list all resources in the system, and update quota status
	kubeClient clientset.Interface
	// An index of resource quota objects by namespace
	rqIndexer cache.Indexer
	// Watches changes to all resource quotas
	rqController *framework.Controller
	// A store of pods, populated by the podController
	podStore cache.StoreToPodLister
	// Watches changes to all pods (so we can optimize release of compute resources)
	podController *framework.Controller
	// ResourceQuota objects that need to be synchronized
	queue *workqueue.Type
	// To allow injection of syncUsage for testing.
	syncHandler func(key string) error
	// function that controls full recalculation of quota usage
	resyncPeriod controller.ResyncPeriodFunc
}

// NewResourceQuotaController creates a new ResourceQuotaController
func NewResourceQuotaController(kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc) *ResourceQuotaController {
	rq := &ResourceQuotaController{
		kubeClient:   kubeClient,
		queue:        workqueue.New(),
		resyncPeriod: resyncPeriod,
	}

	rq.rqIndexer, rq.rqController = framework.NewIndexerInformer(
		&cache.ListWatch{
			ListFunc: func(options api.ListOptions) (runtime.Object, error) {
				return rq.kubeClient.Core().ResourceQuotas(api.NamespaceAll).List(options)
			},
			WatchFunc: func(options api.ListOptions) (watch.Interface, error) {
				return rq.kubeClient.Core().ResourceQuotas(api.NamespaceAll).Watch(options)
			},
		},
		&api.ResourceQuota{},
		resyncPeriod(),
		framework.ResourceEventHandlerFuncs{
			AddFunc: rq.enqueueResourceQuota,
			UpdateFunc: func(old, cur interface{}) {
				// We are only interested in observing updates to quota.Spec to drive updates to quota.Status.
				// We ignore all updates to quota.Status because they are all driven by this controller.
				// IMPORTANT:
				// We do not use this function to queue up a full quota recalculation. To do so would require
				// us to enqueue all quota.Status updates, and since quota.Status updates involve additional queries
				// that cannot be backed by a cache and result in a full query of a namespace's content, we do not
				// want to pay the price on spurious status updates. As a result, we have a separate routine that is
				// responsible for enqueueing all resource quotas when doing a full resync (enqueueAll).
				oldResourceQuota := old.(*api.ResourceQuota)
				curResourceQuota := cur.(*api.ResourceQuota)
				if api.Semantic.DeepEqual(oldResourceQuota.Spec.Hard, curResourceQuota.Spec.Hard) {
					return
				}
				glog.V(4).Infof("Observed updated quota spec for %v/%v", curResourceQuota.Namespace, curResourceQuota.Name)
				rq.enqueueResourceQuota(curResourceQuota)
			},
			// This will enter the sync loop and no-op, because the resource quota has been deleted from the store.
			DeleteFunc: rq.enqueueResourceQuota,
		},
		cache.Indexers{"namespace": cache.MetaNamespaceIndexFunc},
	)

	// We use this pod controller to rapidly observe when a pod deletion occurs in order to
	// release compute resources from any associated quota.
	rq.podStore.Store, rq.podController = framework.NewInformer(
		&cache.ListWatch{
			ListFunc: func(options api.ListOptions) (runtime.Object, error) {
				return rq.kubeClient.Core().Pods(api.NamespaceAll).List(options)
			},
			WatchFunc: func(options api.ListOptions) (watch.Interface, error) {
				return rq.kubeClient.Core().Pods(api.NamespaceAll).Watch(options)
			},
		},
		&api.Pod{},
		resyncPeriod(),
		framework.ResourceEventHandlerFuncs{
			DeleteFunc: rq.deletePod,
		},
	)

	// set the synchronization handler
	rq.syncHandler = rq.syncResourceQuotaFromKey
	return rq
}
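// Illustrative usage sketch: a caller such as the controller manager might wire this controller
// up roughly as follows, assuming an already configured clientset; the worker count and resync
// period shown here are arbitrary example values rather than defaults defined by this package.
//
//	quotaController := NewResourceQuotaController(kubeClient, func() time.Duration { return 5 * time.Minute })
//	stopCh := make(chan struct{})
//	go quotaController.Run(5, stopCh)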
// enqueueAll is called at the fullResyncPeriod interval to force a full recalculation of quota usage statistics
func (rq *ResourceQuotaController) enqueueAll() {
	defer glog.V(4).Infof("Resource quota controller queued all resource quota for full calculation of usage")
	for _, k := range rq.rqIndexer.ListKeys() {
		rq.queue.Add(k)
	}
}

// obj could be an *api.ResourceQuota, or a DeletionFinalStateUnknown marker item.
func (rq *ResourceQuotaController) enqueueResourceQuota(obj interface{}) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		glog.Errorf("Couldn't get key for object %+v: %v", obj, err)
		return
	}
	rq.queue.Add(key)
}
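// For reference, the keys placed on the queue are the "<namespace>/<name>" strings produced by
// controller.KeyFunc; for example, a quota named "compute-resources" in the "default" namespace
// is queued as "default/compute-resources" (the names here are illustrative only).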
// worker runs a worker thread that just dequeues items, processes them, and marks them done.
// It enforces that the syncHandler is never invoked concurrently with the same key.
func (rq *ResourceQuotaController) worker() {
	workFunc := func() bool {
		key, quit := rq.queue.Get()
		if quit {
			// the queue has been shut down, so tell the enclosing loop to exit
			return true
		}
		defer rq.queue.Done(key)
		if err := rq.syncHandler(key.(string)); err != nil {
			utilruntime.HandleError(err)
		}
		return false
	}
	for {
		if quit := workFunc(); quit {
			return
		}
	}
}

// Run begins quota controller using the specified number of workers
func (rq *ResourceQuotaController) Run(workers int, stopCh <-chan struct{}) {
	defer utilruntime.HandleCrash()
	go rq.rqController.Run(stopCh)
	go rq.podController.Run(stopCh)
	for i := 0; i < workers; i++ {
		go wait.Until(rq.worker, time.Second, stopCh)
	}
	go wait.Until(func() { rq.enqueueAll() }, rq.resyncPeriod(), stopCh)
	<-stopCh
	glog.Infof("Shutting down ResourceQuotaController")
	rq.queue.ShutDown()
}

// FilterQuotaPods eliminates pods that no longer have a cost against the quota:
// - pods that have a restart policy of always are always returned
// - pods that are in a failed state, but have a restart policy of on failure are always returned
// - pods that are not in a success state or a failure state are included in quota
func FilterQuotaPods(pods []api.Pod) []*api.Pod {
	var result []*api.Pod
	for i := range pods {
		value := &pods[i]
		// a pod that has a restart policy of always counts against usage no matter its state
		if value.Spec.RestartPolicy == api.RestartPolicyAlways {
			result = append(result, value)
			continue
		}
		// a failed pod with a restart policy of on failure will count against usage
		if api.PodFailed == value.Status.Phase && value.Spec.RestartPolicy == api.RestartPolicyOnFailure {
			result = append(result, value)
			continue
		}
		// if the pod is not succeeded or failed, then we count it against quota
		if api.PodSucceeded != value.Status.Phase && api.PodFailed != value.Status.Phase {
			result = append(result, value)
			continue
		}
	}
	return result
}
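// To make the filtering above concrete (hypothetical pods, for illustration only): a Failed pod
// with RestartPolicyAlways or RestartPolicyOnFailure still counts against quota because the
// kubelet may restart it, while a Succeeded or Failed pod with RestartPolicyNever no longer
// consumes resources and is excluded; Pending and Running pods are always counted.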
// syncResourceQuotaFromKey syncs a quota key
func (rq *ResourceQuotaController) syncResourceQuotaFromKey(key string) (err error) {
	startTime := time.Now()
	defer func() {
		glog.V(4).Infof("Finished syncing resource quota %q (%v)", key, time.Now().Sub(startTime))
	}()

	obj, exists, err := rq.rqIndexer.GetByKey(key)
	if err != nil {
		glog.Infof("Unable to retrieve resource quota %v from store: %v", key, err)
		rq.queue.Add(key)
		return err
	}
	if !exists {
		glog.Infof("Resource quota has been deleted %v", key)
		return nil
	}
	quota := *obj.(*api.ResourceQuota)
	return rq.syncResourceQuota(quota)
}

// syncResourceQuota runs a complete sync of current status
func (rq *ResourceQuotaController) syncResourceQuota(quota api.ResourceQuota) (err error) {
	// quota is dirty if any part of spec hard limits differs from the status hard limits
	dirty := !api.Semantic.DeepEqual(quota.Spec.Hard, quota.Status.Hard)

	// dirty also tracks if the usage status differs from the previous sync;
	// if so, we send a new usage with the latest status.
	// if this is our first sync, it will be dirty by default, since we need to track usage
	dirty = dirty || (quota.Status.Hard == nil || quota.Status.Used == nil)

	// Create a usage object that is based on the quota resource version
	usage := api.ResourceQuota{
		ObjectMeta: api.ObjectMeta{
			Name:            quota.Name,
			Namespace:       quota.Namespace,
			ResourceVersion: quota.ResourceVersion,
			Labels:          quota.Labels,
			Annotations:     quota.Annotations},
		Status: api.ResourceQuotaStatus{
			Hard: api.ResourceList{},
			Used: api.ResourceList{},
		},
	}

	// set the hard values supported on the quota
	for k, v := range quota.Spec.Hard {
		usage.Status.Hard[k] = *v.Copy()
	}
	// set any last known observed status values for usage
	for k, v := range quota.Status.Used {
		usage.Status.Used[k] = *v.Copy()
	}

	set := map[api.ResourceName]bool{}
	for k := range usage.Status.Hard {
		set[k] = true
	}

	pods := &api.PodList{}
	if set[api.ResourcePods] || set[api.ResourceMemory] || set[api.ResourceCPU] {
		pods, err = rq.kubeClient.Core().Pods(usage.Namespace).List(api.ListOptions{})
		if err != nil {
			return err
		}
	}

	filteredPods := FilterQuotaPods(pods.Items)

	// iterate over each resource, and update observation
	for k := range usage.Status.Hard {
		// look if there is a used value, if none, we are definitely dirty
		prevQuantity, found := usage.Status.Used[k]
		if !found {
			dirty = true
		}

		var value *resource.Quantity

		switch k {
		case api.ResourcePods:
			value = resource.NewQuantity(int64(len(filteredPods)), resource.DecimalSI)
		case api.ResourceServices:
			items, err := rq.kubeClient.Core().Services(usage.Namespace).List(api.ListOptions{})
			if err != nil {
				return err
			}
			value = resource.NewQuantity(int64(len(items.Items)), resource.DecimalSI)
		case api.ResourceReplicationControllers:
			items, err := rq.kubeClient.Core().ReplicationControllers(usage.Namespace).List(api.ListOptions{})
			if err != nil {
				return err
			}
			value = resource.NewQuantity(int64(len(items.Items)), resource.DecimalSI)
		case api.ResourceQuotas:
			items, err := rq.kubeClient.Core().ResourceQuotas(usage.Namespace).List(api.ListOptions{})
			if err != nil {
				return err
			}
			value = resource.NewQuantity(int64(len(items.Items)), resource.DecimalSI)
		case api.ResourceSecrets:
			items, err := rq.kubeClient.Core().Secrets(usage.Namespace).List(api.ListOptions{})
			if err != nil {
				return err
			}
			value = resource.NewQuantity(int64(len(items.Items)), resource.DecimalSI)
		case api.ResourcePersistentVolumeClaims:
			items, err := rq.kubeClient.Core().PersistentVolumeClaims(usage.Namespace).List(api.ListOptions{})
			if err != nil {
				return err
			}
			value = resource.NewQuantity(int64(len(items.Items)), resource.DecimalSI)
		case api.ResourceMemory:
			value = PodsRequests(filteredPods, api.ResourceMemory)
		case api.ResourceCPU:
			value = PodsRequests(filteredPods, api.ResourceCPU)
		}

		// ignore fields we do not understand (assume another controller is tracking it)
		if value != nil {
			// see if the value has changed
			dirty = dirty || (value.Value() != prevQuantity.Value())
			// just update the value
			usage.Status.Used[k] = *value
		}
	}

	// update the usage only if it changed
	if dirty {
		_, err = rq.kubeClient.Core().ResourceQuotas(usage.Namespace).UpdateStatus(&usage)
		return err
	}
	return nil
}

// PodsRequests returns the sum of each resource request for each pod in the list.
// If a given pod in the list does not have a request for the named resource, we log the error
// but still attempt to get the most representative count.
func PodsRequests(pods []*api.Pod, resourceName api.ResourceName) *resource.Quantity {
	var sum *resource.Quantity
	for i := range pods {
		pod := pods[i]
		podQuantity, err := PodRequests(pod, resourceName)
		if err != nil {
			// log the error, but try to keep the most accurate count possible;
			// the rationale here is that you may have had pods in a namespace that did not have
			// explicit requests prior to adding the quota
			glog.Infof("No explicit request for resource, pod %s/%s, %s", pod.Namespace, pod.Name, resourceName)
		} else {
			if sum == nil {
				sum = podQuantity
			} else {
				sum.Add(*podQuantity)
			}
		}
	}
	// if the list is empty
	if sum == nil {
		q := resource.MustParse("0")
		sum = &q
	}
	return sum
}
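// Worked example for PodsRequests above and PodRequests below (hypothetical values): a pod whose
// two containers request 100m and 200m of CPU contributes a per-pod total of 300m, and three such
// pods in the namespace would therefore report a used value of 900m against a cpu quota.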
// PodRequests returns the sum of each resource request across all containers in the pod.
func PodRequests(pod *api.Pod, resourceName api.ResourceName) (*resource.Quantity, error) {
	if !PodHasRequests(pod, resourceName) {
		return nil, fmt.Errorf("Each container in pod %s/%s does not have an explicit request for resource %s.", pod.Namespace, pod.Name, resourceName)
	}
	var sum *resource.Quantity
	for j := range pod.Spec.Containers {
		value := pod.Spec.Containers[j].Resources.Requests[resourceName]
		if sum == nil {
			sum = value.Copy()
		} else {
			err := sum.Add(value)
			if err != nil {
				return sum, err
			}
		}
	}
	// if the list is empty
	if sum == nil {
		q := resource.MustParse("0")
		sum = &q
	}
	return sum, nil
}

// PodHasRequests verifies that each container in the pod has an explicit, non-zero request for the named resource.
func PodHasRequests(pod *api.Pod, resourceName api.ResourceName) bool {
	for j := range pod.Spec.Containers {
		value, valueSet := pod.Spec.Containers[j].Resources.Requests[resourceName]
		if !valueSet || value.Value() == int64(0) {
			return false
		}
	}
	return true
}

// When a pod is deleted, enqueue any quota in the pod's namespace so usage can be released promptly.
// obj could be an *api.Pod, or a DeletionFinalStateUnknown marker item.
func (rq *ResourceQuotaController) deletePod(obj interface{}) {
	pod, ok := obj.(*api.Pod)
	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			glog.Errorf("Couldn't get object from tombstone %+v, could take up to %v before a quota records the deletion", obj, rq.resyncPeriod())
			return
		}
		pod, ok = tombstone.Obj.(*api.Pod)
		if !ok {
			glog.Errorf("Tombstone contained object that is not a pod %+v, could take up to %v before a quota records the deletion", obj, rq.resyncPeriod())
			return
		}
	}
	quotas, err := rq.rqIndexer.Index("namespace", pod)
	if err != nil {
		glog.Errorf("Couldn't find resource quota associated with pod %+v, could take up to %v before a quota records the deletion", obj, rq.resyncPeriod())
	}
	if len(quotas) == 0 {
		glog.V(4).Infof("No resource quota associated with namespace %q", pod.Namespace)
		return
	}
	for i := range quotas {
		quota := quotas[i].(*api.ResourceQuota)
		rq.enqueueResourceQuota(quota)
	}
}
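// Note on the namespace lookup above: the "namespace" index is built with
// cache.MetaNamespaceIndexFunc, so rq.rqIndexer.Index("namespace", pod) returns every
// ResourceQuota stored in the pod's namespace; deleting a pod in namespace "default"
// (an illustrative name) therefore re-queues every quota defined in "default".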