/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package eviction

import (
	"fmt"
	"sort"
	"sync"
	"time"

	"github.com/golang/glog"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/apimachinery/pkg/util/clock"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/tools/record"
	apiv1resource "k8s.io/kubernetes/pkg/api/v1/resource"
	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
	"k8s.io/kubernetes/pkg/features"
	statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
	"k8s.io/kubernetes/pkg/kubelet/server/stats"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/util/format"
)

const (
	// podCleanupTimeout is the maximum time to wait for evicted pods to be cleaned up.
	podCleanupTimeout = 30 * time.Second
	// podCleanupPollFreq is how often to poll for pod cleanup completion.
	podCleanupPollFreq = time.Second
)

// managerImpl implements Manager
type managerImpl struct {
	// used to track time
	clock clock.Clock
	// config is how the manager is configured
	config Config
	// the function to invoke to kill a pod
	killPodFunc KillPodFunc
	// the interface that knows how to do image gc
	imageGC ImageGC
	// the interface that knows how to do container gc
	containerGC ContainerGC
	// protects access to internal state
	sync.RWMutex
	// node conditions are the set of conditions present
	nodeConditions []v1.NodeConditionType
	// captures when a node condition was last observed based on a threshold being met
	nodeConditionsLastObservedAt nodeConditionsObservedAt
	// nodeRef is a reference to the node
	nodeRef *v1.ObjectReference
	// used to record events about the node
	recorder record.EventRecorder
	// used to measure usage stats on the system
	summaryProvider stats.SummaryProvider
	// records when a threshold was first observed
	thresholdsFirstObservedAt thresholdsObservedAt
	// records the set of thresholds that have been met (including grace period) but not yet resolved
	thresholdsMet []evictionapi.Threshold
	// resourceToRankFunc maps a resource to the ranking function for that resource.
	resourceToRankFunc map[v1.ResourceName]rankFunc
	// resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
	resourceToNodeReclaimFuncs map[v1.ResourceName]nodeReclaimFuncs
	// last observations from synchronize
	lastObservations signalObservations
	// notifiersInitialized indicates if the threshold notifiers have been initialized (i.e. synchronize() has been called once)
	notifiersInitialized bool
	// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
	dedicatedImageFs *bool
}

// ensure it implements the required interface
var _ Manager = &managerImpl{}

// NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
func NewManager(
	summaryProvider stats.SummaryProvider,
	config Config,
	killPodFunc KillPodFunc,
	imageGC ImageGC,
	containerGC ContainerGC,
	recorder record.EventRecorder,
	nodeRef *v1.ObjectReference,
	clock clock.Clock,
) (Manager, lifecycle.PodAdmitHandler) {
	manager := &managerImpl{
		clock:                        clock,
		killPodFunc:                  killPodFunc,
		imageGC:                      imageGC,
		containerGC:                  containerGC,
		config:                       config,
		recorder:                     recorder,
		summaryProvider:              summaryProvider,
		nodeRef:                      nodeRef,
		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
		thresholdsFirstObservedAt:    thresholdsObservedAt{},
		dedicatedImageFs:             nil,
	}
	return manager, manager
}
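
// Illustrative wiring sketch (an assumption, not part of this file: the
// concrete call site lives in the kubelet setup code, and names such as klet,
// evictionConfig, and the *Func arguments below are placeholders):
//
//	manager, admitHandler := eviction.NewManager(summaryProvider, evictionConfig,
//		killPodFunc, imageGC, containerGC, recorder, nodeRef, clock.RealClock{})
//	// the admission handler rejects new pods while the node is under pressure
//	klet.admitHandlers.AddPodAdmitHandler(admitHandler)
//	manager.Start(diskInfoProvider, activePodsFunc, podCleanedUpFunc, monitoringInterval)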

// Admit rejects a pod if it is not safe to admit for node stability.
func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
	m.RLock()
	defer m.RUnlock()
	if len(m.nodeConditions) == 0 {
		return lifecycle.PodAdmitResult{Admit: true}
	}
	// Admit Critical pods even under resource pressure since they are required for system stability.
	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
	if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && kubelettypes.IsCriticalPod(attrs.Pod) {
		return lifecycle.PodAdmitResult{Admit: true}
	}
	// the node has memory pressure, admit if not best-effort
	if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
		notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)
		if notBestEffort {
			return lifecycle.PodAdmitResult{Admit: true}
		}
	}
	// reject pods when under memory pressure (if pod is best effort), or if under disk pressure.
	glog.Warningf("Failed to admit pod %s - node has conditions: %v", format.Pod(attrs.Pod), m.nodeConditions)
	return lifecycle.PodAdmitResult{
		Admit:   false,
		Reason:  reason,
		Message: fmt.Sprintf(message, m.nodeConditions),
	}
}
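
// Illustrative summary of the decision above, assuming the critical-pod
// annotation feature gate is disabled: with no node conditions every pod is
// admitted; under NodeMemoryPressure a BestEffort pod is rejected while
// Burstable and Guaranteed pods are admitted; any other pressure condition
// (for example NodeDiskPressure) causes all remaining pods to be rejected.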

// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
	// start the eviction manager monitoring
	go func() {
		for {
			if evictedPods := m.synchronize(diskInfoProvider, podFunc); evictedPods != nil {
				glog.Infof("eviction manager: pods %s evicted, waiting for pods to be cleaned up", format.Pods(evictedPods))
				m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
			} else {
				time.Sleep(monitoringInterval)
			}
		}
	}()
}

// IsUnderMemoryPressure returns true if the node is under memory pressure.
func (m *managerImpl) IsUnderMemoryPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure)
}

// IsUnderDiskPressure returns true if the node is under disk pressure.
func (m *managerImpl) IsUnderDiskPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure)
}

// IsUnderPIDPressure returns true if the node is under PID pressure.
func (m *managerImpl) IsUnderPIDPressure() bool {
	m.RLock()
	defer m.RUnlock()
	return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
}

// startMemoryThresholdNotifier arms a kernel memcg usage notification for the
// memory.available threshold that matches the hard/soft selection and invokes
// handler when the notification fires.
func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observations signalObservations, hard bool, handler thresholdNotifierHandlerFunc) error {
	for _, threshold := range thresholds {
		if threshold.Signal != evictionapi.SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) {
			continue
		}
		observed, found := observations[evictionapi.SignalMemoryAvailable]
		if !found {
			continue
		}
		cgroups, err := cm.GetCgroupSubsystems()
		if err != nil {
			return err
		}
		// TODO add support for eviction from --cgroup-root
		cgpath, found := cgroups.MountPoints["memory"]
		if !found || len(cgpath) == 0 {
			return fmt.Errorf("memory cgroup mount point not found")
		}
		attribute := "memory.usage_in_bytes"
		quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
		usageThreshold := resource.NewQuantity(observed.capacity.Value(), resource.DecimalSI)
		usageThreshold.Sub(*quantity)
		description := fmt.Sprintf("<%s available", formatThresholdValue(threshold.Value))
		memcgThresholdNotifier, err := NewMemCGThresholdNotifier(cgpath, attribute, usageThreshold.String(), description, handler)
		if err != nil {
			return err
		}
		go memcgThresholdNotifier.Start(wait.NeverStop)
		return nil
	}
	return nil
}
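
// Worked example with illustrative numbers: given 10Gi of observed memory
// capacity and a hard threshold of memory.available<1Gi, GetThresholdQuantity
// resolves to 1Gi, so the notifier above is armed at
// usageThreshold = 10Gi - 1Gi = 9Gi of memory.usage_in_bytes; the handler then
// fires as soon as cgroup memory usage crosses 9Gi instead of waiting for the
// next monitoring interval.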

// synchronize is the main control loop that enforces eviction thresholds.
// Returns the pods that were killed, or nil if no pods were killed.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) []*v1.Pod {
	// if we have nothing to do, just return
	thresholds := m.config.Thresholds
	if len(thresholds) == 0 {
		return nil
	}

	glog.V(3).Infof("eviction manager: synchronize housekeeping")
	// build the ranking functions (if not yet known)
	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
	if m.dedicatedImageFs == nil {
		hasImageFs, ok := diskInfoProvider.HasDedicatedImageFs()
		if ok != nil {
			return nil
		}
		m.dedicatedImageFs = &hasImageFs
		m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs)
		m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
	}

	activePods := podFunc()
	updateStats := true
	summary, err := m.summaryProvider.Get(updateStats)
	if err != nil {
		glog.Errorf("eviction manager: failed to get summary stats: %v", err)
		return nil
	}

	// make observations and get a function to derive pod usage stats relative to those observations.
	observations, statsFunc := makeSignalObservations(summary)
	debugLogObservations("observations", observations)

	// attempt to create a threshold notifier to improve eviction response time
	if m.config.KernelMemcgNotification && !m.notifiersInitialized {
		glog.Infof("eviction manager attempting to integrate with kernel memcg notification api")
		m.notifiersInitialized = true
		// start soft memory notification
		err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) {
			glog.Infof("soft memory eviction threshold crossed at %s", desc)
			// TODO wait grace period for soft memory limit
			m.synchronize(diskInfoProvider, podFunc)
		})
		if err != nil {
			glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err)
		}
		// start hard memory notification
		err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) {
			glog.Infof("hard memory eviction threshold crossed at %s", desc)
			m.synchronize(diskInfoProvider, podFunc)
		})
		if err != nil {
			glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err)
		}
	}

	// determine the set of thresholds met independent of grace period
	thresholds = thresholdsMet(thresholds, observations, false)
	debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)

	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
	if len(m.thresholdsMet) > 0 {
		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
	}
	debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)

	// track when a threshold was first observed
	now := m.clock.Now()
	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)

	// the set of node conditions that are triggered by currently observed thresholds
	nodeConditions := nodeConditions(thresholds)
	if len(nodeConditions) > 0 {
		glog.V(3).Infof("eviction manager: node conditions - observed: %v", nodeConditions)
	}

	// track when a node condition was last observed
	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)

	// node conditions report true if it has been observed within the transition period window
	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
	if len(nodeConditions) > 0 {
		glog.V(3).Infof("eviction manager: node conditions - transition period not met: %v", nodeConditions)
	}

	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
	debugLogThresholdsWithObservation("thresholds - grace periods satisfied", thresholds, observations)

	// update internal state
	m.Lock()
	m.nodeConditions = nodeConditions
	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
	m.thresholdsMet = thresholds

	// determine the set of thresholds whose stats have been updated since the last sync
	thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
	debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)

	m.lastObservations = observations
	m.Unlock()

	// evict pods if there is a resource usage violation from local volume temporary storage
	// If eviction happens in the localStorageEviction function, skip the rest of the eviction action
	if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
		if evictedPods := m.localStorageEviction(summary, activePods); len(evictedPods) > 0 {
			return evictedPods
		}
	}

	// determine the set of resources under starvation
	starvedResources := getStarvedResources(thresholds)
	if len(starvedResources) == 0 {
		glog.V(3).Infof("eviction manager: no resources are starved")
		return nil
	}

	// rank the resources to reclaim by eviction priority
	sort.Sort(byEvictionPriority(starvedResources))
	resourceToReclaim := starvedResources[0]
	glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)

	// determine if this is a soft or hard eviction associated with the resource
	softEviction := isSoftEvictionThresholds(thresholds, resourceToReclaim)

	// record an event about the resources we are now attempting to reclaim via eviction
	m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)

	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
	if m.reclaimNodeLevelResources(resourceToReclaim) {
		glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
		return nil
	}

	glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)

	// rank the pods for eviction
	rank, ok := m.resourceToRankFunc[resourceToReclaim]
	if !ok {
		glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
		return nil
	}

	// the only candidates viable for eviction are those pods that had anything running.
	if len(activePods) == 0 {
		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
		return nil
	}

	// rank the running pods for eviction for the specified resource
	rank(activePods, statsFunc)
	glog.Infof("eviction manager: pods ranked for eviction: %s", format.Pods(activePods))

	// record age of metrics for met thresholds that we are using for evictions.
	for _, t := range thresholds {
		timeObserved := observations[t.Signal].time
		if !timeObserved.IsZero() {
			metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInMicroseconds(timeObserved.Time))
		}
	}

	// we kill at most a single pod during each eviction interval
	for i := range activePods {
		pod := activePods[i]
		// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
		// do not evict such pods. Static pods are not re-admitted after evictions.
		// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
		if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
			kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) {
			continue
		}
		status := v1.PodStatus{
			Phase:   v1.PodFailed,
			Message: fmt.Sprintf(message, resourceToReclaim),
			Reason:  reason,
		}
		// record that we are evicting the pod
		m.recorder.Eventf(pod, v1.EventTypeWarning, reason, fmt.Sprintf(message, resourceToReclaim))
		gracePeriodOverride := int64(0)
		if softEviction {
			gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
		}
		// this is a blocking call and should only return when the pod and its containers are killed.
		err := m.killPodFunc(pod, status, &gracePeriodOverride)
		if err != nil {
			glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
		}
		return []*v1.Pod{pod}
	}
	glog.Infof("eviction manager: unable to evict any pods from the node")
	return nil
}

// waitForPodsCleanup polls until every evicted pod has been cleaned up, or until podCleanupTimeout elapses.
func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
	timeout := m.clock.NewTimer(podCleanupTimeout)
	tick := m.clock.Tick(podCleanupPollFreq)
	for {
		select {
		case <-timeout.C():
			glog.Warningf("eviction manager: timed out waiting for pods %s to be cleaned up", format.Pods(pods))
			return
		case <-tick:
			for i, pod := range pods {
				if !podCleanedUpFunc(pod) {
					break
				}
				if i == len(pods)-1 {
					glog.Infof("eviction manager: pods %s successfully cleaned up", format.Pods(pods))
					return
				}
			}
		}
	}
}

// reclaimNodeLevelResources attempts to reclaim node level resources.
// Returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceName) bool {
	nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim]
	for _, nodeReclaimFunc := range nodeReclaimFuncs {
		// attempt to reclaim the pressured resource.
		if err := nodeReclaimFunc(); err != nil {
			glog.Warningf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
		}
	}
	if len(nodeReclaimFuncs) > 0 {
		summary, err := m.summaryProvider.Get(true)
		if err != nil {
			glog.Errorf("eviction manager: failed to get summary stats after resource reclaim: %v", err)
			return false
		}
		// make observations and get a function to derive pod usage stats relative to those observations.
		observations, _ := makeSignalObservations(summary)
		debugLogObservations("observations after resource reclaim", observations)
		// determine the set of thresholds met independent of grace period
		thresholds := thresholdsMet(m.config.Thresholds, observations, false)
		debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
		if len(thresholds) == 0 {
			return true
		}
	}
	return false
}
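
// For example (illustrative, based on the reclaim functions wired up by
// buildResourceToNodeReclaimFuncs): when disk is the starved resource, the
// functions above typically delete dead containers via the container GC and
// unused images via the image GC; only if the follow-up observations still
// report a met threshold does synchronize fall through to evicting pods.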

// localStorageEviction checks the emptyDir volume usage for each pod and determines whether it exceeds the specified
// limit and needs to be evicted. It also checks every container in the pod: if the container's overlay usage exceeds
// the limit, the pod is evicted too.
func (m *managerImpl) localStorageEviction(summary *statsapi.Summary, pods []*v1.Pod) []*v1.Pod {
	statsFunc := cachedStatsFunc(summary.Pods)
	evicted := []*v1.Pod{}
	for _, pod := range pods {
		podStats, ok := statsFunc(pod)
		if !ok {
			continue
		}
		if m.emptyDirLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
			continue
		}
		if m.podEphemeralStorageLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
			continue
		}
		if m.containerEphemeralStorageLimitEviction(podStats, pod) {
			evicted = append(evicted, pod)
		}
	}
	return evicted
}

// emptyDirLimitEviction evicts the pod if any of its emptyDir volumes exceeds its configured sizeLimit.
func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	podVolumeUsed := make(map[string]*resource.Quantity)
	for _, volume := range podStats.VolumeStats {
		podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
	}
	for i := range pod.Spec.Volumes {
		source := &pod.Spec.Volumes[i].VolumeSource
		if source.EmptyDir != nil {
			size := source.EmptyDir.SizeLimit
			used := podVolumeUsed[pod.Spec.Volumes[i].Name]
			if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
				// the emptyDir usage exceeds the size limit, evict the pod
				return m.evictPod(pod, v1.ResourceName("EmptyDir"), fmt.Sprintf("emptyDir usage exceeds the limit %q", size.String()))
			}
		}
	}
	return false
}
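
// Illustrative example: a pod declaring an emptyDir volume with sizeLimit 1Gi
// whose measured usage grows to 1.5Gi is evicted here with a message of the
// form `emptyDir usage exceeds the limit "1Gi"`; volumes without a positive
// sizeLimit are never considered by this check.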

// podEphemeralStorageLimitEviction evicts the pod if its total ephemeral storage usage exceeds the pod-level limit.
func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	_, podLimits := apiv1resource.PodRequestsAndLimits(pod)
	_, found := podLimits[v1.ResourceEphemeralStorage]
	if !found {
		return false
	}
	podEphemeralStorageTotalUsage := &resource.Quantity{}
	fsStatsSet := []fsStatsType{}
	if *m.dedicatedImageFs {
		fsStatsSet = []fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}
	} else {
		fsStatsSet = []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}
	}
	podEphemeralUsage, err := podLocalEphemeralStorageUsage(podStats, pod, fsStatsSet)
	if err != nil {
		glog.Errorf("eviction manager: error getting pod disk usage %v", err)
		return false
	}
	podEphemeralStorageTotalUsage.Add(podEphemeralUsage[resourceDisk])
	if podEphemeralStorageTotalUsage.Cmp(podLimits[v1.ResourceEphemeralStorage]) > 0 {
		// the total usage of the pod exceeds the total size limit of its containers, evict the pod
		return m.evictPod(pod, v1.ResourceEphemeralStorage, fmt.Sprintf("pod ephemeral local storage usage exceeds the total limit of containers %v", podLimits[v1.ResourceEphemeralStorage]))
	}
	return false
}
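
// Illustrative example: for a pod whose two containers set ephemeral-storage
// limits of 1Gi and 2Gi (and no init containers), PodRequestsAndLimits reports
// a pod-level limit of 3Gi, and the pod is evicted once the summed usage of
// its container logs, writable layers (when there is no dedicated imagefs),
// and local ephemeral volumes exceeds 3Gi.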

// containerEphemeralStorageLimitEviction evicts the pod if any of its containers exceeds its own ephemeral storage limit.
func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
	thresholdsMap := make(map[string]*resource.Quantity)
	for _, container := range pod.Spec.Containers {
		ephemeralLimit := container.Resources.Limits.StorageEphemeral()
		if ephemeralLimit != nil && ephemeralLimit.Value() != 0 {
			thresholdsMap[container.Name] = ephemeralLimit
		}
	}
	for _, containerStat := range podStats.Containers {
		containerUsed := diskUsage(containerStat.Logs)
		if !*m.dedicatedImageFs {
			containerUsed.Add(*diskUsage(containerStat.Rootfs))
		}
		if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
			if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
				return m.evictPod(pod, v1.ResourceEphemeralStorage, fmt.Sprintf("container's ephemeral local storage usage exceeds the limit %q", ephemeralStorageThreshold.String()))
			}
		}
	}
	return false
}
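
// Illustrative example: a container with an ephemeral-storage limit of 500Mi
// is compared against diskUsage(logs) plus, when the node has no dedicated
// imagefs, diskUsage(rootfs); if that sum exceeds 500Mi the whole pod is
// evicted, since this limit is enforced by pod eviction rather than by
// restarting the offending container.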

// evictPod marks the pod failed, records an eviction event, and kills it with a zero grace period.
// Returns true if the eviction was attempted; critical static pods are skipped when the critical
// pod annotation feature gate is enabled.
func (m *managerImpl) evictPod(pod *v1.Pod, resourceName v1.ResourceName, evictMsg string) bool {
	if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
		kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) {
		glog.Errorf("eviction manager: cannot evict a critical pod %s", format.Pod(pod))
		return false
	}
	status := v1.PodStatus{
		Phase:   v1.PodFailed,
		Message: fmt.Sprintf(message, resourceName),
		Reason:  reason,
	}
	// record that we are evicting the pod
	m.recorder.Eventf(pod, v1.EventTypeWarning, reason, evictMsg)
	gracePeriod := int64(0)
	err := m.killPodFunc(pod, status, &gracePeriod)
	if err != nil {
		glog.Errorf("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
	} else {
		glog.Infof("eviction manager: pod %s is evicted successfully", format.Pod(pod))
	}
	return true
}