2019-01-12 04:58:27 +00:00
/ *
Copyright 2016 The Kubernetes Authors .
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package eviction
import (
"fmt"
2020-07-17 23:14:37 +00:00
"os"
2019-01-12 04:58:27 +00:00
"sort"
"strconv"
"strings"
"time"
2019-12-12 01:27:03 +00:00
v1 "k8s.io/api/core/v1"
2019-01-12 04:58:27 +00:00
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog"
2019-12-12 01:27:03 +00:00
"k8s.io/kubernetes/pkg/api/v1/pod"
2019-09-27 21:51:53 +00:00
v1resource "k8s.io/kubernetes/pkg/api/v1/resource"
2019-01-12 04:58:27 +00:00
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
2019-08-30 18:33:25 +00:00
volumeutils "k8s.io/kubernetes/pkg/volume/util"
2019-01-12 04:58:27 +00:00
)
const (
unsupportedEvictionSignal = "unsupported eviction signal %v"
// Reason is the reason reported back in status.
Reason = "Evicted"
// nodeLowMessageFmt is the message for evictions due to resource pressure.
nodeLowMessageFmt = "The node was low on resource: %v. "
2019-04-07 17:07:55 +00:00
// nodeConditionMessageFmt is the message for evictions due to resource pressure.
nodeConditionMessageFmt = "The node had condition: %v. "
2019-01-12 04:58:27 +00:00
// containerMessageFmt provides additional information for containers exceeding requests
containerMessageFmt = "Container %s was using %s, which exceeds its request of %s. "
// containerEphemeralStorageMessageFmt provides additional information for containers which have exceeded their ES limit
containerEphemeralStorageMessageFmt = "Container %s exceeded its local ephemeral storage limit %q. "
// podEphemeralStorageMessageFmt provides additional information for pods which have exceeded their ES limit
podEphemeralStorageMessageFmt = "Pod ephemeral local storage usage exceeds the total limit of containers %s. "
// emptyDirMessageFmt provides additional information for empty-dir volumes which have exceeded their size limit
emptyDirMessageFmt = "Usage of EmptyDir volume %q exceeds the limit %q. "
// inodes, number. internal to this module, used to account for local disk inode consumption.
resourceInodes v1 . ResourceName = "inodes"
2019-04-07 17:07:55 +00:00
// resourcePids, number. internal to this module, used to account for local pid consumption.
resourcePids v1 . ResourceName = "pids"
2019-01-12 04:58:27 +00:00
// OffendingContainersKey is the key in eviction event annotations for the list of container names which exceeded their requests
OffendingContainersKey = "offending_containers"
// OffendingContainersUsageKey is the key in eviction event annotations for the list of usage of containers which exceeded their requests
OffendingContainersUsageKey = "offending_containers_usage"
// StarvedResourceKey is the key for the starved resource in eviction event annotations
StarvedResourceKey = "starved_resource"
)
var (
// signalToNodeCondition maps a signal to the node condition to report if threshold is met.
signalToNodeCondition map [ evictionapi . Signal ] v1 . NodeConditionType
// signalToResource maps a Signal to its associated Resource.
signalToResource map [ evictionapi . Signal ] v1 . ResourceName
)
func init ( ) {
// map eviction signals to node conditions
signalToNodeCondition = map [ evictionapi . Signal ] v1 . NodeConditionType { }
signalToNodeCondition [ evictionapi . SignalMemoryAvailable ] = v1 . NodeMemoryPressure
signalToNodeCondition [ evictionapi . SignalAllocatableMemoryAvailable ] = v1 . NodeMemoryPressure
signalToNodeCondition [ evictionapi . SignalImageFsAvailable ] = v1 . NodeDiskPressure
signalToNodeCondition [ evictionapi . SignalNodeFsAvailable ] = v1 . NodeDiskPressure
signalToNodeCondition [ evictionapi . SignalImageFsInodesFree ] = v1 . NodeDiskPressure
signalToNodeCondition [ evictionapi . SignalNodeFsInodesFree ] = v1 . NodeDiskPressure
signalToNodeCondition [ evictionapi . SignalPIDAvailable ] = v1 . NodePIDPressure
// map signals to resources (and vice-versa)
signalToResource = map [ evictionapi . Signal ] v1 . ResourceName { }
signalToResource [ evictionapi . SignalMemoryAvailable ] = v1 . ResourceMemory
signalToResource [ evictionapi . SignalAllocatableMemoryAvailable ] = v1 . ResourceMemory
signalToResource [ evictionapi . SignalImageFsAvailable ] = v1 . ResourceEphemeralStorage
signalToResource [ evictionapi . SignalImageFsInodesFree ] = resourceInodes
signalToResource [ evictionapi . SignalNodeFsAvailable ] = v1 . ResourceEphemeralStorage
signalToResource [ evictionapi . SignalNodeFsInodesFree ] = resourceInodes
2019-04-07 17:07:55 +00:00
signalToResource [ evictionapi . SignalPIDAvailable ] = resourcePids
2019-01-12 04:58:27 +00:00
}
// validSignal returns true if the signal is supported.
func validSignal ( signal evictionapi . Signal ) bool {
_ , found := signalToResource [ signal ]
return found
}
2019-09-27 21:51:53 +00:00
// getReclaimableThreshold finds the threshold and resource to reclaim
func getReclaimableThreshold ( thresholds [ ] evictionapi . Threshold ) ( evictionapi . Threshold , v1 . ResourceName , bool ) {
for _ , thresholdToReclaim := range thresholds {
if resourceToReclaim , ok := signalToResource [ thresholdToReclaim . Signal ] ; ok {
return thresholdToReclaim , resourceToReclaim , true
}
klog . V ( 3 ) . Infof ( "eviction manager: threshold %s was crossed, but reclaim is not implemented for this threshold." , thresholdToReclaim . Signal )
}
return evictionapi . Threshold { } , "" , false
}
2019-01-12 04:58:27 +00:00
// ParseThresholdConfig parses the flags for thresholds.
func ParseThresholdConfig ( allocatableConfig [ ] string , evictionHard , evictionSoft , evictionSoftGracePeriod , evictionMinimumReclaim map [ string ] string ) ( [ ] evictionapi . Threshold , error ) {
results := [ ] evictionapi . Threshold { }
hardThresholds , err := parseThresholdStatements ( evictionHard )
if err != nil {
return nil , err
}
results = append ( results , hardThresholds ... )
softThresholds , err := parseThresholdStatements ( evictionSoft )
if err != nil {
return nil , err
}
gracePeriods , err := parseGracePeriods ( evictionSoftGracePeriod )
if err != nil {
return nil , err
}
minReclaims , err := parseMinimumReclaims ( evictionMinimumReclaim )
if err != nil {
return nil , err
}
for i := range softThresholds {
signal := softThresholds [ i ] . Signal
period , found := gracePeriods [ signal ]
if ! found {
return nil , fmt . Errorf ( "grace period must be specified for the soft eviction threshold %v" , signal )
}
softThresholds [ i ] . GracePeriod = period
}
results = append ( results , softThresholds ... )
for i := range results {
2019-09-27 21:51:53 +00:00
if minReclaim , ok := minReclaims [ results [ i ] . Signal ] ; ok {
results [ i ] . MinReclaim = & minReclaim
2019-01-12 04:58:27 +00:00
}
}
for _ , key := range allocatableConfig {
if key == kubetypes . NodeAllocatableEnforcementKey {
results = addAllocatableThresholds ( results )
break
}
}
return results , nil
}
func addAllocatableThresholds ( thresholds [ ] evictionapi . Threshold ) [ ] evictionapi . Threshold {
additionalThresholds := [ ] evictionapi . Threshold { }
for _ , threshold := range thresholds {
if threshold . Signal == evictionapi . SignalMemoryAvailable && isHardEvictionThreshold ( threshold ) {
// Copy the SignalMemoryAvailable to SignalAllocatableMemoryAvailable
additionalThresholds = append ( additionalThresholds , evictionapi . Threshold {
Signal : evictionapi . SignalAllocatableMemoryAvailable ,
Operator : threshold . Operator ,
Value : threshold . Value ,
MinReclaim : threshold . MinReclaim ,
} )
}
}
return append ( thresholds , additionalThresholds ... )
}
// parseThresholdStatements parses the input statements into a list of Threshold objects.
func parseThresholdStatements ( statements map [ string ] string ) ( [ ] evictionapi . Threshold , error ) {
if len ( statements ) == 0 {
return nil , nil
}
results := [ ] evictionapi . Threshold { }
for signal , val := range statements {
result , err := parseThresholdStatement ( evictionapi . Signal ( signal ) , val )
if err != nil {
return nil , err
}
if result != nil {
results = append ( results , * result )
}
}
return results , nil
}
// parseThresholdStatement parses a threshold statement and returns a threshold,
// or nil if the threshold should be ignored.
func parseThresholdStatement ( signal evictionapi . Signal , val string ) ( * evictionapi . Threshold , error ) {
if ! validSignal ( signal ) {
return nil , fmt . Errorf ( unsupportedEvictionSignal , signal )
}
operator := evictionapi . OpForSignal [ signal ]
if strings . HasSuffix ( val , "%" ) {
// ignore 0% and 100%
if val == "0%" || val == "100%" {
return nil , nil
}
percentage , err := parsePercentage ( val )
if err != nil {
return nil , err
}
if percentage < 0 {
return nil , fmt . Errorf ( "eviction percentage threshold %v must be >= 0%%: %s" , signal , val )
}
if percentage > 100 {
return nil , fmt . Errorf ( "eviction percentage threshold %v must be <= 100%%: %s" , signal , val )
}
return & evictionapi . Threshold {
Signal : signal ,
Operator : operator ,
Value : evictionapi . ThresholdValue {
Percentage : percentage ,
} ,
} , nil
}
quantity , err := resource . ParseQuantity ( val )
if err != nil {
return nil , err
}
if quantity . Sign ( ) < 0 || quantity . IsZero ( ) {
return nil , fmt . Errorf ( "eviction threshold %v must be positive: %s" , signal , & quantity )
}
return & evictionapi . Threshold {
Signal : signal ,
Operator : operator ,
Value : evictionapi . ThresholdValue {
Quantity : & quantity ,
} ,
} , nil
}
// parsePercentage parses a string representing a percentage value
func parsePercentage ( input string ) ( float32 , error ) {
value , err := strconv . ParseFloat ( strings . TrimRight ( input , "%" ) , 32 )
if err != nil {
return 0 , err
}
return float32 ( value ) / 100 , nil
}
// parseGracePeriods parses the grace period statements
func parseGracePeriods ( statements map [ string ] string ) ( map [ evictionapi . Signal ] time . Duration , error ) {
if len ( statements ) == 0 {
return nil , nil
}
results := map [ evictionapi . Signal ] time . Duration { }
for signal , val := range statements {
signal := evictionapi . Signal ( signal )
if ! validSignal ( signal ) {
return nil , fmt . Errorf ( unsupportedEvictionSignal , signal )
}
gracePeriod , err := time . ParseDuration ( val )
if err != nil {
return nil , err
}
if gracePeriod < 0 {
return nil , fmt . Errorf ( "invalid eviction grace period specified: %v, must be a positive value" , val )
}
results [ signal ] = gracePeriod
}
return results , nil
}
// parseMinimumReclaims parses the minimum reclaim statements
func parseMinimumReclaims ( statements map [ string ] string ) ( map [ evictionapi . Signal ] evictionapi . ThresholdValue , error ) {
if len ( statements ) == 0 {
return nil , nil
}
results := map [ evictionapi . Signal ] evictionapi . ThresholdValue { }
for signal , val := range statements {
signal := evictionapi . Signal ( signal )
if ! validSignal ( signal ) {
return nil , fmt . Errorf ( unsupportedEvictionSignal , signal )
}
if strings . HasSuffix ( val , "%" ) {
percentage , err := parsePercentage ( val )
if err != nil {
return nil , err
}
if percentage <= 0 {
return nil , fmt . Errorf ( "eviction percentage minimum reclaim %v must be positive: %s" , signal , val )
}
results [ signal ] = evictionapi . ThresholdValue {
Percentage : percentage ,
}
continue
}
quantity , err := resource . ParseQuantity ( val )
if err != nil {
return nil , err
}
if quantity . Sign ( ) < 0 {
return nil , fmt . Errorf ( "negative eviction minimum reclaim specified for %v" , signal )
}
results [ signal ] = evictionapi . ThresholdValue {
Quantity : & quantity ,
}
}
return results , nil
}
// diskUsage converts used bytes into a resource quantity.
func diskUsage ( fsStats * statsapi . FsStats ) * resource . Quantity {
if fsStats == nil || fsStats . UsedBytes == nil {
return & resource . Quantity { Format : resource . BinarySI }
}
usage := int64 ( * fsStats . UsedBytes )
return resource . NewQuantity ( usage , resource . BinarySI )
}
// inodeUsage converts inodes consumed into a resource quantity.
func inodeUsage ( fsStats * statsapi . FsStats ) * resource . Quantity {
if fsStats == nil || fsStats . InodesUsed == nil {
return & resource . Quantity { Format : resource . DecimalSI }
}
usage := int64 ( * fsStats . InodesUsed )
return resource . NewQuantity ( usage , resource . DecimalSI )
}
// memoryUsage converts working set into a resource quantity.
func memoryUsage ( memStats * statsapi . MemoryStats ) * resource . Quantity {
if memStats == nil || memStats . WorkingSetBytes == nil {
return & resource . Quantity { Format : resource . BinarySI }
}
usage := int64 ( * memStats . WorkingSetBytes )
return resource . NewQuantity ( usage , resource . BinarySI )
}
// localVolumeNames returns the set of volumes for the pod that are local
2019-12-12 01:27:03 +00:00
// TODO: summary API should report what volumes consume local storage rather than hard-code here.
2019-01-12 04:58:27 +00:00
func localVolumeNames ( pod * v1 . Pod ) [ ] string {
result := [ ] string { }
for _ , volume := range pod . Spec . Volumes {
if volume . HostPath != nil ||
( volume . EmptyDir != nil && volume . EmptyDir . Medium != v1 . StorageMediumMemory ) ||
volume . ConfigMap != nil ||
volume . GitRepo != nil {
result = append ( result , volume . Name )
}
}
return result
}
// containerUsage aggregates container disk usage and inode consumption for the specified stats to measure.
func containerUsage ( podStats statsapi . PodStats , statsToMeasure [ ] fsStatsType ) v1 . ResourceList {
disk := resource . Quantity { Format : resource . BinarySI }
inodes := resource . Quantity { Format : resource . DecimalSI }
for _ , container := range podStats . Containers {
if hasFsStatsType ( statsToMeasure , fsStatsRoot ) {
disk . Add ( * diskUsage ( container . Rootfs ) )
inodes . Add ( * inodeUsage ( container . Rootfs ) )
}
if hasFsStatsType ( statsToMeasure , fsStatsLogs ) {
disk . Add ( * diskUsage ( container . Logs ) )
inodes . Add ( * inodeUsage ( container . Logs ) )
}
}
return v1 . ResourceList {
v1 . ResourceEphemeralStorage : disk ,
resourceInodes : inodes ,
}
}
// podLocalVolumeUsage aggregates pod local volumes disk usage and inode consumption for the specified stats to measure.
func podLocalVolumeUsage ( volumeNames [ ] string , podStats statsapi . PodStats ) v1 . ResourceList {
disk := resource . Quantity { Format : resource . BinarySI }
inodes := resource . Quantity { Format : resource . DecimalSI }
for _ , volumeName := range volumeNames {
for _ , volumeStats := range podStats . VolumeStats {
if volumeStats . Name == volumeName {
disk . Add ( * diskUsage ( & volumeStats . FsStats ) )
inodes . Add ( * inodeUsage ( & volumeStats . FsStats ) )
break
}
}
}
return v1 . ResourceList {
v1 . ResourceEphemeralStorage : disk ,
resourceInodes : inodes ,
}
}
// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
func podDiskUsage ( podStats statsapi . PodStats , pod * v1 . Pod , statsToMeasure [ ] fsStatsType ) ( v1 . ResourceList , error ) {
disk := resource . Quantity { Format : resource . BinarySI }
inodes := resource . Quantity { Format : resource . DecimalSI }
containerUsageList := containerUsage ( podStats , statsToMeasure )
disk . Add ( containerUsageList [ v1 . ResourceEphemeralStorage ] )
inodes . Add ( containerUsageList [ resourceInodes ] )
if hasFsStatsType ( statsToMeasure , fsStatsLocalVolumeSource ) {
volumeNames := localVolumeNames ( pod )
podLocalVolumeUsageList := podLocalVolumeUsage ( volumeNames , podStats )
disk . Add ( podLocalVolumeUsageList [ v1 . ResourceEphemeralStorage ] )
inodes . Add ( podLocalVolumeUsageList [ resourceInodes ] )
}
return v1 . ResourceList {
v1 . ResourceEphemeralStorage : disk ,
resourceInodes : inodes ,
} , nil
}
// localEphemeralVolumeNames returns the set of ephemeral volumes for the pod that are local
func localEphemeralVolumeNames ( pod * v1 . Pod ) [ ] string {
result := [ ] string { }
for _ , volume := range pod . Spec . Volumes {
2019-08-30 18:33:25 +00:00
if volumeutils . IsLocalEphemeralVolume ( volume ) {
2019-01-12 04:58:27 +00:00
result = append ( result , volume . Name )
}
}
return result
}
// podLocalEphemeralStorageUsage aggregates pod local ephemeral storage usage and inode consumption for the specified stats to measure.
2020-07-17 23:14:37 +00:00
func podLocalEphemeralStorageUsage ( podStats statsapi . PodStats , pod * v1 . Pod , statsToMeasure [ ] fsStatsType , etcHostsPath string ) ( v1 . ResourceList , error ) {
2019-01-12 04:58:27 +00:00
disk := resource . Quantity { Format : resource . BinarySI }
inodes := resource . Quantity { Format : resource . DecimalSI }
containerUsageList := containerUsage ( podStats , statsToMeasure )
disk . Add ( containerUsageList [ v1 . ResourceEphemeralStorage ] )
inodes . Add ( containerUsageList [ resourceInodes ] )
if hasFsStatsType ( statsToMeasure , fsStatsLocalVolumeSource ) {
volumeNames := localEphemeralVolumeNames ( pod )
podLocalVolumeUsageList := podLocalVolumeUsage ( volumeNames , podStats )
disk . Add ( podLocalVolumeUsageList [ v1 . ResourceEphemeralStorage ] )
inodes . Add ( podLocalVolumeUsageList [ resourceInodes ] )
}
2020-07-17 23:14:37 +00:00
if len ( etcHostsPath ) > 0 {
if stat , err := os . Stat ( etcHostsPath ) ; err == nil {
disk . Add ( * resource . NewQuantity ( int64 ( stat . Size ( ) ) , resource . BinarySI ) )
inodes . Add ( * resource . NewQuantity ( int64 ( 1 ) , resource . DecimalSI ) )
}
}
2019-01-12 04:58:27 +00:00
return v1 . ResourceList {
v1 . ResourceEphemeralStorage : disk ,
resourceInodes : inodes ,
} , nil
}
// formatThreshold formats a threshold for logging.
func formatThreshold ( threshold evictionapi . Threshold ) string {
return fmt . Sprintf ( "threshold(signal=%v, operator=%v, value=%v, gracePeriod=%v)" , threshold . Signal , threshold . Operator , evictionapi . ThresholdValue ( threshold . Value ) , threshold . GracePeriod )
}
// cachedStatsFunc returns a statsFunc based on the provided pod stats.
func cachedStatsFunc ( podStats [ ] statsapi . PodStats ) statsFunc {
uid2PodStats := map [ string ] statsapi . PodStats { }
for i := range podStats {
uid2PodStats [ podStats [ i ] . PodRef . UID ] = podStats [ i ]
}
return func ( pod * v1 . Pod ) ( statsapi . PodStats , bool ) {
stats , found := uid2PodStats [ string ( pod . UID ) ]
return stats , found
}
}
// Cmp compares p1 and p2 and returns:
//
// -1 if p1 < p2
// 0 if p1 == p2
// +1 if p1 > p2
//
type cmpFunc func ( p1 , p2 * v1 . Pod ) int
// multiSorter implements the Sort interface, sorting changes within.
type multiSorter struct {
pods [ ] * v1 . Pod
cmp [ ] cmpFunc
}
// Sort sorts the argument slice according to the less functions passed to OrderedBy.
func ( ms * multiSorter ) Sort ( pods [ ] * v1 . Pod ) {
ms . pods = pods
sort . Sort ( ms )
}
// OrderedBy returns a Sorter that sorts using the cmp functions, in order.
// Call its Sort method to sort the data.
func orderedBy ( cmp ... cmpFunc ) * multiSorter {
return & multiSorter {
cmp : cmp ,
}
}
// Len is part of sort.Interface.
func ( ms * multiSorter ) Len ( ) int {
return len ( ms . pods )
}
// Swap is part of sort.Interface.
func ( ms * multiSorter ) Swap ( i , j int ) {
ms . pods [ i ] , ms . pods [ j ] = ms . pods [ j ] , ms . pods [ i ]
}
// Less is part of sort.Interface.
func ( ms * multiSorter ) Less ( i , j int ) bool {
p1 , p2 := ms . pods [ i ] , ms . pods [ j ]
var k int
for k = 0 ; k < len ( ms . cmp ) - 1 ; k ++ {
cmpResult := ms . cmp [ k ] ( p1 , p2 )
// p1 is less than p2
if cmpResult < 0 {
return true
}
// p1 is greater than p2
if cmpResult > 0 {
return false
}
// we don't know yet
}
// the last cmp func is the final decider
return ms . cmp [ k ] ( p1 , p2 ) < 0
}
// priority compares pods by Priority, if priority is enabled.
func priority ( p1 , p2 * v1 . Pod ) int {
2019-12-12 01:27:03 +00:00
priority1 := pod . GetPodPriority ( p1 )
priority2 := pod . GetPodPriority ( p2 )
2019-01-12 04:58:27 +00:00
if priority1 == priority2 {
return 0
}
if priority1 > priority2 {
return 1
}
return - 1
}
// exceedMemoryRequests compares whether or not pods' memory usage exceeds their requests
func exceedMemoryRequests ( stats statsFunc ) cmpFunc {
return func ( p1 , p2 * v1 . Pod ) int {
p1Stats , p1Found := stats ( p1 )
p2Stats , p2Found := stats ( p2 )
if ! p1Found || ! p2Found {
// prioritize evicting the pod for which no stats were found
return cmpBool ( ! p1Found , ! p2Found )
}
2019-04-07 17:07:55 +00:00
p1Memory := memoryUsage ( p1Stats . Memory )
p2Memory := memoryUsage ( p2Stats . Memory )
2019-09-27 21:51:53 +00:00
p1ExceedsRequests := p1Memory . Cmp ( v1resource . GetResourceRequestQuantity ( p1 , v1 . ResourceMemory ) ) == 1
p2ExceedsRequests := p2Memory . Cmp ( v1resource . GetResourceRequestQuantity ( p2 , v1 . ResourceMemory ) ) == 1
2019-01-12 04:58:27 +00:00
// prioritize evicting the pod which exceeds its requests
return cmpBool ( p1ExceedsRequests , p2ExceedsRequests )
}
}
// memory compares pods by largest consumer of memory relative to request.
func memory ( stats statsFunc ) cmpFunc {
return func ( p1 , p2 * v1 . Pod ) int {
p1Stats , p1Found := stats ( p1 )
p2Stats , p2Found := stats ( p2 )
if ! p1Found || ! p2Found {
// prioritize evicting the pod for which no stats were found
return cmpBool ( ! p1Found , ! p2Found )
}
// adjust p1, p2 usage relative to the request (if any)
2019-04-07 17:07:55 +00:00
p1Memory := memoryUsage ( p1Stats . Memory )
2019-09-27 21:51:53 +00:00
p1Request := v1resource . GetResourceRequestQuantity ( p1 , v1 . ResourceMemory )
2019-01-12 04:58:27 +00:00
p1Memory . Sub ( p1Request )
2019-04-07 17:07:55 +00:00
p2Memory := memoryUsage ( p2Stats . Memory )
2019-09-27 21:51:53 +00:00
p2Request := v1resource . GetResourceRequestQuantity ( p2 , v1 . ResourceMemory )
2019-01-12 04:58:27 +00:00
p2Memory . Sub ( p2Request )
// prioritize evicting the pod which has the larger consumption of memory
2019-04-07 17:07:55 +00:00
return p2Memory . Cmp ( * p1Memory )
2019-01-12 04:58:27 +00:00
}
}
// exceedDiskRequests compares whether or not pods' disk usage exceeds their requests
func exceedDiskRequests ( stats statsFunc , fsStatsToMeasure [ ] fsStatsType , diskResource v1 . ResourceName ) cmpFunc {
return func ( p1 , p2 * v1 . Pod ) int {
p1Stats , p1Found := stats ( p1 )
p2Stats , p2Found := stats ( p2 )
if ! p1Found || ! p2Found {
// prioritize evicting the pod for which no stats were found
return cmpBool ( ! p1Found , ! p2Found )
}
p1Usage , p1Err := podDiskUsage ( p1Stats , p1 , fsStatsToMeasure )
p2Usage , p2Err := podDiskUsage ( p2Stats , p2 , fsStatsToMeasure )
if p1Err != nil || p2Err != nil {
// prioritize evicting the pod which had an error getting stats
return cmpBool ( p1Err != nil , p2Err != nil )
}
p1Disk := p1Usage [ diskResource ]
p2Disk := p2Usage [ diskResource ]
2019-09-27 21:51:53 +00:00
p1ExceedsRequests := p1Disk . Cmp ( v1resource . GetResourceRequestQuantity ( p1 , diskResource ) ) == 1
p2ExceedsRequests := p2Disk . Cmp ( v1resource . GetResourceRequestQuantity ( p2 , diskResource ) ) == 1
2019-01-12 04:58:27 +00:00
// prioritize evicting the pod which exceeds its requests
return cmpBool ( p1ExceedsRequests , p2ExceedsRequests )
}
}
// disk compares pods by largest consumer of disk relative to request for the specified disk resource.
func disk ( stats statsFunc , fsStatsToMeasure [ ] fsStatsType , diskResource v1 . ResourceName ) cmpFunc {
return func ( p1 , p2 * v1 . Pod ) int {
p1Stats , p1Found := stats ( p1 )
p2Stats , p2Found := stats ( p2 )
if ! p1Found || ! p2Found {
// prioritize evicting the pod for which no stats were found
return cmpBool ( ! p1Found , ! p2Found )
}
p1Usage , p1Err := podDiskUsage ( p1Stats , p1 , fsStatsToMeasure )
p2Usage , p2Err := podDiskUsage ( p2Stats , p2 , fsStatsToMeasure )
if p1Err != nil || p2Err != nil {
// prioritize evicting the pod which had an error getting stats
return cmpBool ( p1Err != nil , p2Err != nil )
}
// adjust p1, p2 usage relative to the request (if any)
p1Disk := p1Usage [ diskResource ]
p2Disk := p2Usage [ diskResource ]
2019-09-27 21:51:53 +00:00
p1Request := v1resource . GetResourceRequestQuantity ( p1 , v1 . ResourceEphemeralStorage )
2019-01-12 04:58:27 +00:00
p1Disk . Sub ( p1Request )
2019-09-27 21:51:53 +00:00
p2Request := v1resource . GetResourceRequestQuantity ( p2 , v1 . ResourceEphemeralStorage )
2019-01-12 04:58:27 +00:00
p2Disk . Sub ( p2Request )
// prioritize evicting the pod which has the larger consumption of disk
return p2Disk . Cmp ( p1Disk )
}
}
// cmpBool compares booleans, placing true before false
func cmpBool ( a , b bool ) int {
if a == b {
return 0
}
if ! b {
return - 1
}
return 1
}
// rankMemoryPressure orders the input pods for eviction in response to memory pressure.
// It ranks by whether or not the pod's usage exceeds its requests, then by priority, and
// finally by memory usage above requests.
func rankMemoryPressure ( pods [ ] * v1 . Pod , stats statsFunc ) {
orderedBy ( exceedMemoryRequests ( stats ) , priority , memory ( stats ) ) . Sort ( pods )
}
2019-04-07 17:07:55 +00:00
// rankPIDPressure orders the input pods by priority in response to PID pressure.
func rankPIDPressure ( pods [ ] * v1 . Pod , stats statsFunc ) {
orderedBy ( priority ) . Sort ( pods )
}
2019-01-12 04:58:27 +00:00
// rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
func rankDiskPressureFunc ( fsStatsToMeasure [ ] fsStatsType , diskResource v1 . ResourceName ) rankFunc {
return func ( pods [ ] * v1 . Pod , stats statsFunc ) {
orderedBy ( exceedDiskRequests ( stats , fsStatsToMeasure , diskResource ) , priority , disk ( stats , fsStatsToMeasure , diskResource ) ) . Sort ( pods )
}
}
// byEvictionPriority implements sort.Interface for []v1.ResourceName.
type byEvictionPriority [ ] evictionapi . Threshold
func ( a byEvictionPriority ) Len ( ) int { return len ( a ) }
func ( a byEvictionPriority ) Swap ( i , j int ) { a [ i ] , a [ j ] = a [ j ] , a [ i ] }
// Less ranks memory before all other resources, and ranks thresholds with no resource to reclaim last
func ( a byEvictionPriority ) Less ( i , j int ) bool {
_ , jSignalHasResource := signalToResource [ a [ j ] . Signal ]
return a [ i ] . Signal == evictionapi . SignalMemoryAvailable || a [ i ] . Signal == evictionapi . SignalAllocatableMemoryAvailable || ! jSignalHasResource
}
// makeSignalObservations derives observations using the specified summary provider.
func makeSignalObservations ( summary * statsapi . Summary ) ( signalObservations , statsFunc ) {
// build the function to work against for pod stats
statsFunc := cachedStatsFunc ( summary . Pods )
// build an evaluation context for current eviction signals
result := signalObservations { }
if memory := summary . Node . Memory ; memory != nil && memory . AvailableBytes != nil && memory . WorkingSetBytes != nil {
result [ evictionapi . SignalMemoryAvailable ] = signalObservation {
available : resource . NewQuantity ( int64 ( * memory . AvailableBytes ) , resource . BinarySI ) ,
capacity : resource . NewQuantity ( int64 ( * memory . AvailableBytes + * memory . WorkingSetBytes ) , resource . BinarySI ) ,
time : memory . Time ,
}
}
if allocatableContainer , err := getSysContainer ( summary . Node . SystemContainers , statsapi . SystemContainerPods ) ; err != nil {
klog . Errorf ( "eviction manager: failed to construct signal: %q error: %v" , evictionapi . SignalAllocatableMemoryAvailable , err )
} else {
if memory := allocatableContainer . Memory ; memory != nil && memory . AvailableBytes != nil && memory . WorkingSetBytes != nil {
result [ evictionapi . SignalAllocatableMemoryAvailable ] = signalObservation {
available : resource . NewQuantity ( int64 ( * memory . AvailableBytes ) , resource . BinarySI ) ,
capacity : resource . NewQuantity ( int64 ( * memory . AvailableBytes + * memory . WorkingSetBytes ) , resource . BinarySI ) ,
time : memory . Time ,
}
}
}
if nodeFs := summary . Node . Fs ; nodeFs != nil {
if nodeFs . AvailableBytes != nil && nodeFs . CapacityBytes != nil {
result [ evictionapi . SignalNodeFsAvailable ] = signalObservation {
available : resource . NewQuantity ( int64 ( * nodeFs . AvailableBytes ) , resource . BinarySI ) ,
capacity : resource . NewQuantity ( int64 ( * nodeFs . CapacityBytes ) , resource . BinarySI ) ,
time : nodeFs . Time ,
}
}
if nodeFs . InodesFree != nil && nodeFs . Inodes != nil {
result [ evictionapi . SignalNodeFsInodesFree ] = signalObservation {
available : resource . NewQuantity ( int64 ( * nodeFs . InodesFree ) , resource . DecimalSI ) ,
capacity : resource . NewQuantity ( int64 ( * nodeFs . Inodes ) , resource . DecimalSI ) ,
time : nodeFs . Time ,
}
}
}
if summary . Node . Runtime != nil {
if imageFs := summary . Node . Runtime . ImageFs ; imageFs != nil {
if imageFs . AvailableBytes != nil && imageFs . CapacityBytes != nil {
result [ evictionapi . SignalImageFsAvailable ] = signalObservation {
available : resource . NewQuantity ( int64 ( * imageFs . AvailableBytes ) , resource . BinarySI ) ,
capacity : resource . NewQuantity ( int64 ( * imageFs . CapacityBytes ) , resource . BinarySI ) ,
time : imageFs . Time ,
}
if imageFs . InodesFree != nil && imageFs . Inodes != nil {
result [ evictionapi . SignalImageFsInodesFree ] = signalObservation {
available : resource . NewQuantity ( int64 ( * imageFs . InodesFree ) , resource . DecimalSI ) ,
capacity : resource . NewQuantity ( int64 ( * imageFs . Inodes ) , resource . DecimalSI ) ,
time : imageFs . Time ,
}
}
}
}
}
if rlimit := summary . Node . Rlimit ; rlimit != nil {
if rlimit . NumOfRunningProcesses != nil && rlimit . MaxPID != nil {
available := int64 ( * rlimit . MaxPID ) - int64 ( * rlimit . NumOfRunningProcesses )
result [ evictionapi . SignalPIDAvailable ] = signalObservation {
available : resource . NewQuantity ( available , resource . BinarySI ) ,
capacity : resource . NewQuantity ( int64 ( * rlimit . MaxPID ) , resource . BinarySI ) ,
time : rlimit . Time ,
}
}
}
return result , statsFunc
}
func getSysContainer ( sysContainers [ ] statsapi . ContainerStats , name string ) ( * statsapi . ContainerStats , error ) {
for _ , cont := range sysContainers {
if cont . Name == name {
return & cont , nil
}
}
return nil , fmt . Errorf ( "system container %q not found in metrics" , name )
}
// thresholdsMet returns the set of thresholds that were met independent of grace period
func thresholdsMet ( thresholds [ ] evictionapi . Threshold , observations signalObservations , enforceMinReclaim bool ) [ ] evictionapi . Threshold {
results := [ ] evictionapi . Threshold { }
for i := range thresholds {
threshold := thresholds [ i ]
observed , found := observations [ threshold . Signal ]
if ! found {
klog . Warningf ( "eviction manager: no observation found for eviction signal %v" , threshold . Signal )
continue
}
// determine if we have met the specified threshold
thresholdMet := false
quantity := evictionapi . GetThresholdQuantity ( threshold . Value , observed . capacity )
// if enforceMinReclaim is specified, we compare relative to value - minreclaim
if enforceMinReclaim && threshold . MinReclaim != nil {
quantity . Add ( * evictionapi . GetThresholdQuantity ( * threshold . MinReclaim , observed . capacity ) )
}
thresholdResult := quantity . Cmp ( * observed . available )
switch threshold . Operator {
case evictionapi . OpLessThan :
thresholdMet = thresholdResult > 0
}
if thresholdMet {
results = append ( results , threshold )
}
}
return results
}
func debugLogObservations ( logPrefix string , observations signalObservations ) {
if ! klog . V ( 3 ) {
return
}
for k , v := range observations {
if ! v . time . IsZero ( ) {
klog . Infof ( "eviction manager: %v: signal=%v, available: %v, capacity: %v, time: %v" , logPrefix , k , v . available , v . capacity , v . time )
} else {
klog . Infof ( "eviction manager: %v: signal=%v, available: %v, capacity: %v" , logPrefix , k , v . available , v . capacity )
}
}
}
func debugLogThresholdsWithObservation ( logPrefix string , thresholds [ ] evictionapi . Threshold , observations signalObservations ) {
if ! klog . V ( 3 ) {
return
}
for i := range thresholds {
threshold := thresholds [ i ]
observed , found := observations [ threshold . Signal ]
if found {
quantity := evictionapi . GetThresholdQuantity ( threshold . Value , observed . capacity )
klog . Infof ( "eviction manager: %v: threshold [signal=%v, quantity=%v] observed %v" , logPrefix , threshold . Signal , quantity , observed . available )
} else {
klog . Infof ( "eviction manager: %v: threshold [signal=%v] had no observation" , logPrefix , threshold . Signal )
}
}
}
func thresholdsUpdatedStats ( thresholds [ ] evictionapi . Threshold , observations , lastObservations signalObservations ) [ ] evictionapi . Threshold {
results := [ ] evictionapi . Threshold { }
for i := range thresholds {
threshold := thresholds [ i ]
observed , found := observations [ threshold . Signal ]
if ! found {
klog . Warningf ( "eviction manager: no observation found for eviction signal %v" , threshold . Signal )
continue
}
last , found := lastObservations [ threshold . Signal ]
if ! found || observed . time . IsZero ( ) || observed . time . After ( last . time . Time ) {
results = append ( results , threshold )
}
}
return results
}
// thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when active set of thresholds were initially met.
func thresholdsFirstObservedAt ( thresholds [ ] evictionapi . Threshold , lastObservedAt thresholdsObservedAt , now time . Time ) thresholdsObservedAt {
results := thresholdsObservedAt { }
for i := range thresholds {
observedAt , found := lastObservedAt [ thresholds [ i ] ]
if ! found {
observedAt = now
}
results [ thresholds [ i ] ] = observedAt
}
return results
}
// thresholdsMetGracePeriod returns the set of thresholds that have satisfied associated grace period
func thresholdsMetGracePeriod ( observedAt thresholdsObservedAt , now time . Time ) [ ] evictionapi . Threshold {
results := [ ] evictionapi . Threshold { }
for threshold , at := range observedAt {
duration := now . Sub ( at )
if duration < threshold . GracePeriod {
klog . V ( 2 ) . Infof ( "eviction manager: eviction criteria not yet met for %v, duration: %v" , formatThreshold ( threshold ) , duration )
continue
}
results = append ( results , threshold )
}
return results
}
// nodeConditions returns the set of node conditions associated with a threshold
func nodeConditions ( thresholds [ ] evictionapi . Threshold ) [ ] v1 . NodeConditionType {
results := [ ] v1 . NodeConditionType { }
for _ , threshold := range thresholds {
if nodeCondition , found := signalToNodeCondition [ threshold . Signal ] ; found {
if ! hasNodeCondition ( results , nodeCondition ) {
results = append ( results , nodeCondition )
}
}
}
return results
}
// nodeConditionsLastObservedAt merges the input with the previous observation to determine when a condition was most recently met.
func nodeConditionsLastObservedAt ( nodeConditions [ ] v1 . NodeConditionType , lastObservedAt nodeConditionsObservedAt , now time . Time ) nodeConditionsObservedAt {
results := nodeConditionsObservedAt { }
// the input conditions were observed "now"
for i := range nodeConditions {
results [ nodeConditions [ i ] ] = now
}
// the conditions that were not observed now are merged in with their old time
for key , value := range lastObservedAt {
_ , found := results [ key ]
if ! found {
results [ key ] = value
}
}
return results
}
// nodeConditionsObservedSince returns the set of conditions that have been observed within the specified period
func nodeConditionsObservedSince ( observedAt nodeConditionsObservedAt , period time . Duration , now time . Time ) [ ] v1 . NodeConditionType {
results := [ ] v1 . NodeConditionType { }
for nodeCondition , at := range observedAt {
duration := now . Sub ( at )
if duration < period {
results = append ( results , nodeCondition )
}
}
return results
}
// hasFsStatsType returns true if the fsStat is in the input list
func hasFsStatsType ( inputs [ ] fsStatsType , item fsStatsType ) bool {
for _ , input := range inputs {
if input == item {
return true
}
}
return false
}
// hasNodeCondition returns true if the node condition is in the input list
func hasNodeCondition ( inputs [ ] v1 . NodeConditionType , item v1 . NodeConditionType ) bool {
for _ , input := range inputs {
if input == item {
return true
}
}
return false
}
// mergeThresholds will merge both threshold lists eliminating duplicates.
func mergeThresholds ( inputsA [ ] evictionapi . Threshold , inputsB [ ] evictionapi . Threshold ) [ ] evictionapi . Threshold {
results := inputsA
for _ , threshold := range inputsB {
if ! hasThreshold ( results , threshold ) {
results = append ( results , threshold )
}
}
return results
}
// hasThreshold returns true if the threshold is in the input list
func hasThreshold ( inputs [ ] evictionapi . Threshold , item evictionapi . Threshold ) bool {
for _ , input := range inputs {
if input . GracePeriod == item . GracePeriod && input . Operator == item . Operator && input . Signal == item . Signal && compareThresholdValue ( input . Value , item . Value ) {
return true
}
}
return false
}
// compareThresholdValue returns true if the two thresholdValue objects are logically the same
func compareThresholdValue ( a evictionapi . ThresholdValue , b evictionapi . ThresholdValue ) bool {
if a . Quantity != nil {
if b . Quantity == nil {
return false
}
return a . Quantity . Cmp ( * b . Quantity ) == 0
}
if b . Quantity != nil {
return false
}
return a . Percentage == b . Percentage
}
// isHardEvictionThreshold returns true if eviction should immediately occur
func isHardEvictionThreshold ( threshold evictionapi . Threshold ) bool {
return threshold . GracePeriod == time . Duration ( 0 )
}
func isAllocatableEvictionThreshold ( threshold evictionapi . Threshold ) bool {
return threshold . Signal == evictionapi . SignalAllocatableMemoryAvailable
}
// buildSignalToRankFunc returns ranking functions associated with resources
func buildSignalToRankFunc ( withImageFs bool ) map [ evictionapi . Signal ] rankFunc {
signalToRankFunc := map [ evictionapi . Signal ] rankFunc {
evictionapi . SignalMemoryAvailable : rankMemoryPressure ,
evictionapi . SignalAllocatableMemoryAvailable : rankMemoryPressure ,
2019-04-07 17:07:55 +00:00
evictionapi . SignalPIDAvailable : rankPIDPressure ,
2019-01-12 04:58:27 +00:00
}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
signalToRankFunc [ evictionapi . SignalNodeFsAvailable ] = rankDiskPressureFunc ( [ ] fsStatsType { fsStatsLogs , fsStatsLocalVolumeSource } , v1 . ResourceEphemeralStorage )
signalToRankFunc [ evictionapi . SignalNodeFsInodesFree ] = rankDiskPressureFunc ( [ ] fsStatsType { fsStatsLogs , fsStatsLocalVolumeSource } , resourceInodes )
// with an imagefs, imagefs pod rank func for eviction only includes rootfs
signalToRankFunc [ evictionapi . SignalImageFsAvailable ] = rankDiskPressureFunc ( [ ] fsStatsType { fsStatsRoot } , v1 . ResourceEphemeralStorage )
signalToRankFunc [ evictionapi . SignalImageFsInodesFree ] = rankDiskPressureFunc ( [ ] fsStatsType { fsStatsRoot } , resourceInodes )
} else {
// without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
// since imagefs and nodefs share a common device, they share common ranking functions.
signalToRankFunc [ evictionapi . SignalNodeFsAvailable ] = rankDiskPressureFunc ( [ ] fsStatsType { fsStatsRoot , fsStatsLogs , fsStatsLocalVolumeSource } , v1 . ResourceEphemeralStorage )
signalToRankFunc [ evictionapi . SignalNodeFsInodesFree ] = rankDiskPressureFunc ( [ ] fsStatsType { fsStatsRoot , fsStatsLogs , fsStatsLocalVolumeSource } , resourceInodes )
signalToRankFunc [ evictionapi . SignalImageFsAvailable ] = rankDiskPressureFunc ( [ ] fsStatsType { fsStatsRoot , fsStatsLogs , fsStatsLocalVolumeSource } , v1 . ResourceEphemeralStorage )
signalToRankFunc [ evictionapi . SignalImageFsInodesFree ] = rankDiskPressureFunc ( [ ] fsStatsType { fsStatsRoot , fsStatsLogs , fsStatsLocalVolumeSource } , resourceInodes )
}
return signalToRankFunc
}
// PodIsEvicted returns true if the reported pod status is due to an eviction.
func PodIsEvicted ( podStatus v1 . PodStatus ) bool {
return podStatus . Phase == v1 . PodFailed && podStatus . Reason == Reason
}
// buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildSignalToNodeReclaimFuncs ( imageGC ImageGC , containerGC ContainerGC , withImageFs bool ) map [ evictionapi . Signal ] nodeReclaimFuncs {
signalToReclaimFunc := map [ evictionapi . Signal ] nodeReclaimFuncs { }
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pressure should just delete logs
signalToReclaimFunc [ evictionapi . SignalNodeFsAvailable ] = nodeReclaimFuncs { }
signalToReclaimFunc [ evictionapi . SignalNodeFsInodesFree ] = nodeReclaimFuncs { }
// with an imagefs, imagefs pressure should delete unused images
signalToReclaimFunc [ evictionapi . SignalImageFsAvailable ] = nodeReclaimFuncs { containerGC . DeleteAllUnusedContainers , imageGC . DeleteUnusedImages }
signalToReclaimFunc [ evictionapi . SignalImageFsInodesFree ] = nodeReclaimFuncs { containerGC . DeleteAllUnusedContainers , imageGC . DeleteUnusedImages }
} else {
// without an imagefs, nodefs pressure should delete logs, and unused images
// since imagefs and nodefs share a common device, they share common reclaim functions
signalToReclaimFunc [ evictionapi . SignalNodeFsAvailable ] = nodeReclaimFuncs { containerGC . DeleteAllUnusedContainers , imageGC . DeleteUnusedImages }
signalToReclaimFunc [ evictionapi . SignalNodeFsInodesFree ] = nodeReclaimFuncs { containerGC . DeleteAllUnusedContainers , imageGC . DeleteUnusedImages }
signalToReclaimFunc [ evictionapi . SignalImageFsAvailable ] = nodeReclaimFuncs { containerGC . DeleteAllUnusedContainers , imageGC . DeleteUnusedImages }
signalToReclaimFunc [ evictionapi . SignalImageFsInodesFree ] = nodeReclaimFuncs { containerGC . DeleteAllUnusedContainers , imageGC . DeleteUnusedImages }
}
return signalToReclaimFunc
}
// evictionMessage constructs a useful message about why an eviction occurred, and annotations to provide metadata about the eviction
func evictionMessage ( resourceToReclaim v1 . ResourceName , pod * v1 . Pod , stats statsFunc ) ( message string , annotations map [ string ] string ) {
annotations = make ( map [ string ] string )
message = fmt . Sprintf ( nodeLowMessageFmt , resourceToReclaim )
containers := [ ] string { }
containerUsage := [ ] string { }
podStats , ok := stats ( pod )
if ! ok {
return
}
for _ , containerStats := range podStats . Containers {
for _ , container := range pod . Spec . Containers {
if container . Name == containerStats . Name {
requests := container . Resources . Requests [ resourceToReclaim ]
var usage * resource . Quantity
switch resourceToReclaim {
case v1 . ResourceEphemeralStorage :
if containerStats . Rootfs != nil && containerStats . Rootfs . UsedBytes != nil && containerStats . Logs != nil && containerStats . Logs . UsedBytes != nil {
usage = resource . NewQuantity ( int64 ( * containerStats . Rootfs . UsedBytes + * containerStats . Logs . UsedBytes ) , resource . BinarySI )
}
case v1 . ResourceMemory :
if containerStats . Memory != nil && containerStats . Memory . WorkingSetBytes != nil {
usage = resource . NewQuantity ( int64 ( * containerStats . Memory . WorkingSetBytes ) , resource . BinarySI )
}
}
if usage != nil && usage . Cmp ( requests ) > 0 {
message += fmt . Sprintf ( containerMessageFmt , container . Name , usage . String ( ) , requests . String ( ) )
containers = append ( containers , container . Name )
containerUsage = append ( containerUsage , usage . String ( ) )
}
}
}
}
annotations [ OffendingContainersKey ] = strings . Join ( containers , "," )
annotations [ OffendingContainersUsageKey ] = strings . Join ( containerUsage , "," )
annotations [ StarvedResourceKey ] = string ( resourceToReclaim )
return
}