Merge pull request #63415 from dashpole/eviction_event

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

Clean up kubelet eviction events

**What this PR does / why we need it**:
This makes eviction events better.  
* Exceeding container disk limits no longer says "node was low on X", since the node isn't actually low on a resource.  The container limit was just exceeded.  Same for pods and volumes.
* Eviction message now lists containers which were exceeding their requests.  This is an event from a container evicted while under memory pressure: 
`reason: 'Evicted' The node was low on resource: memory. Container high-priority-memory-hog was using 166088Ki, which exceeds its request of 10Mi.`
* Eviction messages now displays real resources, when they exist.  Rather than `The node was low on resource: nodefs`, it will now show `The node was low on resource: ephemeral-storage`.

This also cleans up eviction code in order to accomplish this.  We previously had a resource for each signal: e.g. `SignalNodeFsAvailable` mapped to the resource`nodefs`, and `nodefs` maps to reclaim functions, and ranking functions.  Now, signals map directly to reclaim and ranking functions, and signals map to real resources: e.g. `SignalNodeFsAvailable` maps to the resource `ephemeral-storage`, which is what we use in events.
This also cleans up duplicated code by reusing the `evictPod` function.  It also removes the unused signal `SignalAllocatableNodeFsAvailable`.

**Release note**:
```release-note
NONE
```

/sig node
/priority important-longterm

/assign @dchen1107 @jingxu97
pull/8/head
Kubernetes Submit Queue 2018-05-08 10:38:29 -07:00 committed by GitHub
commit f13fa1e3af
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 222 additions and 210 deletions

View File

@ -20,7 +20,6 @@ go_test(
"//pkg/kubelet/eviction/api:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/quota:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",

View File

@ -38,8 +38,6 @@ const (
SignalImageFsInodesFree Signal = "imagefs.inodesFree"
// SignalAllocatableMemoryAvailable is amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods), in bytes.
SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available"
// SignalAllocatableNodeFsAvailable is amount of local storage available for pod allocation
SignalAllocatableNodeFsAvailable Signal = "allocatableNodeFs.available"
// SignalPIDAvailable is amount of PID available for pod allocation
SignalPIDAvailable Signal = "pid.available"
)
@ -60,13 +58,11 @@ const (
// from either above or below, never both). There is thus no reason to expose the
// operator in the Kubelet's public API. Instead, we internally map signal types to operators.
var OpForSignal = map[Signal]ThresholdOperator{
SignalMemoryAvailable: OpLessThan,
SignalNodeFsAvailable: OpLessThan,
SignalNodeFsInodesFree: OpLessThan,
SignalImageFsAvailable: OpLessThan,
SignalImageFsInodesFree: OpLessThan,
SignalAllocatableMemoryAvailable: OpLessThan,
SignalAllocatableNodeFsAvailable: OpLessThan,
SignalMemoryAvailable: OpLessThan,
SignalNodeFsAvailable: OpLessThan,
SignalNodeFsInodesFree: OpLessThan,
SignalImageFsAvailable: OpLessThan,
SignalImageFsInodesFree: OpLessThan,
}
// ThresholdValue is a value holder that abstracts literal versus percentage based quantity

View File

@ -76,10 +76,10 @@ type managerImpl struct {
thresholdsFirstObservedAt thresholdsObservedAt
// records the set of thresholds that have been met (including graceperiod) but not yet resolved
thresholdsMet []evictionapi.Threshold
// resourceToRankFunc maps a resource to ranking function for that resource.
resourceToRankFunc map[v1.ResourceName]rankFunc
// resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
resourceToNodeReclaimFuncs map[v1.ResourceName]nodeReclaimFuncs
// signalToRankFunc maps a resource to ranking function for that resource.
signalToRankFunc map[evictionapi.Signal]rankFunc
// signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
// last observations from synchronize
lastObservations signalObservations
// notifierStopCh is a channel used to stop all thresholdNotifiers
@ -239,8 +239,8 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
return nil
}
m.dedicatedImageFs = &hasImageFs
m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
m.signalToRankFunc = buildSignalToRankFunc(hasImageFs)
m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
}
activePods := podFunc()
@ -333,26 +333,26 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}
}
// determine the set of resources under starvation
starvedResources := getStarvedResources(thresholds)
if len(starvedResources) == 0 {
if len(thresholds) == 0 {
glog.V(3).Infof("eviction manager: no resources are starved")
return nil
}
// rank the resources to reclaim by eviction priority
sort.Sort(byEvictionPriority(starvedResources))
resourceToReclaim := starvedResources[0]
// rank the thresholds by eviction priority
sort.Sort(byEvictionPriority(thresholds))
thresholdToReclaim := thresholds[0]
resourceToReclaim, found := signalToResource[thresholdToReclaim.Signal]
if !found {
glog.V(3).Infof("eviction manager: threshold %s was crossed, but reclaim is not implemented for this threshold.", thresholdToReclaim.Signal)
return nil
}
glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
// determine if this is a soft or hard eviction associated with the resource
softEviction := isSoftEvictionThresholds(thresholds, resourceToReclaim)
// record an event about the resources we are now attempting to reclaim via eviction
m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
if m.reclaimNodeLevelResources(resourceToReclaim) {
if m.reclaimNodeLevelResources(thresholdToReclaim.Signal, resourceToReclaim) {
glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
return nil
}
@ -360,9 +360,9 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
// rank the pods for eviction
rank, ok := m.resourceToRankFunc[resourceToReclaim]
rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
if !ok {
glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim)
glog.Errorf("eviction manager: no ranking function for signal %s", thresholdToReclaim.Signal)
return nil
}
@ -388,30 +388,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
// do not evict such pods. Static pods are not re-admitted after evictions.
// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) {
continue
}
status := v1.PodStatus{
Phase: v1.PodFailed,
Message: fmt.Sprintf(message, resourceToReclaim),
Reason: reason,
}
// record that we are evicting the pod
m.recorder.Eventf(pod, v1.EventTypeWarning, reason, fmt.Sprintf(message, resourceToReclaim))
gracePeriodOverride := int64(0)
if softEviction {
if !isHardEvictionThreshold(thresholdToReclaim) {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
}
// this is a blocking call and should only return when the pod and its containers are killed.
err := m.killPodFunc(pod, status, &gracePeriodOverride)
if err != nil {
glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
if m.evictPod(pod, gracePeriodOverride, evictionMessage(resourceToReclaim, pod, statsFunc)) {
return []*v1.Pod{pod}
}
return []*v1.Pod{pod}
}
glog.Infof("eviction manager: unable to evict any pods from the node")
return nil
@ -440,8 +423,8 @@ func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods
}
// reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceName) bool {
nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim]
func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
for _, nodeReclaimFunc := range nodeReclaimFuncs {
// attempt to reclaim the pressured resource.
if err := nodeReclaimFunc(); err != nil {
@ -512,7 +495,7 @@ func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.
used := podVolumeUsed[pod.Spec.Volumes[i].Name]
if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
// the emptyDir usage exceeds the size limit, evict the pod
return m.evictPod(pod, v1.ResourceName("EmptyDir"), fmt.Sprintf("emptyDir usage exceeds the limit %q", size.String()))
return m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessage, pod.Spec.Volumes[i].Name, size.String()))
}
}
}
@ -540,10 +523,11 @@ func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStat
return false
}
podEphemeralStorageTotalUsage.Add(podEphemeralUsage[resourceDisk])
if podEphemeralStorageTotalUsage.Cmp(podLimits[v1.ResourceEphemeralStorage]) > 0 {
podEphemeralStorageTotalUsage.Add(podEphemeralUsage[v1.ResourceEphemeralStorage])
podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
// the total usage of pod exceeds the total size limit of containers, evict the pod
return m.evictPod(pod, v1.ResourceEphemeralStorage, fmt.Sprintf("pod ephemeral local storage usage exceeds the total limit of containers %v", podLimits[v1.ResourceEphemeralStorage]))
return m.evictPod(pod, 0, fmt.Sprintf(podEphemeralStorageMessage, podEphemeralStorageLimit.String()))
}
return false
}
@ -565,7 +549,7 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P
if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
return m.evictPod(pod, v1.ResourceEphemeralStorage, fmt.Sprintf("container's ephemeral local storage usage exceeds the limit %q", ephemeralStorageThreshold.String()))
return m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessage, containerStat.Name, ephemeralStorageThreshold.String()))
}
}
@ -573,21 +557,24 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P
return false
}
func (m *managerImpl) evictPod(pod *v1.Pod, resourceName v1.ResourceName, evictMsg string) bool {
func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string) bool {
// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
// do not evict such pods. Static pods are not re-admitted after evictions.
// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) {
glog.Errorf("eviction manager: cannot evict a critical pod %s", format.Pod(pod))
glog.Errorf("eviction manager: cannot evict a critical static pod %s", format.Pod(pod))
return false
}
status := v1.PodStatus{
Phase: v1.PodFailed,
Message: fmt.Sprintf(message, resourceName),
Message: evictMsg,
Reason: reason,
}
// record that we are evicting the pod
m.recorder.Eventf(pod, v1.EventTypeWarning, reason, evictMsg)
gracePeriod := int64(0)
err := m.killPodFunc(pod, status, &gracePeriod)
// this is a blocking call and should only return when the pod and its containers are killed.
err := m.killPodFunc(pod, status, &gracePeriodOverride)
if err != nil {
glog.Errorf("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
} else {

View File

@ -41,19 +41,17 @@ const (
// the reason reported back in status.
reason = "Evicted"
// the message associated with the reason.
message = "The node was low on resource: %v."
// disk, in bytes. internal to this module, used to account for local disk usage.
resourceDisk v1.ResourceName = "disk"
message = "The node was low on resource: %v. "
// additional information for containers exceeding requests
containerMessage = "Container %s was using %s, which exceeds its request of %s. "
// additional information for containers which have exceeded their ES limit
containerEphemeralStorageMessage = "Container %s exceeded its local ephemeral storage limit %q. "
// additional information for pods which have exceeded their ES limit
podEphemeralStorageMessage = "Pod ephemeral local storage usage exceeds the total limit of containers %s. "
// additional information for empty-dir volumes which have exceeded their size limit
emptyDirMessage = "Usage of EmptyDir volume %q exceeds the limit %q. "
// inodes, number. internal to this module, used to account for local disk inode consumption.
resourceInodes v1.ResourceName = "inodes"
// imagefs, in bytes. internal to this module, used to account for local image filesystem usage.
resourceImageFs v1.ResourceName = "imagefs"
// imagefs inodes, number. internal to this module, used to account for local image filesystem inodes.
resourceImageFsInodes v1.ResourceName = "imagefsInodes"
// nodefs, in bytes. internal to this module, used to account for local node root filesystem usage.
resourceNodeFs v1.ResourceName = "nodefs"
// nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes.
resourceNodeFsInodes v1.ResourceName = "nodefsInodes"
// this prevents constantly updating the memcg notifier if synchronize
// is run frequently.
notifierRefreshInterval = 10 * time.Second
@ -64,8 +62,6 @@ var (
signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType
// signalToResource maps a Signal to its associated Resource.
signalToResource map[evictionapi.Signal]v1.ResourceName
// resourceClaimToSignal maps a Resource that can be reclaimed to its associated Signal
resourceClaimToSignal map[v1.ResourceName][]evictionapi.Signal
)
func init() {
@ -83,17 +79,10 @@ func init() {
signalToResource = map[evictionapi.Signal]v1.ResourceName{}
signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs
signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes
signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs
signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceNodeFsInodes
// maps resource to signals (the following resource could be reclaimed)
resourceClaimToSignal = map[v1.ResourceName][]evictionapi.Signal{}
resourceClaimToSignal[resourceNodeFs] = []evictionapi.Signal{evictionapi.SignalNodeFsAvailable}
resourceClaimToSignal[resourceImageFs] = []evictionapi.Signal{evictionapi.SignalImageFsAvailable}
resourceClaimToSignal[resourceNodeFsInodes] = []evictionapi.Signal{evictionapi.SignalNodeFsInodesFree}
resourceClaimToSignal[resourceImageFsInodes] = []evictionapi.Signal{evictionapi.SignalImageFsInodesFree}
signalToResource[evictionapi.SignalImageFsAvailable] = v1.ResourceEphemeralStorage
signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes
signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage
signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes
}
// validSignal returns true if the signal is supported.
@ -310,10 +299,10 @@ func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity {
// inodeUsage converts inodes consumed into a resource quantity.
func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity {
if fsStats == nil || fsStats.InodesUsed == nil {
return &resource.Quantity{Format: resource.BinarySI}
return &resource.Quantity{Format: resource.DecimalSI}
}
usage := int64(*fsStats.InodesUsed)
return resource.NewQuantity(usage, resource.BinarySI)
return resource.NewQuantity(usage, resource.DecimalSI)
}
// memoryUsage converts working set into a resource quantity.
@ -343,7 +332,7 @@ func localVolumeNames(pod *v1.Pod) []string {
// containerUsage aggregates container disk usage and inode consumption for the specified stats to measure.
func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1.ResourceList {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.DecimalSI}
for _, container := range podStats.Containers {
if hasFsStatsType(statsToMeasure, fsStatsRoot) {
disk.Add(*diskUsage(container.Rootfs))
@ -355,15 +344,15 @@ func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1
}
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
v1.ResourceEphemeralStorage: disk,
resourceInodes: inodes,
}
}
// podLocalVolumeUsage aggregates pod local volumes disk usage and inode consumption for the specified stats to measure.
func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.ResourceList {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.DecimalSI}
for _, volumeName := range volumeNames {
for _, volumeStats := range podStats.VolumeStats {
if volumeStats.Name == volumeName {
@ -374,29 +363,29 @@ func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.Re
}
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
v1.ResourceEphemeralStorage: disk,
resourceInodes: inodes,
}
}
// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.DecimalSI}
containerUsageList := containerUsage(podStats, statsToMeasure)
disk.Add(containerUsageList[resourceDisk])
disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
inodes.Add(containerUsageList[resourceInodes])
if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
volumeNames := localVolumeNames(pod)
podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
disk.Add(podLocalVolumeUsageList[resourceDisk])
disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
inodes.Add(podLocalVolumeUsageList[resourceInodes])
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
v1.ResourceEphemeralStorage: disk,
resourceInodes: inodes,
}, nil
}
@ -426,21 +415,21 @@ func localEphemeralVolumeNames(pod *v1.Pod) []string {
// podLocalEphemeralStorageUsage aggregates pod local ephemeral storage usage and inode consumption for the specified stats to measure.
func podLocalEphemeralStorageUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.DecimalSI}
containerUsageList := containerUsage(podStats, statsToMeasure)
disk.Add(containerUsageList[resourceDisk])
disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
inodes.Add(containerUsageList[resourceInodes])
if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
volumeNames := localEphemeralVolumeNames(pod)
podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
disk.Add(podLocalVolumeUsageList[resourceDisk])
disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
inodes.Add(podLocalVolumeUsageList[resourceInodes])
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
v1.ResourceEphemeralStorage: disk,
resourceInodes: inodes,
}, nil
}
@ -605,7 +594,7 @@ func memory(stats statsFunc) cmpFunc {
// max(max of init container requests, sum of container requests)
func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
containerValue := resource.Quantity{Format: resource.BinarySI}
if resourceName == resourceDisk && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
if resourceName == v1.ResourceEphemeralStorage && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
// if the local storage capacity isolation feature gate is disabled, pods request 0 disk
return containerValue
}
@ -613,7 +602,7 @@ func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
switch resourceName {
case v1.ResourceMemory:
containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.Memory())
case resourceDisk:
case v1.ResourceEphemeralStorage:
containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.StorageEphemeral())
}
}
@ -624,7 +613,7 @@ func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.Memory()) < 0 {
initValue = *pod.Spec.InitContainers[i].Resources.Requests.Memory()
}
case resourceDisk:
case v1.ResourceEphemeralStorage:
if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()) < 0 {
initValue = *pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()
}
@ -681,9 +670,9 @@ func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.Resou
// adjust p1, p2 usage relative to the request (if any)
p1Disk := p1Usage[diskResource]
p2Disk := p2Usage[diskResource]
p1Request := podRequest(p1, resourceDisk)
p1Request := podRequest(p1, v1.ResourceEphemeralStorage)
p1Disk.Sub(p1Request)
p2Request := podRequest(p2, resourceDisk)
p2Request := podRequest(p2, v1.ResourceEphemeralStorage)
p2Disk.Sub(p2Request)
// prioritize evicting the pod which has the larger consumption of disk
return p2Disk.Cmp(p1Disk)
@ -716,14 +705,15 @@ func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.Resour
}
// byEvictionPriority implements sort.Interface for []v1.ResourceName.
type byEvictionPriority []v1.ResourceName
type byEvictionPriority []evictionapi.Threshold
func (a byEvictionPriority) Len() int { return len(a) }
func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// Less ranks memory before all other resources.
// Less ranks memory before all other resources, and ranks thresholds with no resource to reclaim last
func (a byEvictionPriority) Less(i, j int) bool {
return a[i] == v1.ResourceMemory
_, jSignalHasResource := signalToResource[a[j].Signal]
return a[i].Signal == evictionapi.SignalMemoryAvailable || a[i].Signal == evictionapi.SignalAllocatableMemoryAvailable || !jSignalHasResource
}
// makeSignalObservations derives observations using the specified summary provider.
@ -761,8 +751,8 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat
}
if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI),
capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI),
available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.DecimalSI),
capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.DecimalSI),
time: nodeFs.Time,
}
}
@ -777,8 +767,8 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat
}
if imageFs.InodesFree != nil && imageFs.Inodes != nil {
result[evictionapi.SignalImageFsInodesFree] = signalObservation{
available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI),
capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI),
available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI),
capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI),
time: imageFs.Time,
}
}
@ -1006,57 +996,34 @@ func compareThresholdValue(a evictionapi.ThresholdValue, b evictionapi.Threshold
return a.Percentage == b.Percentage
}
// getStarvedResources returns the set of resources that are starved based on thresholds met.
func getStarvedResources(thresholds []evictionapi.Threshold) []v1.ResourceName {
results := []v1.ResourceName{}
for _, threshold := range thresholds {
if starvedResource, found := signalToResource[threshold.Signal]; found {
results = append(results, starvedResource)
}
}
return results
}
// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds
func isSoftEvictionThresholds(thresholds []evictionapi.Threshold, starvedResource v1.ResourceName) bool {
for _, threshold := range thresholds {
if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource {
continue
}
if isHardEvictionThreshold(threshold) {
return false
}
}
return true
}
// isHardEvictionThreshold returns true if eviction should immediately occur
func isHardEvictionThreshold(threshold evictionapi.Threshold) bool {
return threshold.GracePeriod == time.Duration(0)
}
// buildResourceToRankFunc returns ranking functions associated with resources
func buildResourceToRankFunc(withImageFs bool) map[v1.ResourceName]rankFunc {
resourceToRankFunc := map[v1.ResourceName]rankFunc{
v1.ResourceMemory: rankMemoryPressure,
// buildSignalToRankFunc returns ranking functions associated with resources
func buildSignalToRankFunc(withImageFs bool) map[evictionapi.Signal]rankFunc {
signalToRankFunc := map[evictionapi.Signal]rankFunc{
evictionapi.SignalMemoryAvailable: rankMemoryPressure,
evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure,
}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
// with an imagefs, imagefs pod rank func for eviction only includes rootfs
resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk)
resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
} else {
// without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
// since imagefs and nodefs share a common device, they share common ranking functions.
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
}
return resourceToRankFunc
return signalToRankFunc
}
// PodIsEvicted returns true if the reported pod status is due to an eviction.
@ -1064,26 +1031,57 @@ func PodIsEvicted(podStatus v1.PodStatus) bool {
return podStatus.Phase == v1.PodFailed && podStatus.Reason == reason
}
// buildResourceToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildResourceToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[v1.ResourceName]nodeReclaimFuncs {
resourceToReclaimFunc := map[v1.ResourceName]nodeReclaimFuncs{}
// buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs {
signalToReclaimFunc := map[evictionapi.Signal]nodeReclaimFuncs{}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pressure should just delete logs
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{}
signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{}
signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{}
// with an imagefs, imagefs pressure should delete unused images
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
} else {
// without an imagefs, nodefs pressure should delete logs, and unused images
// since imagefs and nodefs share a common device, they share common reclaim functions
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
}
return resourceToReclaimFunc
return signalToReclaimFunc
}
// evictionMessage constructs a useful message about why an eviction occurred
func evictionMessage(resourceToReclaim v1.ResourceName, pod *v1.Pod, stats statsFunc) string {
message := fmt.Sprintf(message, resourceToReclaim)
podStats, ok := stats(pod)
if !ok {
return message
}
for _, containerStats := range podStats.Containers {
for _, container := range pod.Spec.Containers {
if container.Name == containerStats.Name {
requests := container.Resources.Requests[resourceToReclaim]
var usage *resource.Quantity
switch resourceToReclaim {
case v1.ResourceEphemeralStorage:
if containerStats.Rootfs != nil && containerStats.Rootfs.UsedBytes != nil && containerStats.Logs != nil && containerStats.Logs.UsedBytes != nil {
usage = resource.NewQuantity(int64(*containerStats.Rootfs.UsedBytes+*containerStats.Logs.UsedBytes), resource.BinarySI)
}
case v1.ResourceMemory:
if containerStats.Memory != nil && containerStats.Memory.WorkingSetBytes != nil {
usage = resource.NewQuantity(int64(*containerStats.Memory.WorkingSetBytes), resource.BinarySI)
}
}
if usage != nil && usage.Cmp(requests) > 0 {
message += fmt.Sprintf(containerMessage, container.Name, usage.String(), requests.String())
}
}
}
}
return message
}
// thresholdStopCh is a ThresholdStopCh which can only be closed after notifierRefreshInterval time has passed

View File

@ -19,6 +19,7 @@ package eviction
import (
"fmt"
"reflect"
"sort"
"sync"
"testing"
"time"
@ -29,12 +30,10 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/clock"
utilfeature "k8s.io/apiserver/pkg/util/feature"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/features"
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/quota"
)
func quantityMustParse(value string) *resource.Quantity {
@ -470,7 +469,7 @@ func TestOrderedByExceedsRequestDisk(t *testing.T) {
return result, found
}
pods := []*v1.Pod{below, exceeds}
orderedBy(exceedDiskRequests(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
orderedBy(exceedDiskRequests(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods)
expected := []*v1.Pod{exceeds, below}
for i := range expected {
@ -584,7 +583,7 @@ func TestOrderedbyDisk(t *testing.T) {
return result, found
}
pods := []*v1.Pod{pod1, pod2, pod3, pod4, pod5, pod6}
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods)
expected := []*v1.Pod{pod1, pod3, pod2, pod4, pod5, pod6}
for i := range expected {
if pods[i] != expected[i] {
@ -651,7 +650,7 @@ func TestOrderedbyDiskDisableLocalStorage(t *testing.T) {
return result, found
}
pods := []*v1.Pod{pod1, pod3, pod2, pod4, pod5, pod6}
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods)
expected := []*v1.Pod{pod5, pod3, pod1, pod6, pod4, pod2}
for i := range expected {
if pods[i] != expected[i] {
@ -780,7 +779,7 @@ func TestOrderedByPriorityDisk(t *testing.T) {
pods := []*v1.Pod{pod8, pod7, pod6, pod5, pod4, pod3, pod2, pod1}
expected := []*v1.Pod{pod1, pod2, pod3, pod4, pod5, pod6, pod7, pod8}
fsStatsToMeasure := []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}
orderedBy(exceedDiskRequests(statsFn, fsStatsToMeasure, resourceDisk), priority, disk(statsFn, fsStatsToMeasure, resourceDisk)).Sort(pods)
orderedBy(exceedDiskRequests(statsFn, fsStatsToMeasure, v1.ResourceEphemeralStorage), priority, disk(statsFn, fsStatsToMeasure, v1.ResourceEphemeralStorage)).Sort(pods)
for i := range expected {
if pods[i] != expected[i] {
t.Errorf("Expected pod[%d]: %s, but got: %s", i, expected[i].Name, pods[i].Name)
@ -932,6 +931,80 @@ func TestOrderedByPriorityMemory(t *testing.T) {
}
}
func TestSortByEvictionPriority(t *testing.T) {
for _, tc := range []struct {
name string
thresholds []evictionapi.Threshold
expected []evictionapi.Threshold
}{
{
name: "empty threshold list",
thresholds: []evictionapi.Threshold{},
expected: []evictionapi.Threshold{},
},
{
name: "memory first, PID last",
thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalPIDAvailable,
},
{
Signal: evictionapi.SignalNodeFsAvailable,
},
{
Signal: evictionapi.SignalMemoryAvailable,
},
},
expected: []evictionapi.Threshold{
{
Signal: evictionapi.SignalMemoryAvailable,
},
{
Signal: evictionapi.SignalNodeFsAvailable,
},
{
Signal: evictionapi.SignalPIDAvailable,
},
},
},
{
name: "allocatable memory first, PID last",
thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalPIDAvailable,
},
{
Signal: evictionapi.SignalNodeFsAvailable,
},
{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
},
},
expected: []evictionapi.Threshold{
{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
},
{
Signal: evictionapi.SignalNodeFsAvailable,
},
{
Signal: evictionapi.SignalPIDAvailable,
},
},
},
} {
t.Run(tc.name, func(t *testing.T) {
sort.Sort(byEvictionPriority(tc.thresholds))
for i := range tc.expected {
if tc.thresholds[i].Signal != tc.expected[i].Signal {
t.Errorf("At index %d, expected threshold with signal %s, but got %s", i, tc.expected[i].Signal, tc.thresholds[i].Signal)
}
}
})
}
}
type fakeSummaryProvider struct {
result *statsapi.Summary
}
@ -1622,47 +1695,6 @@ func TestHasNodeConditions(t *testing.T) {
}
}
func TestGetStarvedResources(t *testing.T) {
testCases := map[string]struct {
inputs []evictionapi.Threshold
result []v1.ResourceName
}{
"memory.available": {
inputs: []evictionapi.Threshold{
{Signal: evictionapi.SignalMemoryAvailable},
},
result: []v1.ResourceName{v1.ResourceMemory},
},
"imagefs.available": {
inputs: []evictionapi.Threshold{
{Signal: evictionapi.SignalImageFsAvailable},
},
result: []v1.ResourceName{resourceImageFs},
},
"nodefs.available": {
inputs: []evictionapi.Threshold{
{Signal: evictionapi.SignalNodeFsAvailable},
},
result: []v1.ResourceName{resourceNodeFs},
},
}
var internalResourceNames = func(in []v1.ResourceName) []api.ResourceName {
var out []api.ResourceName
for _, name := range in {
out = append(out, api.ResourceName(name))
}
return out
}
for testName, testCase := range testCases {
actual := getStarvedResources(testCase.inputs)
actualSet := quota.ToSet(internalResourceNames(actual))
expectedSet := quota.ToSet(internalResourceNames(testCase.result))
if !actualSet.Equal(expectedSet) {
t.Errorf("Test case: %s, expected: %v, actual: %v", testName, expectedSet, actualSet)
}
}
}
func TestParsePercentage(t *testing.T) {
testCases := map[string]struct {
hasError bool