// Mirror of https://github.com/k3s-io/k3s (Go source, 582 lines, 22 KiB).
/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
|
|
|
|
package eviction
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"sync"
|
|
"time"
|
|
|
|
"k8s.io/klog"
|
|
|
|
"k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/api/resource"
|
|
"k8s.io/apimachinery/pkg/util/clock"
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
|
"k8s.io/client-go/tools/record"
|
|
apiv1resource "k8s.io/kubernetes/pkg/api/v1/resource"
|
|
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
|
|
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
|
|
"k8s.io/kubernetes/pkg/features"
|
|
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
|
|
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
|
|
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
|
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
|
|
"k8s.io/kubernetes/pkg/kubelet/server/stats"
|
|
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
|
|
"k8s.io/kubernetes/pkg/kubelet/util/format"
|
|
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
|
|
)
|
|
|
|
// Timing parameters used by waitForPodsCleanup while it waits for evicted
// pods to be cleaned up.
const (
	// podCleanupTimeout is the duration of the timer after which
	// waitForPodsCleanup gives up waiting.
	podCleanupTimeout = 30 * time.Second
	// podCleanupPollFreq is the ticker interval at which waitForPodsCleanup
	// re-checks whether the pods have been cleaned up.
	podCleanupPollFreq = time.Second
)
|
|
|
|
// managerImpl implements Manager
|
|
type managerImpl struct {
|
|
// used to track time
|
|
clock clock.Clock
|
|
// config is how the manager is configured
|
|
config Config
|
|
// the function to invoke to kill a pod
|
|
killPodFunc KillPodFunc
|
|
// the function to get the mirror pod by a given statid pod
|
|
mirrorPodFunc MirrorPodFunc
|
|
// the interface that knows how to do image gc
|
|
imageGC ImageGC
|
|
// the interface that knows how to do container gc
|
|
containerGC ContainerGC
|
|
// protects access to internal state
|
|
sync.RWMutex
|
|
// node conditions are the set of conditions present
|
|
nodeConditions []v1.NodeConditionType
|
|
// captures when a node condition was last observed based on a threshold being met
|
|
nodeConditionsLastObservedAt nodeConditionsObservedAt
|
|
// nodeRef is a reference to the node
|
|
nodeRef *v1.ObjectReference
|
|
// used to record events about the node
|
|
recorder record.EventRecorder
|
|
// used to measure usage stats on system
|
|
summaryProvider stats.SummaryProvider
|
|
// records when a threshold was first observed
|
|
thresholdsFirstObservedAt thresholdsObservedAt
|
|
// records the set of thresholds that have been met (including graceperiod) but not yet resolved
|
|
thresholdsMet []evictionapi.Threshold
|
|
// signalToRankFunc maps a resource to ranking function for that resource.
|
|
signalToRankFunc map[evictionapi.Signal]rankFunc
|
|
// signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
|
|
signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
|
|
// last observations from synchronize
|
|
lastObservations signalObservations
|
|
// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
|
|
dedicatedImageFs *bool
|
|
// thresholdNotifiers is a list of memory threshold notifiers which each notify for a memory eviction threshold
|
|
thresholdNotifiers []ThresholdNotifier
|
|
// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
|
|
thresholdsLastUpdated time.Time
|
|
}
|
|
|
|
// ensure it implements the required interface
|
|
var _ Manager = &managerImpl{}
|
|
|
|
// NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
|
|
func NewManager(
|
|
summaryProvider stats.SummaryProvider,
|
|
config Config,
|
|
killPodFunc KillPodFunc,
|
|
mirrorPodFunc MirrorPodFunc,
|
|
imageGC ImageGC,
|
|
containerGC ContainerGC,
|
|
recorder record.EventRecorder,
|
|
nodeRef *v1.ObjectReference,
|
|
clock clock.Clock,
|
|
) (Manager, lifecycle.PodAdmitHandler) {
|
|
manager := &managerImpl{
|
|
clock: clock,
|
|
killPodFunc: killPodFunc,
|
|
mirrorPodFunc: mirrorPodFunc,
|
|
imageGC: imageGC,
|
|
containerGC: containerGC,
|
|
config: config,
|
|
recorder: recorder,
|
|
summaryProvider: summaryProvider,
|
|
nodeRef: nodeRef,
|
|
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
|
|
thresholdsFirstObservedAt: thresholdsObservedAt{},
|
|
dedicatedImageFs: nil,
|
|
thresholdNotifiers: []ThresholdNotifier{},
|
|
}
|
|
return manager, manager
|
|
}
|
|
|
|
// Admit rejects a pod if its not safe to admit for node stability.
|
|
func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
|
|
m.RLock()
|
|
defer m.RUnlock()
|
|
if len(m.nodeConditions) == 0 {
|
|
return lifecycle.PodAdmitResult{Admit: true}
|
|
}
|
|
// Admit Critical pods even under resource pressure since they are required for system stability.
|
|
// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
|
|
if kubelettypes.IsCriticalPod(attrs.Pod) {
|
|
return lifecycle.PodAdmitResult{Admit: true}
|
|
}
|
|
// the node has memory pressure, admit if not best-effort
|
|
if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
|
|
notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)
|
|
if notBestEffort {
|
|
return lifecycle.PodAdmitResult{Admit: true}
|
|
}
|
|
|
|
// When node has memory pressure and TaintNodesByCondition is enabled, check BestEffort Pod's toleration:
|
|
// admit it if tolerates memory pressure taint, fail for other tolerations, e.g. OutOfDisk.
|
|
if utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition) &&
|
|
v1helper.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{
|
|
Key: schedulerapi.TaintNodeMemoryPressure,
|
|
Effect: v1.TaintEffectNoSchedule,
|
|
}) {
|
|
return lifecycle.PodAdmitResult{Admit: true}
|
|
}
|
|
}
|
|
|
|
// reject pods when under memory pressure (if pod is best effort), or if under disk pressure.
|
|
klog.Warningf("Failed to admit pod %s - node has conditions: %v", format.Pod(attrs.Pod), m.nodeConditions)
|
|
return lifecycle.PodAdmitResult{
|
|
Admit: false,
|
|
Reason: Reason,
|
|
Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),
|
|
}
|
|
}
|
|
|
|
// Start starts the control loop to observe and response to low compute resources.
|
|
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
|
|
thresholdHandler := func(message string) {
|
|
klog.Infof(message)
|
|
m.synchronize(diskInfoProvider, podFunc)
|
|
}
|
|
if m.config.KernelMemcgNotification {
|
|
for _, threshold := range m.config.Thresholds {
|
|
if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
|
|
notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
|
|
if err != nil {
|
|
klog.Warningf("eviction manager: failed to create memory threshold notifier: %v", err)
|
|
} else {
|
|
go notifier.Start()
|
|
m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// start the eviction manager monitoring
|
|
go func() {
|
|
for {
|
|
if evictedPods := m.synchronize(diskInfoProvider, podFunc); evictedPods != nil {
|
|
klog.Infof("eviction manager: pods %s evicted, waiting for pod to be cleaned up", format.Pods(evictedPods))
|
|
m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
|
|
} else {
|
|
time.Sleep(monitoringInterval)
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// IsUnderMemoryPressure returns true if the node is under memory pressure.
|
|
func (m *managerImpl) IsUnderMemoryPressure() bool {
|
|
m.RLock()
|
|
defer m.RUnlock()
|
|
return hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure)
|
|
}
|
|
|
|
// IsUnderDiskPressure returns true if the node is under disk pressure.
|
|
func (m *managerImpl) IsUnderDiskPressure() bool {
|
|
m.RLock()
|
|
defer m.RUnlock()
|
|
return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure)
|
|
}
|
|
|
|
// IsUnderPIDPressure returns true if the node is under PID pressure.
|
|
func (m *managerImpl) IsUnderPIDPressure() bool {
|
|
m.RLock()
|
|
defer m.RUnlock()
|
|
return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
|
|
}
|
|
|
|
// synchronize is the main control loop that enforces eviction thresholds.
|
|
// Returns the pod that was killed, or nil if no pod was killed.
|
|
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) []*v1.Pod {
|
|
// if we have nothing to do, just return
|
|
thresholds := m.config.Thresholds
|
|
if len(thresholds) == 0 && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
|
|
return nil
|
|
}
|
|
|
|
klog.V(3).Infof("eviction manager: synchronize housekeeping")
|
|
// build the ranking functions (if not yet known)
|
|
// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
|
|
if m.dedicatedImageFs == nil {
|
|
hasImageFs, ok := diskInfoProvider.HasDedicatedImageFs()
|
|
if ok != nil {
|
|
return nil
|
|
}
|
|
m.dedicatedImageFs = &hasImageFs
|
|
m.signalToRankFunc = buildSignalToRankFunc(hasImageFs)
|
|
m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
|
|
}
|
|
|
|
activePods := podFunc()
|
|
updateStats := true
|
|
summary, err := m.summaryProvider.Get(updateStats)
|
|
if err != nil {
|
|
klog.Errorf("eviction manager: failed to get summary stats: %v", err)
|
|
return nil
|
|
}
|
|
|
|
if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
|
|
m.thresholdsLastUpdated = m.clock.Now()
|
|
for _, notifier := range m.thresholdNotifiers {
|
|
if err := notifier.UpdateThreshold(summary); err != nil {
|
|
klog.Warningf("eviction manager: failed to update %s: %v", notifier.Description(), err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// make observations and get a function to derive pod usage stats relative to those observations.
|
|
observations, statsFunc := makeSignalObservations(summary)
|
|
debugLogObservations("observations", observations)
|
|
|
|
// determine the set of thresholds met independent of grace period
|
|
thresholds = thresholdsMet(thresholds, observations, false)
|
|
debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)
|
|
|
|
// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
|
|
if len(m.thresholdsMet) > 0 {
|
|
thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
|
|
thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
|
|
}
|
|
debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)
|
|
|
|
// track when a threshold was first observed
|
|
now := m.clock.Now()
|
|
thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
|
|
|
|
// the set of node conditions that are triggered by currently observed thresholds
|
|
nodeConditions := nodeConditions(thresholds)
|
|
if len(nodeConditions) > 0 {
|
|
klog.V(3).Infof("eviction manager: node conditions - observed: %v", nodeConditions)
|
|
}
|
|
|
|
// track when a node condition was last observed
|
|
nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)
|
|
|
|
// node conditions report true if it has been observed within the transition period window
|
|
nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
|
|
if len(nodeConditions) > 0 {
|
|
klog.V(3).Infof("eviction manager: node conditions - transition period not met: %v", nodeConditions)
|
|
}
|
|
|
|
// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
|
|
thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
|
|
debugLogThresholdsWithObservation("thresholds - grace periods satisified", thresholds, observations)
|
|
|
|
// update internal state
|
|
m.Lock()
|
|
m.nodeConditions = nodeConditions
|
|
m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
|
|
m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
|
|
m.thresholdsMet = thresholds
|
|
|
|
// determine the set of thresholds whose stats have been updated since the last sync
|
|
thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
|
|
debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)
|
|
|
|
m.lastObservations = observations
|
|
m.Unlock()
|
|
|
|
// evict pods if there is a resource usage violation from local volume temporary storage
|
|
// If eviction happens in localStorageEviction function, skip the rest of eviction action
|
|
if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
|
|
if evictedPods := m.localStorageEviction(summary, activePods); len(evictedPods) > 0 {
|
|
return evictedPods
|
|
}
|
|
}
|
|
|
|
if len(thresholds) == 0 {
|
|
klog.V(3).Infof("eviction manager: no resources are starved")
|
|
return nil
|
|
}
|
|
|
|
// rank the thresholds by eviction priority
|
|
sort.Sort(byEvictionPriority(thresholds))
|
|
thresholdToReclaim := thresholds[0]
|
|
resourceToReclaim, found := signalToResource[thresholdToReclaim.Signal]
|
|
if !found {
|
|
klog.V(3).Infof("eviction manager: threshold %s was crossed, but reclaim is not implemented for this threshold.", thresholdToReclaim.Signal)
|
|
return nil
|
|
}
|
|
klog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
|
|
|
|
// record an event about the resources we are now attempting to reclaim via eviction
|
|
m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
|
|
|
|
// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
|
|
if m.reclaimNodeLevelResources(thresholdToReclaim.Signal, resourceToReclaim) {
|
|
klog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
|
|
return nil
|
|
}
|
|
|
|
klog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
|
|
|
|
// rank the pods for eviction
|
|
rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
|
|
if !ok {
|
|
klog.Errorf("eviction manager: no ranking function for signal %s", thresholdToReclaim.Signal)
|
|
return nil
|
|
}
|
|
|
|
// the only candidates viable for eviction are those pods that had anything running.
|
|
if len(activePods) == 0 {
|
|
klog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
|
|
return nil
|
|
}
|
|
|
|
// rank the running pods for eviction for the specified resource
|
|
rank(activePods, statsFunc)
|
|
|
|
klog.Infof("eviction manager: pods ranked for eviction: %s", format.Pods(activePods))
|
|
|
|
//record age of metrics for met thresholds that we are using for evictions.
|
|
for _, t := range thresholds {
|
|
timeObserved := observations[t.Signal].time
|
|
if !timeObserved.IsZero() {
|
|
metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time))
|
|
metrics.DeprecatedEvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInMicroseconds(timeObserved.Time))
|
|
}
|
|
}
|
|
|
|
// we kill at most a single pod during each eviction interval
|
|
for i := range activePods {
|
|
pod := activePods[i]
|
|
gracePeriodOverride := int64(0)
|
|
if !isHardEvictionThreshold(thresholdToReclaim) {
|
|
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
|
|
}
|
|
message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc)
|
|
if m.evictPod(pod, gracePeriodOverride, message, annotations) {
|
|
return []*v1.Pod{pod}
|
|
}
|
|
}
|
|
klog.Infof("eviction manager: unable to evict any pods from the node")
|
|
return nil
|
|
}
|
|
|
|
func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
|
|
timeout := m.clock.NewTimer(podCleanupTimeout)
|
|
defer timeout.Stop()
|
|
ticker := m.clock.NewTicker(podCleanupPollFreq)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-timeout.C():
|
|
klog.Warningf("eviction manager: timed out waiting for pods %s to be cleaned up", format.Pods(pods))
|
|
return
|
|
case <-ticker.C():
|
|
for i, pod := range pods {
|
|
if !podCleanedUpFunc(pod) {
|
|
break
|
|
}
|
|
if i == len(pods)-1 {
|
|
klog.Infof("eviction manager: pods %s successfully cleaned up", format.Pods(pods))
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required.
|
|
func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
|
|
nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
|
|
for _, nodeReclaimFunc := range nodeReclaimFuncs {
|
|
// attempt to reclaim the pressured resource.
|
|
if err := nodeReclaimFunc(); err != nil {
|
|
klog.Warningf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
|
|
}
|
|
|
|
}
|
|
if len(nodeReclaimFuncs) > 0 {
|
|
summary, err := m.summaryProvider.Get(true)
|
|
if err != nil {
|
|
klog.Errorf("eviction manager: failed to get summary stats after resource reclaim: %v", err)
|
|
return false
|
|
}
|
|
|
|
// make observations and get a function to derive pod usage stats relative to those observations.
|
|
observations, _ := makeSignalObservations(summary)
|
|
debugLogObservations("observations after resource reclaim", observations)
|
|
|
|
// determine the set of thresholds met independent of grace period
|
|
thresholds := thresholdsMet(m.config.Thresholds, observations, false)
|
|
debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
|
|
|
|
if len(thresholds) == 0 {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// localStorageEviction checks the EmptyDir volume usage for each pod and determine whether it exceeds the specified limit and needs
|
|
// to be evicted. It also checks every container in the pod, if the container overlay usage exceeds the limit, the pod will be evicted too.
|
|
func (m *managerImpl) localStorageEviction(summary *statsapi.Summary, pods []*v1.Pod) []*v1.Pod {
|
|
statsFunc := cachedStatsFunc(summary.Pods)
|
|
evicted := []*v1.Pod{}
|
|
for _, pod := range pods {
|
|
podStats, ok := statsFunc(pod)
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
if m.emptyDirLimitEviction(podStats, pod) {
|
|
evicted = append(evicted, pod)
|
|
continue
|
|
}
|
|
|
|
if m.podEphemeralStorageLimitEviction(podStats, pod) {
|
|
evicted = append(evicted, pod)
|
|
continue
|
|
}
|
|
|
|
if m.containerEphemeralStorageLimitEviction(podStats, pod) {
|
|
evicted = append(evicted, pod)
|
|
}
|
|
}
|
|
|
|
return evicted
|
|
}
|
|
|
|
func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
|
|
podVolumeUsed := make(map[string]*resource.Quantity)
|
|
for _, volume := range podStats.VolumeStats {
|
|
podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
|
|
}
|
|
for i := range pod.Spec.Volumes {
|
|
source := &pod.Spec.Volumes[i].VolumeSource
|
|
if source.EmptyDir != nil {
|
|
size := source.EmptyDir.SizeLimit
|
|
used := podVolumeUsed[pod.Spec.Volumes[i].Name]
|
|
if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
|
|
// the emptyDir usage exceeds the size limit, evict the pod
|
|
return m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil)
|
|
}
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
|
|
_, podLimits := apiv1resource.PodRequestsAndLimits(pod)
|
|
_, found := podLimits[v1.ResourceEphemeralStorage]
|
|
if !found {
|
|
return false
|
|
}
|
|
|
|
podEphemeralStorageTotalUsage := &resource.Quantity{}
|
|
fsStatsSet := []fsStatsType{}
|
|
if *m.dedicatedImageFs {
|
|
fsStatsSet = []fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}
|
|
} else {
|
|
fsStatsSet = []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}
|
|
}
|
|
podEphemeralUsage, err := podLocalEphemeralStorageUsage(podStats, pod, fsStatsSet)
|
|
if err != nil {
|
|
klog.Errorf("eviction manager: error getting pod disk usage %v", err)
|
|
return false
|
|
}
|
|
|
|
podEphemeralStorageTotalUsage.Add(podEphemeralUsage[v1.ResourceEphemeralStorage])
|
|
podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
|
|
if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
|
|
// the total usage of pod exceeds the total size limit of containers, evict the pod
|
|
return m.evictPod(pod, 0, fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String()), nil)
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
|
|
thresholdsMap := make(map[string]*resource.Quantity)
|
|
for _, container := range pod.Spec.Containers {
|
|
ephemeralLimit := container.Resources.Limits.StorageEphemeral()
|
|
if ephemeralLimit != nil && ephemeralLimit.Value() != 0 {
|
|
thresholdsMap[container.Name] = ephemeralLimit
|
|
}
|
|
}
|
|
|
|
for _, containerStat := range podStats.Containers {
|
|
containerUsed := diskUsage(containerStat.Logs)
|
|
if !*m.dedicatedImageFs {
|
|
containerUsed.Add(*diskUsage(containerStat.Rootfs))
|
|
}
|
|
|
|
if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
|
|
if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
|
|
return m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil)
|
|
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string) bool {
|
|
// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
|
|
// do not evict such pods. Static pods are not re-admitted after evictions.
|
|
// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
|
|
if kubepod.IsStaticPod(pod) {
|
|
// need mirrorPod to check its "priority" value; static pod doesn't carry it
|
|
if mirrorPod, ok := m.mirrorPodFunc(pod); ok && mirrorPod != nil {
|
|
// skip only when it's a static and critical pod
|
|
if kubelettypes.IsCriticalPod(mirrorPod) {
|
|
klog.Errorf("eviction manager: cannot evict a critical static pod %s", format.Pod(pod))
|
|
return false
|
|
}
|
|
} else {
|
|
// we should never hit this
|
|
klog.Errorf("eviction manager: cannot get mirror pod from static pod %s, so cannot evict it", format.Pod(pod))
|
|
return false
|
|
}
|
|
}
|
|
status := v1.PodStatus{
|
|
Phase: v1.PodFailed,
|
|
Message: evictMsg,
|
|
Reason: Reason,
|
|
}
|
|
// record that we are evicting the pod
|
|
m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg)
|
|
// this is a blocking call and should only return when the pod and its containers are killed.
|
|
err := m.killPodFunc(pod, status, &gracePeriodOverride)
|
|
if err != nil {
|
|
klog.Errorf("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
|
|
} else {
|
|
klog.Infof("eviction manager: pod %s is evicted successfully", format.Pod(pod))
|
|
}
|
|
return true
|
|
}
|