/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package kubelet

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	"k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/util/queue"
)

// OnCompleteFunc is a function that is invoked when an operation completes.
// If err is non-nil, the operation did not complete successfully.
type OnCompleteFunc func(err error)

// PodStatusFunc is a function that is invoked to override the pod status when a pod is killed.
type PodStatusFunc func(podStatus *v1.PodStatus)

2019-01-12 04:58:27 +00:00
// KillPodOptions are options when performing a pod update whose update type is kill.
type KillPodOptions struct {
2021-07-02 08:43:15 +00:00
// CompletedCh is closed when the kill request completes (syncTerminatingPod has completed
// without error) or if the pod does not exist, or if the pod has already terminated. This
// could take an arbitrary amount of time to be closed, but is never left open once
// CouldHaveRunningContainers() returns false.
CompletedCh chan <- struct { }
// Evict is true if this is a pod triggered eviction - once a pod is evicted some resources are
// more aggressively reaped than during normal pod operation (stopped containers).
Evict bool
// PodStatusFunc is invoked (if set) and overrides the status of the pod at the time the pod is killed.
// The provided status is populated from the latest state.
2019-01-12 04:58:27 +00:00
PodStatusFunc PodStatusFunc
// PodTerminationGracePeriodSecondsOverride is optional override to use if a pod is being killed as part of kill operation.
PodTerminationGracePeriodSecondsOverride * int64
}
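// A minimal kill-request sketch (pw is a stand-in for a PodWorkers instance and
// pod for a pod already known to it), mirroring what killPodNow at the bottom of
// this file does:
//
//	done := make(chan struct{}, 1)
//	pw.UpdatePod(UpdatePodOptions{
//		Pod:        pod,
//		UpdateType: kubetypes.SyncPodKill,
//		KillPodOptions: &KillPodOptions{
//			CompletedCh: done,
//			Evict:       true,
//		},
//	})
//	<-done // closed once syncTerminatingPod completes, or if the pod is already gone
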
// UpdatePodOptions is an options struct to pass to a UpdatePod operation.
type UpdatePodOptions struct {
	// The type of update (create, update, sync, kill).
	UpdateType kubetypes.SyncPodType
	// StartTime is an optional timestamp for when this update was created. If set,
	// when this update is fully realized by the pod worker it will be recorded in
	// the PodWorkerDuration metric.
	StartTime time.Time
	// Pod to update. Required.
	Pod *v1.Pod
	// MirrorPod is the mirror pod if Pod is a static pod. Optional when UpdateType
	// is kill or terminated.
	MirrorPod *v1.Pod
	// RunningPod is a runtime pod that is no longer present in config. Required
	// if Pod is nil, ignored if Pod is set.
	RunningPod *kubecontainer.Pod
	// KillPodOptions is used to override the default termination behavior of the
	// pod or to update the pod status after an operation is completed. Since a
	// pod can be killed for multiple reasons, PodStatusFunc is invoked in order
	// and later kills have an opportunity to override the status (i.e. a preemption
	// may later be turned into an eviction).
	KillPodOptions *KillPodOptions
}

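// A typical non-kill dispatch from the kubelet config loop might look like the
// following sketch (kl and mirrorPod are stand-ins for the caller's state):
//
//	kl.podWorkers.UpdatePod(UpdatePodOptions{
//		UpdateType: kubetypes.SyncPodCreate,
//		StartTime:  time.Now(),
//		Pod:        pod,
//		MirrorPod:  mirrorPod,
//	})
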
// PodWorkType classifies the three phases of pod lifecycle - setup (sync),
// teardown of containers (terminating), and cleanup (terminated).
type PodWorkType int

const (
	// SyncPodWork is when the pod is expected to be started and running.
	SyncPodWork PodWorkType = iota
	// TerminatingPodWork is when the pod is no longer being set up, but some
	// containers may be running and are being torn down.
	TerminatingPodWork
	// TerminatedPodWork indicates the pod is stopped, can have no more running
	// containers, and any foreground cleanup can be executed.
	TerminatedPodWork
)

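// Consumers of SyncKnownPods below typically branch on these values, e.g. in a
// housekeeping pass (an illustrative sketch, not kubelet code):
//
//	switch workType {
//	case SyncPodWork: // pod should (still) be running
//	case TerminatingPodWork: // containers are being stopped, no new setup
//	case TerminatedPodWork: // no running containers, cleanup only
//	}
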
// podWork is the internal unit of work delivered to a pod worker: the type of
// sync to perform plus the options describing it.
type podWork struct {
	// WorkType is the type of sync to perform - sync (create), terminating (stop
	// containers), or terminated (clean up and write status).
	WorkType PodWorkType
	// Options contains the data to sync.
	Options UpdatePodOptions
}

// PodWorkers is an abstract interface for testability.
type PodWorkers interface {
	// UpdatePod notifies the pod worker of a change to a pod, which will then
	// be processed in FIFO order by a goroutine per pod UID. The state of the
	// pod will be passed to the syncPod method until either the pod is marked
	// as deleted, it reaches a terminal phase (Succeeded/Failed), or the pod
	// is evicted by the kubelet. Once that occurs the syncTerminatingPod method
	// will be called until it exits successfully, and after that all further
	// UpdatePod() calls will be ignored for that pod until it has been forgotten
	// due to significant time passing. A pod that is terminated will never be
	// restarted.
	UpdatePod(options UpdatePodOptions)
	// SyncKnownPods removes workers for pods that are not in the desiredPods set
	// and have been terminated for a significant period of time. Once this method
	// has been called once, the workers are assumed to be fully initialized and
	// subsequent calls to ShouldPodContentBeRemoved on unknown pods will return
	// true.
	SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkType

	// CouldHaveRunningContainers returns true before the pod workers have synced,
	// or once the pod workers see the pod (syncPod could be called), and returns
	// false after the pod has been terminated (running containers guaranteed stopped).
	//
	// Intended for use by the kubelet config loops, but not subsystems, which should
	// use ShouldPod*().
	CouldHaveRunningContainers(uid types.UID) bool
	// IsPodTerminationRequested returns true when pod termination has been requested
	// until the termination completes and the pod is removed from config. This should
	// not be used in cleanup loops because it will return false if the pod has already
	// been cleaned up - use ShouldPodContainersBeTerminating instead. Also, this method
	// may return true while containers are still being initialized by the pod worker.
	//
	// Intended for use by the kubelet sync* methods, but not subsystems, which should
	// use ShouldPod*().
	IsPodTerminationRequested(uid types.UID) bool
	// ShouldPodContainersBeTerminating returns false before pod workers have synced,
	// or once a pod has started terminating. This check is similar to
	// ShouldPodRuntimeBeRemoved but is also true after pod termination is requested.
	//
	// Intended for use by subsystem sync loops to avoid performing background setup
	// after termination has been requested for a pod. Callers must ensure that the
	// syncPod method is non-blocking when their data is absent.
	ShouldPodContainersBeTerminating(uid types.UID) bool
	// ShouldPodRuntimeBeRemoved returns true if runtime managers within the kubelet
	// should aggressively clean up pod resources that are not containers or on-disk
	// content, like attached volumes. This is true when a pod is not yet observed
	// by a worker after the first sync (meaning it can't be running yet) or after
	// all running containers are stopped.
	// TODO: Once pod logs are separated from running containers, this method should
	// be used to gate whether containers are kept.
	//
	// Intended for use by subsystem sync loops to know when to start tearing down
	// resources that are used by running containers. Callers should ensure that
	// runtime content they own is not required after termination - for instance,
	// containers are required in docker to preserve pod logs until after the pod
	// is deleted.
	ShouldPodRuntimeBeRemoved(uid types.UID) bool
	// ShouldPodContentBeRemoved returns true if resource managers within the kubelet
	// should aggressively clean up all content related to the pod. This is true
	// during pod eviction (when we wish to remove that content to free resources)
	// as well as after the request to delete a pod has resulted in containers being
	// stopped (which is a more graceful action). Note that a deleting pod can still
	// be evicted.
	//
	// Intended for use by subsystem sync loops to know when to start tearing down
	// resources that are used by non-deleted pods. Content is generally preserved
	// until deletion (plus removal from etcd) or eviction, although garbage
	// collection can free content when this method returns false.
	ShouldPodContentBeRemoved(uid types.UID) bool
	// IsPodForMirrorPodTerminatingByFullName returns true if a static pod with the
	// provided pod name is currently terminating and has yet to complete. It is
	// intended to be used only during orphan mirror pod cleanup to prevent us from
	// deleting a terminating static pod from the apiserver before the pod is shut
	// down.
	IsPodForMirrorPodTerminatingByFullName(podFullname string) bool
}

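// A subsystem cleanup loop consults these methods rather than pod config directly;
// for example (a sketch, kl and uids are hypothetical):
//
//	for _, uid := range uids {
//		if kl.podWorkers.ShouldPodContentBeRemoved(uid) {
//			// safe to delete pod directories, volumes, and cached content for uid
//		}
//	}
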
// the function to invoke to perform a sync (reconcile the kubelet state to the desired shape of the pod)
type syncPodFnType func(ctx context.Context, updateType kubetypes.SyncPodType, pod *v1.Pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) error

// the function to invoke to terminate a pod (ensure no running processes are present)
type syncTerminatingPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error

// the function to invoke to cleanup a pod that is terminated
type syncTerminatedPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error

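// In the kubelet these are bound to the Kubelet's own sync methods when the pod
// workers are constructed; the wiring looks roughly like this (a sketch, the exact
// argument names live in kubelet.go and may differ):
//
//	klet.podWorkers = newPodWorkers(
//		klet.syncPod, klet.syncTerminatingPod, klet.syncTerminatedPod,
//		kubeDeps.Recorder, klet.workQueue, klet.resyncInterval, backOffPeriod, klet.podCache,
//	)
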
const (
	// jitter factor for resyncInterval
	workerResyncIntervalJitterFactor = 0.5
	// jitter factor for backOffPeriod and backOffOnTransientErrorPeriod
	workerBackOffPeriodJitterFactor = 0.5
	// backoff period when a transient error occurs
	backOffOnTransientErrorPeriod = time.Second
)

// podSyncStatus tracks per-pod transitions through the three phases of pod
// worker sync (setup, terminating, terminated).
type podSyncStatus struct {
	// ctx is the context that is associated with the current pod sync.
	ctx context.Context
	// cancelFn if set is expected to cancel the current sync*Pod operation.
	cancelFn context.CancelFunc
	// working is true if a pod worker is currently in a sync method.
	working bool
	// syncedAt is the time at which the pod worker first observed this pod.
	syncedAt time.Time
	// terminatingAt is set once the pod is requested to be killed - note that
	// this can be set before the pod worker starts terminating the pod, see
	// startedTerminating.
	terminatingAt time.Time
	// startedTerminating is true once the pod worker has observed the request to
	// stop a pod (exited syncPod and observed a podWork with WorkType
	// TerminatingPodWork). Once this is set, it is safe for other components
	// of the kubelet to assume that no other containers may be started.
	startedTerminating bool
	// deleted is true if the pod has been marked for deletion on the apiserver
	// or has no configuration represented (was deleted before).
	deleted bool
	// gracePeriod is the requested grace period once terminatingAt is nonzero.
	gracePeriod int64
	// evicted is true if the kill indicated this was an eviction (an evicted
	// pod can be more aggressively cleaned up).
	evicted bool
	// terminatedAt is set once the pod worker has completed a successful
	// syncTerminatingPod call and means all running containers are stopped.
	terminatedAt time.Time
	// finished is true once the pod worker completes for a pod
	// (syncTerminatedPod exited with no errors) until SyncKnownPods is invoked
	// to remove the pod. A terminal pod (Succeeded/Failed) will have
	// termination status until the pod is deleted.
	finished bool
	// notifyPostTerminating will be closed once the pod transitions to
	// terminated. After the pod is in terminated state, nothing should be
	// added to this list.
	notifyPostTerminating []chan<- struct{}
	// statusPostTerminating is a list of the status changes associated
	// with kill pod requests. After the pod is in terminated state, nothing
	// should be added to this list. The worker will execute the last function
	// in this list on each termination attempt.
	statusPostTerminating []PodStatusFunc
}

func (s *podSyncStatus) IsWorking() bool              { return s.working }
func (s *podSyncStatus) IsTerminationRequested() bool { return !s.terminatingAt.IsZero() }
func (s *podSyncStatus) IsTerminationStarted() bool   { return s.startedTerminating }
func (s *podSyncStatus) IsTerminated() bool           { return !s.terminatedAt.IsZero() }
func (s *podSyncStatus) IsFinished() bool             { return s.finished }
func (s *podSyncStatus) IsEvicted() bool              { return s.evicted }
func (s *podSyncStatus) IsDeleted() bool              { return s.deleted }

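// Under a normal teardown, the fields above are set in this order (a summary of
// the flow implemented below, not additional machinery):
//
//	syncedAt -> terminatingAt -> startedTerminating -> terminatedAt -> finished
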
// podWorkers keeps track of operations on pods and ensures each pod is
// reconciled with the container runtime and other subsystems. The worker
// also tracks which pods are in flight for starting, which pods are
// shutting down but still have running containers, and which pods have
// terminated recently and are guaranteed to have no running containers.
//
// A pod passed to a pod worker is either being synced (expected to be
// running), terminating (has running containers but no new containers are
// expected to start), terminated (has no running containers but may still
// have resources being consumed), or cleaned up (no resources remaining).
// Once a pod is set to be "torn down" it cannot be started again for that
// UID (corresponding to a delete or eviction) until:
//
// 1. The pod worker is finalized (syncTerminatingPod and
// syncTerminatedPod exit without error sequentially)
// 2. The SyncKnownPods method is invoked by kubelet housekeeping and the pod
// is not part of the known config.
//
// Pod workers provide a consistent source of information to other kubelet
// loops about the status of the pod and whether containers can be
// running. The ShouldPodContentBeRemoved() method tracks whether a pod's
// contents should still exist, which includes non-existent pods after
// SyncKnownPods() has been called once (as per the contract, all existing
// pods should be provided via UpdatePod before SyncKnownPods is invoked).
// Generally other sync loops are expected to separate "setup" and
// "teardown" responsibilities and the information methods here assist in
// each by centralizing that state. A simple visualization of the time
// intervals involved might look like:
//
// ---| = kubelet config has synced at least once
// -------| |- = pod exists in apiserver config
// --------| |---------------- = CouldHaveRunningContainers() is true
// ^- pod is observed by pod worker .
// . .
// ----------| |------------------------- = syncPod is running
// . ^- pod worker loop sees change and invokes syncPod
// . . .
// --------------| |------- = ShouldPodContainersBeTerminating() returns true
// --------------| |------- = IsPodTerminationRequested() returns true (pod is known)
// . . ^- Kubelet evicts pod .
// . . .
// -------------------| |---------------- = syncTerminatingPod runs then exits without error
// . . ^ pod worker loop exits syncPod, sees pod is terminating,
// . . invokes syncTerminatingPod
// . . .
// ---| |------------------| . = ShouldPodRuntimeBeRemoved() returns true (post-sync)
// . ^ syncTerminatingPod has exited successfully
// . .
// ----------------------------| |------- = syncTerminatedPod runs then exits without error
// . ^ other loops can tear down
// . .
// ------------------------------------| |---- = status manager is waiting for PodResourcesAreReclaimed()
// . ^ .
// ----------| |- = status manager can be writing pod status
// ^ status manager deletes pod because no longer exists in config
//
// Other components in the Kubelet can request a termination of the pod
// via the UpdatePod method or the killPodNow wrapper - this will ensure
// the components of the pod are stopped until the kubelet is restarted
// or permanently (if the phase of the pod is set to a terminal phase
// in the pod status change).
//
type podWorkers struct {
	// Protects all per worker fields.
	podLock sync.Mutex

	// podsSynced is true once the pod worker has been synced at least once,
	// which means that all working pods have been started via UpdatePod().
	podsSynced bool
	// Tracks all running per-pod goroutines - per-pod goroutine will be
	// processing updates received through its corresponding channel.
	podUpdates map[types.UID]chan podWork
	// Tracks the last undelivered work item for this pod - a work item is
	// undelivered if it comes in while the worker is working.
	lastUndeliveredWorkUpdate map[types.UID]podWork
	// Tracks by UID the termination status of a pod - syncing, terminating,
	// terminated, and evicted.
	podSyncStatuses map[types.UID]*podSyncStatus
	// Tracks static pods for which a kill has been requested; an entry is
	// removed when the static pod transitions to the killed state.
	terminatingStaticPodFullnames map[string]struct{}

	workQueue queue.WorkQueue

	// This function is run to sync the desired state of pod.
	// NOTE: This function has to be thread-safe - it can be called for
	// different pods at the same time.
	syncPodFn            syncPodFnType
	syncTerminatingPodFn syncTerminatingPodFnType
	syncTerminatedPodFn  syncTerminatedPodFnType

	// The EventRecorder to use
	recorder record.EventRecorder
	// backOffPeriod is the duration to back off when there is a sync error.
	backOffPeriod time.Duration
	// resyncInterval is the duration to wait until the next sync.
	resyncInterval time.Duration
	// podCache stores kubecontainer.PodStatus for all pods.
	podCache kubecontainer.Cache
}

func newPodWorkers(
	syncPodFn syncPodFnType,
	syncTerminatingPodFn syncTerminatingPodFnType,
	syncTerminatedPodFn syncTerminatedPodFnType,
	recorder record.EventRecorder,
	workQueue queue.WorkQueue,
	resyncInterval, backOffPeriod time.Duration,
	podCache kubecontainer.Cache,
) PodWorkers {
	return &podWorkers{
		podSyncStatuses:               map[types.UID]*podSyncStatus{},
		podUpdates:                    map[types.UID]chan podWork{},
		lastUndeliveredWorkUpdate:     map[types.UID]podWork{},
		terminatingStaticPodFullnames: map[string]struct{}{},
		syncPodFn:                     syncPodFn,
		syncTerminatingPodFn:          syncTerminatingPodFn,
		syncTerminatedPodFn:           syncTerminatedPodFn,
		recorder:                      recorder,
		workQueue:                     workQueue,
		resyncInterval:                resyncInterval,
		backOffPeriod:                 backOffPeriod,
		podCache:                      podCache,
	}
}

func (p *podWorkers) CouldHaveRunningContainers(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		return !status.IsTerminated()
	}
	// once all pods are synced, any pod without sync status is known not to be running.
	return !p.podsSynced
}

func (p *podWorkers) IsPodTerminationRequested(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		// the pod may still be setting up at this point.
		return status.IsTerminationRequested()
	}
	// an unknown pod is considered not to be terminating (use ShouldPodContainersBeTerminating in
	// cleanup loops to avoid failing to clean up pods that have already been removed from config)
	return false
}

func (p *podWorkers) ShouldPodContainersBeTerminating(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		// we wait until the pod worker goroutine observes the termination, which means syncPod will not
		// be executed again, which means no new containers can be started
		return status.IsTerminationStarted()
	}
	// once we've synced, if the pod isn't known to the workers we should be tearing them down
	return p.podsSynced
}

func (p *podWorkers) ShouldPodRuntimeBeRemoved(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		return status.IsTerminated()
	}
	// a pod that hasn't been sent to the pod worker yet should have no runtime components once we have
	// synced all content.
	return p.podsSynced
}

func (p *podWorkers) ShouldPodContentBeRemoved(uid types.UID) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if status, ok := p.podSyncStatuses[uid]; ok {
		return status.IsEvicted() || (status.IsDeleted() && status.IsTerminated())
	}
	// a pod that hasn't been sent to the pod worker yet should have no content on disk once we have
	// synced all content.
	return p.podsSynced
}

func (p *podWorkers) IsPodForMirrorPodTerminatingByFullName(podFullName string) bool {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	_, ok := p.terminatingStaticPodFullnames[podFullName]
	return ok
}

// UpdatePod carries a configuration change or termination state to a pod. A pod is either runnable,
// terminating, or terminated, and will transition to terminating if it is deleted on the apiserver,
// discovered to have a terminal phase (Succeeded or Failed), or evicted by the kubelet.
func (p *podWorkers) UpdatePod(options UpdatePodOptions) {
	// handle when the pod is an orphan (no config) and we only have runtime status by running only
	// the terminating part of the lifecycle
	pod := options.Pod
	var isRuntimePod bool
	if options.RunningPod != nil {
		if options.Pod == nil {
			pod = options.RunningPod.ToAPIPod()
			if options.UpdateType != kubetypes.SyncPodKill {
				klog.InfoS("Pod update is ignored, runtime pods can only be killed", "pod", klog.KObj(pod), "podUID", pod.UID)
				return
			}
			options.Pod = pod
			isRuntimePod = true
		} else {
			options.RunningPod = nil
			klog.InfoS("Pod update included RunningPod which is only valid when Pod is not specified", "pod", klog.KObj(options.Pod), "podUID", options.Pod.UID)
		}
	}

	uid := pod.UID
	p.podLock.Lock()
	defer p.podLock.Unlock()

	// decide what to do with this pod - we are either setting it up, tearing it down, or ignoring it
	now := time.Now()
	status, ok := p.podSyncStatuses[uid]
	if !ok {
		klog.V(4).InfoS("Pod is being synced for the first time", "pod", klog.KObj(pod), "podUID", pod.UID)
		status = &podSyncStatus{
			syncedAt: now,
		}
		p.podSyncStatuses[uid] = status
	}

	// once a pod is terminated by UID, it cannot reenter the pod worker (until the UID is purged by housekeeping)
	if status.IsFinished() {
		klog.V(4).InfoS("Pod is finished processing, no further updates", "pod", klog.KObj(pod), "podUID", pod.UID)
		return
	}

	// check for a transition to terminating
	var becameTerminating bool
	if !status.IsTerminationRequested() {
		switch {
		case isRuntimePod:
			klog.V(4).InfoS("Pod is orphaned and must be torn down", "pod", klog.KObj(pod), "podUID", pod.UID)
			status.deleted = true
			status.terminatingAt = now
			becameTerminating = true
		case pod.DeletionTimestamp != nil:
			klog.V(4).InfoS("Pod is marked for graceful deletion, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID)
			status.deleted = true
			status.terminatingAt = now
			becameTerminating = true
		case pod.Status.Phase == v1.PodFailed, pod.Status.Phase == v1.PodSucceeded:
			klog.V(4).InfoS("Pod is in a terminal phase (success/failed), begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID)
			status.terminatingAt = now
			becameTerminating = true
		case options.UpdateType == kubetypes.SyncPodKill:
			if options.KillPodOptions != nil && options.KillPodOptions.Evict {
				klog.V(4).InfoS("Pod is being evicted by the kubelet, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID)
				status.evicted = true
			} else {
				klog.V(4).InfoS("Pod is being removed by the kubelet, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID)
			}
			status.terminatingAt = now
			becameTerminating = true
		}
	}

	// once a pod is terminating, all updates are kills and the grace period can only decrease
	var workType PodWorkType
	var wasGracePeriodShortened bool
	switch {
	case status.IsTerminated():
		// A terminated pod may still be waiting for cleanup - if we receive a runtime pod kill request
		// due to housekeeping seeing an older cached version of the runtime pod, simply ignore it until
		// after the pod worker completes.
		if isRuntimePod {
			klog.V(3).InfoS("Pod is waiting for termination, ignoring runtime-only kill until after pod worker is fully terminated", "pod", klog.KObj(pod), "podUID", pod.UID)
			return
		}

		workType = TerminatedPodWork

		if options.KillPodOptions != nil {
			if ch := options.KillPodOptions.CompletedCh; ch != nil {
				close(ch)
			}
		}
		options.KillPodOptions = nil

	case status.IsTerminationRequested():
		workType = TerminatingPodWork
		if options.KillPodOptions == nil {
			options.KillPodOptions = &KillPodOptions{}
		}

		if ch := options.KillPodOptions.CompletedCh; ch != nil {
			status.notifyPostTerminating = append(status.notifyPostTerminating, ch)
		}
		if fn := options.KillPodOptions.PodStatusFunc; fn != nil {
			status.statusPostTerminating = append(status.statusPostTerminating, fn)
		}

		gracePeriod, gracePeriodShortened := calculateEffectiveGracePeriod(status, pod, options.KillPodOptions)

		wasGracePeriodShortened = gracePeriodShortened
		status.gracePeriod = gracePeriod
		// always set the grace period for syncTerminatingPod so we don't have to recalculate;
		// this value will never be zero.
		options.KillPodOptions.PodTerminationGracePeriodSecondsOverride = &gracePeriod

		// if a static pod comes through, start tracking it explicitly (cleared by the pod worker loop)
		if kubetypes.IsStaticPod(pod) {
			p.terminatingStaticPodFullnames[kubecontainer.GetPodFullName(pod)] = struct{}{}
		}

	default:
		workType = SyncPodWork

		// KillPodOptions is not valid for sync actions outside of the terminating phase
		if options.KillPodOptions != nil {
			if ch := options.KillPodOptions.CompletedCh; ch != nil {
				close(ch)
			}
			options.KillPodOptions = nil
		}
	}

	// the desired work we want to be performing
	work := podWork{
		WorkType: workType,
		Options:  options,
	}

	// start the pod worker goroutine if it doesn't exist
	var podUpdates chan podWork
	var exists bool
	if podUpdates, exists = p.podUpdates[uid]; !exists {
		// We need to have a buffer here, because checkForUpdates() method that
		// puts an update into channel is called from the same goroutine where
		// the channel is consumed. However, it is guaranteed that in such case
		// the channel is empty, so buffer of size 1 is enough.
		podUpdates = make(chan podWork, 1)
		p.podUpdates[uid] = podUpdates

		// Creating a new pod worker either means this is a new pod, or that the
		// kubelet just restarted. In either case the kubelet is willing to believe
		// the status of the pod for the first pod worker sync. See corresponding
		// comment in syncPod.
		go func() {
			defer runtime.HandleCrash()
			p.managePodLoop(podUpdates)
		}()
	}

	// dispatch a request to the pod worker if none are running
	if !status.IsWorking() {
		status.working = true
		podUpdates <- work
		return
	}

	// capture the maximum latency between a requested update and when the pod
	// worker observes it
	if undelivered, ok := p.lastUndeliveredWorkUpdate[pod.UID]; ok {
		// track the max latency between when a config change is requested and when it is realized
		// NOTE: this undercounts the latency when multiple requests are queued, but captures max latency
		if !undelivered.Options.StartTime.IsZero() && undelivered.Options.StartTime.Before(work.Options.StartTime) {
			work.Options.StartTime = undelivered.Options.StartTime
		}
	}

	// always sync the most recent data
	p.lastUndeliveredWorkUpdate[pod.UID] = work

	if (becameTerminating || wasGracePeriodShortened) && status.cancelFn != nil {
		klog.V(3).InfoS("Cancelling current pod sync", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", work.WorkType)
		status.cancelFn()
		return
	}
}

// calculateEffectiveGracePeriod sets the initial grace period for a newly terminating pod or allows a
// shorter grace period to be provided, returning the desired value and whether the grace period was
// shortened from a previously requested value.
func calculateEffectiveGracePeriod(status *podSyncStatus, pod *v1.Pod, options *KillPodOptions) (int64, bool) {
	// enforce the restriction that a grace period can only decrease and track whatever our value is,
	// then ensure a calculated value is passed down to lower levels
	gracePeriod := status.gracePeriod
	// the deletion grace period calculated by the apiserver is authoritative when present
	if override := pod.DeletionGracePeriodSeconds; override != nil {
		if gracePeriod == 0 || *override < gracePeriod {
			gracePeriod = *override
		}
	}
	// we allow other parts of the kubelet (namely eviction) to request this pod be terminated faster
	if options != nil {
		if override := options.PodTerminationGracePeriodSecondsOverride; override != nil {
			if gracePeriod == 0 || *override < gracePeriod {
				gracePeriod = *override
			}
		}
	}
	// make a best effort to default this value to the pod's desired intent, in the event
	// no requested value was provided
	if gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
		gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
	}
	// no matter what, we always supply a grace period of at least 1
	if gracePeriod < 1 {
		gracePeriod = 1
	}
	return gracePeriod, status.gracePeriod != 0 && status.gracePeriod != gracePeriod
}

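// Worked example of the rules above (illustrative numbers): if a previous kill set
// status.gracePeriod to 30, the apiserver then set DeletionGracePeriodSeconds to 10,
// and an eviction passed a PodTerminationGracePeriodSecondsOverride of 5, the
// effective grace period is 5 and the second return value is true because the
// original 30 was shortened. With no prior value and no overrides, the pod spec's
// TerminationGracePeriodSeconds is used, floored at 1.
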
func (p *podWorkers) managePodLoop(podUpdates <-chan podWork) {
	var lastSyncTime time.Time
	for update := range podUpdates {
		pod := update.Options.Pod
		klog.V(4).InfoS("Processing pod event", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType)
		err := func() error {
			// The worker is responsible for ensuring the sync method sees the appropriate
			// status updates on resyncs (the result of the last sync), transitions to
			// terminating (no wait), or on terminated (whatever the most recent state is).
			// Only syncing and terminating can generate pod status changes, while terminated
			// pods ensure the most recent status makes it to the api server.
			var status *kubecontainer.PodStatus
			var err error
			switch {
			case update.Options.RunningPod != nil:
				// when we receive a running pod, we don't need status at all
			default:
				// wait until we see the next refresh from the PLEG via the cache (max 2s)
				// TODO: this adds ~1s of latency on all transitions from sync to terminating
				// to terminated, and on all termination retries (including evictions). We should
				// improve latency by making the pleg continuous and by allowing pod status
				// changes to be refreshed when key events happen (killPod, sync->terminating).
				// Improving this latency also reduces the possibility that a terminated
				// container's status is garbage collected before we have a chance to update the
				// API server (thus losing the exit code).
				status, err = p.podCache.GetNewerThan(pod.UID, lastSyncTime)
			}
			if err != nil {
				// This is the legacy event thrown by manage pod loop; all other events are now
				// dispatched from syncPodFn
				p.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err)
				return err
			}

			ctx := p.contextForWorker(pod.UID)

			// Take the appropriate action (illegal phases are prevented by UpdatePod)
			switch {
			case update.WorkType == TerminatedPodWork:
				err = p.syncTerminatedPodFn(ctx, pod, status)

			case update.WorkType == TerminatingPodWork:
				var gracePeriod *int64
				if opt := update.Options.KillPodOptions; opt != nil {
					gracePeriod = opt.PodTerminationGracePeriodSecondsOverride
				}
				podStatusFn := p.acknowledgeTerminating(pod)

				err = p.syncTerminatingPodFn(ctx, pod, status, update.Options.RunningPod, gracePeriod, podStatusFn)

			default:
				err = p.syncPodFn(ctx, update.Options.UpdateType, pod, update.Options.MirrorPod, status)
			}

			lastSyncTime = time.Now()
			return err
		}()

		switch {
		case err == context.Canceled:
			// when the context is cancelled we expect an update to already be queued
			klog.V(2).InfoS("Sync exited with context cancellation error", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType)

		case err != nil:
			// we will queue a retry
			klog.ErrorS(err, "Error syncing pod, skipping", "pod", klog.KObj(pod), "podUID", pod.UID)

		case update.WorkType == TerminatedPodWork:
			// we can shut down the worker
			p.completeTerminated(pod)
			if start := update.Options.StartTime; !start.IsZero() {
				metrics.PodWorkerDuration.WithLabelValues("terminated").Observe(metrics.SinceInSeconds(start))
			}
			klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType)
			return

		case update.WorkType == TerminatingPodWork:
			// pods that don't exist in config don't need to be terminated; garbage collection will cover them
			if update.Options.RunningPod != nil {
				p.completeTerminatingRuntimePod(pod)
				if start := update.Options.StartTime; !start.IsZero() {
					metrics.PodWorkerDuration.WithLabelValues(update.Options.UpdateType.String()).Observe(metrics.SinceInSeconds(start))
				}
				klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType)
				return
			}

			// otherwise we move to the terminated phase
			p.completeTerminating(pod)
		}

		// queue a retry for errors if necessary, then put the next event in the channel if any
		p.completeWork(pod, err)
		if start := update.Options.StartTime; !start.IsZero() {
			metrics.PodWorkerDuration.WithLabelValues(update.Options.UpdateType.String()).Observe(metrics.SinceInSeconds(start))
		}
		klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType)
	}
}

// acknowledgeTerminating sets the terminating flag on the pod status once the pod worker sees
// the termination state, so that other components know no new containers will be started in
// this pod. It then returns the status function, if any, that applies to this pod.
func (p *podWorkers) acknowledgeTerminating(pod *v1.Pod) PodStatusFunc {
	p.podLock.Lock()
	defer p.podLock.Unlock()

	status, ok := p.podSyncStatuses[pod.UID]
	if !ok {
		return nil
	}

	if !status.terminatingAt.IsZero() && !status.startedTerminating {
		klog.V(4).InfoS("Pod worker has observed request to terminate", "pod", klog.KObj(pod), "podUID", pod.UID)
		status.startedTerminating = true
	}

	if l := len(status.statusPostTerminating); l > 0 {
		return status.statusPostTerminating[l-1]
	}
	return nil
}

// completeTerminating is invoked when syncTerminatingPod completes successfully, which means
// no container is running, no container will be started in the future, and we are ready for
// cleanup. This updates the termination state which prevents future syncs and will ensure
// other kubelet loops know this pod is not running any containers.
func (p *podWorkers) completeTerminating(pod *v1.Pod) {
	p.podLock.Lock()
	defer p.podLock.Unlock()

	klog.V(4).InfoS("Pod terminated all containers successfully", "pod", klog.KObj(pod), "podUID", pod.UID)

	// if a static pod is being tracked, forget it
	delete(p.terminatingStaticPodFullnames, kubecontainer.GetPodFullName(pod))

	if status, ok := p.podSyncStatuses[pod.UID]; ok {
		if status.terminatingAt.IsZero() {
			klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID)
		}
		status.terminatedAt = time.Now()
		for _, ch := range status.notifyPostTerminating {
			close(ch)
		}
		status.notifyPostTerminating = nil
		status.statusPostTerminating = nil
	}

	p.lastUndeliveredWorkUpdate[pod.UID] = podWork{
		WorkType: TerminatedPodWork,
		Options: UpdatePodOptions{
			Pod: pod,
		},
	}
}

// completeTerminatingRuntimePod is invoked when syncTerminatingPod completes successfully,
// which means an orphaned pod (no config) is terminated and we can exit. Since orphaned
// pods have no API representation, the worker exits at this point rather than proceeding
// to cleanup. This updates the termination state which prevents future syncs and will
// ensure other kubelet loops know this pod is not running any containers.
func (p *podWorkers) completeTerminatingRuntimePod(pod *v1.Pod) {
	p.podLock.Lock()
	defer p.podLock.Unlock()

	klog.V(4).InfoS("Pod terminated all orphaned containers successfully and worker can now stop", "pod", klog.KObj(pod), "podUID", pod.UID)

	if status, ok := p.podSyncStatuses[pod.UID]; ok {
		if status.terminatingAt.IsZero() {
			klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID)
		}
		status.terminatedAt = time.Now()
		status.finished = true
		status.working = false
	}

	ch, ok := p.podUpdates[pod.UID]
	if ok {
		close(ch)
	}
	delete(p.podUpdates, pod.UID)
	delete(p.lastUndeliveredWorkUpdate, pod.UID)
	// if a static pod is being tracked, forget it
	delete(p.terminatingStaticPodFullnames, kubecontainer.GetPodFullName(pod))
}

// completeTerminated is invoked after syncTerminatedPod completes successfully and means we
// can stop the pod worker. The pod is finalized at this point.
func (p *podWorkers) completeTerminated(pod *v1.Pod) {
	p.podLock.Lock()
	defer p.podLock.Unlock()

	klog.V(4).InfoS("Pod is complete and the worker can now stop", "pod", klog.KObj(pod), "podUID", pod.UID)

	ch, ok := p.podUpdates[pod.UID]
	if ok {
		close(ch)
	}
	delete(p.podUpdates, pod.UID)
	delete(p.lastUndeliveredWorkUpdate, pod.UID)
	delete(p.terminatingStaticPodFullnames, kubecontainer.GetPodFullName(pod))

	if status, ok := p.podSyncStatuses[pod.UID]; ok {
		if status.terminatingAt.IsZero() {
			klog.V(4).InfoS("Pod worker is complete but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID)
		}
		if status.terminatedAt.IsZero() {
			klog.V(4).InfoS("Pod worker is complete but did not have terminatedAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID)
		}
		status.finished = true
		status.working = false
	}
}

// completeWork requeues on error or the next sync interval and then immediately executes any
// pending work.
func (p *podWorkers) completeWork(pod *v1.Pod, syncErr error) {
	// Requeue the last update if the last sync returned error.
	switch {
	case syncErr == nil:
		// No error; requeue at the regular resync interval.
		p.workQueue.Enqueue(pod.UID, wait.Jitter(p.resyncInterval, workerResyncIntervalJitterFactor))
	case strings.Contains(syncErr.Error(), NetworkNotReadyErrorMsg):
		// Network is not ready; back off for a short period of time and retry as network might be ready soon.
		p.workQueue.Enqueue(pod.UID, wait.Jitter(backOffOnTransientErrorPeriod, workerBackOffPeriodJitterFactor))
	default:
		// Error occurred during the sync; back off and then retry.
		p.workQueue.Enqueue(pod.UID, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor))
	}
	p.completeWorkQueueNext(pod.UID)
}

// completeWorkQueueNext holds the lock and either queues the next work item for the worker or
// clears the working status.
func (p *podWorkers) completeWorkQueueNext(uid types.UID) {
	p.podLock.Lock()
	defer p.podLock.Unlock()
	if workUpdate, exists := p.lastUndeliveredWorkUpdate[uid]; exists {
		p.podUpdates[uid] <- workUpdate
		delete(p.lastUndeliveredWorkUpdate, uid)
	} else {
		p.podSyncStatuses[uid].working = false
	}
}

// contextForWorker returns or initializes the appropriate context for a known
// worker. If the current context is expired, it is reset. If no worker is
// present, no context is returned.
func (p *podWorkers) contextForWorker(uid types.UID) context.Context {
	p.podLock.Lock()
	defer p.podLock.Unlock()

	status, ok := p.podSyncStatuses[uid]
	if !ok {
		return nil
	}
	if status.ctx == nil || status.ctx.Err() == context.Canceled {
		status.ctx, status.cancelFn = context.WithCancel(context.Background())
	}
	return status.ctx
}

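// The cancel/reset cycle pairs with UpdatePod above: when a pod becomes terminating
// or its grace period is shortened, UpdatePod invokes status.cancelFn, the in-flight
// sync returns context.Canceled, and the next dispatched work item receives a fresh
// context from this method.
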
// SyncKnownPods will purge any fully terminated pods that are not in the desiredPods
// list, which means SyncKnownPods must be called in a threadsafe manner from calls
// to UpdatePod for new pods. It returns a map of known workers that are not finished
// with a value of TerminatedPodWork, TerminatingPodWork, or SyncPodWork depending on
// whether the pod is terminated, terminating, or syncing.
func (p *podWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkType {
	workers := make(map[types.UID]PodWorkType)
	known := make(map[types.UID]struct{})
	for _, pod := range desiredPods {
		known[pod.UID] = struct{}{}
	}

	p.podLock.Lock()
	defer p.podLock.Unlock()

	p.podsSynced = true
	for uid, status := range p.podSyncStatuses {
		if _, exists := known[uid]; !exists {
			p.removeTerminatedWorker(uid)
		}
		switch {
		case !status.terminatedAt.IsZero():
			workers[uid] = TerminatedPodWork
		case !status.terminatingAt.IsZero():
			workers[uid] = TerminatingPodWork
		default:
			workers[uid] = SyncPodWork
		}
	}
	return workers
}

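// Housekeeping is the expected caller; a usage sketch (assuming kubelet fields of
// these names):
//
//	workingPods := kl.podWorkers.SyncKnownPods(allPods)
//	// any UID absent from workingPods has been fully forgotten, and a pod may
//	// later be recreated with the same UID
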
// removeTerminatedWorker cleans up and removes the worker status for a worker that
// has reached a terminal state of "finished" - has successfully exited
// syncTerminatedPod. This "forgets" a pod by UID and allows another pod to be recreated
// with the same UID. The caller must hold podLock.
func (p *podWorkers) removeTerminatedWorker(uid types.UID) {
	status, ok := p.podSyncStatuses[uid]
	if !ok {
		// already forgotten, or forgotten too early
		klog.V(4).InfoS("Pod worker has been requested for removal but is not a known pod", "podUID", uid)
		return
	}

	if !status.finished {
		klog.V(4).InfoS("Pod worker has been requested for removal but is still not fully terminated", "podUID", uid)
		return
	}

	klog.V(4).InfoS("Pod has been terminated and is no longer known to the kubelet, remove all history", "podUID", uid)
	delete(p.podSyncStatuses, uid)
	delete(p.podUpdates, uid)
	delete(p.lastUndeliveredWorkUpdate, uid)
}

// killPodNow returns a KillPodFunc that can be used to kill a pod.
// It is intended to be injected into other modules that need to kill a pod.
func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.KillPodFunc {
	return func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error {
		// determine the grace period to use when killing the pod
		gracePeriod := int64(0)
		if gracePeriodOverride != nil {
			gracePeriod = *gracePeriodOverride
		} else if pod.Spec.TerminationGracePeriodSeconds != nil {
			gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
		}

		// we time out and return an error if we don't get a callback within a reasonable time.
		// the default timeout is relative to the grace period (we settle on 10s to wait for kubelet->runtime traffic to complete in sigkill)
		timeout := gracePeriod + (gracePeriod / 2)
		minTimeout := int64(10)
		if timeout < minTimeout {
			timeout = minTimeout
		}
		timeoutDuration := time.Duration(timeout) * time.Second

		// open a channel we block against until we get a result
		ch := make(chan struct{}, 1)
		podWorkers.UpdatePod(UpdatePodOptions{
			Pod:        pod,
			UpdateType: kubetypes.SyncPodKill,
			KillPodOptions: &KillPodOptions{
				CompletedCh:                              ch,
				Evict:                                    isEvicted,
				PodStatusFunc:                            statusFn,
				PodTerminationGracePeriodSecondsOverride: gracePeriodOverride,
			},
		})

		// wait for either a response, or a timeout
		select {
		case <-ch:
			return nil
		case <-time.After(timeoutDuration):
			recorder.Eventf(pod, v1.EventTypeWarning, events.ExceededGracePeriod, "Container runtime did not kill the pod within specified grace period.")
			return fmt.Errorf("timeout waiting to kill pod")
		}
	}
}
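
// A usage sketch of the returned function (the statusFn shown is hypothetical, not
// dictated by this file):
//
//	killFn := killPodNow(kl.podWorkers, kl.recorder)
//	err := killFn(pod, true /* isEvicted */, nil, func(status *v1.PodStatus) {
//		status.Phase = v1.PodFailed
//		status.Reason = "Evicted"
//	})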