2019-01-12 04:58:27 +00:00
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kubelet
import (
"bytes"
"context"
"fmt"
"io"
"io/ioutil"
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"runtime"
"sort"
"strings"
"sync"
2019-09-27 21:51:53 +00:00
v1 "k8s.io/api/core/v1"
2019-01-12 04:58:27 +00:00
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
utilvalidation "k8s.io/apimachinery/pkg/util/validation"
utilfeature "k8s.io/apiserver/pkg/util/feature"
2019-08-30 18:33:25 +00:00
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
2020-08-10 17:43:49 +00:00
"k8s.io/klog/v2"
2019-01-12 04:58:27 +00:00
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/api/v1/resource"
podshelper "k8s.io/kubernetes/pkg/apis/core/pods"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/fieldpath"
"k8s.io/kubernetes/pkg/kubelet/cm"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
2020-08-10 17:43:49 +00:00
"k8s.io/kubernetes/pkg/kubelet/cri/streaming/portforward"
remotecommandserver "k8s.io/kubernetes/pkg/kubelet/cri/streaming/remotecommand"
2019-01-12 04:58:27 +00:00
"k8s.io/kubernetes/pkg/kubelet/envvars"
"k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/status"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
2020-08-10 17:43:49 +00:00
"k8s.io/kubernetes/pkg/kubelet/util"
2019-01-12 04:58:27 +00:00
"k8s.io/kubernetes/pkg/kubelet/util/format"
volumeutil "k8s.io/kubernetes/pkg/volume/util"
2019-09-27 21:51:53 +00:00
"k8s.io/kubernetes/pkg/volume/util/hostutil"
2019-04-07 17:07:55 +00:00
"k8s.io/kubernetes/pkg/volume/util/subpath"
2019-01-12 04:58:27 +00:00
"k8s.io/kubernetes/pkg/volume/util/volumepathhandler"
volumevalidation "k8s.io/kubernetes/pkg/volume/validation"
"k8s.io/kubernetes/third_party/forked/golang/expansion"
)
const (
	// managedHostsHeader is the first line of the hosts file generated for
	// pods that do not use the host network.
	managedHostsHeader = "# Kubernetes-managed hosts file.\n"
	// managedHostsHeaderWithHostNetwork is the first line of the hosts file
	// generated for pods that use the host network.
	managedHostsHeaderWithHostNetwork = "# Kubernetes-managed hosts file (host network).\n"

	// Capacity of the channel for storing pods to kill. A small number should
	// suffice because a goroutine is dedicated to check the channel and does
	// not block on anything else.
	podKillingChannelCapacity = 50
)
// listPodsFromDisk returns the UIDs of the pods that have a data directory
// under the kubelet's pods directory. Each sub-directory name is taken as a
// pod UID; entries that are not directories are ignored. An error from
// reading the pods directory is returned unchanged.
func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) {
	entries, err := ioutil.ReadDir(kl.getPodsDir())
	if err != nil {
		return nil, err
	}
	uids := []types.UID{}
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		uids = append(uids, types.UID(entry.Name()))
	}
	return uids, nil
}
// GetActivePods returns the non-terminal pods: every pod known to the pod
// manager, minus those removed by filterOutTerminatedPods.
func (kl *Kubelet) GetActivePods() []*v1.Pod {
	return kl.filterOutTerminatedPods(kl.podManager.GetPods())
}
// makeBlockVolumes builds the device list for the raw block devices requested
// through the container's VolumeDevices.
// Experimental
//
// Every device must use an absolute DevicePath and must name a volume in
// podVolumes that has a BlockVolumeMapper; otherwise an error is returned.
// A device is added to the result only when the symlink at its pod device map
// path exists; a missing symlink is skipped without error.
func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) {
	var devices []kubecontainer.DeviceInfo
	for _, device := range container.VolumeDevices {
		// The in-container device path has to be absolute.
		if !filepath.IsAbs(device.DevicePath) {
			return nil, fmt.Errorf("error DevicePath `%s` must be an absolute path", device.DevicePath)
		}

		vol, found := podVolumes[device.Name]
		if !found || vol.BlockVolumeMapper == nil {
			klog.Errorf("Block volume cannot be satisfied for container %q, because the volume is missing or the volume mapper is nil: %+v", container.Name, device)
			return nil, fmt.Errorf("cannot find volume %q to pass into container %q", device.Name, container.Name)
		}

		// Locate the symbolic link for this block device under the pod device path.
		mapDir, linkName := vol.BlockVolumeMapper.GetPodDeviceMapPath()
		symlinkPath := path.Join(mapDir, linkName)
		linkExists, err := blkutil.IsSymlinkExist(symlinkPath)
		if err != nil {
			return nil, err
		}
		if !linkExists {
			continue
		}

		// Honor the volume's read-only flag; otherwise grant mknod/read/write.
		permission := "mrw"
		if vol.ReadOnly {
			permission = "r"
		}
		klog.V(4).Infof("Device will be attached to container %q. Path on host: %v", container.Name, symlinkPath)
		devices = append(devices, kubecontainer.DeviceInfo{
			PathOnHost:      symlinkPath,
			PathInContainer: device.DevicePath,
			Permissions:     permission,
		})
	}
	return devices, nil
}
// makeMounts determines the mount points for the given container.
2019-12-12 01:27:03 +00:00
func makeMounts ( pod * v1 . Pod , podDir string , container * v1 . Container , hostName , hostDomain string , podIPs [ ] string , podVolumes kubecontainer . VolumeMap , hu hostutil . HostUtils , subpather subpath . Interface , expandEnvs [ ] kubecontainer . EnvVar ) ( [ ] kubecontainer . Mount , func ( ) , error ) {
2019-01-12 04:58:27 +00:00
// Kubernetes only mounts on /etc/hosts if:
// - container is not an infrastructure (pause) container
// - container is not already mounting on /etc/hosts
// - OS is not Windows
// Kubernetes will not mount /etc/hosts if:
// - when the Pod sandbox is being created, its IP is still unknown. Hence, PodIP will not have been set.
2019-12-12 01:27:03 +00:00
mountEtcHostsFile := len ( podIPs ) > 0 && runtime . GOOS != "windows"
klog . V ( 3 ) . Infof ( "container: %v/%v/%v podIPs: %q creating hosts mount: %v" , pod . Namespace , pod . Name , container . Name , podIPs , mountEtcHostsFile )
2019-01-12 04:58:27 +00:00
mounts := [ ] kubecontainer . Mount { }
var cleanupAction func ( )
for i , mount := range container . VolumeMounts {
// do not mount /etc/hosts if container is already mounting on the path
mountEtcHostsFile = mountEtcHostsFile && ( mount . MountPath != etcHostsPath )
vol , ok := podVolumes [ mount . Name ]
if ! ok || vol . Mounter == nil {
2021-01-14 00:37:06 +00:00
klog . Errorf ( "Mount cannot be satisfied for container %q, because the volume is missing (ok=%v) or the volume mounter (vol.Mounter) is nil (vol=%+v): %+v" , container . Name , ok , vol , mount )
2019-01-12 04:58:27 +00:00
return nil , cleanupAction , fmt . Errorf ( "cannot find volume %q to mount into container %q" , mount . Name , container . Name )
}
relabelVolume := false
// If the volume supports SELinux and it has not been
// relabeled already and it is not a read-only volume,
// relabel it and mark it as labeled
if vol . Mounter . GetAttributes ( ) . Managed && vol . Mounter . GetAttributes ( ) . SupportsSELinux && ! vol . SELinuxLabeled {
vol . SELinuxLabeled = true
relabelVolume = true
}
hostPath , err := volumeutil . GetPath ( vol . Mounter )
if err != nil {
return nil , cleanupAction , err
}
2019-04-07 17:07:55 +00:00
subPath := mount . SubPath
if mount . SubPathExpr != "" {
2019-01-12 04:58:27 +00:00
if ! utilfeature . DefaultFeatureGate . Enabled ( features . VolumeSubpath ) {
return nil , cleanupAction , fmt . Errorf ( "volume subpaths are disabled" )
}
2019-08-30 18:33:25 +00:00
subPath , err = kubecontainer . ExpandContainerVolumeMounts ( mount , expandEnvs )
if err != nil {
return nil , cleanupAction , err
}
2019-04-07 17:07:55 +00:00
}
if subPath != "" {
if ! utilfeature . DefaultFeatureGate . Enabled ( features . VolumeSubpath ) {
return nil , cleanupAction , fmt . Errorf ( "volume subpaths are disabled" )
2019-01-12 04:58:27 +00:00
}
2019-04-07 17:07:55 +00:00
if filepath . IsAbs ( subPath ) {
return nil , cleanupAction , fmt . Errorf ( "error SubPath `%s` must not be an absolute path" , subPath )
}
err = volumevalidation . ValidatePathNoBacksteps ( subPath )
2019-01-12 04:58:27 +00:00
if err != nil {
2019-04-07 17:07:55 +00:00
return nil , cleanupAction , fmt . Errorf ( "unable to provision SubPath `%s`: %v" , subPath , err )
2019-01-12 04:58:27 +00:00
}
volumePath := hostPath
2019-04-07 17:07:55 +00:00
hostPath = filepath . Join ( volumePath , subPath )
2019-01-12 04:58:27 +00:00
2019-09-27 21:51:53 +00:00
if subPathExists , err := hu . PathExists ( hostPath ) ; err != nil {
2019-01-12 04:58:27 +00:00
klog . Errorf ( "Could not determine if subPath %s exists; will not attempt to change its permissions" , hostPath )
} else if ! subPathExists {
// Create the sub path now because if it's auto-created later when referenced, it may have an
// incorrect ownership and mode. For example, the sub path directory must have at least g+rwx
// when the pod specifies an fsGroup, and if the directory is not created here, Docker will
// later auto-create it with the incorrect mode 0750
// Make extra care not to escape the volume!
2019-09-27 21:51:53 +00:00
perm , err := hu . GetMode ( volumePath )
2019-01-12 04:58:27 +00:00
if err != nil {
return nil , cleanupAction , err
}
2019-04-07 17:07:55 +00:00
if err := subpather . SafeMakeDir ( subPath , volumePath , perm ) ; err != nil {
2019-01-12 04:58:27 +00:00
// Don't pass detailed error back to the user because it could give information about host filesystem
klog . Errorf ( "failed to create subPath directory for volumeMount %q of container %q: %v" , mount . Name , container . Name , err )
return nil , cleanupAction , fmt . Errorf ( "failed to create subPath directory for volumeMount %q of container %q" , mount . Name , container . Name )
}
}
2019-04-07 17:07:55 +00:00
hostPath , cleanupAction , err = subpather . PrepareSafeSubpath ( subpath . Subpath {
2019-01-12 04:58:27 +00:00
VolumeMountIndex : i ,
Path : hostPath ,
VolumeName : vol . InnerVolumeSpecName ,
VolumePath : volumePath ,
PodDir : podDir ,
ContainerName : container . Name ,
} )
if err != nil {
// Don't pass detailed error back to the user because it could give information about host filesystem
klog . Errorf ( "failed to prepare subPath for volumeMount %q of container %q: %v" , mount . Name , container . Name , err )
return nil , cleanupAction , fmt . Errorf ( "failed to prepare subPath for volumeMount %q of container %q" , mount . Name , container . Name )
}
}
// Docker Volume Mounts fail on Windows if it is not of the form C:/
if volumeutil . IsWindowsLocalPath ( runtime . GOOS , hostPath ) {
hostPath = volumeutil . MakeAbsolutePath ( runtime . GOOS , hostPath )
}
containerPath := mount . MountPath
// IsAbs returns false for UNC path/SMB shares/named pipes in Windows. So check for those specifically and skip MakeAbsolutePath
if ! volumeutil . IsWindowsUNCPath ( runtime . GOOS , containerPath ) && ! filepath . IsAbs ( containerPath ) {
containerPath = volumeutil . MakeAbsolutePath ( runtime . GOOS , containerPath )
}
propagation , err := translateMountPropagation ( mount . MountPropagation )
if err != nil {
return nil , cleanupAction , err
}
klog . V ( 5 ) . Infof ( "Pod %q container %q mount %q has propagation %q" , format . Pod ( pod ) , container . Name , mount . Name , propagation )
mustMountRO := vol . Mounter . GetAttributes ( ) . ReadOnly
mounts = append ( mounts , kubecontainer . Mount {
Name : mount . Name ,
ContainerPath : containerPath ,
HostPath : hostPath ,
ReadOnly : mount . ReadOnly || mustMountRO ,
SELinuxRelabel : relabelVolume ,
Propagation : propagation ,
} )
}
if mountEtcHostsFile {
hostAliases := pod . Spec . HostAliases
2019-12-12 01:27:03 +00:00
hostsMount , err := makeHostsMount ( podDir , podIPs , hostName , hostDomain , hostAliases , pod . Spec . HostNetwork )
2019-01-12 04:58:27 +00:00
if err != nil {
return nil , cleanupAction , err
}
mounts = append ( mounts , * hostsMount )
}
return mounts , cleanupAction , nil
}
// translateMountPropagation converts a v1.MountPropagationMode into the
// matching runtimeapi.MountPropagation value. A nil mode and
// v1.MountPropagationNone both map to PRIVATE; an unrecognized mode yields an
// error. On Windows the result is always PRIVATE.
func translateMountPropagation(mountMode *v1.MountPropagationMode) (runtimeapi.MountPropagation, error) {
	// Windows containers don't support mount propagation, use private for it.
	// Refer https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation.
	if runtime.GOOS == "windows" {
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	}
	// PRIVATE is the default when no mode is given.
	if mountMode == nil {
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	}

	switch *mountMode {
	case v1.MountPropagationHostToContainer:
		return runtimeapi.MountPropagation_PROPAGATION_HOST_TO_CONTAINER, nil
	case v1.MountPropagationBidirectional:
		return runtimeapi.MountPropagation_PROPAGATION_BIDIRECTIONAL, nil
	case v1.MountPropagationNone:
		return runtimeapi.MountPropagation_PROPAGATION_PRIVATE, nil
	}
	return 0, fmt.Errorf("invalid MountPropagation mode: %q", *mountMode)
}
2020-07-17 23:14:37 +00:00
// getEtcHostsPath returns the full host-side path to a pod's generated
// /etc/hosts file, i.e. the "etc-hosts" entry inside podDir.
//
// The result is a path on the host filesystem, so it is joined with
// filepath.Join (OS-specific separator) rather than the slash-only path.Join.
func getEtcHostsPath(podDir string) string {
	return filepath.Join(podDir, "etc-hosts")
}
2019-01-12 04:58:27 +00:00
// makeHostsMount makes the mountpoint for the hosts file that the containers
2019-12-12 01:27:03 +00:00
// in a pod are injected with. podIPs is provided instead of podIP as podIPs
// are present even if dual-stack feature flag is not enabled.
func makeHostsMount ( podDir string , podIPs [ ] string , hostName , hostDomainName string , hostAliases [ ] v1 . HostAlias , useHostNetwork bool ) ( * kubecontainer . Mount , error ) {
2020-07-17 23:14:37 +00:00
hostsFilePath := getEtcHostsPath ( podDir )
2019-12-12 01:27:03 +00:00
if err := ensureHostsFile ( hostsFilePath , podIPs , hostName , hostDomainName , hostAliases , useHostNetwork ) ; err != nil {
2019-01-12 04:58:27 +00:00
return nil , err
}
return & kubecontainer . Mount {
Name : "k8s-managed-etc-hosts" ,
ContainerPath : etcHostsPath ,
HostPath : hostsFilePath ,
ReadOnly : false ,
SELinuxRelabel : true ,
} , nil
}
// ensureHostsFile ensures that the given host file has an up-to-date ip, host
// name, and domain name.
2019-12-12 01:27:03 +00:00
func ensureHostsFile ( fileName string , hostIPs [ ] string , hostName , hostDomainName string , hostAliases [ ] v1 . HostAlias , useHostNetwork bool ) error {
2019-01-12 04:58:27 +00:00
var hostsFileContent [ ] byte
var err error
if useHostNetwork {
// if Pod is using host network, read hosts file from the node's filesystem.
// `etcHostsPath` references the location of the hosts file on the node.
// `/etc/hosts` for *nix systems.
hostsFileContent , err = nodeHostsFileContent ( etcHostsPath , hostAliases )
if err != nil {
return err
}
} else {
// if Pod is not using host network, create a managed hosts file with Pod IP and other information.
2019-12-12 01:27:03 +00:00
hostsFileContent = managedHostsFileContent ( hostIPs , hostName , hostDomainName , hostAliases )
2019-01-12 04:58:27 +00:00
}
return ioutil . WriteFile ( fileName , hostsFileContent , 0644 )
}
// nodeHostsFileContent builds the hosts file content for a pod that uses the
// host network: the host-network header, followed by the content of the file
// at hostsFilePath, followed by the entries rendered from hostAliases. An
// error from reading hostsFilePath is returned unchanged.
func nodeHostsFileContent(hostsFilePath string, hostAliases []v1.HostAlias) ([]byte, error) {
	nodeHosts, err := ioutil.ReadFile(hostsFilePath)
	if err != nil {
		return nil, err
	}
	content := bytes.NewBufferString(managedHostsHeaderWithHostNetwork)
	content.Write(nodeHosts)
	content.Write(hostsEntriesFromHostAliases(hostAliases))
	return content.Bytes(), nil
}
2019-12-12 01:27:03 +00:00
// managedHostsFileContent generates the content of the managed etc hosts based on Pod IPs and other
2019-01-12 04:58:27 +00:00
// information.
2019-12-12 01:27:03 +00:00
func managedHostsFileContent ( hostIPs [ ] string , hostName , hostDomainName string , hostAliases [ ] v1 . HostAlias ) [ ] byte {
2019-01-12 04:58:27 +00:00
var buffer bytes . Buffer
buffer . WriteString ( managedHostsHeader )
buffer . WriteString ( "127.0.0.1\tlocalhost\n" ) // ipv4 localhost
buffer . WriteString ( "::1\tlocalhost ip6-localhost ip6-loopback\n" ) // ipv6 localhost
buffer . WriteString ( "fe00::0\tip6-localnet\n" )
buffer . WriteString ( "fe00::0\tip6-mcastprefix\n" )
buffer . WriteString ( "fe00::1\tip6-allnodes\n" )
buffer . WriteString ( "fe00::2\tip6-allrouters\n" )
if len ( hostDomainName ) > 0 {
2019-12-12 01:27:03 +00:00
// host entry generated for all IPs in podIPs
// podIPs field is populated for clusters even
// dual-stack feature flag is not enabled.
for _ , hostIP := range hostIPs {
buffer . WriteString ( fmt . Sprintf ( "%s\t%s.%s\t%s\n" , hostIP , hostName , hostDomainName , hostName ) )
}
2019-01-12 04:58:27 +00:00
} else {
2019-12-12 01:27:03 +00:00
for _ , hostIP := range hostIPs {
buffer . WriteString ( fmt . Sprintf ( "%s\t%s\n" , hostIP , hostName ) )
}
2019-01-12 04:58:27 +00:00
}
buffer . Write ( hostsEntriesFromHostAliases ( hostAliases ) )
return buffer . Bytes ( )
}
// hostsEntriesFromHostAliases renders hostAliases as hosts-file lines. The
// section starts with a blank line and a marker comment, followed by one line
// per alias: the IP and all of its hostnames separated by tabs. An empty
// (non-nil) slice is returned when there are no aliases.
func hostsEntriesFromHostAliases(hostAliases []v1.HostAlias) []byte {
	if len(hostAliases) == 0 {
		return []byte{}
	}

	var section bytes.Buffer
	section.WriteString("\n")
	section.WriteString("# Entries added by HostAliases.\n")
	// All hostnames of one alias share a single line with its IP.
	for _, alias := range hostAliases {
		names := strings.Join(alias.Hostnames, "\t")
		fmt.Fprintf(&section, "%s\t%s\n", alias.IP, names)
	}
	return section.Bytes()
}
// truncatePodHostnameIfNeeded truncates the pod hostname if it's longer than 63 chars.
//
// Hostnames of at most 63 bytes are returned unchanged. Longer ones are cut
// to 63 bytes and any trailing '-' or '.' characters are removed. An error is
// returned if nothing is left after trimming. podName is only used in the log
// and error messages.
func truncatePodHostnameIfNeeded(podName, hostname string) (string, error) {
	// Cap hostname at 63 chars (specification is 64bytes which is 63 chars and the null terminating char).
	const hostnameMaxLen = 63
	if len(hostname) <= hostnameMaxLen {
		return hostname, nil
	}
	// hostname should not end with '-' or '.'
	truncated := strings.TrimRight(hostname[:hostnameMaxLen], "-.")
	// Log after trimming so the reported value is the hostname actually used.
	klog.Errorf("hostname for pod:%q was longer than %d. Truncated hostname to :%q", podName, hostnameMaxLen, truncated)
	if len(truncated) == 0 {
		// This should never happen.
		return "", fmt.Errorf("hostname for pod %q was invalid: %q", podName, hostname)
	}
	return truncated, nil
}
// GeneratePodHostNameAndDomain creates a hostname and domain name for a pod,
// given that pod's spec and annotations or returns an error.
func ( kl * Kubelet ) GeneratePodHostNameAndDomain ( pod * v1 . Pod ) ( string , string , error ) {
clusterDomain := kl . dnsConfigurer . ClusterDomain
hostname := pod . Name
if len ( pod . Spec . Hostname ) > 0 {
if msgs := utilvalidation . IsDNS1123Label ( pod . Spec . Hostname ) ; len ( msgs ) != 0 {
2019-09-27 21:51:53 +00:00
return "" , "" , fmt . Errorf ( "pod Hostname %q is not a valid DNS label: %s" , pod . Spec . Hostname , strings . Join ( msgs , ";" ) )
2019-01-12 04:58:27 +00:00
}
hostname = pod . Spec . Hostname
}
hostname , err := truncatePodHostnameIfNeeded ( pod . Name , hostname )
if err != nil {
return "" , "" , err
}
hostDomain := ""
if len ( pod . Spec . Subdomain ) > 0 {
if msgs := utilvalidation . IsDNS1123Label ( pod . Spec . Subdomain ) ; len ( msgs ) != 0 {
2019-09-27 21:51:53 +00:00
return "" , "" , fmt . Errorf ( "pod Subdomain %q is not a valid DNS label: %s" , pod . Spec . Subdomain , strings . Join ( msgs , ";" ) )
2019-01-12 04:58:27 +00:00
}
hostDomain = fmt . Sprintf ( "%s.%s.svc.%s" , pod . Spec . Subdomain , pod . Namespace , clusterDomain )
}
return hostname , hostDomain , nil
}
// GetPodCgroupParent returns the cgroup parent for the pod: the second value
// reported by GetPodContainerName on a pod container manager freshly obtained
// from the container manager.
func (kl *Kubelet) GetPodCgroupParent(pod *v1.Pod) string {
	_, cgroupParent := kl.containerManager.NewPodContainerManager().GetPodContainerName(pod)
	return cgroupParent
}
// GenerateRunContainerOptions generates the RunContainerOptions, which can be used by
// the container runtime to set parameters for launching a container.
2019-12-12 01:27:03 +00:00
func ( kl * Kubelet ) GenerateRunContainerOptions ( pod * v1 . Pod , container * v1 . Container , podIP string , podIPs [ ] string ) ( * kubecontainer . RunContainerOptions , func ( ) , error ) {
2019-01-12 04:58:27 +00:00
opts , err := kl . containerManager . GetResources ( pod , container )
if err != nil {
return nil , nil , err
}
2020-08-10 17:43:49 +00:00
// The value of hostname is the short host name and it is sent to makeMounts to create /etc/hosts file.
2019-01-12 04:58:27 +00:00
hostname , hostDomainName , err := kl . GeneratePodHostNameAndDomain ( pod )
if err != nil {
return nil , nil , err
}
2020-08-10 17:43:49 +00:00
// nodename will be equals to hostname if SetHostnameAsFQDN is nil or false. If SetHostnameFQDN
// is true and hostDomainName is defined, nodename will be the FQDN (hostname.hostDomainName)
nodename , err := util . GetNodenameForKernel ( hostname , hostDomainName , pod . Spec . SetHostnameAsFQDN )
if err != nil {
return nil , nil , err
}
opts . Hostname = nodename
2019-01-12 04:58:27 +00:00
podName := volumeutil . GetUniquePodName ( pod )
volumes := kl . volumeManager . GetMountedVolumesForPod ( podName )
2020-03-26 21:07:15 +00:00
blkutil := volumepathhandler . NewBlockVolumePathHandler ( )
blkVolumes , err := kl . makeBlockVolumes ( pod , container , volumes , blkutil )
if err != nil {
return nil , nil , err
2019-01-12 04:58:27 +00:00
}
2020-03-26 21:07:15 +00:00
opts . Devices = append ( opts . Devices , blkVolumes ... )
2019-01-12 04:58:27 +00:00
2019-12-12 01:27:03 +00:00
envs , err := kl . makeEnvironmentVariables ( pod , container , podIP , podIPs )
2019-01-12 04:58:27 +00:00
if err != nil {
return nil , nil , err
}
opts . Envs = append ( opts . Envs , envs ... )
2019-12-12 01:27:03 +00:00
// only podIPs is sent to makeMounts, as podIPs is populated even if dual-stack feature flag is not enabled.
mounts , cleanupAction , err := makeMounts ( pod , kl . getPodDir ( pod . UID ) , container , hostname , hostDomainName , podIPs , volumes , kl . hostutil , kl . subpather , opts . Envs )
2019-01-12 04:58:27 +00:00
if err != nil {
return nil , cleanupAction , err
}
opts . Mounts = append ( opts . Mounts , mounts ... )
2019-12-12 01:27:03 +00:00
// adding TerminationMessagePath on Windows is only allowed if ContainerD is used. Individual files cannot
// be mounted as volumes using Docker for Windows.
supportsSingleFileMapping := kl . containerRuntime . SupportsSingleFileMapping ( )
if len ( container . TerminationMessagePath ) != 0 && supportsSingleFileMapping {
2019-01-12 04:58:27 +00:00
p := kl . getPodContainerDir ( pod . UID , container . Name )
if err := os . MkdirAll ( p , 0750 ) ; err != nil {
klog . Errorf ( "Error on creating %q: %v" , p , err )
} else {
opts . PodContainerDir = p
}
}
// only do this check if the experimental behavior is enabled, otherwise allow it to default to false
if kl . experimentalHostUserNamespaceDefaulting {
opts . EnableHostUserNamespace = kl . enableHostUserNamespace ( pod )
}
return opts , cleanupAction , nil
}
// masterServices names the services in the master service namespace whose
// environment variables are injected even when service links are disabled.
var masterServices = sets.NewString("kubernetes")

// getServiceEnvVarMap makes a map[string]string of env vars for services a
// pod in namespace ns should see.
//
// Master services from the master service namespace are always included.
// Services from ns are included only when enableServiceLinks is true, and
// they take precedence over a master service of the same name. Services
// without a cluster IP are ignored. With no service lister configured the
// result is an empty map and no error. If listing fails, the (empty) map is
// returned together with an error wrapping the lister's error.
func (kl *Kubelet) getServiceEnvVarMap(ns string, enableServiceLinks bool) (map[string]string, error) {
	var (
		serviceMap = make(map[string]*v1.Service)
		m          = make(map[string]string)
	)

	// Get all service resources from the master (via a cache),
	// and populate them into service environment variables.
	if kl.serviceLister == nil {
		// Kubelets without masters (e.g. plain GCE ContainerVM) don't set env vars.
		return m, nil
	}
	services, err := kl.serviceLister.List(labels.Everything())
	if err != nil {
		// Keep the underlying cause so the failure can be diagnosed.
		return m, fmt.Errorf("failed to list services when setting up env vars: %w", err)
	}

	// project the services in namespace ns onto the master services
	for i := range services {
		service := services[i]
		// ignore services where ClusterIP is "None" or empty
		if !v1helper.IsServiceIPSet(service) {
			continue
		}
		serviceName := service.Name

		// We always want to add environment variables for master services
		// from the master service namespace, even if enableServiceLinks is false.
		// We also add environment variables for other services in the same
		// namespace, if enableServiceLinks is true.
		if service.Namespace == kl.masterServiceNamespace && masterServices.Has(serviceName) {
			if _, exists := serviceMap[serviceName]; !exists {
				serviceMap[serviceName] = service
			}
		} else if service.Namespace == ns && enableServiceLinks {
			serviceMap[serviceName] = service
		}
	}

	mappedServices := make([]*v1.Service, 0, len(serviceMap))
	for key := range serviceMap {
		mappedServices = append(mappedServices, serviceMap[key])
	}

	for _, e := range envvars.FromServices(mappedServices) {
		m[e.Name] = e.Value
	}
	return m, nil
}
// Make the environment variables for a pod in the given namespace.
2019-12-12 01:27:03 +00:00
func ( kl * Kubelet ) makeEnvironmentVariables ( pod * v1 . Pod , container * v1 . Container , podIP string , podIPs [ ] string ) ( [ ] kubecontainer . EnvVar , error ) {
2019-04-07 17:07:55 +00:00
if pod . Spec . EnableServiceLinks == nil {
return nil , fmt . Errorf ( "nil pod.spec.enableServiceLinks encountered, cannot construct envvars" )
}
2020-08-10 17:43:49 +00:00
// If the pod originates from the kube-api, then we know that the kube-apiserver is responding and the kubelet's credentials are valid.
// Knowing this, it is reasonable to wait until the service lister has synchronized at least once before attempting to build
// a service env var map. This doesn't prevent the race below from happening entirely, but it does prevent the "obvious"
// failure case of services simply not having completed a list operation that can reasonably be expected to succeed.
// One common case this prevents is a kubelet restart reading pods before services and some pod not having the
// KUBERNETES_SERVICE_HOST injected because we didn't wait a short time for services to sync before proceeding.
// The KUBERNETES_SERVICE_HOST link is special because it is unconditionally injected into pods and is read by the
// in-cluster-config for pod clients
if ! kubetypes . IsStaticPod ( pod ) && ! kl . serviceHasSynced ( ) {
return nil , fmt . Errorf ( "services have not yet been read at least once, cannot construct envvars" )
}
2019-01-12 04:58:27 +00:00
var result [ ] kubecontainer . EnvVar
// Note: These are added to the docker Config, but are not included in the checksum computed
// by kubecontainer.HashContainer(...). That way, we can still determine whether an
// v1.Container is already running by its hash. (We don't want to restart a container just
// because some service changed.)
//
// Note that there is a race between Kubelet seeing the pod and kubelet seeing the service.
// To avoid this users can: (1) wait between starting a service and starting; or (2) detect
// missing service env var and exit and be restarted; or (3) use DNS instead of env vars
// and keep trying to resolve the DNS name of the service (recommended).
serviceEnv , err := kl . getServiceEnvVarMap ( pod . Namespace , * pod . Spec . EnableServiceLinks )
if err != nil {
return result , err
}
var (
configMaps = make ( map [ string ] * v1 . ConfigMap )
secrets = make ( map [ string ] * v1 . Secret )
tmpEnv = make ( map [ string ] string )
)
// Env will override EnvFrom variables.
// Process EnvFrom first then allow Env to replace existing values.
for _ , envFrom := range container . EnvFrom {
switch {
case envFrom . ConfigMapRef != nil :
cm := envFrom . ConfigMapRef
name := cm . Name
configMap , ok := configMaps [ name ]
if ! ok {
if kl . kubeClient == nil {
2019-09-27 21:51:53 +00:00
return result , fmt . Errorf ( "couldn't get configMap %v/%v, no kubeClient defined" , pod . Namespace , name )
2019-01-12 04:58:27 +00:00
}
optional := cm . Optional != nil && * cm . Optional
configMap , err = kl . configMapManager . GetConfigMap ( pod . Namespace , name )
if err != nil {
if errors . IsNotFound ( err ) && optional {
// ignore error when marked optional
continue
}
return result , err
}
configMaps [ name ] = configMap
}
invalidKeys := [ ] string { }
for k , v := range configMap . Data {
if len ( envFrom . Prefix ) > 0 {
k = envFrom . Prefix + k
}
if errMsgs := utilvalidation . IsEnvVarName ( k ) ; len ( errMsgs ) != 0 {
invalidKeys = append ( invalidKeys , k )
continue
}
tmpEnv [ k ] = v
}
if len ( invalidKeys ) > 0 {
sort . Strings ( invalidKeys )
kl . recorder . Eventf ( pod , v1 . EventTypeWarning , "InvalidEnvironmentVariableNames" , "Keys [%s] from the EnvFrom configMap %s/%s were skipped since they are considered invalid environment variable names." , strings . Join ( invalidKeys , ", " ) , pod . Namespace , name )
}
case envFrom . SecretRef != nil :
s := envFrom . SecretRef
name := s . Name
secret , ok := secrets [ name ]
if ! ok {
if kl . kubeClient == nil {
2019-09-27 21:51:53 +00:00
return result , fmt . Errorf ( "couldn't get secret %v/%v, no kubeClient defined" , pod . Namespace , name )
2019-01-12 04:58:27 +00:00
}
optional := s . Optional != nil && * s . Optional
secret , err = kl . secretManager . GetSecret ( pod . Namespace , name )
if err != nil {
if errors . IsNotFound ( err ) && optional {
// ignore error when marked optional
continue
}
return result , err
}
secrets [ name ] = secret
}
invalidKeys := [ ] string { }
for k , v := range secret . Data {
if len ( envFrom . Prefix ) > 0 {
k = envFrom . Prefix + k
}
if errMsgs := utilvalidation . IsEnvVarName ( k ) ; len ( errMsgs ) != 0 {
invalidKeys = append ( invalidKeys , k )
continue
}
tmpEnv [ k ] = string ( v )
}
if len ( invalidKeys ) > 0 {
sort . Strings ( invalidKeys )
kl . recorder . Eventf ( pod , v1 . EventTypeWarning , "InvalidEnvironmentVariableNames" , "Keys [%s] from the EnvFrom secret %s/%s were skipped since they are considered invalid environment variable names." , strings . Join ( invalidKeys , ", " ) , pod . Namespace , name )
}
}
}
// Determine the final values of variables:
//
// 1. Determine the final value of each variable:
// a. If the variable's Value is set, expand the `$(var)` references to other
// variables in the .Value field; the sources of variables are the declared
// variables of the container and the service environment variables
// b. If a source is defined for an environment variable, resolve the source
// 2. Create the container's environment in the order variables are declared
// 3. Add remaining service environment vars
var (
mappingFunc = expansion . MappingFuncFor ( tmpEnv , serviceEnv )
)
for _ , envVar := range container . Env {
runtimeVal := envVar . Value
if runtimeVal != "" {
// Step 1a: expand variable references
runtimeVal = expansion . Expand ( runtimeVal , mappingFunc )
} else if envVar . ValueFrom != nil {
// Step 1b: resolve alternate env var sources
switch {
case envVar . ValueFrom . FieldRef != nil :
2019-12-12 01:27:03 +00:00
runtimeVal , err = kl . podFieldSelectorRuntimeValue ( envVar . ValueFrom . FieldRef , pod , podIP , podIPs )
2019-01-12 04:58:27 +00:00
if err != nil {
return result , err
}
case envVar . ValueFrom . ResourceFieldRef != nil :
defaultedPod , defaultedContainer , err := kl . defaultPodLimitsForDownwardAPI ( pod , container )
if err != nil {
return result , err
}
runtimeVal , err = containerResourceRuntimeValue ( envVar . ValueFrom . ResourceFieldRef , defaultedPod , defaultedContainer )
if err != nil {
return result , err
}
case envVar . ValueFrom . ConfigMapKeyRef != nil :
cm := envVar . ValueFrom . ConfigMapKeyRef
name := cm . Name
key := cm . Key
optional := cm . Optional != nil && * cm . Optional
configMap , ok := configMaps [ name ]
if ! ok {
if kl . kubeClient == nil {
2019-09-27 21:51:53 +00:00
return result , fmt . Errorf ( "couldn't get configMap %v/%v, no kubeClient defined" , pod . Namespace , name )
2019-01-12 04:58:27 +00:00
}
configMap , err = kl . configMapManager . GetConfigMap ( pod . Namespace , name )
if err != nil {
if errors . IsNotFound ( err ) && optional {
// ignore error when marked optional
continue
}
return result , err
}
configMaps [ name ] = configMap
}
runtimeVal , ok = configMap . Data [ key ]
if ! ok {
if optional {
continue
}
2019-09-27 21:51:53 +00:00
return result , fmt . Errorf ( "couldn't find key %v in ConfigMap %v/%v" , key , pod . Namespace , name )
2019-01-12 04:58:27 +00:00
}
case envVar . ValueFrom . SecretKeyRef != nil :
s := envVar . ValueFrom . SecretKeyRef
name := s . Name
key := s . Key
optional := s . Optional != nil && * s . Optional
secret , ok := secrets [ name ]
if ! ok {
if kl . kubeClient == nil {
2019-09-27 21:51:53 +00:00
return result , fmt . Errorf ( "couldn't get secret %v/%v, no kubeClient defined" , pod . Namespace , name )
2019-01-12 04:58:27 +00:00
}
secret , err = kl . secretManager . GetSecret ( pod . Namespace , name )
if err != nil {
if errors . IsNotFound ( err ) && optional {
// ignore error when marked optional
continue
}
return result , err
}
secrets [ name ] = secret
}
runtimeValBytes , ok := secret . Data [ key ]
if ! ok {
if optional {
continue
}
2019-09-27 21:51:53 +00:00
return result , fmt . Errorf ( "couldn't find key %v in Secret %v/%v" , key , pod . Namespace , name )
2019-01-12 04:58:27 +00:00
}
runtimeVal = string ( runtimeValBytes )
}
}
// Accesses apiserver+Pods.
// So, the master may set service env vars, or kubelet may. In case both are doing
// it, we delete the key from the kubelet-generated ones so we don't have duplicate
// env vars.
// TODO: remove this next line once all platforms use apiserver+Pods.
delete ( serviceEnv , envVar . Name )
tmpEnv [ envVar . Name ] = runtimeVal
}
// Append the env vars
for k , v := range tmpEnv {
result = append ( result , kubecontainer . EnvVar { Name : k , Value : v } )
}
// Append remaining service env vars.
for k , v := range serviceEnv {
// Accesses apiserver+Pods.
// So, the master may set service env vars, or kubelet may. In case both are doing
// it, we skip the key from the kubelet-generated ones so we don't have duplicate
// env vars.
// TODO: remove this next line once all platforms use apiserver+Pods.
if _ , present := tmpEnv [ k ] ; ! present {
result = append ( result , kubecontainer . EnvVar { Name : k , Value : v } )
}
}
return result , nil
}
// podFieldSelectorRuntimeValue returns the runtime value of the given
// selector for a pod.
2019-12-12 01:27:03 +00:00
func ( kl * Kubelet ) podFieldSelectorRuntimeValue ( fs * v1 . ObjectFieldSelector , pod * v1 . Pod , podIP string , podIPs [ ] string ) ( string , error ) {
2019-01-12 04:58:27 +00:00
internalFieldPath , _ , err := podshelper . ConvertDownwardAPIFieldLabel ( fs . APIVersion , fs . FieldPath , "" )
if err != nil {
return "" , err
}
switch internalFieldPath {
case "spec.nodeName" :
return pod . Spec . NodeName , nil
case "spec.serviceAccountName" :
return pod . Spec . ServiceAccountName , nil
case "status.hostIP" :
2020-12-01 01:06:26 +00:00
hostIPs , err := kl . getHostIPsAnyWay ( )
2019-01-12 04:58:27 +00:00
if err != nil {
return "" , err
}
2020-12-01 01:06:26 +00:00
return hostIPs [ 0 ] . String ( ) , nil
2019-01-12 04:58:27 +00:00
case "status.podIP" :
return podIP , nil
2019-12-12 01:27:03 +00:00
case "status.podIPs" :
return strings . Join ( podIPs , "," ) , nil
2019-01-12 04:58:27 +00:00
}
return fieldpath . ExtractFieldPathAsString ( pod , internalFieldPath )
}
// containerResourceRuntimeValue returns the value selected by the resource
// field selector. When the selector names a container, the value is looked up
// in the pod by that container name; otherwise it is taken from the supplied
// container.
func containerResourceRuntimeValue(fs *v1.ResourceFieldSelector, pod *v1.Pod, container *v1.Container) (string, error) {
	if name := fs.ContainerName; len(name) != 0 {
		return resource.ExtractResourceValueByContainerName(fs, pod, name)
	}
	return resource.ExtractContainerResourceValue(fs, container)
}
// killPod asks the container runtime to kill the given pod.
//
// At least one of runningPod and status must be non-nil. runningPod wins when
// both are set; otherwise the running pod is derived from status. An error is
// returned when both are nil or when the runtime fails to kill the pod.
func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus, gracePeriodOverride *int64) error {
	var target kubecontainer.Pod
	switch {
	case runningPod != nil:
		target = *runningPod
	case status != nil:
		target = kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), status)
	default:
		return fmt.Errorf("one of the two arguments must be non-nil: runningPod, status")
	}

	// Call the container runtime KillPod method which stops all running containers of the pod
	killErr := kl.containerRuntime.KillPod(pod, target, gracePeriodOverride)
	if killErr != nil {
		return killErr
	}

	// A failed QoS cgroup update is only logged; it does not fail the kill.
	if qosErr := kl.containerManager.UpdateQOSCgroups(); qosErr != nil {
		klog.V(2).Infof("Failed to update QoS cgroups while killing pod: %v", qosErr)
	}
	return nil
}
// makePodDataDirs creates the pod directory, the pod volumes directory and the
// pod plugins directory (in that order) for the given pod, each with mode 0750.
// It stops at and returns the first error that is not an "already exists" error.
func (kl *Kubelet) makePodDataDirs(pod *v1.Pod) error {
	// Method values keep each path lookup lazy: a later directory path is only
	// computed once the earlier directories were created successfully.
	pathGetters := []func(types.UID) string{
		kl.getPodDir,
		kl.getPodVolumesDir,
		kl.getPodPluginsDir,
	}
	for _, pathOf := range pathGetters {
		err := os.MkdirAll(pathOf(pod.UID), 0750)
		if err == nil || os.IsExist(err) {
			continue
		}
		return err
	}
	return nil
}
// getPullSecretsForPod fetches, through the secret manager, every image pull
// secret referenced by the pod spec. Secrets that cannot be retrieved are
// logged as a warning and left out of the result, so the returned slice may be
// shorter than the list of references (but is never nil).
func (kl *Kubelet) getPullSecretsForPod(pod *v1.Pod) []v1.Secret {
	found := []v1.Secret{}
	for i := range pod.Spec.ImagePullSecrets {
		refName := pod.Spec.ImagePullSecrets[i].Name
		secret, err := kl.secretManager.GetSecret(pod.Namespace, refName)
		if err == nil {
			found = append(found, *secret)
			continue
		}
		klog.Warningf("Unable to retrieve pull secret %s/%s for %s/%s due to %v. The image pull may not succeed.", pod.Namespace, refName, pod.Namespace, pod.Name, err)
	}
	return found
}
2020-03-26 21:07:15 +00:00
// podStatusIsTerminal reports when the specified pod has no running containers or is no longer accepting
// spec changes.
func ( kl * Kubelet ) podAndContainersAreTerminal ( pod * v1 . Pod ) ( containersTerminal , podWorkerTerminal bool ) {
2019-01-12 04:58:27 +00:00
// Check the cached pod status which was set after the last sync.
status , ok := kl . statusManager . GetPodStatus ( pod . UID )
if ! ok {
// If there is no cached status, use the status from the
// apiserver. This is useful if kubelet has recently been
// restarted.
status = pod . Status
}
2020-03-26 21:07:15 +00:00
// A pod transitions into failed or succeeded from either container lifecycle (RestartNever container
// fails) or due to external events like deletion or eviction. A terminal pod *should* have no running
// containers, but to know that the pod has completed its lifecycle you must wait for containers to also
// be terminal.
containersTerminal = notRunning ( status . ContainerStatuses )
// The kubelet must accept config changes from the pod spec until it has reached a point where changes would
// have no effect on any running container.
podWorkerTerminal = status . Phase == v1 . PodFailed || status . Phase == v1 . PodSucceeded || ( pod . DeletionTimestamp != nil && containersTerminal )
return
}
// podIsTerminated returns true if the provided pod is in a terminal phase ("Failed", "Succeeded") or
// has been deleted and has no running containers. This corresponds to when a pod must accept changes to
// its pod spec (e.g. terminating containers allow grace period to be shortened).
func ( kl * Kubelet ) podIsTerminated ( pod * v1 . Pod ) bool {
_ , podWorkerTerminal := kl . podAndContainersAreTerminal ( pod )
return podWorkerTerminal
2019-01-12 04:58:27 +00:00
}
2020-03-26 21:07:15 +00:00
// IsPodTerminated returns true if the pod with the provided UID is in a terminal phase ("Failed",
// "Succeeded") or has been deleted and has no running containers. This corresponds to when a pod must
// accept changes to its pod spec (e.g. terminating containers allow grace period to be shortened)
2019-01-12 04:58:27 +00:00
func ( kl * Kubelet ) IsPodTerminated ( uid types . UID ) bool {
pod , podFound := kl . podManager . GetPodByUID ( uid )
if ! podFound {
return true
}
return kl . podIsTerminated ( pod )
}
// IsPodDeleted returns true if the pod is deleted. For the pod to be deleted, either:
// 1. The pod object is deleted (the pod manager no longer knows the UID)
// 2. The pod's status is evicted
// 3. The pod's deletion timestamp is set, and containers are not running
//
// The status consulted is the status manager's cached one, falling back to
// pod.Status when nothing is cached.
func (kl *Kubelet) IsPodDeleted(uid types.UID) bool {
	pod, known := kl.podManager.GetPodByUID(uid)
	if !known {
		return true
	}

	podStatus, cached := kl.statusManager.GetPodStatus(pod.UID)
	if !cached {
		podStatus = pod.Status
	}

	if eviction.PodIsEvicted(podStatus) {
		return true
	}
	return pod.DeletionTimestamp != nil && notRunning(podStatus.ContainerStatuses)
}
// PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have
// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server.
func ( kl * Kubelet ) PodResourcesAreReclaimed ( pod * v1 . Pod , status v1 . PodStatus ) bool {
if ! notRunning ( status . ContainerStatuses ) {
2019-12-12 01:27:03 +00:00
// We shouldn't delete pods that still have running containers
2019-01-12 04:58:27 +00:00
klog . V ( 3 ) . Infof ( "Pod %q is terminated, but some containers are still running" , format . Pod ( pod ) )
return false
}
// pod's containers should be deleted
runtimeStatus , err := kl . podCache . Get ( pod . UID )
if err != nil {
klog . V ( 3 ) . Infof ( "Pod %q is terminated, Error getting runtimeStatus from the podCache: %s" , format . Pod ( pod ) , err )
return false
}
if len ( runtimeStatus . ContainerStatuses ) > 0 {
var statusStr string
for _ , status := range runtimeStatus . ContainerStatuses {
statusStr += fmt . Sprintf ( "%+v " , * status )
}
klog . V ( 3 ) . Infof ( "Pod %q is terminated, but some containers have not been cleaned up: %s" , format . Pod ( pod ) , statusStr )
return false
}
if kl . podVolumesExist ( pod . UID ) && ! kl . keepTerminatedPodVolumes {
2019-12-12 01:27:03 +00:00
// We shouldn't delete pods whose volumes have not been cleaned up if we are not keeping terminated pod volumes
2019-01-12 04:58:27 +00:00
klog . V ( 3 ) . Infof ( "Pod %q is terminated, but some volumes have not been cleaned up" , format . Pod ( pod ) )
return false
}
if kl . kubeletConfiguration . CgroupsPerQOS {
pcm := kl . containerManager . NewPodContainerManager ( )
if pcm . Exists ( pod ) {
klog . V ( 3 ) . Infof ( "Pod %q is terminated, but pod cgroup sandbox has not been cleaned up" , format . Pod ( pod ) )
return false
}
}
return true
}
// podResourcesAreReclaimed calls PodResourcesAreReclaimed with the status
// cached by the status manager, or with pod.Status when nothing is cached.
func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool {
	latest, cached := kl.statusManager.GetPodStatus(pod.UID)
	if cached {
		return kl.PodResourcesAreReclaimed(pod, latest)
	}
	return kl.PodResourcesAreReclaimed(pod, pod.Status)
}
// notRunning returns true if every status is terminated or waiting, or the
// status list is empty. A status with neither Terminated nor Waiting set makes
// the result false.
func notRunning(statuses []v1.ContainerStatus) bool {
	for i := range statuses {
		state := statuses[i].State
		if state.Terminated != nil || state.Waiting != nil {
			continue
		}
		return false
	}
	return true
}
// filterOutTerminatedPods returns, in their original order, the given pods for
// which podIsTerminated is false. The result is nil when no pod qualifies.
func (kl *Kubelet) filterOutTerminatedPods(pods []*v1.Pod) []*v1.Pod {
	var active []*v1.Pod
	for _, candidate := range pods {
		if !kl.podIsTerminated(candidate) {
			active = append(active, candidate)
		}
	}
	return active
}
// removeOrphanedPodStatuses asks the status manager to drop every status
// whose pod UID is in neither pods nor mirrorPods, i.e. pods no longer
// considered bound to this node.
func (kl *Kubelet) removeOrphanedPodStatuses(pods []*v1.Pod, mirrorPods []*v1.Pod) {
	knownUIDs := make(map[types.UID]bool)
	for _, group := range [][]*v1.Pod{pods, mirrorPods} {
		for _, p := range group {
			knownUIDs[p.UID] = true
		}
	}
	kl.statusManager.RemoveOrphanedStatuses(knownUIDs)
}
2020-08-10 17:43:49 +00:00
// deleteOrphanedMirrorPods walks the orphaned mirror pod names reported by the
// pod manager and, for each one the pod killer does not report as pending
// termination, calls podManager.DeleteMirrorPod(). Deletion errors are logged
// and do not stop the loop.
func (kl *Kubelet) deleteOrphanedMirrorPods() {
	for _, orphan := range kl.podManager.GetOrphanedMirrorPodNames() {
		if kl.podKiller.IsMirrorPodPendingTerminationByPodName(orphan) {
			// The pod killer has not finished with this pod yet.
			continue
		}
		if _, err := kl.podManager.DeleteMirrorPod(orphan, nil); err != nil {
			klog.Errorf("encountered error when deleting mirror pod %q : %v", orphan, err)
			continue
		}
		klog.V(3).Infof("deleted pod %q", orphan)
	}
}
2019-01-12 04:58:27 +00:00
// HandlePodCleanups performs a series of cleanup work, including terminating
// pod workers, killing unwanted pods, and removing orphaned volumes/pod
// directories.
// NOTE: This function is executed by the main sync loop, so it
// should not contain any blocking calls.
func ( kl * Kubelet ) HandlePodCleanups ( ) error {
// The kubelet lacks checkpointing, so we need to introspect the set of pods
// in the cgroup tree prior to inspecting the set of pods in our pod manager.
// this ensures our view of the cgroup tree does not mistakenly observe pods
// that are added after the fact...
var (
cgroupPods map [ types . UID ] cm . CgroupName
err error
)
if kl . cgroupsPerQOS {
pcm := kl . containerManager . NewPodContainerManager ( )
cgroupPods , err = pcm . GetAllPodsFromCgroups ( )
if err != nil {
return fmt . Errorf ( "failed to get list of pods that still exist on cgroup mounts: %v" , err )
}
}
allPods , mirrorPods := kl . podManager . GetPodsAndMirrorPods ( )
// Pod phase progresses monotonically. Once a pod has reached a final state,
// it should never leave regardless of the restart policy. The statuses
// of such pods should not be changed, and there is no need to sync them.
// TODO: the logic here does not handle two cases:
// 1. If the containers were removed immediately after they died, kubelet
// may fail to generate correct statuses, let alone filtering correctly.
// 2. If kubelet restarted before writing the terminated status for a pod
// to the apiserver, it could still restart the terminated pod (even
// though the pod was not considered terminated by the apiserver).
// These two conditions could be alleviated by checkpointing kubelet.
activePods := kl . filterOutTerminatedPods ( allPods )
2019-09-27 21:51:53 +00:00
desiredPods := make ( map [ types . UID ] sets . Empty )
2019-01-12 04:58:27 +00:00
for _ , pod := range activePods {
2019-09-27 21:51:53 +00:00
desiredPods [ pod . UID ] = sets . Empty { }
2019-01-12 04:58:27 +00:00
}
// Stop the workers for no-longer existing pods.
kl . podWorkers . ForgetNonExistingPodWorkers ( desiredPods )
2019-09-27 21:51:53 +00:00
kl . probeManager . CleanupPods ( desiredPods )
2019-01-12 04:58:27 +00:00
runningPods , err := kl . runtimeCache . GetPods ( )
if err != nil {
klog . Errorf ( "Error listing containers: %#v" , err )
return err
}
for _ , pod := range runningPods {
if _ , found := desiredPods [ pod . ID ] ; ! found {
2020-08-10 17:43:49 +00:00
kl . podKiller . KillPod ( & kubecontainer . PodPair { APIPod : nil , RunningPod : pod } )
2019-01-12 04:58:27 +00:00
}
}
kl . removeOrphanedPodStatuses ( allPods , mirrorPods )
// Note that we just killed the unwanted pods. This may not have reflected
// in the cache. We need to bypass the cache to get the latest set of
// running pods to clean up the volumes.
// TODO: Evaluate the performance impact of bypassing the runtime cache.
runningPods , err = kl . containerRuntime . GetPods ( false )
if err != nil {
klog . Errorf ( "Error listing containers: %#v" , err )
return err
}
// Remove any orphaned volumes.
// Note that we pass all pods (including terminated pods) to the function,
// so that we don't remove volumes associated with terminated but not yet
// deleted pods.
err = kl . cleanupOrphanedPodDirs ( allPods , runningPods )
if err != nil {
// We want all cleanup tasks to be run even if one of them failed. So
// we just log an error here and continue other cleanup tasks.
// This also applies to the other clean up tasks.
klog . Errorf ( "Failed cleaning up orphaned pod directories: %v" , err )
}
// Remove any orphaned mirror pods.
2020-08-10 17:43:49 +00:00
kl . deleteOrphanedMirrorPods ( )
2019-01-12 04:58:27 +00:00
// Remove any cgroups in the hierarchy for pods that are no longer running.
if kl . cgroupsPerQOS {
2020-08-10 17:43:49 +00:00
pcm := kl . containerManager . NewPodContainerManager ( )
kl . cleanupOrphanedPodCgroups ( pcm , cgroupPods , activePods )
2019-01-12 04:58:27 +00:00
}
kl . backOff . GC ( )
return nil
}
2020-08-10 17:43:49 +00:00
// PodKiller handles requests for killing pods and tracks which mirror pods
// are still pending termination.
type PodKiller interface {
// KillPod receives the pod specifier (API pod / running pod pair) representing the pod to kill
KillPod ( pair * kubecontainer . PodPair )
// PerformPodKillingWork performs the actual pod killing work via calling CRI
// It returns after its Close() func is called and all outstanding pod killing requests are served
PerformPodKillingWork ( )
// Close shuts the pod killer down. After Close() is called, this pod killer
// wouldn't accept any more pod killing requests
Close ( )
// IsMirrorPodPendingTerminationByPodName checks whether the mirror pod for the given full pod name is pending termination
IsMirrorPodPendingTerminationByPodName ( podFullname string ) bool
// IsMirrorPodPendingTerminationByUID checks whether the mirror pod for the given uid is pending termination
IsMirrorPodPendingTerminationByUID ( uid types . UID ) bool
// MarkMirrorPodPendingTermination marks the mirror pod entering grace period of termination
MarkMirrorPodPendingTermination ( pod * v1 . Pod )
}
// podKillerWithChannel is an implementation of PodKiller which receives pod killing requests via channel
type podKillerWithChannel struct {
// podKillingCh is the channel for getting pods to kill. KillPod sends on it,
// PerformPodKillingWork ranges over it and Close closes it.
podKillingCh chan * kubecontainer . PodPair
// podKillingLock is the lock for synchronization between HandlePodCleanups and
// pod killer; every access to mirrorPodTerminationMap in this type's methods
// holds it.
podKillingLock * sync . Mutex
// mirrorPodTerminationMap keeps track of the progress of mirror pod termination
// The key is the UID of the pod and the value is the full name of the pod
mirrorPodTerminationMap map [ string ] string
// killPod is the func which invokes runtime to kill the pod
killPod func ( pod * v1 . Pod , runningPod * kubecontainer . Pod , status * kubecontainer . PodStatus , gracePeriodOverride * int64 ) error
}
// NewPodKiller returns a functional PodKiller backed by a buffered channel of
// capacity podKillingChannelCapacity. Kill requests are executed through the
// given kubelet's killPod method.
func NewPodKiller(kl *Kubelet) PodKiller {
	return &podKillerWithChannel{
		podKillingCh:            make(chan *kubecontainer.PodPair, podKillingChannelCapacity),
		podKillingLock:          &sync.Mutex{},
		mirrorPodTerminationMap: make(map[string]string),
		killPod:                 kl.killPod,
	}
}
// IsMirrorPodPendingTerminationByUID reports whether the given uid currently
// has an entry in the mirror pod termination map, i.e. the pod was marked as
// pending termination and has not been marked terminated since.
func (pk *podKillerWithChannel) IsMirrorPodPendingTerminationByUID(uid types.UID) bool {
	pk.podKillingLock.Lock()
	defer pk.podKillingLock.Unlock()

	_, pending := pk.mirrorPodTerminationMap[string(uid)]
	return pending
}
// IsMirrorPodPendingTerminationByPodName reports whether any entry of the
// mirror pod termination map carries the given pod full name. The map is
// keyed by UID, so this is a linear scan over its values.
func (pk *podKillerWithChannel) IsMirrorPodPendingTerminationByPodName(podFullname string) bool {
	pk.podKillingLock.Lock()
	defer pk.podKillingLock.Unlock()

	pending := false
	for _, fullname := range pk.mirrorPodTerminationMap {
		if fullname == podFullname {
			pending = true
			break
		}
	}
	return pending
}
// markMirrorPodTerminated removes the given uid from the mirror pod
// termination map, so the pod is no longer reported as pending termination.
// Removing a uid that is not present is a no-op.
func (pk *podKillerWithChannel) markMirrorPodTerminated(uid string) {
	pk.podKillingLock.Lock()
	defer pk.podKillingLock.Unlock()

	klog.V(4).Infof("marking pod termination %q", uid)
	delete(pk.mirrorPodTerminationMap, uid)
}
// MarkMirrorPodPendingTermination records the pod as pending termination by
// storing its full name in the mirror pod termination map under its UID.
func (pk *podKillerWithChannel) MarkMirrorPodPendingTermination(pod *v1.Pod) {
	podFullname := kubecontainer.GetPodFullName(pod)
	uid := string(pod.UID)
	klog.V(3).Infof("marking pod pending termination %q", uid)

	pk.podKillingLock.Lock()
	defer pk.podKillingLock.Unlock()
	pk.mirrorPodTerminationMap[uid] = podFullname
}
// Close closes the pod killing channel, which ends the receive loop in
// PerformPodKillingWork once the already-queued requests have been read.
func (pk *podKillerWithChannel) Close() {
	requests := pk.podKillingCh
	close(requests)
}
// KillPod queues a pod killing request by sending the pair on the pod killing
// channel. The send blocks while the channel buffer is full.
func (pk *podKillerWithChannel) KillPod(pair *kubecontainer.PodPair) {
	requests := pk.podKillingCh
	requests <- pair
}
// PerformPodKillingWork launches a goroutine to kill a pod received from the channel if
2019-01-12 04:58:27 +00:00
// another goroutine isn't already in action.
2020-08-10 17:43:49 +00:00
func ( pk * podKillerWithChannel ) PerformPodKillingWork ( ) {
2019-01-12 04:58:27 +00:00
killing := sets . NewString ( )
// guard for the killing set
lock := sync . Mutex { }
2020-08-10 17:43:49 +00:00
for podPair := range pk . podKillingCh {
2019-01-12 04:58:27 +00:00
runningPod := podPair . RunningPod
apiPod := podPair . APIPod
lock . Lock ( )
exists := killing . Has ( string ( runningPod . ID ) )
if ! exists {
killing . Insert ( string ( runningPod . ID ) )
}
lock . Unlock ( )
if ! exists {
go func ( apiPod * v1 . Pod , runningPod * kubecontainer . Pod ) {
klog . V ( 2 ) . Infof ( "Killing unwanted pod %q" , runningPod . Name )
2020-08-10 17:43:49 +00:00
err := pk . killPod ( apiPod , runningPod , nil , nil )
2019-01-12 04:58:27 +00:00
if err != nil {
klog . Errorf ( "Failed killing the pod %q: %v" , runningPod . Name , err )
}
lock . Lock ( )
killing . Delete ( string ( runningPod . ID ) )
lock . Unlock ( )
2020-08-10 17:43:49 +00:00
pk . markMirrorPodTerminated ( string ( runningPod . ID ) )
2019-01-12 04:58:27 +00:00
} ( apiPod , runningPod )
}
}
}
// validateContainerLogStatus returns the container ID for the desired container to retrieve logs for, based on the state
// of the container. The previous flag will only return the logs for the last terminated container, otherwise, the current
// running container is preferred over a previous termination. If info about the container is not available then a specific
// error is returned to the end user.
func ( kl * Kubelet ) validateContainerLogStatus ( podName string , podStatus * v1 . PodStatus , containerName string , previous bool ) ( containerID kubecontainer . ContainerID , err error ) {
var cID string
cStatus , found := podutil . GetContainerStatus ( podStatus . ContainerStatuses , containerName )
if ! found {
cStatus , found = podutil . GetContainerStatus ( podStatus . InitContainerStatuses , containerName )
}
2019-09-27 21:51:53 +00:00
if ! found && utilfeature . DefaultFeatureGate . Enabled ( features . EphemeralContainers ) {
cStatus , found = podutil . GetContainerStatus ( podStatus . EphemeralContainerStatuses , containerName )
}
2019-01-12 04:58:27 +00:00
if ! found {
return kubecontainer . ContainerID { } , fmt . Errorf ( "container %q in pod %q is not available" , containerName , podName )
}
lastState := cStatus . LastTerminationState
waiting , running , terminated := cStatus . State . Waiting , cStatus . State . Running , cStatus . State . Terminated
switch {
case previous :
if lastState . Terminated == nil || lastState . Terminated . ContainerID == "" {
return kubecontainer . ContainerID { } , fmt . Errorf ( "previous terminated container %q in pod %q not found" , containerName , podName )
}
cID = lastState . Terminated . ContainerID
case running != nil :
cID = cStatus . ContainerID
case terminated != nil :
// in cases where the next container didn't start, terminated.ContainerID will be empty, so get logs from the lastState.Terminated.
if terminated . ContainerID == "" {
if lastState . Terminated != nil && lastState . Terminated . ContainerID != "" {
cID = lastState . Terminated . ContainerID
} else {
return kubecontainer . ContainerID { } , fmt . Errorf ( "container %q in pod %q is terminated" , containerName , podName )
}
} else {
cID = terminated . ContainerID
}
case lastState . Terminated != nil :
if lastState . Terminated . ContainerID == "" {
return kubecontainer . ContainerID { } , fmt . Errorf ( "container %q in pod %q is terminated" , containerName , podName )
}
cID = lastState . Terminated . ContainerID
case waiting != nil :
// output some info for the most common pending failures
switch reason := waiting . Reason ; reason {
case images . ErrImagePull . Error ( ) :
return kubecontainer . ContainerID { } , fmt . Errorf ( "container %q in pod %q is waiting to start: image can't be pulled" , containerName , podName )
case images . ErrImagePullBackOff . Error ( ) :
return kubecontainer . ContainerID { } , fmt . Errorf ( "container %q in pod %q is waiting to start: trying and failing to pull image" , containerName , podName )
default :
return kubecontainer . ContainerID { } , fmt . Errorf ( "container %q in pod %q is waiting to start: %v" , containerName , podName , reason )
}
default :
// unrecognized state
return kubecontainer . ContainerID { } , fmt . Errorf ( "container %q in pod %q is waiting to start - no logs yet" , containerName , podName )
}
return kubecontainer . ParseContainerID ( cID ) , nil
}
// GetKubeletContainerLogs streams the logs of the named container of the pod
// identified by podFullName to stdout and stderr.
//
// The pod status used to pick the container comes from the status manager
// (keyed by the mirror pod's UID when the pod has a mirror pod) and falls back
// to pod.Status when nothing is cached.
// TODO: this method is returning logs of random container attempts, when it should be returning the most recent attempt
// or all of them.
func (kl *Kubelet) GetKubeletContainerLogs(ctx context.Context, podFullName, containerName string, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) error {
	name, namespace, parseErr := kubecontainer.ParsePodFullName(podFullName)
	if parseErr != nil {
		return fmt.Errorf("unable to parse pod full name %q: %v", podFullName, parseErr)
	}

	pod, exists := kl.GetPodByName(namespace, name)
	if !exists {
		return fmt.Errorf("pod %q cannot be found - no logs available", name)
	}

	statusUID := pod.UID
	if mirror, hasMirror := kl.podManager.GetMirrorPodByPod(pod); hasMirror {
		statusUID = mirror.UID
	}

	// Pod workers periodically write status to statusManager. If status is not
	// cached there, something is wrong (or kubelet just restarted and hasn't
	// caught up yet); fall back to the status from the apiserver.
	podStatus, cached := kl.statusManager.GetPodStatus(statusUID)
	if !cached {
		podStatus = pod.Status
	}

	// TODO: Consolidate the logic here with kuberuntime.GetContainerLogs, here we convert container name to containerID,
	// but inside kuberuntime we convert container id back to container name and restart count.
	// TODO: After separate container log lifecycle management, we should get log based on the existing log files
	// instead of container status.
	containerID, statusErr := kl.validateContainerLogStatus(pod.Name, &podStatus, containerName, logOptions.Previous)
	if statusErr != nil {
		return statusErr
	}

	// Do a zero-byte write to stdout before handing off to the container runtime.
	// This ensures at least one Write call is made to the writer when copying starts,
	// even if we then block waiting for log output from the container.
	if _, writeErr := stdout.Write([]byte{}); writeErr != nil {
		return writeErr
	}

	if kl.dockerLegacyService == nil {
		return kl.containerRuntime.GetContainerLogs(ctx, pod, containerID, logOptions, stdout, stderr)
	}
	// dockerLegacyService should only be non-nil when we actually need it, so
	// inject it into the runtimeService.
	// TODO(random-liu): Remove this hack after deprecating unsupported log driver.
	return kl.dockerLegacyService.GetContainerLogs(ctx, pod, containerID, logOptions, stdout, stderr)
}
// getPhase returns the phase of a pod given its container info.
func getPhase ( spec * v1 . PodSpec , info [ ] v1 . ContainerStatus ) v1 . PodPhase {
pendingInitialization := 0
failedInitialization := 0
for _ , container := range spec . InitContainers {
containerStatus , ok := podutil . GetContainerStatus ( info , container . Name )
if ! ok {
pendingInitialization ++
continue
}
switch {
case containerStatus . State . Running != nil :
pendingInitialization ++
case containerStatus . State . Terminated != nil :
2019-08-30 18:33:25 +00:00
if containerStatus . State . Terminated . ExitCode != 0 {
2019-01-12 04:58:27 +00:00
failedInitialization ++
}
case containerStatus . State . Waiting != nil :
if containerStatus . LastTerminationState . Terminated != nil {
2019-08-30 18:33:25 +00:00
if containerStatus . LastTerminationState . Terminated . ExitCode != 0 {
2019-01-12 04:58:27 +00:00
failedInitialization ++
}
} else {
pendingInitialization ++
}
default :
pendingInitialization ++
}
}
unknown := 0
running := 0
waiting := 0
stopped := 0
succeeded := 0
for _ , container := range spec . Containers {
containerStatus , ok := podutil . GetContainerStatus ( info , container . Name )
if ! ok {
unknown ++
continue
}
switch {
case containerStatus . State . Running != nil :
running ++
case containerStatus . State . Terminated != nil :
stopped ++
if containerStatus . State . Terminated . ExitCode == 0 {
succeeded ++
}
case containerStatus . State . Waiting != nil :
if containerStatus . LastTerminationState . Terminated != nil {
stopped ++
} else {
waiting ++
}
default :
unknown ++
}
}
if failedInitialization > 0 && spec . RestartPolicy == v1 . RestartPolicyNever {
return v1 . PodFailed
}
switch {
case pendingInitialization > 0 :
fallthrough
case waiting > 0 :
klog . V ( 5 ) . Infof ( "pod waiting > 0, pending" )
// One or more containers has not been started
return v1 . PodPending
case running > 0 && unknown == 0 :
// All containers have been started, and at least
// one container is running
return v1 . PodRunning
case running == 0 && stopped > 0 && unknown == 0 :
// All containers are terminated
if spec . RestartPolicy == v1 . RestartPolicyAlways {
// All containers are in the process of restarting
return v1 . PodRunning
}
if stopped == succeeded {
// RestartPolicy is not Always, and all
// containers are terminated in success
return v1 . PodSucceeded
}
if spec . RestartPolicy == v1 . RestartPolicyNever {
// RestartPolicy is Never, and all containers are
// terminated with at least one in failure
return v1 . PodFailed
}
// RestartPolicy is OnFailure, and at least one in failure
// and in the process of restarting
return v1 . PodRunning
default :
klog . V ( 5 ) . Infof ( "pod default case, pending" )
return v1 . PodPending
}
}
// generateAPIPodStatus creates the final API pod status for a pod, given the
// internal pod status.
func ( kl * Kubelet ) generateAPIPodStatus ( pod * v1 . Pod , podStatus * kubecontainer . PodStatus ) v1 . PodStatus {
klog . V ( 3 ) . Infof ( "Generating status for %q" , format . Pod ( pod ) )
s := kl . convertStatusToAPIStatus ( pod , podStatus )
// check if an internal module has requested the pod is evicted.
for _ , podSyncHandler := range kl . PodSyncHandlers {
if result := podSyncHandler . ShouldEvict ( pod ) ; result . Evict {
s . Phase = v1 . PodFailed
s . Reason = result . Reason
s . Message = result . Message
return * s
}
}
// Assume info is ready to process
spec := & pod . Spec
allStatus := append ( append ( [ ] v1 . ContainerStatus { } , s . ContainerStatuses ... ) , s . InitContainerStatuses ... )
s . Phase = getPhase ( spec , allStatus )
// Check for illegal phase transition
if pod . Status . Phase == v1 . PodFailed || pod . Status . Phase == v1 . PodSucceeded {
// API server shows terminal phase; transitions are not allowed
if s . Phase != pod . Status . Phase {
klog . Errorf ( "Pod attempted illegal phase transition from %s to %s: %v" , pod . Status . Phase , s . Phase , s )
// Force back to phase from the API server
s . Phase = pod . Status . Phase
}
}
kl . probeManager . UpdatePodStatus ( pod . UID , s )
s . Conditions = append ( s . Conditions , status . GeneratePodInitializedCondition ( spec , s . InitContainerStatuses , s . Phase ) )
s . Conditions = append ( s . Conditions , status . GeneratePodReadyCondition ( spec , s . Conditions , s . ContainerStatuses , s . Phase ) )
s . Conditions = append ( s . Conditions , status . GenerateContainersReadyCondition ( spec , s . ContainerStatuses , s . Phase ) )
// Status manager will take care of the LastTransitionTimestamp, either preserve
// the timestamp from apiserver, or set a new one. When kubelet sees the pod,
// `PodScheduled` condition must be true.
s . Conditions = append ( s . Conditions , v1 . PodCondition {
Type : v1 . PodScheduled ,
Status : v1 . ConditionTrue ,
} )
if kl . kubeClient != nil {
2020-12-01 01:06:26 +00:00
hostIPs , err := kl . getHostIPsAnyWay ( )
2019-01-12 04:58:27 +00:00
if err != nil {
2020-12-01 01:06:26 +00:00
klog . V ( 4 ) . Infof ( "Cannot get host IPs: %v" , err )
2019-01-12 04:58:27 +00:00
} else {
2020-12-01 01:06:26 +00:00
s . HostIP = hostIPs [ 0 ] . String ( )
2019-01-12 04:58:27 +00:00
if kubecontainer . IsHostNetworkPod ( pod ) && s . PodIP == "" {
2020-12-01 01:06:26 +00:00
s . PodIP = hostIPs [ 0 ] . String ( )
2020-03-26 21:07:15 +00:00
s . PodIPs = [ ] v1 . PodIP { { IP : s . PodIP } }
2020-12-01 01:06:26 +00:00
if utilfeature . DefaultFeatureGate . Enabled ( features . IPv6DualStack ) && len ( hostIPs ) == 2 {
s . PodIPs = append ( s . PodIPs , v1 . PodIP { IP : hostIPs [ 1 ] . String ( ) } )
}
2019-01-12 04:58:27 +00:00
}
}
}
return * s
}
// convertStatusToAPIStatus creates an api PodStatus for the given pod from
// the given internal pod status. It is purely transformative and does not
// alter the kubelet state at all.
func ( kl * Kubelet ) convertStatusToAPIStatus ( pod * v1 . Pod , podStatus * kubecontainer . PodStatus ) * v1 . PodStatus {
var apiPodStatus v1 . PodStatus
2019-09-27 21:51:53 +00:00
apiPodStatus . PodIPs = make ( [ ] v1 . PodIP , 0 , len ( podStatus . IPs ) )
for _ , ip := range podStatus . IPs {
apiPodStatus . PodIPs = append ( apiPodStatus . PodIPs , v1 . PodIP {
IP : ip ,
} )
}
if len ( apiPodStatus . PodIPs ) > 0 {
apiPodStatus . PodIP = apiPodStatus . PodIPs [ 0 ] . IP
}
2019-01-12 04:58:27 +00:00
// set status for Pods created on versions of kube older than 1.6
apiPodStatus . QOSClass = v1qos . GetPodQOS ( pod )
oldPodStatus , found := kl . statusManager . GetPodStatus ( pod . UID )
if ! found {
oldPodStatus = pod . Status
}
apiPodStatus . ContainerStatuses = kl . convertToAPIContainerStatuses (
pod , podStatus ,
oldPodStatus . ContainerStatuses ,
pod . Spec . Containers ,
len ( pod . Spec . InitContainers ) > 0 ,
false ,
)
apiPodStatus . InitContainerStatuses = kl . convertToAPIContainerStatuses (
pod , podStatus ,
oldPodStatus . InitContainerStatuses ,
pod . Spec . InitContainers ,
len ( pod . Spec . InitContainers ) > 0 ,
true ,
)
2019-09-27 21:51:53 +00:00
if utilfeature . DefaultFeatureGate . Enabled ( features . EphemeralContainers ) {
var ecSpecs [ ] v1 . Container
for i := range pod . Spec . EphemeralContainers {
ecSpecs = append ( ecSpecs , v1 . Container ( pod . Spec . EphemeralContainers [ i ] . EphemeralContainerCommon ) )
}
// #80875: By now we've iterated podStatus 3 times. We could refactor this to make a single
// pass through podStatus.ContainerStatuses
apiPodStatus . EphemeralContainerStatuses = kl . convertToAPIContainerStatuses (
pod , podStatus ,
oldPodStatus . EphemeralContainerStatuses ,
ecSpecs ,
len ( pod . Spec . InitContainers ) > 0 ,
false ,
)
}
2019-01-12 04:58:27 +00:00
// Preserves conditions not controlled by kubelet
for _ , c := range pod . Status . Conditions {
if ! kubetypes . PodConditionByKubelet ( c . Type ) {
apiPodStatus . Conditions = append ( apiPodStatus . Conditions , c )
}
}
2019-09-27 21:51:53 +00:00
2019-01-12 04:58:27 +00:00
return & apiPodStatus
}
// convertToAPIContainerStatuses converts the given internal container
// statuses into API container statuses.
//
// Parameters:
//   - pod: the API pod the containers belong to.
//   - podStatus: the internal (runtime) pod status whose ContainerStatuses are
//     converted. Note that podStatus.ContainerStatuses is sorted in place.
//   - previousStatus: the previously known API statuses, indexed by container
//     name and used as defaults / history.
//   - containers: the container specs to produce a status for; exactly one
//     status per entry is returned.
//   - hasInitContainers: selects the default waiting reason ("PodInitializing"
//     instead of "ContainerCreating").
//   - isInitContainer: true when `containers` are init containers; this changes
//     the restart handling and the ordering of the result.
//
// The function works in several passes:
//  1. seed every container with a default Waiting status (or its previous
//     terminated status);
//  2. for containers the runtime no longer reports, that were Running before
//     and whose pod is being deleted, record a synthetic last termination;
//  3. overlay the statuses reported by the runtime (newest first; the newest
//     becomes the current state, the second newest the last termination state);
//  4. mark containers that should be restarted and have a cached start-failure
//     reason as Waiting with that reason;
//  5. return the statuses in a deterministic order.
func ( kl * Kubelet ) convertToAPIContainerStatuses ( pod * v1 . Pod , podStatus * kubecontainer . PodStatus , previousStatus [ ] v1 . ContainerStatus , containers [ ] v1 . Container , hasInitContainers , isInitContainer bool ) [ ] v1 . ContainerStatus {
2020-12-01 01:06:26 +00:00
// convertContainerStatus maps a single runtime status (cs) to an API status.
// oldStatus is the previous API status for the same container name, or nil
// when there is none; it is only consulted for the "unknown" runtime state.
convertContainerStatus := func ( cs * kubecontainer . Status , oldStatus * v1 . ContainerStatus ) * v1 . ContainerStatus {
2019-01-12 04:58:27 +00:00
cid := cs . ID . String ( )
status := & v1 . ContainerStatus {
Name : cs . Name ,
RestartCount : int32 ( cs . RestartCount ) ,
Image : cs . Image ,
ImageID : cs . ImageID ,
ContainerID : cid ,
}
2020-12-01 01:06:26 +00:00
switch {
case cs . State == kubecontainer . ContainerStateRunning :
2019-01-12 04:58:27 +00:00
status . State . Running = & v1 . ContainerStateRunning { StartedAt : metav1 . NewTime ( cs . StartedAt ) }
2020-12-01 01:06:26 +00:00
case cs . State == kubecontainer . ContainerStateCreated :
2019-01-12 04:58:27 +00:00
// Treat containers in the "created" state as if they are exited.
// The pod workers are supposed to start all containers they create in
// one sync (syncPod) iteration. There should not be any normal
// "created" containers when the pod worker generates the status at
// the beginning of a sync iteration.
fallthrough
2020-12-01 01:06:26 +00:00
case cs . State == kubecontainer . ContainerStateExited :
2019-01-12 04:58:27 +00:00
status . State . Terminated = & v1 . ContainerStateTerminated {
ExitCode : int32 ( cs . ExitCode ) ,
Reason : cs . Reason ,
Message : cs . Message ,
StartedAt : metav1 . NewTime ( cs . StartedAt ) ,
FinishedAt : metav1 . NewTime ( cs . FinishedAt ) ,
ContainerID : cid ,
}
2020-12-01 01:06:26 +00:00
case cs . State == kubecontainer . ContainerStateUnknown &&
oldStatus != nil && // we have an old status
oldStatus . State . Running != nil : // our previous status was running
// if this happens, then we know that this container was previously running and isn't anymore (assuming the CRI isn't failing to return running containers).
// you can imagine this happening in cases where a container failed and the kubelet didn't ask about it in time to see the result.
// in this case, the container should not go into waiting state immediately because that can make cases like runonce pods actually run
// twice. "container never ran" is different than "container ran and failed". This is handled differently in the kubelet
// and it is handled differently in higher order logic like crashloop detection and handling
status . State . Terminated = & v1 . ContainerStateTerminated {
Reason : "ContainerStatusUnknown" ,
Message : "The container could not be located when the pod was terminated" ,
ExitCode : 137 , // this code indicates an error
}
// the restart count normally comes from the CRI (see near the top of this method), but since this is being added explicitly
// for the case where the CRI did not return a status, we need to manually increment the restart count to be accurate.
status . RestartCount = oldStatus . RestartCount + 1
2019-01-12 04:58:27 +00:00
default :
2020-12-01 01:06:26 +00:00
// this collapses any unknown state to container waiting. If any container is waiting, then the pod status moves to pending even if it is running.
// if I'm reading this correctly, then any failure to read status on any container results in the entire pod going pending even if the containers
// are actually running.
// see https://github.com/kubernetes/kubernetes/blob/5d1b3e26af73dde33ecb6a3e69fb5876ceab192f/pkg/kubelet/kuberuntime/kuberuntime_container.go#L497 to
// https://github.com/kubernetes/kubernetes/blob/8976e3620f8963e72084971d9d4decbd026bf49f/pkg/kubelet/kuberuntime/helpers.go#L58-L71
// and interpreted here https://github.com/kubernetes/kubernetes/blob/b27e78f590a0d43e4a23ca3b2bf1739ca4c6e109/pkg/kubelet/kubelet_pods.go#L1434-L1439
2019-01-12 04:58:27 +00:00
status . State . Waiting = & v1 . ContainerStateWaiting { }
}
return status
}
// Fetch old container statuses from the old pod status, keyed by container name.
oldStatuses := make ( map [ string ] v1 . ContainerStatus , len ( containers ) )
for _ , status := range previousStatus {
oldStatuses [ status . Name ] = status
}
// Set all container statuses to the default waiting state. The reason is
// "PodInitializing" when the pod has init containers, "ContainerCreating" otherwise.
statuses := make ( map [ string ] * v1 . ContainerStatus , len ( containers ) )
defaultWaitingState := v1 . ContainerState { Waiting : & v1 . ContainerStateWaiting { Reason : "ContainerCreating" } }
if hasInitContainers {
defaultWaitingState = v1 . ContainerState { Waiting : & v1 . ContainerStateWaiting { Reason : "PodInitializing" } }
}
for _ , container := range containers {
status := & v1 . ContainerStatus {
Name : container . Name ,
Image : container . Image ,
State : defaultWaitingState ,
}
oldStatus , found := oldStatuses [ container . Name ]
if found {
if oldStatus . State . Terminated != nil {
// Do not update status on terminated init containers as
// they may be removed at any time.
// NOTE(review): this branch is taken for any container kind passed in,
// not only init containers.
status = & oldStatus
} else {
// Apply some values from the old statuses as the default values.
status . RestartCount = oldStatus . RestartCount
status . LastTerminationState = oldStatus . LastTerminationState
}
}
statuses [ container . Name ] = status
}
2020-12-01 01:06:26 +00:00
// Second pass: handle containers for which the runtime returned no status at all.
for _ , container := range containers {
found := false
for _ , cStatus := range podStatus . ContainerStatuses {
if container . Name == cStatus . Name {
found = true
break
}
}
if found {
// The runtime reported this container; it is handled by the overlay pass below.
continue
}
// if no container is found, then assuming it should be waiting seems plausible, but the status code requires
// that a previous termination be present. If we're offline long enough (or something removed the container?), then
// the previous termination may not be present. This next code block ensures that if the container was previously running
// then when that container status disappears, we can infer that it terminated even if we don't know the status code.
// By setting the lasttermination state we are able to leave the container status waiting and present more accurate
// data via the API.
oldStatus , ok := oldStatuses [ container . Name ]
if ! ok {
continue
}
if oldStatus . State . Terminated != nil {
// if the old container status was terminated, the lasttermination status is correct
continue
}
if oldStatus . State . Running == nil {
// if the old container status isn't running, then waiting is an appropriate status and we have nothing to do
continue
}
if pod . DeletionTimestamp == nil {
// the pod is not being deleted, so the default waiting status is left untouched
continue
}
// and if the pod itself is being deleted, then the CRI may have removed the container already and for whatever reason the kubelet missed the exit code
// (this seems not awesome). We know at this point that we will not be restarting the container.
status := statuses [ container . Name ]
// if the status we're about to write indicates the default, the Waiting status will force this pod back into Pending.
// That isn't true, we know the pod is going away.
isDefaultWaitingStatus := status . State . Waiting != nil && status . State . Waiting . Reason == "ContainerCreating"
if hasInitContainers {
isDefaultWaitingStatus = status . State . Waiting != nil && status . State . Waiting . Reason == "PodInitializing"
}
if ! isDefaultWaitingStatus {
// the status was already written with something other than the default, don't override
continue
}
if status . LastTerminationState . Terminated != nil {
// if we already have a termination state, nothing to do
continue
}
// setting this value ensures that we show as stopped here, not as waiting:
// https://github.com/kubernetes/kubernetes/blob/90c9f7b3e198e82a756a68ffeac978a00d606e55/pkg/kubelet/kubelet_pods.go#L1440-L1445
// This prevents the pod from becoming pending
status . LastTerminationState . Terminated = & v1 . ContainerStateTerminated {
Reason : "ContainerStatusUnknown" ,
Message : "The container could not be located when the pod was deleted. The container used to be Running" ,
ExitCode : 137 ,
}
statuses [ container . Name ] = status
}
2019-01-12 04:58:27 +00:00
// Make the latest container status come first.
sort . Sort ( sort . Reverse ( kubecontainer . SortContainerStatusesByCreationTime ( podStatus . ContainerStatuses ) ) )
// Set container statuses according to the statuses seen in pod status.
// containerSeen counts how many runtime statuses were consumed per container
// name: the first (newest) becomes the current status, the second becomes the
// LastTerminationState, and any further ones are ignored.
containerSeen := map [ string ] int { }
for _ , cStatus := range podStatus . ContainerStatuses {
cName := cStatus . Name
if _ , ok := statuses [ cName ] ; ! ok {
// This would also ignore the infra container.
continue
}
if containerSeen [ cName ] >= 2 {
continue
}
2020-12-01 01:06:26 +00:00
// Pass the previous API status (if any) so an "unknown" runtime state can be
// interpreted relative to what was last reported.
var oldStatusPtr * v1 . ContainerStatus
if oldStatus , ok := oldStatuses [ cName ] ; ok {
oldStatusPtr = & oldStatus
}
status := convertContainerStatus ( cStatus , oldStatusPtr )
2019-01-12 04:58:27 +00:00
if containerSeen [ cName ] == 0 {
statuses [ cName ] = status
} else {
statuses [ cName ] . LastTerminationState = status . State
}
containerSeen [ cName ] = containerSeen [ cName ] + 1
}
// Handle the containers failed to be started, which should be in Waiting state.
for _ , container := range containers {
if isInitContainer {
// If the init container is terminated with exit code 0, it won't be restarted.
// TODO(random-liu): Handle this in a cleaner way.
s := podStatus . FindContainerStatusByName ( container . Name )
if s != nil && s . State == kubecontainer . ContainerStateExited && s . ExitCode == 0 {
continue
}
}
// If a container should be restarted in next syncpod, it is *Waiting*.
if ! kubecontainer . ShouldContainerBeRestarted ( & container , pod , podStatus ) {
continue
}
status := statuses [ container . Name ]
reason , ok := kl . reasonCache . Get ( pod . UID , container . Name )
if ! ok {
// In fact, we could also apply Waiting state here, but it is less informative,
// and the container will be restarted soon, so we prefer the original state here.
// Note that with the current implementation of ShouldContainerBeRestarted the original state here
// could be:
// * Waiting: There is no associated historical container and start failure reason record.
// * Terminated: The container is terminated.
continue
}
// Keep the terminated state as history before replacing the current state
// with Waiting carrying the cached start-failure reason.
if status . State . Terminated != nil {
status . LastTerminationState = status . State
}
status . State = v1 . ContainerState {
Waiting : & v1 . ContainerStateWaiting {
Reason : reason . Err . Error ( ) ,
Message : reason . Message ,
} ,
}
statuses [ container . Name ] = status
}
2020-08-10 17:43:49 +00:00
// Sort the container statuses since clients of this interface expect the list
// of containers in a pod has a deterministic order.
if isInitContainer {
return kubetypes . SortStatusesOfInitContainers ( pod , statuses )
}
2019-01-12 04:58:27 +00:00
// Flatten the map (whose iteration order is random) and sort the result.
var containerStatuses [ ] v1 . ContainerStatus
for _ , status := range statuses {
containerStatuses = append ( containerStatuses , * status )
}
2020-08-10 17:43:49 +00:00
sort . Sort ( kubetypes . SortedContainerStatuses ( containerStatuses ) )
2019-01-12 04:58:27 +00:00
return containerStatuses
}
// ServeLogs answers req with logs of the current machine by handing the
// request over to the kubelet's log server, which writes the response to w.
func (kl *Kubelet) ServeLogs(w http.ResponseWriter, req *http.Request) {
	// TODO: whitelist logs we are willing to serve
	logHandler := kl.logServer
	logHandler.ServeHTTP(w, req)
}
// findContainer looks up the container called containerName in the pod
// identified by podFullName and podUID, using the pods currently known to the
// container runtime.
//
// It returns an error only when the runtime's pod list cannot be retrieved;
// otherwise it returns whatever FindContainerByName yields for the looked-up
// pod (nil if not found).
func (kl *Kubelet) findContainer(podFullName string, podUID types.UID, containerName string) (*kubecontainer.Container, error) {
	runtimePods, err := kl.containerRuntime.GetPods(false)
	if err != nil {
		return nil, err
	}

	// Resolve and type convert back again.
	// We need the static pod UID but the kubecontainer API works with types.UID.
	resolvedUID := types.UID(kl.podManager.TranslatePodUID(podUID))

	match := kubecontainer.Pods(runtimePods).FindPod(podFullName, resolvedUID)
	return match.FindContainerByName(containerName), nil
}
// RunInContainer executes cmd inside the named container of the given pod and
// returns the combined stdout and stderr as a byte slice.
//
// An error is returned when the container lookup fails, when no such
// container exists, or when the runner itself reports an error.
func (kl *Kubelet) RunInContainer(podFullName string, podUID types.UID, containerName string, cmd []string) ([]byte, error) {
	target, err := kl.findContainer(podFullName, podUID, containerName)
	switch {
	case err != nil:
		return nil, err
	case target == nil:
		return nil, fmt.Errorf("container not found (%q)", containerName)
	}

	// TODO(tallclair): Pass a proper timeout value.
	return kl.runner.RunInContainer(target.ID, cmd, 0)
}
// GetExec returns the URL from which an exec session for cmd in the named
// container will be served, or nil if the Kubelet will serve it.
//
// The stdin/stdout/stderr/TTY choices are taken from streamOpts. An error is
// returned when the container lookup fails or the container does not exist.
func (kl *Kubelet) GetExec(podFullName string, podUID types.UID, containerName string, cmd []string, streamOpts remotecommandserver.Options) (*url.URL, error) {
	target, err := kl.findContainer(podFullName, podUID, containerName)
	switch {
	case err != nil:
		return nil, err
	case target == nil:
		return nil, fmt.Errorf("container not found (%q)", containerName)
	}

	return kl.streamingRuntime.GetExec(
		target.ID,
		cmd,
		streamOpts.Stdin,
		streamOpts.Stdout,
		streamOpts.Stderr,
		streamOpts.TTY,
	)
}
// GetAttach returns the URL from which an attach session to the named
// container will be served, or nil if the Kubelet will serve it.
//
// Unlike exec, the TTY flag is not taken from streamOpts but from the
// container's spec in the API pod. An error is returned when the running
// container, the API pod (optionally matched by podUID) or the container spec
// cannot be found.
func (kl *Kubelet) GetAttach(podFullName string, podUID types.UID, containerName string, streamOpts remotecommandserver.Options) (*url.URL, error) {
	target, err := kl.findContainer(podFullName, podUID, containerName)
	if err != nil {
		return nil, err
	}
	if target == nil {
		return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName)
	}

	// The TTY setting for attach must match the TTY setting in the initial container configuration,
	// since whether the process is running in a TTY cannot be changed after it has started. We
	// need the api.Pod to get the TTY status.
	apiPod, known := kl.GetPodByFullName(podFullName)
	if !known {
		return nil, fmt.Errorf("pod %s not found", podFullName)
	}
	// A non-empty podUID must match the pod that currently owns the full name.
	if podUID != "" && apiPod.UID != podUID {
		return nil, fmt.Errorf("pod %s not found", podFullName)
	}

	spec := kubecontainer.GetContainerSpec(apiPod, containerName)
	if spec == nil {
		return nil, fmt.Errorf("container %s not found in pod %s", containerName, podFullName)
	}

	return kl.streamingRuntime.GetAttach(
		target.ID,
		streamOpts.Stdin,
		streamOpts.Stdout,
		streamOpts.Stderr,
		spec.TTY,
	)
}
// GetPortForward returns the URL from which a port-forward session to the
// given pod will be served, or nil if the Kubelet will serve it.
//
// The pod must be present in the runtime's pod list; otherwise a "pod not
// found" error is returned. The ports to forward come from portForwardOpts.
func (kl *Kubelet) GetPortForward(podName, podNamespace string, podUID types.UID, portForwardOpts portforward.V4Options) (*url.URL, error) {
	runtimePods, err := kl.containerRuntime.GetPods(false)
	if err != nil {
		return nil, err
	}

	// Resolve and type convert back again.
	// We need the static pod UID but the kubecontainer API works with types.UID.
	podUID = types.UID(kl.podManager.TranslatePodUID(podUID))

	fullName := kubecontainer.BuildPodFullName(podName, podNamespace)
	match := kubecontainer.Pods(runtimePods).FindPod(fullName, podUID)
	if match.IsEmpty() {
		return nil, fmt.Errorf("pod not found (%q)", fullName)
	}

	return kl.streamingRuntime.GetPortForward(podName, podNamespace, podUID, portForwardOpts.Ports)
}
// cleanupOrphanedPodCgroups removes cgroups that should no longer exist.
// it reconciles the cached state of cgroupPods with the specified list of runningPods
2020-08-10 17:43:49 +00:00
func ( kl * Kubelet ) cleanupOrphanedPodCgroups ( pcm cm . PodContainerManager , cgroupPods map [ types . UID ] cm . CgroupName , activePods [ ] * v1 . Pod ) {
2019-01-12 04:58:27 +00:00
// Add all running pods to the set that we want to preserve
podSet := sets . NewString ( )
for _ , pod := range activePods {
podSet . Insert ( string ( pod . UID ) )
}
// Iterate over all the found pods to verify if they should be running
for uid , val := range cgroupPods {
// if the pod is in the running set, its not a candidate for cleanup
if podSet . Has ( string ( uid ) ) {
continue
}
2020-08-10 17:43:49 +00:00
// if the pod is within termination grace period, we shouldn't cleanup the underlying cgroup
if kl . podKiller . IsMirrorPodPendingTerminationByUID ( uid ) {
klog . V ( 3 ) . Infof ( "pod %q is pending termination" , uid )
continue
}
2019-01-12 04:58:27 +00:00
// If volumes have not been unmounted/detached, do not delete the cgroup
// so any memory backed volumes don't have their charges propagated to the
// parent croup. If the volumes still exist, reduce the cpu shares for any
// process in the cgroup to the minimum value while we wait. if the kubelet
// is configured to keep terminated volumes, we will delete the cgroup and not block.
if podVolumesExist := kl . podVolumesExist ( uid ) ; podVolumesExist && ! kl . keepTerminatedPodVolumes {
klog . V ( 3 ) . Infof ( "Orphaned pod %q found, but volumes not yet removed. Reducing cpu to minimum" , uid )
if err := pcm . ReduceCPULimits ( val ) ; err != nil {
klog . Warningf ( "Failed to reduce cpu time for pod %q pending volume cleanup due to %v" , uid , err )
}
continue
}
klog . V ( 3 ) . Infof ( "Orphaned pod %q found, removing pod cgroups" , uid )
// Destroy all cgroups of pod that should not be running,
// by first killing all the attached processes to these cgroups.
// We ignore errors thrown by the method, as the housekeeping loop would
// again try to delete these unwanted pod cgroups
go pcm . Destroy ( val )
}
}
// enableHostUserNamespace reports whether the container runtime should use
// the host user namespace for pod. That is the case when the pod contains a
// privileged container, uses a host namespace, has a host path volume,
// requests a non-namespaced capability, or references a PVC that is backed by
// a host path volume.
//
// NOTE: if a container shares any namespace with another container it must also share the user namespace
// or it will not have the correct capabilities in the namespace. This means that host user namespace
// is enabled per pod, not per container.
func (kl *Kubelet) enableHostUserNamespace(pod *v1.Pod) bool {
	// The checks short-circuit in this order; the PVC check comes last.
	return kubecontainer.HasPrivilegedContainer(pod) ||
		hasHostNamespace(pod) ||
		hasHostVolume(pod) ||
		hasNonNamespacedCapability(pod) ||
		kl.hasHostMountPVC(pod)
}
// hasNonNamespacedCapability reports whether any container in pod.Spec.Containers
// adds one of the capabilities MKNOD, SYS_TIME or SYS_MODULE through its
// security context.
func hasNonNamespacedCapability(pod *v1.Pod) bool {
	for i := range pod.Spec.Containers {
		sc := pod.Spec.Containers[i].SecurityContext
		if sc == nil || sc.Capabilities == nil {
			continue
		}
		for _, added := range sc.Capabilities.Add {
			switch added {
			case "MKNOD", "SYS_TIME", "SYS_MODULE":
				return true
			}
		}
	}
	return false
}
// hasHostVolume reports whether at least one volume in the pod spec is a
// HostPath volume.
func hasHostVolume(pod *v1.Pod) bool {
	volumes := pod.Spec.Volumes
	for i := range volumes {
		if volumes[i].HostPath != nil {
			return true
		}
	}
	return false
}
// hasHostNamespace returns true if hostIPC, hostNetwork, or hostPID are set to true.
//
// The three flags are fields of the pod spec itself, so the answer does not
// depend on whether a pod-level SecurityContext is present. The earlier
// implementation returned false whenever pod.Spec.SecurityContext was nil,
// which wrongly reported "no host namespace" for a host-network/host-PID/
// host-IPC pod that had no security context set.
func hasHostNamespace(pod *v1.Pod) bool {
	return pod.Spec.HostIPC || pod.Spec.HostNetwork || pod.Spec.HostPID
}
// hasHostMountPVC returns true if a PVC is referencing a HostPath volume.
func ( kl * Kubelet ) hasHostMountPVC ( pod * v1 . Pod ) bool {
for _ , volume := range pod . Spec . Volumes {
if volume . PersistentVolumeClaim != nil {
2020-03-26 21:07:15 +00:00
pvc , err := kl . kubeClient . CoreV1 ( ) . PersistentVolumeClaims ( pod . Namespace ) . Get ( context . TODO ( ) , volume . PersistentVolumeClaim . ClaimName , metav1 . GetOptions { } )
2019-01-12 04:58:27 +00:00
if err != nil {
klog . Warningf ( "unable to retrieve pvc %s:%s - %v" , pod . Namespace , volume . PersistentVolumeClaim . ClaimName , err )
continue
}
if pvc != nil {
2020-03-26 21:07:15 +00:00
referencedVolume , err := kl . kubeClient . CoreV1 ( ) . PersistentVolumes ( ) . Get ( context . TODO ( ) , pvc . Spec . VolumeName , metav1 . GetOptions { } )
2019-01-12 04:58:27 +00:00
if err != nil {
klog . Warningf ( "unable to retrieve pv %s - %v" , pvc . Spec . VolumeName , err )
continue
}
if referencedVolume != nil && referencedVolume . Spec . HostPath != nil {
return true
}
}
}
}
return false
}