2015-05-12 16:58:40 +00:00
// +build linux
/ *
2016-06-03 00:25:58 +00:00
Copyright 2015 The Kubernetes Authors .
2015-05-12 16:58:40 +00:00
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
2015-10-10 00:09:53 +00:00
package cm
2015-05-12 16:58:40 +00:00
import (
	"bufio"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"path"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/golang/glog"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
	"github.com/opencontainers/runc/libcontainer/configs"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/record"
	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
	cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
	"k8s.io/kubernetes/pkg/kubelet/qos"
	"k8s.io/kubernetes/pkg/util"
	"k8s.io/kubernetes/pkg/util/mount"
	"k8s.io/kubernetes/pkg/util/oom"
	"k8s.io/kubernetes/pkg/util/procfs"
	utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
	utilversion "k8s.io/kubernetes/pkg/util/version"
)
2015-06-17 18:36:27 +00:00
const (
// DockerMemoryLimitThresholdPercent is the percent of the machine memory
// capacity. The value is used to calculate the docker memory resource
// container's hard limit to work around a docker memory leakage issue.
// Please see kubernetes/issues/9881 for more detail.
DockerMemoryLimitThresholdPercent = 70
2015-06-17 19:51:11 +00:00
// MinDockerMemoryLimit is the minimum memory limit allocated to the docker
// container: 150Mi, expressed in bytes.
MinDockerMemoryLimit = 150 * 1024 * 1024
2016-05-19 19:44:42 +00:00
2016-05-27 00:27:00 +00:00
// Process name and pid file used to look up the docker daemon's PID.
dockerProcessName = "docker"
dockerPidFile = "/var/run/docker.pid"
// Process name and pid file used to look up the docker-containerd PID.
containerdProcessName = "docker-containerd"
containerdPidFile = "/run/docker/libcontainerd/docker-containerd.pid"
)
var (
// containerdAPIVersion is the docker API version in which containerd was
// introduced. EnsureDockerInContainer also handles the docker-containerd
// process when the reported docker API version is at least this value.
2017-04-04 18:48:09 +00:00
containerdAPIVersion = utilversion . MustParseGeneric ( "1.23" )
2015-06-17 18:36:27 +00:00
)
2015-05-30 00:32:34 +00:00
// systemContainer is a non-user container tracked by the Kubelet.
type systemContainer struct {
// Absolute name of the container.
name string
// CPU limit in millicores. SystemCgroupsLimit sums this value over all
// tracked system containers.
cpuMillicores int64
// Function that ensures the state of the container.
// m is the cgroup manager for the specified container.
// May be nil; Start only invokes non-nil functions.
ensureStateFunc func ( m * fs . Manager ) error
2015-05-12 16:58:40 +00:00
2015-05-30 00:32:34 +00:00
// Manager for the cgroups of the external container. It is the value
// passed to ensureStateFunc.
manager * fs . Manager
}
2016-02-10 00:58:44 +00:00
func newSystemCgroups ( containerName string ) * systemContainer {
2015-05-30 00:32:34 +00:00
return & systemContainer {
name : containerName ,
manager : createManager ( containerName ) ,
}
}
2015-05-19 22:52:12 +00:00
2015-05-30 00:32:34 +00:00
// containerManagerImpl is the Linux implementation of ContainerManager.
type containerManagerImpl struct {
2016-02-16 20:52:40 +00:00
// Taken for reading by GetNodeConfig and Status, and for writing by the
// periodic tasks that update the discovered cgroup names in NodeConfig.
sync . RWMutex
2015-09-22 23:42:30 +00:00
// Source of machine, filesystem and version information.
cadvisorInterface cadvisor . Interface
// Used to list mount points when validating system requirements.
mountUtil mount . Interface
2015-10-10 00:09:53 +00:00
// Embedded node configuration supplied to NewContainerManager.
NodeConfig
2016-03-04 00:37:09 +00:00
// Status reported by Status(); setupNode records unmet soft requirements here.
status Status
2015-05-30 00:32:34 +00:00
// External containers being managed.
systemContainers [ ] * systemContainer
2016-06-27 18:46:20 +00:00
// NOTE(review): not referenced in this file's visible code; QoS container
// info is served by qosContainerManager - confirm whether this is still used.
qosContainers QOSContainersInfo
2017-02-10 05:14:10 +00:00
// Tasks that are run periodically (every five minutes once Start is called).
periodicTasks [ ] func ( )
2016-06-27 18:46:20 +00:00
// holds all the mounted cgroup subsystems
2016-07-13 04:39:22 +00:00
subsystems * CgroupSubsystems
2016-11-18 20:50:58 +00:00
// Node object cached by Start.
nodeInfo * v1 . Node
2017-02-10 05:14:10 +00:00
// Interface for cgroup management
cgroupManager CgroupManager
// Capacity of this node.
capacity v1 . ResourceList
// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
// This path includes a top level container for enforcing Node Allocatable.
cgroupRoot string
// Event recorder interface.
recorder record . EventRecorder
2017-02-21 20:10:45 +00:00
// Interface for QoS cgroup management
qosContainerManager QOSContainerManager
2015-05-12 16:58:40 +00:00
}
2016-03-04 00:37:09 +00:00
// features records optional system capabilities detected by
// validateSystemRequirements.
type features struct {
// cpuHardcapping is true when both cpu.cfs_period_us and cpu.cfs_quota_us
// exist under the cpu cgroup mount point.
cpuHardcapping bool
}
2015-10-10 00:09:53 +00:00
// Compile-time check that containerManagerImpl implements ContainerManager.
var _ ContainerManager = & containerManagerImpl { }
2015-05-12 16:58:40 +00:00
2015-09-22 23:42:30 +00:00
// checks if the required cgroups subsystems are mounted.
// As of now, only 'cpu' and 'memory' are required.
2016-03-04 00:37:09 +00:00
// cpu quota is a soft requirement.
func validateSystemRequirements ( mountUtil mount . Interface ) ( features , error ) {
2015-09-22 23:42:30 +00:00
const (
cgroupMountType = "cgroup"
localErr = "system validation failed"
)
2016-03-04 00:37:09 +00:00
var (
cpuMountPoint string
f features
)
2015-09-22 23:42:30 +00:00
mountPoints , err := mountUtil . List ( )
if err != nil {
2016-03-04 00:37:09 +00:00
return f , fmt . Errorf ( "%s - %v" , localErr , err )
2015-09-22 23:42:30 +00:00
}
2016-03-04 00:37:09 +00:00
2015-09-22 23:42:30 +00:00
expectedCgroups := sets . NewString ( "cpu" , "cpuacct" , "cpuset" , "memory" )
for _ , mountPoint := range mountPoints {
if mountPoint . Type == cgroupMountType {
for _ , opt := range mountPoint . Opts {
if expectedCgroups . Has ( opt ) {
expectedCgroups . Delete ( opt )
}
2016-03-04 00:37:09 +00:00
if opt == "cpu" {
cpuMountPoint = mountPoint . Path
}
2015-09-22 23:42:30 +00:00
}
}
}
if expectedCgroups . Len ( ) > 0 {
2016-03-04 00:37:09 +00:00
return f , fmt . Errorf ( "%s - Following Cgroup subsystem not mounted: %v" , localErr , expectedCgroups . List ( ) )
2015-09-22 23:42:30 +00:00
}
2016-03-04 00:37:09 +00:00
// Check if cpu quota is available.
// CPU cgroup is required and so it expected to be mounted at this point.
periodExists , err := util . FileExists ( path . Join ( cpuMountPoint , "cpu.cfs_period_us" ) )
if err != nil {
glog . Errorf ( "failed to detect if CPU cgroup cpu.cfs_period_us is available - %v" , err )
}
quotaExists , err := util . FileExists ( path . Join ( cpuMountPoint , "cpu.cfs_quota_us" ) )
if err != nil {
glog . Errorf ( "failed to detect if CPU cgroup cpu.cfs_quota_us is available - %v" , err )
}
if quotaExists && periodExists {
f . cpuHardcapping = true
}
return f , nil
2015-09-22 23:42:30 +00:00
}
2015-05-30 00:32:34 +00:00
// TODO(vmarmol): Add limits to the system containers.
2015-05-19 22:52:12 +00:00
// Takes the absolute name of the specified containers.
// Empty container name disables use of the specified container.
2017-02-10 05:14:10 +00:00
func NewContainerManager ( mountUtil mount . Interface , cadvisorInterface cadvisor . Interface , nodeConfig NodeConfig , failSwapOn bool , recorder record . EventRecorder ) ( ContainerManager , error ) {
2016-10-17 17:23:48 +00:00
subsystems , err := GetCgroupSubsystems ( )
if err != nil {
return nil , fmt . Errorf ( "failed to get mounted cgroup subsystems: %v" , err )
}
2016-09-02 18:13:02 +00:00
// Check whether swap is enabled. The Kubelet does not support running with swap enabled.
cmd := exec . Command ( "cat" , "/proc/swaps" )
stdout , err := cmd . StdoutPipe ( )
if err != nil {
return nil , err
}
if err := cmd . Start ( ) ; err != nil {
return nil , err
}
var buf [ ] string
scanner := bufio . NewScanner ( stdout )
for scanner . Scan ( ) { // Splits on newlines by default
buf = append ( buf , scanner . Text ( ) )
}
if err := cmd . Wait ( ) ; err != nil { // Clean up
return nil , err
}
// TODO(#34726:1.8.0): Remove the opt-in for failing when swap is enabled.
// Running with swap enabled should be considered an error, but in order to maintain legacy
// behavior we have to require an opt-in to this error for a period of time.
// If there is more than one line (table headers) in /proc/swaps, swap is enabled and we should error out.
if len ( buf ) > 1 {
if failSwapOn {
return nil , fmt . Errorf ( "Running with swap on is not supported, please disable swap! /proc/swaps contained: %v" , buf )
}
glog . Warningf ( "Running with swap on is not supported, please disable swap! " +
"This will be a fatal error by default starting in K8s v1.6! " +
"In the meantime, you can opt-in to making this a fatal error by enabling --experimental-fail-swap-on." )
}
2017-02-10 05:14:10 +00:00
var capacity = v1 . ResourceList { }
// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
// machine info is computed and cached once as part of cAdvisor object creation.
if info , err := cadvisorInterface . MachineInfo ( ) ; err == nil {
capacity = cadvisor . CapacityFromMachineInfo ( info )
} else {
return nil , err
}
2017-06-26 19:49:00 +00:00
rootfs , err := cadvisorInterface . RootFsInfo ( )
if err != nil {
capacity [ v1 . ResourceStorageScratch ] = resource . MustParse ( "0Gi" )
} else {
for rName , rCap := range cadvisor . StorageScratchCapacityFromFsInfo ( rootfs ) {
capacity [ rName ] = rCap
}
}
if hasDedicatedImageFs , _ := cadvisorInterface . HasDedicatedImageFs ( ) ; hasDedicatedImageFs {
imagesfs , err := cadvisorInterface . ImagesFsInfo ( )
if err != nil {
glog . Errorf ( "Failed to get Image filesystem information: %v" , err )
} else {
for rName , rCap := range cadvisor . StorageOverlayCapacityFromFsInfo ( imagesfs ) {
capacity [ rName ] = rCap
}
}
}
2016-09-02 18:13:02 +00:00
2017-02-10 05:14:10 +00:00
cgroupRoot := nodeConfig . CgroupRoot
cgroupManager := NewCgroupManager ( subsystems , nodeConfig . CgroupDriver )
2016-06-27 18:46:20 +00:00
// Check if Cgroup-root actually exists on the node
if nodeConfig . CgroupsPerQOS {
2016-10-17 17:23:48 +00:00
// this does default to / when enabled, but this tests against regressions.
2016-06-27 18:46:20 +00:00
if nodeConfig . CgroupRoot == "" {
2017-02-03 22:10:53 +00:00
return nil , fmt . Errorf ( "invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root" )
2016-06-27 18:46:20 +00:00
}
2016-10-17 17:23:48 +00:00
// we need to check that the cgroup root actually exists for each subsystem
// of note, we always use the cgroupfs driver when performing this check since
// the input is provided in that format.
// this is important because we do not want any name conversion to occur.
2017-02-10 05:14:10 +00:00
if ! cgroupManager . Exists ( CgroupName ( cgroupRoot ) ) {
return nil , fmt . Errorf ( "invalid configuration: cgroup-root %q doesn't exist: %v" , cgroupRoot , err )
2016-06-27 18:46:20 +00:00
}
2017-02-10 05:14:10 +00:00
glog . Infof ( "container manager verified user specified cgroup-root exists: %v" , cgroupRoot )
// Include the the top level cgroup for enforcing node allocatable into cgroup-root.
// This way, all sub modules can avoid having to understand the concept of node allocatable.
cgroupRoot = path . Join ( cgroupRoot , defaultNodeAllocatableCgroupName )
2016-06-27 18:46:20 +00:00
}
2017-02-10 05:14:10 +00:00
glog . Infof ( "Creating Container Manager object based on Node Config: %+v" , nodeConfig )
2017-02-21 20:10:45 +00:00
qosContainerManager , err := NewQOSContainerManager ( subsystems , cgroupRoot , nodeConfig )
if err != nil {
return nil , err
}
2015-09-22 23:42:30 +00:00
return & containerManagerImpl {
2017-02-21 20:10:45 +00:00
cadvisorInterface : cadvisorInterface ,
mountUtil : mountUtil ,
NodeConfig : nodeConfig ,
subsystems : subsystems ,
cgroupManager : cgroupManager ,
capacity : capacity ,
cgroupRoot : cgroupRoot ,
recorder : recorder ,
qosContainerManager : qosContainerManager ,
2015-09-22 23:42:30 +00:00
} , nil
}
2016-07-13 04:39:22 +00:00
// NewPodContainerManager is a factory method returns a PodContainerManager object
// If qosCgroups are enabled then it returns the general pod container manager
// otherwise it returns a no-op manager which essentially does nothing
func ( cm * containerManagerImpl ) NewPodContainerManager ( ) PodContainerManager {
if cm . NodeConfig . CgroupsPerQOS {
return & podContainerManagerImpl {
2017-02-21 20:10:45 +00:00
qosContainersInfo : cm . GetQOSContainersInfo ( ) ,
2016-07-13 04:39:22 +00:00
subsystems : cm . subsystems ,
2017-02-10 05:14:10 +00:00
cgroupManager : cm . cgroupManager ,
2016-07-13 04:39:22 +00:00
}
}
return & podContainerManagerNoop {
2017-02-10 05:14:10 +00:00
cgroupRoot : CgroupName ( cm . cgroupRoot ) ,
2016-07-13 04:39:22 +00:00
}
}
2015-09-22 23:42:30 +00:00
// Create a cgroup container manager.
func createManager ( containerName string ) * fs . Manager {
2016-08-02 18:04:17 +00:00
allowAllDevices := true
2015-09-22 23:42:30 +00:00
return & fs . Manager {
Cgroups : & configs . Cgroup {
2016-01-06 23:36:48 +00:00
Parent : "/" ,
Name : containerName ,
Resources : & configs . Resources {
2016-08-02 18:04:17 +00:00
AllowAllDevices : & allowAllDevices ,
2016-01-06 23:36:48 +00:00
} ,
2015-09-22 23:42:30 +00:00
} ,
}
}
2015-05-30 00:32:34 +00:00
2015-10-09 19:39:30 +00:00
// KernelTunableBehavior selects what setupKernelTunables does when a kernel
// tunable does not have its expected value.
type KernelTunableBehavior string
const (
// KernelTunableWarn logs the mismatch and leaves the tunable unchanged.
KernelTunableWarn KernelTunableBehavior = "warn"
// KernelTunableError reports the mismatch as an error.
KernelTunableError KernelTunableBehavior = "error"
// KernelTunableModify sets the tunable to its expected value.
KernelTunableModify KernelTunableBehavior = "modify"
)
// setupKernelTunables validates kernel tunable flags are set as expected
// depending upon the specified option, it will either warn, error, or modify the kernel tunable flags
func setupKernelTunables ( option KernelTunableBehavior ) error {
desiredState := map [ string ] int {
utilsysctl . VmOvercommitMemory : utilsysctl . VmOvercommitMemoryAlways ,
utilsysctl . VmPanicOnOOM : utilsysctl . VmPanicOnOOMInvokeOOMKiller ,
2015-11-13 23:47:25 +00:00
utilsysctl . KernelPanic : utilsysctl . KernelPanicRebootTimeout ,
utilsysctl . KernelPanicOnOops : utilsysctl . KernelPanicOnOopsAlways ,
2017-03-13 17:43:31 +00:00
utilsysctl . RootMaxKeys : utilsysctl . RootMaxKeysSetting ,
utilsysctl . RootMaxBytes : utilsysctl . RootMaxBytesSetting ,
2015-10-05 17:28:53 +00:00
}
2015-10-09 19:39:30 +00:00
2016-08-10 15:38:44 +00:00
sysctl := utilsysctl . New ( )
2015-10-09 19:39:30 +00:00
errList := [ ] error { }
for flag , expectedValue := range desiredState {
2016-08-10 15:38:44 +00:00
val , err := sysctl . GetSysctl ( flag )
2015-10-09 19:39:30 +00:00
if err != nil {
errList = append ( errList , err )
continue
}
if val == expectedValue {
continue
}
switch option {
case KernelTunableError :
errList = append ( errList , fmt . Errorf ( "Invalid kernel flag: %v, expected value: %v, actual value: %v" , flag , expectedValue , val ) )
case KernelTunableWarn :
glog . V ( 2 ) . Infof ( "Invalid kernel flag: %v, expected value: %v, actual value: %v" , flag , expectedValue , val )
case KernelTunableModify :
glog . V ( 2 ) . Infof ( "Updating kernel flag: %v, expected value: %v, actual value: %v" , flag , expectedValue , val )
2016-08-10 15:38:44 +00:00
err = sysctl . SetSysctl ( flag , expectedValue )
2015-10-09 19:39:30 +00:00
if err != nil {
errList = append ( errList , err )
}
}
2015-10-05 17:28:53 +00:00
}
2015-10-14 05:18:37 +00:00
return utilerrors . NewAggregate ( errList )
2015-10-05 17:28:53 +00:00
}
2017-02-21 20:10:45 +00:00
func ( cm * containerManagerImpl ) setupNode ( activePods ActivePodsFunc ) error {
2016-03-04 00:37:09 +00:00
f , err := validateSystemRequirements ( cm . mountUtil )
if err != nil {
2015-09-22 23:42:30 +00:00
return err
}
2016-03-04 00:37:09 +00:00
if ! f . cpuHardcapping {
cm . status . SoftRequirements = fmt . Errorf ( "CPU hardcapping unsupported" )
}
2016-07-26 10:21:17 +00:00
b := KernelTunableModify
if cm . GetNodeConfig ( ) . ProtectKernelDefaults {
b = KernelTunableError
}
if err := setupKernelTunables ( b ) ; err != nil {
2015-10-05 17:28:53 +00:00
return err
}
2016-06-27 18:46:20 +00:00
// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
if cm . NodeConfig . CgroupsPerQOS {
2017-02-10 05:14:10 +00:00
if err := cm . createNodeAllocatableCgroups ( ) ; err != nil {
return err
}
2017-02-28 21:03:06 +00:00
err = cm . qosContainerManager . Start ( cm . getNodeAllocatableAbsolute , activePods )
2016-06-27 18:46:20 +00:00
if err != nil {
2017-05-03 20:08:16 +00:00
return fmt . Errorf ( "failed to initialize top level QOS containers: %v" , err )
2016-06-27 18:46:20 +00:00
}
}
2017-02-10 05:14:10 +00:00
// Enforce Node Allocatable (if required)
if err := cm . enforceNodeAllocatableCgroups ( ) ; err != nil {
return err
}
2015-09-22 23:42:30 +00:00
systemContainers := [ ] * systemContainer { }
2016-02-05 01:49:17 +00:00
if cm . ContainerRuntime == "docker" {
2017-05-01 21:39:51 +00:00
// With the docker-CRI integration, dockershim will manage the cgroups
// and oom score for the docker processes.
// In the future, NodeSpec should mandate the cgroup that the
// runtime processes need to be in. For now, we still check the
// cgroup for docker periodically, so that kubelet can recognize
// the cgroup for docker and serve stats for the runtime.
// TODO(#27097): Fix this after NodeSpec is clearly defined.
cm . periodicTasks = append ( cm . periodicTasks , func ( ) {
glog . V ( 4 ) . Infof ( "[ContainerManager]: Adding periodic tasks for docker CRI integration" )
cont , err := getContainerNameForProcess ( dockerProcessName , dockerPidFile )
if err != nil {
glog . Error ( err )
return
2016-02-05 01:49:17 +00:00
}
2017-05-01 21:39:51 +00:00
glog . V ( 2 ) . Infof ( "[ContainerManager]: Discovered runtime cgroups name: %s" , cont )
cm . Lock ( )
defer cm . Unlock ( )
cm . RuntimeCgroupsName = cont
} )
2016-02-05 01:49:17 +00:00
}
2015-06-17 19:51:11 +00:00
2016-02-10 00:58:44 +00:00
if cm . SystemCgroupsName != "" {
if cm . SystemCgroupsName == "/" {
2016-02-05 01:49:17 +00:00
return fmt . Errorf ( "system container cannot be root (\"/\")" )
}
2016-02-10 00:58:44 +00:00
cont := newSystemCgroups ( cm . SystemCgroupsName )
2015-05-30 00:32:34 +00:00
cont . ensureStateFunc = func ( manager * fs . Manager ) error {
2016-11-10 21:08:17 +00:00
return ensureSystemCgroups ( "/" , manager )
2015-05-30 00:32:34 +00:00
}
systemContainers = append ( systemContainers , cont )
2015-05-19 22:52:12 +00:00
}
2016-02-10 00:58:44 +00:00
if cm . KubeletCgroupsName != "" {
cont := newSystemCgroups ( cm . KubeletCgroupsName )
2016-08-02 18:04:17 +00:00
allowAllDevices := true
2016-02-05 01:49:17 +00:00
manager := fs . Manager {
2015-05-19 22:52:12 +00:00
Cgroups : & configs . Cgroup {
2016-01-06 23:36:48 +00:00
Parent : "/" ,
2016-02-10 00:58:44 +00:00
Name : cm . KubeletCgroupsName ,
2016-02-05 01:49:17 +00:00
Resources : & configs . Resources {
2016-08-02 18:04:17 +00:00
AllowAllDevices : & allowAllDevices ,
2016-02-05 01:49:17 +00:00
} ,
2015-05-19 22:52:12 +00:00
} ,
2015-05-30 00:32:34 +00:00
}
2016-02-05 01:49:17 +00:00
cont . ensureStateFunc = func ( _ * fs . Manager ) error {
2016-09-16 23:32:58 +00:00
return ensureProcessInContainerWithOOMScore ( os . Getpid ( ) , qos . KubeletOOMScoreAdj , & manager )
2016-02-05 01:49:17 +00:00
}
systemContainers = append ( systemContainers , cont )
} else {
2016-02-16 20:52:40 +00:00
cm . periodicTasks = append ( cm . periodicTasks , func ( ) {
2016-09-16 23:32:58 +00:00
if err := ensureProcessInContainerWithOOMScore ( os . Getpid ( ) , qos . KubeletOOMScoreAdj , nil ) ; err != nil {
glog . Error ( err )
return
}
2016-02-16 20:52:40 +00:00
cont , err := getContainer ( os . Getpid ( ) )
if err != nil {
2016-03-23 00:26:50 +00:00
glog . Errorf ( "failed to find cgroups of kubelet - %v" , err )
2016-02-16 20:52:40 +00:00
return
}
cm . Lock ( )
defer cm . Unlock ( )
2016-02-10 00:58:44 +00:00
cm . KubeletCgroupsName = cont
2016-02-16 20:52:40 +00:00
} )
2015-05-30 00:32:34 +00:00
}
2015-09-22 23:42:30 +00:00
cm . systemContainers = systemContainers
return nil
2015-05-30 00:32:34 +00:00
}
2016-05-19 19:44:42 +00:00
func getContainerNameForProcess ( name , pidFile string ) ( string , error ) {
pids , err := getPidsForProcess ( name , pidFile )
2016-02-05 01:49:17 +00:00
if err != nil {
return "" , fmt . Errorf ( "failed to detect process id for %q - %v" , name , err )
}
if len ( pids ) == 0 {
return "" , nil
}
cont , err := getContainer ( pids [ 0 ] )
if err != nil {
return "" , err
}
return cont , nil
}
2015-10-10 00:09:53 +00:00
2016-02-05 01:49:17 +00:00
func ( cm * containerManagerImpl ) GetNodeConfig ( ) NodeConfig {
2016-02-16 20:52:40 +00:00
cm . RLock ( )
defer cm . RUnlock ( )
2016-02-05 01:49:17 +00:00
return cm . NodeConfig
}
2016-07-13 04:39:22 +00:00
// GetMountedSubsystems returns the mounted cgroup subsystems held by the
// manager.
func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
	mounted := cm.subsystems
	return mounted
}
func ( cm * containerManagerImpl ) GetQOSContainersInfo ( ) QOSContainersInfo {
2017-02-21 20:10:45 +00:00
return cm . qosContainerManager . GetQOSContainersInfo ( )
}
func ( cm * containerManagerImpl ) UpdateQOSCgroups ( ) error {
return cm . qosContainerManager . UpdateCgroups ( )
2016-07-13 04:39:22 +00:00
}
2016-03-04 00:37:09 +00:00
// Status returns the manager's current status, read under the read lock.
func (cm *containerManagerImpl) Status() Status {
	cm.RLock()
	current := cm.status
	cm.RUnlock()
	return current
}
2017-02-21 20:10:45 +00:00
func ( cm * containerManagerImpl ) Start ( node * v1 . Node , activePods ActivePodsFunc ) error {
2016-07-13 04:39:22 +00:00
// cache the node Info including resource capacity and
// allocatable of the node
cm . nodeInfo = node
2015-09-22 23:42:30 +00:00
// Setup the node
2017-02-21 20:10:45 +00:00
if err := cm . setupNode ( activePods ) ; err != nil {
2015-09-22 23:42:30 +00:00
return err
}
2017-02-10 05:14:10 +00:00
// Ensure that node allocatable configuration is valid.
if err := cm . validateNodeAllocatable ( ) ; err != nil {
return err
}
2015-05-30 00:32:34 +00:00
// Don't run a background thread if there are no ensureStateFuncs.
2016-09-06 04:56:54 +00:00
hasEnsureStateFuncs := false
2015-05-30 00:32:34 +00:00
for _ , cont := range cm . systemContainers {
if cont . ensureStateFunc != nil {
2016-09-06 04:56:54 +00:00
hasEnsureStateFuncs = true
break
2015-05-19 22:52:12 +00:00
}
}
2016-09-06 04:56:54 +00:00
if hasEnsureStateFuncs {
2016-05-27 00:27:00 +00:00
// Run ensure state functions every minute.
2016-03-02 22:18:33 +00:00
go wait . Until ( func ( ) {
for _ , cont := range cm . systemContainers {
if cont . ensureStateFunc != nil {
if err := cont . ensureStateFunc ( cont . manager ) ; err != nil {
glog . Warningf ( "[ContainerManager] Failed to ensure state of %q: %v" , cont . name , err )
}
}
}
} , time . Minute , wait . NeverStop )
2015-05-12 16:58:40 +00:00
}
2015-05-30 00:32:34 +00:00
2016-03-02 22:18:33 +00:00
if len ( cm . periodicTasks ) > 0 {
go wait . Until ( func ( ) {
for _ , task := range cm . periodicTasks {
if task != nil {
task ( )
2015-06-04 18:07:08 +00:00
}
2015-05-30 00:32:34 +00:00
}
2016-03-02 22:18:33 +00:00
} , 5 * time . Minute , wait . NeverStop )
}
2016-02-16 20:52:40 +00:00
2015-05-12 16:58:40 +00:00
return nil
}
2016-11-18 20:50:58 +00:00
func ( cm * containerManagerImpl ) SystemCgroupsLimit ( ) v1 . ResourceList {
2015-05-30 00:32:34 +00:00
cpuLimit := int64 ( 0 )
// Sum up resources of all external containers.
for _ , cont := range cm . systemContainers {
cpuLimit += cont . cpuMillicores
}
2016-11-18 20:50:58 +00:00
return v1 . ResourceList {
v1 . ResourceCPU : * resource . NewMilliQuantity (
2015-05-30 00:32:34 +00:00
cpuLimit ,
resource . DecimalSI ) ,
}
}
2016-02-04 21:34:56 +00:00
func isProcessRunningInHost ( pid int ) ( bool , error ) {
2016-09-16 23:32:58 +00:00
// Get init pid namespace.
initPidNs , err := os . Readlink ( "/proc/1/ns/pid" )
2016-02-04 21:34:56 +00:00
if err != nil {
2016-09-16 23:32:58 +00:00
return false , fmt . Errorf ( "failed to find pid namespace of init process" )
2016-02-04 21:34:56 +00:00
}
2016-09-16 23:32:58 +00:00
glog . V ( 10 ) . Infof ( "init pid ns is %q" , initPidNs )
processPidNs , err := os . Readlink ( fmt . Sprintf ( "/proc/%d/ns/pid" , pid ) )
2016-02-04 21:34:56 +00:00
if err != nil {
2016-09-16 23:32:58 +00:00
return false , fmt . Errorf ( "failed to find pid namespace of process %q" , pid )
2016-02-04 21:34:56 +00:00
}
2016-09-16 23:32:58 +00:00
glog . V ( 10 ) . Infof ( "Pid %d pid ns is %q" , pid , processPidNs )
return initPidNs == processPidNs , nil
2016-02-04 21:34:56 +00:00
}
2016-05-19 19:44:42 +00:00
// getPidFromPidFile reads pidFile and parses its contents as a decimal process
// ID. Leading and trailing whitespace (for example a trailing newline) is
// ignored. It returns an error if the file cannot be opened or read, or if
// its trimmed contents are not a valid integer.
func getPidFromPidFile(pidFile string) (int, error) {
	file, err := os.Open(pidFile)
	if err != nil {
		return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, err)
	}
	defer file.Close()

	data, err := ioutil.ReadAll(file)
	if err != nil {
		return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, err)
	}

	// Pid files are commonly written with a trailing newline, which
	// strconv.Atoi rejects, so trim before parsing.
	pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
	if err != nil {
		return 0, fmt.Errorf("error parsing %q as a number: %v", string(data), err)
	}
	return pid, nil
}
func getPidsForProcess ( name , pidFile string ) ( [ ] int , error ) {
2017-03-28 16:28:12 +00:00
if len ( pidFile ) == 0 {
return procfs . PidOf ( name )
}
pid , err := getPidFromPidFile ( pidFile )
if err == nil {
return [ ] int { pid } , nil
2016-05-19 19:44:42 +00:00
}
2017-03-28 16:28:12 +00:00
// Try to lookup pid by process name
pids , err2 := procfs . PidOf ( name )
if err2 == nil {
return pids , nil
}
// Return error from getPidFromPidFile since that should have worked
// and is the real source of the problem.
glog . V ( 4 ) . Infof ( "unable to get pid from %s: %v" , pidFile , err )
return [ ] int { } , err
2016-02-05 01:49:17 +00:00
}
2015-05-12 16:58:40 +00:00
2016-02-05 01:49:17 +00:00
// Ensures that the Docker daemon is in the desired container.
2016-10-10 20:56:53 +00:00
// Temporarily export the function to be used by dockershim.
// TODO(yujuhong): Move this function to dockershim once kubelet migrates to
// dockershim as the default.
2017-04-04 18:48:09 +00:00
func EnsureDockerInContainer ( dockerAPIVersion * utilversion . Version , oomScoreAdj int , manager * fs . Manager ) error {
2016-05-27 00:27:00 +00:00
type process struct { name , file string }
dockerProcs := [ ] process { { dockerProcessName , dockerPidFile } }
2017-04-04 18:48:09 +00:00
if dockerAPIVersion . AtLeast ( containerdAPIVersion ) {
2016-05-27 00:27:00 +00:00
dockerProcs = append ( dockerProcs , process { containerdProcessName , containerdPidFile } )
2016-02-05 01:49:17 +00:00
}
2016-05-27 00:27:00 +00:00
var errs [ ] error
for _ , proc := range dockerProcs {
pids , err := getPidsForProcess ( proc . name , proc . file )
2015-05-12 16:58:40 +00:00
if err != nil {
2016-05-27 00:27:00 +00:00
errs = append ( errs , fmt . Errorf ( "failed to get pids for %q: %v" , proc . name , err ) )
continue
2015-05-12 16:58:40 +00:00
}
2016-05-27 00:27:00 +00:00
// Move if the pid is not already in the desired container.
for _ , pid := range pids {
2016-09-16 23:32:58 +00:00
if err := ensureProcessInContainerWithOOMScore ( pid , oomScoreAdj , manager ) ; err != nil {
2016-05-27 00:27:00 +00:00
errs = append ( errs , fmt . Errorf ( "errors moving %q pid: %v" , proc . name , err ) )
2015-05-12 16:58:40 +00:00
}
}
2016-05-27 00:27:00 +00:00
}
return utilerrors . NewAggregate ( errs )
}
2015-05-14 21:40:20 +00:00
2016-09-16 23:32:58 +00:00
func ensureProcessInContainerWithOOMScore ( pid int , oomScoreAdj int , manager * fs . Manager ) error {
2016-05-27 00:27:00 +00:00
if runningInHost , err := isProcessRunningInHost ( pid ) ; err != nil {
// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
return err
} else if ! runningInHost {
// Process is running inside a container. Don't touch that.
2016-09-16 23:32:58 +00:00
glog . V ( 2 ) . Infof ( "pid %d is not running in the host namespaces" , pid )
2016-05-27 00:27:00 +00:00
return nil
}
var errs [ ] error
2016-09-16 23:32:58 +00:00
if manager != nil {
cont , err := getContainer ( pid )
2016-09-16 02:28:59 +00:00
if err != nil {
2016-09-16 23:32:58 +00:00
errs = append ( errs , fmt . Errorf ( "failed to find container of PID %d: %v" , pid , err ) )
}
if cont != manager . Cgroups . Name {
err = manager . Apply ( pid )
if err != nil {
errs = append ( errs , fmt . Errorf ( "failed to move PID %d (in %q) to %q: %v" , pid , cont , manager . Cgroups . Name , err ) )
}
2015-05-14 21:40:20 +00:00
}
2015-05-12 16:58:40 +00:00
}
2016-05-27 00:27:00 +00:00
// Also apply oom-score-adj to processes
oomAdjuster := oom . NewOOMAdjuster ( )
2016-09-16 23:32:58 +00:00
glog . V ( 5 ) . Infof ( "attempting to apply oom_score_adj of %d to pid %d" , oomScoreAdj , pid )
2016-05-27 00:27:00 +00:00
if err := oomAdjuster . ApplyOOMScoreAdj ( pid , oomScoreAdj ) ; err != nil {
2016-09-16 23:32:58 +00:00
glog . V ( 3 ) . Infof ( "Failed to apply oom_score_adj %d for pid %d: %v" , oomScoreAdj , pid , err )
errs = append ( errs , fmt . Errorf ( "failed to apply oom score %d to PID %d: %v" , oomScoreAdj , pid , err ) )
2016-05-27 00:27:00 +00:00
}
2015-10-14 05:18:37 +00:00
return utilerrors . NewAggregate ( errs )
2015-05-12 16:58:40 +00:00
}
2016-05-31 20:02:00 +00:00
// getContainer returns the cgroup associated with the specified pid.
// It enforces a unified hierarchy for memory and cpu cgroups.
// On systemd environments, it uses the name=systemd cgroup for the specified pid.
2015-05-12 16:58:40 +00:00
func getContainer ( pid int ) ( string , error ) {
2015-12-11 13:25:35 +00:00
cgs , err := cgroups . ParseCgroupFile ( fmt . Sprintf ( "/proc/%d/cgroup" , pid ) )
2015-05-12 16:58:40 +00:00
if err != nil {
return "" , err
}
2016-05-31 20:02:00 +00:00
cpu , found := cgs [ "cpu" ]
if ! found {
return "" , cgroups . NewNotFoundError ( "cpu" )
}
memory , found := cgs [ "memory" ]
if ! found {
return "" , cgroups . NewNotFoundError ( "memory" )
}
// since we use this container for accounting, we need to ensure its a unified hierarchy.
if cpu != memory {
return "" , fmt . Errorf ( "cpu and memory cgroup hierarchy not unified. cpu: %s, memory: %s" , cpu , memory )
}
// on systemd, every pid is in a unified cgroup hierarchy (name=systemd as seen in systemd-cgls)
// cpu and memory accounting is off by default, users may choose to enable it per unit or globally.
// users could enable CPU and memory accounting globally via /etc/systemd/system.conf (DefaultCPUAccounting=true DefaultMemoryAccounting=true).
// users could also enable CPU and memory accounting per unit via CPUAccounting=true and MemoryAccounting=true
// we only warn if accounting is not enabled for CPU or memory so as to not break local development flows where kubelet is launched in a terminal.
// for example, the cgroup for the user session will be something like /user.slice/user-X.slice/session-X.scope, but the cpu and memory
// cgroup will be the closest ancestor where accounting is performed (most likely /) on systems that launch docker containers.
// as a result, on those systems, you will not get cpu or memory accounting statistics for kubelet.
// in addition, you would not get memory or cpu accounting for the runtime unless accounting was enabled on its unit (or globally).
if systemd , found := cgs [ "name=systemd" ] ; found {
if systemd != cpu {
glog . Warningf ( "CPUAccounting not enabled for pid: %d" , pid )
}
if systemd != memory {
glog . Warningf ( "MemoryAccounting not enabled for pid: %d" , pid )
}
return systemd , nil
2015-12-11 13:25:35 +00:00
}
2016-05-31 20:02:00 +00:00
return cpu , nil
2015-05-12 16:58:40 +00:00
}
2015-05-19 22:52:12 +00:00
2015-08-18 23:21:28 +00:00
// Ensures the system container is created and all non-kernel threads and process 1
// without a container are moved to it.
//
// The reason of leaving kernel threads at root cgroup is that we don't want to tie the
// execution of these threads with to-be defined /system quota and create priority inversions.
//
2016-11-10 21:08:17 +00:00
func ensureSystemCgroups ( rootCgroupPath string , manager * fs . Manager ) error {
2015-05-19 22:52:12 +00:00
// Move non-kernel PIDs to the system container.
attemptsRemaining := 10
var errs [ ] error
for attemptsRemaining >= 0 {
// Only keep errors on latest attempt.
errs = [ ] error { }
attemptsRemaining --
2016-11-10 21:08:17 +00:00
allPids , err := cmutil . GetPids ( rootCgroupPath )
2015-05-19 22:52:12 +00:00
if err != nil {
2015-06-04 18:07:08 +00:00
errs = append ( errs , fmt . Errorf ( "failed to list PIDs for root: %v" , err ) )
2015-05-19 22:52:12 +00:00
continue
}
2016-01-06 23:36:48 +00:00
// Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers)
2015-05-19 22:52:12 +00:00
pids := make ( [ ] int , 0 , len ( allPids ) )
for _ , pid := range allPids {
2016-01-31 23:27:34 +00:00
if pid == 1 || isKernelPid ( pid ) {
2015-05-19 22:52:12 +00:00
continue
}
pids = append ( pids , pid )
}
2016-01-06 23:36:48 +00:00
glog . Infof ( "Found %d PIDs in root, %d of them are not to be moved" , len ( allPids ) , len ( allPids ) - len ( pids ) )
2015-05-19 22:52:12 +00:00
2016-01-06 23:36:48 +00:00
// Check if we have moved all the non-kernel PIDs.
2015-05-19 22:52:12 +00:00
if len ( pids ) == 0 {
break
}
2016-01-06 23:36:48 +00:00
glog . Infof ( "Moving non-kernel processes: %v" , pids )
2015-05-19 22:52:12 +00:00
for _ , pid := range pids {
2015-05-30 00:32:34 +00:00
err := manager . Apply ( pid )
2015-05-19 22:52:12 +00:00
if err != nil {
2015-05-30 00:32:34 +00:00
errs = append ( errs , fmt . Errorf ( "failed to move PID %d into the system container %q: %v" , pid , manager . Cgroups . Name , err ) )
2015-05-19 22:52:12 +00:00
}
}
}
if attemptsRemaining < 0 {
2015-05-30 00:32:34 +00:00
errs = append ( errs , fmt . Errorf ( "ran out of attempts to create system containers %q" , manager . Cgroups . Name ) )
2015-05-19 22:52:12 +00:00
}
2015-10-14 05:18:37 +00:00
return utilerrors . NewAggregate ( errs )
2015-05-19 22:52:12 +00:00
}
// isKernelPid determines whether the specified PID is a kernel PID. A PID is
// treated as a kernel PID when its /proc/<pid>/exe link cannot be read, since
// kernel threads have no associated executable.
func isKernelPid(pid int) bool {
	exeLink := "/proc/" + strconv.Itoa(pid) + "/exe"
	if _, err := os.Readlink(exeLink); err != nil {
		return true
	}
	return false
}
2016-05-27 00:27:00 +00:00
2017-04-04 18:48:09 +00:00
// Helper for getting the docker API version.
func getDockerAPIVersion ( cadvisor cadvisor . Interface ) * utilversion . Version {
2016-05-27 00:27:00 +00:00
versions , err := cadvisor . VersionInfo ( )
if err != nil {
glog . Errorf ( "Error requesting cAdvisor VersionInfo: %v" , err )
2017-04-04 18:48:09 +00:00
return utilversion . MustParseSemantic ( "0.0" )
2016-05-27 00:27:00 +00:00
}
2017-04-04 18:48:09 +00:00
dockerAPIVersion , err := utilversion . ParseGeneric ( versions . DockerAPIVersion )
2016-05-27 00:27:00 +00:00
if err != nil {
glog . Errorf ( "Error parsing docker version %q: %v" , versions . DockerVersion , err )
2017-04-04 18:48:09 +00:00
return utilversion . MustParseSemantic ( "0.0" )
2016-05-27 00:27:00 +00:00
}
2017-04-04 18:48:09 +00:00
return dockerAPIVersion
2016-05-27 00:27:00 +00:00
}
2017-02-10 05:14:10 +00:00
// GetCapacity returns the node capacity computed when the manager was created.
func (cm *containerManagerImpl) GetCapacity() v1.ResourceList {
	nodeCapacity := cm.capacity
	return nodeCapacity
}