2015-05-12 16:58:40 +00:00
// +build linux
/ *
2016-06-03 00:25:58 +00:00
Copyright 2015 The Kubernetes Authors .
2015-05-12 16:58:40 +00:00
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
2015-10-10 00:09:53 +00:00
package cm
2015-05-12 16:58:40 +00:00
import (
"fmt"
2016-05-19 19:44:42 +00:00
"io/ioutil"
2015-05-12 16:58:40 +00:00
"os"
2016-03-04 00:37:09 +00:00
"path"
2015-05-12 16:58:40 +00:00
"strconv"
2016-02-16 20:52:40 +00:00
"sync"
2015-05-12 16:58:40 +00:00
"time"
2016-05-27 00:27:00 +00:00
"github.com/blang/semver"
2015-08-05 22:05:17 +00:00
"github.com/golang/glog"
2015-12-11 13:25:35 +00:00
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
2015-08-05 22:03:47 +00:00
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
2016-06-27 18:46:20 +00:00
"k8s.io/kubernetes/pkg/kubelet/qos"
2016-03-04 00:37:09 +00:00
"k8s.io/kubernetes/pkg/util"
2015-10-14 05:18:37 +00:00
utilerrors "k8s.io/kubernetes/pkg/util/errors"
2015-09-22 23:42:30 +00:00
"k8s.io/kubernetes/pkg/util/mount"
2015-08-04 00:28:33 +00:00
"k8s.io/kubernetes/pkg/util/oom"
2016-08-03 17:02:09 +00:00
"k8s.io/kubernetes/pkg/util/procfs"
2016-05-19 19:44:42 +00:00
"k8s.io/kubernetes/pkg/util/runtime"
2015-09-22 23:42:30 +00:00
"k8s.io/kubernetes/pkg/util/sets"
2015-10-05 17:28:53 +00:00
utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
2016-02-02 10:57:06 +00:00
"k8s.io/kubernetes/pkg/util/wait"
2015-05-12 16:58:40 +00:00
)
2015-06-17 18:36:27 +00:00
const (
	// DockerMemoryLimitThresholdPercent is the percentage of the machine
	// memory capacity used to calculate the hard memory limit of the docker
	// resource container. It works around a docker memory leakage issue;
	// see kubernetes/issues/9881 for more detail.
	DockerMemoryLimitThresholdPercent = 70

	// MinDockerMemoryLimit is the minimum memory limit allocated to the
	// docker container: 150Mi.
	MinDockerMemoryLimit = 150 << 20

	// Process names and pid files of the docker daemon and of
	// docker-containerd.
	dockerProcessName     = "docker"
	dockerPidFile         = "/var/run/docker.pid"
	containerdProcessName = "docker-containerd"
	containerdPidFile     = "/run/docker/libcontainerd/docker-containerd.pid"
)
var (
// The docker version in which containerd was introduced.
containerdVersion = semver . MustParse ( "1.11.0" )
2015-06-17 18:36:27 +00:00
)
2015-05-30 00:32:34 +00:00
// A non-user container tracked by the Kubelet.
type systemContainer struct {
// Absolute name of the container.
name string
// CPU limit in millicores.
cpuMillicores int64
// Function that ensures the state of the container.
// m is the cgroup manager for the specified container.
ensureStateFunc func ( m * fs . Manager ) error
2015-05-12 16:58:40 +00:00
2015-05-30 00:32:34 +00:00
// Manager for the cgroups of the external container.
manager * fs . Manager
}
2016-02-10 00:58:44 +00:00
func newSystemCgroups ( containerName string ) * systemContainer {
2015-05-30 00:32:34 +00:00
return & systemContainer {
name : containerName ,
manager : createManager ( containerName ) ,
}
}
2015-05-19 22:52:12 +00:00
2015-05-30 00:32:34 +00:00
type containerManagerImpl struct {
2016-02-16 20:52:40 +00:00
sync . RWMutex
2015-09-22 23:42:30 +00:00
cadvisorInterface cadvisor . Interface
mountUtil mount . Interface
2015-10-10 00:09:53 +00:00
NodeConfig
2016-03-04 00:37:09 +00:00
status Status
2015-05-30 00:32:34 +00:00
// External containers being managed.
systemContainers [ ] * systemContainer
2016-06-27 18:46:20 +00:00
qosContainers QOSContainersInfo
2016-02-16 20:52:40 +00:00
periodicTasks [ ] func ( )
2016-06-27 18:46:20 +00:00
// holds all the mounted cgroup subsystems
2016-07-13 04:39:22 +00:00
subsystems * CgroupSubsystems
nodeInfo * api . Node
2015-05-12 16:58:40 +00:00
}
2016-03-04 00:37:09 +00:00
// features records optional capabilities detected on the node.
type features struct {
	// cpuHardcapping is set when both cpu.cfs_period_us and cpu.cfs_quota_us
	// were found under the cpu cgroup mount point.
	cpuHardcapping bool
}
2015-10-10 00:09:53 +00:00
// Compile-time check that *containerManagerImpl satisfies ContainerManager.
var _ ContainerManager = (*containerManagerImpl)(nil)
2015-05-12 16:58:40 +00:00
2015-09-22 23:42:30 +00:00
// checks if the required cgroups subsystems are mounted.
// As of now, only 'cpu' and 'memory' are required.
2016-03-04 00:37:09 +00:00
// cpu quota is a soft requirement.
func validateSystemRequirements ( mountUtil mount . Interface ) ( features , error ) {
2015-09-22 23:42:30 +00:00
const (
cgroupMountType = "cgroup"
localErr = "system validation failed"
)
2016-03-04 00:37:09 +00:00
var (
cpuMountPoint string
f features
)
2015-09-22 23:42:30 +00:00
mountPoints , err := mountUtil . List ( )
if err != nil {
2016-03-04 00:37:09 +00:00
return f , fmt . Errorf ( "%s - %v" , localErr , err )
2015-09-22 23:42:30 +00:00
}
2016-03-04 00:37:09 +00:00
2015-09-22 23:42:30 +00:00
expectedCgroups := sets . NewString ( "cpu" , "cpuacct" , "cpuset" , "memory" )
for _ , mountPoint := range mountPoints {
if mountPoint . Type == cgroupMountType {
for _ , opt := range mountPoint . Opts {
if expectedCgroups . Has ( opt ) {
expectedCgroups . Delete ( opt )
}
2016-03-04 00:37:09 +00:00
if opt == "cpu" {
cpuMountPoint = mountPoint . Path
}
2015-09-22 23:42:30 +00:00
}
}
}
if expectedCgroups . Len ( ) > 0 {
2016-03-04 00:37:09 +00:00
return f , fmt . Errorf ( "%s - Following Cgroup subsystem not mounted: %v" , localErr , expectedCgroups . List ( ) )
2015-09-22 23:42:30 +00:00
}
2016-03-04 00:37:09 +00:00
// Check if cpu quota is available.
// CPU cgroup is required and so it expected to be mounted at this point.
periodExists , err := util . FileExists ( path . Join ( cpuMountPoint , "cpu.cfs_period_us" ) )
if err != nil {
glog . Errorf ( "failed to detect if CPU cgroup cpu.cfs_period_us is available - %v" , err )
}
quotaExists , err := util . FileExists ( path . Join ( cpuMountPoint , "cpu.cfs_quota_us" ) )
if err != nil {
glog . Errorf ( "failed to detect if CPU cgroup cpu.cfs_quota_us is available - %v" , err )
}
if quotaExists && periodExists {
f . cpuHardcapping = true
}
return f , nil
2015-09-22 23:42:30 +00:00
}
2015-05-30 00:32:34 +00:00
// TODO(vmarmol): Add limits to the system containers.
2015-05-19 22:52:12 +00:00
// Takes the absolute name of the specified containers.
// Empty container name disables use of the specified container.
2016-02-05 01:49:17 +00:00
func NewContainerManager ( mountUtil mount . Interface , cadvisorInterface cadvisor . Interface , nodeConfig NodeConfig ) ( ContainerManager , error ) {
2016-06-27 18:46:20 +00:00
// Check if Cgroup-root actually exists on the node
if nodeConfig . CgroupsPerQOS {
if nodeConfig . CgroupRoot == "" {
return nil , fmt . Errorf ( "invalid configuration: cgroups-per-qos was specified and cgroup-root was not specified. To enable the QoS cgroup hierarchy you need to specify a valid cgroup-root" )
}
if _ , err := os . Stat ( nodeConfig . CgroupRoot ) ; err != nil {
return nil , fmt . Errorf ( "invalid configuration: cgroup-root doesn't exist : %v" , err )
}
}
2016-07-13 04:39:22 +00:00
subsystems , err := GetCgroupSubsystems ( )
2016-06-27 18:46:20 +00:00
if err != nil {
2016-07-13 04:39:22 +00:00
return nil , fmt . Errorf ( "failed to get mounted cgroup subsystems: %v" , err )
2016-06-27 18:46:20 +00:00
}
2015-09-22 23:42:30 +00:00
return & containerManagerImpl {
cadvisorInterface : cadvisorInterface ,
mountUtil : mountUtil ,
2016-02-05 01:49:17 +00:00
NodeConfig : nodeConfig ,
2016-06-27 18:46:20 +00:00
subsystems : subsystems ,
2015-09-22 23:42:30 +00:00
} , nil
}
2016-07-13 04:39:22 +00:00
// NewPodContainerManager is a factory method that returns a
// PodContainerManager. With CgroupsPerQOS disabled it returns a no-op manager
// that only carries the configured cgroup root; otherwise it returns the
// general pod container manager wired to this manager's QoS containers, cached
// node info and mounted subsystems.
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
	if !cm.NodeConfig.CgroupsPerQOS {
		return &podContainerManagerNoop{
			cgroupRoot: cm.NodeConfig.CgroupRoot,
		}
	}

	return &podContainerManagerImpl{
		qosContainersInfo: cm.qosContainers,
		nodeInfo:          cm.nodeInfo,
		subsystems:        cm.subsystems,
		cgroupManager:     NewCgroupManager(cm.subsystems),
	}
}
2015-09-22 23:42:30 +00:00
// Create a cgroup container manager.
func createManager ( containerName string ) * fs . Manager {
2016-08-02 18:04:17 +00:00
allowAllDevices := true
2015-09-22 23:42:30 +00:00
return & fs . Manager {
Cgroups : & configs . Cgroup {
2016-01-06 23:36:48 +00:00
Parent : "/" ,
Name : containerName ,
Resources : & configs . Resources {
2016-08-02 18:04:17 +00:00
AllowAllDevices : & allowAllDevices ,
2016-01-06 23:36:48 +00:00
} ,
2015-09-22 23:42:30 +00:00
} ,
}
}
2015-05-30 00:32:34 +00:00
2015-10-09 19:39:30 +00:00
// KernelTunableBehavior names what to do when a kernel tunable flag does not
// hold the expected value.
type KernelTunableBehavior string

// The supported behaviors: warn about the mismatch, report it as an error, or
// modify the flag.
const (
	KernelTunableWarn   = KernelTunableBehavior("warn")
	KernelTunableError  = KernelTunableBehavior("error")
	KernelTunableModify = KernelTunableBehavior("modify")
)
2016-06-27 18:46:20 +00:00
// InitQOS creates the top level qos cgroup containers
// We create top level QoS containers for only Burstable and Best Effort
// and not Guaranteed QoS class. All guaranteed pods are nested under the
// RootContainer by default. InitQOS is called only once during kubelet bootstrapping.
// TODO(@dubstack) Add support for cgroup-root to work on both systemd and cgroupfs
// drivers. Currently we only support systems running cgroupfs driver
2016-07-13 04:39:22 +00:00
func InitQOS ( rootContainer string , subsystems * CgroupSubsystems ) ( QOSContainersInfo , error ) {
2016-06-27 18:46:20 +00:00
cm := NewCgroupManager ( subsystems )
// Top level for Qos containers are created only for Burstable
// and Best Effort classes
qosClasses := [ 2 ] qos . QOSClass { qos . Burstable , qos . BestEffort }
// Create containers for both qos classes
for _ , qosClass := range qosClasses {
// get the container's absolute name
absoluteContainerName := path . Join ( rootContainer , string ( qosClass ) )
// containerConfig object stores the cgroup specifications
containerConfig := & CgroupConfig {
Name : absoluteContainerName ,
ResourceParameters : & ResourceConfig { } ,
}
// TODO(@dubstack) Add support on systemd cgroups driver
if err := cm . Create ( containerConfig ) ; err != nil {
return QOSContainersInfo { } , fmt . Errorf ( "failed to create top level %v QOS cgroup : %v" , qosClass , err )
}
}
// Store the top level qos container names
qosContainersInfo := QOSContainersInfo {
Guaranteed : rootContainer ,
Burstable : path . Join ( rootContainer , string ( qos . Burstable ) ) ,
BestEffort : path . Join ( rootContainer , string ( qos . BestEffort ) ) ,
}
return qosContainersInfo , nil
}
2015-10-09 19:39:30 +00:00
// setupKernelTunables validates kernel tunable flags are set as expected
// depending upon the specified option, it will either warn, error, or modify the kernel tunable flags
func setupKernelTunables ( option KernelTunableBehavior ) error {
desiredState := map [ string ] int {
utilsysctl . VmOvercommitMemory : utilsysctl . VmOvercommitMemoryAlways ,
utilsysctl . VmPanicOnOOM : utilsysctl . VmPanicOnOOMInvokeOOMKiller ,
2015-11-13 23:47:25 +00:00
utilsysctl . KernelPanic : utilsysctl . KernelPanicRebootTimeout ,
utilsysctl . KernelPanicOnOops : utilsysctl . KernelPanicOnOopsAlways ,
2015-10-05 17:28:53 +00:00
}
2015-10-09 19:39:30 +00:00
2016-08-10 15:38:44 +00:00
sysctl := utilsysctl . New ( )
2015-10-09 19:39:30 +00:00
errList := [ ] error { }
for flag , expectedValue := range desiredState {
2016-08-10 15:38:44 +00:00
val , err := sysctl . GetSysctl ( flag )
2015-10-09 19:39:30 +00:00
if err != nil {
errList = append ( errList , err )
continue
}
if val == expectedValue {
continue
}
switch option {
case KernelTunableError :
errList = append ( errList , fmt . Errorf ( "Invalid kernel flag: %v, expected value: %v, actual value: %v" , flag , expectedValue , val ) )
case KernelTunableWarn :
glog . V ( 2 ) . Infof ( "Invalid kernel flag: %v, expected value: %v, actual value: %v" , flag , expectedValue , val )
case KernelTunableModify :
glog . V ( 2 ) . Infof ( "Updating kernel flag: %v, expected value: %v, actual value: %v" , flag , expectedValue , val )
2016-08-10 15:38:44 +00:00
err = sysctl . SetSysctl ( flag , expectedValue )
2015-10-09 19:39:30 +00:00
if err != nil {
errList = append ( errList , err )
}
}
2015-10-05 17:28:53 +00:00
}
2015-10-14 05:18:37 +00:00
return utilerrors . NewAggregate ( errList )
2015-10-05 17:28:53 +00:00
}
2015-09-22 23:42:30 +00:00
func ( cm * containerManagerImpl ) setupNode ( ) error {
2016-03-04 00:37:09 +00:00
f , err := validateSystemRequirements ( cm . mountUtil )
if err != nil {
2015-09-22 23:42:30 +00:00
return err
}
2016-03-04 00:37:09 +00:00
if ! f . cpuHardcapping {
cm . status . SoftRequirements = fmt . Errorf ( "CPU hardcapping unsupported" )
}
2016-07-26 10:21:17 +00:00
b := KernelTunableModify
if cm . GetNodeConfig ( ) . ProtectKernelDefaults {
b = KernelTunableError
}
if err := setupKernelTunables ( b ) ; err != nil {
2015-10-05 17:28:53 +00:00
return err
}
2016-06-27 18:46:20 +00:00
// Setup top level qos containers only if CgroupsPerQOS flag is specified as true
if cm . NodeConfig . CgroupsPerQOS {
qosContainersInfo , err := InitQOS ( cm . NodeConfig . CgroupRoot , cm . subsystems )
if err != nil {
return fmt . Errorf ( "failed to initialise top level QOS containers: %v" , err )
}
cm . qosContainers = qosContainersInfo
}
2015-09-22 23:42:30 +00:00
systemContainers := [ ] * systemContainer { }
2016-02-05 01:49:17 +00:00
if cm . ContainerRuntime == "docker" {
2016-02-10 00:58:44 +00:00
if cm . RuntimeCgroupsName != "" {
cont := newSystemCgroups ( cm . RuntimeCgroupsName )
2016-02-05 01:49:17 +00:00
info , err := cm . cadvisorInterface . MachineInfo ( )
var capacity = api . ResourceList { }
if err != nil {
} else {
capacity = cadvisor . CapacityFromMachineInfo ( info )
}
memoryLimit := ( int64 ( capacity . Memory ( ) . Value ( ) * DockerMemoryLimitThresholdPercent / 100 ) )
if memoryLimit < MinDockerMemoryLimit {
2016-02-10 00:58:44 +00:00
glog . Warningf ( "Memory limit %d for container %s is too small, reset it to %d" , memoryLimit , cm . RuntimeCgroupsName , MinDockerMemoryLimit )
2016-02-05 01:49:17 +00:00
memoryLimit = MinDockerMemoryLimit
}
2015-06-17 18:36:27 +00:00
2016-02-10 00:58:44 +00:00
glog . V ( 2 ) . Infof ( "Configure resource-only container %s with memory limit: %d" , cm . RuntimeCgroupsName , memoryLimit )
2016-08-02 18:04:17 +00:00
allowAllDevices := true
2016-02-05 01:49:17 +00:00
dockerContainer := & fs . Manager {
Cgroups : & configs . Cgroup {
Parent : "/" ,
2016-02-10 00:58:44 +00:00
Name : cm . RuntimeCgroupsName ,
2016-02-05 01:49:17 +00:00
Resources : & configs . Resources {
Memory : memoryLimit ,
MemorySwap : - 1 ,
2016-08-02 18:04:17 +00:00
AllowAllDevices : & allowAllDevices ,
2016-02-05 01:49:17 +00:00
} ,
} ,
}
2016-05-27 00:27:00 +00:00
dockerVersion := getDockerVersion ( cm . cadvisorInterface )
2016-02-05 01:49:17 +00:00
cont . ensureStateFunc = func ( manager * fs . Manager ) error {
2016-05-27 00:27:00 +00:00
return ensureDockerInContainer ( dockerVersion , - 900 , dockerContainer )
2016-02-05 01:49:17 +00:00
}
systemContainers = append ( systemContainers , cont )
2015-06-17 18:36:27 +00:00
} else {
2016-02-16 20:52:40 +00:00
cm . periodicTasks = append ( cm . periodicTasks , func ( ) {
2016-05-19 19:44:42 +00:00
cont , err := getContainerNameForProcess ( dockerProcessName , dockerPidFile )
2016-02-16 20:52:40 +00:00
if err != nil {
glog . Error ( err )
return
}
2016-05-19 19:44:42 +00:00
glog . V ( 2 ) . Infof ( "Discovered runtime cgroups name: %s" , cont )
2016-02-16 20:52:40 +00:00
cm . Lock ( )
defer cm . Unlock ( )
2016-02-10 00:58:44 +00:00
cm . RuntimeCgroupsName = cont
2016-02-16 20:52:40 +00:00
} )
2015-06-17 19:51:11 +00:00
}
2016-02-05 01:49:17 +00:00
}
2015-06-17 19:51:11 +00:00
2016-02-10 00:58:44 +00:00
if cm . SystemCgroupsName != "" {
if cm . SystemCgroupsName == "/" {
2016-02-05 01:49:17 +00:00
return fmt . Errorf ( "system container cannot be root (\"/\")" )
}
2016-02-10 00:58:44 +00:00
cont := newSystemCgroups ( cm . SystemCgroupsName )
2016-02-05 01:49:17 +00:00
rootContainer := & fs . Manager {
2015-06-17 18:36:27 +00:00
Cgroups : & configs . Cgroup {
2016-01-06 23:36:48 +00:00
Parent : "/" ,
2016-02-05 01:49:17 +00:00
Name : "/" ,
2015-06-17 18:36:27 +00:00
} ,
}
2015-05-30 00:32:34 +00:00
cont . ensureStateFunc = func ( manager * fs . Manager ) error {
2016-02-10 00:58:44 +00:00
return ensureSystemCgroups ( rootContainer , manager )
2015-05-30 00:32:34 +00:00
}
systemContainers = append ( systemContainers , cont )
2015-05-19 22:52:12 +00:00
}
2016-02-10 00:58:44 +00:00
if cm . KubeletCgroupsName != "" {
cont := newSystemCgroups ( cm . KubeletCgroupsName )
2016-08-02 18:04:17 +00:00
allowAllDevices := true
2016-02-05 01:49:17 +00:00
manager := fs . Manager {
2015-05-19 22:52:12 +00:00
Cgroups : & configs . Cgroup {
2016-01-06 23:36:48 +00:00
Parent : "/" ,
2016-02-10 00:58:44 +00:00
Name : cm . KubeletCgroupsName ,
2016-02-05 01:49:17 +00:00
Resources : & configs . Resources {
2016-08-02 18:04:17 +00:00
AllowAllDevices : & allowAllDevices ,
2016-02-05 01:49:17 +00:00
} ,
2015-05-19 22:52:12 +00:00
} ,
2015-05-30 00:32:34 +00:00
}
2016-02-05 01:49:17 +00:00
cont . ensureStateFunc = func ( _ * fs . Manager ) error {
return manager . Apply ( os . Getpid ( ) )
}
systemContainers = append ( systemContainers , cont )
} else {
2016-02-16 20:52:40 +00:00
cm . periodicTasks = append ( cm . periodicTasks , func ( ) {
cont , err := getContainer ( os . Getpid ( ) )
if err != nil {
2016-03-23 00:26:50 +00:00
glog . Errorf ( "failed to find cgroups of kubelet - %v" , err )
2016-02-16 20:52:40 +00:00
return
}
cm . Lock ( )
defer cm . Unlock ( )
2016-02-10 00:58:44 +00:00
cm . KubeletCgroupsName = cont
2016-02-16 20:52:40 +00:00
} )
2015-05-30 00:32:34 +00:00
}
2015-09-22 23:42:30 +00:00
cm . systemContainers = systemContainers
return nil
2015-05-30 00:32:34 +00:00
}
2016-05-19 19:44:42 +00:00
func getContainerNameForProcess ( name , pidFile string ) ( string , error ) {
pids , err := getPidsForProcess ( name , pidFile )
2016-02-05 01:49:17 +00:00
if err != nil {
return "" , fmt . Errorf ( "failed to detect process id for %q - %v" , name , err )
}
if len ( pids ) == 0 {
return "" , nil
}
cont , err := getContainer ( pids [ 0 ] )
if err != nil {
return "" , err
}
return cont , nil
}
2015-10-10 00:09:53 +00:00
2016-02-05 01:49:17 +00:00
func ( cm * containerManagerImpl ) GetNodeConfig ( ) NodeConfig {
2016-02-16 20:52:40 +00:00
cm . RLock ( )
defer cm . RUnlock ( )
2016-02-05 01:49:17 +00:00
return cm . NodeConfig
}
2016-07-13 04:39:22 +00:00
// GetMountedSubsystems returns the mounted cgroup subsystems held by the
// manager. No lock is taken.
func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
	mounted := cm.subsystems
	return mounted
}
// GetQOSContainersInfo returns the top level QoS container names held by the
// manager. No lock is taken.
func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
	info := cm.qosContainers
	return info
}
2016-03-04 00:37:09 +00:00
// Status returns a copy of the manager's status, taken while holding the read
// lock.
func (cm *containerManagerImpl) Status() Status {
	cm.RLock()
	current := cm.status
	cm.RUnlock()
	return current
}
2016-07-13 04:39:22 +00:00
func ( cm * containerManagerImpl ) Start ( node * api . Node ) error {
// cache the node Info including resource capacity and
// allocatable of the node
cm . nodeInfo = node
2015-09-22 23:42:30 +00:00
// Setup the node
if err := cm . setupNode ( ) ; err != nil {
return err
}
2015-05-30 00:32:34 +00:00
// Don't run a background thread if there are no ensureStateFuncs.
numEnsureStateFuncs := 0
for _ , cont := range cm . systemContainers {
if cont . ensureStateFunc != nil {
numEnsureStateFuncs ++
2015-05-19 22:52:12 +00:00
}
}
2016-03-02 22:18:33 +00:00
if numEnsureStateFuncs >= 0 {
2016-05-27 00:27:00 +00:00
// Run ensure state functions every minute.
2016-03-02 22:18:33 +00:00
go wait . Until ( func ( ) {
for _ , cont := range cm . systemContainers {
if cont . ensureStateFunc != nil {
if err := cont . ensureStateFunc ( cont . manager ) ; err != nil {
glog . Warningf ( "[ContainerManager] Failed to ensure state of %q: %v" , cont . name , err )
}
}
}
} , time . Minute , wait . NeverStop )
2015-05-12 16:58:40 +00:00
}
2015-05-30 00:32:34 +00:00
2016-03-02 22:18:33 +00:00
if len ( cm . periodicTasks ) > 0 {
go wait . Until ( func ( ) {
for _ , task := range cm . periodicTasks {
if task != nil {
task ( )
2015-06-04 18:07:08 +00:00
}
2015-05-30 00:32:34 +00:00
}
2016-03-02 22:18:33 +00:00
} , 5 * time . Minute , wait . NeverStop )
}
2016-02-16 20:52:40 +00:00
2015-05-12 16:58:40 +00:00
return nil
}
2016-02-10 00:58:44 +00:00
func ( cm * containerManagerImpl ) SystemCgroupsLimit ( ) api . ResourceList {
2015-05-30 00:32:34 +00:00
cpuLimit := int64 ( 0 )
// Sum up resources of all external containers.
for _ , cont := range cm . systemContainers {
cpuLimit += cont . cpuMillicores
}
return api . ResourceList {
api . ResourceCPU : * resource . NewMilliQuantity (
cpuLimit ,
resource . DecimalSI ) ,
}
}
2016-02-04 21:34:56 +00:00
// isProcessRunningInHost reports whether pid shares its mount namespace with
// the init process (pid 1), by comparing the targets of the two
// /proc/<pid>/ns/mnt links.
//
// An error is returned when either link cannot be read; the message names the
// process concerned and includes the underlying cause.
func isProcessRunningInHost(pid int) (bool, error) {
	// Get init mount namespace. Mount namespace is unique for all containers.
	initMntNs, err := os.Readlink("/proc/1/ns/mnt")
	if err != nil {
		return false, fmt.Errorf("failed to find mount namespace of init process: %v", err)
	}
	processMntNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/mnt", pid))
	if err != nil {
		// %d, not %q: %q on an int prints a quoted character literal.
		return false, fmt.Errorf("failed to find mount namespace of process %d: %v", pid, err)
	}
	return initMntNs == processMntNs, nil
}
2016-05-19 19:44:42 +00:00
// getPidFromPidFile reads pidFile and parses its entire contents as a decimal
// pid. It returns an error when the file cannot be opened or read, or when
// the contents are not a number.
func getPidFromPidFile(pidFile string) (int, error) {
	f, openErr := os.Open(pidFile)
	if openErr != nil {
		return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, openErr)
	}
	defer f.Close()

	contents, readErr := ioutil.ReadAll(f)
	if readErr != nil {
		return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, readErr)
	}

	text := string(contents)
	pid, parseErr := strconv.Atoi(text)
	if parseErr != nil {
		return 0, fmt.Errorf("error parsing %s as a number: %v", text, parseErr)
	}
	return pid, nil
}
func getPidsForProcess ( name , pidFile string ) ( [ ] int , error ) {
if len ( pidFile ) > 0 {
if pid , err := getPidFromPidFile ( pidFile ) ; err == nil {
return [ ] int { pid } , nil
} else {
// log the error and fall back to pidof
runtime . HandleError ( err )
}
}
2016-08-04 16:58:04 +00:00
return procfs . PidOf ( name )
2016-02-05 01:49:17 +00:00
}
2015-05-12 16:58:40 +00:00
2016-02-05 01:49:17 +00:00
// Ensures that the Docker daemon is in the desired container.
2016-05-27 00:27:00 +00:00
func ensureDockerInContainer ( dockerVersion semver . Version , oomScoreAdj int , manager * fs . Manager ) error {
type process struct { name , file string }
dockerProcs := [ ] process { { dockerProcessName , dockerPidFile } }
if dockerVersion . GTE ( containerdVersion ) {
dockerProcs = append ( dockerProcs , process { containerdProcessName , containerdPidFile } )
2016-02-05 01:49:17 +00:00
}
2016-02-04 21:34:56 +00:00
2016-05-27 00:27:00 +00:00
var errs [ ] error
for _ , proc := range dockerProcs {
pids , err := getPidsForProcess ( proc . name , proc . file )
2015-05-12 16:58:40 +00:00
if err != nil {
2016-05-27 00:27:00 +00:00
errs = append ( errs , fmt . Errorf ( "failed to get pids for %q: %v" , proc . name , err ) )
continue
2015-05-12 16:58:40 +00:00
}
2016-05-27 00:27:00 +00:00
// Move if the pid is not already in the desired container.
for _ , pid := range pids {
if err := ensureProcessInContainer ( pid , oomScoreAdj , manager ) ; err != nil {
errs = append ( errs , fmt . Errorf ( "errors moving %q pid: %v" , proc . name , err ) )
2015-05-12 16:58:40 +00:00
}
}
2016-05-27 00:27:00 +00:00
}
return utilerrors . NewAggregate ( errs )
}
2015-05-14 21:40:20 +00:00
2016-05-27 00:27:00 +00:00
func ensureProcessInContainer ( pid int , oomScoreAdj int , manager * fs . Manager ) error {
if runningInHost , err := isProcessRunningInHost ( pid ) ; err != nil {
// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
return err
} else if ! runningInHost {
// Process is running inside a container. Don't touch that.
return nil
}
var errs [ ] error
cont , err := getContainer ( pid )
if err != nil {
errs = append ( errs , fmt . Errorf ( "failed to find container of PID %d: %v" , pid , err ) )
}
if cont != manager . Cgroups . Name {
err = manager . Apply ( pid )
if err != nil {
errs = append ( errs , fmt . Errorf ( "failed to move PID %d (in %q) to %q" , pid , cont , manager . Cgroups . Name ) )
2015-05-14 21:40:20 +00:00
}
2015-05-12 16:58:40 +00:00
}
2016-05-27 00:27:00 +00:00
// Also apply oom-score-adj to processes
oomAdjuster := oom . NewOOMAdjuster ( )
if err := oomAdjuster . ApplyOOMScoreAdj ( pid , oomScoreAdj ) ; err != nil {
errs = append ( errs , fmt . Errorf ( "failed to apply oom score %d to PID %d" , oomScoreAdj , pid ) )
}
2015-10-14 05:18:37 +00:00
return utilerrors . NewAggregate ( errs )
2015-05-12 16:58:40 +00:00
}
2016-05-31 20:02:00 +00:00
// getContainer returns the cgroup associated with the specified pid.
// It enforces a unified hierarchy for memory and cpu cgroups.
// On systemd environments, it uses the name=systemd cgroup for the specified pid.
2015-05-12 16:58:40 +00:00
func getContainer ( pid int ) ( string , error ) {
2015-12-11 13:25:35 +00:00
cgs , err := cgroups . ParseCgroupFile ( fmt . Sprintf ( "/proc/%d/cgroup" , pid ) )
2015-05-12 16:58:40 +00:00
if err != nil {
return "" , err
}
2016-05-31 20:02:00 +00:00
cpu , found := cgs [ "cpu" ]
if ! found {
return "" , cgroups . NewNotFoundError ( "cpu" )
}
memory , found := cgs [ "memory" ]
if ! found {
return "" , cgroups . NewNotFoundError ( "memory" )
}
// since we use this container for accounting, we need to ensure its a unified hierarchy.
if cpu != memory {
return "" , fmt . Errorf ( "cpu and memory cgroup hierarchy not unified. cpu: %s, memory: %s" , cpu , memory )
}
// on systemd, every pid is in a unified cgroup hierarchy (name=systemd as seen in systemd-cgls)
// cpu and memory accounting is off by default, users may choose to enable it per unit or globally.
// users could enable CPU and memory accounting globally via /etc/systemd/system.conf (DefaultCPUAccounting=true DefaultMemoryAccounting=true).
// users could also enable CPU and memory accounting per unit via CPUAccounting=true and MemoryAccounting=true
// we only warn if accounting is not enabled for CPU or memory so as to not break local development flows where kubelet is launched in a terminal.
// for example, the cgroup for the user session will be something like /user.slice/user-X.slice/session-X.scope, but the cpu and memory
// cgroup will be the closest ancestor where accounting is performed (most likely /) on systems that launch docker containers.
// as a result, on those systems, you will not get cpu or memory accounting statistics for kubelet.
// in addition, you would not get memory or cpu accounting for the runtime unless accounting was enabled on its unit (or globally).
if systemd , found := cgs [ "name=systemd" ] ; found {
if systemd != cpu {
glog . Warningf ( "CPUAccounting not enabled for pid: %d" , pid )
}
if systemd != memory {
glog . Warningf ( "MemoryAccounting not enabled for pid: %d" , pid )
}
return systemd , nil
2015-12-11 13:25:35 +00:00
}
2016-05-31 20:02:00 +00:00
return cpu , nil
2015-05-12 16:58:40 +00:00
}
2015-05-19 22:52:12 +00:00
2015-08-18 23:21:28 +00:00
// Ensures the system container is created and all non-kernel threads and process 1
// without a container are moved to it.
//
// The reason of leaving kernel threads at root cgroup is that we don't want to tie the
// execution of these threads with to-be defined /system quota and create priority inversions.
//
2016-02-10 00:58:44 +00:00
func ensureSystemCgroups ( rootContainer * fs . Manager , manager * fs . Manager ) error {
2015-05-19 22:52:12 +00:00
// Move non-kernel PIDs to the system container.
attemptsRemaining := 10
var errs [ ] error
for attemptsRemaining >= 0 {
// Only keep errors on latest attempt.
errs = [ ] error { }
attemptsRemaining --
2015-05-30 00:32:34 +00:00
allPids , err := rootContainer . GetPids ( )
2015-05-19 22:52:12 +00:00
if err != nil {
2015-06-04 18:07:08 +00:00
errs = append ( errs , fmt . Errorf ( "failed to list PIDs for root: %v" , err ) )
2015-05-19 22:52:12 +00:00
continue
}
2016-01-06 23:36:48 +00:00
// Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers)
2015-05-19 22:52:12 +00:00
pids := make ( [ ] int , 0 , len ( allPids ) )
for _ , pid := range allPids {
2016-01-31 23:27:34 +00:00
if pid == 1 || isKernelPid ( pid ) {
2015-05-19 22:52:12 +00:00
continue
}
pids = append ( pids , pid )
}
2016-01-06 23:36:48 +00:00
glog . Infof ( "Found %d PIDs in root, %d of them are not to be moved" , len ( allPids ) , len ( allPids ) - len ( pids ) )
2015-05-19 22:52:12 +00:00
2016-01-06 23:36:48 +00:00
// Check if we have moved all the non-kernel PIDs.
2015-05-19 22:52:12 +00:00
if len ( pids ) == 0 {
break
}
2016-01-06 23:36:48 +00:00
glog . Infof ( "Moving non-kernel processes: %v" , pids )
2015-05-19 22:52:12 +00:00
for _ , pid := range pids {
2015-05-30 00:32:34 +00:00
err := manager . Apply ( pid )
2015-05-19 22:52:12 +00:00
if err != nil {
2015-05-30 00:32:34 +00:00
errs = append ( errs , fmt . Errorf ( "failed to move PID %d into the system container %q: %v" , pid , manager . Cgroups . Name , err ) )
2015-05-19 22:52:12 +00:00
}
}
}
if attemptsRemaining < 0 {
2015-05-30 00:32:34 +00:00
errs = append ( errs , fmt . Errorf ( "ran out of attempts to create system containers %q" , manager . Cgroups . Name ) )
2015-05-19 22:52:12 +00:00
}
2015-10-14 05:18:37 +00:00
return utilerrors . NewAggregate ( errs )
2015-05-19 22:52:12 +00:00
}
// isKernelPid determines whether the specified PID is a kernel PID. A PID is
// treated as a kernel PID when its /proc/<pid>/exe link cannot be read.
func isKernelPid(pid int) bool {
	exeLink := fmt.Sprintf("/proc/%d/exe", pid)
	// Kernel threads have no associated executable.
	if _, err := os.Readlink(exeLink); err != nil {
		return true
	}
	return false
}
2016-05-27 00:27:00 +00:00
// getDockerVersion is a helper for getting the docker version from cAdvisor's
// version info. When the version info cannot be requested or the docker
// version cannot be parsed, the error is logged and the zero-value
// semver.Version is returned.
func getDockerVersion(cadvisor cadvisor.Interface) semver.Version {
	versions, err := cadvisor.VersionInfo()
	if err != nil {
		glog.Errorf("Error requesting cAdvisor VersionInfo: %v", err)
		return semver.Version{}
	}

	parsed, err := semver.Parse(versions.DockerVersion)
	if err != nil {
		glog.Errorf("Error parsing docker version %q: %v", versions.DockerVersion, err)
		return semver.Version{}
	}
	return parsed
}