// +build linux

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm
import (
	"errors"
	"fmt"
	"strings"
	"time"

	"github.com/golang/glog"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/kubernetes/pkg/kubelet/events"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
const (
	// defaultNodeAllocatableCgroupName is the default name of the cgroup
	// under which all pod cgroups are nested.
	defaultNodeAllocatableCgroupName = "kubepods"
)
// createNodeAllocatableCgroups creates the top-level node allocatable cgroup
// (rooted at cm.cgroupRoot) if it does not already exist. Returns any error
// from the cgroup manager.
func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
	name := CgroupName(cm.cgroupRoot)
	// Nothing to do if the cgroup already exists.
	if cm.cgroupManager.Exists(name) {
		return nil
	}
	cfg := &CgroupConfig{
		Name: name,
		// Apply limits derived from capacity up front: the default limits for
		// cpu shares can be very low which can lead to CPU starvation for pods.
		ResourceParameters: getCgroupConfig(cm.capacity),
	}
	if err := cm.cgroupManager.Create(cfg); err != nil {
		glog.Errorf("Failed to create %q cgroup", cm.cgroupRoot)
		return err
	}
	return nil
}
// Enforce Node Allocatable Cgroup settings.
func ( cm * containerManagerImpl ) enforceNodeAllocatableCgroups ( ) error {
nc := cm . NodeConfig . NodeAllocatableConfig
// We need to update limits on node allocatable cgroup no matter what because
// default cpu shares on cgroups are low and can cause cpu starvation.
nodeAllocatable := cm . capacity
// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
if cm . CgroupsPerQOS && nc . EnforceNodeAllocatable . Has ( NodeAllocatableEnforcementKey ) {
nodeAllocatable = cm . getNodeAllocatableAbsolute ( )
}
glog . V ( 4 ) . Infof ( "Attempting to enforce Node Allocatable with config: %+v" , nc )
cgroupConfig := & CgroupConfig {
Name : CgroupName ( cm . cgroupRoot ) ,
ResourceParameters : getCgroupConfig ( nodeAllocatable ) ,
}
2017-03-11 01:42:44 +00:00
// Using ObjectReference for events as the node maybe not cached; refer to #42701 for detail.
2017-07-15 05:25:54 +00:00
nodeRef := & v1 . ObjectReference {
2017-03-11 01:42:44 +00:00
Kind : "Node" ,
Name : cm . nodeInfo . Name ,
UID : types . UID ( cm . nodeInfo . Name ) ,
Namespace : "" ,
}
2017-02-10 05:14:10 +00:00
// If Node Allocatable is enforced on a node that has not been drained or is updated on an existing node to a lower value,
// existing memory usage across pods might be higher that current Node Allocatable Memory Limits.
// Pod Evictions are expected to bring down memory usage to below Node Allocatable limits.
// Until evictions happen retry cgroup updates.
// Update limits on non root cgroup-root to be safe since the default limits for CPU can be too low.
if cm . cgroupRoot != "/" {
go func ( ) {
for {
err := cm . cgroupManager . Update ( cgroupConfig )
if err == nil {
2017-03-11 01:42:44 +00:00
cm . recorder . Event ( nodeRef , v1 . EventTypeNormal , events . SuccessfulNodeAllocatableEnforcement , "Updated Node Allocatable limit across pods" )
2017-02-10 05:14:10 +00:00
return
}
message := fmt . Sprintf ( "Failed to update Node Allocatable Limits %q: %v" , cm . cgroupRoot , err )
2017-03-11 01:42:44 +00:00
cm . recorder . Event ( nodeRef , v1 . EventTypeWarning , events . FailedNodeAllocatableEnforcement , message )
2017-02-10 05:14:10 +00:00
time . Sleep ( time . Minute )
}
} ( )
}
// Now apply kube reserved and system reserved limits if required.
if nc . EnforceNodeAllocatable . Has ( SystemReservedEnforcementKey ) {
glog . V ( 2 ) . Infof ( "Enforcing System reserved on cgroup %q with limits: %+v" , nc . SystemReservedCgroupName , nc . SystemReserved )
if err := enforceExistingCgroup ( cm . cgroupManager , nc . SystemReservedCgroupName , nc . SystemReserved ) ; err != nil {
message := fmt . Sprintf ( "Failed to enforce System Reserved Cgroup Limits on %q: %v" , nc . SystemReservedCgroupName , err )
2017-03-11 01:42:44 +00:00
cm . recorder . Event ( nodeRef , v1 . EventTypeWarning , events . FailedNodeAllocatableEnforcement , message )
2017-02-10 05:14:10 +00:00
return fmt . Errorf ( message )
}
2017-03-11 01:42:44 +00:00
cm . recorder . Eventf ( nodeRef , v1 . EventTypeNormal , events . SuccessfulNodeAllocatableEnforcement , "Updated limits on system reserved cgroup %v" , nc . SystemReservedCgroupName )
2017-02-10 05:14:10 +00:00
}
if nc . EnforceNodeAllocatable . Has ( KubeReservedEnforcementKey ) {
glog . V ( 2 ) . Infof ( "Enforcing kube reserved on cgroup %q with limits: %+v" , nc . KubeReservedCgroupName , nc . KubeReserved )
if err := enforceExistingCgroup ( cm . cgroupManager , nc . KubeReservedCgroupName , nc . KubeReserved ) ; err != nil {
message := fmt . Sprintf ( "Failed to enforce Kube Reserved Cgroup Limits on %q: %v" , nc . KubeReservedCgroupName , err )
2017-03-11 01:42:44 +00:00
cm . recorder . Event ( nodeRef , v1 . EventTypeWarning , events . FailedNodeAllocatableEnforcement , message )
2017-02-10 05:14:10 +00:00
return fmt . Errorf ( message )
}
2017-03-11 01:42:44 +00:00
cm . recorder . Eventf ( nodeRef , v1 . EventTypeNormal , events . SuccessfulNodeAllocatableEnforcement , "Updated limits on kube reserved cgroup %v" , nc . KubeReservedCgroupName )
2017-02-10 05:14:10 +00:00
}
return nil
}
// enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
// Returns an error if the cgroup is missing or the limits cannot be derived or applied.
func enforceExistingCgroup(cgroupManager CgroupManager, cName string, rl v1.ResourceList) error {
	cgroupConfig := &CgroupConfig{
		Name:               CgroupName(cName),
		ResourceParameters: getCgroupConfig(rl),
	}
	// getCgroupConfig returns nil when rl is nil; the original code would
	// nil-deref on the log line below in that case.
	if cgroupConfig.ResourceParameters == nil {
		return fmt.Errorf("%q cgroup is not configured properly", cName)
	}
	// CpuShares/Memory are pointers and may individually be nil when the
	// ResourceList omits that resource; dereference for a meaningful log
	// (printing the pointers with %d would log addresses, not values).
	var cpuShares, memory interface{} = "none", "none"
	if p := cgroupConfig.ResourceParameters.CpuShares; p != nil {
		cpuShares = *p
	}
	if p := cgroupConfig.ResourceParameters.Memory; p != nil {
		memory = *p
	}
	glog.V(4).Infof("Enforcing limits on cgroup %q with %v cpu shares and %v bytes of memory", cName, cpuShares, memory)
	if !cgroupManager.Exists(cgroupConfig.Name) {
		return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name)
	}
	if err := cgroupManager.Update(cgroupConfig); err != nil {
		return err
	}
	return nil
}
// getCgroupConfig returns a ResourceConfig object that can be used to create
// or update cgroups via the CgroupManager interface. Returns nil when rl is nil.
func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
	// TODO(vishh): Set CPU Quota if necessary.
	if rl == nil {
		return nil
	}
	rc := &ResourceConfig{}
	if mem, ok := rl[v1.ResourceMemory]; ok {
		// Memory is defined in bytes.
		memBytes := mem.Value()
		rc.Memory = &memBytes
	}
	if cpu, ok := rl[v1.ResourceCPU]; ok {
		// CPU is defined in milli-cores; convert to cgroup cpu shares.
		shares := MilliCPUToShares(cpu.MilliValue())
		rc.CpuShares = &shares
	}
	return rc
}
// getNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
// Note that not all resources that are available on the node are included in the returned list of resources.
// Returns a ResourceList.
func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
	result := make(v1.ResourceList)
	for resourceName, quantity := range cm.capacity {
		// Work on a copy so the subtractions below do not mutate cm.capacity.
		allocatable := *(quantity.Copy())
		if reserved := cm.NodeConfig.SystemReserved; reserved != nil {
			allocatable.Sub(reserved[resourceName])
		}
		if reserved := cm.NodeConfig.KubeReserved; reserved != nil {
			allocatable.Sub(reserved[resourceName])
		}
		// Negative Allocatable resources don't make sense; clamp at zero.
		if allocatable.Sign() < 0 {
			allocatable.Set(0)
		}
		result[resourceName] = allocatable
	}
	return result
}
2017-05-25 19:29:19 +00:00
// GetNodeAllocatable returns amount of compute or storage resource that have to be reserved on this node from scheduling.
2017-02-10 05:14:10 +00:00
func ( cm * containerManagerImpl ) GetNodeAllocatableReservation ( ) v1 . ResourceList {
evictionReservation := hardEvictionReservation ( cm . HardEvictionThresholds , cm . capacity )
result := make ( v1 . ResourceList )
for k := range cm . capacity {
value := resource . NewQuantity ( 0 , resource . DecimalSI )
if cm . NodeConfig . SystemReserved != nil {
value . Add ( cm . NodeConfig . SystemReserved [ k ] )
}
if cm . NodeConfig . KubeReserved != nil {
value . Add ( cm . NodeConfig . KubeReserved [ k ] )
}
if evictionReservation != nil {
value . Add ( evictionReservation [ k ] )
}
if ! value . IsZero ( ) {
result [ k ] = * value
}
}
return result
}
// hardEvictionReservation returns a resourcelist that includes reservation of resources based on hard eviction thresholds.
func hardEvictionReservation ( thresholds [ ] evictionapi . Threshold , capacity v1 . ResourceList ) v1 . ResourceList {
if len ( thresholds ) == 0 {
return nil
}
ret := v1 . ResourceList { }
for _ , threshold := range thresholds {
if threshold . Operator != evictionapi . OpLessThan {
continue
}
switch threshold . Signal {
case evictionapi . SignalMemoryAvailable :
memoryCapacity := capacity [ v1 . ResourceMemory ]
value := evictionapi . GetThresholdQuantity ( threshold . Value , & memoryCapacity )
ret [ v1 . ResourceMemory ] = * value
2017-07-07 20:40:13 +00:00
case evictionapi . SignalNodeFsAvailable :
storageCapacity := capacity [ v1 . ResourceStorageScratch ]
value := evictionapi . GetThresholdQuantity ( threshold . Value , & storageCapacity )
ret [ v1 . ResourceStorageScratch ] = * value
2017-02-10 05:14:10 +00:00
}
}
return ret
}
// validateNodeAllocatable ensures that the user specified Node Allocatable Configuration doesn't reserve more than the node capacity.
// Returns error if the configuration is invalid, nil otherwise.
func (cm *containerManagerImpl) validateNodeAllocatable() error {
	var errs []string
	for resourceName, reservation := range cm.GetNodeAllocatableReservation() {
		capacity := cm.capacity[resourceName]
		// Work on a copy: Quantity arithmetic mutates the receiver.
		allocatable := *(capacity.Copy())
		allocatable.Sub(reservation)
		// The previous check only flagged reservations <= 0 and never consulted
		// capacity, contradicting this function's documented contract. A
		// negative allocatable means the reservations exceed node capacity.
		if allocatable.Sign() < 0 {
			errs = append(errs, fmt.Sprintf("Resource %q has a reservation of %v but capacity of %v. Expected capacity >= reservation.", resourceName, reservation, capacity))
		}
	}
	if len(errs) > 0 {
		return fmt.Errorf("Invalid Node Allocatable configuration. %s", strings.Join(errs, " "))
	}
	return nil
}