/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package node

import (
	"fmt"
	"net"
	"sync"
	"time"

	"github.com/golang/glog"

	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"

	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/flowcontrol"

	"k8s.io/api/core/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	extensionsinformers "k8s.io/client-go/informers/extensions/v1beta1"
	clientset "k8s.io/client-go/kubernetes"
	corelisters "k8s.io/client-go/listers/core/v1"
	extensionslisters "k8s.io/client-go/listers/extensions/v1beta1"
	v1node "k8s.io/kubernetes/pkg/api/v1/node"
	"k8s.io/kubernetes/pkg/cloudprovider"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/node/ipam"
	nodesync "k8s.io/kubernetes/pkg/controller/node/ipam/sync"
	"k8s.io/kubernetes/pkg/controller/node/scheduler"
	"k8s.io/kubernetes/pkg/controller/node/util"
	"k8s.io/kubernetes/pkg/util/metrics"
	utilnode "k8s.io/kubernetes/pkg/util/node"
	"k8s.io/kubernetes/pkg/util/system"
	taintutils "k8s.io/kubernetes/pkg/util/taints"
	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
)

func init() {
	// Register prometheus metrics.
	Register()
}

var (
	// UnreachableTaintTemplate is the taint for when a node becomes unreachable.
	UnreachableTaintTemplate = &v1.Taint{
		Key:    algorithm.TaintNodeUnreachable,
		Effect: v1.TaintEffectNoExecute,
	}

	// NotReadyTaintTemplate is the taint for when a node is not ready for
	// executing pods.
	NotReadyTaintTemplate = &v1.Taint{
		Key:    algorithm.TaintNodeNotReady,
		Effect: v1.TaintEffectNoExecute,
	}
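
	// nodeConditionToTaintKeyMap and taintKeyToNodeConditionMap map between node
	// condition types and the NoSchedule taint keys used by the taint-by-condition
	// pass, in both directions.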
	nodeConditionToTaintKeyMap = map[v1.NodeConditionType]string{
		v1.NodeMemoryPressure:     algorithm.TaintNodeMemoryPressure,
		v1.NodeOutOfDisk:          algorithm.TaintNodeOutOfDisk,
		v1.NodeDiskPressure:       algorithm.TaintNodeDiskPressure,
		v1.NodeNetworkUnavailable: algorithm.TaintNodeNetworkUnavailable,
	}

	taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{
		algorithm.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable,
		algorithm.TaintNodeMemoryPressure:     v1.NodeMemoryPressure,
		algorithm.TaintNodeOutOfDisk:          v1.NodeOutOfDisk,
		algorithm.TaintNodeDiskPressure:       v1.NodeDiskPressure,
	}
)

const (
	// The amount of time the nodecontroller polls on the list nodes endpoint.
	apiserverStartupGracePeriod = 10 * time.Minute
	// The amount of time the nodecontroller should sleep between retrying NodeStatus updates.
	retrySleepTime = 20 * time.Millisecond

	// ipamResyncInterval is the amount of time between when the cloud and node
	// CIDR range assignments are synchronized.
	ipamResyncInterval = 30 * time.Second
	// ipamMaxBackoff is the maximum backoff for retrying synchronization of a
	// given node in the error state.
	ipamMaxBackoff = 10 * time.Second
	// ipamInitialBackoff is the initial retry interval for retrying synchronization of a
	// given node in the error state.
	ipamInitialBackoff = 250 * time.Millisecond
)

// ZoneState is the state of a given zone.
type ZoneState string

const (
	stateInitial           = ZoneState("Initial")
	stateNormal            = ZoneState("Normal")
	stateFullDisruption    = ZoneState("FullDisruption")
	statePartialDisruption = ZoneState("PartialDisruption")
)
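
// nodeStatusData holds the last observed status of a node together with local
// timestamps of when that status was observed and when its Ready condition last
// transitioned, as tracked by this controller.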
type nodeStatusData struct {
	probeTimestamp           metav1.Time
	readyTransitionTimestamp metav1.Time
	status                   v1.NodeStatus
}

// Controller is the controller that manages node-related cluster state.
type Controller struct {
	allocateNodeCIDRs bool
	allocatorType     ipam.CIDRAllocatorType

	cloud        cloudprovider.Interface
	clusterCIDR  *net.IPNet
	serviceCIDR  *net.IPNet
	knownNodeSet map[string]*v1.Node
	kubeClient   clientset.Interface

	// Method for easy mocking in unittest.
	lookupIP func(host string) ([]net.IP, error)

	// Value used if sync_nodes_status=False. The Controller will not proactively
	// sync node status in this case, but will monitor node status updated from kubelet. If
	// it doesn't receive an update for this amount of time, it will start posting "NodeReady==
	// ConditionUnknown". The amount of time before the Controller starts evicting pods
	// is controlled via the flag 'pod-eviction-timeout'.
	// Note: be cautious when changing the constant, it must work with nodeStatusUpdateFrequency
	// in kubelet. There are several constraints:
	// 1. nodeMonitorGracePeriod must be N times more than nodeStatusUpdateFrequency, where
	//    N means number of retries allowed for kubelet to post node status. It is pointless
	//    to make nodeMonitorGracePeriod be less than nodeStatusUpdateFrequency, since there
	//    will only be fresh values from Kubelet at an interval of nodeStatusUpdateFrequency.
	//    The constant must be less than podEvictionTimeout.
	// 2. nodeMonitorGracePeriod can't be too large for user experience - a larger value takes
	//    longer for the user to see up-to-date node status.
	nodeMonitorGracePeriod time.Duration

	// Value controlling the Controller monitoring period, i.e. how often the Controller
	// checks node status posted from kubelet. This value should be lower than nodeMonitorGracePeriod.
	// TODO: Change node status monitor to watch based.
	nodeMonitorPeriod time.Duration

	// Value used if sync_nodes_status=False, only for node startup. When a node
	// is just created, e.g. at cluster bootstrap or node creation, we give it a longer grace period.
	nodeStartupGracePeriod time.Duration

	// per Node map storing last observed Status together with a local time when it was observed.
	nodeStatusMap map[string]nodeStatusData

	// This timestamp is to be used instead of LastProbeTime stored in Condition. We do this
	// to avoid the problem with time skew across the cluster.
	now func() metav1.Time

	// Lock to access evictor workers.
	evictorLock sync.Mutex
	// workers that evict pods from unresponsive nodes.
	zonePodEvictor map[string]*scheduler.RateLimitedTimedQueue
	// workers that are responsible for tainting nodes.
	zoneNoExecuteTainer map[string]*scheduler.RateLimitedTimedQueue

	podEvictionTimeout time.Duration
	// The maximum duration before a pod evicted from a node can be forcefully terminated.
	maximumGracePeriod time.Duration
	recorder           record.EventRecorder

	nodeLister              corelisters.NodeLister
	nodeInformerSynced      cache.InformerSynced
	daemonSetStore          extensionslisters.DaemonSetLister
	daemonSetInformerSynced cache.InformerSynced
	podInformerSynced       cache.InformerSynced

	cidrAllocator ipam.CIDRAllocator
	taintManager  *scheduler.NoExecuteTaintManager

	nodeExistsInCloudProvider  func(types.NodeName) (bool, error)
	computeZoneStateFunc       func(nodeConditions []*v1.NodeCondition) (int, ZoneState)
	enterPartialDisruptionFunc func(nodeNum int) float32
	enterFullDisruptionFunc    func(nodeNum int) float32

	zoneStates map[string]ZoneState

	evictionLimiterQPS          float32
	secondaryEvictionLimiterQPS float32
	largeClusterThreshold       int32
	unhealthyZoneThreshold      float32

	// if set to true, the Controller will start the TaintManager that will evict Pods from
	// tainted nodes, if they're not tolerated.
	runTaintManager bool

	// if set to true, the Controller will taint Nodes with 'TaintNodeNotReady' and 'TaintNodeUnreachable'
	// taints instead of evicting Pods itself.
	useTaintBasedEvictions bool

	// if set to true, the Controller will taint Nodes based on their condition for 'NetworkUnavailable',
	// 'MemoryPressure', 'OutOfDisk' and 'DiskPressure'.
	taintNodeByCondition bool
}

// NewNodeController returns a new node controller to sync instances from cloudprovider.
// This method returns an error if it is unable to initialize the CIDR bitmap with
// podCIDRs it has already allocated to nodes. Since we don't allow podCIDR changes
// currently, this should be handled as a fatal error.
func NewNodeController(
	podInformer coreinformers.PodInformer,
	nodeInformer coreinformers.NodeInformer,
	daemonSetInformer extensionsinformers.DaemonSetInformer,
	cloud cloudprovider.Interface,
	kubeClient clientset.Interface,
	podEvictionTimeout time.Duration,
	evictionLimiterQPS float32,
	secondaryEvictionLimiterQPS float32,
	largeClusterThreshold int32,
	unhealthyZoneThreshold float32,
	nodeMonitorGracePeriod time.Duration,
	nodeStartupGracePeriod time.Duration,
	nodeMonitorPeriod time.Duration,
	clusterCIDR *net.IPNet,
	serviceCIDR *net.IPNet,
	nodeCIDRMaskSize int,
	allocateNodeCIDRs bool,
	allocatorType ipam.CIDRAllocatorType,
	runTaintManager bool,
	useTaintBasedEvictions bool,
	taintNodeByCondition bool) (*Controller, error) {

	if kubeClient == nil {
		glog.Fatalf("kubeClient is nil when starting Controller")
	}

	eventBroadcaster := record.NewBroadcaster()
	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"})
	eventBroadcaster.StartLogging(glog.Infof)
	glog.V(0).Infof("Sending events to api server.")
	eventBroadcaster.StartRecordingToSink(
		&v1core.EventSinkImpl{
			Interface: v1core.New(kubeClient.CoreV1().RESTClient()).Events(""),
		})

	if kubeClient != nil && kubeClient.CoreV1().RESTClient().GetRateLimiter() != nil {
		metrics.RegisterMetricAndTrackRateLimiterUsage("node_controller", kubeClient.CoreV1().RESTClient().GetRateLimiter())
	}

	if allocateNodeCIDRs {
		if clusterCIDR == nil {
			glog.Fatal("Controller: Must specify clusterCIDR if allocateNodeCIDRs == true.")
		}
		mask := clusterCIDR.Mask
		if maskSize, _ := mask.Size(); maskSize > nodeCIDRMaskSize {
			glog.Fatal("Controller: Invalid clusterCIDR, mask size of clusterCIDR must be less than nodeCIDRMaskSize.")
		}
	}

	nc := &Controller{
		cloud:                  cloud,
		knownNodeSet:           make(map[string]*v1.Node),
		kubeClient:             kubeClient,
		recorder:               recorder,
		podEvictionTimeout:     podEvictionTimeout,
		maximumGracePeriod:     5 * time.Minute,
		zonePodEvictor:         make(map[string]*scheduler.RateLimitedTimedQueue),
		zoneNoExecuteTainer:    make(map[string]*scheduler.RateLimitedTimedQueue),
		nodeStatusMap:          make(map[string]nodeStatusData),
		nodeMonitorGracePeriod: nodeMonitorGracePeriod,
		nodeMonitorPeriod:      nodeMonitorPeriod,
		nodeStartupGracePeriod: nodeStartupGracePeriod,
		lookupIP:               net.LookupIP,
		now:                    metav1.Now,
		clusterCIDR:            clusterCIDR,
		serviceCIDR:            serviceCIDR,
		allocateNodeCIDRs:      allocateNodeCIDRs,
		allocatorType:          allocatorType,
		nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
			return util.NodeExistsInCloudProvider(cloud, nodeName)
		},
		evictionLimiterQPS:          evictionLimiterQPS,
		secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
		largeClusterThreshold:       largeClusterThreshold,
		unhealthyZoneThreshold:      unhealthyZoneThreshold,
		zoneStates:                  make(map[string]ZoneState),
		runTaintManager:             runTaintManager,
		useTaintBasedEvictions:      useTaintBasedEvictions && runTaintManager,
		taintNodeByCondition:        taintNodeByCondition,
	}
	if useTaintBasedEvictions {
		glog.Infof("Controller is using taint based evictions.")
	}

	nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
	nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
	nc.computeZoneStateFunc = nc.ComputeZoneState

	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			pod := obj.(*v1.Pod)
			if nc.taintManager != nil {
				nc.taintManager.PodUpdated(nil, pod)
			}
		},
		UpdateFunc: func(prev, obj interface{}) {
			prevPod := prev.(*v1.Pod)
			newPod := obj.(*v1.Pod)
			if nc.taintManager != nil {
				nc.taintManager.PodUpdated(prevPod, newPod)
			}
		},
		DeleteFunc: func(obj interface{}) {
			pod, isPod := obj.(*v1.Pod)
			// We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly.
			if !isPod {
				deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
				if !ok {
					glog.Errorf("Received unexpected object: %v", obj)
					return
				}
				pod, ok = deletedState.Obj.(*v1.Pod)
				if !ok {
					glog.Errorf("DeletedFinalStateUnknown contained non-Pod object: %v", deletedState.Obj)
					return
				}
			}
			if nc.taintManager != nil {
				nc.taintManager.PodUpdated(pod, nil)
			}
		},
	})
	nc.podInformerSynced = podInformer.Informer().HasSynced

	if nc.allocateNodeCIDRs {
		if nc.allocatorType == ipam.IPAMFromClusterAllocatorType || nc.allocatorType == ipam.IPAMFromCloudAllocatorType {
			cfg := &ipam.Config{
				Resync:       ipamResyncInterval,
				MaxBackoff:   ipamMaxBackoff,
				InitialRetry: ipamInitialBackoff,
			}
			switch nc.allocatorType {
			case ipam.IPAMFromClusterAllocatorType:
				cfg.Mode = nodesync.SyncFromCluster
			case ipam.IPAMFromCloudAllocatorType:
				cfg.Mode = nodesync.SyncFromCloud
			}
			ipamc, err := ipam.NewController(cfg, kubeClient, cloud, clusterCIDR, serviceCIDR, nodeCIDRMaskSize)
			if err != nil {
				glog.Fatalf("Error creating ipam controller: %v", err)
			}
			if err := ipamc.Start(nodeInformer); err != nil {
				glog.Fatalf("Error trying to Init(): %v", err)
			}
		} else {
			var err error
			nc.cidrAllocator, err = ipam.New(
				kubeClient, cloud, nc.allocatorType, nc.clusterCIDR, nc.serviceCIDR, nodeCIDRMaskSize)
			if err != nil {
				return nil, err
			}
			nc.cidrAllocator.Register(nodeInformer)
		}
	}

	if nc.runTaintManager {
		nc.taintManager = scheduler.NewNoExecuteTaintManager(kubeClient)
		nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
			AddFunc: util.CreateAddNodeHandler(func(node *v1.Node) error {
				nc.taintManager.NodeUpdated(nil, node)
				return nil
			}),
			UpdateFunc: util.CreateUpdateNodeHandler(func(oldNode, newNode *v1.Node) error {
				nc.taintManager.NodeUpdated(oldNode, newNode)
				return nil
			}),
			DeleteFunc: util.CreateDeleteNodeHandler(func(node *v1.Node) error {
				nc.taintManager.NodeUpdated(node, nil)
				return nil
			}),
		})
	}

	if nc.taintNodeByCondition {
		glog.Infof("Controller will taint node by condition.")
		nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
			AddFunc: util.CreateAddNodeHandler(func(node *v1.Node) error {
				return nc.doNoScheduleTaintingPass(node)
			}),
			UpdateFunc: util.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
				return nc.doNoScheduleTaintingPass(newNode)
			}),
		})
	}

	// NOTE(resouer): nodeInformer to substitute deprecated taint key (notReady -> not-ready).
	// Remove this logic when we don't need this backwards compatibility.
	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: util.CreateAddNodeHandler(func(node *v1.Node) error {
			return nc.doFixDeprecatedTaintKeyPass(node)
		}),
		UpdateFunc: util.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
			return nc.doFixDeprecatedTaintKeyPass(newNode)
		}),
	})

	nc.nodeLister = nodeInformer.Lister()
	nc.nodeInformerSynced = nodeInformer.Informer().HasSynced

	nc.daemonSetStore = daemonSetInformer.Lister()
	nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced

	return nc, nil
}
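
// doEvictionPass drains the per-zone pod evictor queues: for every node that has
// been queued for eviction it deletes the pods still assigned to that node and
// increments the zone's eviction metric.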
func (nc *Controller) doEvictionPass() {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	for k := range nc.zonePodEvictor {
		// Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
		nc.zonePodEvictor[k].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
			node, err := nc.nodeLister.Get(value.Value)
			if apierrors.IsNotFound(err) {
				glog.Warningf("Node %v no longer present in nodeLister!", value.Value)
			} else if err != nil {
				glog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err)
			} else {
				zone := utilnode.GetZoneKey(node)
				evictionsNumber.WithLabelValues(zone).Inc()
			}
			nodeUID, _ := value.UID.(string)
			remaining, err := util.DeletePods(nc.kubeClient, nc.recorder, value.Value, nodeUID, nc.daemonSetStore)
			if err != nil {
				utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
				return false, 0
			}
			if remaining {
				glog.Infof("Pods awaiting deletion due to Controller eviction")
			}
			return true, 0
		})
	}
}

// doFixDeprecatedTaintKeyPass checks and replaces deprecated taint keys with the proper key name if needed.
func (nc *Controller) doFixDeprecatedTaintKeyPass(node *v1.Node) error {
	taintsToAdd := []*v1.Taint{}
	taintsToDel := []*v1.Taint{}
	for _, taint := range node.Spec.Taints {
		if taint.Key == algorithm.DeprecatedTaintNodeNotReady {
			// Delete the old taint.
			tDel := taint
			taintsToDel = append(taintsToDel, &tDel)
			// Add the taint with the correct key.
			tAdd := taint
			tAdd.Key = algorithm.TaintNodeNotReady
			taintsToAdd = append(taintsToAdd, &tAdd)
			glog.Warningf("Detected deprecated taint key: %v on node: %v, will substitute it with %v",
				algorithm.DeprecatedTaintNodeNotReady, node.GetName(), algorithm.TaintNodeNotReady)
			break
		}
	}
	if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
		return nil
	}
	if !util.SwapNodeControllerTaint(nc.kubeClient, taintsToAdd, taintsToDel, node) {
		return fmt.Errorf("failed to swap taints of node %+v", node)
	}
	return nil
}
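
// doNoScheduleTaintingPass reconciles the NoSchedule taints on a node with its
// current conditions: conditions listed in nodeConditionToTaintKeyMap that are True
// get a matching taint added, and condition taints that no longer apply are removed.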
func (nc *Controller) doNoScheduleTaintingPass(node *v1.Node) error {
	// Map the node's conditions to taints.
	taints := []v1.Taint{}
	for _, condition := range node.Status.Conditions {
		if _, found := nodeConditionToTaintKeyMap[condition.Type]; found {
			if condition.Status == v1.ConditionTrue {
				taints = append(taints, v1.Taint{
					Key:    nodeConditionToTaintKeyMap[condition.Type],
					Effect: v1.TaintEffectNoSchedule,
				})
			}
		}
	}
	nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool {
		_, found := taintKeyToNodeConditionMap[t.Key]
		return found
	})
	taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints)
	// If there is nothing to add or delete, return directly.
	if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
		return nil
	}
	if !util.SwapNodeControllerTaint(nc.kubeClient, taintsToAdd, taintsToDel, node) {
		return fmt.Errorf("failed to swap taints of node %+v", node)
	}
	return nil
}
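
// doNoExecuteTaintingPass drains the per-zone NoExecute tainter queues and, for
// every queued node, applies either the "not ready" or the "unreachable" NoExecute
// taint depending on the node's current Ready condition.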
func (nc *Controller) doNoExecuteTaintingPass() {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	for k := range nc.zoneNoExecuteTainer {
		// Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
		nc.zoneNoExecuteTainer[k].Try(func(value scheduler.TimedValue) (bool, time.Duration) {
			node, err := nc.nodeLister.Get(value.Value)
			if apierrors.IsNotFound(err) {
				glog.Warningf("Node %v no longer present in nodeLister!", value.Value)
				return true, 0
			} else if err != nil {
				glog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err)
				// retry in 50 milliseconds
				return false, 50 * time.Millisecond
			} else {
				zone := utilnode.GetZoneKey(node)
				evictionsNumber.WithLabelValues(zone).Inc()
			}
			_, condition := v1node.GetNodeCondition(&node.Status, v1.NodeReady)
			// Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive.
			taintToAdd := v1.Taint{}
			oppositeTaint := v1.Taint{}
			if condition.Status == v1.ConditionFalse {
				taintToAdd = *NotReadyTaintTemplate
				oppositeTaint = *UnreachableTaintTemplate
			} else if condition.Status == v1.ConditionUnknown {
				taintToAdd = *UnreachableTaintTemplate
				oppositeTaint = *NotReadyTaintTemplate
			} else {
				// It seems that the Node is ready again, so there's no need to taint it.
				glog.V(4).Infof("Node %v was in a taint queue, but it's ready now. Ignoring taint request.", value.Value)
				return true, 0
			}

			return util.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node), 0
		})
	}
}

// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *Controller) Run(stopCh <-chan struct{}) {
	defer utilruntime.HandleCrash()

	glog.Infof("Starting node controller")
	defer glog.Infof("Shutting down node controller")

	if !controller.WaitForCacheSync("node", stopCh, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
		return
	}

	// Incorporate the results of node status pushed from kubelet to master.
	go wait.Until(func() {
		if err := nc.monitorNodeStatus(); err != nil {
			glog.Errorf("Error monitoring node status: %v", err)
		}
	}, nc.nodeMonitorPeriod, wait.NeverStop)

	if nc.runTaintManager {
		go nc.taintManager.Run(wait.NeverStop)
	}

	if nc.useTaintBasedEvictions {
		// Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated
		// taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints.
		go wait.Until(nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod, wait.NeverStop)
	} else {
		// Managing eviction of nodes:
		// When we delete pods off a node, if the node was not empty at the time we then
		// queue an eviction watcher. If we hit an error, retry deletion.
		go wait.Until(nc.doEvictionPass, scheduler.NodeEvictionPeriod, wait.NeverStop)
	}

	<-stopCh
}

// addPodEvictorForNewZone checks if a new zone appeared, and if so adds a new evictor.
func (nc *Controller) addPodEvictorForNewZone(node *v1.Node) {
	zone := utilnode.GetZoneKey(node)
	if _, found := nc.zoneStates[zone]; !found {
		nc.zoneStates[zone] = stateInitial
		if !nc.useTaintBasedEvictions {
			nc.zonePodEvictor[zone] =
				scheduler.NewRateLimitedTimedQueue(
					flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
		} else {
			nc.zoneNoExecuteTainer[zone] =
				scheduler.NewRateLimitedTimedQueue(
					flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
		}
		// Init the metric for the new zone.
		glog.Infof("Initializing eviction metric for zone: %v", zone)
		evictionsNumber.WithLabelValues(zone).Add(0)
	}
}

// monitorNodeStatus verifies that node status is constantly updated by kubelet, and if not,
// posts "NodeReady==ConditionUnknown". It also evicts all pods if a node is not ready or
// not reachable for a long period of time.
func (nc *Controller) monitorNodeStatus() error {
	// We are listing nodes from local cache as we can tolerate some small delays
	// comparing to state from etcd and there is eventual consistency anyway.
	nodes, err := nc.nodeLister.List(labels.Everything())
	if err != nil {
		return err
	}
	added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes)

	for i := range newZoneRepresentatives {
		nc.addPodEvictorForNewZone(newZoneRepresentatives[i])
	}

	for i := range added {
		glog.V(1).Infof("Controller observed a new Node: %#v", added[i].Name)
		util.RecordNodeEvent(nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name))
		nc.knownNodeSet[added[i].Name] = added[i]
		nc.addPodEvictorForNewZone(added[i])
		if nc.useTaintBasedEvictions {
			nc.markNodeAsReachable(added[i])
		} else {
			nc.cancelPodEviction(added[i])
		}
	}

	for i := range deleted {
		glog.V(1).Infof("Controller observed a Node deletion: %v", deleted[i].Name)
		util.RecordNodeEvent(nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name))
		delete(nc.knownNodeSet, deleted[i].Name)
	}

	zoneToNodeConditions := map[string][]*v1.NodeCondition{}
	for i := range nodes {
		var gracePeriod time.Duration
		var observedReadyCondition v1.NodeCondition
		var currentReadyCondition *v1.NodeCondition
		node := nodes[i].DeepCopy()
		if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeStatusUpdateRetry, func() (bool, error) {
			gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
			if err == nil {
				return true, nil
			}
			name := node.Name
			node, err = nc.kubeClient.CoreV1().Nodes().Get(name, metav1.GetOptions{})
			if err != nil {
				glog.Errorf("Failed while getting a Node to retry updating NodeStatus. Probably Node %s was deleted.", name)
				return false, err
			}
			return false, nil
		}); err != nil {
			glog.Errorf("Update status of Node '%v' from Controller error: %v. "+
				"Skipping - no pods will be evicted.", node.Name, err)
			continue
		}

		// We do not treat a master node as a part of the cluster for network disruption checking.
		if !system.IsMasterNode(node.Name) {
			zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
		}

		decisionTimestamp := nc.now()
		if currentReadyCondition != nil {
			// Check eviction timeout against decisionTimestamp.
			if observedReadyCondition.Status == v1.ConditionFalse {
				if nc.useTaintBasedEvictions {
					// We want to update the taint straight away if the Node is already tainted with the UnreachableTaint.
					if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
						taintToAdd := *NotReadyTaintTemplate
						if !util.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
							glog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.")
						}
					} else if nc.markNodeForTainting(node) {
						glog.V(2).Infof("Node %v is NotReady as of %v. Adding it to the Taint queue.",
							node.Name,
							decisionTimestamp,
						)
					}
				} else {
					if decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
						if nc.evictPods(node) {
							glog.V(2).Infof("Node is NotReady. Adding Pods on Node %s to eviction queue: %v is later than %v + %v",
								node.Name,
								decisionTimestamp,
								nc.nodeStatusMap[node.Name].readyTransitionTimestamp,
								nc.podEvictionTimeout,
							)
						}
					}
				}
			}
			if observedReadyCondition.Status == v1.ConditionUnknown {
				if nc.useTaintBasedEvictions {
					// We want to update the taint straight away if the Node is already tainted with the NotReadyTaint.
					if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
						taintToAdd := *UnreachableTaintTemplate
						if !util.SwapNodeControllerTaint(nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
							glog.Errorf("Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle.")
						}
					} else if nc.markNodeForTainting(node) {
						glog.V(2).Infof("Node %v is unresponsive as of %v. Adding it to the Taint queue.",
							node.Name,
							decisionTimestamp,
						)
					}
				} else {
					if decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
						if nc.evictPods(node) {
							glog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v",
								node.Name,
								decisionTimestamp,
								nc.nodeStatusMap[node.Name].readyTransitionTimestamp,
								nc.podEvictionTimeout-gracePeriod,
							)
						}
					}
				}
			}
			if observedReadyCondition.Status == v1.ConditionTrue {
				if nc.useTaintBasedEvictions {
					removed, err := nc.markNodeAsReachable(node)
					if err != nil {
						glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
					}
					if removed {
						glog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name)
					}
				} else {
					if nc.cancelPodEviction(node) {
						glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
					}
				}
			}

			// Report node event.
			if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
				util.RecordNodeStatusChange(nc.recorder, node, "NodeNotReady")
				if err = util.MarkAllPodsNotReady(nc.kubeClient, node); err != nil {
					utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
				}
			}

			// Check with the cloud provider to see if the node still exists. If it
			// doesn't, delete the node immediately.
			if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
				exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
				if err != nil {
					glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
					continue
				}
				if !exists {
					glog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
					util.RecordNodeEvent(nc.recorder, node.Name, string(node.UID), v1.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
					go func(nodeName string) {
						defer utilruntime.HandleCrash()
						// Kubelet is not reporting and Cloud Provider says node
						// is gone. Delete it without worrying about grace
						// periods.
						if err := util.ForcefullyDeleteNode(nc.kubeClient, nodeName); err != nil {
							glog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
						}
					}(node.Name)
				}
			}
		}
	}
	nc.handleDisruption(zoneToNodeConditions, nodes)

	return nil
}
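
// handleDisruption computes the per-zone disruption state from the collected Ready
// conditions and adjusts the eviction (or NoExecute tainting) rate limiters
// accordingly, entering or leaving master disruption mode when all zones are down.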
func (nc *Controller) handleDisruption(zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) {
	newZoneStates := map[string]ZoneState{}
	allAreFullyDisrupted := true
	for k, v := range zoneToNodeConditions {
		zoneSize.WithLabelValues(k).Set(float64(len(v)))
		unhealthy, newState := nc.computeZoneStateFunc(v)
		zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v)))
		unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy))
		if newState != stateFullDisruption {
			allAreFullyDisrupted = false
		}
		newZoneStates[k] = newState
		if _, had := nc.zoneStates[k]; !had {
			glog.Errorf("Setting initial state for unseen zone: %v", k)
			nc.zoneStates[k] = stateInitial
		}
	}

	allWasFullyDisrupted := true
	for k, v := range nc.zoneStates {
		if _, have := zoneToNodeConditions[k]; !have {
			zoneSize.WithLabelValues(k).Set(0)
			zoneHealth.WithLabelValues(k).Set(100)
			unhealthyNodes.WithLabelValues(k).Set(0)
			delete(nc.zoneStates, k)
			continue
		}
		if v != stateFullDisruption {
			allWasFullyDisrupted = false
			break
		}
	}

	// At least one node was responding in the previous pass or in the current pass. Semantics are as follows:
	// - if the new state is "partialDisruption" we call a user defined function that returns a new limiter to use,
	// - if the new state is "normal" we resume normal operation (go back to default limiter settings),
	// - if the new state is "fullDisruption" we restore the normal eviction rate,
	//   unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions.
	if !allAreFullyDisrupted || !allWasFullyDisrupted {
		// We're switching to full disruption mode.
		if allAreFullyDisrupted {
			glog.V(0).Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode.")
			for i := range nodes {
				if nc.useTaintBasedEvictions {
					_, err := nc.markNodeAsReachable(nodes[i])
					if err != nil {
						glog.Errorf("Failed to remove taints from Node %v", nodes[i].Name)
					}
				} else {
					nc.cancelPodEviction(nodes[i])
				}
			}
			// We stop all evictions.
			for k := range nc.zoneStates {
				if nc.useTaintBasedEvictions {
					nc.zoneNoExecuteTainer[k].SwapLimiter(0)
				} else {
					nc.zonePodEvictor[k].SwapLimiter(0)
				}
			}
			for k := range nc.zoneStates {
				nc.zoneStates[k] = stateFullDisruption
			}
			// All rate limiters are updated, so we can return early here.
			return
		}
		// We're exiting full disruption mode.
		if allWasFullyDisrupted {
			glog.V(0).Info("Controller detected that some Nodes are Ready. Exiting master disruption mode.")
			// When exiting disruption mode update probe timestamps on all Nodes.
			now := nc.now()
			for i := range nodes {
				v := nc.nodeStatusMap[nodes[i].Name]
				v.probeTimestamp = now
				v.readyTransitionTimestamp = now
				nc.nodeStatusMap[nodes[i].Name] = v
			}
			// We reset all rate limiters to settings appropriate for the given state.
			for k := range nc.zoneStates {
				nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k])
				nc.zoneStates[k] = newZoneStates[k]
			}
			return
		}
		// We know that there's at least one not-fully-disrupted zone, so
		// we can use the default behavior for rate limiters.
		for k, v := range nc.zoneStates {
			newState := newZoneStates[k]
			if v == newState {
				continue
			}
			glog.V(0).Infof("Controller detected that zone %v is now in state %v.", k, newState)
			nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState)
			nc.zoneStates[k] = newState
		}
	}
}
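
// setLimiterInZone swaps the rate limiter of a zone's pod evictor (or NoExecute
// tainter) queue to the QPS appropriate for the zone's new state and size.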
func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) {
	switch state {
	case stateNormal:
		if nc.useTaintBasedEvictions {
			nc.zoneNoExecuteTainer[zone].SwapLimiter(nc.evictionLimiterQPS)
		} else {
			nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
		}
	case statePartialDisruption:
		if nc.useTaintBasedEvictions {
			nc.zoneNoExecuteTainer[zone].SwapLimiter(
				nc.enterPartialDisruptionFunc(zoneSize))
		} else {
			nc.zonePodEvictor[zone].SwapLimiter(
				nc.enterPartialDisruptionFunc(zoneSize))
		}
	case stateFullDisruption:
		if nc.useTaintBasedEvictions {
			nc.zoneNoExecuteTainer[zone].SwapLimiter(
				nc.enterFullDisruptionFunc(zoneSize))
		} else {
			nc.zonePodEvictor[zone].SwapLimiter(
				nc.enterFullDisruptionFunc(zoneSize))
		}
	}
}

// tryUpdateNodeStatus checks a given node's conditions and tries to update it. It returns the grace period to
// which the node is entitled, the last observed and the current Ready condition, and an error if one occurred.
func (nc *Controller) tryUpdateNodeStatus(node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
	var err error
	var gracePeriod time.Duration
	var observedReadyCondition v1.NodeCondition
	_, currentReadyCondition := v1node.GetNodeCondition(&node.Status, v1.NodeReady)
	if currentReadyCondition == nil {
		// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
		// A fake ready condition is created, where LastProbeTime and LastTransitionTime is set
		// to node.CreationTimestamp to avoid handling the corner case.
		observedReadyCondition = v1.NodeCondition{
			Type:               v1.NodeReady,
			Status:             v1.ConditionUnknown,
			LastHeartbeatTime:  node.CreationTimestamp,
			LastTransitionTime: node.CreationTimestamp,
		}
		gracePeriod = nc.nodeStartupGracePeriod
		nc.nodeStatusMap[node.Name] = nodeStatusData{
			status:                   node.Status,
			probeTimestamp:           node.CreationTimestamp,
			readyTransitionTimestamp: node.CreationTimestamp,
		}
	} else {
		// If ready condition is not nil, make a copy of it, since we may modify it in place later.
		observedReadyCondition = *currentReadyCondition
		gracePeriod = nc.nodeMonitorGracePeriod
	}

	savedNodeStatus, found := nc.nodeStatusMap[node.Name]
	// There are the following cases to check:
	// - both saved and new status have no Ready Condition set - we leave everything as it is,
	// - saved status has no Ready Condition, but current one does - the Controller was restarted with Node data already present in etcd,
	// - saved status has some Ready Condition, but current one does not - it's an error, but we fill it up because that's probably a good thing to do,
	// - both saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be
	//   unresponsive, so we leave it as it is,
	// - both saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition State -
	//   everything's in order, no transition occurred, we update only probeTimestamp,
	// - both saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition State -
	//   Ready Condition changed its state since we last saw it, so we update both probeTimestamp and readyTransitionTimestamp.
	// TODO: things to consider:
	// - if 'LastProbeTime' has gone back in time it's probably an error, currently we ignore it,
	// - currently the only correct Ready State transition outside of the Node Controller is marking the node ready by Kubelet, we don't check
	//   if that's the case, but it does not seem necessary.
	var savedCondition *v1.NodeCondition
	if found {
		_, savedCondition = v1node.GetNodeCondition(&savedNodeStatus.status, v1.NodeReady)
	}
	_, observedCondition := v1node.GetNodeCondition(&node.Status, v1.NodeReady)
	if !found {
		glog.Warningf("Missing timestamp for Node %s. Assuming now as a timestamp.", node.Name)
		savedNodeStatus = nodeStatusData{
			status:                   node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition == nil && observedCondition != nil {
		glog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name)
		savedNodeStatus = nodeStatusData{
			status:                   node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition != nil && observedCondition == nil {
		glog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name)
		// TODO: figure out what to do in this case. For now we do the same thing as above.
		savedNodeStatus = nodeStatusData{
			status:                   node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition != nil && observedCondition != nil && savedCondition.LastHeartbeatTime != observedCondition.LastHeartbeatTime {
		var transitionTime metav1.Time
		// If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
		// otherwise we leave it as it is.
		if savedCondition.LastTransitionTime != observedCondition.LastTransitionTime {
			glog.V(3).Infof("ReadyCondition for Node %s transitioned from %v to %v", node.Name, savedCondition.Status, observedCondition)
			transitionTime = nc.now()
		} else {
			transitionTime = savedNodeStatus.readyTransitionTimestamp
		}
		if glog.V(5) {
			glog.V(5).Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, savedNodeStatus.status, node.Status)
		} else {
			glog.V(3).Infof("Node %s ReadyCondition updated. Updating timestamp.", node.Name)
		}
		savedNodeStatus = nodeStatusData{
			status:                   node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: transitionTime,
		}
	}
	nc.nodeStatusMap[node.Name] = savedNodeStatus

	if nc.now().After(savedNodeStatus.probeTimestamp.Add(gracePeriod)) {
		// NodeReady condition was last set longer ago than gracePeriod, so update it to Unknown
		// (regardless of its current value) in the master.
		if currentReadyCondition == nil {
			glog.V(2).Infof("node %v is never updated by kubelet", node.Name)
			node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
				Type:               v1.NodeReady,
				Status:             v1.ConditionUnknown,
				Reason:             "NodeStatusNeverUpdated",
				Message:            "Kubelet never posted node status.",
				LastHeartbeatTime:  node.CreationTimestamp,
				LastTransitionTime: nc.now(),
			})
		} else {
			glog.V(4).Infof("node %v hasn't been updated for %+v. Last ready condition is: %+v",
				node.Name, nc.now().Time.Sub(savedNodeStatus.probeTimestamp.Time), observedReadyCondition)
			if observedReadyCondition.Status != v1.ConditionUnknown {
				currentReadyCondition.Status = v1.ConditionUnknown
				currentReadyCondition.Reason = "NodeStatusUnknown"
				currentReadyCondition.Message = "Kubelet stopped posting node status."
				// LastProbeTime is the last time we heard from kubelet.
				currentReadyCondition.LastHeartbeatTime = observedReadyCondition.LastHeartbeatTime
				currentReadyCondition.LastTransitionTime = nc.now()
			}
		}

		// The remaining node conditions should also be set to Unknown.
		remainingNodeConditionTypes := []v1.NodeConditionType{
			v1.NodeMemoryPressure,
			v1.NodeDiskPressure,
			// We don't change 'NodeNetworkUnavailable' condition, as it's managed on a control plane level.
			// v1.NodeNetworkUnavailable,
		}

		nowTimestamp := nc.now()
		for _, nodeConditionType := range remainingNodeConditionTypes {
			_, currentCondition := v1node.GetNodeCondition(&node.Status, nodeConditionType)
			if currentCondition == nil {
				glog.V(2).Infof("Condition %v of node %v was never updated by kubelet", nodeConditionType, node.Name)
				node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
					Type:               nodeConditionType,
					Status:             v1.ConditionUnknown,
					Reason:             "NodeStatusNeverUpdated",
					Message:            "Kubelet never posted node status.",
					LastHeartbeatTime:  node.CreationTimestamp,
					LastTransitionTime: nowTimestamp,
				})
			} else {
				glog.V(4).Infof("node %v hasn't been updated for %+v. Last %v is: %+v",
					node.Name, nc.now().Time.Sub(savedNodeStatus.probeTimestamp.Time), nodeConditionType, currentCondition)
				if currentCondition.Status != v1.ConditionUnknown {
					currentCondition.Status = v1.ConditionUnknown
					currentCondition.Reason = "NodeStatusUnknown"
					currentCondition.Message = "Kubelet stopped posting node status."
					currentCondition.LastTransitionTime = nowTimestamp
				}
			}
		}

		_, currentCondition := v1node.GetNodeCondition(&node.Status, v1.NodeReady)
		if !apiequality.Semantic.DeepEqual(currentCondition, &observedReadyCondition) {
			if _, err = nc.kubeClient.CoreV1().Nodes().UpdateStatus(node); err != nil {
				glog.Errorf("Error updating node %s: %v", node.Name, err)
				return gracePeriod, observedReadyCondition, currentReadyCondition, err
			}
			nc.nodeStatusMap[node.Name] = nodeStatusData{
				status:                   node.Status,
				probeTimestamp:           nc.nodeStatusMap[node.Name].probeTimestamp,
				readyTransitionTimestamp: nc.now(),
			}
			return gracePeriod, observedReadyCondition, currentReadyCondition, nil
		}
	}

	return gracePeriod, observedReadyCondition, currentReadyCondition, err
}

// classifyNodes classifies allNodes into three categories:
//   1. added: the nodes that are in 'allNodes', but not in 'knownNodeSet'
//   2. deleted: the nodes that are in 'knownNodeSet', but not in 'allNodes'
//   3. newZoneRepresentatives: the nodes that are in both 'knownNodeSet' and 'allNodes', but have no zone state yet
func (nc *Controller) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) {
	for i := range allNodes {
		if _, has := nc.knownNodeSet[allNodes[i].Name]; !has {
			added = append(added, allNodes[i])
		} else {
			// Currently, we only consider a new zone as updated.
			zone := utilnode.GetZoneKey(allNodes[i])
			if _, found := nc.zoneStates[zone]; !found {
				newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i])
			}
		}
	}

	// If there's a difference between lengths of known Nodes and observed nodes
	// we must have removed some Node.
	if len(nc.knownNodeSet)+len(added) != len(allNodes) {
		knowSetCopy := map[string]*v1.Node{}
		for k, v := range nc.knownNodeSet {
			knowSetCopy[k] = v
		}
		for i := range allNodes {
			delete(knowSetCopy, allNodes[i].Name)
		}
		for i := range knowSetCopy {
			deleted = append(deleted, knowSetCopy[i])
		}
	}
	return
}

// cancelPodEviction removes any queued evictions, typically because the node is available again. It
// returns true if an eviction was queued.
func (nc *Controller) cancelPodEviction(node *v1.Node) bool {
	zone := utilnode.GetZoneKey(node)
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name)
	if wasDeleting {
		glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
		return true
	}
	return false
}

// evictPods queues an eviction for the provided node name, and returns false if the node is already
// queued for eviction.
func (nc *Controller) evictPods(node *v1.Node) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	return nc.zonePodEvictor[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID))
}
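
// markNodeForTainting queues the node in its zone's NoExecute tainter, and returns
// false if the node is already queued for tainting.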
func (nc *Controller) markNodeForTainting(node *v1.Node) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	return nc.zoneNoExecuteTainer[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID))
}
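
// markNodeAsReachable removes the Unreachable and NotReady NoExecute taints from the
// node and drops it from its zone's NoExecute tainter queue. It returns whether the
// node was removed from the queue, and an error if removing either taint failed.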
func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
	if err != nil {
		glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
		return false, err
	}
	err = controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
	if err != nil {
		glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
		return false, err
	}
	return nc.zoneNoExecuteTainer[utilnode.GetZoneKey(node)].Remove(node.Name), nil
}

// HealthyQPSFunc returns the default value for cluster eviction rate - we take
// nodeNum for consistency with ReducedQPSFunc.
func (nc *Controller) HealthyQPSFunc(nodeNum int) float32 {
	return nc.evictionLimiterQPS
}

// ReducedQPSFunc returns the QPS to use during a partial disruption: if the cluster
// is large, make evictions slower; if it's small, stop evictions altogether.
func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 {
	if int32(nodeNum) > nc.largeClusterThreshold {
		return nc.secondaryEvictionLimiterQPS
	}
	return 0
}

// ComputeZoneState returns the number of not-Ready Nodes in a given zone together with the zone's state.
// The zone is considered:
// - fullyDisrupted if there are no Ready Nodes,
// - partiallyDisrupted if more than two Nodes are not Ready and they make up at least nc.unhealthyZoneThreshold of the zone,
// - normal otherwise.
func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) {
	readyNodes := 0
	notReadyNodes := 0
	for i := range nodeReadyConditions {
		if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue {
			readyNodes++
		} else {
			notReadyNodes++
		}
	}
	switch {
	case readyNodes == 0 && notReadyNodes > 0:
		return notReadyNodes, stateFullDisruption
	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
		return notReadyNodes, statePartialDisruption
	default:
		return notReadyNodes, stateNormal
	}
}