mirror of https://github.com/k3s-io/k3s
Merge pull request #41133 from gmarek/nc-taints
Automatic merge from submit-queue

NodeController sets NodeTaints instead of deleting Pods

```release-note
Add an alpha feature that makes NodeController set Taints instead of deleting Pods from not Ready Nodes.
```

cc @timothysc @wojtek-t @davidopp @aveshagarwal - this PR just uses library functions from the previous one.
@kevin-wangzefeng - the only thing left is to write an admission controller. I don't remember what the agreement was. Are you going to write it, or should I?
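For readers unfamiliar with the feature, the behavior this PR introduces is: when the controller manager runs with `--use-taint-based-evictions` (and `--enable-taint-manager`), the NodeController no longer deletes pods from a not-Ready node directly; instead it adds a `NoExecute` taint (not-ready when the node's Ready condition is False, unreachable when it is Unknown, never both), and the TaintManager then evicts pods that do not tolerate that taint. Below is a minimal, self-contained Go sketch of that mutually exclusive taint selection; the type names and taint key strings are simplified stand-ins for illustration, not the real Kubernetes API.

```go
package main

import "fmt"

// ConditionStatus and Taint are simplified stand-ins for the v1 API types the
// real controller uses (v1.ConditionStatus, v1.Taint).
type ConditionStatus string

const (
	ConditionTrue    ConditionStatus = "True"
	ConditionFalse   ConditionStatus = "False"
	ConditionUnknown ConditionStatus = "Unknown"
)

type Taint struct {
	Key    string
	Effect string
}

// Illustrative taint keys only; the diff builds its templates from
// metav1.TaintNodeNotReady and metav1.TaintNodeUnreachable with effect NoExecute.
var (
	notReadyTaint    = Taint{Key: "example.io/not-ready", Effect: "NoExecute"}
	unreachableTaint = Taint{Key: "example.io/unreachable", Effect: "NoExecute"}
)

// taintsForReadyCondition mirrors the mutually exclusive choice the PR makes:
// Ready=False   -> add the not-ready taint, remove the unreachable one;
// Ready=Unknown -> add the unreachable taint, remove the not-ready one;
// Ready=True    -> add nothing, the node is healthy again.
func taintsForReadyCondition(status ConditionStatus) (add, remove *Taint) {
	switch status {
	case ConditionFalse:
		return &notReadyTaint, &unreachableTaint
	case ConditionUnknown:
		return &unreachableTaint, &notReadyTaint
	default:
		return nil, nil
	}
}

func main() {
	for _, s := range []ConditionStatus{ConditionFalse, ConditionUnknown, ConditionTrue} {
		add, remove := taintsForReadyCondition(s)
		fmt.Printf("Ready=%s -> add=%v remove=%v\n", s, add, remove)
	}
}
```

In the actual diff this choice happens in the Tainer loop inside `Run()`, which applies `taintToAdd` and clears `oppositeTaint` via `controller.AddOrUpdateTaintOnNode` and `controller.RemoveTaintOffNode`.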
commit 2bb97435eb
@@ -444,6 +444,7 @@ func StartControllers(controllers map[string]InitFunc, s *options.CMServer, root
int(s.NodeCIDRMaskSize),
s.AllocateNodeCIDRs,
s.EnableTaintManager,
s.UseTaintBasedEvictions,
)
if err != nil {
return fmt.Errorf("failed to initialize nodecontroller: %v", err)
@@ -105,6 +105,7 @@ func NewCMServer() *CMServer {
ClusterSigningKeyFile: "/etc/kubernetes/ca/ca.key",
ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 60 * time.Second},
EnableTaintManager: true,
UseTaintBasedEvictions: false,
},
}
s.LeaderElection.LeaderElect = true
@@ -198,6 +199,7 @@ func (s *CMServer) AddFlags(fs *pflag.FlagSet, allControllers []string, disabled
fs.BoolVar(&s.DisableAttachDetachReconcilerSync, "disable-attach-detach-reconcile-sync", false, "Disable volume attach detach reconciler sync. Disabling this may cause volumes to be mismatched with pods. Use wisely.")
fs.DurationVar(&s.ReconcilerSyncLoopPeriod.Duration, "attach-detach-reconcile-sync-period", s.ReconcilerSyncLoopPeriod.Duration, "The reconciler sync wait time between volume attach detach. This duration must be larger than one second, and increasing this value from the default may allow for volumes to be mismatched with pods.")
fs.BoolVar(&s.EnableTaintManager, "enable-taint-manager", s.EnableTaintManager, "WARNING: Beta feature. If set to true enables NoExecute Taints and will evict all not-tolerating Pod running on Nodes tainted with this kind of Taints.")
fs.BoolVar(&s.UseTaintBasedEvictions, "use-taint-based-evictions", s.UseTaintBasedEvictions, "WARNING: Alpha feature. If set to true NodeController will use taints to evict Pods from notReady and unreachable Nodes.")

leaderelection.BindFlags(&s.LeaderElection, fs)
@@ -639,6 +639,8 @@ upgrade-image
upgrade-target
use-kubernetes-cluster-service
use-service-account-credentials
use-kubernetes-version
use-taint-based-evictions
user-whitelist
verb
verify-only
@@ -794,6 +794,8 @@ type KubeControllerManagerConfiguration struct {
// If set to true enables NoExecute Taints and will evict all not-tolerating
// Pod running on Nodes tainted with this kind of Taints.
EnableTaintManager bool
// If set to true NodeController will use taints to evict Pods from notReady and unreachable Nodes.
UseTaintBasedEvictions bool
}

// VolumeConfiguration contains *all* enumerated flags meant to configure all volume
@@ -873,7 +873,22 @@ func AddOrUpdateTaintOnNode(c clientset.Interface, nodeName string, taint *v1.Ta

// RemoveTaintOffNode is for cleaning up taints temporarily added to node,
// won't fail if target taint doesn't exist or has been removed.
func RemoveTaintOffNode(c clientset.Interface, nodeName string, taint *v1.Taint) error {
// If passed a node it'll check if there's anything to be done, if taint is not present it won't issue
// any API calls.
func RemoveTaintOffNode(c clientset.Interface, nodeName string, taint *v1.Taint, node *v1.Node) error {
// Short circuit for limiting amout of API calls.
if node != nil {
match := false
for i := range node.Spec.Taints {
if node.Spec.Taints[i].MatchTaint(taint) {
match = true
break
}
}
if !match {
return nil
}
}
firstTry := true
return clientretry.RetryOnConflict(UpdateTaintBackoff, func() error {
var err error
@@ -907,7 +922,7 @@ func PatchNodeTaints(c clientset.Interface, nodeName string, oldNode *v1.Node, n
return fmt.Errorf("failed to marshal old node %#v for node %q: %v", oldNode, nodeName, err)
}

newAnnotations := newNode.Annotations
newTaints := newNode.Spec.Taints
objCopy, err := api.Scheme.DeepCopy(oldNode)
if err != nil {
return fmt.Errorf("failed to copy node object %#v: %v", oldNode, err)
@@ -916,7 +931,7 @@ func PatchNodeTaints(c clientset.Interface, nodeName string, oldNode *v1.Node, n
if !ok {
return fmt.Errorf("failed to cast copy onto node object %#v: %v", newNode, err)
}
newNode.Annotations = newAnnotations
newNode.Spec.Taints = newTaints
newData, err := json.Marshal(newNode)
if err != nil {
return fmt.Errorf("failed to marshal new node %#v for node %q: %v", newNode, nodeName, err)
@@ -31,6 +31,7 @@ go_library(
"//pkg/client/listers/core/v1:go_default_library",
"//pkg/client/listers/extensions/v1beta1:go_default_library",
"//pkg/cloudprovider:go_default_library",
"//pkg/controller:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/util/metrics:go_default_library",
"//pkg/util/node:go_default_library",
@@ -45,6 +45,7 @@ import (
corelisters "k8s.io/kubernetes/pkg/client/listers/core/v1"
extensionslisters "k8s.io/kubernetes/pkg/client/listers/extensions/v1beta1"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/util/metrics"
utilnode "k8s.io/kubernetes/pkg/util/node"
"k8s.io/kubernetes/pkg/util/system"
@@ -63,6 +64,16 @@ var (
// The minimum kubelet version for which the nodecontroller
// can safely flip pod.Status to NotReady.
podStatusReconciliationVersion = utilversion.MustParseSemantic("v1.2.0")

UnreachableTaintTemplate = &v1.Taint{
Key: metav1.TaintNodeUnreachable,
Effect: v1.TaintEffectNoExecute,
}

NotReadyTaintTemplate = &v1.Taint{
Key: metav1.TaintNodeNotReady,
Effect: v1.TaintEffectNoExecute,
}
)

const (
@@ -132,8 +143,10 @@ type NodeController struct {
// Lock to access evictor workers
evictorLock sync.Mutex
// workers that evicts pods from unresponsive nodes.
zonePodEvictor map[string]*RateLimitedTimedQueue
podEvictionTimeout time.Duration
zonePodEvictor map[string]*RateLimitedTimedQueue
// workers that are responsible for tainting nodes.
zoneNotReadyOrUnreachableTainer map[string]*RateLimitedTimedQueue
podEvictionTimeout time.Duration
// The maximum duration before a pod evicted from a node can be forcefully terminated.
maximumGracePeriod time.Duration
recorder record.EventRecorder
@@ -166,6 +179,10 @@ type NodeController struct {
// if set to true NodeController will start TaintManager that will evict Pods from
// tainted nodes, if they're not tolerated.
runTaintManager bool

// if set to true NodeController will taint Nodes with 'TaintNodeNotReady' and 'TaintNodeUnreachable'
// taints instead of evicting Pods itself.
useTaintBasedEvictions bool
}

// NewNodeController returns a new node controller to sync instances from cloudprovider.
@@ -190,7 +207,8 @@ func NewNodeController(
serviceCIDR *net.IPNet,
nodeCIDRMaskSize int,
allocateNodeCIDRs bool,
runTaintManager bool) (*NodeController, error) {
runTaintManager bool,
useTaintBasedEvictions bool) (*NodeController, error) {
eventBroadcaster := record.NewBroadcaster()
recorder := eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "controllermanager"})
eventBroadcaster.StartLogging(glog.Infof)
@@ -216,30 +234,32 @@ func NewNodeController(
}

nc := &NodeController{
cloud: cloud,
knownNodeSet: make(map[string]*v1.Node),
kubeClient: kubeClient,
recorder: recorder,
podEvictionTimeout: podEvictionTimeout,
maximumGracePeriod: 5 * time.Minute,
zonePodEvictor: make(map[string]*RateLimitedTimedQueue),
nodeStatusMap: make(map[string]nodeStatusData),
nodeMonitorGracePeriod: nodeMonitorGracePeriod,
nodeMonitorPeriod: nodeMonitorPeriod,
nodeStartupGracePeriod: nodeStartupGracePeriod,
lookupIP: net.LookupIP,
now: metav1.Now,
clusterCIDR: clusterCIDR,
serviceCIDR: serviceCIDR,
allocateNodeCIDRs: allocateNodeCIDRs,
forcefullyDeletePod: func(p *v1.Pod) error { return forcefullyDeletePod(kubeClient, p) },
nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) },
evictionLimiterQPS: evictionLimiterQPS,
secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
largeClusterThreshold: largeClusterThreshold,
unhealthyZoneThreshold: unhealthyZoneThreshold,
zoneStates: make(map[string]zoneState),
runTaintManager: runTaintManager,
cloud: cloud,
knownNodeSet: make(map[string]*v1.Node),
kubeClient: kubeClient,
recorder: recorder,
podEvictionTimeout: podEvictionTimeout,
maximumGracePeriod: 5 * time.Minute,
zonePodEvictor: make(map[string]*RateLimitedTimedQueue),
zoneNotReadyOrUnreachableTainer: make(map[string]*RateLimitedTimedQueue),
nodeStatusMap: make(map[string]nodeStatusData),
nodeMonitorGracePeriod: nodeMonitorGracePeriod,
nodeMonitorPeriod: nodeMonitorPeriod,
nodeStartupGracePeriod: nodeStartupGracePeriod,
lookupIP: net.LookupIP,
now: metav1.Now,
clusterCIDR: clusterCIDR,
serviceCIDR: serviceCIDR,
allocateNodeCIDRs: allocateNodeCIDRs,
forcefullyDeletePod: func(p *v1.Pod) error { return forcefullyDeletePod(kubeClient, p) },
nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) },
evictionLimiterQPS: evictionLimiterQPS,
secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
largeClusterThreshold: largeClusterThreshold,
unhealthyZoneThreshold: unhealthyZoneThreshold,
zoneStates: make(map[string]zoneState),
runTaintManager: runTaintManager,
useTaintBasedEvictions: useTaintBasedEvictions && runTaintManager,
}
nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
@@ -426,38 +446,100 @@ func (nc *NodeController) Run() {
go nc.taintManager.Run(wait.NeverStop)
}

// Managing eviction of nodes:
// When we delete pods off a node, if the node was not empty at the time we then
// queue an eviction watcher. If we hit an error, retry deletion.
go wait.Until(func() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
for k := range nc.zonePodEvictor {
nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
node, err := nc.nodeLister.Get(value.Value)
if apierrors.IsNotFound(err) {
glog.Warningf("Node %v no longer present in nodeLister!", value.Value)
} else if err != nil {
glog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err)
} else {
zone := utilnode.GetZoneKey(node)
EvictionsNumber.WithLabelValues(zone).Inc()
}
if nc.useTaintBasedEvictions {
// Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated
// taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints.
go wait.Until(func() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
for k := range nc.zoneNotReadyOrUnreachableTainer {
// Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
nc.zoneNotReadyOrUnreachableTainer[k].Try(func(value TimedValue) (bool, time.Duration) {
node, err := nc.nodeLister.Get(value.Value)
if apierrors.IsNotFound(err) {
glog.Warningf("Node %v no longer present in nodeLister!", value.Value)
return true, 0
} else if err != nil {
glog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err)
// retry in 50 millisecond
return false, 50 * time.Millisecond
} else {
zone := utilnode.GetZoneKey(node)
EvictionsNumber.WithLabelValues(zone).Inc()
}
_, condition := v1.GetNodeCondition(&node.Status, v1.NodeReady)
// Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive.
taintToAdd := v1.Taint{}
oppositeTaint := v1.Taint{}
if condition.Status == v1.ConditionFalse {
taintToAdd = *NotReadyTaintTemplate
oppositeTaint = *UnreachableTaintTemplate
} else if condition.Status == v1.ConditionUnknown {
taintToAdd = *UnreachableTaintTemplate
oppositeTaint = *NotReadyTaintTemplate
} else {
// It seems that the Node is ready again, so there's no need to taint it.
return true, 0
}

nodeUid, _ := value.UID.(string)
remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
if err != nil {
utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
return false, 0
}

if remaining {
glog.Infof("Pods awaiting deletion due to NodeController eviction")
}
return true, 0
})
}
}, nodeEvictionPeriod, wait.NeverStop)
taintToAdd.TimeAdded = metav1.Now()
err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, value.Value, &taintToAdd)
if err != nil {
utilruntime.HandleError(
fmt.Errorf(
"unable to taint %v unresponsive Node %q: %v",
taintToAdd.Key,
value.Value,
err))
return false, 0
}
err = controller.RemoveTaintOffNode(nc.kubeClient, value.Value, &oppositeTaint, node)
if err != nil {
utilruntime.HandleError(
fmt.Errorf(
"unable to remove %v unneeded taint from unresponsive Node %q: %v",
oppositeTaint.Key,
value.Value,
err))
return false, 0
}
return true, 0
})
}
}, nodeEvictionPeriod, wait.NeverStop)
} else {
// Managing eviction of nodes:
// When we delete pods off a node, if the node was not empty at the time we then
// queue an eviction watcher. If we hit an error, retry deletion.
go wait.Until(func() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
for k := range nc.zonePodEvictor {
// Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) {
node, err := nc.nodeLister.Get(value.Value)
if apierrors.IsNotFound(err) {
glog.Warningf("Node %v no longer present in nodeLister!", value.Value)
} else if err != nil {
glog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err)
} else {
zone := utilnode.GetZoneKey(node)
EvictionsNumber.WithLabelValues(zone).Inc()
}
nodeUid, _ := value.UID.(string)
remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore)
if err != nil {
utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
return false, 0
}
if remaining {
glog.Infof("Pods awaiting deletion due to NodeController eviction")
}
return true, 0
})
}
}, nodeEvictionPeriod, wait.NeverStop)
}
}()
}
@@ -478,15 +560,26 @@ func (nc *NodeController) monitorNodeStatus() error {
nc.knownNodeSet[added[i].Name] = added[i]
// When adding new Nodes we need to check if new zone appeared, and if so add new evictor.
zone := utilnode.GetZoneKey(added[i])
if _, found := nc.zonePodEvictor[zone]; !found {
nc.zonePodEvictor[zone] =
NewRateLimitedTimedQueue(
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
if _, found := nc.zoneStates[zone]; !found {
nc.zoneStates[zone] = stateInitial
if !nc.useTaintBasedEvictions {
nc.zonePodEvictor[zone] =
NewRateLimitedTimedQueue(
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
} else {
nc.zoneNotReadyOrUnreachableTainer[zone] =
NewRateLimitedTimedQueue(
flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst))
}
// Init the metric for the new zone.
glog.Infof("Initializing eviction metric for zone: %v", zone)
EvictionsNumber.WithLabelValues(zone).Add(0)
}
nc.cancelPodEviction(added[i])
if nc.useTaintBasedEvictions {
nc.markNodeAsHealthy(added[i])
} else {
nc.cancelPodEviction(added[i])
}
}

for i := range deleted {
@@ -532,21 +625,61 @@ func (nc *NodeController) monitorNodeStatus() error {
decisionTimestamp := nc.now()
if currentReadyCondition != nil {
// Check eviction timeout against decisionTimestamp
if observedReadyCondition.Status == v1.ConditionFalse &&
decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
if nc.evictPods(node) {
glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
if observedReadyCondition.Status == v1.ConditionFalse {
if nc.useTaintBasedEvictions {
if nc.markNodeForTainting(node) {
glog.V(2).Infof("Tainting Node %v with NotReady taint on %v",
node.Name,
decisionTimestamp,
)
}
} else {
if decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
if nc.evictPods(node) {
glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v",
node.Name,
decisionTimestamp,
nc.nodeStatusMap[node.Name].readyTransitionTimestamp,
nc.podEvictionTimeout,
)
}
}
}
}
if observedReadyCondition.Status == v1.ConditionUnknown &&
decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
if nc.evictPods(node) {
glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
if observedReadyCondition.Status == v1.ConditionUnknown {
if nc.useTaintBasedEvictions {
if nc.markNodeForTainting(node) {
glog.V(2).Infof("Tainting Node %v with NotReady taint on %v",
node.Name,
decisionTimestamp,
)
}
} else {
if decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
if nc.evictPods(node) {
glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v",
node.Name,
decisionTimestamp,
nc.nodeStatusMap[node.Name].readyTransitionTimestamp,
nc.podEvictionTimeout-gracePeriod,
)
}
}
}
}
if observedReadyCondition.Status == v1.ConditionTrue {
if nc.cancelPodEviction(node) {
glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
if nc.useTaintBasedEvictions {
removed, err := nc.markNodeAsHealthy(node)
if err != nil {
glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
}
if removed {
glog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name)
}
} else {
if nc.cancelPodEviction(node) {
glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
}
}
}
@@ -600,6 +733,7 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*v1
}
newZoneStates[k] = newState
if _, had := nc.zoneStates[k]; !had {
glog.Errorf("Setting initial state for unseen zone: %v", k)
nc.zoneStates[k] = stateInitial
}
}
@@ -629,11 +763,22 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*v1
if allAreFullyDisrupted {
glog.V(0).Info("NodeController detected that all Nodes are not-Ready. Entering master disruption mode.")
for i := range nodes {
nc.cancelPodEviction(nodes[i])
if nc.useTaintBasedEvictions {
_, err := nc.markNodeAsHealthy(nodes[i])
if err != nil {
glog.Errorf("Failed to remove taints from Node %v", nodes[i].Name)
}
} else {
nc.cancelPodEviction(nodes[i])
}
}
// We stop all evictions.
for k := range nc.zonePodEvictor {
nc.zonePodEvictor[k].SwapLimiter(0)
for k := range nc.zoneStates {
if nc.useTaintBasedEvictions {
nc.zoneNotReadyOrUnreachableTainer[k].SwapLimiter(0)
} else {
nc.zonePodEvictor[k].SwapLimiter(0)
}
}
for k := range nc.zoneStates {
nc.zoneStates[k] = stateFullDisruption
@@ -653,7 +798,7 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*v1
nc.nodeStatusMap[nodes[i].Name] = v
}
// We reset all rate limiters to settings appropriate for the given state.
for k := range nc.zonePodEvictor {
for k := range nc.zoneStates {
nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k])
nc.zoneStates[k] = newZoneStates[k]
}
@@ -676,13 +821,27 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*v1
func (nc *NodeController) setLimiterInZone(zone string, zoneSize int, state zoneState) {
switch state {
case stateNormal:
nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
if nc.useTaintBasedEvictions {
nc.zoneNotReadyOrUnreachableTainer[zone].SwapLimiter(nc.evictionLimiterQPS)
} else {
nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS)
}
case statePartialDisruption:
nc.zonePodEvictor[zone].SwapLimiter(
nc.enterPartialDisruptionFunc(zoneSize))
if nc.useTaintBasedEvictions {
nc.zoneNotReadyOrUnreachableTainer[zone].SwapLimiter(
nc.enterPartialDisruptionFunc(zoneSize))
} else {
nc.zonePodEvictor[zone].SwapLimiter(
nc.enterPartialDisruptionFunc(zoneSize))
}
case stateFullDisruption:
nc.zonePodEvictor[zone].SwapLimiter(
nc.enterFullDisruptionFunc(zoneSize))
if nc.useTaintBasedEvictions {
nc.zoneNotReadyOrUnreachableTainer[zone].SwapLimiter(
nc.enterFullDisruptionFunc(zoneSize))
} else {
nc.zonePodEvictor[zone].SwapLimiter(
nc.enterFullDisruptionFunc(zoneSize))
}
}
}
@@ -898,6 +1057,28 @@ func (nc *NodeController) evictPods(node *v1.Node) bool {
return nc.zonePodEvictor[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID))
}

func (nc *NodeController) markNodeForTainting(node *v1.Node) bool {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
return nc.zoneNotReadyOrUnreachableTainer[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID))
}

func (nc *NodeController) markNodeAsHealthy(node *v1.Node) (bool, error) {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, UnreachableTaintTemplate, node)
if err != nil {
glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
return false, err
}
err = controller.RemoveTaintOffNode(nc.kubeClient, node.Name, NotReadyTaintTemplate, node)
if err != nil {
glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
return false, err
}
return nc.zoneNotReadyOrUnreachableTainer[utilnode.GetZoneKey(node)].Remove(node.Name), nil
}

// Default value for cluster eviction rate - we take nodeNum for consistency with ReducedQPSFunc.
func (nc *NodeController) HealthyQPSFunc(nodeNum int) float32 {
return nc.evictionLimiterQPS
@@ -100,6 +100,7 @@ func NewNodeControllerFromClient(
nodeCIDRMaskSize,
allocateNodeCIDRs,
false,
false,
)
if err != nil {
return nil, err
|
@ -573,11 +574,15 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) {
|
|||
}
|
||||
zones := testutil.GetZones(item.fakeNodeHandler)
|
||||
for _, zone := range zones {
|
||||
nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
|
||||
nodeUid, _ := value.UID.(string)
|
||||
deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetInformer.Lister())
|
||||
return true, 0
|
||||
})
|
||||
if _, ok := nodeController.zonePodEvictor[zone]; ok {
|
||||
nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
|
||||
nodeUid, _ := value.UID.(string)
|
||||
deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetInformer.Lister())
|
||||
return true, 0
|
||||
})
|
||||
} else {
|
||||
t.Fatalf("Zone %v was unitialized!", zone)
|
||||
}
|
||||
}
|
||||
|
||||
podEvicted := false
|
||||
|
|
|
@@ -167,7 +167,7 @@ func init() {
addControllerRole(rbac.ClusterRole{
ObjectMeta: metav1.ObjectMeta{Name: saRolePrefix + "node-controller"},
Rules: []rbac.PolicyRule{
rbac.NewRule("get", "list", "update", "delete").Groups(legacyGroup).Resources("nodes").RuleOrDie(),
rbac.NewRule("get", "list", "update", "delete", "patch").Groups(legacyGroup).Resources("nodes").RuleOrDie(),
rbac.NewRule("update").Groups(legacyGroup).Resources("nodes/status").RuleOrDie(),
// used for pod eviction
rbac.NewRule("update").Groups(legacyGroup).Resources("pods/status").RuleOrDie(),
@@ -549,6 +549,7 @@ items:
- delete
- get
- list
- patch
- update
- apiGroups:
- ""
@@ -133,6 +133,7 @@ go_library(
"//pkg/controller/deployment/util:go_default_library",
"//pkg/controller/endpoint:go_default_library",
"//pkg/controller/job:go_default_library",
"//pkg/controller/node:go_default_library",
"//pkg/controller/replicaset:go_default_library",
"//pkg/controller/replication:go_default_library",
"//pkg/kubectl:go_default_library",
@@ -57,6 +57,7 @@ go_library(
"//pkg/cloudprovider/providers/gce:go_default_library",
"//pkg/controller:go_default_library",
"//pkg/controller/deployment/util:go_default_library",
"//pkg/controller/node:go_default_library",
"//pkg/kubectl:go_default_library",
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
@@ -82,6 +82,7 @@ import (
gcecloud "k8s.io/kubernetes/pkg/cloudprovider/providers/gce"
"k8s.io/kubernetes/pkg/controller"
deploymentutil "k8s.io/kubernetes/pkg/controller/deployment/util"
nodectlr "k8s.io/kubernetes/pkg/controller/node"
"k8s.io/kubernetes/pkg/kubectl"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/master/ports"
@@ -774,7 +775,7 @@ func WaitForMatchPodsCondition(c clientset.Interface, opts metav1.ListOptions, d
if len(conditionNotMatch) <= 0 {
return err
}
Logf("%d pods are not %s", len(conditionNotMatch), desc)
Logf("%d pods are not %s: %v", len(conditionNotMatch), desc, conditionNotMatch)
}
return fmt.Errorf("gave up waiting for matching pods to be '%s' after %v", desc, timeout)
}
@@ -2496,7 +2497,7 @@ func ExpectNodeHasLabel(c clientset.Interface, nodeName string, labelKey string,
}

func RemoveTaintOffNode(c clientset.Interface, nodeName string, taint v1.Taint) {
ExpectNoError(controller.RemoveTaintOffNode(c, nodeName, &taint))
ExpectNoError(controller.RemoveTaintOffNode(c, nodeName, &taint, nil))
VerifyThatTaintIsGone(c, nodeName, &taint)
}
@@ -2525,14 +2526,24 @@ func VerifyThatTaintIsGone(c clientset.Interface, nodeName string, taint *v1.Tai

func ExpectNodeHasTaint(c clientset.Interface, nodeName string, taint *v1.Taint) {
By("verifying the node has the taint " + taint.ToString())
if has, err := NodeHasTaint(c, nodeName, taint); !has {
ExpectNoError(err)
Failf("Failed to find taint %s on node %s", taint.ToString(), nodeName)
}
}

func NodeHasTaint(c clientset.Interface, nodeName string, taint *v1.Taint) (bool, error) {
node, err := c.Core().Nodes().Get(nodeName, metav1.GetOptions{})
ExpectNoError(err)
if err != nil {
return false, err
}

nodeTaints := node.Spec.Taints

if len(nodeTaints) == 0 || !v1.TaintExists(nodeTaints, taint) {
Failf("Failed to find taint %s on node %s", taint.ToString(), nodeName)
return false, nil
}
return true, nil
}

func getScalerForKind(internalClientset internalclientset.Interface, kind schema.GroupKind) (kubectl.Scaler, error) {
@@ -3961,7 +3972,47 @@ func isNodeConditionSetAsExpected(node *v1.Node, conditionType v1.NodeConditionT
for _, cond := range node.Status.Conditions {
// Ensure that the condition type and the status matches as desired.
if cond.Type == conditionType {
if (cond.Status == v1.ConditionTrue) == wantTrue {
// For NodeReady condition we need to check Taints as well
if cond.Type == v1.NodeReady {
hasNodeControllerTaints := false
// For NodeReady we need to check if Taints are gone as well
taints := node.Spec.Taints
for _, taint := range taints {
if taint.MatchTaint(nodectlr.UnreachableTaintTemplate) || taint.MatchTaint(nodectlr.NotReadyTaintTemplate) {
hasNodeControllerTaints = true
break
}
}
if wantTrue {
if (cond.Status == v1.ConditionTrue) && !hasNodeControllerTaints {
return true
} else {
msg := ""
if !hasNodeControllerTaints {
msg = fmt.Sprintf("Condition %s of node %s is %v instead of %t. Reason: %v, message: %v",
conditionType, node.Name, cond.Status == v1.ConditionTrue, wantTrue, cond.Reason, cond.Message)
} else {
msg = fmt.Sprintf("Condition %s of node %s is %v, but Node is tainted by NodeController with %v. Failure",
conditionType, node.Name, cond.Status == v1.ConditionTrue, taints)
}
if !silent {
Logf(msg)
}
return false
}
} else {
// TODO: check if the Node is tainted once we enable NC notReady/unreachable taints by default
if cond.Status != v1.ConditionTrue {
return true
}
if !silent {
Logf("Condition %s of node %s is %v instead of %t. Reason: %v, message: %v",
conditionType, node.Name, cond.Status == v1.ConditionTrue, wantTrue, cond.Reason, cond.Message)
}
return false
}
}
if (wantTrue && (cond.Status == v1.ConditionTrue)) || (!wantTrue && (cond.Status != v1.ConditionTrue)) {
return true
} else {
if !silent {
@@ -3971,6 +4022,7 @@ func isNodeConditionSetAsExpected(node *v1.Node, conditionType v1.NodeConditionT
return false
}
}

}
if !silent {
Logf("Couldn't find condition %v on node %v", conditionType, node.Name)
@@ -27,10 +27,13 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"

"k8s.io/client-go/tools/cache"

"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
nodepkg "k8s.io/kubernetes/pkg/controller/node"
"k8s.io/kubernetes/test/e2e/framework"
testutils "k8s.io/kubernetes/test/utils"
|
|||
}
|
||||
})
|
||||
})
|
||||
|
||||
framework.KubeDescribe("Pods", func() {
|
||||
Context("should be evicted from unready Node", func() {
|
||||
BeforeEach(func() {
|
||||
framework.SkipUnlessProviderIs("gce", "gke", "aws")
|
||||
framework.SkipUnlessNodeCountIsAtLeast(2)
|
||||
})
|
||||
|
||||
// What happens in this test:
|
||||
// Network traffic from a node to master is cut off to simulate network partition
|
||||
// Expect to observe:
|
||||
// 1. Node is marked NotReady after timeout by nodecontroller (40seconds)
|
||||
// 2. All pods on node are marked NotReady shortly after #1
|
||||
// 3. After enough time passess all Pods are evicted from the given Node
|
||||
It("[Feature:TaintEviction] All pods on the unreachable node should be marked as NotReady upon the node turn NotReady "+
|
||||
"AND all pods should be evicted after eviction timeout passes", func() {
|
||||
By("choose a node - we will block all network traffic on this node")
|
||||
var podOpts metav1.ListOptions
|
||||
nodes := framework.GetReadySchedulableNodesOrDie(c)
|
||||
framework.FilterNodes(nodes, func(node v1.Node) bool {
|
||||
if !framework.IsNodeConditionSetAsExpected(&node, v1.NodeReady, true) {
|
||||
return false
|
||||
}
|
||||
podOpts = metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector(api.PodHostField, node.Name).String()}
|
||||
pods, err := c.Core().Pods(metav1.NamespaceAll).List(podOpts)
|
||||
if err != nil || len(pods.Items) <= 0 {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
if len(nodes.Items) <= 0 {
|
||||
framework.Failf("No eligible node were found: %d", len(nodes.Items))
|
||||
}
|
||||
node := nodes.Items[0]
|
||||
podOpts = metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector(api.PodHostField, node.Name).String()}
|
||||
if err := framework.WaitForMatchPodsCondition(c, podOpts, "Running and Ready", podReadyTimeout, testutils.PodRunningReadyOrSucceeded); err != nil {
|
||||
framework.Failf("Pods on node %s are not ready and running within %v: %v", node.Name, podReadyTimeout, err)
|
||||
}
|
||||
pods, err := c.Core().Pods(metav1.NamespaceAll).List(podOpts)
|
||||
framework.ExpectNoError(err)
|
||||
podTolerationTimes := map[string]time.Duration{}
|
||||
// This test doesn't add tolerations by itself, but because they may be present in the cluster
|
||||
// it needs to account for that.
|
||||
for _, pod := range pods.Items {
|
||||
namespacedName := fmt.Sprintf("%v/%v", pod.Namespace, pod.Name)
|
||||
tolerations := pod.Spec.Tolerations
|
||||
framework.ExpectNoError(err)
|
||||
for _, toleration := range tolerations {
|
||||
if toleration.ToleratesTaint(nodepkg.UnreachableTaintTemplate) {
|
||||
if toleration.TolerationSeconds != nil {
|
||||
podTolerationTimes[namespacedName] = time.Duration(*toleration.TolerationSeconds) * time.Second
|
||||
break
|
||||
} else {
|
||||
podTolerationTimes[namespacedName] = -1
|
||||
}
|
||||
}
|
||||
}
|
||||
if _, ok := podTolerationTimes[namespacedName]; !ok {
|
||||
podTolerationTimes[namespacedName] = 0
|
||||
}
|
||||
}
|
||||
neverEvictedPods := []string{}
|
||||
maxTolerationTime := time.Duration(0)
|
||||
for podName, tolerationTime := range podTolerationTimes {
|
||||
if tolerationTime < 0 {
|
||||
neverEvictedPods = append(neverEvictedPods, podName)
|
||||
} else {
|
||||
if tolerationTime > maxTolerationTime {
|
||||
maxTolerationTime = tolerationTime
|
||||
}
|
||||
}
|
||||
}
|
||||
framework.Logf(
|
||||
"Only %v should be running after partition. Maximum TolerationSeconds among other Pods is %v",
|
||||
neverEvictedPods,
|
||||
maxTolerationTime,
|
||||
)
|
||||
|
||||
By("Set up watch on node status")
|
||||
nodeSelector := fields.OneTermEqualSelector("metadata.name", node.Name)
|
||||
stopCh := make(chan struct{})
|
||||
newNode := make(chan *v1.Node)
|
||||
var controller cache.Controller
|
||||
_, controller = cache.NewInformer(
|
||||
&cache.ListWatch{
|
||||
ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
|
||||
options.FieldSelector = nodeSelector.String()
|
||||
obj, err := f.ClientSet.Core().Nodes().List(options)
|
||||
return runtime.Object(obj), err
|
||||
},
|
||||
WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
|
||||
options.FieldSelector = nodeSelector.String()
|
||||
return f.ClientSet.Core().Nodes().Watch(options)
|
||||
},
|
||||
},
|
||||
&v1.Node{},
|
||||
0,
|
||||
cache.ResourceEventHandlerFuncs{
|
||||
UpdateFunc: func(oldObj, newObj interface{}) {
|
||||
n, ok := newObj.(*v1.Node)
|
||||
Expect(ok).To(Equal(true))
|
||||
newNode <- n
|
||||
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
defer func() {
|
||||
// Will not explicitly close newNode channel here due to
|
||||
// race condition where stopCh and newNode are closed but informer onUpdate still executes.
|
||||
close(stopCh)
|
||||
}()
|
||||
go controller.Run(stopCh)
|
||||
|
||||
By(fmt.Sprintf("Block traffic from node %s to the master", node.Name))
|
||||
host := framework.GetNodeExternalIP(&node)
|
||||
master := framework.GetMasterAddress(c)
|
||||
defer func() {
|
||||
By(fmt.Sprintf("Unblock traffic from node %s to the master", node.Name))
|
||||
framework.UnblockNetwork(host, master)
|
||||
|
||||
if CurrentGinkgoTestDescription().Failed {
|
||||
return
|
||||
}
|
||||
|
||||
By("Expect to observe node status change from NotReady to Ready after network connectivity recovers")
|
||||
expectNodeReadiness(true, newNode)
|
||||
}()
|
||||
|
||||
framework.BlockNetwork(host, master)
|
||||
|
||||
By("Expect to observe node and pod status change from Ready to NotReady after network partition")
|
||||
expectNodeReadiness(false, newNode)
|
||||
framework.ExpectNoError(wait.Poll(1*time.Second, timeout, func() (bool, error) {
|
||||
return framework.NodeHasTaint(c, node.Name, nodepkg.UnreachableTaintTemplate)
|
||||
}))
|
||||
if err = framework.WaitForMatchPodsCondition(c, podOpts, "NotReady", podNotReadyTimeout, testutils.PodNotReady); err != nil {
|
||||
framework.Failf("Pods on node %s did not become NotReady within %v: %v", node.Name, podNotReadyTimeout, err)
|
||||
}
|
||||
|
||||
sleepTime := maxTolerationTime + 20*time.Second
|
||||
By(fmt.Sprintf("Sleeping for %v and checking if all Pods were evicted", sleepTime))
|
||||
time.Sleep(sleepTime)
|
||||
pods, err = c.Core().Pods(v1.NamespaceAll).List(podOpts)
|
||||
framework.ExpectNoError(err)
|
||||
seenRunning := []string{}
|
||||
for _, pod := range pods.Items {
|
||||
namespacedName := fmt.Sprintf("%v/%v", pod.Namespace, pod.Name)
|
||||
shouldBeTerminating := true
|
||||
for _, neverEvictedPod := range neverEvictedPods {
|
||||
if neverEvictedPod == namespacedName {
|
||||
shouldBeTerminating = false
|
||||
}
|
||||
}
|
||||
if pod.DeletionTimestamp == nil {
|
||||
seenRunning = append(seenRunning, namespacedName)
|
||||
if shouldBeTerminating {
|
||||
framework.Failf("Pod %v should have been deleted but was seen running", namespacedName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, neverEvictedPod := range neverEvictedPods {
|
||||
running := false
|
||||
for _, runningPod := range seenRunning {
|
||||
if runningPod == neverEvictedPod {
|
||||
running = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !running {
|
||||
framework.Failf("Pod %v was evicted even though it shouldn't", neverEvictedPod)
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
||||