daemonset: differentiate between cases in nodeShouldRun

Specifically, we need to differentiate between wanting to run,
should schedule, and should continue running. This is required
to support all taint effects and will improve reporting and
end-user debuggability.
Mike Danese 2016-12-14 14:25:35 -08:00
parent 330c922706
commit c518e89042
2 changed files with 107 additions and 18 deletions
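The commit replaces the single boolean returned by nodeShouldRunDaemonPod with three results plus an error. As a rough sketch of the contract the callers below rely on (illustrative only, not part of the commit; the decide helper and its inputs are hypothetical stand-ins), a reconciliation step modeled on the manage() hunk combines two of the three results with the node's current state like this:

package main

import "fmt"

// decide mirrors how the manage() hunk below combines the shouldSchedule and
// shouldContinueRunning results of nodeShouldRunDaemonPod with whether a
// daemon pod is already running on the node. Sketch only, not controller code.
func decide(shouldSchedule, shouldContinueRunning, isRunning bool) string {
    switch {
    case shouldSchedule && !isRunning:
        // Pod should be on the node but isn't: create one.
        return "create"
    case !shouldContinueRunning && isRunning:
        // Pod is on the node but may no longer run there: delete it.
        return "delete"
    default:
        return "leave as is"
    }
}

func main() {
    // A node that fails a resource predicate: shouldSchedule is false but
    // shouldContinueRunning stays true, so an existing pod is left alone
    // while no new pod is created.
    fmt.Println(decide(false, true, true))  // leave as is
    fmt.Println(decide(false, true, false)) // leave as is
}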


@@ -387,8 +387,11 @@ func (dsc *DaemonSetsController) addNode(obj interface{}) {
     node := obj.(*v1.Node)
     for i := range dsList.Items {
         ds := &dsList.Items[i]
-        shouldEnqueue := dsc.nodeShouldRunDaemonPod(node, ds)
-        if shouldEnqueue {
+        _, shouldSchedule, _, err := dsc.nodeShouldRunDaemonPod(node, ds)
+        if err != nil {
+            continue
+        }
+        if shouldSchedule {
             dsc.enqueueDaemonSet(ds)
         }
     }
@@ -406,14 +409,21 @@ func (dsc *DaemonSetsController) updateNode(old, cur interface{}) {
         glog.V(4).Infof("Error enqueueing daemon sets: %v", err)
         return
     }
-    // TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
     for i := range dsList.Items {
         ds := &dsList.Items[i]
-        shouldEnqueue := (dsc.nodeShouldRunDaemonPod(oldNode, ds) != dsc.nodeShouldRunDaemonPod(curNode, ds))
-        if shouldEnqueue {
+        _, oldShouldSchedule, oldShouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(oldNode, ds)
+        if err != nil {
+            continue
+        }
+        _, currentShouldSchedule, currentShouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(curNode, ds)
+        if err != nil {
+            continue
+        }
+        if (oldShouldSchedule != currentShouldSchedule) || (oldShouldContinueRunning != currentShouldContinueRunning) {
             dsc.enqueueDaemonSet(ds)
         }
     }
+    // TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
 }
 
 // getNodesToDaemonSetPods returns a map from nodes to daemon pods (corresponding to ds) running on the nodes.
@@ -451,22 +461,25 @@ func (dsc *DaemonSetsController) manage(ds *extensions.DaemonSet) error {
     }
     var nodesNeedingDaemonPods, podsToDelete []string
     for _, node := range nodeList.Items {
-        shouldRun := dsc.nodeShouldRunDaemonPod(&node, ds)
+        _, shouldSchedule, shouldContinueRunning, err := dsc.nodeShouldRunDaemonPod(&node, ds)
+        if err != nil {
+            continue
+        }
         daemonPods, isRunning := nodeToDaemonPods[node.Name]
         switch {
-        case shouldRun && !isRunning:
+        case shouldSchedule && !isRunning:
             // If daemon pod is supposed to be running on node, but isn't, create daemon pod.
             nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
-        case shouldRun && len(daemonPods) > 1:
+        case shouldContinueRunning && len(daemonPods) > 1:
             // If daemon pod is supposed to be running on node, but more than 1 daemon pod is running, delete the excess daemon pods.
             // Sort the daemon pods by creation time, so the oldest is preserved.
             sort.Sort(podByCreationTimestamp(daemonPods))
             for i := 1; i < len(daemonPods); i++ {
                 podsToDelete = append(podsToDelete, daemonPods[i].Name)
             }
-        case !shouldRun && isRunning:
+        case !shouldContinueRunning && isRunning:
             // If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
             for i := range daemonPods {
                 podsToDelete = append(podsToDelete, daemonPods[i].Name)
@@ -588,11 +601,14 @@ func (dsc *DaemonSetsController) updateDaemonSetStatus(ds *extensions.DaemonSet)
     var desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady int
     for _, node := range nodeList.Items {
-        shouldRun := dsc.nodeShouldRunDaemonPod(&node, ds)
+        wantToRun, _, _, err := dsc.nodeShouldRunDaemonPod(&node, ds)
+        if err != nil {
+            return err
+        }
         scheduled := len(nodeToDaemonPods[node.Name]) > 0
-        if shouldRun {
+        if wantToRun {
             desiredNumberScheduled++
             if scheduled {
                 currentNumberScheduled++
@@ -658,16 +674,35 @@ func (dsc *DaemonSetsController) syncDaemonSet(key string) error {
     return dsc.updateDaemonSetStatus(ds)
 }
 
-func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *extensions.DaemonSet) bool {
+// nodeShouldRunDaemonPod checks a set of preconditions against a (node, daemonset) and returns a
+// summary. Returned booleans are:
+// * wantToRun:
+//     Returns true when a user would expect a pod to run on this node and ignores conditions
+//     such as OutOfDisk or insufficient resources that would cause a daemonset pod not to schedule.
+//     This is primarily used to populate daemonset status.
+// * shouldSchedule:
+//     Returns true when a daemonset should be scheduled to a node if a daemonset pod is not already
+//     running on that node.
+// * shouldContinueRunning:
+//     Returns true when a daemonset should continue running on a node if a daemonset pod is already
+//     running on that node.
+func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *extensions.DaemonSet) (wantToRun, shouldSchedule, shouldContinueRunning bool, err error) {
+    // Because these bools require an && of all their required conditions, we start
+    // with all bools set to true and set a bool to false if a condition is not met.
+    // A bool should probably not be set to true after this line.
+    wantToRun, shouldSchedule, shouldContinueRunning = true, true, true
     // If the daemon set specifies a node name, check that it matches with node.Name.
     if !(ds.Spec.Template.Spec.NodeName == "" || ds.Spec.Template.Spec.NodeName == node.Name) {
-        return false
+        return false, false, false, nil
     }
 
     // TODO: Move it to the predicates
     for _, c := range node.Status.Conditions {
         if c.Type == v1.NodeOutOfDisk && c.Status == v1.ConditionTrue {
-            return false
+            // the kubelet will evict this pod if it needs to. Let kubelet
+            // decide whether to continue running this pod so leave shouldContinueRunning
+            // set to true
+            shouldSchedule = false
         }
     }
@@ -695,22 +730,59 @@ func (dsc *DaemonSetsController) nodeShouldRunDaemonPod(node *v1.Node, ds *extensions.DaemonSet) bool {
     nodeInfo := schedulercache.NewNodeInfo(pods...)
     nodeInfo.SetNode(node)
-    fit, reasons, err := predicates.GeneralPredicates(newPod, nil, nodeInfo)
+    _, reasons, err := predicates.GeneralPredicates(newPod, nil, nodeInfo)
     if err != nil {
         glog.Warningf("GeneralPredicates failed on ds '%s/%s' due to unexpected error: %v", ds.ObjectMeta.Namespace, ds.ObjectMeta.Name, err)
+        return false, false, false, err
     }
     for _, r := range reasons {
         glog.V(4).Infof("GeneralPredicates failed on ds '%s/%s' for reason: %v", ds.ObjectMeta.Namespace, ds.ObjectMeta.Name, r.GetReason())
         switch reason := r.(type) {
         case *predicates.InsufficientResourceError:
             dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: %s", node.ObjectMeta.Name, reason.Error())
+            shouldSchedule = false
         case *predicates.PredicateFailureError:
-            if reason == predicates.ErrPodNotFitsHostPorts {
-                dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: host port conflict", node.ObjectMeta.Name)
+            var emitEvent bool
+            // we try to partition predicates into two groups here: intentional on the part of the operator and not.
+            switch reason {
+            // intentional
+            case
+                predicates.ErrNodeSelectorNotMatch,
+                predicates.ErrPodNotMatchHostName,
+                predicates.ErrNodeLabelPresenceViolated,
+                // this one is probably intentional since it's a workaround for not having
+                // pod hard anti affinity.
+                predicates.ErrPodNotFitsHostPorts:
+                wantToRun, shouldSchedule, shouldContinueRunning = false, false, false
+            // unintentional
+            case
+                predicates.ErrDiskConflict,
+                predicates.ErrVolumeZoneConflict,
+                predicates.ErrMaxVolumeCountExceeded,
+                predicates.ErrNodeUnderMemoryPressure,
+                predicates.ErrNodeUnderDiskPressure:
+                // wantToRun and shouldContinueRunning are likely true here. They are
+                // absolutely true at the time of writing the comment. See first comment
+                // of this method.
+                shouldSchedule = false
+                emitEvent = true
+            // unexpected
+            case
+                predicates.ErrPodAffinityNotMatch,
+                predicates.ErrServiceAffinityViolated,
+                predicates.ErrTaintsTolerationsNotMatch:
+                return false, false, false, fmt.Errorf("unexpected reason: GeneralPredicates should not return reason %s", reason.GetReason())
+            default:
+                glog.V(4).Infof("unknown predicate failure reason: %s", reason.GetReason())
+                wantToRun, shouldSchedule, shouldContinueRunning = false, false, false
+                emitEvent = true
+            }
+            if emitEvent {
+                dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, "FailedPlacement", "failed to place pod on %q: %s", node.ObjectMeta.Name, reason.GetReason())
             }
         }
     }
-    return fit
+    return
 }
 
 // byCreationTimestamp sorts a list by creation timestamp, using their names as a tie breaker.

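The comment at the top of the new nodeShouldRunDaemonPod ("we start with all bools set to true and set a bool to false if a condition is not met") is load-bearing: the checks run in sequence, so a later check may only lower a flag, never raise it back, or it would mask an earlier failure. A minimal standalone sketch of that accumulation pattern (the check names are hypothetical, not controller code):

package main

import "fmt"

// evaluate accumulates results the way nodeShouldRunDaemonPod does: every
// flag starts true and each check may only AND-in a false. nodeNameMatches
// and fitsResources are hypothetical stand-ins for the real checks.
func evaluate(nodeNameMatches, fitsResources bool) (wantToRun, shouldSchedule bool) {
    wantToRun, shouldSchedule = true, true
    if !nodeNameMatches {
        // An explicit node-name mismatch is intentional on the operator's
        // part, so neither flag stays set.
        wantToRun, shouldSchedule = false, false
    }
    if !fitsResources {
        // Insufficient resources only block scheduling; writing
        // `shouldSchedule = fitsResources` here would wrongly reset the
        // flag to true when an earlier check already failed.
        shouldSchedule = false
    }
    return wantToRun, shouldSchedule
}

func main() {
    fmt.Println(evaluate(false, true)) // false false: the mismatch is preserved
    fmt.Println(evaluate(true, false)) // true false: still want to run, just can't schedule now
}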

@@ -267,6 +267,23 @@ func TestInsufficentCapacityNodeDaemonDoesNotLaunchPod(t *testing.T) {
     syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0)
 }
 
+// DaemonSets should not unschedule a daemonset pod from a node with insufficient free resource
+func TestInsufficentCapacityNodeDaemonDoesNotUnscheduleRunningPod(t *testing.T) {
+    podSpec := resourcePodSpec("too-much-mem", "75M", "75m")
+    podSpec.NodeName = "too-much-mem"
+    manager, podControl, _ := newTestController()
+    node := newNode("too-much-mem", nil)
+    node.Status.Allocatable = allocatableResources("100M", "200m")
+    manager.nodeStore.Add(node)
+    manager.podStore.Indexer.Add(&v1.Pod{
+        Spec: podSpec,
+    })
+    ds := newDaemonSet("foo")
+    ds.Spec.Template.Spec = podSpec
+    manager.dsStore.Add(ds)
+    syncAndValidateDaemonSets(t, manager, ds, podControl, 0, 0)
+}
+
 func TestSufficentCapacityWithTerminatedPodsDaemonLaunchesPod(t *testing.T) {
     podSpec := resourcePodSpec("too-much-mem", "75M", "75m")
     manager, podControl, _ := newTestController()