E2E test node upgrade (to same version)

pull/6/head
Max Forbes 2015-05-21 14:43:42 -07:00
parent a8a3e9d0c7
commit a6c47a07de
4 changed files with 519 additions and 117 deletions

View File

@ -57,7 +57,7 @@ type WaitFunc func() <-chan struct{}
// placed on the channel and once more when the channel is closed. If c
// returns an error the loop ends and that error is returned, and if c returns
// true the loop ends and nil is returned. ErrWaitTimeout will be returned if
// the channel is closed without c every returning true.
// the channel is closed without c ever returning true.
func WaitFor(wait WaitFunc, c ConditionFunc) error {
w := wait()
for {

View File

@ -30,28 +30,17 @@ import (
)
const (
// How long to pause between polling node or pod status.
poll = 5 * time.Second
// How long nodes have to be "ready" before the reboot. They should already
// be "ready" before the test starts, so this is small.
nodeReadyInitialTimeout = 20 * time.Second
// How long pods have to be "ready" before the reboot. They should already
// be "ready" before the test starts, so this is small.
podReadyBeforeTimeout = 20 * time.Second
// How long a node is allowed to go from "Ready" to "NotReady" after a
// reboot is issued before the test is considered failed.
rebootNotReadyTimeout = 2 * time.Minute
rebootNodeNotReadyTimeout = 2 * time.Minute
// How long a node is allowed to go from "NotReady" to "Ready" after a
// reboot is issued and it is found to be "NotReady" before the test is
// considered failed.
rebootReadyAgainTimeout = 5 * time.Minute
rebootNodeReadyAgainTimeout = 5 * time.Minute
// How long pods have to be "ready" after the reboot.
podReadyAgainTimeout = 5 * time.Minute
rebootPodReadyAgainTimeout = 5 * time.Minute
)
var _ = Describe("Reboot", func() {
@ -105,7 +94,7 @@ func testReboot(c *client.Client, rebootCmd string) {
}
// Get all nodes, and kick off the test on each.
nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
if err != nil {
Failf("Error getting nodes: %v", err)
}
@ -159,6 +148,10 @@ func issueSSHCommand(node *api.Node, provider, cmd string) error {
// It returns true through result only if all of the steps pass; at the first
// failed step, it will return false through result and not run the rest.
func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
// Setup
ps := newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
defer ps.Stop()
// Get the node initially.
Logf("Getting %s", name)
node, err := c.Nodes().Get(name)
@ -175,22 +168,16 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
}
// Get all the pods on the node.
podList, err := c.Pods(api.NamespaceDefault).List(
labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
if err != nil {
Logf("Error getting pods for node %s: %v", name, err)
result <- false
return
}
podNames := make([]string, len(podList.Items))
for i, p := range podList.Items {
pods := ps.List()
podNames := make([]string, len(pods))
for i, p := range pods {
podNames[i] = p.ObjectMeta.Name
}
Logf("Node %s has %d pods: %v", name, len(podNames), podNames)
// For each pod, we do a sanity check to ensure it's running / healthy
// now, as that's what we'll be checking later.
if !checkPodsRunning(c, podNames, podReadyBeforeTimeout) {
if !checkPodsRunningReady(c, podNames, podReadyBeforeTimeout) {
result <- false
return
}
@ -202,20 +189,20 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
}
// Wait for some kind of "not ready" status.
if !waitForNodeToBeNotReady(c, name, rebootNotReadyTimeout) {
if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
result <- false
return
}
// Wait for some kind of "ready" status.
if !waitForNodeToBeReady(c, name, rebootReadyAgainTimeout) {
if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
result <- false
return
}
// Ensure all of the pods that we found on this node before the reboot are
// running / healthy.
if !checkPodsRunning(c, podNames, podReadyAgainTimeout) {
if !checkPodsRunningReady(c, podNames, rebootPodReadyAgainTimeout) {
result <- false
return
}
@ -223,72 +210,3 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
Logf("Reboot successful on node %s", name)
result <- true
}
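
The main change to rebootNode above is that the pods on a node now come from a cached podStore rather than a one-off Pods().List call. A minimal usage sketch, using only the calls visible in this commit (newPodStore, List, Stop); the helper name below is hypothetical and not part of the commit:

package e2e

import (
    "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/client"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
    "github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
)

// podNamesOnNode is a hypothetical helper showing the podStore pattern used in
// rebootNode: create the store scoped to one node via a field selector on the
// pod's host, snapshot its contents with List(), and Stop() it when finished.
func podNamesOnNode(c *client.Client, nodeName string) []string {
    ps := newPodStore(c, api.NamespaceDefault, labels.Everything(),
        fields.OneTermEqualSelector(client.PodHost, nodeName))
    defer ps.Stop()

    pods := ps.List()
    names := make([]string, len(pods))
    for i, p := range pods {
        names[i] = p.ObjectMeta.Name
    }
    return names
}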
// checkPodsRunning returns whether all pods whose names are listed in podNames
// are running.
func checkPodsRunning(c *client.Client, podNames []string, timeout time.Duration) bool {
desc := "running and ready"
Logf("Waiting up to %v for the following pods to be %s: %s", timeout, desc, podNames)
result := make(chan bool, len(podNames))
for ix := range podNames {
// Launch off pod readiness checkers.
go func(name string) {
err := waitForPodCondition(c, api.NamespaceDefault, name, desc,
poll, timeout, podRunningReady)
result <- err == nil
}(podNames[ix])
}
// Wait for them all to finish.
success := true
// TODO(mbforbes): Change to `for range` syntax and remove logging once we
// support only Go >= 1.4.
for _, podName := range podNames {
if !<-result {
Logf("Pod %s failed to be %s.", podName, desc)
success = false
}
}
Logf("Wanted all pods to be %s. Result: %t. Pods: %v", desc, success, podNames)
return success
}
// waitForNodeToBeReady returns whether node name is ready within timeout.
func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
return waitForNodeToBe(c, name, true, timeout)
}
// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
// readiness condition is anything but ready, e.g false or unknown) within
// timeout.
func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
return waitForNodeToBe(c, name, false, timeout)
}
// waitForNodeToBe returns whether node name's readiness state matches wantReady
// within timeout. If wantReady is true, it will ensure the node is ready; if
// it's false, it ensures the node is in any state other than ready (e.g. not
// ready or unknown).
func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
node, err := c.Nodes().Get(name)
if err != nil {
Logf("Couldn't get node %s", name)
continue
}
// Check the node readiness condition (logging all).
for i, cond := range node.Status.Conditions {
Logf("Node %s condition %d/%d: type: %v, status: %v",
name, i+1, len(node.Status.Conditions), cond.Type, cond.Status)
// Ensure that the condition type is readiness and the status
// matches as desired.
if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
Logf("Successfully found node %s readiness to be %t", name, wantReady)
return true
}
}
}
Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
return false
}

test/e2e/restart.go (new file, +383 lines)
View File

@ -0,0 +1,383 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"fmt"
"os/exec"
"strings"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
const (
// How long each node is given during a process that restarts all nodes
// before the test is considered failed. (Note that the total time to
// restart all nodes will be this number times the number of nodes.)
restartPerNodeTimeout = 5 * time.Minute
// How often to poll the status of a restart.
restartPoll = 20 * time.Second
// How long a node is allowed to become "Ready" after it is restarted before
// the test is considered failed.
restartNodeReadyAgainTimeout = 5 * time.Minute
// How long a pod is allowed to become "running" and "ready" after a node
// restart before the test is considered failed.
restartPodReadyAgainTimeout = 5 * time.Minute
)
var _ = Describe("Restart", func() {
var c *client.Client
var ps *podStore
BeforeEach(func() {
var err error
c, err = loadClient()
Expect(err).NotTo(HaveOccurred())
ps = newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.Everything())
})
AfterEach(func() {
ps.Stop()
})
It("should restart all nodes and ensure all nodes and pods recover", func() {
// This test requires the ability to restart all nodes, so the provider
// check must be identical to that call.
provider := testContext.Provider
nn := testContext.CloudConfig.NumNodes
if !providerIs("gce") {
By(fmt.Sprintf("Skipping restart test, which is not implemented for %s", provider))
return
}
By("ensuring all nodes are ready")
nodeNamesBefore, err := checkNodesReady(c, nodeReadyInitialTimeout, nn)
Expect(err).NotTo(HaveOccurred())
Logf("Got the following nodes before restart: %v", nodeNamesBefore)
By("ensuring all pods are running and ready")
pods := ps.List()
podNamesBefore := make([]string, len(pods))
for i, p := range pods {
podNamesBefore[i] = p.ObjectMeta.Name
}
if !checkPodsRunningReady(c, podNamesBefore, podReadyBeforeTimeout) {
Failf("At least one pod wasn't running and ready at test start.")
}
By("restarting all of the nodes")
err = restartNodes(provider, restartPerNodeTimeout)
Expect(err).NotTo(HaveOccurred())
By("ensuring all nodes are ready after the restart")
nodeNamesAfter, err := checkNodesReady(c, restartNodeReadyAgainTimeout, nn)
Expect(err).NotTo(HaveOccurred())
Logf("Got the following nodes after restart: %v", nodeNamesAfter)
// Make sure that we have the same number of nodes. We're not checking
// that the names match because that's implementation specific.
By("ensuring the same number of nodes exist after the restart")
if len(nodeNamesBefore) != len(nodeNamesAfter) {
Failf("Had %d nodes before nodes were restarted, but now only have %d",
len(nodeNamesBefore), len(nodeNamesAfter))
}
// Make sure that we have the same number of pods. We're not checking
// that the names match because they are recreated with different names
// across node restarts.
By("ensuring the same number of pods are running and ready after restart")
podCheckStart := time.Now()
podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout)
Expect(err).NotTo(HaveOccurred())
remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart)
if !checkPodsRunningReady(c, podNamesAfter, remaining) {
Failf("At least one pod wasn't running and ready after the restart.")
}
})
})
// waitForNPods tries to list pods using ps until it finds expect of them,
// returning their names if it can do so before timeout.
func waitForNPods(ps *podStore, expect int, timeout time.Duration) ([]string, error) {
// Loop until we find expect pods or timeout is passed.
var pods []*api.Pod
var errLast error
found := wait.Poll(poll, timeout, func() (bool, error) {
pods = ps.List()
if len(pods) != expect {
errLast = fmt.Errorf("expected to find %d pods but found only %d", expect, len(pods))
Logf("Error getting pods: %v", errLast)
return false, nil
}
return true, nil
}) == nil
// Extract the names of all found pods.
podNames := make([]string, len(pods))
for i, p := range pods {
podNames[i] = p.ObjectMeta.Name
}
if !found {
return podNames, fmt.Errorf("couldn't find %d pods within %v; last error: %v",
expect, timeout, errLast)
}
return podNames, nil
}
// checkNodesReady waits up to nt for expect nodes accessed by c to be ready,
// returning an error if this doesn't happen in time. It returns the names of
// nodes it finds.
func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string, error) {
// First, keep getting all of the nodes until we get the number we expect.
var nodeList *api.NodeList
var errLast error
start := time.Now()
found := wait.Poll(poll, nt, func() (bool, error) {
// Even though listNodes(...) has its own retries, a rolling-update
// (GCE/GKE implementation of restart) can complete before the apiserver
// knows about all of the nodes. Thus, we retry the list nodes call
// until we get the expected number of nodes.
nodeList, errLast = listNodes(c, labels.Everything(), fields.Everything())
if errLast != nil {
return false, nil
}
if len(nodeList.Items) != expect {
errLast = fmt.Errorf("expected to find %d nodes but found only %d", expect, len(nodeList.Items))
Logf("%v", errLast)
return false, nil
}
return true, nil
}) == nil
nodeNames := make([]string, len(nodeList.Items))
for i, n := range nodeList.Items {
nodeNames[i] = n.ObjectMeta.Name
}
if !found {
return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
expect, nt, errLast)
}
// Next, ensure in parallel that all the nodes are ready. We subtract the
// time we spent waiting above.
timeout := nt - time.Since(start)
result := make(chan bool, len(nodeList.Items))
for _, n := range nodeNames {
n := n
go func() { result <- waitForNodeToBeReady(c, n, timeout) }()
}
failed := false
// TODO(mbforbes): Change to `for range` syntax once we support only Go
// >= 1.4.
for i := range nodeList.Items {
_ = i
if !<-result {
failed = true
}
}
if failed {
return nodeNames, fmt.Errorf("at least one node failed to be ready")
}
return nodeNames, nil
}
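
One detail worth calling out in checkNodesReady above is the n := n line before each goroutine is launched: the closure must capture its own copy of the loop variable, otherwise every goroutine would see the last node name (the classic Go loop-variable capture pitfall; the language itself only changed this behavior much later, in Go 1.22). A small standalone illustration of the same fan-out pattern, with made-up node names:

package main

import (
    "fmt"
    "sync"
)

func main() {
    names := []string{"node-a", "node-b", "node-c"}
    results := make(chan string, len(names))
    var wg sync.WaitGroup

    for _, n := range names {
        n := n // shadow the loop variable so each goroutine gets its own copy
        wg.Add(1)
        go func() {
            defer wg.Done()
            results <- n
        }()
    }
    wg.Wait()
    close(results)

    for r := range results {
        fmt.Println("checked:", r) // each name appears exactly once
    }
}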
// restartNodes uses provider to do a restart of all nodes in the cluster,
// allowing up to nt per node.
func restartNodes(provider string, nt time.Duration) error {
switch provider {
case "gce":
return migRollingUpdate(nt)
default:
return fmt.Errorf("restartNodes(...) not implemented for %s", provider)
}
}
// migRollingUpdate starts a MIG rolling update and waits up to nt times the
// number of nodes for it to complete.
func migRollingUpdate(nt time.Duration) error {
By("getting the name of the template for the managed instance group")
templ, err := migTemplate()
if err != nil {
return fmt.Errorf("couldn't get MIG template name: %v", err)
}
By("starting the managed instance group rolling update")
id, err := migRollingUpdateStart(templ, nt)
if err != nil {
return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
}
By("polling the managed instance group rolling update until it completes")
if err := migRollingUpdatePoll(id, nt); err != nil {
return fmt.Errorf("err waiting until update completed: %v", err)
}
return nil
}
// migTemplate (GCE/GKE-only) returns the name of the MIG template that the
// nodes of the cluster use.
func migTemplate() (string, error) {
var errLast error
var templ string
key := "instanceTemplate"
if wait.Poll(poll, singleCallTimeout, func() (bool, error) {
// TODO(mbforbes): make this hit the compute API directly instead of
// shelling out to gcloud.
o, err := exec.Command("gcloud", "preview", "managed-instance-groups",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
"describe",
testContext.CloudConfig.NodeInstanceGroup).CombinedOutput()
if err != nil {
errLast = fmt.Errorf("gcloud preview managed-instance-groups describe call failed with err: %v", err)
return false, nil
}
output := string(o)
// The 'describe' call probably succeeded; parse the output and try to
// find the line that looks like "instanceTemplate: url/to/<templ>" and
// return <templ>.
if val := parseKVLines(output, key); len(val) > 0 {
url := strings.Split(val, "/")
templ = url[len(url)-1]
Logf("MIG group %s using template: %s", testContext.CloudConfig.NodeInstanceGroup, templ)
return true, nil
}
errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output)
return false, nil
}) != nil {
return "", fmt.Errorf("migTemplate() failed with last error: %v", errLast)
}
return templ, nil
}
// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ
// as the new template, waiting up to nt per node, and returns the ID of that
// update.
func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
var errLast error
var id string
prefix, suffix := "Started [", "]."
if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
// TODO(mbforbes): make this hit the compute API directly instead of
// shelling out to gcloud.
o, err := exec.Command("gcloud", "preview", "rolling-updates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
"start",
// Required args.
fmt.Sprintf("--group=%s", testContext.CloudConfig.NodeInstanceGroup),
fmt.Sprintf("--template=%s", templ),
// Optional args to fine-tune behavior.
fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())),
// NOTE: We can speed up this process by increasing
// --max-num-concurrent-instances.
fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
fmt.Sprintf("--max-num-failed-instances=%d", 0),
fmt.Sprintf("--min-instance-update-time=%ds", 0)).CombinedOutput()
if err != nil {
errLast = fmt.Errorf("gcloud preview rolling-updates call failed with err: %v", err)
return false, nil
}
output := string(o)
// The 'start' call probably succeeded; parse the output and try to find
// the line that looks like "Started [url/to/<id>]." and return <id>.
for _, line := range strings.Split(output, "\n") {
// As a sanity check, ensure the line starts with prefix and ends
// with suffix.
if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) {
continue
}
url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
id = url[len(url)-1]
Logf("Started MIG rolling update; ID: %s", id)
return true, nil
}
errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s",
prefix, suffix, output)
return false, nil
}); err != nil {
return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast)
}
return id, nil
}
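
For illustration, here is a standalone sketch of the prefix/suffix parsing that migRollingUpdateStart performs on the gcloud output; the output line, URL, and update ID below are made up:

package main

import (
    "fmt"
    "strings"
)

func main() {
    prefix, suffix := "Started [", "]."
    // Hypothetical output line; the real command prints the rolling update's
    // self-link between "Started [" and "].".
    line := "Started [https://example.invalid/zones/some-zone/rollingUpdates/update-123]."

    // Sanity check: the line must start with prefix and end with suffix.
    if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) {
        fmt.Println("line does not have the expected shape")
        return
    }
    url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
    fmt.Println("update ID:", url[len(url)-1]) // prints: update ID: update-123
}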
// migRollingUpdatePoll (GCE/GKE-only) polls the progress of the MIG rolling
// update with ID id until it is complete. It returns an error if this takes
// longer than nt times the number of nodes.
func migRollingUpdatePoll(id string, nt time.Duration) error {
// Two keys and a val.
status, progress, done := "status", "statusMessage", "ROLLED_OUT"
start, timeout := time.Now(), nt*time.Duration(testContext.CloudConfig.NumNodes)
var errLast error
Logf("Waiting up to %v for MIG rolling update to complete.", timeout)
if wait.Poll(restartPoll, timeout, func() (bool, error) {
o, err := exec.Command("gcloud", "preview", "rolling-updates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
"describe",
id).CombinedOutput()
if err != nil {
errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
Logf("%v", errLast)
return false, nil
}
output := string(o)
// The 'describe' call probably succeeded; parse the output and try to
// find the line that looks like "status: <status>" and see whether it's
// done.
Logf("Waiting for MIG rolling update: %s (%v elapsed)",
parseKVLines(output, progress), time.Since(start))
if st := parseKVLines(output, status); st == done {
return true, nil
}
return false, nil
}) != nil {
return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast)
}
return nil
}
// parseKVLines parses output that looks like lines containing "<key>: <val>"
// and returns <val> if <key> is found. Otherwise, it returns the empty string.
func parseKVLines(output, key string) string {
delim := ":"
key = key + delim
for _, line := range strings.Split(output, "\n") {
pieces := strings.SplitAfterN(line, delim, 2)
if len(pieces) != 2 {
continue
}
k, v := pieces[0], pieces[1]
if k == key {
return strings.TrimSpace(v)
}
}
return ""
}
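
A short usage sketch for parseKVLines, mirroring how migTemplate extracts the instance template from the 'describe' output. The helper, the output text, and the template URL below are made up and not part of this commit:

package e2e

import "strings"

// exampleMigTemplateParse is a hypothetical helper showing the parseKVLines
// call that migTemplate relies on.
func exampleMigTemplateParse() string {
    output := "baseInstanceName: e2e-test-minion\n" +
        "instanceTemplate: https://example.invalid/instanceTemplates/e2e-test-template\n" +
        "kind: replicapool#instanceGroupManager\n"

    val := parseKVLines(output, "instanceTemplate")
    // val == "https://example.invalid/instanceTemplates/e2e-test-template"
    parts := strings.Split(val, "/")
    return parts[len(parts)-1] // "e2e-test-template"
}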

View File

@ -42,6 +42,7 @@ import (
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
"code.google.com/p/go-uuid/uuid"
@ -60,15 +61,28 @@ const (
// String used to mark pod deletion
nonExist = "NonExist"
// How often to poll pods.
podPoll = 5 * time.Second
// How often to poll pods and nodes.
poll = 5 * time.Second
// service accounts are provisioned after namespace creation
// a service account is required to support pod creation in a namespace as part of admission control
serviceAccountProvisionTimeout = 2 * time.Minute
// How often to poll for service accounts
serviceAccountPoll = 5 * time.Second
// How long to try single API calls (like 'get' or 'list'). Used to prevent
// transient failures from failing tests.
singleCallTimeout = 30 * time.Second
// How long nodes have to be "ready" when a test begins. They should already
// be "ready" before the test starts, so this is small.
nodeReadyInitialTimeout = 20 * time.Second
// How long pods have to be "ready" when a test begins. They should already
// be "ready" before the test starts, so this is small.
podReadyBeforeTimeout = 20 * time.Second
// How wide to print pod names, by default. Useful for aligning printing to
// quickly scan through output.
podPrintWidth = 55
)
type CloudConfig struct {
@ -218,7 +232,6 @@ func podRunningReady(p *api.Pod) (bool, error) {
if !podReady(p) {
return false, fmt.Errorf("pod '%s' on '%s' didn't have condition {%v %v}; conditions: %v",
p.ObjectMeta.Name, p.Spec.NodeName, api.PodReady, api.ConditionTrue, p.Status.Conditions)
}
return true, nil
}
@ -233,15 +246,16 @@ func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) erro
if err != nil {
return err
}
start := time.Now()
Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
timeout, minPods, ns)
for start := time.Now(); time.Since(start) < timeout; time.Sleep(podPoll) {
if wait.Poll(poll, timeout, func() (bool, error) {
// We get the new list of pods in every iteration because more pods come
// online during startup and we want to ensure they are also checked.
podList, err := c.Pods(ns).List(labels.Everything(), fields.Everything())
if err != nil {
Logf("Error getting pods in namespace '%s': %v", ns, err)
continue
return false, nil
}
nOk, badPods := 0, []api.Pod{}
for _, pod := range podList.Items {
@ -254,14 +268,17 @@ func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) erro
Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)",
nOk, len(podList.Items), ns, int(time.Since(start).Seconds()))
if nOk == len(podList.Items) && nOk >= minPods {
return nil
return true, nil
}
logPodStates(badPods)
return false, nil
}) != nil {
return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout)
}
return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout)
return nil
}
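
The rewrite above (and most of the new restart.go) follows the wait.Poll pattern from pkg/util/wait: the condition closure returns (false, nil) to keep polling, (true, nil) to stop successfully, or a non-nil error to abort, and Poll itself returns a non-nil error if it times out. A hedged sketch of that idiom with a made-up condition; this helper is not part of the commit:

package e2e

import (
    "fmt"
    "time"

    "github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
)

// waitForCount is a hypothetical helper illustrating the wait.Poll idiom used
// throughout this commit: remember the last transient error and report it if
// the poll times out.
func waitForCount(get func() (int, error), want int, timeout time.Duration) error {
    var errLast error
    if err := wait.Poll(poll, timeout, func() (bool, error) {
        n, err := get()
        if err != nil {
            errLast = err
            return false, nil // transient failure: retry
        }
        return n == want, nil
    }); err != nil {
        return fmt.Errorf("didn't reach %d within %v; last error: %v", want, timeout, errLast)
    }
    return nil
}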
func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName string, poll, timeout time.Duration) error {
func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName string, timeout time.Duration) error {
Logf("Waiting up to %v for service account %s to be provisioned in ns %s", timeout, serviceAccountName, ns)
for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
_, err := c.ServiceAccounts(ns).Get(serviceAccountName)
@ -275,20 +292,23 @@ func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName s
return fmt.Errorf("Service account %s in namespace %s not ready within %v", serviceAccountName, ns, timeout)
}
func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error {
Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc)
func waitForPodCondition(c *client.Client, ns, podName, desc string, timeout time.Duration, condition podCondition) error {
Logf("Waiting up to %[1]v for pod %-[2]*[3]s status to be %[4]s", timeout, podPrintWidth, podName, desc)
for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
pod, err := c.Pods(ns).Get(podName)
if err != nil {
Logf("Get pod %s in ns %s failed, ignoring for %v: %v", podName, ns, poll, err)
// Aligning this text makes it much more readable
Logf("Get pod %-[1]*[2]s in namespace '%[3]s' failed, ignoring for %[4]v. Error: %[5]v",
podPrintWidth, podName, ns, poll, err)
continue
}
done, err := condition(pod)
if done {
return err
}
Logf("Waiting for pod '%s' in namespace '%s' status to be '%q' (found phase: '%q', readiness: %t) (%v)",
podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start))
Logf("Waiting for pod %-[1]*[2]s in namespace '%[3]s' status to be '%[4]s'"+
"(found phase: %[5]q, readiness: %[6]t) (%[7]v elapsed)",
podPrintWidth, podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start))
}
return fmt.Errorf("gave up waiting for pod '%s' to be '%s' after %v", podName, desc, timeout)
}
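
The new log lines above use explicit argument indexes with a dynamic width (%-[1]*[2]s) so pod names are left-aligned to podPrintWidth columns. A small standalone illustration of that fmt verb; the pod names are made up:

package main

import "fmt"

func main() {
    const podPrintWidth = 55 // same constant defined earlier in util.go

    // %-[1]*[2]s: '-' left-aligns, '[1]*' reads the width from argument 1,
    // and '[2]s' prints argument 2 as a string, so both names pad to 55 columns.
    fmt.Printf("Get pod %-[1]*[2]s in namespace '%[3]s'\n",
        podPrintWidth, "example-pod-abc12", "default")
    fmt.Printf("Get pod %-[1]*[2]s in namespace '%[3]s'\n",
        podPrintWidth, "another-example-pod-with-a-longer-name-def34", "default")
}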
@ -297,7 +317,7 @@ func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeo
// the default service account is what is associated with pods when they do not specify a service account
// as a result, pods are not able to be provisioned in a namespace until the service account is provisioned
func waitForDefaultServiceAccountInNamespace(c *client.Client, namespace string) error {
return waitForServiceAccountInNamespace(c, namespace, "default", serviceAccountPoll, serviceAccountProvisionTimeout)
return waitForServiceAccountInNamespace(c, namespace, "default", serviceAccountProvisionTimeout)
}
// createNS should be used by every test, note that we append a common prefix to the provided test name.
@ -315,7 +335,7 @@ func createTestingNS(baseName string, c *client.Client) (*api.Namespace, error)
}
func waitForPodRunningInNamespace(c *client.Client, podName string, namespace string) error {
return waitForPodCondition(c, namespace, podName, "running", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
return waitForPodCondition(c, namespace, podName, "running", podStartTimeout, func(pod *api.Pod) (bool, error) {
if pod.Status.Phase == api.PodRunning {
return true, nil
}
@ -332,7 +352,7 @@ func waitForPodRunning(c *client.Client, podName string) error {
// waitForPodNotPending returns an error if it took too long for the pod to go out of pending state.
func waitForPodNotPending(c *client.Client, ns, podName string) error {
return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
return waitForPodCondition(c, ns, podName, "!pending", podStartTimeout, func(pod *api.Pod) (bool, error) {
if pod.Status.Phase != api.PodPending {
Logf("Saw pod '%s' in namespace '%s' out of pending state (found '%q')", podName, ns, pod.Status.Phase)
return true, nil
@ -343,7 +363,7 @@ func waitForPodNotPending(c *client.Client, ns, podName string) error {
// waitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or ran too long.
func waitForPodSuccessInNamespace(c *client.Client, podName string, contName string, namespace string) error {
return waitForPodCondition(c, namespace, podName, "success or failure", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
return waitForPodCondition(c, namespace, podName, "success or failure", podStartTimeout, func(pod *api.Pod) (bool, error) {
// Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632
ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName)
if !ok {
@ -986,6 +1006,19 @@ func DeleteRC(c *client.Client, ns, name string) error {
return err
}
// Convenient wrapper around listing nodes supporting retries.
func listNodes(c *client.Client, label labels.Selector, field fields.Selector) (*api.NodeList, error) {
var nodes *api.NodeList
var errLast error
if wait.Poll(poll, singleCallTimeout, func() (bool, error) {
nodes, errLast = c.Nodes().List(label, field)
return errLast == nil, nil
}) != nil {
return nil, fmt.Errorf("listNodes() failed with last error: %v", errLast)
}
return nodes, nil
}
// FailedContainers inspects all containers in a pod and returns failure
// information for containers that have failed or been restarted.
// A map is returned where the key is the containerID and the value is a
@ -1111,6 +1144,74 @@ func getSigner(provider string) (ssh.Signer, error) {
return util.MakePrivateKeySigner(key)
}
// checkPodsRunningReady returns whether all pods whose names are listed in podNames
// are running and ready.
func checkPodsRunningReady(c *client.Client, podNames []string, timeout time.Duration) bool {
np, desc := len(podNames), "running and ready"
Logf("Waiting up to %v for the following %d pods to be %s: %s", timeout, np, desc, podNames)
result := make(chan bool, len(podNames))
for ix := range podNames {
// Launch off pod readiness checkers.
go func(name string) {
err := waitForPodCondition(c, api.NamespaceDefault, name, desc, timeout, podRunningReady)
result <- err == nil
}(podNames[ix])
}
// Wait for them all to finish.
success := true
// TODO(mbforbes): Change to `for range` syntax and remove logging once we
// support only Go >= 1.4.
for _, podName := range podNames {
if !<-result {
Logf("Pod %-[1]*[2]s failed to be %[3]s.", podPrintWidth, podName, desc)
success = false
}
}
Logf("Wanted all %d pods to be %s. Result: %t. Pods: %v", np, desc, success, podNames)
return success
}
// waitForNodeToBeReady returns whether node name is ready within timeout.
func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
return waitForNodeToBe(c, name, true, timeout)
}
// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
// readiness condition is anything but ready, e.g. false or unknown) within
// timeout.
func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
return waitForNodeToBe(c, name, false, timeout)
}
// waitForNodeToBe returns whether node name's readiness state matches wantReady
// within timeout. If wantReady is true, it will ensure the node is ready; if
// it's false, it ensures the node is in any state other than ready (e.g. not
// ready or unknown).
func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
node, err := c.Nodes().Get(name)
if err != nil {
Logf("Couldn't get node %s", name)
continue
}
// Check the node readiness condition (logging all).
for i, cond := range node.Status.Conditions {
Logf("Node %s condition %d/%d: type: %v, status: %v",
name, i+1, len(node.Status.Conditions), cond.Type, cond.Status)
// Ensure that the condition type is readiness and the status
// matches as desired.
if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
Logf("Successfully found node %s readiness to be %t", name, wantReady)
return true
}
}
}
Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
return false
}
// LatencyMetrics stores data about request latency at a given quantile
// broken down by verb (e.g. GET, PUT, LIST) and resource (e.g. pods, services).
type LatencyMetric struct {