Node upgrade tests.

pull/6/head
Max Forbes 2015-06-17 00:13:26 -07:00
parent 28197e07d6
commit 2803fbe343
11 changed files with 490 additions and 238 deletions

View File

@ -179,13 +179,13 @@ function get-kubeconfig-bearertoken() {
function set_binary_version() {
if [[ "${1}" == "latest_stable" ]]; then
KUBE_VERSION=$(gsutil cat gs://kubernetes-release/release/stable.txt)
echo "Using latest stable version: ${KUBE_VERSION}"
echo "Using latest stable version: ${KUBE_VERSION}" >&2
elif [[ "${1}" == "latest_release" ]]; then
KUBE_VERSION=$(gsutil cat gs://kubernetes-release/release/latest.txt)
echo "Using latest release version: ${KUBE_VERSION}"
echo "Using latest release version: ${KUBE_VERSION}" >&2
elif [[ "${1}" == "latest_ci" ]]; then
KUBE_VERSION=$(gsutil cat gs://kubernetes-release/ci/latest.txt)
echo "Using latest ci version: ${KUBE_VERSION}"
echo "Using latest ci version: ${KUBE_VERSION}" >&2
else
KUBE_VERSION=${1}
fi
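The added >&2 redirections establish a convention used throughout this change: progress messages go to stderr so that stdout carries only the value a caller wants to capture (for example, the template name echoed by prepare-node-upgrade further down). A minimal sketch of the idea; the helper name here is illustrative, not from the script:

# Hypothetical helper showing the stdout/stderr split.
make_template() {
  echo "Creating template..." >&2       # progress message goes to stderr
  echo "example-minion-template-1-0-1"  # return value goes to stdout
}
tmpl=$(make_template)  # captures only "example-minion-template-1-0-1"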

View File

@ -34,10 +34,11 @@ source "${KUBE_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh"
function usage() {
echo "!!! EXPERIMENTAL !!!"
echo ""
echo "${0} [-M|-N] -l | <release or continuous integration version> | [latest_stable|latest_release|latest_ci]"
echo "${0} [-M|-N|-P] -l | <release or continuous integration version> | [latest_stable|latest_release|latest_ci]"
echo " Upgrades master and nodes by default"
echo " -M: Upgrade master only"
echo " -N: Upgrade nodes only"
echo " -P: Node upgrade prerequisites only (create a new instance template)"
echo " -l: Use local(dev) binaries"
echo ""
echo "(... Fetching current release versions ...)"
@ -109,21 +110,27 @@ function prepare-upgrade() {
tars_from_version
}
# Reads kube-env metadata from master and extracts value from provided key.
# Reads kube-env metadata from first node in MINION_NAMES.
#
# Assumed vars:
# MASTER_NAME
# MINION_NAMES
# PROJECT
# ZONE
function get-node-env() {
# TODO(mbforbes): Make this more reliable with retries.
gcloud compute --project ${PROJECT} ssh --zone ${ZONE} ${MINION_NAMES[0]} --command \
"curl --fail --silent -H 'Metadata-Flavor: Google' \
'http://metadata/computeMetadata/v1/instance/attributes/kube-env'" 2>/dev/null
}
# Using provided node env, extracts value from provided key.
#
# Args:
# $1 env key to use
# $1 node env (kube-env of node; result of calling get-node-env)
# $2 env key to use
function get-env-val() {
# TODO(mbforbes): Make this more reliable with retries.
gcloud compute --project ${PROJECT} ssh --zone ${ZONE} ${MASTER_NAME} --command \
"curl --fail --silent -H 'Metadata-Flavor: Google' \
'http://metadata/computeMetadata/v1/instance/attributes/kube-env'" 2>/dev/null \
| grep ${1} | cut -d : -f 2 | cut -d \' -f 2
echo "${1}" | grep ${2} | cut -d : -f 2 | cut -d \' -f 2
}
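The two helpers are intended to be used together: fetch the node's kube-env once with get-node-env, then pull individual keys out of it with get-env-val, as prepare-node-upgrade does below. A condensed sketch of that call pattern, using the same functions and keys as the script:

# Fetch kube-env from the first node once, then extract the values needed.
node_env=$(get-node-env)
KUBELET_TOKEN=$(get-env-val "${node_env}" "KUBELET_TOKEN")
CA_CERT_BASE64=$(get-env-val "${node_env}" "CA_CERT")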
# Assumed vars:
@ -132,9 +139,40 @@ function get-env-val() {
# NODE_INSTANCE_PREFIX
# PROJECT
# ZONE
#
# Vars set:
# KUBELET_TOKEN
# KUBE_PROXY_TOKEN
# CA_CERT_BASE64
# EXTRA_DOCKER_OPTS
# KUBELET_CERT_BASE64
# KUBELET_KEY_BASE64
function upgrade-nodes() {
local sanitized_version=$(echo ${KUBE_VERSION} | sed s/"\."/-/g)
echo "== Upgrading nodes to ${KUBE_VERSION}. =="
prepare-node-upgrade
do-node-upgrade
}
# prepare-node-upgrade creates a new instance template suitable for upgrading
# to KUBE_VERSION and echoes a single line with the name of the new template.

#
# Assumed vars:
# KUBE_VERSION
# MINION_SCOPES
# NODE_INSTANCE_PREFIX
# PROJECT
# ZONE
#
# Vars set:
# SANITIZED_VERSION
# KUBELET_TOKEN
# KUBE_PROXY_TOKEN
# CA_CERT_BASE64
# EXTRA_DOCKER_OPTS
# KUBELET_CERT_BASE64
# KUBELET_KEY_BASE64
function prepare-node-upgrade() {
echo "== Preparing node upgrade (to ${KUBE_VERSION}). ==" >&2
SANITIZED_VERSION=$(echo ${KUBE_VERSION} | sed s/"\."/-/g)
detect-minion-names
@ -146,34 +184,56 @@ function upgrade-nodes() {
scope_flags=("--no-scopes")
fi
# Get required node tokens.
KUBELET_TOKEN=$(get-env-val "KUBELET_TOKEN")
KUBE_PROXY_TOKEN=$(get-env-val "KUBE_PROXY_TOKEN")
# Get required node env vars from existing template.
local node_env=$(get-node-env)
KUBELET_TOKEN=$(get-env-val "${node_env}" "KUBELET_TOKEN")
KUBE_PROXY_TOKEN=$(get-env-val "${node_env}" "KUBE_PROXY_TOKEN")
CA_CERT_BASE64=$(get-env-val "${node_env}" "CA_CERT")
EXTRA_DOCKER_OPTS=$(get-env-val "${node_env}" "EXTRA_DOCKER_OPTS")
KUBELET_CERT_BASE64=$(get-env-val "${node_env}" "KUBELET_CERT")
KUBELET_KEY_BASE64=$(get-env-val "${node_env}" "KUBELET_KEY")
# TODO(mbforbes): How do we ensure kube-env is written in a ${version}-
# compatible way?
write-node-env
# TODO(mbforbes): Get configure-vm script from ${version}. (Must plumb this
# through all create-node-instance-template implementations).
create-node-instance-template ${sanitized_version}
create-node-instance-template ${SANITIZED_VERSION}
# The following is echo'd so that callers can get the template name.
echo "${NODE_INSTANCE_PREFIX}-template-${SANITIZED_VERSION}"
echo "== Finished preparing node upgrade (to ${KUBE_VERSION}). ==" >&2
}
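Because prepare-node-upgrade sends its status lines to stderr and echoes only the template name, a caller can capture that name directly; the e2e test later in this commit does exactly that via the -P flag. A minimal sketch (the variable name is illustrative):

# Capture the new template name; progress output still reaches the terminal via stderr.
new_tmpl=$(prepare-node-upgrade)
echo "Created instance template: ${new_tmpl}" >&2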
# Prereqs:
# - prepare-node-upgrade should have been called successfully
function do-node-upgrade() {
echo "== Upgrading nodes to ${KUBE_VERSION}. ==" >&2
# Do the actual upgrade.
gcloud preview rolling-updates start \
--group "${NODE_INSTANCE_PREFIX}-group" \
--max-num-concurrent-instances 1 \
--max-num-failed-instances 0 \
--project "${PROJECT}" \
--zone "${ZONE}" \
--template "${NODE_INSTANCE_PREFIX}-template-${sanitized_version}"
# NOTE(mbforbes): If you are changing this gcloud command, update
# test/e2e/restart.go to match this EXACTLY.
gcloud preview rolling-updates \
--project="${PROJECT}" \
--zone="${ZONE}" \
start \
--group="${NODE_INSTANCE_PREFIX}-group" \
--template="${NODE_INSTANCE_PREFIX}-template-${SANITIZED_VERSION}" \
--instance-startup-timeout=300s \
--max-num-concurrent-instances=1 \
--max-num-failed-instances=0 \
--min-instance-update-time=0s
echo "== Done =="
# TODO(mbforbes): Wait for the rolling-update to finish.
echo "== Finished upgrading nodes to ${KUBE_VERSION}. ==" >&2
}
master_upgrade=true
node_upgrade=true
node_prereqs=false
local_binaries=false
while getopts ":MNlh" opt; do
while getopts ":MNPlh" opt; do
case ${opt} in
M)
node_upgrade=false
@ -181,6 +241,9 @@ while getopts ":MNlh" opt; do
N)
master_upgrade=false
;;
P)
node_prereqs=true
;;
l)
local_binaries=true
;;
@ -213,6 +276,11 @@ fi
prepare-upgrade
if [[ "${node_prereqs}" == "true" ]]; then
prepare-node-upgrade
exit 0
fi
if [[ "${master_upgrade}" == "true" ]]; then
upgrade-master
fi
@ -220,6 +288,7 @@ fi
if [[ "${node_upgrade}" == "true" ]]; then
if [[ "${local_binaries}" == "true" ]]; then
echo "Upgrading nodes to local binaries is not yet supported." >&2
exit 1
else
upgrade-nodes
fi
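With -P wired into getopts and the main flow, the script now supports three entry points. For example, invoked through the hack/e2e-internal/e2e-upgrade.sh wrapper that the e2e tests below call (flags per the usage message above):

# Upgrade master and nodes to the latest CI build.
hack/e2e-internal/e2e-upgrade.sh latest_ci
# Only create the new node instance template; its name is printed on stdout.
hack/e2e-internal/e2e-upgrade.sh -P latest_ci
# Upgrade nodes only.
hack/e2e-internal/e2e-upgrade.sh -N latest_ci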

View File

@ -250,7 +250,7 @@ function detect-minion-names {
MINION_NAMES=($(gcloud preview --project "${PROJECT}" instance-groups \
--zone "${ZONE}" instances --group "${NODE_INSTANCE_PREFIX}-group" list \
| cut -d'/' -f11))
echo "MINION_NAMES=${MINION_NAMES[*]}"
echo "MINION_NAMES=${MINION_NAMES[*]}" >&2
}
# Waits until the number of running nodes in the instance group is equal to NUM_NODES
@ -415,8 +415,9 @@ function create-node-template {
fi
fi
local attempt=0
local attempt=1
while true; do
echo "Attempt ${attempt} to create ${1}" >&2
if ! gcloud compute instance-templates create "$1" \
--project "${PROJECT}" \
--machine-type "${MINION_SIZE}" \
@ -428,12 +429,12 @@ function create-node-template {
--network "${NETWORK}" \
$2 \
--can-ip-forward \
--metadata-from-file "$3","$4"; then
--metadata-from-file "$3","$4" >&2; then
if (( attempt > 5 )); then
echo -e "${color_red}Failed to create instance template $1 ${color_norm}" >&2
exit 2
fi
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create instance template $1. Retrying.${color_norm}" >&2
echo -e "${color_yellow}Attempt ${attempt} failed to create instance template $1. Retrying.${color_norm}" >&2
attempt=$(($attempt+1))
else
break

View File

@ -142,6 +142,8 @@ func (s *SSHTunnel) Close() error {
return nil
}
// RunSSHCommand returns the stdout, stderr, and exit code from running cmd on
// host along with any SSH-level error.
func RunSSHCommand(cmd, host string, signer ssh.Signer) (string, string, int, error) {
// Setup the config, dial the server, and open a session.
config := &ssh.ClientConfig{

View File

@ -17,17 +17,16 @@ limitations under the License.
package e2e
import (
"bytes"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path"
"strings"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
@ -36,68 +35,176 @@ import (
. "github.com/onsi/gomega"
)
// version applies to upgrades; kube-push always pushes local binaries.
const version = "latest_ci"
// The following upgrade functions are passed into the framework below and used
// to do the actual upgrades.
var masterUpgrade = func() error {
_, _, err := runScript("hack/e2e-internal/e2e-upgrade.sh", "-M", version)
return err
}
var masterPush = func() error {
_, _, err := runScript("hack/e2e-internal/e2e-push.sh", "-m")
return err
}
var nodeUpgrade = func(f Framework, replicas int) error {
Logf("Preparing node upgarde by creating new instance template")
stdout, _, err := runScript("hack/e2e-internal/e2e-upgrade.sh", "-P", version)
if err != nil {
return err
}
tmpl := strings.TrimSpace(stdout)
Logf("Performing a node upgrade to %s; waiting at most %v per node", tmpl, restartPerNodeTimeout)
if err := migRollingUpdate(tmpl, restartPerNodeTimeout); err != nil {
return fmt.Errorf("error doing node upgrade via a migRollingUpdate to %s: %v", tmpl, err)
}
Logf("Waiting up to %v for all nodes to be ready after the upgrade", restartNodeReadyAgainTimeout)
if _, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, testContext.CloudConfig.NumNodes); err != nil {
return err
}
Logf("Waiting up to %v for all pods to be running and ready after the upgrade", restartPodReadyAgainTimeout)
return waitForPodsRunningReady(f.Namespace.Name, replicas, restartPodReadyAgainTimeout)
}
var _ = Describe("Skipped", func() {
Describe("Cluster upgrade", func() {
svcName := "baz"
var podName string
framework := Framework{BaseName: "cluster-upgrade"}
var webserver *WebserverTest
svcName, replicas := "baz", 2
var rcName, ip string
var ingress api.LoadBalancerIngress
f := Framework{BaseName: "cluster-upgrade"}
var w *WebserverTest
BeforeEach(func() {
framework.beforeEach()
webserver = NewWebserverTest(framework.Client, framework.Namespace.Name, svcName)
pod := webserver.CreateWebserverPod()
podName = pod.Name
svc := webserver.BuildServiceSpec()
By("Setting up the service, RC, and pods")
f.beforeEach()
w = NewWebserverTest(f.Client, f.Namespace.Name, svcName)
rc := w.CreateWebserverRC(replicas)
rcName = rc.ObjectMeta.Name
svc := w.BuildServiceSpec()
svc.Spec.Type = api.ServiceTypeLoadBalancer
webserver.CreateService(svc)
w.CreateService(svc)
By("Waiting for the service to become reachable")
result, err := waitForLoadBalancerIngress(f.Client, svcName, f.Namespace.Name)
Expect(err).NotTo(HaveOccurred())
ingresses := result.Status.LoadBalancer.Ingress
if len(ingresses) != 1 {
Failf("Was expecting only 1 ingress IP but got %d (%v): %v", len(ingresses), ingresses, result)
}
ingress = ingresses[0]
Logf("Got load balancer ingress point %v", ingress)
ip = ingress.IP
if ip == "" {
ip = ingress.Hostname
}
testLoadBalancerReachable(ingress, 80)
// TODO(mbforbes): Add setup, validate, and teardown for:
// - secrets
// - volumes
// - persistent volumes
})
AfterEach(func() {
framework.afterEach()
webserver.Cleanup()
f.afterEach()
w.Cleanup()
})
Describe("kube-push", func() {
It("of master should maintain responsive services", func() {
testClusterUpgrade(framework, svcName, podName, func() {
runUpgradeScript("hack/e2e-internal/e2e-push.sh", "-m")
})
By("Validating cluster before master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
By("Performing a master upgrade")
testMasterUpgrade(ip, masterPush)
By("Validating cluster after master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
})
})
Describe("gce-upgrade", func() {
It("of master should maintain responsive services", func() {
Describe("gce-upgrade-master", func() {
It("should maintain responsive services", func() {
// TODO(mbforbes): Add GKE support.
if !providerIs("gce") {
By(fmt.Sprintf("Skippingt test, which is not implemented for %s", testContext.Provider))
By(fmt.Sprintf("Skipping upgrade test, which is not implemented for %s", testContext.Provider))
return
}
testClusterUpgrade(framework, svcName, podName, func() {
runUpgradeScript("hack/e2e-internal/e2e-upgrade.sh", "-M", "-l")
})
By("Validating cluster before master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
By("Performing a master upgrade")
testMasterUpgrade(ip, masterUpgrade)
By("Validating cluster after master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
})
})
Describe("gce-upgrade-cluster", func() {
var tmplBefore, tmplAfter string
BeforeEach(func() {
By("Getting the node template before the upgrade")
var err error
tmplBefore, err = migTemplate()
expectNoError(err)
})
AfterEach(func() {
By("Cleaning up any unused node templates")
var err error
tmplAfter, err = migTemplate()
if err != nil {
Logf("Could not get node template post-upgrade; may have leaked template %s", tmplBefore)
return
}
if tmplBefore == tmplAfter {
// The node upgrade failed so there's no need to delete
// anything.
Logf("Node template %s is still in use; not cleaning up", tmplBefore)
return
}
// TODO(mbforbes): Distinguish between transient failures
// and "cannot delete--in use" errors and retry on the
// former.
Logf("Deleting node template %s", tmplBefore)
o, err := exec.Command("gcloud", "compute", "instance-templates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
"delete",
tmplBefore).CombinedOutput()
if err != nil {
Logf("gcloud compute instance-templates delete %s call failed with err: %v, output: %s",
tmplBefore, err, string(o))
Logf("May have leaked %s", tmplBefore)
}
})
It("should maintain a functioning cluster", func() {
// TODO(mbforbes): Add GKE support.
if !providerIs("gce") {
By(fmt.Sprintf("Skipping upgrade test, which is not implemented for %s", testContext.Provider))
return
}
By("Validating cluster before master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
By("Performing a master upgrade")
testMasterUpgrade(ip, masterUpgrade)
By("Validating cluster after master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
By("Performing a node upgrade")
testNodeUpgrade(f, nodeUpgrade, replicas)
By("Validating cluster after node upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
})
})
})
})
func testClusterUpgrade(framework Framework, svcName, podName string, upgrade func()) {
result, err := waitForLoadBalancerIngress(framework.Client, svcName, framework.Namespace.Name)
Expect(err).NotTo(HaveOccurred())
ingresses := result.Status.LoadBalancer.Ingress
if len(ingresses) != 1 {
Failf("Was expecting only 1 ingress IP but got %d (%v): %v", len(ingresses), ingresses, result)
}
ingress := ingresses[0]
ip := ingress.IP
if ip == "" {
ip = ingress.Hostname
}
By("Waiting for pod to become reachable")
testLoadBalancerReachable(ingress, 80)
validateClusterUpgrade(framework, svcName, podName)
Logf("starting async validation")
func testMasterUpgrade(ip string, mUp func() error) {
Logf("Starting async validation")
httpClient := http.Client{Timeout: 2 * time.Second}
done := make(chan struct{}, 1)
// Let's make sure we've finished the heartbeat before shutting things down.
@ -107,55 +214,94 @@ func testClusterUpgrade(framework Framework, svcName, podName string, upgrade fu
wg.Add(1)
defer wg.Done()
expectNoError(wait.Poll(poll, singleCallTimeout, func() (bool, error) {
if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
r, err := httpClient.Get("http://" + ip)
if err != nil {
Logf("Error reaching %s: %v", ip, err)
return false, nil
}
if r.StatusCode < http.StatusOK || r.StatusCode >= http.StatusNotFound {
Logf("Bad response; status: %d, response: %v", r.StatusCode, r)
return false, nil
}
return true, nil
}))
}); err != nil {
// We log the error here because the test will fail at the very end
// because this validation runs in another goroutine. Without this,
// a failure is very confusing to track down because from the logs
// everything looks fine.
msg := fmt.Sprintf("Failed to contact service during master upgrade: %v", err)
Logf(msg)
Failf(msg)
}
}, 200*time.Millisecond, done)
By("Starting upgrade")
upgrade()
Logf("Starting master upgrade")
expectNoError(mUp())
done <- struct{}{}
Logf("Stopping async validation")
wg.Wait()
Logf("Upgrade complete.")
By("Validating post upgrade state")
validateClusterUpgrade(framework, svcName, podName)
Logf("Master upgrade complete")
}
func runUpgradeScript(scriptPath string, args ...string) {
cmd := exec.Command(path.Join(testContext.RepoRoot, scriptPath), args...)
upgradeLogPath := path.Join(testContext.OutputDir, "upgrade-"+string(util.NewUUID())+".log")
Logf("Writing upgrade logs to %s", upgradeLogPath)
upgradeLog, err := os.Create(upgradeLogPath)
expectNoError(err)
func testNodeUpgrade(f Framework, nUp func(f Framework, n int) error, replicas int) {
Logf("Starting node upgrade")
expectNoError(nUp(f, replicas))
Logf("Node upgrade complete")
// TODO(mbforbes): Validate that:
// - the node software version truly changed
cmd.Stdout = io.MultiWriter(os.Stdout, upgradeLog)
cmd.Stderr = io.MultiWriter(os.Stderr, upgradeLog)
if err := cmd.Run(); err != nil {
Failf("Upgrade failed: %v", err)
}
}
func validateClusterUpgrade(framework Framework, svcName, podName string) {
pods, err := framework.Client.Pods(framework.Namespace.Name).List(labels.Everything(), fields.Everything())
Expect(err).NotTo(HaveOccurred())
Expect(len(pods.Items) == 1).Should(BeTrue())
if podName != pods.Items[0].Name {
Failf("pod name should not have changed")
}
_, err = podRunningReady(&pods.Items[0])
Expect(err).NotTo(HaveOccurred())
svc, err := framework.Client.Services(framework.Namespace.Name).Get(svcName)
Expect(err).NotTo(HaveOccurred())
if svcName != svc.Name {
Failf("service name should not have changed")
// runScript runs the given script (relative to testContext.RepoRoot) with args and returns
// its stdout, stderr, and any error.
func runScript(script string, args ...string) (string, string, error) {
Logf("Running %s %v", script, args)
var bout, berr bytes.Buffer
cmd := exec.Command(path.Join(testContext.RepoRoot, script), args...)
cmd.Stdout, cmd.Stderr = &bout, &berr
err := cmd.Run()
stdout, stderr := bout.String(), berr.String()
if err != nil {
return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q",
script, args, err, stdout, stderr)
}
Logf("stdout: %s", stdout)
Logf("stderr: %s", stderr)
return stdout, stderr, nil
}
func validate(f Framework, svcNameWant, rcNameWant string, ingress api.LoadBalancerIngress, podsWant int) error {
Logf("Beginning cluster validation")
// Verify RC.
rcs, err := f.Client.ReplicationControllers(f.Namespace.Name).List(labels.Everything())
if err != nil {
return fmt.Errorf("error listing RCs: %v", err)
}
if len(rcs.Items) != 1 {
return fmt.Errorf("wanted 1 RC with name %s, got %d", rcNameWant, len(rcs.Items))
}
if got := rcs.Items[0].Name; got != rcNameWant {
return fmt.Errorf("wanted RC name %q, got %q", rcNameWant, got)
}
// Verify pods.
if err := verifyPods(f.Client, f.Namespace.Name, rcNameWant, false, podsWant); err != nil {
return fmt.Errorf("failed to find %d %q pods: %v", podsWant, rcNameWant, err)
}
// Verify service.
svc, err := f.Client.Services(f.Namespace.Name).Get(svcNameWant)
if err != nil {
return fmt.Errorf("error getting service %s: %v", svcNameWant, err)
}
if svcNameWant != svc.Name {
return fmt.Errorf("wanted service name %q, got %q", svcNameWant, svc.Name)
}
// TODO(mbforbes): Make testLoadBalancerReachable return an error.
testLoadBalancerReachable(ingress, 80)
Logf("Cluster validation succeeded")
return nil
}

View File

@ -141,7 +141,7 @@ func ServeImageOrFail(c *client.Client, test string, image string) {
By("Trying to dial each unique pod")
retryTimeout := 2 * time.Minute
retryInterval := 5 * time.Second
err = wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, pods}.checkAllResponses)
err = wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, true, pods}.checkAllResponses)
if err != nil {
Failf("Did not get expected responses within the timeout period of %.2f seconds.", retryTimeout.Seconds())
}

View File

@ -155,7 +155,8 @@ func issueSSHCommand(node *api.Node, provider, cmd string) error {
// failed step, it will return false through result and not run the rest.
func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
// Setup
ps := newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
ns := api.NamespaceDefault
ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
defer ps.Stop()
// Get the node initially.
@ -183,7 +184,7 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
// For each pod, we do a sanity check to ensure it's running / healthy
// now, as that's what we'll be checking later.
if !checkPodsRunningReady(c, podNames, podReadyBeforeTimeout) {
if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) {
result <- false
return
}
@ -209,7 +210,7 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
// Ensure all of the pods that we found on this node before the reboot are
// running / healthy.
if !checkPodsRunningReady(c, podNames, rebootPodReadyAgainTimeout) {
if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) {
result <- false
return
}

View File

@ -24,6 +24,7 @@ import (
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/latest"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
@ -34,9 +35,9 @@ import (
. "github.com/onsi/gomega"
)
var serveHostnameImage string = "gcr.io/google_containers/serve_hostname:1.1"
const serveHostnameImage = "gcr.io/google_containers/serve_hostname:1.1"
func resizeNodeInstanceGroup(size int) error {
func resizeGroup(size int) error {
// TODO: make this hit the compute API directly instead of shelling out to gcloud.
output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID, "--zone="+testContext.CloudConfig.Zone,
"resize", testContext.CloudConfig.NodeInstanceGroup, fmt.Sprintf("--new-size=%v", size)).CombinedOutput()
@ -46,7 +47,7 @@ func resizeNodeInstanceGroup(size int) error {
return err
}
func nodeInstanceGroupSize() (int, error) {
func groupSize() (int, error) {
// TODO: make this hit the compute API directly instead of shelling out to gcloud.
output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID,
"--zone="+testContext.CloudConfig.Zone, "describe", testContext.CloudConfig.NodeInstanceGroup).CombinedOutput()
@ -71,9 +72,9 @@ func nodeInstanceGroupSize() (int, error) {
return currentSize, nil
}
func waitForNodeInstanceGroupSize(size int) error {
func waitForGroupSize(size int) error {
for start := time.Now(); time.Since(start) < 4*time.Minute; time.Sleep(5 * time.Second) {
currentSize, err := nodeInstanceGroupSize()
currentSize, err := groupSize()
if err != nil {
Logf("Failed to get node instance group size: %v", err)
continue
@ -104,7 +105,7 @@ func waitForClusterSize(c *client.Client, size int) error {
return fmt.Errorf("timeout waiting for cluster size to be %d", size)
}
func newServiceWithNameSelector(name string) *api.Service {
func svcByName(name string) *api.Service {
return &api.Service{
ObjectMeta: api.ObjectMeta{
Name: "test-service",
@ -121,12 +122,12 @@ func newServiceWithNameSelector(name string) *api.Service {
}
}
func createServiceWithNameSelector(c *client.Client, ns, name string) error {
_, err := c.Services(ns).Create(newServiceWithNameSelector(name))
func newSVCByName(c *client.Client, ns, name string) error {
_, err := c.Services(ns).Create(svcByName(name))
return err
}
func newPodOnNode(podName, nodeName string, image string) *api.Pod {
func podOnNode(podName, nodeName string, image string) *api.Pod {
return &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: podName,
@ -148,18 +149,39 @@ func newPodOnNode(podName, nodeName string, image string) *api.Pod {
}
}
func createServeHostnamePodOnNode(c *client.Client, namespace, podName, nodeName string) error {
pod, err := c.Pods(namespace).Create(newPodOnNode(podName, nodeName, serveHostnameImage))
func newPodOnNode(c *client.Client, namespace, podName, nodeName string) error {
pod, err := c.Pods(namespace).Create(podOnNode(podName, nodeName, serveHostnameImage))
if err == nil {
Logf("Created pod %s on node %s", pod.ObjectMeta.Name, nodeName)
} else {
Logf("Failed to create pod %s on node %s: %s", podName, nodeName, err)
Logf("Failed to create pod %s on node %s: %v", podName, nodeName, err)
}
return err
}
func newReplicationControllerWithNameSelector(name string, replicas int, image string) *api.ReplicationController {
func rcByName(name string, replicas int, image string, labels map[string]string) *api.ReplicationController {
return rcByNameContainer(name, replicas, image, labels, api.Container{
Name: name,
Image: image,
})
}
func rcByNamePort(name string, replicas int, image string, port int, labels map[string]string) *api.ReplicationController {
return rcByNameContainer(name, replicas, image, labels, api.Container{
Name: name,
Image: image,
Ports: []api.ContainerPort{{ContainerPort: port}},
})
}
func rcByNameContainer(name string, replicas int, image string, labels map[string]string, c api.Container) *api.ReplicationController {
// Add "name": name to the labels, overwriting if it exists.
labels["name"] = name
return &api.ReplicationController{
TypeMeta: api.TypeMeta{
Kind: "ReplicationController",
APIVersion: latest.Version,
},
ObjectMeta: api.ObjectMeta{
Name: name,
},
@ -170,28 +192,24 @@ func newReplicationControllerWithNameSelector(name string, replicas int, image s
},
Template: &api.PodTemplateSpec{
ObjectMeta: api.ObjectMeta{
Labels: map[string]string{"name": name},
Labels: labels,
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Name: name,
Image: image,
Ports: []api.ContainerPort{{ContainerPort: 9376}},
},
},
Containers: []api.Container{c},
},
},
},
}
}
func createServeHostnameReplicationController(c *client.Client, ns, name string, replicas int) (*api.ReplicationController, error) {
// newRCByName creates a replication controller with a selector by name of name.
func newRCByName(c *client.Client, ns, name string, replicas int) (*api.ReplicationController, error) {
By(fmt.Sprintf("creating replication controller %s", name))
return c.ReplicationControllers(ns).Create(newReplicationControllerWithNameSelector(name, replicas, serveHostnameImage))
return c.ReplicationControllers(ns).Create(rcByNamePort(
name, replicas, serveHostnameImage, 9376, map[string]string{}))
}
func resizeReplicationController(c *client.Client, ns, name string, replicas int) error {
func resizeRC(c *client.Client, ns, name string, replicas int) error {
rc, err := c.ReplicationControllers(ns).Get(name)
if err != nil {
return err
@ -201,7 +219,7 @@ func resizeReplicationController(c *client.Client, ns, name string, replicas int
return err
}
func waitForPodsCreated(c *client.Client, ns, name string, replicas int) (*api.PodList, error) {
func podsCreated(c *client.Client, ns, name string, replicas int) (*api.PodList, error) {
// List the pods, making sure we observe all the replicas.
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
@ -218,7 +236,7 @@ func waitForPodsCreated(c *client.Client, ns, name string, replicas int) (*api.P
return nil, fmt.Errorf("Pod name %s: Gave up waiting for %d pods to come up", name, replicas)
}
func waitForPodsRunning(c *client.Client, pods *api.PodList) []error {
func podsRunning(c *client.Client, pods *api.PodList) []error {
// Wait for the pods to enter the running state. Waiting loops until the pods
// are running so non-running pods cause a timeout for this test.
By("ensuring each pod is running")
@ -233,24 +251,24 @@ func waitForPodsRunning(c *client.Client, pods *api.PodList) []error {
return e
}
func verifyPodsResponding(c *client.Client, ns, name string, pods *api.PodList) error {
func podsResponding(c *client.Client, ns, name string, wantName bool, pods *api.PodList) error {
By("trying to dial each unique pod")
retryTimeout := 2 * time.Minute
retryInterval := 5 * time.Second
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
return wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, pods}.checkAllResponses)
return wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, wantName, pods}.checkAllResponses)
}
func waitForPodsCreatedRunningResponding(c *client.Client, ns, name string, replicas int) error {
pods, err := waitForPodsCreated(c, ns, name, replicas)
func verifyPods(c *client.Client, ns, name string, wantName bool, replicas int) error {
pods, err := podsCreated(c, ns, name, replicas)
if err != nil {
return err
}
e := waitForPodsRunning(c, pods)
e := podsRunning(c, pods)
if len(e) > 0 {
return fmt.Errorf("Failed to wait for pods running: %v", e)
}
err = verifyPodsResponding(c, ns, name, pods)
err = podsResponding(c, ns, name, wantName, pods)
if err != nil {
return err
}
@ -331,7 +349,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
waitForRCPodToDisappear(c, ns, rcName, podNameToDisappear)
By("verifying whether the pod from the unreachable node is recreated")
err := waitForPodsCreatedRunningResponding(c, ns, rcName, replicas)
err := verifyPods(c, ns, rcName, true, replicas)
Expect(err).NotTo(HaveOccurred())
// network traffic is unblocked in a deferred function
@ -372,10 +390,10 @@ var _ = Describe("Nodes", func() {
return
}
By("restoring the original node instance group size")
if err := resizeNodeInstanceGroup(testContext.CloudConfig.NumNodes); err != nil {
if err := resizeGroup(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err)
}
if err := waitForNodeInstanceGroupSize(testContext.CloudConfig.NumNodes); err != nil {
if err := waitForGroupSize(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err)
}
if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes); err != nil {
@ -396,20 +414,20 @@ var _ = Describe("Nodes", func() {
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-delete-node"
replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
newRCByName(c, ns, name, replicas)
err := verifyPods(c, ns, name, true, replicas)
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("decreasing cluster size to %d", replicas-1))
err = resizeNodeInstanceGroup(replicas - 1)
err = resizeGroup(replicas - 1)
Expect(err).NotTo(HaveOccurred())
err = waitForNodeInstanceGroupSize(replicas - 1)
err = waitForGroupSize(replicas - 1)
Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas-1)
Expect(err).NotTo(HaveOccurred())
By("verifying whether the pods from the removed node are recreated")
err = waitForPodsCreatedRunningResponding(c, ns, name, replicas)
err = verifyPods(c, ns, name, true, replicas)
Expect(err).NotTo(HaveOccurred())
})
@ -426,24 +444,24 @@ var _ = Describe("Nodes", func() {
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-add-node"
createServiceWithNameSelector(c, ns, name)
newSVCByName(c, ns, name)
replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
newRCByName(c, ns, name, replicas)
err := verifyPods(c, ns, name, true, replicas)
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("increasing cluster size to %d", replicas+1))
err = resizeNodeInstanceGroup(replicas + 1)
err = resizeGroup(replicas + 1)
Expect(err).NotTo(HaveOccurred())
err = waitForNodeInstanceGroupSize(replicas + 1)
err = waitForGroupSize(replicas + 1)
Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas+1)
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1))
err = resizeReplicationController(c, ns, name, replicas+1)
err = resizeRC(c, ns, name, replicas+1)
Expect(err).NotTo(HaveOccurred())
err = waitForPodsCreatedRunningResponding(c, ns, name, replicas+1)
err = verifyPods(c, ns, name, true, replicas+1)
Expect(err).NotTo(HaveOccurred())
})
})
@ -472,10 +490,10 @@ var _ = Describe("Nodes", func() {
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-net"
createServiceWithNameSelector(c, ns, name)
newSVCByName(c, ns, name)
replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
newRCByName(c, ns, name, replicas)
err := verifyPods(c, ns, name, true, replicas)
Expect(err).NotTo(HaveOccurred(), "Each pod should start running and responding")
By("choose a node with at least one pod - we will block some network traffic on this node")
@ -496,9 +514,9 @@ var _ = Describe("Nodes", func() {
// increasing the RC size is not a valid way to test this
// since we have no guarantees the pod will be scheduled on our node.
additionalPod := "additionalpod"
err = createServeHostnamePodOnNode(c, ns, additionalPod, node.Name)
err = newPodOnNode(c, ns, additionalPod, node.Name)
Expect(err).NotTo(HaveOccurred())
err = waitForPodsCreatedRunningResponding(c, ns, additionalPod, 1)
err = verifyPods(c, ns, additionalPod, true, 1)
Expect(err).NotTo(HaveOccurred())
// verify that it is really on the requested node

View File

@ -86,7 +86,8 @@ var _ = Describe("Restart", func() {
for i, p := range pods {
podNamesBefore[i] = p.ObjectMeta.Name
}
if !checkPodsRunningReady(c, podNamesBefore, podReadyBeforeTimeout) {
ns := api.NamespaceDefault
if !checkPodsRunningReady(c, ns, podNamesBefore, podReadyBeforeTimeout) {
Failf("At least one pod wasn't running and ready at test start.")
}
@ -115,7 +116,7 @@ var _ = Describe("Restart", func() {
podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout)
Expect(err).NotTo(HaveOccurred())
remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart)
if !checkPodsRunningReady(c, podNamesAfter, remaining) {
if !checkPodsRunningReady(c, ns, podNamesAfter, remaining) {
Failf("At least one pod wasn't running and ready after the restart.")
}
})
@ -166,7 +167,8 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
return false, nil
}
if len(nodeList.Items) != expect {
errLast = fmt.Errorf("expected to find %d nodes but found only %d", expect, len(nodeList.Items))
errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)",
expect, len(nodeList.Items), time.Since(start))
Logf("%v", errLast)
return false, nil
}
@ -180,6 +182,7 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
expect, nt, errLast)
}
Logf("Successfully found %d nodes", expect)
// Next, ensure in parallel that all the nodes are ready. We subtract the
// time we spent waiting above.
@ -209,28 +212,32 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
func restartNodes(provider string, nt time.Duration) error {
switch provider {
case "gce":
return migRollingUpdate(nt)
return migRollingUpdateSelf(nt)
default:
return fmt.Errorf("restartNodes(...) not implemented for %s", provider)
}
}
// migRollingUpdate starts a MIG rolling update and waits up to nt times the
// number of nodes for it to complete.
func migRollingUpdate(nt time.Duration) error {
func migRollingUpdateSelf(nt time.Duration) error {
By("getting the name of the template for the managed instance group")
templ, err := migTemplate()
tmpl, err := migTemplate()
if err != nil {
return fmt.Errorf("couldn't get MIG template name: %v", err)
}
return migRollingUpdate(tmpl, nt)
}
By("starting the managed instance group rolling update")
id, err := migRollingUpdateStart(templ, nt)
// migRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
// instance template named tmpl, and waits up to nt times the number of nodes
// for it to complete.
func migRollingUpdate(tmpl string, nt time.Duration) error {
By(fmt.Sprintf("starting the MIG rolling update to %s", tmpl))
id, err := migRollingUpdateStart(tmpl, nt)
if err != nil {
return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
}
By("polling the managed instance group rolling update until it completes")
By(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id))
if err := migRollingUpdatePoll(id, nt); err != nil {
return fmt.Errorf("err waiting until update completed: %v", err)
}
@ -284,7 +291,9 @@ func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
prefix, suffix := "Started [", "]."
if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
// TODO(mbforbes): make this hit the compute API directly instead of
// shelling out to gcloud.
// shelling out to gcloud.
// NOTE(mbforbes): If you are changing this gcloud command, update
// cluster/gce/upgrade.sh to match this EXACTLY.
o, err := exec.Command("gcloud", "preview", "rolling-updates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
@ -361,6 +370,7 @@ func migRollingUpdatePoll(id string, nt time.Duration) error {
}) != nil {
return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast)
}
Logf("MIG rolling update complete after %v", time.Since(start))
return nil
}

View File

@ -22,13 +22,10 @@ import (
"math/rand"
"net/http"
"sort"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/latest"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
@ -277,7 +274,7 @@ var _ = Describe("Services", func() {
}
By("creating pod to be part of service " + serviceName)
t.CreateWebserverPod()
t.CreateWebserverRC(1)
By("hitting the pod through the service's NodePort")
testReachable(pickMinionIP(c), port.NodePort)
@ -324,7 +321,7 @@ var _ = Describe("Services", func() {
}
By("creating pod to be part of service " + serviceName)
t.CreateWebserverPod()
t.CreateWebserverRC(1)
By("hitting the pod through the service's NodePort")
ip := pickMinionIP(c)
@ -365,7 +362,7 @@ var _ = Describe("Services", func() {
}
By("creating pod to be part of service " + t.ServiceName)
t.CreateWebserverPod()
t.CreateWebserverRC(1)
By("changing service " + serviceName + " to type=NodePort")
service.Spec.Type = api.ServiceTypeNodePort
@ -515,7 +512,7 @@ var _ = Describe("Services", func() {
Expect(err).NotTo(HaveOccurred())
By("creating pod to be part of service " + t.ServiceName)
t.CreateWebserverPod()
t.CreateWebserverRC(1)
if service.Spec.Type != api.ServiceTypeLoadBalancer {
Failf("got unexpected Spec.Type for LoadBalancer service: %v", service)
@ -1011,11 +1008,12 @@ func testReachable(ip string, port int) {
Failf("Got port==0 for reachability check (%s)", url)
}
By(fmt.Sprintf("Checking reachability of %s", url))
By(fmt.Sprintf("Waiting up to %v for %s to be reachable", podStartTimeout, url))
start := time.Now()
expectNoError(wait.Poll(poll, podStartTimeout, func() (bool, error) {
resp, err := httpGetNoConnectionPool(url)
if err != nil {
Logf("Got error waiting for reachability of %s: %v", url, err)
Logf("Got error waiting for reachability of %s: %v (%v)", url, err, time.Since(start))
return false, nil
}
defer resp.Body.Close()
@ -1044,7 +1042,7 @@ func testNotReachable(ip string, port int) {
Failf("Got port==0 for non-reachability check (%s)", url)
}
By(fmt.Sprintf("Checking that %s is not reachable", url))
By(fmt.Sprintf("Waiting up to %v for %s to be *not* reachable", podStartTimeout, url))
expectNoError(wait.Poll(poll, podStartTimeout, func() (bool, error) {
resp, err := httpGetNoConnectionPool(url)
if err != nil {
@ -1086,11 +1084,10 @@ type WebserverTest struct {
TestId string
Labels map[string]string
pods map[string]bool
rcs map[string]bool
services map[string]bool
// Used for generating e.g. unique pod names
sequence int32
name string
image string
}
func NewWebserverTest(client *client.Client, namespace string, serviceName string) *WebserverTest {
@ -1103,15 +1100,13 @@ func NewWebserverTest(client *client.Client, namespace string, serviceName strin
"testid": t.TestId,
}
t.pods = make(map[string]bool)
t.rcs = make(map[string]bool)
t.services = make(map[string]bool)
return t
}
t.name = "webserver"
t.image = "gcr.io/google_containers/test-webserver"
func (t *WebserverTest) SequenceNext() int {
n := atomic.AddInt32(&t.sequence, 1)
return int(n)
return t
}
// Build default config for a service (which can then be changed)
@ -1131,43 +1126,27 @@ func (t *WebserverTest) BuildServiceSpec() *api.Service {
return service
}
// Create a pod with the well-known webserver configuration, and record it for cleanup
func (t *WebserverTest) CreateWebserverPod() *api.Pod {
name := t.ServiceName + "-" + strconv.Itoa(t.SequenceNext())
pod := &api.Pod{
TypeMeta: api.TypeMeta{
Kind: "Pod",
APIVersion: latest.Version,
},
ObjectMeta: api.ObjectMeta{
Name: name,
Labels: t.Labels,
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Name: "webserver",
Image: "gcr.io/google_containers/test-webserver",
},
},
},
}
pod, err := t.CreatePod(pod)
// CreateWebserverRC creates rc-backed pods with the well-known webserver
// configuration and records it for cleanup.
func (t *WebserverTest) CreateWebserverRC(replicas int) *api.ReplicationController {
rcSpec := rcByName(t.name, replicas, t.image, t.Labels)
rcAct, err := t.createRC(rcSpec)
if err != nil {
Failf("Failed to create pod %s: %v", pod.Name, err)
Failf("Failed to create rc %s: %v", rcSpec.Name, err)
}
expectNoError(waitForPodRunningInNamespace(t.Client, pod.Name, t.Namespace))
return pod
if err := verifyPods(t.Client, t.Namespace, t.name, false, replicas); err != nil {
Failf("Failed to create %d pods with name %s: %v", replicas, t.name, err)
}
return rcAct
}
// Create a pod, and record it for cleanup
func (t *WebserverTest) CreatePod(pod *api.Pod) (*api.Pod, error) {
podClient := t.Client.Pods(t.Namespace)
result, err := podClient.Create(pod)
// createRC creates a replication controller and records it for cleanup.
func (t *WebserverTest) createRC(rc *api.ReplicationController) (*api.ReplicationController, error) {
rc, err := t.Client.ReplicationControllers(t.Namespace).Create(rc)
if err == nil {
t.pods[pod.Name] = true
t.rcs[rc.Name] = true
}
return result, err
return rc, err
}
// Create a service, and record it for cleanup
@ -1190,14 +1169,23 @@ func (t *WebserverTest) DeleteService(serviceName string) error {
func (t *WebserverTest) Cleanup() []error {
var errs []error
for podName := range t.pods {
podClient := t.Client.Pods(t.Namespace)
By("deleting pod " + podName + " in namespace " + t.Namespace)
err := podClient.Delete(podName, nil)
for rcName := range t.rcs {
By("stopping RC " + rcName + " in namespace " + t.Namespace)
// First, resize the RC to 0.
old, err := t.Client.ReplicationControllers(t.Namespace).Get(rcName)
if err != nil {
errs = append(errs, err)
}
old.Spec.Replicas = 0
if _, err := t.Client.ReplicationControllers(t.Namespace).Update(old); err != nil {
errs = append(errs, err)
}
// TODO(mbforbes): Wait.
// Then, delete the RC altogether.
if err := t.Client.ReplicationControllers(t.Namespace).Delete(rcName); err != nil {
errs = append(errs, err)
}
}
for serviceName := range t.services {

View File

@ -514,6 +514,7 @@ type podResponseChecker struct {
ns string
label labels.Selector
controllerName string
respondName bool // Whether the pod should respond with its own name.
pods *api.PodList
}
@ -535,16 +536,32 @@ func (r podResponseChecker) checkAllResponses() (done bool, err error) {
Do().
Raw()
if err != nil {
Logf("Controller %s: Failed to GET from replica %d (%s): %v:", r.controllerName, i+1, pod.Name, err)
Logf("Controller %s: Failed to GET from replica %d [%s]: %v:", r.controllerName, i+1, pod.Name, err)
continue
}
// The body should be the pod name.
if string(body) != pod.Name {
Logf("Controller %s: Replica %d expected response %s but got %s", r.controllerName, i+1, pod.Name, string(body))
continue
// The response checker expects the pod's name when respondName is true; otherwise
// it just checks for a non-empty response.
got := string(body)
what := ""
if r.respondName {
what = "expected"
want := pod.Name
if got != want {
Logf("Controller %s: Replica %d [%s] expected response %q but got %q",
r.controllerName, i+1, pod.Name, want, got)
continue
}
} else {
what = "non-empty"
if len(got) == 0 {
Logf("Controller %s: Replica %d [%s] expected non-empty response",
r.controllerName, i+1, pod.Name)
continue
}
}
successes++
Logf("Controller %s: Got expected result from replica %d: %s, %d of %d required successes so far", r.controllerName, i+1, string(body), successes, len(r.pods.Items))
Logf("Controller %s: Got %s result from replica %d [%s]: %q, %d of %d required successes so far",
r.controllerName, what, i+1, pod.Name, got, successes, len(r.pods.Items))
}
if successes < len(r.pods.Items) {
return false, nil
@ -1171,15 +1188,15 @@ func getSigner(provider string) (ssh.Signer, error) {
}
// checkPodsRunning returns whether all pods whose names are listed in podNames
// are running and ready.
func checkPodsRunningReady(c *client.Client, podNames []string, timeout time.Duration) bool {
// in namespace ns are running and ready, using c and waiting at most timeout.
func checkPodsRunningReady(c *client.Client, ns string, podNames []string, timeout time.Duration) bool {
np, desc := len(podNames), "running and ready"
Logf("Waiting up to %v for the following %d pods to be %s: %s", timeout, np, desc, podNames)
result := make(chan bool, len(podNames))
for ix := range podNames {
// Launch off pod readiness checkers.
go func(name string) {
err := waitForPodCondition(c, api.NamespaceDefault, name, desc, timeout, podRunningReady)
err := waitForPodCondition(c, ns, name, desc, timeout, podRunningReady)
result <- err == nil
}(podNames[ix])
}