diff --git a/cluster/common.sh b/cluster/common.sh
index 38184f175c..cb75be8a19 100755
--- a/cluster/common.sh
+++ b/cluster/common.sh
@@ -179,13 +179,13 @@ function get-kubeconfig-bearertoken() {
 function set_binary_version() {
   if [[ "${1}" == "latest_stable" ]]; then
     KUBE_VERSION=$(gsutil cat gs://kubernetes-release/release/stable.txt)
-    echo "Using latest stable version: ${KUBE_VERSION}"
+    echo "Using latest stable version: ${KUBE_VERSION}" >&2
   elif [[ "${1}" == "latest_release" ]]; then
     KUBE_VERSION=$(gsutil cat gs://kubernetes-release/release/latest.txt)
-    echo "Using latest release version: ${KUBE_VERSION}"
+    echo "Using latest release version: ${KUBE_VERSION}" >&2
   elif [[ "${1}" == "latest_ci" ]]; then
     KUBE_VERSION=$(gsutil cat gs://kubernetes-release/ci/latest.txt)
-    echo "Using latest ci version: ${KUBE_VERSION}"
+    echo "Using latest ci version: ${KUBE_VERSION}" >&2
   else
     KUBE_VERSION=${1}
   fi
diff --git a/cluster/gce/upgrade.sh b/cluster/gce/upgrade.sh
index d8c1556bf7..886e41047a 100755
--- a/cluster/gce/upgrade.sh
+++ b/cluster/gce/upgrade.sh
@@ -34,10 +34,11 @@ source "${KUBE_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh"
 function usage() {
   echo "!!! EXPERIMENTAL !!!"
   echo ""
-  echo "${0} [-M|-N] -l | | [latest_stable|latest_release|latest_ci]"
+  echo "${0} [-M|-N|-P] -l | | [latest_stable|latest_release|latest_ci]"
   echo "  Upgrades master and nodes by default"
   echo "  -M:  Upgrade master only"
   echo "  -N:  Upgrade nodes only"
+  echo "  -P:  Node upgrade prerequisites only (create a new instance template)"
   echo "  -l:  Use local(dev) binaries"
   echo ""
   echo "(... Fetching current release versions ...)"
@@ -109,21 +110,27 @@ function prepare-upgrade() {
   tars_from_version
 }
 
-# Reads kube-env metadata from master and extracts value from provided key.
+
+# Reads kube-env metadata from first node in MINION_NAMES.
 #
 # Assumed vars:
-#   MASTER_NAME
+#   MINION_NAMES
 #   PROJECT
 #   ZONE
+function get-node-env() {
+  # TODO(mbforbes): Make this more reliable with retries.
+  gcloud compute --project ${PROJECT} ssh --zone ${ZONE} ${MINION_NAMES[0]} --command \
+    "curl --fail --silent -H 'Metadata-Flavor: Google' \
+      'http://metadata/computeMetadata/v1/instance/attributes/kube-env'" 2>/dev/null
+}
+
+# Using provided node env, extracts value from provided key.
 #
 # Args:
-#   $1 env key to use
+#   $1 node env (kube-env of node; result of calling get-node-env)
+#   $2 env key to use
 function get-env-val() {
-  # TODO(mbforbes): Make this more reliable with retries.
-  gcloud compute --project ${PROJECT} ssh --zone ${ZONE} ${MASTER_NAME} --command \
-    "curl --fail --silent -H 'Metadata-Flavor: Google' \
-      'http://metadata/computeMetadata/v1/instance/attributes/kube-env'" 2>/dev/null \
-    | grep ${1} | cut -d : -f 2 | cut -d \' -f 2
+  echo "${1}" | grep ${2} | cut -d : -f 2 | cut -d \' -f 2
 }
 
 # Assumed vars:
@@ -132,9 +139,40 @@ function get-env-val() {
 #   NODE_INSTANCE_PREFIX
 #   PROJECT
 #   ZONE
+#
+# Vars set:
+#   KUBELET_TOKEN
+#   KUBE_PROXY_TOKEN
+#   CA_CERT_BASE64
+#   EXTRA_DOCKER_OPTS
+#   KUBELET_CERT_BASE64
+#   KUBELET_KEY_BASE64
 function upgrade-nodes() {
-  local sanitized_version=$(echo ${KUBE_VERSION} | sed s/"\."/-/g)
-  echo "== Upgrading nodes to ${KUBE_VERSION}. =="
+  prepare-node-upgrade
+  do-node-upgrade
+}
+
+# prepare-node-upgrade creates a new instance template suitable for upgrading
+# to KUBE_VERSION and echoes a single line with the name of the new template.
+#
+# Assumed vars:
+#   KUBE_VERSION
+#   MINION_SCOPES
+#   NODE_INSTANCE_PREFIX
+#   PROJECT
+#   ZONE
+#
+# Vars set:
+#   SANITIZED_VERSION
+#   KUBELET_TOKEN
+#   KUBE_PROXY_TOKEN
+#   CA_CERT_BASE64
+#   EXTRA_DOCKER_OPTS
+#   KUBELET_CERT_BASE64
+#   KUBELET_KEY_BASE64
+function prepare-node-upgrade() {
+  echo "== Preparing node upgrade (to ${KUBE_VERSION}). ==" >&2
+  SANITIZED_VERSION=$(echo ${KUBE_VERSION} | sed s/"\."/-/g)
 
   detect-minion-names
 
@@ -146,34 +184,56 @@ function upgrade-nodes() {
     scope_flags=("--no-scopes")
   fi
 
-  # Get required node tokens.
-  KUBELET_TOKEN=$(get-env-val "KUBELET_TOKEN")
-  KUBE_PROXY_TOKEN=$(get-env-val "KUBE_PROXY_TOKEN")
+  # Get required node env vars from existing template.
+  local node_env=$(get-node-env)
+  KUBELET_TOKEN=$(get-env-val "${node_env}" "KUBELET_TOKEN")
+  KUBE_PROXY_TOKEN=$(get-env-val "${node_env}" "KUBE_PROXY_TOKEN")
+  CA_CERT_BASE64=$(get-env-val "${node_env}" "CA_CERT")
+  EXTRA_DOCKER_OPTS=$(get-env-val "${node_env}" "EXTRA_DOCKER_OPTS")
+  KUBELET_CERT_BASE64=$(get-env-val "${node_env}" "KUBELET_CERT")
+  KUBELET_KEY_BASE64=$(get-env-val "${node_env}" "KUBELET_KEY")
 
   # TODO(mbforbes): How do we ensure kube-env is written in a ${version}-
   # compatible way?
   write-node-env
+
   # TODO(mbforbes): Get configure-vm script from ${version}. (Must plumb this
   # through all create-node-instance-template implementations).
-  create-node-instance-template ${sanitized_version}
+  create-node-instance-template ${SANITIZED_VERSION}
+  # The following is echo'd so that callers can get the template name.
+  echo "${NODE_INSTANCE_PREFIX}-template-${SANITIZED_VERSION}"
+  echo "== Finished preparing node upgrade (to ${KUBE_VERSION}). ==" >&2
+}
 
+# Prereqs:
+# - prepare-node-upgrade should have been called successfully
+function do-node-upgrade() {
+  echo "== Upgrading nodes to ${KUBE_VERSION}. ==" >&2
   # Do the actual upgrade.
-  gcloud preview rolling-updates start \
-      --group "${NODE_INSTANCE_PREFIX}-group" \
-      --max-num-concurrent-instances 1 \
-      --max-num-failed-instances 0 \
-      --project "${PROJECT}" \
-      --zone "${ZONE}" \
-      --template "${NODE_INSTANCE_PREFIX}-template-${sanitized_version}"
+  # NOTE(mbforbes): If you are changing this gcloud command, update
+  # test/e2e/restart.go to match this EXACTLY.
+  gcloud preview rolling-updates \
+      --project="${PROJECT}" \
+      --zone="${ZONE}" \
+      start \
+      --group="${NODE_INSTANCE_PREFIX}-group" \
+      --template="${NODE_INSTANCE_PREFIX}-template-${SANITIZED_VERSION}" \
+      --instance-startup-timeout=300s \
+      --max-num-concurrent-instances=1 \
+      --max-num-failed-instances=0 \
+      --min-instance-update-time=0s
 
-  echo "== Done =="
+  # TODO(mbforbes): Wait for the rolling-update to finish.
+
+  echo "== Finished upgrading nodes to ${KUBE_VERSION}. ==" >&2
 }
 
 master_upgrade=true
 node_upgrade=true
+node_prereqs=false
 local_binaries=false
 
-while getopts ":MNlh" opt; do
+while getopts ":MNPlh" opt; do
   case ${opt} in
     M)
       node_upgrade=false
@@ -181,6 +241,9 @@ while getopts ":MNlh" opt; do
     N)
       master_upgrade=false
      ;;
+    P)
+      node_prereqs=true
+      ;;
    l)
      local_binaries=true
      ;;
@@ -213,6 +276,11 @@ fi
 
 prepare-upgrade
 
+if [[ "${node_prereqs}" == "true" ]]; then
+  prepare-node-upgrade
+  exit 0
+fi
+
 if [[ "${master_upgrade}" == "true" ]]; then
   upgrade-master
 fi
@@ -220,6 +288,7 @@ fi
 if [[ "${node_upgrade}" == "true" ]]; then
   if [[ "${local_binaries}" == "true" ]]; then
    echo "Upgrading nodes to local binaries is not yet supported."
>&2 + exit 1 else upgrade-nodes fi diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 254e6970a0..34e83b17bd 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -250,7 +250,7 @@ function detect-minion-names { MINION_NAMES=($(gcloud preview --project "${PROJECT}" instance-groups \ --zone "${ZONE}" instances --group "${NODE_INSTANCE_PREFIX}-group" list \ | cut -d'/' -f11)) - echo "MINION_NAMES=${MINION_NAMES[*]}" + echo "MINION_NAMES=${MINION_NAMES[*]}" >&2 } # Waits until the number of running nodes in the instance group is equal to NUM_NODES @@ -415,8 +415,9 @@ function create-node-template { fi fi - local attempt=0 + local attempt=1 while true; do + echo "Attempt ${attempt} to create ${1}" >&2 if ! gcloud compute instance-templates create "$1" \ --project "${PROJECT}" \ --machine-type "${MINION_SIZE}" \ @@ -428,12 +429,12 @@ function create-node-template { --network "${NETWORK}" \ $2 \ --can-ip-forward \ - --metadata-from-file "$3","$4"; then + --metadata-from-file "$3","$4" >&2; then if (( attempt > 5 )); then echo -e "${color_red}Failed to create instance template $1 ${color_norm}" >&2 exit 2 fi - echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create instance template $1. Retrying.${color_norm}" >&2 + echo -e "${color_yellow}Attempt ${attempt} failed to create instance template $1. Retrying.${color_norm}" >&2 attempt=$(($attempt+1)) else break diff --git a/pkg/util/ssh.go b/pkg/util/ssh.go index 8cc747d0e4..d59d1e13a7 100644 --- a/pkg/util/ssh.go +++ b/pkg/util/ssh.go @@ -142,6 +142,8 @@ func (s *SSHTunnel) Close() error { return nil } +// RunSSHCommand returns the stdout, stderr, and exit code from running cmd on +// host along with any SSH-level error. func RunSSHCommand(cmd, host string, signer ssh.Signer) (string, string, int, error) { // Setup the config, dial the server, and open a session. config := &ssh.ClientConfig{ diff --git a/test/e2e/cluster_upgrade.go b/test/e2e/cluster_upgrade.go index 5a09722036..b79064fafb 100644 --- a/test/e2e/cluster_upgrade.go +++ b/test/e2e/cluster_upgrade.go @@ -17,17 +17,16 @@ limitations under the License. package e2e import ( + "bytes" "fmt" - "io" "net/http" - "os" "os/exec" "path" + "strings" "sync" "time" "github.com/GoogleCloudPlatform/kubernetes/pkg/api" - "github.com/GoogleCloudPlatform/kubernetes/pkg/fields" "github.com/GoogleCloudPlatform/kubernetes/pkg/labels" "github.com/GoogleCloudPlatform/kubernetes/pkg/util" "github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait" @@ -36,68 +35,176 @@ import ( . "github.com/onsi/gomega" ) +// version applies to upgrades; kube-push always pushes local binaries. +const version = "latest_ci" + +// The following upgrade functions are passed into the framework below and used +// to do the actual upgrades. 
+
+var masterUpgrade = func() error {
+	_, _, err := runScript("hack/e2e-internal/e2e-upgrade.sh", "-M", version)
+	return err
+}
+
+var masterPush = func() error {
+	_, _, err := runScript("hack/e2e-internal/e2e-push.sh", "-m")
+	return err
+}
+
+var nodeUpgrade = func(f Framework, replicas int) error {
+	Logf("Preparing node upgrade by creating new instance template")
+	stdout, _, err := runScript("hack/e2e-internal/e2e-upgrade.sh", "-P", version)
+	if err != nil {
+		return err
+	}
+	tmpl := strings.TrimSpace(stdout)
+
+	Logf("Performing a node upgrade to %s; waiting at most %v per node", tmpl, restartPerNodeTimeout)
+	if err := migRollingUpdate(tmpl, restartPerNodeTimeout); err != nil {
+		return fmt.Errorf("error doing node upgrade via a migRollingUpdate to %s: %v", tmpl, err)
+	}
+
+	Logf("Waiting up to %v for all nodes to be ready after the upgrade", restartNodeReadyAgainTimeout)
+	if _, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, testContext.CloudConfig.NumNodes); err != nil {
+		return err
+	}
+
+	Logf("Waiting up to %v for all pods to be running and ready after the upgrade", restartPodReadyAgainTimeout)
+	return waitForPodsRunningReady(f.Namespace.Name, replicas, restartPodReadyAgainTimeout)
+}
+
 var _ = Describe("Skipped", func() {
 	Describe("Cluster upgrade", func() {
-		svcName := "baz"
-		var podName string
-		framework := Framework{BaseName: "cluster-upgrade"}
-		var webserver *WebserverTest
+		svcName, replicas := "baz", 2
+		var rcName, ip string
+		var ingress api.LoadBalancerIngress
+		f := Framework{BaseName: "cluster-upgrade"}
+		var w *WebserverTest
 
 		BeforeEach(func() {
-			framework.beforeEach()
-			webserver = NewWebserverTest(framework.Client, framework.Namespace.Name, svcName)
-			pod := webserver.CreateWebserverPod()
-			podName = pod.Name
-			svc := webserver.BuildServiceSpec()
+			By("Setting up the service, RC, and pods")
+			f.beforeEach()
+			w = NewWebserverTest(f.Client, f.Namespace.Name, svcName)
+			rc := w.CreateWebserverRC(replicas)
+			rcName = rc.ObjectMeta.Name
+			svc := w.BuildServiceSpec()
 			svc.Spec.Type = api.ServiceTypeLoadBalancer
-			webserver.CreateService(svc)
+			w.CreateService(svc)
+
+			By("Waiting for the service to become reachable")
+			result, err := waitForLoadBalancerIngress(f.Client, svcName, f.Namespace.Name)
+			Expect(err).NotTo(HaveOccurred())
+			ingresses := result.Status.LoadBalancer.Ingress
+			if len(ingresses) != 1 {
+				Failf("Was expecting only 1 ingress IP but got %d (%v): %v", len(ingresses), ingresses, result)
+			}
+			ingress = ingresses[0]
+			Logf("Got load balancer ingress point %v", ingress)
+			ip = ingress.IP
+			if ip == "" {
+				ip = ingress.Hostname
+			}
+			testLoadBalancerReachable(ingress, 80)
+
+			// TODO(mbforbes): Add setup, validate, and teardown for:
+			//   - secrets
+			//   - volumes
+			//   - persistent volumes
 		})
 
 		AfterEach(func() {
-			framework.afterEach()
-			webserver.Cleanup()
+			f.afterEach()
+			w.Cleanup()
 		})
 
 		Describe("kube-push", func() {
 			It("of master should maintain responsive services", func() {
-				testClusterUpgrade(framework, svcName, podName, func() {
-					runUpgradeScript("hack/e2e-internal/e2e-push.sh", "-m")
-				})
+				By("Validating cluster before master upgrade")
+				expectNoError(validate(f, svcName, rcName, ingress, replicas))
+				By("Performing a master upgrade")
+				testMasterUpgrade(ip, masterPush)
+				By("Validating cluster after master upgrade")
+				expectNoError(validate(f, svcName, rcName, ingress, replicas))
 			})
 		})
 
-		Describe("gce-upgrade", func() {
-			It("of master should maintain responsive services", func() {
+		Describe("gce-upgrade-master",
func() { + It("should maintain responsive services", func() { + // TODO(mbforbes): Add GKE support. if !providerIs("gce") { - By(fmt.Sprintf("Skippingt test, which is not implemented for %s", testContext.Provider)) + By(fmt.Sprintf("Skipping upgrade test, which is not implemented for %s", testContext.Provider)) return } - testClusterUpgrade(framework, svcName, podName, func() { - runUpgradeScript("hack/e2e-internal/e2e-upgrade.sh", "-M", "-l") - }) + By("Validating cluster before master upgrade") + expectNoError(validate(f, svcName, rcName, ingress, replicas)) + By("Performing a master upgrade") + testMasterUpgrade(ip, masterUpgrade) + By("Validating cluster after master upgrade") + expectNoError(validate(f, svcName, rcName, ingress, replicas)) + }) + }) + + Describe("gce-upgrade-cluster", func() { + var tmplBefore, tmplAfter string + BeforeEach(func() { + By("Getting the node template before the upgrade") + var err error + tmplBefore, err = migTemplate() + expectNoError(err) + }) + + AfterEach(func() { + By("Cleaning up any unused node templates") + var err error + tmplAfter, err = migTemplate() + if err != nil { + Logf("Could not get node template post-upgrade; may have leaked template %s", tmplBefore) + return + } + if tmplBefore == tmplAfter { + // The node upgrade failed so there's no need to delete + // anything. + Logf("Node template %s is still in use; not cleaning up", tmplBefore) + return + } + // TODO(mbforbes): Distinguish between transient failures + // and "cannot delete--in use" errors and retry on the + // former. + Logf("Deleting node template %s", tmplBefore) + o, err := exec.Command("gcloud", "compute", "instance-templates", + fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), + "delete", + tmplBefore).CombinedOutput() + if err != nil { + Logf("gcloud compute instance-templates delete %s call failed with err: %v, output: %s", + tmplBefore, err, string(o)) + Logf("May have leaked %s", tmplBefore) + } + }) + + It("should maintain a functioning cluster", func() { + // TODO(mbforbes): Add GKE support. 
+ if !providerIs("gce") { + By(fmt.Sprintf("Skipping upgrade test, which is not implemented for %s", testContext.Provider)) + return + } + By("Validating cluster before master upgrade") + expectNoError(validate(f, svcName, rcName, ingress, replicas)) + By("Performing a master upgrade") + testMasterUpgrade(ip, masterUpgrade) + By("Validating cluster after master upgrade") + expectNoError(validate(f, svcName, rcName, ingress, replicas)) + By("Performing a node upgrade") + testNodeUpgrade(f, nodeUpgrade, replicas) + By("Validating cluster after node upgrade") + expectNoError(validate(f, svcName, rcName, ingress, replicas)) }) }) }) }) -func testClusterUpgrade(framework Framework, svcName, podName string, upgrade func()) { - result, err := waitForLoadBalancerIngress(framework.Client, svcName, framework.Namespace.Name) - Expect(err).NotTo(HaveOccurred()) - ingresses := result.Status.LoadBalancer.Ingress - if len(ingresses) != 1 { - Failf("Was expecting only 1 ingress IP but got %d (%v): %v", len(ingresses), ingresses, result) - } - ingress := ingresses[0] - ip := ingress.IP - if ip == "" { - ip = ingress.Hostname - } - - By("Waiting for pod to become reachable") - testLoadBalancerReachable(ingress, 80) - validateClusterUpgrade(framework, svcName, podName) - - Logf("starting async validation") +func testMasterUpgrade(ip string, mUp func() error) { + Logf("Starting async validation") httpClient := http.Client{Timeout: 2 * time.Second} done := make(chan struct{}, 1) // Let's make sure we've finished the heartbeat before shutting things down. @@ -107,55 +214,94 @@ func testClusterUpgrade(framework Framework, svcName, podName string, upgrade fu wg.Add(1) defer wg.Done() - expectNoError(wait.Poll(poll, singleCallTimeout, func() (bool, error) { + if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) { r, err := httpClient.Get("http://" + ip) if err != nil { + Logf("Error reaching %s: %v", ip, err) return false, nil } if r.StatusCode < http.StatusOK || r.StatusCode >= http.StatusNotFound { + Logf("Bad response; status: %d, response: %v", r.StatusCode, r) return false, nil } return true, nil - })) + }); err != nil { + // We log the error here because the test will fail at the very end + // because this validation runs in another goroutine. Without this, + // a failure is very confusing to track down because from the logs + // everything looks fine. + msg := fmt.Sprintf("Failed to contact service during master upgrade: %v", err) + Logf(msg) + Failf(msg) + } }, 200*time.Millisecond, done) - By("Starting upgrade") - upgrade() + Logf("Starting master upgrade") + expectNoError(mUp()) done <- struct{}{} Logf("Stopping async validation") wg.Wait() - Logf("Upgrade complete.") - - By("Validating post upgrade state") - validateClusterUpgrade(framework, svcName, podName) + Logf("Master upgrade complete") } -func runUpgradeScript(scriptPath string, args ...string) { - cmd := exec.Command(path.Join(testContext.RepoRoot, scriptPath), args...) 
- upgradeLogPath := path.Join(testContext.OutputDir, "upgrade-"+string(util.NewUUID())+".log") - Logf("Writing upgrade logs to %s", upgradeLogPath) - upgradeLog, err := os.Create(upgradeLogPath) - expectNoError(err) +func testNodeUpgrade(f Framework, nUp func(f Framework, n int) error, replicas int) { + Logf("Starting node upgrade") + expectNoError(nUp(f, replicas)) + Logf("Node upgrade complete") + + // TODO(mbforbes): Validate that: + // - the node software version truly changed - cmd.Stdout = io.MultiWriter(os.Stdout, upgradeLog) - cmd.Stderr = io.MultiWriter(os.Stderr, upgradeLog) - if err := cmd.Run(); err != nil { - Failf("Upgrade failed: %v", err) - } } -func validateClusterUpgrade(framework Framework, svcName, podName string) { - pods, err := framework.Client.Pods(framework.Namespace.Name).List(labels.Everything(), fields.Everything()) - Expect(err).NotTo(HaveOccurred()) - Expect(len(pods.Items) == 1).Should(BeTrue()) - if podName != pods.Items[0].Name { - Failf("pod name should not have changed") - } - _, err = podRunningReady(&pods.Items[0]) - Expect(err).NotTo(HaveOccurred()) - svc, err := framework.Client.Services(framework.Namespace.Name).Get(svcName) - Expect(err).NotTo(HaveOccurred()) - if svcName != svc.Name { - Failf("service name should not have changed") +// runScript runs script on testContext.RepoRoot using args and returns +// stdout, stderr, and error. +func runScript(script string, args ...string) (string, string, error) { + Logf("Running %s %v", script, args) + var bout, berr bytes.Buffer + cmd := exec.Command(path.Join(testContext.RepoRoot, script), args...) + cmd.Stdout, cmd.Stderr = &bout, &berr + err := cmd.Run() + stdout, stderr := bout.String(), berr.String() + if err != nil { + return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q", + script, args, err, stdout, stderr) } + Logf("stdout: %s", stdout) + Logf("stderr: %s", stderr) + return stdout, stderr, nil +} + +func validate(f Framework, svcNameWant, rcNameWant string, ingress api.LoadBalancerIngress, podsWant int) error { + Logf("Beginning cluster validation") + // Verify RC. + rcs, err := f.Client.ReplicationControllers(f.Namespace.Name).List(labels.Everything()) + if err != nil { + return fmt.Errorf("error listing RCs: %v", err) + } + if len(rcs.Items) != 1 { + return fmt.Errorf("wanted 1 RC with name %s, got %d", rcNameWant, len(rcs.Items)) + } + if got := rcs.Items[0].Name; got != rcNameWant { + return fmt.Errorf("wanted RC name %q, got %q", rcNameWant, got) + } + + // Verify pods. + if err := verifyPods(f.Client, f.Namespace.Name, rcNameWant, false, podsWant); err != nil { + return fmt.Errorf("failed to find %d %q pods: %v", podsWant, rcNameWant, err) + } + + // Verify service. + svc, err := f.Client.Services(f.Namespace.Name).Get(svcNameWant) + if err != nil { + return fmt.Errorf("error getting service %s: %v", svcNameWant, err) + } + if svcNameWant != svc.Name { + return fmt.Errorf("wanted service name %q, got %q", svcNameWant, svc.Name) + } + // TODO(mbforbes): Make testLoadBalancerReachable return an error. 
+ testLoadBalancerReachable(ingress, 80) + + Logf("Cluster validation succeeded") + return nil } diff --git a/test/e2e/rc.go b/test/e2e/rc.go index 4940a8ef86..1a7fba5825 100644 --- a/test/e2e/rc.go +++ b/test/e2e/rc.go @@ -141,7 +141,7 @@ func ServeImageOrFail(c *client.Client, test string, image string) { By("Trying to dial each unique pod") retryTimeout := 2 * time.Minute retryInterval := 5 * time.Second - err = wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, pods}.checkAllResponses) + err = wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, true, pods}.checkAllResponses) if err != nil { Failf("Did not get expected responses within the timeout period of %.2f seconds.", retryTimeout.Seconds()) } diff --git a/test/e2e/reboot.go b/test/e2e/reboot.go index 20558d1f38..02dc494247 100644 --- a/test/e2e/reboot.go +++ b/test/e2e/reboot.go @@ -155,7 +155,8 @@ func issueSSHCommand(node *api.Node, provider, cmd string) error { // failed step, it will return false through result and not run the rest. func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) { // Setup - ps := newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name)) + ns := api.NamespaceDefault + ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name)) defer ps.Stop() // Get the node initially. @@ -183,7 +184,7 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan // For each pod, we do a sanity check to ensure it's running / healthy // now, as that's what we'll be checking later. - if !checkPodsRunningReady(c, podNames, podReadyBeforeTimeout) { + if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) { result <- false return } @@ -209,7 +210,7 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan // Ensure all of the pods that we found on this node before the reboot are // running / healthy. - if !checkPodsRunningReady(c, podNames, rebootPodReadyAgainTimeout) { + if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) { result <- false return } diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go index 1cf65ac7bc..21e5343a59 100644 --- a/test/e2e/resize_nodes.go +++ b/test/e2e/resize_nodes.go @@ -24,6 +24,7 @@ import ( "time" "github.com/GoogleCloudPlatform/kubernetes/pkg/api" + "github.com/GoogleCloudPlatform/kubernetes/pkg/api/latest" "github.com/GoogleCloudPlatform/kubernetes/pkg/client" "github.com/GoogleCloudPlatform/kubernetes/pkg/fields" "github.com/GoogleCloudPlatform/kubernetes/pkg/labels" @@ -34,9 +35,9 @@ import ( . "github.com/onsi/gomega" ) -var serveHostnameImage string = "gcr.io/google_containers/serve_hostname:1.1" +const serveHostnameImage = "gcr.io/google_containers/serve_hostname:1.1" -func resizeNodeInstanceGroup(size int) error { +func resizeGroup(size int) error { // TODO: make this hit the compute API directly instread of shelling out to gcloud. 
output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID, "--zone="+testContext.CloudConfig.Zone, "resize", testContext.CloudConfig.NodeInstanceGroup, fmt.Sprintf("--new-size=%v", size)).CombinedOutput() @@ -46,7 +47,7 @@ func resizeNodeInstanceGroup(size int) error { return err } -func nodeInstanceGroupSize() (int, error) { +func groupSize() (int, error) { // TODO: make this hit the compute API directly instread of shelling out to gcloud. output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID, "--zone="+testContext.CloudConfig.Zone, "describe", testContext.CloudConfig.NodeInstanceGroup).CombinedOutput() @@ -71,9 +72,9 @@ func nodeInstanceGroupSize() (int, error) { return currentSize, nil } -func waitForNodeInstanceGroupSize(size int) error { +func waitForGroupSize(size int) error { for start := time.Now(); time.Since(start) < 4*time.Minute; time.Sleep(5 * time.Second) { - currentSize, err := nodeInstanceGroupSize() + currentSize, err := groupSize() if err != nil { Logf("Failed to get node instance group size: %v", err) continue @@ -104,7 +105,7 @@ func waitForClusterSize(c *client.Client, size int) error { return fmt.Errorf("timeout waiting for cluster size to be %d", size) } -func newServiceWithNameSelector(name string) *api.Service { +func svcByName(name string) *api.Service { return &api.Service{ ObjectMeta: api.ObjectMeta{ Name: "test-service", @@ -121,12 +122,12 @@ func newServiceWithNameSelector(name string) *api.Service { } } -func createServiceWithNameSelector(c *client.Client, ns, name string) error { - _, err := c.Services(ns).Create(newServiceWithNameSelector(name)) +func newSVCByName(c *client.Client, ns, name string) error { + _, err := c.Services(ns).Create(svcByName(name)) return err } -func newPodOnNode(podName, nodeName string, image string) *api.Pod { +func podOnNode(podName, nodeName string, image string) *api.Pod { return &api.Pod{ ObjectMeta: api.ObjectMeta{ Name: podName, @@ -148,18 +149,39 @@ func newPodOnNode(podName, nodeName string, image string) *api.Pod { } } -func createServeHostnamePodOnNode(c *client.Client, namespace, podName, nodeName string) error { - pod, err := c.Pods(namespace).Create(newPodOnNode(podName, nodeName, serveHostnameImage)) +func newPodOnNode(c *client.Client, namespace, podName, nodeName string) error { + pod, err := c.Pods(namespace).Create(podOnNode(podName, nodeName, serveHostnameImage)) if err == nil { Logf("Created pod %s on node %s", pod.ObjectMeta.Name, nodeName) } else { - Logf("Failed to create pod %s on node %s: %s", podName, nodeName, err) + Logf("Failed to create pod %s on node %s: %v", podName, nodeName, err) } return err } -func newReplicationControllerWithNameSelector(name string, replicas int, image string) *api.ReplicationController { +func rcByName(name string, replicas int, image string, labels map[string]string) *api.ReplicationController { + return rcByNameContainer(name, replicas, image, labels, api.Container{ + Name: name, + Image: image, + }) +} + +func rcByNamePort(name string, replicas int, image string, port int, labels map[string]string) *api.ReplicationController { + return rcByNameContainer(name, replicas, image, labels, api.Container{ + Name: name, + Image: image, + Ports: []api.ContainerPort{{ContainerPort: port}}, + }) +} + +func rcByNameContainer(name string, replicas int, image string, labels map[string]string, c api.Container) *api.ReplicationController { + // Add "name": 
name to the labels, overwriting if it exists. + labels["name"] = name return &api.ReplicationController{ + TypeMeta: api.TypeMeta{ + Kind: "ReplicationController", + APIVersion: latest.Version, + }, ObjectMeta: api.ObjectMeta{ Name: name, }, @@ -170,28 +192,24 @@ func newReplicationControllerWithNameSelector(name string, replicas int, image s }, Template: &api.PodTemplateSpec{ ObjectMeta: api.ObjectMeta{ - Labels: map[string]string{"name": name}, + Labels: labels, }, Spec: api.PodSpec{ - Containers: []api.Container{ - { - Name: name, - Image: image, - Ports: []api.ContainerPort{{ContainerPort: 9376}}, - }, - }, + Containers: []api.Container{c}, }, }, }, } } -func createServeHostnameReplicationController(c *client.Client, ns, name string, replicas int) (*api.ReplicationController, error) { +// newRCByName creates a replication controller with a selector by name of name. +func newRCByName(c *client.Client, ns, name string, replicas int) (*api.ReplicationController, error) { By(fmt.Sprintf("creating replication controller %s", name)) - return c.ReplicationControllers(ns).Create(newReplicationControllerWithNameSelector(name, replicas, serveHostnameImage)) + return c.ReplicationControllers(ns).Create(rcByNamePort( + name, replicas, serveHostnameImage, 9376, map[string]string{})) } -func resizeReplicationController(c *client.Client, ns, name string, replicas int) error { +func resizeRC(c *client.Client, ns, name string, replicas int) error { rc, err := c.ReplicationControllers(ns).Get(name) if err != nil { return err @@ -201,7 +219,7 @@ func resizeReplicationController(c *client.Client, ns, name string, replicas int return err } -func waitForPodsCreated(c *client.Client, ns, name string, replicas int) (*api.PodList, error) { +func podsCreated(c *client.Client, ns, name string, replicas int) (*api.PodList, error) { // List the pods, making sure we observe all the replicas. label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name})) for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) { @@ -218,7 +236,7 @@ func waitForPodsCreated(c *client.Client, ns, name string, replicas int) (*api.P return nil, fmt.Errorf("Pod name %s: Gave up waiting for %d pods to come up", name, replicas) } -func waitForPodsRunning(c *client.Client, pods *api.PodList) []error { +func podsRunning(c *client.Client, pods *api.PodList) []error { // Wait for the pods to enter the running state. Waiting loops until the pods // are running so non-running pods cause a timeout for this test. 
By("ensuring each pod is running") @@ -233,24 +251,24 @@ func waitForPodsRunning(c *client.Client, pods *api.PodList) []error { return e } -func verifyPodsResponding(c *client.Client, ns, name string, pods *api.PodList) error { +func podsResponding(c *client.Client, ns, name string, wantName bool, pods *api.PodList) error { By("trying to dial each unique pod") retryTimeout := 2 * time.Minute retryInterval := 5 * time.Second label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name})) - return wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, pods}.checkAllResponses) + return wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, wantName, pods}.checkAllResponses) } -func waitForPodsCreatedRunningResponding(c *client.Client, ns, name string, replicas int) error { - pods, err := waitForPodsCreated(c, ns, name, replicas) +func verifyPods(c *client.Client, ns, name string, wantName bool, replicas int) error { + pods, err := podsCreated(c, ns, name, replicas) if err != nil { return err } - e := waitForPodsRunning(c, pods) + e := podsRunning(c, pods) if len(e) > 0 { return fmt.Errorf("Failed to wait for pods running: %v", e) } - err = verifyPodsResponding(c, ns, name, pods) + err = podsResponding(c, ns, name, wantName, pods) if err != nil { return err } @@ -331,7 +349,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica waitForRCPodToDisappear(c, ns, rcName, podNameToDisappear) By("verifying whether the pod from the unreachable node is recreated") - err := waitForPodsCreatedRunningResponding(c, ns, rcName, replicas) + err := verifyPods(c, ns, rcName, true, replicas) Expect(err).NotTo(HaveOccurred()) // network traffic is unblocked in a defered function @@ -372,10 +390,10 @@ var _ = Describe("Nodes", func() { return } By("restoring the original node instance group size") - if err := resizeNodeInstanceGroup(testContext.CloudConfig.NumNodes); err != nil { + if err := resizeGroup(testContext.CloudConfig.NumNodes); err != nil { Failf("Couldn't restore the original node instance group size: %v", err) } - if err := waitForNodeInstanceGroupSize(testContext.CloudConfig.NumNodes); err != nil { + if err := waitForGroupSize(testContext.CloudConfig.NumNodes); err != nil { Failf("Couldn't restore the original node instance group size: %v", err) } if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes); err != nil { @@ -396,20 +414,20 @@ var _ = Describe("Nodes", func() { // The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname name := "my-hostname-delete-node" replicas := testContext.CloudConfig.NumNodes - createServeHostnameReplicationController(c, ns, name, replicas) - err := waitForPodsCreatedRunningResponding(c, ns, name, replicas) + newRCByName(c, ns, name, replicas) + err := verifyPods(c, ns, name, true, replicas) Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("decreasing cluster size to %d", replicas-1)) - err = resizeNodeInstanceGroup(replicas - 1) + err = resizeGroup(replicas - 1) Expect(err).NotTo(HaveOccurred()) - err = waitForNodeInstanceGroupSize(replicas - 1) + err = waitForGroupSize(replicas - 1) Expect(err).NotTo(HaveOccurred()) err = waitForClusterSize(c, replicas-1) Expect(err).NotTo(HaveOccurred()) By("verifying whether the pods from the removed node are recreated") - err = waitForPodsCreatedRunningResponding(c, ns, name, replicas) + err = verifyPods(c, ns, name, true, replicas) Expect(err).NotTo(HaveOccurred()) }) @@ -426,24 +444,24 
@@ var _ = Describe("Nodes", func() { // Create a replication controller for a service that serves its hostname. // The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname name := "my-hostname-add-node" - createServiceWithNameSelector(c, ns, name) + newSVCByName(c, ns, name) replicas := testContext.CloudConfig.NumNodes - createServeHostnameReplicationController(c, ns, name, replicas) - err := waitForPodsCreatedRunningResponding(c, ns, name, replicas) + newRCByName(c, ns, name, replicas) + err := verifyPods(c, ns, name, true, replicas) Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("increasing cluster size to %d", replicas+1)) - err = resizeNodeInstanceGroup(replicas + 1) + err = resizeGroup(replicas + 1) Expect(err).NotTo(HaveOccurred()) - err = waitForNodeInstanceGroupSize(replicas + 1) + err = waitForGroupSize(replicas + 1) Expect(err).NotTo(HaveOccurred()) err = waitForClusterSize(c, replicas+1) Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1)) - err = resizeReplicationController(c, ns, name, replicas+1) + err = resizeRC(c, ns, name, replicas+1) Expect(err).NotTo(HaveOccurred()) - err = waitForPodsCreatedRunningResponding(c, ns, name, replicas+1) + err = verifyPods(c, ns, name, true, replicas+1) Expect(err).NotTo(HaveOccurred()) }) }) @@ -472,10 +490,10 @@ var _ = Describe("Nodes", func() { // Create a replication controller for a service that serves its hostname. // The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname name := "my-hostname-net" - createServiceWithNameSelector(c, ns, name) + newSVCByName(c, ns, name) replicas := testContext.CloudConfig.NumNodes - createServeHostnameReplicationController(c, ns, name, replicas) - err := waitForPodsCreatedRunningResponding(c, ns, name, replicas) + newRCByName(c, ns, name, replicas) + err := verifyPods(c, ns, name, true, replicas) Expect(err).NotTo(HaveOccurred(), "Each pod should start running and responding") By("choose a node with at least one pod - we will block some network traffic on this node") @@ -496,9 +514,9 @@ var _ = Describe("Nodes", func() { // increasing the RC size is not a valid way to test this // since we have no guarantees the pod will be scheduled on our node. 
 		additionalPod := "additionalpod"
-		err = createServeHostnamePodOnNode(c, ns, additionalPod, node.Name)
+		err = newPodOnNode(c, ns, additionalPod, node.Name)
 		Expect(err).NotTo(HaveOccurred())
-		err = waitForPodsCreatedRunningResponding(c, ns, additionalPod, 1)
+		err = verifyPods(c, ns, additionalPod, true, 1)
 		Expect(err).NotTo(HaveOccurred())
 
 		// verify that it is really on the requested node
diff --git a/test/e2e/restart.go b/test/e2e/restart.go
index bae8b27551..cf8c2c9c94 100644
--- a/test/e2e/restart.go
+++ b/test/e2e/restart.go
@@ -86,7 +86,8 @@ var _ = Describe("Restart", func() {
 		for i, p := range pods {
 			podNamesBefore[i] = p.ObjectMeta.Name
 		}
-		if !checkPodsRunningReady(c, podNamesBefore, podReadyBeforeTimeout) {
+		ns := api.NamespaceDefault
+		if !checkPodsRunningReady(c, ns, podNamesBefore, podReadyBeforeTimeout) {
 			Failf("At least one pod wasn't running and ready at test start.")
 		}
 
@@ -115,7 +116,7 @@ var _ = Describe("Restart", func() {
 		podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout)
 		Expect(err).NotTo(HaveOccurred())
 		remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart)
-		if !checkPodsRunningReady(c, podNamesAfter, remaining) {
+		if !checkPodsRunningReady(c, ns, podNamesAfter, remaining) {
 			Failf("At least one pod wasn't running and ready after the restart.")
 		}
 	})
@@ -166,7 +167,8 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
 			return false, nil
 		}
 		if len(nodeList.Items) != expect {
-			errLast = fmt.Errorf("expected to find %d nodes but found only %d", expect, len(nodeList.Items))
+			errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)",
+				expect, len(nodeList.Items), time.Since(start))
 			Logf("%v", errLast)
 			return false, nil
 		}
@@ -180,6 +182,7 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
 		return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
 			expect, nt, errLast)
 	}
+	Logf("Successfully found %d nodes", expect)
 
 	// Next, ensure in parallel that all the nodes are ready. We subtract the
 	// time we spent waiting above.
@@ -209,28 +212,32 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
 func restartNodes(provider string, nt time.Duration) error {
 	switch provider {
 	case "gce":
-		return migRollingUpdate(nt)
+		return migRollingUpdateSelf(nt)
 	default:
 		return fmt.Errorf("restartNodes(...) not implemented for %s", provider)
 	}
 }
 
-// migRollingUpdate starts a MIG rolling update and waits up to nt times the
-// nubmer of nodes for it to complete.
-func migRollingUpdate(nt time.Duration) error {
+func migRollingUpdateSelf(nt time.Duration) error {
 	By("getting the name of the template for the managed instance group")
-	templ, err := migTemplate()
+	tmpl, err := migTemplate()
 	if err != nil {
 		return fmt.Errorf("couldn't get MIG template name: %v", err)
 	}
+	return migRollingUpdate(tmpl, nt)
+}
 
-	By("starting the managed instance group rolling update")
-	id, err := migRollingUpdateStart(templ, nt)
+// migRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
+// instance template named tmpl, and waits up to nt times the number of nodes
+// for it to complete.
+func migRollingUpdate(tmpl string, nt time.Duration) error { + By(fmt.Sprintf("starting the MIG rolling update to %s", tmpl)) + id, err := migRollingUpdateStart(tmpl, nt) if err != nil { return fmt.Errorf("couldn't start the MIG rolling update: %v", err) } - By("polling the managed instance group rolling update until it completes") + By(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id)) if err := migRollingUpdatePoll(id, nt); err != nil { return fmt.Errorf("err waiting until update completed: %v", err) } @@ -284,7 +291,9 @@ func migRollingUpdateStart(templ string, nt time.Duration) (string, error) { prefix, suffix := "Started [", "]." if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) { // TODO(mbforbes): make this hit the compute API directly instead of - // shelling out to gcloud. + // shelling out to gcloud. + // NOTE(mbforbes): If you are changing this gcloud command, update + // cluster/gce/upgrade.sh to match this EXACTLY. o, err := exec.Command("gcloud", "preview", "rolling-updates", fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone), @@ -361,6 +370,7 @@ func migRollingUpdatePoll(id string, nt time.Duration) error { }) != nil { return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast) } + Logf("MIG rolling update complete after %v", time.Since(start)) return nil } diff --git a/test/e2e/service.go b/test/e2e/service.go index dbb591a504..270cee9139 100644 --- a/test/e2e/service.go +++ b/test/e2e/service.go @@ -22,13 +22,10 @@ import ( "math/rand" "net/http" "sort" - "strconv" "strings" - "sync/atomic" "time" "github.com/GoogleCloudPlatform/kubernetes/pkg/api" - "github.com/GoogleCloudPlatform/kubernetes/pkg/api/latest" "github.com/GoogleCloudPlatform/kubernetes/pkg/client" "github.com/GoogleCloudPlatform/kubernetes/pkg/fields" "github.com/GoogleCloudPlatform/kubernetes/pkg/labels" @@ -277,7 +274,7 @@ var _ = Describe("Services", func() { } By("creating pod to be part of service " + serviceName) - t.CreateWebserverPod() + t.CreateWebserverRC(1) By("hitting the pod through the service's NodePort") testReachable(pickMinionIP(c), port.NodePort) @@ -324,7 +321,7 @@ var _ = Describe("Services", func() { } By("creating pod to be part of service " + serviceName) - t.CreateWebserverPod() + t.CreateWebserverRC(1) By("hitting the pod through the service's NodePort") ip := pickMinionIP(c) @@ -365,7 +362,7 @@ var _ = Describe("Services", func() { } By("creating pod to be part of service " + t.ServiceName) - t.CreateWebserverPod() + t.CreateWebserverRC(1) By("changing service " + serviceName + " to type=NodePort") service.Spec.Type = api.ServiceTypeNodePort @@ -515,7 +512,7 @@ var _ = Describe("Services", func() { Expect(err).NotTo(HaveOccurred()) By("creating pod to be part of service " + t.ServiceName) - t.CreateWebserverPod() + t.CreateWebserverRC(1) if service.Spec.Type != api.ServiceTypeLoadBalancer { Failf("got unexpected Spec.Type for LoadBalancer service: %v", service) @@ -1011,11 +1008,12 @@ func testReachable(ip string, port int) { Failf("Got port==0 for reachability check (%s)", url) } - By(fmt.Sprintf("Checking reachability of %s", url)) + By(fmt.Sprintf("Waiting up to %v for %s to be reachable", podStartTimeout, url)) + start := time.Now() expectNoError(wait.Poll(poll, podStartTimeout, func() (bool, error) { resp, err := httpGetNoConnectionPool(url) if err != nil { - Logf("Got error waiting for reachability of %s: %v", url, err) 
+ Logf("Got error waiting for reachability of %s: %v (%v)", url, err, time.Since(start)) return false, nil } defer resp.Body.Close() @@ -1044,7 +1042,7 @@ func testNotReachable(ip string, port int) { Failf("Got port==0 for non-reachability check (%s)", url) } - By(fmt.Sprintf("Checking that %s is not reachable", url)) + By(fmt.Sprintf("Waiting up to %v for %s to be *not* reachable", podStartTimeout, url)) expectNoError(wait.Poll(poll, podStartTimeout, func() (bool, error) { resp, err := httpGetNoConnectionPool(url) if err != nil { @@ -1086,11 +1084,10 @@ type WebserverTest struct { TestId string Labels map[string]string - pods map[string]bool + rcs map[string]bool services map[string]bool - - // Used for generating e.g. unique pod names - sequence int32 + name string + image string } func NewWebserverTest(client *client.Client, namespace string, serviceName string) *WebserverTest { @@ -1103,15 +1100,13 @@ func NewWebserverTest(client *client.Client, namespace string, serviceName strin "testid": t.TestId, } - t.pods = make(map[string]bool) + t.rcs = make(map[string]bool) t.services = make(map[string]bool) - return t -} + t.name = "webserver" + t.image = "gcr.io/google_containers/test-webserver" -func (t *WebserverTest) SequenceNext() int { - n := atomic.AddInt32(&t.sequence, 1) - return int(n) + return t } // Build default config for a service (which can then be changed) @@ -1131,43 +1126,27 @@ func (t *WebserverTest) BuildServiceSpec() *api.Service { return service } -// Create a pod with the well-known webserver configuration, and record it for cleanup -func (t *WebserverTest) CreateWebserverPod() *api.Pod { - name := t.ServiceName + "-" + strconv.Itoa(t.SequenceNext()) - pod := &api.Pod{ - TypeMeta: api.TypeMeta{ - Kind: "Pod", - APIVersion: latest.Version, - }, - ObjectMeta: api.ObjectMeta{ - Name: name, - Labels: t.Labels, - }, - Spec: api.PodSpec{ - Containers: []api.Container{ - { - Name: "webserver", - Image: "gcr.io/google_containers/test-webserver", - }, - }, - }, - } - pod, err := t.CreatePod(pod) +// CreateWebserverRC creates rc-backed pods with the well-known webserver +// configuration and records it for cleanup. +func (t *WebserverTest) CreateWebserverRC(replicas int) *api.ReplicationController { + rcSpec := rcByName(t.name, replicas, t.image, t.Labels) + rcAct, err := t.createRC(rcSpec) if err != nil { - Failf("Failed to create pod %s: %v", pod.Name, err) + Failf("Failed to create rc %s: %v", rcSpec.Name, err) } - expectNoError(waitForPodRunningInNamespace(t.Client, pod.Name, t.Namespace)) - return pod + if err := verifyPods(t.Client, t.Namespace, t.name, false, replicas); err != nil { + Failf("Failed to create %d pods with name %s: %v", replicas, t.name, err) + } + return rcAct } -// Create a pod, and record it for cleanup -func (t *WebserverTest) CreatePod(pod *api.Pod) (*api.Pod, error) { - podClient := t.Client.Pods(t.Namespace) - result, err := podClient.Create(pod) +// createRC creates a replication controller and records it for cleanup. 
+func (t *WebserverTest) createRC(rc *api.ReplicationController) (*api.ReplicationController, error) { + rc, err := t.Client.ReplicationControllers(t.Namespace).Create(rc) if err == nil { - t.pods[pod.Name] = true + t.rcs[rc.Name] = true } - return result, err + return rc, err } // Create a service, and record it for cleanup @@ -1190,14 +1169,23 @@ func (t *WebserverTest) DeleteService(serviceName string) error { func (t *WebserverTest) Cleanup() []error { var errs []error - - for podName := range t.pods { - podClient := t.Client.Pods(t.Namespace) - By("deleting pod " + podName + " in namespace " + t.Namespace) - err := podClient.Delete(podName, nil) + for rcName := range t.rcs { + By("stopping RC " + rcName + " in namespace " + t.Namespace) + // First, resize the RC to 0. + old, err := t.Client.ReplicationControllers(t.Namespace).Get(rcName) if err != nil { errs = append(errs, err) } + old.Spec.Replicas = 0 + if _, err := t.Client.ReplicationControllers(t.Namespace).Update(old); err != nil { + errs = append(errs, err) + } + // TODO(mbforbes): Wait. + + // Then, delete the RC altogether. + if err := t.Client.ReplicationControllers(t.Namespace).Delete(rcName); err != nil { + errs = append(errs, err) + } } for serviceName := range t.services { diff --git a/test/e2e/util.go b/test/e2e/util.go index 8566bd3f0e..3c5a5a80b1 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -514,6 +514,7 @@ type podResponseChecker struct { ns string label labels.Selector controllerName string + respondName bool // Whether the pod should respond with its own name. pods *api.PodList } @@ -535,16 +536,32 @@ func (r podResponseChecker) checkAllResponses() (done bool, err error) { Do(). Raw() if err != nil { - Logf("Controller %s: Failed to GET from replica %d (%s): %v:", r.controllerName, i+1, pod.Name, err) + Logf("Controller %s: Failed to GET from replica %d [%s]: %v:", r.controllerName, i+1, pod.Name, err) continue } - // The body should be the pod name. - if string(body) != pod.Name { - Logf("Controller %s: Replica %d expected response %s but got %s", r.controllerName, i+1, pod.Name, string(body)) - continue + // The response checker expects the pod's name unless !respondName, in + // which case it just checks for a non-empty response. + got := string(body) + what := "" + if r.respondName { + what = "expected" + want := pod.Name + if got != want { + Logf("Controller %s: Replica %d [%s] expected response %q but got %q", + r.controllerName, i+1, pod.Name, want, got) + continue + } + } else { + what = "non-empty" + if len(got) == 0 { + Logf("Controller %s: Replica %d [%s] expected non-empty response", + r.controllerName, i+1, pod.Name) + continue + } } successes++ - Logf("Controller %s: Got expected result from replica %d: %s, %d of %d required successes so far", r.controllerName, i+1, string(body), successes, len(r.pods.Items)) + Logf("Controller %s: Got %s result from replica %d [%s]: %q, %d of %d required successes so far", + r.controllerName, what, i+1, pod.Name, got, successes, len(r.pods.Items)) } if successes < len(r.pods.Items) { return false, nil @@ -1171,15 +1188,15 @@ func getSigner(provider string) (ssh.Signer, error) { } // checkPodsRunning returns whether all pods whose names are listed in podNames -// are running and ready. -func checkPodsRunningReady(c *client.Client, podNames []string, timeout time.Duration) bool { +// in namespace ns are running and ready, using c and waiting at most timeout. 
+func checkPodsRunningReady(c *client.Client, ns string, podNames []string, timeout time.Duration) bool { np, desc := len(podNames), "running and ready" Logf("Waiting up to %v for the following %d pods to be %s: %s", timeout, np, desc, podNames) result := make(chan bool, len(podNames)) for ix := range podNames { // Launch off pod readiness checkers. go func(name string) { - err := waitForPodCondition(c, api.NamespaceDefault, name, desc, timeout, podRunningReady) + err := waitForPodCondition(c, ns, name, desc, timeout, podRunningReady) result <- err == nil }(podNames[ix]) }