From 445ae0f576c2cca9b51aa53ec53432786b314d0f Mon Sep 17 00:00:00 2001
From: Jerzy Szczepkowski
Date: Mon, 8 Jun 2015 10:52:37 +0200
Subject: [PATCH] Added e2e test case for network partition.

Added e2e test case which verifies that a node can return to the
cluster after a longer network partition. Valid for GCE.
---
 test/e2e/resize_nodes.go | 263 +++++++++++++++++++++++++--------------
 test/e2e/util.go         |  49 ++++++++
 2 files changed, 217 insertions(+), 95 deletions(-)

diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go
index 75032ffc09..4bdbe627f5 100644
--- a/test/e2e/resize_nodes.go
+++ b/test/e2e/resize_nodes.go
@@ -207,7 +207,23 @@ func verifyPodsResponding(c *client.Client, ns, name string, pods *api.PodList)
 	return wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, pods}.checkAllResponses)
 }
 
-var _ = Describe("ResizeNodes", func() {
+func waitForPodsCreatedRunningResponding(c *client.Client, ns, name string, replicas int) error {
+	pods, err := waitForPodsCreated(c, ns, name, replicas)
+	if err != nil {
+		return err
+	}
+	e := waitForPodsRunning(c, pods)
+	if len(e) > 0 {
+		return fmt.Errorf("Failed to wait for pods running: %v", e)
+	}
+	err = verifyPodsResponding(c, ns, name, pods)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+var _ = Describe("Nodes", func() {
 	supportedProviders := []string{"gce"}
 	var testName string
 	var c *client.Client
@@ -223,116 +239,173 @@ var _ = Describe("ResizeNodes", func() {
 	})
 
 	AfterEach(func() {
-		if !providerIs(supportedProviders...) {
-			return
-		}
 		By(fmt.Sprintf("destroying namespace for this suite %s", ns))
 		if err := c.Namespaces().Delete(ns); err != nil {
 			Failf("Couldn't delete namespace '%s', %v", ns, err)
 		}
-		By("restoring the original node instance group size")
-		if err := resizeNodeInstanceGroup(testContext.CloudConfig.NumNodes); err != nil {
-			Failf("Couldn't restore the original node instance group size: %v", err)
-		}
-		if err := waitForNodeInstanceGroupSize(testContext.CloudConfig.NumNodes); err != nil {
-			Failf("Couldn't restore the original node instance group size: %v", err)
-		}
-		if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes); err != nil {
-			Failf("Couldn't restore the original cluster size: %v", err)
-		}
 	})
 
-	testName = "should be able to delete nodes."
-	It(testName, func() {
-		Logf("starting test %s", testName)
+	Describe("Resize", func() {
+		BeforeEach(func() {
+			if !providerIs(supportedProviders...) {
+				Failf("Nodes.Resize test is only supported for providers %v (not %s). You can avoid this failure by using ginkgo.skip=Nodes.Resize in your environment.",
+					supportedProviders, testContext.Provider)
+			}
+		})
 
-		if !providerIs(supportedProviders...) {
-			By(fmt.Sprintf("Skipping %s test, which is only supported for providers %v (not %s)",
-				testName, supportedProviders, testContext.Provider))
-			return
-		}
+		AfterEach(func() {
+			if !providerIs(supportedProviders...) {
+				return
+			}
+			By("restoring the original node instance group size")
+			if err := resizeNodeInstanceGroup(testContext.CloudConfig.NumNodes); err != nil {
+				Failf("Couldn't restore the original node instance group size: %v", err)
+			}
+			if err := waitForNodeInstanceGroupSize(testContext.CloudConfig.NumNodes); err != nil {
+				Failf("Couldn't restore the original node instance group size: %v", err)
+			}
+			if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes); err != nil {
+				Failf("Couldn't restore the original cluster size: %v", err)
+			}
+		})
 
-		if testContext.CloudConfig.NumNodes < 2 {
-			By(fmt.Sprintf("skipping %s test, which requires at lease 2 nodes (not %d)",
-				testName, testContext.CloudConfig.NumNodes))
-			return
-		}
+		testName = "should be able to delete nodes."
+		It(testName, func() {
+			Logf("starting test %s", testName)
 
-		// Create a replication controller for a service that serves its hostname.
-		// The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
-		name := "my-hostname-delete-node-" + string(util.NewUUID())
-		replicas := testContext.CloudConfig.NumNodes
-		createServeHostnameReplicationController(c, ns, name, replicas)
-		pods, err := waitForPodsCreated(c, ns, name, replicas)
-		Expect(err).NotTo(HaveOccurred())
-		e := waitForPodsRunning(c, pods)
-		if len(e) > 0 {
-			Failf("Failed to wait for pods running: %v", e)
-		}
-		err = verifyPodsResponding(c, ns, name, pods)
-		Expect(err).NotTo(HaveOccurred())
+			if testContext.CloudConfig.NumNodes < 2 {
+				Failf("Failing test %s as it requires at least 2 nodes (not %d)", testName, testContext.CloudConfig.NumNodes)
+				return
+			}
 
-		By(fmt.Sprintf("decreasing cluster size to %d", replicas-1))
-		err = resizeNodeInstanceGroup(replicas - 1)
-		Expect(err).NotTo(HaveOccurred())
-		err = waitForNodeInstanceGroupSize(replicas - 1)
-		Expect(err).NotTo(HaveOccurred())
-		err = waitForClusterSize(c, replicas-1)
-		Expect(err).NotTo(HaveOccurred())
+			// Create a replication controller for a service that serves its hostname.
+			// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
+			name := "my-hostname-delete-node"
+			replicas := testContext.CloudConfig.NumNodes
+			createServeHostnameReplicationController(c, ns, name, replicas)
+			err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
+			Expect(err).NotTo(HaveOccurred())
 
-		By("verifying whether the pods from the removed node are recreated")
-		pods, err = waitForPodsCreated(c, ns, name, replicas)
-		Expect(err).NotTo(HaveOccurred())
-		e = waitForPodsRunning(c, pods)
-		if len(e) > 0 {
-			Failf("Failed to wait for pods running: %v", e)
-		}
-		err = verifyPodsResponding(c, ns, name, pods)
-		Expect(err).NotTo(HaveOccurred())
+			By(fmt.Sprintf("decreasing cluster size to %d", replicas-1))
+			err = resizeNodeInstanceGroup(replicas - 1)
+			Expect(err).NotTo(HaveOccurred())
+			err = waitForNodeInstanceGroupSize(replicas - 1)
+			Expect(err).NotTo(HaveOccurred())
+			err = waitForClusterSize(c, replicas-1)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("verifying whether the pods from the removed node are recreated")
+			err = waitForPodsCreatedRunningResponding(c, ns, name, replicas)
+			Expect(err).NotTo(HaveOccurred())
+		})
+
+		testName = "should be able to add nodes."
+		It(testName, func() {
+			Logf("starting test %s", testName)
+
+			if testContext.CloudConfig.NumNodes < 2 {
+				Failf("Failing test %s as it requires at least 2 nodes (not %d)", testName, testContext.CloudConfig.NumNodes)
+				return
+			}
+
+			// Create a replication controller for a service that serves its hostname.
+			// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
+			name := "my-hostname-add-node"
+			createServiceWithNameSelector(c, ns, name)
+			replicas := testContext.CloudConfig.NumNodes
+			createServeHostnameReplicationController(c, ns, name, replicas)
+			err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
+			Expect(err).NotTo(HaveOccurred())
+
+			By(fmt.Sprintf("increasing cluster size to %d", replicas+1))
+			err = resizeNodeInstanceGroup(replicas + 1)
+			Expect(err).NotTo(HaveOccurred())
+			err = waitForNodeInstanceGroupSize(replicas + 1)
+			Expect(err).NotTo(HaveOccurred())
+			err = waitForClusterSize(c, replicas+1)
+			Expect(err).NotTo(HaveOccurred())
+
+			By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1))
+			err = resizeReplicationController(c, ns, name, replicas+1)
+			Expect(err).NotTo(HaveOccurred())
+			err = waitForPodsCreatedRunningResponding(c, ns, name, replicas+1)
+			Expect(err).NotTo(HaveOccurred())
+		})
 	})
 
-	testName = "should be able to add nodes."
-	It(testName, func() {
-		Logf("starting test %s", testName)
+	Describe("Network", func() {
+		BeforeEach(func() {
+			if !providerIs(supportedProviders...) {
+				Failf("Nodes.Network test is only supported for providers %v (not %s). You can avoid this failure by using ginkgo.skip=Nodes.Network in your environment.",
+					supportedProviders, testContext.Provider)
+			}
+		})
 
-		if !providerIs(supportedProviders...) {
-			By(fmt.Sprintf("Skipping %s test, which is only supported for providers %v (not %s)",
-				testName, supportedProviders, testContext.Provider))
-			return
-		}
+		testName = "should survive network partition."
+		It(testName, func() {
+			if testContext.CloudConfig.NumNodes < 2 {
+				By(fmt.Sprintf("skipping %s test, which requires at least 2 nodes (not %d)",
+					testName, testContext.CloudConfig.NumNodes))
+				return
+			}
 
-		// Create a replication controller for a service that serves its hostname.
-		// The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
-		name := "my-hostname-add-node-" + string(util.NewUUID())
-		createServiceWithNameSelector(c, ns, name)
-		replicas := testContext.CloudConfig.NumNodes
-		createServeHostnameReplicationController(c, ns, name, replicas)
-		pods, err := waitForPodsCreated(c, ns, name, replicas)
-		Expect(err).NotTo(HaveOccurred())
-		e := waitForPodsRunning(c, pods)
-		if len(e) > 0 {
-			Failf("Failed to wait for pods running: %v", e)
-		}
-		err = verifyPodsResponding(c, ns, name, pods)
-		Expect(err).NotTo(HaveOccurred())
+			// Create a replication controller for a service that serves its hostname.
+			// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
+			name := "my-hostname-net"
+			createServiceWithNameSelector(c, ns, name)
+			replicas := testContext.CloudConfig.NumNodes
+			createServeHostnameReplicationController(c, ns, name, replicas)
+			err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
+			Expect(err).NotTo(HaveOccurred())
 
-		By(fmt.Sprintf("increasing cluster size to %d", replicas+1))
-		err = resizeNodeInstanceGroup(replicas + 1)
-		Expect(err).NotTo(HaveOccurred())
-		err = waitForNodeInstanceGroupSize(replicas + 1)
-		Expect(err).NotTo(HaveOccurred())
-		err = waitForClusterSize(c, replicas+1)
-		Expect(err).NotTo(HaveOccurred())
+			By("cause network partition on one node")
+			nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
+			Expect(err).NotTo(HaveOccurred())
+			node := nodelist.Items[0]
+			pod, err := waitForRCPodOnNode(c, ns, name, node.Name)
+			Expect(err).NotTo(HaveOccurred())
+			Logf("Getting external IP address for %s", name)
+			host := ""
+			for _, a := range node.Status.Addresses {
+				if a.Type == api.NodeExternalIP {
+					host = a.Address + ":22"
+					break
+				}
+			}
+			Logf("Setting network partition on %s", node.Name)
+			dropCmd := fmt.Sprintf("sudo iptables -I OUTPUT 1 -d %s -j DROP", testContext.CloudConfig.MasterName)
+			if _, _, code, err := SSH(dropCmd, host, testContext.Provider); code != 0 || err != nil {
+				Failf("Expected 0 exit code and nil error when running %s on %s, got %d and %v",
+					dropCmd, node, code, err)
+			}
 
-		By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1))
-		resizeReplicationController(c, ns, name, replicas+1)
-		pods, err = waitForPodsCreated(c, ns, name, replicas+1)
-		Expect(err).NotTo(HaveOccurred())
-		e = waitForPodsRunning(c, pods)
-		if len(e) > 0 {
-			Failf("Failed to wait for pods running: %v", e)
-		}
-		err = verifyPodsResponding(c, ns, name, pods)
-		Expect(err).NotTo(HaveOccurred())
+			Logf("Waiting for node %s to be not ready", node.Name)
+			waitForNodeToBe(c, node.Name, false, 2*time.Minute)
+
+			Logf("Waiting for pod %s to be removed", pod.Name)
+			waitForRCPodToDisappear(c, ns, name, pod.Name)
+
+			By("verifying whether the pod from the partitioned node is recreated")
+			err = waitForPodsCreatedRunningResponding(c, ns, name, replicas)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("remove network partition")
+			undropCmd := "sudo iptables --delete OUTPUT 1"
+			if _, _, code, err := SSH(undropCmd, host, testContext.Provider); code != 0 || err != nil {
+				Failf("Expected 0 exit code and nil error when running %s on %s, got %d and %v",
+					undropCmd, node, code, err)
+			}
+
+			Logf("Waiting for node %s to be ready", node.Name)
+			waitForNodeToBe(c, node.Name, true, 2*time.Minute)
+
+			By("verify whether new pods can be created on the re-attached node")
+			err = resizeReplicationController(c, ns, name, replicas+1)
+			Expect(err).NotTo(HaveOccurred())
+			err = waitForPodsCreatedRunningResponding(c, ns, name, replicas+1)
+			Expect(err).NotTo(HaveOccurred())
+			_, err = waitForRCPodOnNode(c, ns, name, node.Name)
+			Expect(err).NotTo(HaveOccurred())
+		})
 	})
 })
diff --git a/test/e2e/util.go b/test/e2e/util.go
index 30354f4677..0c4cb51284 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -42,6 +42,7 @@ import (
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch" "code.google.com/p/go-uuid/uuid" @@ -396,6 +397,54 @@ func waitForPodSuccess(c *client.Client, podName string, contName string) error return waitForPodSuccessInNamespace(c, podName, contName, api.NamespaceDefault) } +// waitForRCPodOnNode returns the pod from the given replication controller (decribed by rcName) which is scheduled on the given node. +// In case of failure or too long waiting time, an error is returned. +func waitForRCPodOnNode(c *client.Client, ns, rcName, node string) (*api.Pod, error) { + label := labels.SelectorFromSet(labels.Set(map[string]string{"name": rcName})) + var p *api.Pod = nil + err := wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) { + Logf("Waiting for pod %s to appear on node %s", rcName, node) + pods, err := c.Pods(ns).List(label, fields.Everything()) + if err != nil { + return false, err + } + for _, pod := range pods.Items { + if pod.Spec.NodeName == node { + Logf("Pod %s found on node %s", pod.Name, node) + p = &pod + return true, nil + } + } + return false, nil + }) + return p, err +} + +// waitForRCPodOnNode returns nil if the pod from the given replication controller (decribed by rcName) no longer exists. +// In case of failure or too long waiting time, an error is returned. +func waitForRCPodToDisappear(c *client.Client, ns, rcName, podName string) error { + label := labels.SelectorFromSet(labels.Set(map[string]string{"name": rcName})) + return wait.Poll(20*time.Second, 5*time.Minute, func() (bool, error) { + Logf("Waiting for pod %s to disappear", podName) + pods, err := c.Pods(ns).List(label, fields.Everything()) + if err != nil { + return false, err + } + found := false + for _, pod := range pods.Items { + if pod.Name == podName { + Logf("Pod %s still exists", podName) + found = true + } + } + if !found { + Logf("Pod %s no longer exists", podName) + return true, nil + } + return false, nil + }) +} + // Context for checking pods responses by issuing GETs to them and verifying if the answer with pod name. type podResponseChecker struct { c *client.Client