Merge pull request #9392 from jszczepkowski/e2e-net

Added e2e test case for network partition.
krousey 2015-06-08 11:35:28 -07:00
commit 2bb0fc00e5
2 changed files with 217 additions and 95 deletions
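The new test partitions a node from the master by inserting an iptables rule on the node that drops all outbound traffic to the master, and later deletes that rule to heal the partition. Below is a minimal standalone sketch of that mechanism; the addresses are made up, and plain ssh via os/exec stands in for the e2e framework's SSH helper and cloud config.

// Sketch only: partitionCmd/healCmd mirror dropCmd/undropCmd from the test;
// runOnNode stands in for the framework's SSH helper.
package main

import (
	"fmt"
	"os/exec"
)

// partitionCmd builds the iptables rule that drops all traffic from the node
// to the given master host (name or IP).
func partitionCmd(master string) string {
	return fmt.Sprintf("sudo iptables -I OUTPUT 1 -d %s -j DROP", master)
}

// healCmd removes that rule again.
func healCmd() string {
	return "sudo iptables --delete OUTPUT 1"
}

// runOnNode executes cmd on the node via plain ssh (the real test uses the
// e2e SSH helper against the node's external IP on port 22).
func runOnNode(nodeAddr, cmd string) error {
	out, err := exec.Command("ssh", nodeAddr, cmd).CombinedOutput()
	if err != nil {
		return fmt.Errorf("running %q on %s: %v (output: %s)", cmd, nodeAddr, err, out)
	}
	return nil
}

func main() {
	// Hypothetical addresses for illustration only.
	node, master := "203.0.113.10", "203.0.113.1"
	if err := runOnNode(node, partitionCmd(master)); err != nil {
		fmt.Println(err)
	}
	// ...the node goes NotReady and its pods get recreated elsewhere...
	if err := runOnNode(node, healCmd()); err != nil {
		fmt.Println(err)
	}
}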


@ -207,7 +207,23 @@ func verifyPodsResponding(c *client.Client, ns, name string, pods *api.PodList)
return wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, pods}.checkAllResponses)
}
var _ = Describe("ResizeNodes", func() {
func waitForPodsCreatedRunningResponding(c *client.Client, ns, name string, replicas int) error {
pods, err := waitForPodsCreated(c, ns, name, replicas)
if err != nil {
return err
}
e := waitForPodsRunning(c, pods)
if len(e) > 0 {
return fmt.Errorf("Failed to wait for pods running: %v", e)
}
err = verifyPodsResponding(c, ns, name, pods)
if err != nil {
return err
}
return nil
}
var _ = Describe("Nodes", func() {
supportedProviders := []string{"gce"}
var testName string
var c *client.Client
@ -223,116 +239,173 @@ var _ = Describe("ResizeNodes", func() {
})
AfterEach(func() {
if !providerIs(supportedProviders...) {
return
}
By(fmt.Sprintf("destroying namespace for this suite %s", ns))
if err := c.Namespaces().Delete(ns); err != nil {
Failf("Couldn't delete namespace '%s', %v", ns, err)
}
By("restoring the original node instance group size")
if err := resizeNodeInstanceGroup(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err)
}
if err := waitForNodeInstanceGroupSize(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err)
}
if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original cluster size: %v", err)
}
})
testName = "should be able to delete nodes."
It(testName, func() {
Logf("starting test %s", testName)
Describe("Resize", func() {
BeforeEach(func() {
if !providerIs(supportedProviders...) {
Failf("Nodes.Resize test is only supported for providers %v (not %s). You can avoid this failure by using ginkgo.skip=Nodes.Resize in your environment.",
supportedProviders, testContext.Provider)
}
})
if !providerIs(supportedProviders...) {
By(fmt.Sprintf("Skipping %s test, which is only supported for providers %v (not %s)",
testName, supportedProviders, testContext.Provider))
return
}
AfterEach(func() {
if !providerIs(supportedProviders...) {
return
}
By("restoring the original node instance group size")
if err := resizeNodeInstanceGroup(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err)
}
if err := waitForNodeInstanceGroupSize(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err)
}
if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original cluster size: %v", err)
}
})
if testContext.CloudConfig.NumNodes < 2 {
By(fmt.Sprintf("skipping %s test, which requires at lease 2 nodes (not %d)",
testName, testContext.CloudConfig.NumNodes))
return
}
testName = "should be able to delete nodes."
It(testName, func() {
Logf("starting test %s", testName)
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-delete-node-" + string(util.NewUUID())
replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas)
pods, err := waitForPodsCreated(c, ns, name, replicas)
Expect(err).NotTo(HaveOccurred())
e := waitForPodsRunning(c, pods)
if len(e) > 0 {
Failf("Failed to wait for pods running: %v", e)
}
err = verifyPodsResponding(c, ns, name, pods)
Expect(err).NotTo(HaveOccurred())
if testContext.CloudConfig.NumNodes < 2 {
Failf("Failing test %s as it requires at lease 2 nodes (not %d)", testName, testContext.CloudConfig.NumNodes)
return
}
By(fmt.Sprintf("decreasing cluster size to %d", replicas-1))
err = resizeNodeInstanceGroup(replicas - 1)
Expect(err).NotTo(HaveOccurred())
err = waitForNodeInstanceGroupSize(replicas - 1)
Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas-1)
Expect(err).NotTo(HaveOccurred())
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-delete-node"
replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
Expect(err).NotTo(HaveOccurred())
By("verifying whether the pods from the removed node are recreated")
pods, err = waitForPodsCreated(c, ns, name, replicas)
Expect(err).NotTo(HaveOccurred())
e = waitForPodsRunning(c, pods)
if len(e) > 0 {
Failf("Failed to wait for pods running: %v", e)
}
err = verifyPodsResponding(c, ns, name, pods)
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("decreasing cluster size to %d", replicas-1))
err = resizeNodeInstanceGroup(replicas - 1)
Expect(err).NotTo(HaveOccurred())
err = waitForNodeInstanceGroupSize(replicas - 1)
Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas-1)
Expect(err).NotTo(HaveOccurred())
By("verifying whether the pods from the removed node are recreated")
err = waitForPodsCreatedRunningResponding(c, ns, name, replicas)
Expect(err).NotTo(HaveOccurred())
})
testName = "should be able to add nodes."
It(testName, func() {
Logf("starting test %s", testName)
if testContext.CloudConfig.NumNodes < 2 {
Failf("Failing test %s as it requires at lease 2 nodes (not %d)", testName, testContext.CloudConfig.NumNodes)
return
}
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-add-node"
createServiceWithNameSelector(c, ns, name)
replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("increasing cluster size to %d", replicas+1))
err = resizeNodeInstanceGroup(replicas + 1)
Expect(err).NotTo(HaveOccurred())
err = waitForNodeInstanceGroupSize(replicas + 1)
Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas+1)
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1))
err = resizeReplicationController(c, ns, name, replicas+1)
Expect(err).NotTo(HaveOccurred())
err = waitForPodsCreatedRunningResponding(c, ns, name, replicas+1)
Expect(err).NotTo(HaveOccurred())
})
})
testName = "should be able to add nodes."
It(testName, func() {
Logf("starting test %s", testName)
Describe("Network", func() {
BeforeEach(func() {
if !providerIs(supportedProviders...) {
Failf("Nodes.Network test is only supported for providers %v (not %s). You can avoid this failure by using ginkgo.skip=Nodes.Network in your environment.",
supportedProviders, testContext.Provider)
}
})
if !providerIs(supportedProviders...) {
By(fmt.Sprintf("Skipping %s test, which is only supported for providers %v (not %s)",
testName, supportedProviders, testContext.Provider))
return
}
testName = "should survive network partition."
It(testName, func() {
if testContext.CloudConfig.NumNodes < 2 {
By(fmt.Sprintf("skipping %s test, which requires at lease 2 nodes (not %d)",
testName, testContext.CloudConfig.NumNodes))
return
}
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-add-node-" + string(util.NewUUID())
createServiceWithNameSelector(c, ns, name)
replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas)
pods, err := waitForPodsCreated(c, ns, name, replicas)
Expect(err).NotTo(HaveOccurred())
e := waitForPodsRunning(c, pods)
if len(e) > 0 {
Failf("Failed to wait for pods running: %v", e)
}
err = verifyPodsResponding(c, ns, name, pods)
Expect(err).NotTo(HaveOccurred())
// Create a replication controller for a service that serves its hostname.
// The source for the Docker container kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-net"
createServiceWithNameSelector(c, ns, name)
replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas)
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("increasing cluster size to %d", replicas+1))
err = resizeNodeInstanceGroup(replicas + 1)
Expect(err).NotTo(HaveOccurred())
err = waitForNodeInstanceGroupSize(replicas + 1)
Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas+1)
Expect(err).NotTo(HaveOccurred())
By("cause network partition on one node")
nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
Expect(err).NotTo(HaveOccurred())
node := nodelist.Items[0]
pod, err := waitForRCPodOnNode(c, ns, name, node.Name)
Expect(err).NotTo(HaveOccurred())
Logf("Getting external IP address for %s", name)
host := ""
for _, a := range node.Status.Addresses {
if a.Type == api.NodeExternalIP {
host = a.Address + ":22"
break
}
}
Logf("Setting network partition on %s", node.Name)
dropCmd := fmt.Sprintf("sudo iptables -I OUTPUT 1 -d %s -j DROP", testContext.CloudConfig.MasterName)
if _, _, code, err := SSH(dropCmd, host, testContext.Provider); code != 0 || err != nil {
Failf("Expected 0 exit code and nil error when running %s on %s, got %d and %v",
dropCmd, node, code, err)
}
By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1))
resizeReplicationController(c, ns, name, replicas+1)
pods, err = waitForPodsCreated(c, ns, name, replicas+1)
Expect(err).NotTo(HaveOccurred())
e = waitForPodsRunning(c, pods)
if len(e) > 0 {
Failf("Failed to wait for pods running: %v", e)
}
err = verifyPodsResponding(c, ns, name, pods)
Expect(err).NotTo(HaveOccurred())
Logf("Waiting for node %s to be not ready", node.Name)
waitForNodeToBe(c, node.Name, false, 2*time.Minute)
Logf("Waiting for pod %s to be removed", pod.Name)
waitForRCPodToDisappear(c, ns, name, pod.Name)
By("verifying whether the pod from the partitioned node is recreated")
err = waitForPodsCreatedRunningResponding(c, ns, name, replicas)
Expect(err).NotTo(HaveOccurred())
By("remove network partition")
undropCmd := "sudo iptables --delete OUTPUT 1"
if _, _, code, err := SSH(undropCmd, host, testContext.Provider); code != 0 || err != nil {
Failf("Expected 0 exit code and nil error when running %s on %s, got %d and %v",
undropCmd, node, code, err)
}
Logf("Waiting for node %s to be ready", node.Name)
waitForNodeToBe(c, node.Name, true, 2*time.Minute)
By("verify wheter new pods can be created on the re-attached node")
err = resizeReplicationController(c, ns, name, replicas+1)
Expect(err).NotTo(HaveOccurred())
err = waitForPodsCreatedRunningResponding(c, ns, name, replicas+1)
Expect(err).NotTo(HaveOccurred())
_, err = waitForRCPodOnNode(c, ns, name, node.Name)
Expect(err).NotTo(HaveOccurred())
})
})
})


@ -42,6 +42,7 @@ import (
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
"code.google.com/p/go-uuid/uuid"
@ -397,6 +398,54 @@ func waitForPodSuccess(c *client.Client, podName string, contName string) error
return waitForPodSuccessInNamespace(c, podName, contName, api.NamespaceDefault)
}
// waitForRCPodOnNode returns the pod from the given replication controller (described by rcName) which is scheduled on the given node.
// In case of failure or too long waiting time, an error is returned.
func waitForRCPodOnNode(c *client.Client, ns, rcName, node string) (*api.Pod, error) {
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": rcName}))
var p *api.Pod = nil
err := wait.Poll(10*time.Second, 5*time.Minute, func() (bool, error) {
Logf("Waiting for pod %s to appear on node %s", rcName, node)
pods, err := c.Pods(ns).List(label, fields.Everything())
if err != nil {
return false, err
}
for _, pod := range pods.Items {
if pod.Spec.NodeName == node {
Logf("Pod %s found on node %s", pod.Name, node)
p = &pod
return true, nil
}
}
return false, nil
})
return p, err
}
// waitForRCPodToDisappear returns nil if the pod from the given replication controller (described by rcName) no longer exists.
// In case of failure or too long waiting time, an error is returned.
func waitForRCPodToDisappear(c *client.Client, ns, rcName, podName string) error {
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": rcName}))
return wait.Poll(20*time.Second, 5*time.Minute, func() (bool, error) {
Logf("Waiting for pod %s to disappear", podName)
pods, err := c.Pods(ns).List(label, fields.Everything())
if err != nil {
return false, err
}
found := false
for _, pod := range pods.Items {
if pod.Name == podName {
Logf("Pod %s still exists", podName)
found = true
}
}
if !found {
Logf("Pod %s no longer exists", podName)
return true, nil
}
return false, nil
})
}
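Both new helpers are built on the wait.Poll condition contract: return (true, nil) to stop successfully, (false, nil) to keep polling, or a non-nil error to abort immediately (as they do when listing pods fails). The toy, self-contained sketch below illustrates that contract; the simplified poll function stands in for pkg/util/wait and is not the real implementation.

package main

import (
	"errors"
	"fmt"
	"time"
)

// poll is a simplified stand-in for wait.Poll: it runs condition every
// interval until it returns (true, nil), a non-nil error, or the timeout expires.
func poll(interval, timeout time.Duration, condition func() (bool, error)) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		done, err := condition()
		if err != nil {
			return err // hard failure, e.g. the pod List call erroring out
		}
		if done {
			return nil
		}
		time.Sleep(interval)
	}
	return errors.New("timed out waiting for the condition")
}

func main() {
	attempts := 0
	// Mirrors waitForRCPodToDisappear: keep polling until the pod is gone.
	err := poll(10*time.Millisecond, time.Second, func() (bool, error) {
		attempts++
		if attempts < 3 {
			fmt.Printf("attempt %d: pod still exists\n", attempts)
			return false, nil
		}
		fmt.Printf("attempt %d: pod no longer exists\n", attempts)
		return true, nil
	})
	fmt.Println("result:", err)
}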
// Context for checking pod responses by issuing GETs to them and verifying that they answer with their pod name.
type podResponseChecker struct {
c *client.Client