mirror of https://github.com/k3s-io/k3s

Multiple MIGs in e2e cluster autoscaling tests.

Implemented support for multiple MIGs in e2e cluster autoscaling tests.

parent 0a6a52b19d
commit 93e5b12a06
@@ -72,6 +72,19 @@ else
    NODE_INSTANCE_GROUP=""
fi

if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
  set_num_migs
  NODE_INSTANCE_GROUP=""
  for ((i=1; i<=${NUM_MIGS}; i++)); do
    if [[ $i == ${NUM_MIGS} ]]; then
      # We are assigning the same mig names as create-nodes function from cluster/gce/util.sh.
      NODE_INSTANCE_GROUP="${NODE_INSTANCE_GROUP}${NODE_INSTANCE_PREFIX}-group"
    else
      NODE_INSTANCE_GROUP="${NODE_INSTANCE_GROUP}${NODE_INSTANCE_PREFIX}-group-${i},"
    fi
  done
fi

if [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
  detect-node-instance-group
fi
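Illustrative sketch (not part of the commit): it mirrors the naming scheme the loop above builds, assuming a hypothetical NUM_MIGS of 3 and a hypothetical NODE_INSTANCE_PREFIX of "e2e-test-minion", and shows how the resulting comma-separated value splits back into individual MIG names, as the Go tests below do with strings.Split.

package main

import (
	"fmt"
	"strings"
)

func main() {
	numMIGs := 3                // hypothetical NUM_MIGS
	prefix := "e2e-test-minion" // hypothetical NODE_INSTANCE_PREFIX
	group := ""
	for i := 1; i <= numMIGs; i++ {
		if i == numMIGs {
			// The last MIG keeps the plain "<prefix>-group" name.
			group += prefix + "-group"
		} else {
			group += fmt.Sprintf("%s-group-%d,", prefix, i)
		}
	}
	fmt.Println(group)
	// e2e-test-minion-group-1,e2e-test-minion-group-2,e2e-test-minion-group
	fmt.Println(strings.Split(group, ",")) // per-MIG names the e2e tests iterate over
}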
@@ -18,6 +18,7 @@ package e2e

import (
	"fmt"
	"strings"
	"time"

	"k8s.io/kubernetes/pkg/api"
@@ -37,11 +38,12 @@ const (
	scaleDownTimeout = 15 * time.Minute
)

var _ = framework.KubeDescribe("Cluster size autoscaling scale up [Feature:ClusterSizeAutoscaling] [Slow]", func() {
var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
	f := framework.NewDefaultFramework("autoscaling")
	var nodeCount int
	var coresPerNode int
	var memCapacityMb int
	var originalSizes map[string]int

	BeforeEach(func() {
		framework.SkipUnlessProviderIs("gce")
@@ -53,57 +55,54 @@ var _ = framework.KubeDescribe("Cluster size autoscaling scale up [Feature:Clust
		mem := nodes.Items[0].Status.Capacity[api.ResourceMemory]
		coresPerNode = int((&cpu).MilliValue() / 1000)
		memCapacityMb = int((&mem).Value() / 1024 / 1024)

		originalSizes = make(map[string]int)
		sum := 0
		for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
			size, err := GroupSize(mig)
			framework.ExpectNoError(err)
			By(fmt.Sprintf("Initial size of %s: %d", mig, size))
			originalSizes[mig] = size
			sum += size
		}
		Expect(nodeCount).Should(Equal(sum))
	})

	It("Should correctly handle pending pods", func() {
		By("Too large pending pod does not increase cluster size")
	It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		ReserveMemory(f, "memory-reservation", 1, memCapacityMb, false)
		// Verify, that cluster size is not changed.
		// TODO: find a better way of verification that the cluster size will remain unchanged.
		// TODO: find a better way of verification that the cluster size will remain unchanged using events.
		time.Sleep(scaleUpTimeout)
		framework.ExpectNoError(framework.WaitForClusterSize(f.Client, nodeCount, scaleUpTimeout))
		framework.ExpectNoError(WaitForClusterSizeFunc(f.Client,
			func(size int) bool { return size <= nodeCount }, scaleDownTimeout))
		framework.ExpectNoError(framework.DeleteRC(f.Client, f.Namespace.Name, "memory-reservation"))
		framework.ExpectNoError(ResizeGroup(int32(nodeCount)))
		framework.ExpectNoError(framework.WaitForClusterSize(f.Client, nodeCount, resizeTimeout))
		framework.ExpectNoError(WaitForClusterSizeFunc(f.Client,
			func(size int) bool { return size <= nodeCount }, scaleDownTimeout))
	})

	By("Small pending pods increase cluster size")
	It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
		// Verify, that cluster size is increased
		framework.ExpectNoError(WaitForClusterSizeFunc(f.Client,
			func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
		framework.ExpectNoError(framework.DeleteRC(f.Client, f.Namespace.Name, "memory-reservation"))
		framework.ExpectNoError(ResizeGroup(int32(nodeCount)))
		framework.ExpectNoError(framework.WaitForClusterSize(f.Client, nodeCount, resizeTimeout))
		restoreSizes(originalSizes)
		framework.ExpectNoError(WaitForClusterSizeFunc(f.Client,
			func(size int) bool { return size <= nodeCount }, scaleDownTimeout))
	})

	By("Handling node port pods")
	It("should increase cluster size if pods are pending due to host port conflict [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		CreateHostPortPods(f, "host-port", nodeCount+2, false)
		framework.ExpectNoError(WaitForClusterSizeFunc(f.Client,
			func(size int) bool { return size >= nodeCount+2 }, scaleUpTimeout))
		framework.ExpectNoError(framework.DeleteRC(f.Client, f.Namespace.Name, "host-port"))
		restoreSizes(originalSizes)
		framework.ExpectNoError(WaitForClusterSizeFunc(f.Client,
			func(size int) bool { return size <= nodeCount }, scaleDownTimeout))

		framework.ExpectNoError(ResizeGroup(int32(nodeCount)))
		framework.ExpectNoError(framework.WaitForClusterSize(f.Client, nodeCount, resizeTimeout))
	})
})

var _ = framework.KubeDescribe("Cluster size autoscaling scale down[Feature:ClusterSizeAutoscalingScaleDown] [Slow]", func() {
	f := framework.NewDefaultFramework("autoscaling")
	var nodeCount int
	var coresPerNode int
	var memCapacityMb int

	BeforeEach(func() {
		framework.SkipUnlessProviderIs("gce")

		nodes := framework.GetReadySchedulableNodesOrDie(f.Client)
		nodeCount = len(nodes.Items)
		Expect(nodeCount).NotTo(BeZero())
		cpu := nodes.Items[0].Status.Capacity[api.ResourceCPU]
		mem := nodes.Items[0].Status.Capacity[api.ResourceMemory]
		coresPerNode = int((&cpu).MilliValue() / 1000)
		memCapacityMb = int((&mem).Value() / 1024 / 1024)
	})

	It("Should correctly handle pending and scale down after deletion", func() {
	It("should correctly handle pending and scale down after deletion [Feature:ClusterSizeAutoscalingScaleDown]", func() {
		By("Small pending pods increase cluster size")
		ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
		// Verify, that cluster size is increased
@@ -192,3 +191,16 @@ func WaitForClusterSizeFunc(c *client.Client, sizeFunc func(int) bool, timeout t
	}
	return fmt.Errorf("timeout waiting %v for appropriate cluster size", timeout)
}

func restoreSizes(sizes map[string]int) {
	By(fmt.Sprintf("Restoring initial size of the cluster"))
	for mig, desiredSize := range sizes {
		currentSize, err := GroupSize(mig)
		framework.ExpectNoError(err)
		if desiredSize != currentSize {
			By(fmt.Sprintf("Setting size of %s to %d", mig, desiredSize))
			err = ResizeGroup(mig, int32(desiredSize))
			framework.ExpectNoError(err)
		}
	}
}
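A minimal sketch of the restore pattern above (illustrative only; the group names, sizes, and stubbed helpers are hypothetical): each MIG recorded before the test is resized back only if its current size has drifted.

package main

import "fmt"

// Hypothetical current sizes standing in for GroupSize lookups.
var currentSizes = map[string]int{"e2e-test-minion-group-1": 4, "e2e-test-minion-group": 2}

// resizeGroup is a stub standing in for the real ResizeGroup helper.
func resizeGroup(mig string, size int) {
	fmt.Printf("resizing %s to %d\n", mig, size)
}

func main() {
	// Sizes captured before the test ran (hypothetical values).
	originalSizes := map[string]int{"e2e-test-minion-group-1": 2, "e2e-test-minion-group": 2}
	for mig, desiredSize := range originalSizes {
		if currentSizes[mig] != desiredSize {
			// Only the group that actually changed is touched:
			// here e2e-test-minion-group-1 goes back to 2.
			resizeGroup(mig, desiredSize)
		}
	}
}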
@@ -113,7 +113,7 @@ func RegisterFlags() {
	flag.StringVar(&cloudConfig.Zone, "gce-zone", "", "GCE zone being used, if applicable")
	flag.StringVar(&cloudConfig.ServiceAccount, "gce-service-account", "", "GCE service account to use for GCE API calls, if applicable")
	flag.StringVar(&cloudConfig.Cluster, "gke-cluster", "", "GKE name of cluster being used, if applicable")
	flag.StringVar(&cloudConfig.NodeInstanceGroup, "node-instance-group", "", "Name of the managed instance group for nodes. Valid only for gce, gke or aws")
	flag.StringVar(&cloudConfig.NodeInstanceGroup, "node-instance-group", "", "Name of the managed instance group for nodes. Valid only for gce, gke or aws. If there is more than one group: comma separated list of groups.")
	flag.IntVar(&cloudConfig.NumNodes, "num-nodes", -1, "Number of nodes in the cluster")

	flag.StringVar(&cloudConfig.ClusterTag, "cluster-tag", "", "Tag used to identify resources. Only required if provider is aws.")
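For illustration (the group names here are hypothetical), a cluster built from two MIGs would now be described to the tests with a comma-separated value:

    --node-instance-group=e2e-test-minion-group-1,e2e-test-minion-group

On GCE this value is assembled automatically by the shell change in the first hunk from NUM_MIGS and NODE_INSTANCE_PREFIX.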
@@ -53,7 +53,7 @@ const (
	testPort = 9376
)

func ResizeGroup(size int32) error {
func ResizeGroup(group string, size int32) error {
	if framework.TestContext.ReportDir != "" {
		framework.CoreDump(framework.TestContext.ReportDir)
		defer framework.CoreDump(framework.TestContext.ReportDir)

@@ -62,7 +62,7 @@ func ResizeGroup(size int32) error {
	// TODO: make this hit the compute API directly instead of shelling out to gcloud.
	// TODO: make gce/gke implement InstanceGroups, so we can eliminate the per-provider logic
	output, err := exec.Command("gcloud", "compute", "instance-groups", "managed", "resize",
		framework.TestContext.CloudConfig.NodeInstanceGroup, fmt.Sprintf("--size=%v", size),
		group, fmt.Sprintf("--size=%v", size),
		"--project="+framework.TestContext.CloudConfig.ProjectID, "--zone="+framework.TestContext.CloudConfig.Zone).CombinedOutput()
	if err != nil {
		framework.Logf("Failed to resize node instance group: %v", string(output))
@@ -70,18 +70,18 @@ func ResizeGroup(size int32) error {
		return err
	} else if framework.TestContext.Provider == "aws" {
		client := autoscaling.New(session.New())
		return awscloud.ResizeInstanceGroup(client, framework.TestContext.CloudConfig.NodeInstanceGroup, int(size))
		return awscloud.ResizeInstanceGroup(client, group, int(size))
	} else {
		return fmt.Errorf("Provider does not support InstanceGroups")
	}
}

func groupSize() (int, error) {
func GroupSize(group string) (int, error) {
	if framework.TestContext.Provider == "gce" || framework.TestContext.Provider == "gke" {
		// TODO: make this hit the compute API directly instead of shelling out to gcloud.
		// TODO: make gce/gke implement InstanceGroups, so we can eliminate the per-provider logic
		output, err := exec.Command("gcloud", "compute", "instance-groups", "managed",
			"list-instances", framework.TestContext.CloudConfig.NodeInstanceGroup, "--project="+framework.TestContext.CloudConfig.ProjectID,
			"list-instances", group, "--project="+framework.TestContext.CloudConfig.ProjectID,
			"--zone="+framework.TestContext.CloudConfig.Zone).CombinedOutput()
		if err != nil {
			return -1, err

@@ -90,12 +90,12 @@ func groupSize() (int, error) {
		return len(re.FindAllString(string(output), -1)), nil
	} else if framework.TestContext.Provider == "aws" {
		client := autoscaling.New(session.New())
		instanceGroup, err := awscloud.DescribeInstanceGroup(client, framework.TestContext.CloudConfig.NodeInstanceGroup)
		instanceGroup, err := awscloud.DescribeInstanceGroup(client, group)
		if err != nil {
			return -1, fmt.Errorf("error describing instance group: %v", err)
		}
		if instanceGroup == nil {
			return -1, fmt.Errorf("instance group not found: %s", framework.TestContext.CloudConfig.NodeInstanceGroup)
			return -1, fmt.Errorf("instance group not found: %s", group)
		}
		return instanceGroup.CurrentSize()
	} else {
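Concretely, with the group argument threaded through, the two gcloud calls above resolve to commands of this shape (project, zone, and group name are hypothetical placeholders):

    gcloud compute instance-groups managed resize e2e-test-minion-group-1 --size=3 --project=my-project --zone=us-central1-b
    gcloud compute instance-groups managed list-instances e2e-test-minion-group-1 --project=my-project --zone=us-central1-b

GroupSize counts the instances returned by list-instances to report the current size of a single MIG.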
@@ -103,10 +103,10 @@ func groupSize() (int, error) {
	}
}

func waitForGroupSize(size int32) error {
func WaitForGroupSize(group string, size int32) error {
	timeout := 10 * time.Minute
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
		currentSize, err := groupSize()
		currentSize, err := GroupSize(group)
		if err != nil {
			framework.Logf("Failed to get node instance group size: %v", err)
			continue
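The truncated loop above follows the usual poll-until-timeout pattern. A self-contained sketch of that pattern (the interval, timeout, and fake size source are illustrative choices, not values taken from the commit):

package main

import (
	"fmt"
	"time"
)

// waitForSize polls getSize until it reports want or the timeout elapses.
func waitForSize(getSize func() (int, error), want int, interval, timeout time.Duration) error {
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(interval) {
		current, err := getSize()
		if err != nil {
			fmt.Printf("failed to get group size: %v\n", err)
			continue
		}
		if current == want {
			return nil
		}
	}
	return fmt.Errorf("timeout waiting %v for group size %d", timeout, want)
}

func main() {
	size := 1
	// Fake size source that grows by one node per poll.
	getSize := func() (int, error) { size++; return size, nil }
	fmt.Println(waitForSize(getSize, 3, 10*time.Millisecond, time.Second)) // <nil>
}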
@@ -347,13 +347,19 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
	var c *client.Client
	var ns string
	ignoreLabels := framework.ImagePullerLabels
	var group string

	BeforeEach(func() {
		c = f.Client
		ns = f.Namespace.Name
		systemPods, err := framework.GetPodsInNamespace(c, ns, ignoreLabels)
		Expect(err).NotTo(HaveOccurred())
		systemPodsNo = int32(len(systemPods))

		if strings.Index(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") >= 0 {
			framework.Failf("Test does not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup)
		} else {
			group = framework.TestContext.CloudConfig.NodeInstanceGroup
		}
	})

	// Slow issue #13323 (8 min)
@@ -373,7 +379,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
		}

		By("restoring the original node instance group size")
		if err := ResizeGroup(int32(framework.TestContext.CloudConfig.NumNodes)); err != nil {
		if err := ResizeGroup(group, int32(framework.TestContext.CloudConfig.NumNodes)); err != nil {
			framework.Failf("Couldn't restore the original node instance group size: %v", err)
		}
		// In GKE, our current tunneling setup has the potential to hold on to a broken tunnel (from a
@@ -388,7 +394,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
			By("waiting 5 minutes for all dead tunnels to be dropped")
			time.Sleep(5 * time.Minute)
		}
		if err := waitForGroupSize(int32(framework.TestContext.CloudConfig.NumNodes)); err != nil {
		if err := WaitForGroupSize(group, int32(framework.TestContext.CloudConfig.NumNodes)); err != nil {
			framework.Failf("Couldn't restore the original node instance group size: %v", err)
		}
		if err := framework.WaitForClusterSize(c, framework.TestContext.CloudConfig.NumNodes, 10*time.Minute); err != nil {
@@ -412,9 +418,9 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
		Expect(err).NotTo(HaveOccurred())

		By(fmt.Sprintf("decreasing cluster size to %d", replicas-1))
		err = ResizeGroup(replicas - 1)
		err = ResizeGroup(group, replicas-1)
		Expect(err).NotTo(HaveOccurred())
		err = waitForGroupSize(replicas - 1)
		err = WaitForGroupSize(group, replicas-1)
		Expect(err).NotTo(HaveOccurred())
		err = framework.WaitForClusterSize(c, int(replicas-1), 10*time.Minute)
		Expect(err).NotTo(HaveOccurred())
@@ -436,9 +442,9 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
		Expect(err).NotTo(HaveOccurred())

		By(fmt.Sprintf("increasing cluster size to %d", replicas+1))
		err = ResizeGroup(replicas + 1)
		err = ResizeGroup(group, replicas+1)
		Expect(err).NotTo(HaveOccurred())
		err = waitForGroupSize(replicas + 1)
		err = WaitForGroupSize(group, replicas+1)
		Expect(err).NotTo(HaveOccurred())
		err = framework.WaitForClusterSize(c, int(replicas+1), 10*time.Minute)
		Expect(err).NotTo(HaveOccurred())