From 56f1744c7ae1b525cff105256afb5dac63a3791a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?=
Date: Thu, 26 Apr 2018 17:39:34 +0200
Subject: [PATCH 1/3] Add way to request GPUs in tests via RCConfig

---
 test/utils/runners.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/utils/runners.go b/test/utils/runners.go
index 2872384386..0a614abb62 100644
--- a/test/utils/runners.go
+++ b/test/utils/runners.go
@@ -124,6 +124,7 @@ type RCConfig struct {
 	CpuLimit          int64 // millicores
 	MemRequest        int64 // bytes
 	MemLimit          int64 // bytes
+	GpuLimit          int64 // count
 	ReadinessProbe    *v1.Probe
 	DNSPolicy         *v1.DNSPolicy
 	PriorityClassName string
@@ -615,7 +616,7 @@ func (config *RCConfig) applyTo(template *v1.PodTemplateSpec) {
 			c.Ports = append(c.Ports, v1.ContainerPort{Name: k, ContainerPort: int32(v), HostPort: int32(v)})
 		}
 	}
-	if config.CpuLimit > 0 || config.MemLimit > 0 {
+	if config.CpuLimit > 0 || config.MemLimit > 0 || config.GpuLimit > 0 {
 		template.Spec.Containers[0].Resources.Limits = v1.ResourceList{}
 	}
 	if config.CpuLimit > 0 {
@@ -633,6 +634,9 @@ func (config *RCConfig) applyTo(template *v1.PodTemplateSpec) {
 	if config.MemRequest > 0 {
 		template.Spec.Containers[0].Resources.Requests[v1.ResourceMemory] = *resource.NewQuantity(config.MemRequest, resource.DecimalSI)
 	}
+	if config.GpuLimit > 0 {
+		template.Spec.Containers[0].Resources.Limits["nvidia.com/gpu"] = *resource.NewQuantity(config.GpuLimit, resource.DecimalSI)
+	}
 	if len(config.Volumes) > 0 {
 		template.Spec.Volumes = config.Volumes
 	}
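[Context note, not part of the patch series] With the new RCConfig.GpuLimit field, a test can request GPUs for its pods without hand-building a pod spec: any positive value ends up as limits["nvidia.com/gpu"] on the first container, as wired up in applyTo above. A minimal sketch of the intended usage, assuming the usual e2e imports (framework, imageutils, and testutils = k8s.io/kubernetes/test/utils) and an illustrative RC name:

    // runGpuConsumer is a sketch only: it runs an RC whose single pause pod
    // requests one GPU through the new GpuLimit field.
    func runGpuConsumer(f *framework.Framework) error {
        config := &testutils.RCConfig{
            Client:         f.ClientSet,
            InternalClient: f.InternalClientset,
            Name:           "gpu-consumer", // illustrative name
            Namespace:      f.Namespace.Name,
            Timeout:        10 * time.Minute, // GPU nodes can be slow to become ready
            Image:          imageutils.GetPauseImageName(),
            Replicas:       1,
            GpuLimit:       1, // number of nvidia.com/gpu devices per pod
        }
        return framework.RunRC(*config)
    }

Patch 3 below uses the field in exactly this way (see scheduleGpuPod).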
From 14fc90a8f617fa16fb5a279fe4389b40bd283838 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?=
Date: Thu, 10 May 2018 14:26:54 +0200
Subject: [PATCH 2/3] Add framework.GetReadyNodesIncludingTaintedOrDie

---
 test/e2e/framework/util.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index 91e5ba5015..9930bdf1ce 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -2606,6 +2606,18 @@ func GetReadySchedulableNodesOrDie(c clientset.Interface) (nodes *v1.NodeList) {
 	return nodes
 }
 
+// GetReadyNodesIncludingTaintedOrDie returns all ready nodes, even those which are tainted.
+// There are cases when we care about tainted nodes
+// E.g. in tests related to nodes with gpu we care about nodes despite
+// presence of nvidia.com/gpu=present:NoSchedule taint
+func GetReadyNodesIncludingTaintedOrDie(c clientset.Interface) (nodes *v1.NodeList) {
+	nodes = waitListSchedulableNodesOrDie(c)
+	FilterNodes(nodes, func(node v1.Node) bool {
+		return isNodeSchedulable(&node)
+	})
+	return nodes
+}
+
 func WaitForAllNodesSchedulable(c clientset.Interface, timeout time.Duration) error {
 	Logf("Waiting up to %v for all (but %d) nodes to be schedulable", timeout, TestContext.AllowedNotReadyNodes)
 
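[Context note, not part of the patch series] The difference from the existing framework.GetReadySchedulableNodesOrDie is that the taint filter is dropped. For comparison, the existing helper (paraphrased from test/e2e/framework/util.go, not part of this patch) filters on both predicates, which is why nodes carrying the nvidia.com/gpu=present:NoSchedule taint are invisible to it:

    // Existing helper, paraphrased: in addition to readiness it drops nodes
    // whose taints an ordinary pod (one with no tolerations) would not tolerate.
    func GetReadySchedulableNodesOrDie(c clientset.Interface) (nodes *v1.NodeList) {
        nodes = waitListSchedulableNodesOrDie(c)
        FilterNodes(nodes, func(node v1.Node) bool {
            return isNodeSchedulable(&node) && isNodeUntainted(&node)
        })
        return nodes
    }

The new helper keeps only the isNodeSchedulable check, so the GPU tests in patch 3 can still count nodes in a tainted GPU pool (see getPoolNodes there).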
From c6a0937080834fc5de2d20776c620d87691c7649 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?=
Date: Thu, 26 Apr 2018 22:41:16 +0200
Subject: [PATCH 3/3] Add cluster autoscaler tests for pods requiring GPU

---
 .../autoscaling/cluster_size_autoscaling.go | 108 +++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/test/e2e/autoscaling/cluster_size_autoscaling.go b/test/e2e/autoscaling/cluster_size_autoscaling.go
index 0b0b248a86..0cf417138d 100644
--- a/test/e2e/autoscaling/cluster_size_autoscaling.go
+++ b/test/e2e/autoscaling/cluster_size_autoscaling.go
@@ -207,6 +207,76 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() { simpleScaleUpTest(0) })
 
+	It("Should scale up GPU pool from 0 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
+		defer disableAutoscaler(gpuPoolName, 0, 1)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
+
+		By("Schedule a pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+	})
+
+	It("Should scale up GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Schedule a single pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
+		defer disableAutoscaler(gpuPoolName, 0, 2)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+
+		framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false)
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
+	})
+
+	It("Should scale down GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Schedule a single pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
+		defer disableAutoscaler(gpuPoolName, 0, 1)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+
+		framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc")
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount }, scaleDownTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
+	})
+
 	It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() {
 			framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleUpTest(1) })
@@ -957,6 +1027,12 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	})
 })
 
+func installNvidiaDriversDaemonSet() {
+	By("Add daemonset which installs nvidia drivers")
+	// the link differs from one in GKE documentation; discussed with @mindprince this one should be used
+	framework.RunKubectlOrDie("apply", "-f", "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml")
+}
+
 func execCmd(args ...string) *exec.Cmd {
 	glog.Infof("Executing: %s", strings.Join(args, " "))
 	return exec.Command(args[0], args[1:]...)
@@ -1300,6 +1376,16 @@ func addNodePool(name string, machineType string, numNodes int) {
 	framework.ExpectNoError(err, string(output))
 }
 
+func addGpuNodePool(name string, gpuType string, gpuCount int, numNodes int) {
+	args := []string{"beta", "container", "node-pools", "create", name, "--quiet",
+		"--accelerator", "type=" + gpuType + ",count=" + strconv.Itoa(gpuCount),
+		"--num-nodes=" + strconv.Itoa(numNodes),
+		"--cluster=" + framework.TestContext.CloudConfig.Cluster}
+	output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
+	glog.Infof("Creating node-pool %s: %s", name, output)
+	framework.ExpectNoError(err, string(output))
+}
+
 func deleteNodePool(name string) {
 	glog.Infof("Deleting node pool %s", name)
 	args := []string{"container", "node-pools", "delete", name, "--quiet",
@@ -1320,7 +1406,7 @@ func getPoolNodes(f *framework.Framework, poolName string) []*v1.Node {
 	nodes := make([]*v1.Node, 0, 1)
-	nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
+	nodeList := framework.GetReadyNodesIncludingTaintedOrDie(f.ClientSet)
 	for _, node := range nodeList.Items {
 		if node.Labels[gkeNodepoolNameKey] == poolName {
 			nodes = append(nodes, &node)
 		}
@@ -1624,6 +1710,26 @@ func makeNodeSchedulable(c clientset.Interface, node *v1.Node, failOnCriticalAdd
 	return fmt.Errorf("Failed to remove taint from node in allowed number of retries")
 }
 
+func scheduleGpuPod(f *framework.Framework, id string) error {
+	config := &testutils.RCConfig{
+		Client:         f.ClientSet,
+		InternalClient: f.InternalClientset,
+		Name:           id,
+		Namespace:      f.Namespace.Name,
+		Timeout:        3 * scaleUpTimeout, // spinning up GPU node is slow
+		Image:          imageutils.GetPauseImageName(),
+		Replicas:       1,
+		GpuLimit:       1,
+		Labels:         map[string]string{"requires-gpu": "yes"},
+	}
+
+	err := framework.RunRC(*config)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
 // Create an RC running a given number of pods with anti-affinity
 func runAntiAffinityPods(f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string) error {
 	config := &testutils.RCConfig{