From 56f1744c7ae1b525cff105256afb5dac63a3791a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?=
Date: Thu, 26 Apr 2018 17:39:34 +0200
Subject: [PATCH 1/3] Add way to request GPUs in tests via RCConfig

---
 test/utils/runners.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/utils/runners.go b/test/utils/runners.go
index 2872384386..0a614abb62 100644
--- a/test/utils/runners.go
+++ b/test/utils/runners.go
@@ -124,6 +124,7 @@ type RCConfig struct {
 	CpuLimit          int64 // millicores
 	MemRequest        int64 // bytes
 	MemLimit          int64 // bytes
+	GpuLimit          int64 // count
 	ReadinessProbe    *v1.Probe
 	DNSPolicy         *v1.DNSPolicy
 	PriorityClassName string
@@ -615,7 +616,7 @@ func (config *RCConfig) applyTo(template *v1.PodTemplateSpec) {
 			c.Ports = append(c.Ports, v1.ContainerPort{Name: k, ContainerPort: int32(v), HostPort: int32(v)})
 		}
 	}
-	if config.CpuLimit > 0 || config.MemLimit > 0 {
+	if config.CpuLimit > 0 || config.MemLimit > 0 || config.GpuLimit > 0 {
 		template.Spec.Containers[0].Resources.Limits = v1.ResourceList{}
 	}
 	if config.CpuLimit > 0 {
@@ -633,6 +634,9 @@ func (config *RCConfig) applyTo(template *v1.PodTemplateSpec) {
 	if config.MemRequest > 0 {
 		template.Spec.Containers[0].Resources.Requests[v1.ResourceMemory] = *resource.NewQuantity(config.MemRequest, resource.DecimalSI)
 	}
+	if config.GpuLimit > 0 {
+		template.Spec.Containers[0].Resources.Limits["nvidia.com/gpu"] = *resource.NewQuantity(config.GpuLimit, resource.DecimalSI)
+	}
 	if len(config.Volumes) > 0 {
 		template.Spec.Volumes = config.Volumes
 	}
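[Context note, not part of the patch series] With the new RCConfig.GpuLimit field, a test can request GPUs for its pods without hand-building a pod spec: any positive value ends up as limits["nvidia.com/gpu"] on the first container, as wired up in applyTo above. A minimal sketch of the intended usage, assuming the usual e2e imports (framework, imageutils, and testutils = k8s.io/kubernetes/test/utils) and an illustrative RC name:

    // runGpuConsumer is a sketch only: it runs an RC whose single pause pod
    // requests one GPU through the new GpuLimit field.
    func runGpuConsumer(f *framework.Framework) error {
        config := &testutils.RCConfig{
            Client:         f.ClientSet,
            InternalClient: f.InternalClientset,
            Name:           "gpu-consumer", // illustrative name
            Namespace:      f.Namespace.Name,
            Timeout:        10 * time.Minute, // GPU nodes can be slow to become ready
            Image:          imageutils.GetPauseImageName(),
            Replicas:       1,
            GpuLimit:       1, // number of nvidia.com/gpu devices per pod
        }
        return framework.RunRC(*config)
    }

Patch 3 below uses the field in exactly this way (see scheduleGpuPod).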
From 14fc90a8f617fa16fb5a279fe4389b40bd283838 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?=
Date: Thu, 10 May 2018 14:26:54 +0200
Subject: [PATCH 2/3] Add framework.GetReadyNodesIncludingTaintedOrDie

---
 test/e2e/framework/util.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index 91e5ba5015..9930bdf1ce 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -2606,6 +2606,18 @@ func GetReadySchedulableNodesOrDie(c clientset.Interface) (nodes *v1.NodeList) {
 	return nodes
 }
 
+// GetReadyNodesIncludingTaintedOrDie returns all ready nodes, even those which are tainted.
+// There are cases when we care about tainted nodes
+// E.g. in tests related to nodes with gpu we care about nodes despite
+// presence of nvidia.com/gpu=present:NoSchedule taint
+func GetReadyNodesIncludingTaintedOrDie(c clientset.Interface) (nodes *v1.NodeList) {
+	nodes = waitListSchedulableNodesOrDie(c)
+	FilterNodes(nodes, func(node v1.Node) bool {
+		return isNodeSchedulable(&node)
+	})
+	return nodes
+}
+
 func WaitForAllNodesSchedulable(c clientset.Interface, timeout time.Duration) error {
 	Logf("Waiting up to %v for all (but %d) nodes to be schedulable", timeout, TestContext.AllowedNotReadyNodes)
 
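[Context note, not part of the patch series] The difference from the existing framework.GetReadySchedulableNodesOrDie is that the taint filter is dropped. For comparison, the existing helper (paraphrased from test/e2e/framework/util.go, not part of this patch) filters on both predicates, which is why nodes carrying the nvidia.com/gpu=present:NoSchedule taint are invisible to it:

    // Existing helper, paraphrased: in addition to readiness it drops nodes
    // whose taints an ordinary pod (one with no tolerations) would not tolerate.
    func GetReadySchedulableNodesOrDie(c clientset.Interface) (nodes *v1.NodeList) {
        nodes = waitListSchedulableNodesOrDie(c)
        FilterNodes(nodes, func(node v1.Node) bool {
            return isNodeSchedulable(&node) && isNodeUntainted(&node)
        })
        return nodes
    }

The new helper keeps only the isNodeSchedulable check, so the GPU tests in patch 3 can still count nodes in a tainted GPU pool (see getPoolNodes there).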
From c6a0937080834fc5de2d20776c620d87691c7649 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Osipiuk?=
Date: Thu, 26 Apr 2018 22:41:16 +0200
Subject: [PATCH 3/3] Add cluster autoscaler tests for pods requiring GPU

---
 .../autoscaling/cluster_size_autoscaling.go | 108 +++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/test/e2e/autoscaling/cluster_size_autoscaling.go b/test/e2e/autoscaling/cluster_size_autoscaling.go
index 0b0b248a86..0cf417138d 100644
--- a/test/e2e/autoscaling/cluster_size_autoscaling.go
+++ b/test/e2e/autoscaling/cluster_size_autoscaling.go
@@ -207,6 +207,76 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() { simpleScaleUpTest(0) })
 
+	It("Should scale up GPU pool from 0 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
+		defer disableAutoscaler(gpuPoolName, 0, 1)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
+
+		By("Schedule a pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+	})
+
+	It("Should scale up GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Schedule a single pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
+		defer disableAutoscaler(gpuPoolName, 0, 2)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+
+		framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false)
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
+	})
+
+	It("Should scale down GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		framework.SkipUnlessProviderIs("gke")
+
+		const gpuPoolName = "gpu-pool"
+		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+		defer deleteNodePool(gpuPoolName)
+
+		installNvidiaDriversDaemonSet()
+
+		By("Schedule a single pod which requires GPU")
+		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+
+		By("Enable autoscaler")
+		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
+		defer disableAutoscaler(gpuPoolName, 0, 1)
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+
+		framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc")
+
+		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			func(size int) bool { return size == nodeCount }, scaleDownTimeout))
+		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
+	})
+
 	It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() {
 			framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleUpTest(1) })
@@ -957,6 +1027,12 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	})
 })
 
+func installNvidiaDriversDaemonSet() {
+	By("Add daemonset which installs nvidia drivers")
+	// the link differs from one in GKE documentation; discussed with @mindprince this one should be used
+	framework.RunKubectlOrDie("apply", "-f", "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml")
+}
+
 func execCmd(args ...string) *exec.Cmd {
 	glog.Infof("Executing: %s", strings.Join(args, " "))
 	return exec.Command(args[0], args[1:]...)
@@ -1300,6 +1376,16 @@ func addNodePool(name string, machineType string, numNodes int) {
 	framework.ExpectNoError(err, string(output))
 }
 
+func addGpuNodePool(name string, gpuType string, gpuCount int, numNodes int) {
+	args := []string{"beta", "container", "node-pools", "create", name, "--quiet",
+		"--accelerator", "type=" + gpuType + ",count=" + strconv.Itoa(gpuCount),
+		"--num-nodes=" + strconv.Itoa(numNodes),
+		"--cluster=" + framework.TestContext.CloudConfig.Cluster}
+	output, err := execCmd(getGcloudCommand(args)...).CombinedOutput()
+	glog.Infof("Creating node-pool %s: %s", name, output)
+	framework.ExpectNoError(err, string(output))
+}
+
 func deleteNodePool(name string) {
 	glog.Infof("Deleting node pool %s", name)
 	args := []string{"container", "node-pools", "delete", name, "--quiet",
@@ -1320,7 +1406,7 @@ func getPoolNodes(f *framework.Framework, poolName string) []*v1.Node {
 	nodes := make([]*v1.Node, 0, 1)
-	nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
+	nodeList := framework.GetReadyNodesIncludingTaintedOrDie(f.ClientSet)
 	for _, node := range nodeList.Items {
 		if node.Labels[gkeNodepoolNameKey] == poolName {
 			nodes = append(nodes, &node)
 		}
@@ -1624,6 +1710,26 @@ func makeNodeSchedulable(c clientset.Interface, node *v1.Node, failOnCriticalAdd
 	return fmt.Errorf("Failed to remove taint from node in allowed number of retries")
 }
 
+func scheduleGpuPod(f *framework.Framework, id string) error {
+	config := &testutils.RCConfig{
+		Client:         f.ClientSet,
+		InternalClient: f.InternalClientset,
+		Name:           id,
+		Namespace:      f.Namespace.Name,
+		Timeout:        3 * scaleUpTimeout, // spinning up GPU node is slow
+		Image:          imageutils.GetPauseImageName(),
+		Replicas:       1,
+		GpuLimit:       1,
+		Labels:         map[string]string{"requires-gpu": "yes"},
+	}
+
+	err := framework.RunRC(*config)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
 // Create an RC running a given number of pods with anti-affinity
 func runAntiAffinityPods(f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string) error {
 	config := &testutils.RCConfig{