Merge pull request #65437 from losipiuk/lo/gpu-tests-from-env

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Read the GPU type for the cluster autoscaling e2e tests from the TESTED_GPU_TYPE environment variable instead of iterating over a hardcoded list of supported GPU types.
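
Each GPU autoscaling test now reads a single type supplied by the environment and fails fast when the variable is unset. A minimal standalone sketch of that pattern, outside the e2e framework (`requireGpuType` is a hypothetical helper used only for illustration):

```go
package main

import (
	"fmt"
	"log"
	"os"
)

// requireGpuType returns the GPU type under test, read from the
// TESTED_GPU_TYPE environment variable, or an error if it is unset.
func requireGpuType() (string, error) {
	gpuType := os.Getenv("TESTED_GPU_TYPE")
	if gpuType == "" {
		return "", fmt.Errorf("TESTED_GPU_TYPE not defined")
	}
	return gpuType, nil
}

func main() {
	gpuType, err := requireGpuType()
	if err != nil {
		log.Fatal(err) // the real tests call framework.Failf instead
	}
	fmt.Printf("running GPU autoscaling tests against %q\n", gpuType)
}
```

The values previously iterated in-code (`nvidia-tesla-k80`, `nvidia-tesla-v100`, `nvidia-tesla-p100`) are now expected to be passed in by the test environment, one GPU type per test run.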

```release-note
NONE
```
Kubernetes Submit Queue 2018-06-26 08:55:44 -07:00 committed by GitHub
commit 0d9c432542
1 changed file with 92 additions and 78 deletions


```diff
@@ -22,6 +22,7 @@ import (
     "io/ioutil"
     "math"
     "net/http"
+    "os"
     "os/exec"
     "regexp"
     "strconv"
@@ -207,110 +208,123 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
     It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
         func() { simpleScaleUpTest(0) })

-    supportedGpuTypes := []string{"nvidia-tesla-k80", "nvidia-tesla-v100", "nvidia-tesla-p100"}
-    for _, gpuType := range supportedGpuTypes {
-        gpuType := gpuType // create new variable for each iteration step
+    gpuType := os.Getenv("TESTED_GPU_TYPE")

     It(fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
         framework.SkipUnlessProviderIs("gke")
+        if gpuType == "" {
+            framework.Failf("TEST_GPU_TYPE not defined")
+            return
+        }
         const gpuPoolName = "gpu-pool"
         addGpuNodePool(gpuPoolName, gpuType, 1, 0)
         defer deleteNodePool(gpuPoolName)
         installNvidiaDriversDaemonSet()
         By("Enable autoscaler")
         framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
         defer disableAutoscaler(gpuPoolName, 0, 1)
         Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
         By("Schedule a pod which requires GPU")
         framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
         defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
         framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
             func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
         Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
     })

     It(fmt.Sprintf("Should scale up GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
         framework.SkipUnlessProviderIs("gke")
+        if gpuType == "" {
+            framework.Failf("TEST_GPU_TYPE not defined")
+            return
+        }
         const gpuPoolName = "gpu-pool"
         addGpuNodePool(gpuPoolName, gpuType, 1, 1)
         defer deleteNodePool(gpuPoolName)
         installNvidiaDriversDaemonSet()
         By("Schedule a single pod which requires GPU")
         framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
         defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
         By("Enable autoscaler")
         framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
         defer disableAutoscaler(gpuPoolName, 0, 2)
         Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
         By("Scale GPU deployment")
         framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, true)
         framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
             func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
         Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
     })

     It(fmt.Sprintf("Should not scale GPU pool up if pod does not require GPUs [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
         framework.SkipUnlessProviderIs("gke")
+        if gpuType == "" {
+            framework.Failf("TEST_GPU_TYPE not defined")
+            return
+        }
         const gpuPoolName = "gpu-pool"
         addGpuNodePool(gpuPoolName, gpuType, 1, 0)
         defer deleteNodePool(gpuPoolName)
         installNvidiaDriversDaemonSet()
         By("Enable autoscaler")
         framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
         defer disableAutoscaler(gpuPoolName, 0, 1)
         Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
         By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
         ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
         defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
         // Verify that cluster size is increased
         framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
             func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
         // Expect gpu pool to stay intact
         Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
     })

     It(fmt.Sprintf("Should scale down GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
         framework.SkipUnlessProviderIs("gke")
+        if gpuType == "" {
+            framework.Failf("TEST_GPU_TYPE not defined")
+            return
+        }
         const gpuPoolName = "gpu-pool"
         addGpuNodePool(gpuPoolName, gpuType, 1, 1)
         defer deleteNodePool(gpuPoolName)
         installNvidiaDriversDaemonSet()
         By("Schedule a single pod which requires GPU")
         framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
         defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
         By("Enable autoscaler")
         framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
         defer disableAutoscaler(gpuPoolName, 0, 1)
         Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
         By("Remove the only POD requiring GPU")
         framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
         framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
             func(size int) bool { return size == nodeCount }, scaleDownTimeout))
         Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
     })
-    }

     // TODO consider moving to [Feature:ClusterSizeAutoscalingGpu] as soon as NAP goes out of beta. Currently
     // project needed to run the NAP tests require whitelisting for NAP alpha
```