/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduling
import (
	"strings"
	"time"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/kubernetes/test/e2e/framework"
	imageutils "k8s.io/kubernetes/test/utils/image"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)
const (
	testPodNamePrefix = "nvidia-gpu-"
	cosOSImage        = "Container-Optimized OS from Google"
	// Nvidia driver installation can take upwards of 5 minutes.
	driverInstallTimeout = 10 * time.Minute
)
type podCreationFuncType func() *v1.Pod
var (
	gpuResourceName v1.ResourceName
	dsYamlUrl       string
	podCreationFunc podCreationFuncType
)
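// makeCudaAdditionTestPod returns a pod that requests one GPU via the legacy
// Nvidia GPU resource and runs the CUDA vector-addition image, mounting the
// Nvidia libraries from the host.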
func makeCudaAdditionTestPod() *v1.Pod {
	podName := testPodNamePrefix + string(uuid.NewUUID())
	testPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				{
					Name:  "vector-addition",
					Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
						},
					},
					VolumeMounts: []v1.VolumeMount{
						{
							Name:      "nvidia-libraries",
							MountPath: "/usr/local/nvidia/lib64",
						},
					},
				},
			},
			Volumes: []v1.Volume{
				{
					Name: "nvidia-libraries",
					VolumeSource: v1.VolumeSource{
						HostPath: &v1.HostPathVolumeSource{
							Path: "/home/kubernetes/bin/nvidia/lib",
						},
					},
				},
			},
		},
	}
	return testPod
}
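// makeCudaAdditionDevicePluginTestPod returns a pod that requests one GPU via
// the device plugin resource and runs the CUDA vector-addition image; no host
// mount of the Nvidia libraries is needed on this path.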
func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
	podName := testPodNamePrefix + string(uuid.NewUUID())
	testPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				{
					Name:  "vector-addition",
					Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
						},
					},
				},
			},
		},
	}
	return testPod
}
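// isClusterRunningCOS returns true only if every node in the cluster reports
// the Container-Optimized OS image.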
func isClusterRunningCOS(f *framework.Framework) bool {
	nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for _, node := range nodeList.Items {
		if !strings.Contains(node.Status.NodeInfo.OSImage, cosOSImage) {
			return false
		}
	}
	return true
}
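// areGPUsAvailableOnAllSchedulableNodes returns true if every schedulable node
// advertises a non-zero capacity for gpuResourceName.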
func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool {
	framework.Logf("Getting list of Nodes from API server")
	nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for _, node := range nodeList.Items {
		if node.Spec.Unschedulable {
			continue
		}
		framework.Logf("gpuResourceName %s", gpuResourceName)
		if val, ok := node.Status.Capacity[gpuResourceName]; !ok || val.Value() == 0 {
			framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
			return false
		}
	}
	framework.Logf("Nvidia GPUs exist on all schedulable nodes")
	return true
}
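// getGPUsAvailable sums the GPU capacity advertised by all nodes in the cluster.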
func getGPUsAvailable(f *framework.Framework) int64 {
	nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	var gpusAvailable int64
	for _, node := range nodeList.Items {
		if val, ok := node.Status.Capacity[gpuResourceName]; ok {
			gpusAvailable += (&val).Value()
		}
	}
	return gpusAvailable
}
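// testNvidiaGPUsOnCOS installs the Nvidia drivers if they are not already
// present, then runs one CUDA vector-addition pod per available GPU and waits
// for every pod to succeed.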
func testNvidiaGPUsOnCOS(f *framework.Framework) {
	// Skip the test if the base image is not COS.
	// TODO: Add support for other base images.
	// CUDA apps require host mounts, which are not portable across base images (yet).
	framework.Logf("Checking base image")
	if !isClusterRunningCOS(f) {
		Skip("Nvidia GPU tests are supported only on Container-Optimized OS images currently")
	}
	framework.Logf("Cluster is running on COS. Proceeding with test")
	if f.BaseName == "device-plugin-gpus" {
		dsYamlUrl = framework.GPUDevicePluginDSYAML
		gpuResourceName = framework.NVIDIAGPUResourceName
		podCreationFunc = makeCudaAdditionDevicePluginTestPod
	} else {
		dsYamlUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/master/cos-nvidia-gpu-installer/daemonset.yaml"
		gpuResourceName = v1.ResourceNvidiaGPU
		podCreationFunc = makeCudaAdditionTestPod
	}
	// GPU drivers might have already been installed.
	if !areGPUsAvailableOnAllSchedulableNodes(f) {
		// Install Nvidia drivers.
		ds, err := framework.DsFromManifest(dsYamlUrl)
		Expect(err).NotTo(HaveOccurred())
		ds.Namespace = f.Namespace.Name
		_, err = f.ClientSet.ExtensionsV1beta1().DaemonSets(f.Namespace.Name).Create(ds)
		framework.ExpectNoError(err, "failed to create daemonset")
		framework.Logf("Successfully created daemonset to install Nvidia drivers. Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
		// Wait for Nvidia GPUs to be available on nodes.
		Eventually(func() bool {
			return areGPUsAvailableOnAllSchedulableNodes(f)
		}, driverInstallTimeout, time.Second).Should(BeTrue())
	}
	framework.Logf("Creating as many pods as there are Nvidia GPUs and having each pod run a CUDA app")
	podList := []*v1.Pod{}
	for i := int64(0); i < getGPUsAvailable(f); i++ {
		podList = append(podList, f.PodClient().Create(podCreationFunc()))
	}
	framework.Logf("Waiting for all test pods to succeed")
	// Wait for all pods to succeed.
	for _, po := range podList {
		f.PodClient().WaitForSuccess(po.Name, 5*time.Minute)
	}
}
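// [Feature:GPU] exercises the legacy Nvidia GPU resource, with drivers
// installed by the COS installer daemonset.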
var _ = SIGDescribe("[Feature:GPU]", func() {
	f := framework.NewDefaultFramework("gpus")
	It("run Nvidia GPU tests on Container Optimized OS only", func() {
		testNvidiaGPUsOnCOS(f)
	})
})
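// [Feature:GPUDevicePlugin] exercises GPUs exposed through the device plugin
// resource, using the device plugin daemonset instead of the COS installer.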
var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
	f := framework.NewDefaultFramework("device-plugin-gpus")
	It("run Nvidia GPU Device Plugin tests on Container Optimized OS only", func() {
		testNvidiaGPUsOnCOS(f)
	})
})