mirror of https://github.com/k3s-io/k3s
Adding an installer script that installs Nvidia drivers in Container Optimized OS
Packaged the script as a docker container stored in gcr.io/google-containers. A daemonset deployment is included to make it easy to consume the installer. A cluster e2e test has been added to exercise the installation daemonset and to verify the installation by running a sample CUDA application. The node e2e for GPUs has been updated to avoid running on nodes without GPU devices.

Signed-off-by: Vishnu Kannan <vishnuk@google.com>
parent 95ce463e95
commit 1e77594958
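As a usage sketch (not part of this commit's diff; the manifest path, namespace, and pod label come from the files below, while the GPU resource name is an assumption for this Kubernetes era), the installer daemonset can be applied and checked with kubectl:

```sh
# Sketch: deploy the installer daemonset and watch the driver installation land.
kubectl create -f cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml
kubectl -n kube-system get pods -l name=cos-nvidia-installer -o wide

# After the installer finishes and the kubelet restarts, GPUs should appear in
# node capacity (assumed resource name: alpha.kubernetes.io/nvidia-gpu).
kubectl describe nodes | grep -i nvidia
```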
@@ -32,7 +32,10 @@ filegroup(
 filegroup(
     name = "all-srcs",
-    srcs = [":package-srcs"],
+    srcs = [
+        ":package-srcs",
+        "//cluster/gce/gci/nvidia-gpus:all-srcs",
+    ],
     tags = ["automanaged"],
 )
@@ -68,6 +68,8 @@ fi
 # variable. Also please update corresponding image for node e2e at:
 # https://github.com/kubernetes/kubernetes/blob/master/test/e2e_node/jenkins/image-config.yaml
 CVM_VERSION=${CVM_VERSION:-container-vm-v20170214}
+# NOTE: Update the kernel commit SHA in cluster/addons/nvidia-gpus/cos-installer-daemonset.yaml
+# while updating the COS version here.
 GCI_VERSION=${KUBE_GCI_VERSION:-gci-stable-56-9000-84-2}
 MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-}
 MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers}
@@ -1605,4 +1605,5 @@ else
 fi
 reset-motd
 prepare-mounter-rootfs
+modprobe configs
 echo "Done for the configuration for kubernetes"
@@ -0,0 +1,24 @@
package(default_visibility = ["//visibility:public"])

load("@io_bazel//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
load("@io_kubernetes_build//defs:build.bzl", "release_filegroup")

filegroup(
    name = "sources",
    srcs = glob([
        "**/*",
    ]),
)

filegroup(
    name = "package-srcs",
    srcs = glob(["**"]),
    tags = ["automanaged"],
    visibility = ["//visibility:private"],
)

filegroup(
    name = "all-srcs",
    srcs = [":package-srcs"],
    tags = ["automanaged"],
)
@@ -0,0 +1,28 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM ubuntu:16.04

# Disable prompts from apt
ENV DEBIAN_FRONTEND noninteractive

RUN apt-get -qq update
RUN apt-get install -qq pciutils gcc g++ git make dpkg-dev bc module-init-tools curl

RUN mkdir /lakitu-kernel
RUN git clone https://chromium.googlesource.com/chromiumos/third_party/kernel /lakitu-kernel

ADD installer.sh /usr/bin/nvidia-installer.sh
RUN chmod a+x /usr/bin/nvidia-installer.sh
CMD ["/usr/bin/nvidia-installer.sh"]
@@ -0,0 +1,27 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

TAG?=v0.1
REGISTRY?=gcr.io/google_containers
IMAGE=cos-nvidia-driver-install

all: container

container:
	docker build --pull -t ${REGISTRY}/${IMAGE}:${TAG} .

push:
	gcloud docker -- push ${REGISTRY}/${IMAGE}:${TAG}

.PHONY: all container push
@@ -0,0 +1,57 @@
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: cos-nvidia-installer
  namespace: kube-system
spec:
  template:
    metadata:
      labels:
        name: cos-nvidia-installer
        # Update the version tag here and `LAKITU_KERNEL_SHA1` below when targeting new COS releases.
        cos-version: cos-beta-59-9460-20-0
    spec:
      hostNetwork: true
      hostPID: true
      volumes:
      - name: dev
        hostPath:
          path: /dev
      - name: nvidia-overlay
        hostPath:
          path: /home/kubernetes/bin/nvidia
      - name: os-release
        hostPath:
          path: /etc/os-release
      - name: sysrq
        hostPath:
          path: /proc/sysrq-trigger
      containers:
      - image: gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
        command: ["/bin/sh", "-c"]
        args: ["usr/bin/nvidia-installer.sh && sleep infinity"]
        env:
        - name: BASE_DIR
          value: "/rootfs/nvidia"
        name: nvidia-driver-installer
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        env:
        # The kernel SHA1 here should correspond to the GCI_VERSION specified by default under cluster/gce/config-default.sh
        - name: LAKITU_KERNEL_SHA1
          value: 26481563cb3788ad254c2bf2126b843c161c7e48
        - name: BASE_DIR
          value: "/rootfs/nvidia"
        volumeMounts:
        - name: nvidia-overlay
          mountPath: /rootfs/nvidia
        - name: dev
          mountPath: /dev
        - name: os-release
          mountPath: /rootfs/etc/os-release
        - name: sysrq
          mountPath: /sysrq
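Since the installer runs as a privileged pod on every node, the easiest way to follow its progress is through the container logs; a sketch, with the namespace, label, and container name taken from the manifest above (the pod name placeholder is whatever the daemonset generated):

```sh
# Sketch: locate an installer pod and follow the driver build output.
kubectl -n kube-system get pods -l name=cos-nvidia-installer
kubectl -n kube-system logs <installer-pod-name> -c nvidia-driver-installer -f
```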
@@ -0,0 +1,207 @@
#!/bin/bash

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is for dynamically installing nvidia kernel drivers in Container Optimized OS

set -o errexit
set -o pipefail
set -x

# The script must be run as root.
# Prerequisites:
#
# LAKITU_KERNEL_SHA1 - The env variable is expected to be set to the commit SHA (HEAD) of the kernel source used on the host.
# BASE_DIR - Directory that is mapped to a stateful partition on the host. Defaults to `/rootfs/nvidia`.
#
# The script will output the following artifacts:
# ${BASE_DIR}/lib* --> Nvidia CUDA libraries
# ${BASE_DIR}/bin/* --> Nvidia debug utilities
# ${BASE_DIR}/.cache/* --> Nvidia driver artifacts cached for idempotency.
#

BASE_DIR=${BASE_DIR:-"/rootfs/nvidia"}
CACHE_DIR="${BASE_DIR}/.cache"
USR_WORK_DIR="${CACHE_DIR}/usr-work"
USR_WRITABLE_DIR="${CACHE_DIR}/usr-writable"
LIB_WORK_DIR="${CACHE_DIR}/lib-work"
LIB_WRITABLE_DIR="${CACHE_DIR}/lib-writable"

LIB_OUTPUT_DIR="${BASE_DIR}/lib"
BIN_OUTPUT_DIR="${BASE_DIR}/bin"

KERNEL_SRC_DIR="/lakitu-kernel"
NVIDIA_DRIVER_DIR="/nvidia"
NVIDIA_DRIVER_VERSION="375.26"

# Source: https://developer.nvidia.com/cuda-downloads
NVIDIA_CUDA_URL="https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run"
NVIDIA_CUDA_MD5SUM="33e1bd980e91af4e55f3ef835c103f9b"
NVIDIA_CUDA_PKG_NAME="cuda_8.0.61_375.26_linux.run"
NVIDIA_DRIVER_PKG_NAME="NVIDIA-Linux-x86_64-375.26.run"

check_nvidia_device() {
  lspci
  if ! lspci | grep -i -q NVIDIA; then
    echo "No NVIDIA devices attached to this instance."
    exit 0
  fi
  echo "Found NVIDIA device on this instance."
}

prepare_kernel_source() {
  local kernel_git_repo="https://chromium.googlesource.com/chromiumos/third_party/kernel"
  local kernel_version="$(uname -r)"
  local kernel_version_stripped="$(echo ${kernel_version} | sed 's/\+//')"

  # Checkout the correct tag.
  echo "Downloading kernel source at tag ${kernel_version_stripped} ..."
  pushd "${KERNEL_SRC_DIR}"
  # TODO: Consume KERNEL SHA1 from COS image directly.
  # git checkout "tags/v${kernel_version_stripped}"
  git checkout ${LAKITU_KERNEL_SHA1}

  # Prepare kernel config and source for modules.
  echo "Preparing kernel sources ..."
  zcat "/proc/config.gz" > ".config"
  make olddefconfig
  make modules_prepare
  # Done.
  popd
}

download_install_nvidia() {
  local pkg_name="${NVIDIA_CUDA_PKG_NAME}"
  local url="${NVIDIA_CUDA_URL}"
  local log_file_name="${NVIDIA_DRIVER_DIR}/nvidia-installer.log"

  mkdir -p "${NVIDIA_DRIVER_DIR}"
  pushd "${NVIDIA_DRIVER_DIR}"

  echo "Downloading Nvidia CUDA package from ${url} ..."
  curl -L -s "${url}" -o "${pkg_name}"
  echo "${NVIDIA_CUDA_MD5SUM} ${pkg_name}" | md5sum --check

  echo "Extracting Nvidia CUDA package ..."
  sh ${pkg_name} --extract="$(pwd)"

  echo "Running the Nvidia driver installer ..."
  if ! sh "${NVIDIA_DRIVER_PKG_NAME}" --kernel-source-path="${KERNEL_SRC_DIR}" --silent --accept-license --keep --log-file-name="${log_file_name}"; then
    echo "Nvidia installer failed, log below:"
    echo "==================================="
    tail -50 "${log_file_name}"
    echo "==================================="
    exit 1
  fi
  # Create unified memory device file.
  nvidia-modprobe -c0 -u
  popd
}

unlock_loadpin_and_reboot_if_needed() {
  kernel_cmdline="$(cat /proc/cmdline)"
  if echo "${kernel_cmdline}" | grep -q -v "lsm.module_locking=0"; then
    local -r esp_partition="/dev/sda12"
    local -r mount_path="/tmp/esp"
    local -r grub_cfg="efi/boot/grub.cfg"

    mkdir -p "${mount_path}"
    mount "${esp_partition}" "${mount_path}"

    pushd "${mount_path}"
    cp "${grub_cfg}" "${grub_cfg}.orig"
    sed 's/cros_efi/cros_efi lsm.module_locking=0/g' -i "efi/boot/grub.cfg"
    cat "${grub_cfg}"
    popd
    sync
    umount "${mount_path}"
    # Restart the node for loadpin to be disabled.
    echo b > /sysrq
  fi
}

create_uvm_device() {
  # Create unified memory device file.
  nvidia-modprobe -c0 -u
}

verify_base_image() {
  mount --bind /rootfs/etc/os-release /etc/os-release
  local id="$(grep "^ID=" /etc/os-release)"
  if [[ "${id#*=}" != "cos" ]]; then
    echo "This installer is designed to run on Container-Optimized OS only"
    exit 1
  fi
}

setup_overlay_mounts() {
  mkdir -p ${USR_WRITABLE_DIR} ${USR_WORK_DIR} ${LIB_WRITABLE_DIR} ${LIB_WORK_DIR}
  mount -t overlay -o lowerdir=/usr,upperdir=${USR_WRITABLE_DIR},workdir=${USR_WORK_DIR} none /usr
  mount -t overlay -o lowerdir=/lib,upperdir=${LIB_WRITABLE_DIR},workdir=${LIB_WORK_DIR} none /lib
}

exit_if_install_not_needed() {
  if nvidia-smi; then
    echo "Nvidia drivers already installed. Skipping installation"
    post_installation_sequence
    exit 0
  fi
}

restart_kubelet() {
  echo "Sending SIGTERM to kubelet"
  pkill -SIGTERM kubelet
}

# Copy user space libraries and debug utilities to a special output directory on the host.
# Make these artifacts world readable and executable.
copy_files_to_host() {
  mkdir -p ${LIB_OUTPUT_DIR} ${BIN_OUTPUT_DIR}
  cp -r ${USR_WRITABLE_DIR}/lib/x86_64-linux-gnu/* ${LIB_OUTPUT_DIR}/
  cp -r ${USR_WRITABLE_DIR}/bin/* ${BIN_OUTPUT_DIR}/
  chmod -R a+rx ${LIB_OUTPUT_DIR}
  chmod -R a+rx ${BIN_OUTPUT_DIR}
}

post_installation_sequence() {
  create_uvm_device
  # Copy nvidia user space libraries and debug tools to the host for use from other containers.
  copy_files_to_host
  # Restart the kubelet for it to pick up the GPU devices.
  restart_kubelet
}

main() {
  # Do not run the installer unless the base image is Container Optimized OS (COS)
  verify_base_image
  # Do not run the installer unless an Nvidia device is found on the PCI bus
  check_nvidia_device
  # Setup overlay mounts to capture nvidia driver artifacts in more permanent storage on the host.
  setup_overlay_mounts
  # Disable a critical security feature (LoadPin) in COS so that Nvidia drivers can be loaded dynamically
  unlock_loadpin_and_reboot_if_needed
  # Exit if installation is not required (for idempotency)
  exit_if_install_not_needed
  # Checkout kernel sources appropriate for the base image.
  prepare_kernel_source
  # Download, compile and install nvidia drivers.
  download_install_nvidia
  # Verify that the Nvidia drivers have been successfully installed.
  nvidia-smi
  # Perform post installation steps - copying artifacts, restarting kubelet, etc.
  post_installation_sequence
}

main "$@"
@@ -48,7 +48,8 @@ go-bindata -nometadata -o "${BINDATA_OUTPUT}.tmp" -pkg generated \
     "examples/..." \
     "test/e2e/testing-manifests/..." \
     "test/images/..." \
-    "test/fixtures/..."
+    "test/fixtures/..." \
+    "cluster/gce/gci/nvidia-gpus/..."

 gofmt -s -w "${BINDATA_OUTPUT}.tmp"
@@ -86,6 +86,7 @@ go_library(
         "networking.go",
         "networking_perf.go",
         "nodeoutofdisk.go",
+        "nvidia-gpus.go",
         "pod_gc.go",
         "podpreset.go",
         "pods.go",
@@ -23,6 +23,7 @@ genrule(
     name = "bindata",
     srcs = [
         "//examples:sources",
+        "//cluster/gce/gci/nvidia-gpus:sources",
         "//test/images:sources",
         "//test/fixtures:sources",
         "//test/e2e/testing-manifests:sources",
@@ -0,0 +1,178 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"strings"
	"time"

	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/uuid"
	utilyaml "k8s.io/apimachinery/pkg/util/yaml"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/v1"
	extensions "k8s.io/kubernetes/pkg/apis/extensions/v1beta1"
	"k8s.io/kubernetes/test/e2e/framework"
	"k8s.io/kubernetes/test/e2e/generated"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

const (
	testPodNamePrefix = "nvidia-gpu-"
	testCUDAImage     = "gcr.io/google_containers/cuda-vector-add:v0.1"
	cosOSImage        = "Container-Optimized OS from Google"
	// Nvidia driver installation can take upwards of 5 minutes.
	driverInstallTimeout = 10 * time.Minute
	// Nvidia COS driver installer daemonset.
	cosNvidiaDriverInstallerPath = "cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml"
)

func makeCudaAdditionTestPod() *v1.Pod {
	podName := testPodNamePrefix + string(uuid.NewUUID())
	testPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				{
					Name:  "vector-addition",
					Image: testCUDAImage,
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							v1.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
						},
					},
					VolumeMounts: []v1.VolumeMount{
						{
							Name:      "nvidia-libraries",
							MountPath: "/usr/local/nvidia/lib64",
						},
					},
				},
			},
			Volumes: []v1.Volume{
				{
					Name: "nvidia-libraries",
					VolumeSource: v1.VolumeSource{
						HostPath: &v1.HostPathVolumeSource{
							Path: "/home/kubernetes/bin/nvidia/lib",
						},
					},
				},
			},
		},
	}
	return testPod
}

func isClusterRunningCOS(f *framework.Framework) bool {
	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for _, node := range nodeList.Items {
		if !strings.Contains(node.Status.NodeInfo.OSImage, cosOSImage) {
			return false
		}
	}
	return true
}

func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool {
	framework.Logf("Getting list of Nodes from API server")
	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for _, node := range nodeList.Items {
		if node.Spec.Unschedulable {
			continue
		}
		if node.Status.Capacity.NvidiaGPU().Value() == 0 {
			framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
			return false
		}
	}
	framework.Logf("Nvidia GPUs exist on all schedulable nodes")
	return true
}

func getGPUsAvailable(f *framework.Framework) int64 {
	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	var gpusAvailable int64
	for _, node := range nodeList.Items {
		gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
	}
	return gpusAvailable
}

func testNvidiaGPUsOnCOS(f *framework.Framework) {
	// Skip the test if the base image is not COS.
	// TODO: Add support for other base images.
	// CUDA apps require host mounts which are not portable across base images (yet).
	framework.Logf("Checking base image")
	if !isClusterRunningCOS(f) {
		Skip("Nvidia GPU tests are supported only on Container Optimized OS image currently")
	}
	framework.Logf("Cluster is running on COS. Proceeding with test")
	// GPU drivers might have already been installed.
	if !areGPUsAvailableOnAllSchedulableNodes(f) {
		// Install Nvidia Drivers.
		ds := dsFromManifest(cosNvidiaDriverInstallerPath)
		ds.Namespace = f.Namespace.Name
		_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
		framework.ExpectNoError(err, "failed to create daemonset")
		framework.Logf("Successfully created daemonset to install Nvidia drivers. Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
		// Wait for Nvidia GPUs to be available on nodes
		Eventually(func() bool {
			return areGPUsAvailableOnAllSchedulableNodes(f)
		}, driverInstallTimeout, time.Second).Should(BeTrue())
	}
	framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app")
	podList := []*v1.Pod{}
	for i := int64(0); i < getGPUsAvailable(f); i++ {
		podList = append(podList, f.PodClient().Create(makeCudaAdditionTestPod()))
	}
	framework.Logf("Wait for all test pods to succeed")
	// Wait for all pods to succeed
	for _, po := range podList {
		f.PodClient().WaitForSuccess(po.Name, 5*time.Minute)
	}
}

// dsFromManifest reads a .json/yaml file and returns the daemonset in it.
func dsFromManifest(fileName string) *extensions.DaemonSet {
	var controller extensions.DaemonSet
	framework.Logf("Parsing ds from %v", fileName)
	data := generated.ReadOrDie(fileName)

	json, err := utilyaml.ToJSON(data)
	Expect(err).NotTo(HaveOccurred())

	Expect(runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)).NotTo(HaveOccurred())
	return &controller
}

var _ = framework.KubeDescribe("[Feature:GPU]", func() {
	f := framework.NewDefaultFramework("gpus")
	It("run Nvidia GPU tests on Container Optimized OS only", func() {
		testNvidiaGPUsOnCOS(f)
	})
})
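The Describe block above tags the suite as [Feature:GPU], so it can be selected through a ginkgo focus. One rough invocation, assuming a cluster is already running and that the usual hack/e2e.go harness of this era is used (the exact harness flags are an assumption):

```sh
# Sketch: run only the GPU feature tests against an existing cluster.
go run hack/e2e.go -- --test --test_args="--ginkgo.focus=\[Feature:GPU\]"
```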
@@ -18,6 +18,7 @@ package e2e_node
 
 import (
 	"fmt"
+	"os/exec"
 	"time"
 
 	"k8s.io/apimachinery/pkg/api/resource"

@@ -33,11 +34,49 @@ import (
 
 const acceleratorsFeatureGate = "Accelerators=true"
 
+func getGPUsAvailable(f *framework.Framework) int64 {
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	var gpusAvailable int64
+	for _, node := range nodeList.Items {
+		gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
+	}
+	return gpusAvailable
+}
+
+func gpusExistOnAllNodes(f *framework.Framework) bool {
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		if node.Name == "kubernetes-master" {
+			continue
+		}
+		if node.Status.Capacity.NvidiaGPU().Value() == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func checkIfNvidiaGPUsExistOnNode() bool {
+	// Cannot use `lspci` because it is not installed on all distros by default.
+	err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
+	if err != nil {
+		framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
+		return false
+	}
+	return true
+}
+
 // Serial because the test updates kubelet configuration.
 var _ = framework.KubeDescribe("GPU [Serial]", func() {
 	f := framework.NewDefaultFramework("gpu-test")
 	Context("attempt to use GPUs if available", func() {
 		It("setup the node and create pods to test gpus", func() {
+			By("ensuring that Nvidia GPUs exist on the node")
+			if !checkIfNvidiaGPUsExistOnNode() {
+				Skip("Nvidia GPUs do not exist on the node. Skipping test.")
+			}
 			By("ensuring that dynamic kubelet configuration is enabled")
 			enabled, err := isKubeletConfigEnabled(f)
 			framework.ExpectNoError(err)

@@ -65,19 +104,11 @@ var _ = framework.KubeDescribe("GPU [Serial]", func() {
 			}
 			framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
 
-			By("Getting the local node object from the api server")
-			nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
-			framework.ExpectNoError(err, "getting node list")
-			Expect(len(nodeList.Items)).To(Equal(1))
-			node := nodeList.Items[0]
-			gpusAvailable := node.Status.Capacity.NvidiaGPU()
-			By("Skipping the test if GPUs aren't available")
-			if gpusAvailable.IsZero() {
-				Skip("No GPUs available on local node. Skipping test.")
-			}
+			By("Waiting for GPUs to become available on the local node")
+			Eventually(gpusExistOnAllNodes(f), 10*time.Minute, time.Second).Should(BeTrue())
 
 			By("Creating a pod that will consume all GPUs")
-			podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
+			podSuccess := makePod(getGPUsAvailable(f), "gpus-success")
 			podSuccess = f.PodClient().CreateSync(podSuccess)
 
 			By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused")
@@ -0,0 +1,19 @@
#cloud-config

runcmd:
  - mount /tmp /tmp -o remount,exec,suid
  - usermod -a -G docker jenkins
  - mkdir -p /var/lib/kubelet
  - mkdir -p /home/kubernetes/containerized_mounter/rootfs
  - mount --bind /home/kubernetes/containerized_mounter/ /home/kubernetes/containerized_mounter/
  - mount -o remount, exec /home/kubernetes/containerized_mounter/
  - wget https://storage.googleapis.com/kubernetes-release/gci-mounter/mounter.tar -O /tmp/mounter.tar
  - tar xvf /tmp/mounter.tar -C /home/kubernetes/containerized_mounter/rootfs
  - mkdir -p /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --rbind /var/lib/kubelet /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --make-rshared /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --bind /proc /home/kubernetes/containerized_mounter/rootfs/proc
  - mount --bind /dev /home/kubernetes/containerized_mounter/rootfs/dev
  - rm /tmp/mounter.tar
  - modprobe configs
  - docker run -v /dev:/dev -v /home/kubernetes/bin/nvidia:/rootfs/nvidia -v /etc/os-release:/rootfs/etc/os-release -v /proc/sysrq-trigger:/sysrq -e LAKITU_KERNEL_SHA1=2fdf6034a0fae9794d80e4d218e237771224ba8f -e BASE_DIR=/rootfs/nvidia --privileged gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
@@ -25,4 +25,9 @@ images:
   gci:
     image_regex: gci-stable-56-9000-84-2 # docker 1.11.2
     project: google-containers
-    metadata: "user-data<test/e2e_node/jenkins/gci-init.yaml,gci-update-strategy=update_disabled"
+    metadata: "user-data<test/e2e_node/jenkins/gci-init-gpu.yaml,gci-update-strategy=update_disabled"
+    resources:
+      accelerators:
+        - type: nvidia-tesla-k80
+          count: 2
@@ -523,6 +523,7 @@ func createInstance(imageConfig *internalGCEImage) (string, error) {
 				Type: "PERSISTENT",
 				InitializeParams: &compute.AttachedDiskInitializeParams{
 					SourceImage: sourceImage(imageConfig.image, imageConfig.project),
+					DiskSizeGb:  20,
 				},
 			},
 		},
@@ -0,0 +1,24 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvidia/cuda:8.0-devel-ubuntu16.04

RUN apt-get update && apt-get install -y --no-install-recommends \
        cuda-samples-$CUDA_PKG_VERSION && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /usr/local/cuda/samples/0_Simple/vectorAdd
RUN make

CMD ./vectorAdd
@@ -0,0 +1,28 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

.PHONY: all push build

TAG ?= v0.1

REGISTRY ?= gcr.io/google-containers
IMAGE = $(REGISTRY)/cuda-vector-add

build:
	docker build --pull -t $(IMAGE):$(TAG) .

push:
	gcloud docker -- push $(IMAGE):$(TAG)

all: build
@@ -0,0 +1,13 @@
## cuda_vector_add

This is a small CUDA application that performs a simple vector addition. Useful for testing CUDA support in Kubernetes.

## How to release:

```
# Build
$ make

# Push
$ make push
```
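Beyond the make targets, the cluster e2e in this commit consumes the image as a one-GPU pod. A hedged sketch of an equivalent standalone manifest (the GPU resource name and the host library path are taken from the e2e code above; the pod name is arbitrary):

```sh
# Sketch: run the vector-add image as a single-GPU pod, mounting the driver
# libraries that the COS installer publishes on the host.
cat <<'EOF' | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vector-add
spec:
  restartPolicy: Never
  containers:
  - name: vector-addition
    image: gcr.io/google_containers/cuda-vector-add:v0.1
    resources:
      limits:
        alpha.kubernetes.io/nvidia-gpu: 1
    volumeMounts:
    - name: nvidia-libraries
      mountPath: /usr/local/nvidia/lib64
  volumes:
  - name: nvidia-libraries
    hostPath:
      path: /home/kubernetes/bin/nvidia/lib
EOF
```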