mirror of https://github.com/k3s-io/k3s
Adding an installer script that installs Nvidia drivers in Container Optimized OS
Packaged the script as a docker container stored in gcr.io/google-containers. A daemonset deployment is included to make it easy to consume the installer. A cluster e2e test has been added to exercise the installation daemonset and to verify the installation by running a sample CUDA application. The node e2e for GPUs has been updated to avoid running on nodes without GPU devices.

Signed-off-by: Vishnu Kannan <vishnuk@google.com>
parent 95ce463e95
commit 1e77594958
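As a usage sketch (not part of this commit's diff; the manifest path, namespace, and pod label come from the files below, while the GPU resource name is an assumption for this Kubernetes era), the installer daemonset can be applied and checked with kubectl:

```sh
# Sketch: deploy the installer daemonset and watch the driver installation land.
kubectl create -f cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml
kubectl -n kube-system get pods -l name=cos-nvidia-installer -o wide

# After the installer finishes and the kubelet restarts, GPUs should appear in
# node capacity (assumed resource name: alpha.kubernetes.io/nvidia-gpu).
kubectl describe nodes | grep -i nvidia
```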
@@ -32,7 +32,10 @@ filegroup(
 filegroup(
     name = "all-srcs",
-    srcs = [":package-srcs"],
+    srcs = [
+        ":package-srcs",
+        "//cluster/gce/gci/nvidia-gpus:all-srcs",
+    ],
     tags = ["automanaged"],
 )
@@ -68,6 +68,8 @@ fi
 # variable. Also please update corresponding image for node e2e at:
 # https://github.com/kubernetes/kubernetes/blob/master/test/e2e_node/jenkins/image-config.yaml
 CVM_VERSION=${CVM_VERSION:-container-vm-v20170214}
+# NOTE: Update the kernel commit SHA in cluster/addons/nvidia-gpus/cos-installer-daemonset.yaml
+# while updating the COS version here.
 GCI_VERSION=${KUBE_GCI_VERSION:-gci-stable-56-9000-84-2}
 MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-}
 MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers}
@@ -1605,4 +1605,5 @@ else
 fi
 reset-motd
 prepare-mounter-rootfs
+modprobe configs
 echo "Done for the configuration for kubernetes"
@@ -0,0 +1,24 @@
package(default_visibility = ["//visibility:public"])

load("@io_bazel//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
load("@io_kubernetes_build//defs:build.bzl", "release_filegroup")

filegroup(
    name = "sources",
    srcs = glob([
        "**/*",
    ]),
)

filegroup(
    name = "package-srcs",
    srcs = glob(["**"]),
    tags = ["automanaged"],
    visibility = ["//visibility:private"],
)

filegroup(
    name = "all-srcs",
    srcs = [":package-srcs"],
    tags = ["automanaged"],
)
@@ -0,0 +1,28 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM ubuntu:16.04

# Disable prompts from apt
ENV DEBIAN_FRONTEND noninteractive

RUN apt-get -qq update
RUN apt-get install -qq pciutils gcc g++ git make dpkg-dev bc module-init-tools curl

RUN mkdir /lakitu-kernel
RUN git clone https://chromium.googlesource.com/chromiumos/third_party/kernel /lakitu-kernel

ADD installer.sh /usr/bin/nvidia-installer.sh
RUN chmod a+x /usr/bin/nvidia-installer.sh
CMD ["/usr/bin/nvidia-installer.sh"]
@@ -0,0 +1,27 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

TAG?=v0.1
REGISTRY?=gcr.io/google_containers
IMAGE=cos-nvidia-driver-install

all: container

container:
	docker build --pull -t ${REGISTRY}/${IMAGE}:${TAG} .

push:
	gcloud docker -- push ${REGISTRY}/${IMAGE}:${TAG}

.PHONY: all container push
@@ -0,0 +1,57 @@
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: cos-nvidia-installer
  namespace: kube-system
spec:
  template:
    metadata:
      labels:
        name: cos-nvidia-installer
        # Update the version tag here and `LAKITU_KERNEL_SHA1` below when targeting new COS releases.
        cos-version: cos-beta-59-9460-20-0
    spec:
      hostNetwork: true
      hostPID: true
      volumes:
      - name: dev
        hostPath:
          path: /dev
      - name: nvidia-overlay
        hostPath:
          path: /home/kubernetes/bin/nvidia
      - name: os-release
        hostPath:
          path: /etc/os-release
      - name: sysrq
        hostPath:
          path: /proc/sysrq-trigger
      containers:
      - image: gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
        command: ["/bin/sh", "-c"]
        args: ["usr/bin/nvidia-installer.sh && sleep infinity"]
        env:
        - name: BASE_DIR
          value: "/rootfs/nvidia"
        name: nvidia-driver-installer
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        env:
        # The kernel SHA1 here should correspond to the GCI_VERSION specified by default under cluster/gce/config-default.sh
        - name: LAKITU_KERNEL_SHA1
          value: 26481563cb3788ad254c2bf2126b843c161c7e48
        - name: BASE_DIR
          value: "/rootfs/nvidia"
        volumeMounts:
        - name: nvidia-overlay
          mountPath: /rootfs/nvidia
        - name: dev
          mountPath: /dev
        - name: os-release
          mountPath: /rootfs/etc/os-release
        - name: sysrq
          mountPath: /sysrq
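Since the installer runs as a privileged pod on every node, the easiest way to follow its progress is through the container logs; a sketch, with the namespace, label, and container name taken from the manifest above (the pod name placeholder is whatever the daemonset generated):

```sh
# Sketch: locate an installer pod and follow the driver build output.
kubectl -n kube-system get pods -l name=cos-nvidia-installer
kubectl -n kube-system logs <installer-pod-name> -c nvidia-driver-installer -f
```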
@@ -0,0 +1,207 @@
#!/bin/bash

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is for dynamically installing nvidia kernel drivers in Container Optimized OS

set -o errexit
set -o pipefail
set -x

# The script must be run as root.
# Prerequisites:
#
# LAKITU_KERNEL_SHA1 - The env variable is expected to be set to the commit SHA (HEAD) of the kernel source used on the host.
# BASE_DIR - Directory that is mapped to a stateful partition on the host. Defaults to `/rootfs/nvidia`.
#
# The script will output the following artifacts:
# ${BASE_DIR}/lib* --> Nvidia CUDA libraries
# ${BASE_DIR}/bin/* --> Nvidia debug utilities
# ${BASE_DIR}/.cache/* --> Nvidia driver artifacts cached for idempotency.
#

BASE_DIR=${BASE_DIR:-"/rootfs/nvidia"}
CACHE_DIR="${BASE_DIR}/.cache"
USR_WORK_DIR="${CACHE_DIR}/usr-work"
USR_WRITABLE_DIR="${CACHE_DIR}/usr-writable"
LIB_WORK_DIR="${CACHE_DIR}/lib-work"
LIB_WRITABLE_DIR="${CACHE_DIR}/lib-writable"

LIB_OUTPUT_DIR="${BASE_DIR}/lib"
BIN_OUTPUT_DIR="${BASE_DIR}/bin"

KERNEL_SRC_DIR="/lakitu-kernel"
NVIDIA_DRIVER_DIR="/nvidia"
NVIDIA_DRIVER_VERSION="375.26"

# Source: https://developer.nvidia.com/cuda-downloads
NVIDIA_CUDA_URL="https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run"
NVIDIA_CUDA_MD5SUM="33e1bd980e91af4e55f3ef835c103f9b"
NVIDIA_CUDA_PKG_NAME="cuda_8.0.61_375.26_linux.run"
NVIDIA_DRIVER_PKG_NAME="NVIDIA-Linux-x86_64-375.26.run"

check_nvidia_device() {
  lspci
  if ! lspci | grep -i -q NVIDIA; then
    echo "No NVIDIA devices attached to this instance."
    exit 0
  fi
  echo "Found NVIDIA device on this instance."
}

prepare_kernel_source() {
  local kernel_git_repo="https://chromium.googlesource.com/chromiumos/third_party/kernel"
  local kernel_version="$(uname -r)"
  local kernel_version_stripped="$(echo ${kernel_version} | sed 's/\+//')"

  # Checkout the correct tag.
  echo "Downloading kernel source at tag ${kernel_version_stripped} ..."
  pushd "${KERNEL_SRC_DIR}"
  # TODO: Consume KERNEL SHA1 from COS image directly.
  # git checkout "tags/v${kernel_version_stripped}"
  git checkout ${LAKITU_KERNEL_SHA1}

  # Prepare kernel config and source for modules.
  echo "Preparing kernel sources ..."
  zcat "/proc/config.gz" > ".config"
  make olddefconfig
  make modules_prepare
  # Done.
  popd
}

download_install_nvidia() {
  local pkg_name="${NVIDIA_CUDA_PKG_NAME}"
  local url="${NVIDIA_CUDA_URL}"
  local log_file_name="${NVIDIA_DRIVER_DIR}/nvidia-installer.log"

  mkdir -p "${NVIDIA_DRIVER_DIR}"
  pushd "${NVIDIA_DRIVER_DIR}"

  echo "Downloading Nvidia CUDA package from ${url} ..."
  curl -L -s "${url}" -o "${pkg_name}"
  echo "${NVIDIA_CUDA_MD5SUM} ${pkg_name}" | md5sum --check

  echo "Extracting Nvidia CUDA package ..."
  sh ${pkg_name} --extract="$(pwd)"

  echo "Running the Nvidia driver installer ..."
  if ! sh "${NVIDIA_DRIVER_PKG_NAME}" --kernel-source-path="${KERNEL_SRC_DIR}" --silent --accept-license --keep --log-file-name="${log_file_name}"; then
    echo "Nvidia installer failed, log below:"
    echo "==================================="
    tail -50 "${log_file_name}"
    echo "==================================="
    exit 1
  fi
  # Create unified memory device file.
  nvidia-modprobe -c0 -u
  popd
}

unlock_loadpin_and_reboot_if_needed() {
  kernel_cmdline="$(cat /proc/cmdline)"
  if echo "${kernel_cmdline}" | grep -q -v "lsm.module_locking=0"; then
    local -r esp_partition="/dev/sda12"
    local -r mount_path="/tmp/esp"
    local -r grub_cfg="efi/boot/grub.cfg"

    mkdir -p "${mount_path}"
    mount "${esp_partition}" "${mount_path}"

    pushd "${mount_path}"
    cp "${grub_cfg}" "${grub_cfg}.orig"
    sed 's/cros_efi/cros_efi lsm.module_locking=0/g' -i "efi/boot/grub.cfg"
    cat "${grub_cfg}"
    popd
    sync
    umount "${mount_path}"
    # Restart the node for loadpin to be disabled.
    echo b > /sysrq
  fi
}

create_uvm_device() {
  # Create unified memory device file.
  nvidia-modprobe -c0 -u
}

verify_base_image() {
  mount --bind /rootfs/etc/os-release /etc/os-release
  local id="$(grep "^ID=" /etc/os-release)"
  if [[ "${id#*=}" != "cos" ]]; then
    echo "This installer is designed to run on Container-Optimized OS only"
    exit 1
  fi
}

setup_overlay_mounts() {
  mkdir -p ${USR_WRITABLE_DIR} ${USR_WORK_DIR} ${LIB_WRITABLE_DIR} ${LIB_WORK_DIR}
  mount -t overlay -o lowerdir=/usr,upperdir=${USR_WRITABLE_DIR},workdir=${USR_WORK_DIR} none /usr
  mount -t overlay -o lowerdir=/lib,upperdir=${LIB_WRITABLE_DIR},workdir=${LIB_WORK_DIR} none /lib
}

exit_if_install_not_needed() {
  if nvidia-smi; then
    echo "Nvidia drivers already installed. Skipping installation"
    post_installation_sequence
    exit 0
  fi
}

restart_kubelet() {
  echo "Sending SIGTERM to kubelet"
  pkill -SIGTERM kubelet
}

# Copy user space libraries and debug utilities to a special output directory on the host.
# Make these artifacts world readable and executable.
copy_files_to_host() {
  mkdir -p ${LIB_OUTPUT_DIR} ${BIN_OUTPUT_DIR}
  cp -r ${USR_WRITABLE_DIR}/lib/x86_64-linux-gnu/* ${LIB_OUTPUT_DIR}/
  cp -r ${USR_WRITABLE_DIR}/bin/* ${BIN_OUTPUT_DIR}/
  chmod -R a+rx ${LIB_OUTPUT_DIR}
  chmod -R a+rx ${BIN_OUTPUT_DIR}
}

post_installation_sequence() {
  create_uvm_device
  # Copy nvidia user space libraries and debug tools to the host for use from other containers.
  copy_files_to_host
  # Restart the kubelet for it to pick up the GPU devices.
  restart_kubelet
}

main() {
  # Do not run the installer unless the base image is Container Optimized OS (COS)
  verify_base_image
  # Do not run the installer unless an Nvidia device is found on the PCI bus
  check_nvidia_device
  # Setup overlay mounts to capture nvidia driver artifacts in more permanent storage on the host.
  setup_overlay_mounts
  # Disable a critical security feature (LoadPin) in COS so that Nvidia drivers can be loaded dynamically
  unlock_loadpin_and_reboot_if_needed
  # Exit if installation is not required (for idempotency)
  exit_if_install_not_needed
  # Checkout kernel sources appropriate for the base image.
  prepare_kernel_source
  # Download, compile and install nvidia drivers.
  download_install_nvidia
  # Verify that the Nvidia drivers have been successfully installed.
  nvidia-smi
  # Perform post installation steps - copying artifacts, restarting kubelet, etc.
  post_installation_sequence
}

main "$@"
@@ -48,7 +48,8 @@ go-bindata -nometadata -o "${BINDATA_OUTPUT}.tmp" -pkg generated \
     "examples/..." \
     "test/e2e/testing-manifests/..." \
     "test/images/..." \
-    "test/fixtures/..."
+    "test/fixtures/..." \
+    "cluster/gce/gci/nvidia-gpus/..."

 gofmt -s -w "${BINDATA_OUTPUT}.tmp"
@@ -86,6 +86,7 @@ go_library(
         "networking.go",
         "networking_perf.go",
         "nodeoutofdisk.go",
+        "nvidia-gpus.go",
         "pod_gc.go",
         "podpreset.go",
         "pods.go",
@@ -23,6 +23,7 @@ genrule(
     name = "bindata",
     srcs = [
         "//examples:sources",
+        "//cluster/gce/gci/nvidia-gpus:sources",
         "//test/images:sources",
         "//test/fixtures:sources",
         "//test/e2e/testing-manifests:sources",
@@ -0,0 +1,178 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"strings"
	"time"

	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/uuid"
	utilyaml "k8s.io/apimachinery/pkg/util/yaml"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/v1"
	extensions "k8s.io/kubernetes/pkg/apis/extensions/v1beta1"
	"k8s.io/kubernetes/test/e2e/framework"
	"k8s.io/kubernetes/test/e2e/generated"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

const (
	testPodNamePrefix = "nvidia-gpu-"
	testCUDAImage     = "gcr.io/google_containers/cuda-vector-add:v0.1"
	cosOSImage        = "Container-Optimized OS from Google"
	// Nvidia driver installation can take upwards of 5 minutes.
	driverInstallTimeout = 10 * time.Minute
	// Nvidia COS driver installer daemonset.
	cosNvidiaDriverInstallerPath = "cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml"
)

func makeCudaAdditionTestPod() *v1.Pod {
	podName := testPodNamePrefix + string(uuid.NewUUID())
	testPod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				{
					Name:  "vector-addition",
					Image: testCUDAImage,
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							v1.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
						},
					},
					VolumeMounts: []v1.VolumeMount{
						{
							Name:      "nvidia-libraries",
							MountPath: "/usr/local/nvidia/lib64",
						},
					},
				},
			},
			Volumes: []v1.Volume{
				{
					Name: "nvidia-libraries",
					VolumeSource: v1.VolumeSource{
						HostPath: &v1.HostPathVolumeSource{
							Path: "/home/kubernetes/bin/nvidia/lib",
						},
					},
				},
			},
		},
	}
	return testPod
}

func isClusterRunningCOS(f *framework.Framework) bool {
	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for _, node := range nodeList.Items {
		if !strings.Contains(node.Status.NodeInfo.OSImage, cosOSImage) {
			return false
		}
	}
	return true
}

func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool {
	framework.Logf("Getting list of Nodes from API server")
	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for _, node := range nodeList.Items {
		if node.Spec.Unschedulable {
			continue
		}
		if node.Status.Capacity.NvidiaGPU().Value() == 0 {
			framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
			return false
		}
	}
	framework.Logf("Nvidia GPUs exist on all schedulable nodes")
	return true
}

func getGPUsAvailable(f *framework.Framework) int64 {
	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	var gpusAvailable int64
	for _, node := range nodeList.Items {
		gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
	}
	return gpusAvailable
}

func testNvidiaGPUsOnCOS(f *framework.Framework) {
	// Skip the test if the base image is not COS.
	// TODO: Add support for other base images.
	// CUDA apps require host mounts which are not portable across base images (yet).
	framework.Logf("Checking base image")
	if !isClusterRunningCOS(f) {
		Skip("Nvidia GPU tests are supported only on Container Optimized OS image currently")
	}
	framework.Logf("Cluster is running on COS. Proceeding with test")
	// GPU drivers might have already been installed.
	if !areGPUsAvailableOnAllSchedulableNodes(f) {
		// Install Nvidia Drivers.
		ds := dsFromManifest(cosNvidiaDriverInstallerPath)
		ds.Namespace = f.Namespace.Name
		_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
		framework.ExpectNoError(err, "failed to create daemonset")
		framework.Logf("Successfully created daemonset to install Nvidia drivers. Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
		// Wait for Nvidia GPUs to be available on nodes
		Eventually(func() bool {
			return areGPUsAvailableOnAllSchedulableNodes(f)
		}, driverInstallTimeout, time.Second).Should(BeTrue())
	}
	framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app")
	podList := []*v1.Pod{}
	for i := int64(0); i < getGPUsAvailable(f); i++ {
		podList = append(podList, f.PodClient().Create(makeCudaAdditionTestPod()))
	}
	framework.Logf("Wait for all test pods to succeed")
	// Wait for all pods to succeed
	for _, po := range podList {
		f.PodClient().WaitForSuccess(po.Name, 5*time.Minute)
	}
}

// dsFromManifest reads a .json/yaml file and returns the daemonset in it.
func dsFromManifest(fileName string) *extensions.DaemonSet {
	var controller extensions.DaemonSet
	framework.Logf("Parsing ds from %v", fileName)
	data := generated.ReadOrDie(fileName)

	json, err := utilyaml.ToJSON(data)
	Expect(err).NotTo(HaveOccurred())

	Expect(runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)).NotTo(HaveOccurred())
	return &controller
}

var _ = framework.KubeDescribe("[Feature:GPU]", func() {
	f := framework.NewDefaultFramework("gpus")
	It("run Nvidia GPU tests on Container Optimized OS only", func() {
		testNvidiaGPUsOnCOS(f)
	})
})
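The Describe block above tags the suite as [Feature:GPU], so it can be selected through a ginkgo focus. One rough invocation, assuming a cluster is already running and that the usual hack/e2e.go harness of this era is used (the exact harness flags are an assumption):

```sh
# Sketch: run only the GPU feature tests against an existing cluster.
go run hack/e2e.go -- --test --test_args="--ginkgo.focus=\[Feature:GPU\]"
```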
@@ -18,6 +18,7 @@ package e2e_node
 
 import (
 	"fmt"
+	"os/exec"
 	"time"
 
 	"k8s.io/apimachinery/pkg/api/resource"

@@ -33,11 +34,49 @@ import (
 
 const acceleratorsFeatureGate = "Accelerators=true"
 
+func getGPUsAvailable(f *framework.Framework) int64 {
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	var gpusAvailable int64
+	for _, node := range nodeList.Items {
+		gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
+	}
+	return gpusAvailable
+}
+
+func gpusExistOnAllNodes(f *framework.Framework) bool {
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		if node.Name == "kubernetes-master" {
+			continue
+		}
+		if node.Status.Capacity.NvidiaGPU().Value() == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func checkIfNvidiaGPUsExistOnNode() bool {
+	// Cannot use `lspci` because it is not installed on all distros by default.
+	err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
+	if err != nil {
+		framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
+		return false
+	}
+	return true
+}
+
 // Serial because the test updates kubelet configuration.
 var _ = framework.KubeDescribe("GPU [Serial]", func() {
 	f := framework.NewDefaultFramework("gpu-test")
 	Context("attempt to use GPUs if available", func() {
 		It("setup the node and create pods to test gpus", func() {
+			By("ensuring that Nvidia GPUs exist on the node")
+			if !checkIfNvidiaGPUsExistOnNode() {
+				Skip("Nvidia GPUs do not exist on the node. Skipping test.")
+			}
 			By("ensuring that dynamic kubelet configuration is enabled")
 			enabled, err := isKubeletConfigEnabled(f)
 			framework.ExpectNoError(err)

@@ -65,19 +104,11 @@ var _ = framework.KubeDescribe("GPU [Serial]", func() {
 			}
 			framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
 
-			By("Getting the local node object from the api server")
-			nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
-			framework.ExpectNoError(err, "getting node list")
-			Expect(len(nodeList.Items)).To(Equal(1))
-			node := nodeList.Items[0]
-			gpusAvailable := node.Status.Capacity.NvidiaGPU()
-			By("Skipping the test if GPUs aren't available")
-			if gpusAvailable.IsZero() {
-				Skip("No GPUs available on local node. Skipping test.")
-			}
+			By("Waiting for GPUs to become available on the local node")
+			Eventually(gpusExistOnAllNodes(f), 10*time.Minute, time.Second).Should(BeTrue())
 
 			By("Creating a pod that will consume all GPUs")
-			podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
+			podSuccess := makePod(getGPUsAvailable(f), "gpus-success")
 			podSuccess = f.PodClient().CreateSync(podSuccess)
 
 			By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused")
@@ -0,0 +1,19 @@
#cloud-config

runcmd:
  - mount /tmp /tmp -o remount,exec,suid
  - usermod -a -G docker jenkins
  - mkdir -p /var/lib/kubelet
  - mkdir -p /home/kubernetes/containerized_mounter/rootfs
  - mount --bind /home/kubernetes/containerized_mounter/ /home/kubernetes/containerized_mounter/
  - mount -o remount, exec /home/kubernetes/containerized_mounter/
  - wget https://storage.googleapis.com/kubernetes-release/gci-mounter/mounter.tar -O /tmp/mounter.tar
  - tar xvf /tmp/mounter.tar -C /home/kubernetes/containerized_mounter/rootfs
  - mkdir -p /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --rbind /var/lib/kubelet /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --make-rshared /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
  - mount --bind /proc /home/kubernetes/containerized_mounter/rootfs/proc
  - mount --bind /dev /home/kubernetes/containerized_mounter/rootfs/dev
  - rm /tmp/mounter.tar
  - modprobe configs
  - docker run -v /dev:/dev -v /home/kubernetes/bin/nvidia:/rootfs/nvidia -v /etc/os-release:/rootfs/etc/os-release -v /proc/sysrq-trigger:/sysrq -e LAKITU_KERNEL_SHA1=2fdf6034a0fae9794d80e4d218e237771224ba8f -e BASE_DIR=/rootfs/nvidia --privileged gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
@@ -25,4 +25,9 @@ images:
   gci:
     image_regex: gci-stable-56-9000-84-2 # docker 1.11.2
     project: google-containers
-    metadata: "user-data<test/e2e_node/jenkins/gci-init.yaml,gci-update-strategy=update_disabled"
+    metadata: "user-data<test/e2e_node/jenkins/gci-init-gpu.yaml,gci-update-strategy=update_disabled"
+    resources:
+      accelerators:
+        - type: nvidia-tesla-k80
+          count: 2
@@ -523,6 +523,7 @@ func createInstance(imageConfig *internalGCEImage) (string, error) {
 				Type: "PERSISTENT",
 				InitializeParams: &compute.AttachedDiskInitializeParams{
 					SourceImage: sourceImage(imageConfig.image, imageConfig.project),
+					DiskSizeGb:  20,
 				},
 			},
 		},
@@ -0,0 +1,24 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvidia/cuda:8.0-devel-ubuntu16.04

RUN apt-get update && apt-get install -y --no-install-recommends \
        cuda-samples-$CUDA_PKG_VERSION && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /usr/local/cuda/samples/0_Simple/vectorAdd
RUN make

CMD ./vectorAdd
@@ -0,0 +1,28 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

.PHONY: all push build

TAG ?= v0.1

REGISTRY ?= gcr.io/google-containers
IMAGE = $(REGISTRY)/cuda-vector-add

build:
	docker build --pull -t $(IMAGE):$(TAG) .

push:
	gcloud docker -- push $(IMAGE):$(TAG)

all: build
@@ -0,0 +1,13 @@
## cuda_vector_add

This is a small CUDA application that performs a simple vector addition. Useful for testing CUDA support in Kubernetes.

## How to release:

```
# Build
$ make

# Push
$ make push
```
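Beyond the make targets, the cluster e2e in this commit consumes the image as a one-GPU pod. A hedged sketch of an equivalent standalone manifest (the GPU resource name and the host library path are taken from the e2e code above; the pod name is arbitrary):

```sh
# Sketch: run the vector-add image as a single-GPU pod, mounting the driver
# libraries that the COS installer publishes on the host.
cat <<'EOF' | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vector-add
spec:
  restartPolicy: Never
  containers:
  - name: vector-addition
    image: gcr.io/google_containers/cuda-vector-add:v0.1
    resources:
      limits:
        alpha.kubernetes.io/nvidia-gpu: 1
    volumeMounts:
    - name: nvidia-libraries
      mountPath: /usr/local/nvidia/lib64
  volumes:
  - name: nvidia-libraries
    hostPath:
      path: /home/kubernetes/bin/nvidia/lib
EOF
```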