mirror of https://github.com/k3s-io/k3s
Adding an installer script that installs Nvidia drivers in Container Optimized OS.

Packaged the script as a docker container stored in gcr.io/google-containers.
A daemonset deployment is included to make it easy to consume the installer.
A cluster e2e test has been added to exercise the installation daemonset and
to verify the installation by running a sample CUDA application. The node e2e
for GPUs has been updated to avoid running on nodes without GPU devices.

Signed-off-by: Vishnu kannan <vishnuk@google.com>
parent 95ce463e95
commit 1e77594958
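For orientation, the intended consumption flow is roughly the following sketch (assuming a GCE cluster whose nodes run COS with Nvidia devices attached; the manifest path and namespace are the ones added by this commit, and the alpha-era GPU resource name `alpha.kubernetes.io/nvidia-gpu` is assumed):

```
# Deploy the driver installer daemonset (it is namespaced to kube-system).
kubectl create -f cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml

# After the installers finish and the kubelets restart, GPUs should appear
# in node capacity.
kubectl get nodes -o yaml | grep "alpha.kubernetes.io/nvidia-gpu"
```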
@@ -32,7 +32,10 @@ filegroup(
 filegroup(
     name = "all-srcs",
-    srcs = [":package-srcs"],
+    srcs = [
+        ":package-srcs",
+        "//cluster/gce/gci/nvidia-gpus:all-srcs",
+    ],
     tags = ["automanaged"],
 )
@@ -68,6 +68,8 @@ fi
 # variable. Also please update corresponding image for node e2e at:
 # https://github.com/kubernetes/kubernetes/blob/master/test/e2e_node/jenkins/image-config.yaml
 CVM_VERSION=${CVM_VERSION:-container-vm-v20170214}
+# NOTE: Update the kernel commit SHA in cluster/addons/nvidia-gpus/cos-installer-daemonset.yaml
+# while updating the COS version here.
 GCI_VERSION=${KUBE_GCI_VERSION:-gci-stable-56-9000-84-2}
 MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-}
 MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers}
@@ -1605,4 +1605,5 @@ else
 fi
 reset-motd
 prepare-mounter-rootfs
+modprobe configs
 echo "Done for the configuration for kubernetes"
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:public"])
+
+load("@io_bazel//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
+load("@io_kubernetes_build//defs:build.bzl", "release_filegroup")
+
+filegroup(
+    name = "sources",
+    srcs = glob([
+        "**/*",
+    ]),
+)
+
+filegroup(
+    name = "package-srcs",
+    srcs = glob(["**"]),
+    tags = ["automanaged"],
+    visibility = ["//visibility:private"],
+)
+
+filegroup(
+    name = "all-srcs",
+    srcs = [":package-srcs"],
+    tags = ["automanaged"],
+)
@@ -0,0 +1,28 @@
+# Copyright 2017 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM ubuntu:16.04
+
+# Disable prompts from apt
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get -qq update
+RUN apt-get install -qq pciutils gcc g++ git make dpkg-dev bc module-init-tools curl
+
+RUN mkdir /lakitu-kernel
+RUN git clone https://chromium.googlesource.com/chromiumos/third_party/kernel /lakitu-kernel
+
+ADD installer.sh /usr/bin/nvidia-installer.sh
+RUN chmod a+x /usr/bin/nvidia-installer.sh
+CMD ["/usr/bin/nvidia-installer.sh"]
@@ -0,0 +1,27 @@
+# Copyright 2017 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+TAG?=v0.1
+REGISTRY?=gcr.io/google_containers
+IMAGE=cos-nvidia-driver-install
+
+all: container
+
+container:
+	docker build --pull -t ${REGISTRY}/${IMAGE}:${TAG} .
+
+push:
+	gcloud docker -- push ${REGISTRY}/${IMAGE}:${TAG}
+
+.PHONY: all container push
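Both image settings above are overridable `?=` defaults, so releasing under a different tag or registry is a one-liner. A usage sketch, not part of the commit (the registry name below is a placeholder):

```
# Build and push the installer image under a custom tag and registry.
make container push TAG=v0.2 REGISTRY=gcr.io/my-project
```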
@@ -0,0 +1,53 @@
+apiVersion: extensions/v1beta1
+kind: DaemonSet
+metadata:
+  name: cos-nvidia-installer
+  namespace: kube-system
+spec:
+  template:
+    metadata:
+      labels:
+        name: cos-nvidia-installer
+        # Update the version tag here and `LAKITU_KERNEL_SHA1` while using against new COS releases.
+        cos-version: cos-beta-59-9460-20-0
+    spec:
+      hostNetwork: true
+      hostPID: true
+      volumes:
+      - name: dev
+        hostPath:
+          path: /dev
+      - name: nvidia-overlay
+        hostPath:
+          path: /home/kubernetes/bin/nvidia
+      - name: os-release
+        hostPath:
+          path: /etc/os-release
+      - name: sysrq
+        hostPath:
+          path: /proc/sysrq-trigger
+      containers:
+      - name: nvidia-driver-installer
+        image: gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
+        command: ["/bin/sh", "-c"]
+        args: ["/usr/bin/nvidia-installer.sh && sleep infinity"]
+        resources:
+          requests:
+            cpu: 0.15
+        securityContext:
+          privileged: true
+        env:
+        # The kernel SHA1 here should correspond to the GCI_VERSION specified by default under cluster/gce/config-default.sh
+        - name: LAKITU_KERNEL_SHA1
+          value: 26481563cb3788ad254c2bf2126b843c161c7e48
+        - name: BASE_DIR
+          value: "/rootfs/nvidia"
+        volumeMounts:
+        - name: nvidia-overlay
+          mountPath: /rootfs/nvidia
+        - name: dev
+          mountPath: /dev
+        - name: os-release
+          mountPath: /rootfs/etc/os-release
+        - name: sysrq
+          mountPath: /sysrq
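A hedged sketch of watching this daemonset roll out, using the namespace and pod-template label defined above (the pod name is illustrative):

```
# Each installer pod runs nvidia-installer.sh once and then sleeps.
kubectl get pods -n kube-system -l name=cos-nvidia-installer -w

# Follow a particular installer's progress.
kubectl logs -n kube-system cos-nvidia-installer-xxxxx -f
```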
@@ -0,0 +1,207 @@
+#!/bin/bash
+
+# Copyright 2017 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script is for dynamically installing nvidia kernel drivers in Container Optimized OS.
+
+set -o errexit
+set -o pipefail
+set -x
+
+# The script must be run as root.
+# Prerequisites:
+#
+# LAKITU_KERNEL_SHA1 - The env variable is expected to be set to HEAD of the kernel version used on the host.
+# BASE_DIR - Directory that is mapped to a stateful partition on host. Defaults to `/rootfs/nvidia`.
+#
+# The script will output the following artifacts:
+# ${BASE_DIR}/lib* --> Nvidia CUDA libraries
+# ${BASE_DIR}/bin/* --> Nvidia debug utilities
+# ${BASE_DIR}/.cache/* --> Nvidia driver artifacts cached for idempotency.
+#
+
+BASE_DIR=${BASE_DIR:-"/rootfs/nvidia"}
+CACHE_DIR="${BASE_DIR}/.cache"
+USR_WORK_DIR="${CACHE_DIR}/usr-work"
+USR_WRITABLE_DIR="${CACHE_DIR}/usr-writable"
+LIB_WORK_DIR="${CACHE_DIR}/lib-work"
+LIB_WRITABLE_DIR="${CACHE_DIR}/lib-writable"
+
+LIB_OUTPUT_DIR="${BASE_DIR}/lib"
+BIN_OUTPUT_DIR="${BASE_DIR}/bin"
+
+KERNEL_SRC_DIR="/lakitu-kernel"
+NVIDIA_DRIVER_DIR="/nvidia"
+NVIDIA_DRIVER_VERSION="375.26"
+
+# Source: https://developer.nvidia.com/cuda-downloads
+NVIDIA_CUDA_URL="https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run"
+NVIDIA_CUDA_MD5SUM="33e1bd980e91af4e55f3ef835c103f9b"
+NVIDIA_CUDA_PKG_NAME="cuda_8.0.61_375.26_linux.run"
+NVIDIA_DRIVER_PKG_NAME="NVIDIA-Linux-x86_64-375.26.run"
+
+check_nvidia_device() {
+    lspci
+    if ! lspci | grep -i -q NVIDIA; then
+        echo "No NVIDIA devices attached to this instance."
+        exit 0
+    fi
+    echo "Found NVIDIA device on this instance."
+}
+
+prepare_kernel_source() {
+    local kernel_git_repo="https://chromium.googlesource.com/chromiumos/third_party/kernel"
+    local kernel_version="$(uname -r)"
+    local kernel_version_stripped="$(echo ${kernel_version} | sed 's/\+//')"
+
+    # Checkout the correct tag.
+    echo "Downloading kernel source at tag ${kernel_version_stripped} ..."
+    pushd "${KERNEL_SRC_DIR}"
+    # TODO: Consume KERNEL SHA1 from COS image directly.
+    # git checkout "tags/v${kernel_version_stripped}"
+    git checkout ${LAKITU_KERNEL_SHA1}
+
+    # Prepare kernel config and source for modules.
+    echo "Preparing kernel sources ..."
+    zcat "/proc/config.gz" > ".config"
+    make olddefconfig
+    make modules_prepare
+    # Done.
+    popd
+}
+
+download_install_nvidia() {
+    local pkg_name="${NVIDIA_CUDA_PKG_NAME}"
+    local url="${NVIDIA_CUDA_URL}"
+    local log_file_name="${NVIDIA_DRIVER_DIR}/nvidia-installer.log"
+
+    mkdir -p "${NVIDIA_DRIVER_DIR}"
+    pushd "${NVIDIA_DRIVER_DIR}"
+
+    echo "Downloading Nvidia CUDA package from ${url} ..."
+    curl -L -s "${url}" -o "${pkg_name}"
+    echo "${NVIDIA_CUDA_MD5SUM}  ${pkg_name}" | md5sum --check
+
+    echo "Extracting Nvidia CUDA package ..."
+    sh ${pkg_name} --extract="$(pwd)"
+
+    echo "Running the Nvidia driver installer ..."
+    if ! sh "${NVIDIA_DRIVER_PKG_NAME}" --kernel-source-path="${KERNEL_SRC_DIR}" --silent --accept-license --keep --log-file-name="${log_file_name}"; then
+        echo "Nvidia installer failed, log below:"
+        echo "==================================="
+        tail -50 "${log_file_name}"
+        echo "==================================="
+        exit 1
+    fi
+    # Create unified memory device file.
+    nvidia-modprobe -c0 -u
+    popd
+}
+
+unlock_loadpin_and_reboot_if_needed() {
+    kernel_cmdline="$(cat /proc/cmdline)"
+    if ! echo "${kernel_cmdline}" | grep -q "lsm.module_locking=0"; then
+        local -r esp_partition="/dev/sda12"
+        local -r mount_path="/tmp/esp"
+        local -r grub_cfg="efi/boot/grub.cfg"
+
+        mkdir -p "${mount_path}"
+        mount "${esp_partition}" "${mount_path}"
+
+        pushd "${mount_path}"
+        cp "${grub_cfg}" "${grub_cfg}.orig"
+        sed 's/cros_efi/cros_efi lsm.module_locking=0/g' -i "efi/boot/grub.cfg"
+        cat "${grub_cfg}"
+        popd
+        sync
+        umount "${mount_path}"
+        # Restart the node for loadpin to be disabled.
+        echo b > /sysrq
+    fi
+}
+
+create_uvm_device() {
+    # Create unified memory device file.
+    nvidia-modprobe -c0 -u
+}
+
+verify_base_image() {
+    mount --bind /rootfs/etc/os-release /etc/os-release
+    local id="$(grep "^ID=" /etc/os-release)"
+    if [[ "${id#*=}" != "cos" ]]; then
+        echo "This installer is designed to run on Container-Optimized OS only"
+        exit 1
+    fi
+}
+
+setup_overlay_mounts() {
+    mkdir -p ${USR_WRITABLE_DIR} ${USR_WORK_DIR} ${LIB_WRITABLE_DIR} ${LIB_WORK_DIR}
+    mount -t overlay -o lowerdir=/usr,upperdir=${USR_WRITABLE_DIR},workdir=${USR_WORK_DIR} none /usr
+    mount -t overlay -o lowerdir=/lib,upperdir=${LIB_WRITABLE_DIR},workdir=${LIB_WORK_DIR} none /lib
+}
+
+exit_if_install_not_needed() {
+    if nvidia-smi; then
+        echo "nvidia drivers already installed. Skipping installation"
+        post_installation_sequence
+        exit 0
+    fi
+}
+
+restart_kubelet() {
+    echo "Sending SIGTERM to kubelet"
+    pkill -SIGTERM kubelet
+}
+
+# Copy user space libraries and debug utilities to a special output directory on the host.
+# Make these artifacts world readable and executable.
+copy_files_to_host() {
+    mkdir -p ${LIB_OUTPUT_DIR} ${BIN_OUTPUT_DIR}
+    cp -r ${USR_WRITABLE_DIR}/lib/x86_64-linux-gnu/* ${LIB_OUTPUT_DIR}/
+    cp -r ${USR_WRITABLE_DIR}/bin/* ${BIN_OUTPUT_DIR}/
+    chmod -R a+rx ${LIB_OUTPUT_DIR}
+    chmod -R a+rx ${BIN_OUTPUT_DIR}
+}
+
+post_installation_sequence() {
+    create_uvm_device
+    # Copy nvidia user space libraries and debug tools to the host for use from other containers.
+    copy_files_to_host
+    # Restart the kubelet for it to pick up the GPU devices.
+    restart_kubelet
+}
+
+main() {
+    # Do not run the installer unless the base image is Container Optimized OS (COS)
+    verify_base_image
+    # Do not run the installer unless a Nvidia device is found on the PCI bus
+    check_nvidia_device
+    # Setup overlay mounts to capture nvidia driver artifacts in a more permanent storage on the host.
+    setup_overlay_mounts
+    # Disable a critical security feature in COS that will allow for dynamically loading Nvidia drivers
+    unlock_loadpin_and_reboot_if_needed
+    # Exit if installation is not required (for idempotency)
+    exit_if_install_not_needed
+    # Checkout kernel sources appropriate for the base image.
+    prepare_kernel_source
+    # Download, compile and install nvidia drivers.
+    download_install_nvidia
+    # Verify that the Nvidia drivers have been successfully installed.
+    nvidia-smi
+    # Perform post installation steps - copying artifacts, restarting kubelet, etc.
+    post_installation_sequence
+}
+
+main "$@"
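Once the script succeeds, the artifacts it copies out via `copy_files_to_host` land on the host side of the daemonset's `nvidia-overlay` volume. A rough host-side sanity check (paths assume the default `BASE_DIR=/rootfs/nvidia`, which the daemonset maps to `/home/kubernetes/bin/nvidia`):

```
# User space CUDA libraries copied out by copy_files_to_host().
ls /home/kubernetes/bin/nvidia/lib

# Debug utilities; nvidia-smi should enumerate the attached GPUs.
/home/kubernetes/bin/nvidia/bin/nvidia-smi
```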
@@ -48,7 +48,8 @@ go-bindata -nometadata -o "${BINDATA_OUTPUT}.tmp" -pkg generated \
     "examples/..." \
     "test/e2e/testing-manifests/..." \
     "test/images/..." \
-    "test/fixtures/..."
+    "test/fixtures/..." \
+    "cluster/gce/gci/nvidia-gpus/..."
 
 gofmt -s -w "${BINDATA_OUTPUT}.tmp"
 
@@ -86,6 +86,7 @@ go_library(
         "networking.go",
         "networking_perf.go",
         "nodeoutofdisk.go",
+        "nvidia-gpus.go",
         "pod_gc.go",
         "podpreset.go",
         "pods.go",
@@ -23,6 +23,7 @@ genrule(
     name = "bindata",
     srcs = [
         "//examples:sources",
+        "//cluster/gce/gci/nvidia-gpus:sources",
         "//test/images:sources",
         "//test/fixtures:sources",
         "//test/e2e/testing-manifests:sources",
@@ -0,0 +1,178 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"strings"
+	"time"
+
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/uuid"
+	utilyaml "k8s.io/apimachinery/pkg/util/yaml"
+	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/v1"
+	extensions "k8s.io/kubernetes/pkg/apis/extensions/v1beta1"
+	"k8s.io/kubernetes/test/e2e/framework"
+	"k8s.io/kubernetes/test/e2e/generated"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+const (
+	testPodNamePrefix = "nvidia-gpu-"
+	testCUDAImage     = "gcr.io/google_containers/cuda-vector-add:v0.1"
+	cosOSImage        = "Container-Optimized OS from Google"
+	// Nvidia driver installation can take upwards of 5 minutes.
+	driverInstallTimeout = 10 * time.Minute
+	// Nvidia COS driver installer daemonset.
+	cosNvidiaDriverInstallerPath = "cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml"
+)
+
+func makeCudaAdditionTestPod() *v1.Pod {
+	podName := testPodNamePrefix + string(uuid.NewUUID())
+	testPod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: podName,
+		},
+		Spec: v1.PodSpec{
+			RestartPolicy: v1.RestartPolicyNever,
+			Containers: []v1.Container{
+				{
+					Name:  "vector-addition",
+					Image: testCUDAImage,
+					Resources: v1.ResourceRequirements{
+						Limits: v1.ResourceList{
+							v1.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
+						},
+					},
+					VolumeMounts: []v1.VolumeMount{
+						{
+							Name:      "nvidia-libraries",
+							MountPath: "/usr/local/nvidia/lib64",
+						},
+					},
+				},
+			},
+			Volumes: []v1.Volume{
+				{
+					Name: "nvidia-libraries",
+					VolumeSource: v1.VolumeSource{
+						HostPath: &v1.HostPathVolumeSource{
+							Path: "/home/kubernetes/bin/nvidia/lib",
+						},
+					},
+				},
+			},
+		},
+	}
+	return testPod
+}
+
+func isClusterRunningCOS(f *framework.Framework) bool {
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		if !strings.Contains(node.Status.NodeInfo.OSImage, cosOSImage) {
+			return false
+		}
+	}
+	return true
+}
+
+func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool {
+	framework.Logf("Getting list of Nodes from API server")
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		if node.Spec.Unschedulable {
+			continue
+		}
+		if node.Status.Capacity.NvidiaGPU().Value() == 0 {
+			framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
+			return false
+		}
+	}
+	framework.Logf("Nvidia GPUs exist on all schedulable nodes")
+	return true
+}
+
+func getGPUsAvailable(f *framework.Framework) int64 {
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	var gpusAvailable int64
+	for _, node := range nodeList.Items {
+		gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
+	}
+	return gpusAvailable
+}
+
+func testNvidiaGPUsOnCOS(f *framework.Framework) {
+	// Skip the test if the base image is not COS.
+	// TODO: Add support for other base images.
+	// CUDA apps require host mounts which is not portable across base images (yet).
+	framework.Logf("Checking base image")
+	if !isClusterRunningCOS(f) {
+		Skip("Nvidia GPU tests are supported only on Container Optimized OS image currently")
+	}
+	framework.Logf("Cluster is running on COS. Proceeding with test")
+	// GPU drivers might have already been installed.
+	if !areGPUsAvailableOnAllSchedulableNodes(f) {
+		// Install Nvidia Drivers.
+		ds := dsFromManifest(cosNvidiaDriverInstallerPath)
+		ds.Namespace = f.Namespace.Name
+		_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
+		framework.ExpectNoError(err, "failed to create daemonset")
+		framework.Logf("Successfully created daemonset to install Nvidia drivers. Waiting for drivers to be installed and GPUs to be available in Node Capacity...")
+		// Wait for Nvidia GPUs to be available on nodes
+		Eventually(func() bool {
+			return areGPUsAvailableOnAllSchedulableNodes(f)
+		}, driverInstallTimeout, time.Second).Should(BeTrue())
+	}
+	framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app")
+	podList := []*v1.Pod{}
+	for i := int64(0); i < getGPUsAvailable(f); i++ {
+		podList = append(podList, f.PodClient().Create(makeCudaAdditionTestPod()))
+	}
+	framework.Logf("Wait for all test pods to succeed")
+	// Wait for all pods to succeed
+	for _, po := range podList {
+		f.PodClient().WaitForSuccess(po.Name, 5*time.Minute)
+	}
+}
+
+// dsFromManifest reads a .json/yaml file and returns the daemonset in it.
+func dsFromManifest(fileName string) *extensions.DaemonSet {
+	var controller extensions.DaemonSet
+	framework.Logf("Parsing ds from %v", fileName)
+	data := generated.ReadOrDie(fileName)
+
+	json, err := utilyaml.ToJSON(data)
+	Expect(err).NotTo(HaveOccurred())
+
+	Expect(runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)).NotTo(HaveOccurred())
+	return &controller
+}
+
+var _ = framework.KubeDescribe("[Feature:GPU]", func() {
+	f := framework.NewDefaultFramework("gpus")
+	It("run Nvidia GPU tests on Container Optimized OS only", func() {
+		testNvidiaGPUsOnCOS(f)
+	})
+})
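The suite is gated behind the `[Feature:GPU]` tag, so it only runs when focused explicitly. A hedged invocation sketch (the exact e2e runner flags varied across releases of this era):

```
# Run only the GPU cluster e2e against an existing cluster.
go run hack/e2e.go -v --test --test_args="--ginkgo.focus=\[Feature:GPU\]"
```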
@@ -18,6 +18,7 @@ package e2e_node
 
 import (
 	"fmt"
+	"os/exec"
 	"time"
 
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -33,11 +34,49 @@ import (
 
 const acceleratorsFeatureGate = "Accelerators=true"
 
+func getGPUsAvailable(f *framework.Framework) int64 {
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	var gpusAvailable int64
+	for _, node := range nodeList.Items {
+		gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
+	}
+	return gpusAvailable
+}
+
+func gpusExistOnAllNodes(f *framework.Framework) bool {
+	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
+	framework.ExpectNoError(err, "getting node list")
+	for _, node := range nodeList.Items {
+		if node.Name == "kubernetes-master" {
+			continue
+		}
+		if node.Status.Capacity.NvidiaGPU().Value() == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func checkIfNvidiaGPUsExistOnNode() bool {
+	// Cannot use `lspci` because it is not installed on all distros by default.
+	err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
+	if err != nil {
+		framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
+		return false
+	}
+	return true
+}
+
 // Serial because the test updates kubelet configuration.
 var _ = framework.KubeDescribe("GPU [Serial]", func() {
 	f := framework.NewDefaultFramework("gpu-test")
 	Context("attempt to use GPUs if available", func() {
 		It("setup the node and create pods to test gpus", func() {
+			By("ensuring that Nvidia GPUs exist on the node")
+			if !checkIfNvidiaGPUsExistOnNode() {
+				Skip("Nvidia GPUs do not exist on the node. Skipping test.")
+			}
 			By("ensuring that dynamic kubelet configuration is enabled")
 			enabled, err := isKubeletConfigEnabled(f)
 			framework.ExpectNoError(err)
@@ -65,19 +104,11 @@ var _ = framework.KubeDescribe("GPU [Serial]", func() {
 			}
 			framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
 
-			By("Getting the local node object from the api server")
-			nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
-			framework.ExpectNoError(err, "getting node list")
-			Expect(len(nodeList.Items)).To(Equal(1))
-			node := nodeList.Items[0]
-			gpusAvailable := node.Status.Capacity.NvidiaGPU()
-			By("Skipping the test if GPUs aren't available")
-			if gpusAvailable.IsZero() {
-				Skip("No GPUs available on local node. Skipping test.")
-			}
+			By("Waiting for GPUs to become available on the local node")
+			Eventually(func() bool { return gpusExistOnAllNodes(f) }, 10*time.Minute, time.Second).Should(BeTrue())
 
 			By("Creating a pod that will consume all GPUs")
-			podSuccess := makePod(gpusAvailable.Value(), "gpus-success")
+			podSuccess := makePod(getGPUsAvailable(f), "gpus-success")
 			podSuccess = f.PodClient().CreateSync(podSuccess)
 
 			By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused")
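The `checkIfNvidiaGPUsExistOnNode` probe above avoids `lspci` by scanning sysfs for PCI vendor `0x10de`, which is Nvidia's PCI vendor ID. The same check can be run by hand on a node:

```
# Exit status 0 iff some PCI device reports Nvidia's vendor ID.
find /sys/devices/pci* -type f | grep vendor | xargs cat | grep -q 0x10de \
  && echo "NVIDIA GPU present"
```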
@@ -0,0 +1,19 @@
+#cloud-config
+
+runcmd:
+  - mount /tmp /tmp -o remount,exec,suid
+  - usermod -a -G docker jenkins
+  - mkdir -p /var/lib/kubelet
+  - mkdir -p /home/kubernetes/containerized_mounter/rootfs
+  - mount --bind /home/kubernetes/containerized_mounter/ /home/kubernetes/containerized_mounter/
+  - mount -o remount,exec /home/kubernetes/containerized_mounter/
+  - wget https://storage.googleapis.com/kubernetes-release/gci-mounter/mounter.tar -O /tmp/mounter.tar
+  - tar xvf /tmp/mounter.tar -C /home/kubernetes/containerized_mounter/rootfs
+  - mkdir -p /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
+  - mount --rbind /var/lib/kubelet /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
+  - mount --make-rshared /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet
+  - mount --bind /proc /home/kubernetes/containerized_mounter/rootfs/proc
+  - mount --bind /dev /home/kubernetes/containerized_mounter/rootfs/dev
+  - rm /tmp/mounter.tar
+  - modprobe configs
+  - docker run -v /dev:/dev -v /home/kubernetes/bin/nvidia:/rootfs/nvidia -v /etc/os-release:/rootfs/etc/os-release -v /proc/sysrq-trigger:/sysrq -e LAKITU_KERNEL_SHA1=2fdf6034a0fae9794d80e4d218e237771224ba8f -e BASE_DIR=/rootfs/nvidia --privileged gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
@@ -25,4 +25,9 @@ images:
   gci:
     image_regex: gci-stable-56-9000-84-2 # docker 1.11.2
     project: google-containers
-    metadata: "user-data<test/e2e_node/jenkins/gci-init.yaml,gci-update-strategy=update_disabled"
+    metadata: "user-data<test/e2e_node/jenkins/gci-init-gpu.yaml,gci-update-strategy=update_disabled"
+    resources:
+      accelerators:
+      - type: nvidia-tesla-k80
+        count: 2
+
@@ -523,6 +523,7 @@ func createInstance(imageConfig *internalGCEImage) (string, error) {
 			Type: "PERSISTENT",
 			InitializeParams: &compute.AttachedDiskInitializeParams{
 				SourceImage: sourceImage(imageConfig.image, imageConfig.project),
+				DiskSizeGb:  20,
 			},
 		},
 	},
@@ -0,0 +1,24 @@
+# Copyright 2017 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM nvidia/cuda:8.0-devel-ubuntu16.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        cuda-samples-$CUDA_PKG_VERSION && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /usr/local/cuda/samples/0_Simple/vectorAdd
+RUN make
+
+CMD ./vectorAdd
@@ -0,0 +1,28 @@
+# Copyright 2017 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.PHONY: all push build
+
+TAG ?= v0.1
+
+REGISTRY ?= gcr.io/google-containers
+IMAGE = $(REGISTRY)/cuda-vector-add
+
+build:
+	docker build --pull -t $(IMAGE):$(TAG) .
+
+push:
+	gcloud docker -- push $(IMAGE):$(TAG)
+
+all: build
@@ -0,0 +1,13 @@
+## cuda_vector_add
+
+This is a small CUDA application that performs a simple vector addition. Useful for testing CUDA support in Kubernetes.
+
+## How to release:
+
+```
+# Build
+$ make
+
+# Push
+$ make push
+```