Merge pull request #45130 from vishh/gpu-cluster-setup

Automatic merge from submit-queue (batch tested with PRs 44830, 45130)

Adding support for Accelerators to GCE clusters.

```release-note
Create clusters with GPUs in GKE by specifying "type=<gpu-type>,count=<gpu-count>" to NODE_ACCELERATORS env var.
List of available GPUs - https://cloud.google.com/compute/docs/gpus/#introduction
```
pull/6/head
Kubernetes Submit Queue 2017-05-05 15:39:32 -07:00 committed by GitHub
commit d4f92711a1
3 changed files with 30 additions and 1 deletions

View File

@ -32,6 +32,9 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB}
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
# Accelerators to be attached to each node. Format "type=<accelerator-type>,count=<accelerator-count>"
# More information on available GPUs here - https://cloud.google.com/compute/docs/gpus/
NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
PREEMPTIBLE_MASTER=${PREEMPTIBLE_MASTER:-false}
@ -55,6 +58,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then
NODE_OS_DISTRIBUTION="gci"
fi
# GPUs supported in GCE do not have compatible drivers in Debian 7.
if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then
NODE_ACCELERATORS=""
fi
# By default a cluster will be started with the master on GCI and nodes on
# containervm. If you are updating the containervm version, update this
# variable. Also please update corresponding image for node e2e at:
@ -135,6 +143,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
# Optional: set feature gates
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
FEATURE_GATES="${FEATURE_GATES},Accelerators=true"
fi
# Optional: Install cluster DNS.
ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"
DNS_SERVER_IP="${KUBE_DNS_SERVER_IP:-10.0.0.10}"

View File

@ -32,6 +32,7 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB}
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
KUBE_APISERVER_REQUEST_TIMEOUT=300
PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
@ -56,6 +57,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then
NODE_OS_DISTRIBUTION="gci"
fi
# GPUs supported in GCE do not have compatible drivers in Debian 7.
if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then
NODE_ACCELERATORS=""
fi
# By default a cluster will be started with the master on GCI and nodes on
# containervm. If you are updating the containervm version, update this
# variable. Also please update corresponding image for node e2e at:
@ -91,6 +97,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
# Optional: set feature gates
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
FEATURE_GATES="${FEATURE_GATES},Accelerators=true"
fi
TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}
# Extra docker options for nodes.

View File

@ -527,10 +527,16 @@ function create-node-template() {
"${IP_ALIAS_SUBNETWORK:-}" \
"${IP_ALIAS_SIZE:-}")
local accelerator_args=""
# VMs with Accelerators cannot be live migrated.
# More details here - https://cloud.google.com/compute/docs/gpus/add-gpus#create-new-gpu-instance
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
accelerator_args="--maintenance-policy TERMINATE --restart-on-failure --accelerator ${NODE_ACCELERATORS}"
fi
local attempt=1
while true; do
echo "Attempt ${attempt} to create ${1}" >&2
if ! ${gcloud} compute instance-templates create \
if ! ${gcloud} beta compute instance-templates create \
"$template_name" \
--project "${PROJECT}" \
--machine-type "${NODE_SIZE}" \
@ -539,6 +545,7 @@ function create-node-template() {
--image-project="${NODE_IMAGE_PROJECT}" \
--image "${NODE_IMAGE}" \
--tags "${NODE_TAG}" \
${accelerator_args} \
${local_ssds} \
--region "${REGION}" \
${network} \