Merge pull request #45130 from vishh/gpu-cluster-setup

Automatic merge from submit-queue (batch tested with PRs 44830, 45130)

Adding support for Accelerators to GCE clusters.

```release-note
Create clusters with GPUs in GCE by specifying "type=<gpu-type>,count=<gpu-count>" in the NODE_ACCELERATORS env var. List of available GPUs: https://cloud.google.com/compute/docs/gpus/#introduction
```
2017-05-05 15:39:32 -07:00 · 2017-05-05 15:39:32 -07:00 · d4f92711a1
parent 17d33ea82e e74d4a0d68
commit d4f92711a1
3 changed files with 30 additions and 1 deletions
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@ -32,6 +32,9 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB}
 NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
 NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
 NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
+# Accelerators to be attached to each node, in the format "type=<accelerator-type>,count=<accelerator-count>".
+# More information on available GPUs: https://cloud.google.com/compute/docs/gpus/
+NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
 REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
 PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
 PREEMPTIBLE_MASTER=${PREEMPTIBLE_MASTER:-false}
@ -55,6 +58,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then
    NODE_OS_DISTRIBUTION="gci"
 fi

+# GPUs supported in GCE do not have compatible drivers in Debian 7, so any
+# requested accelerators are dropped when the nodes run Debian.
+if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then
+    NODE_ACCELERATORS=""
+fi
+
 # By default a cluster will be started with the master on GCI and nodes on
 # containervm. If you are updating the containervm version, update this
 # variable. Also please update corresponding image for node e2e at:
@ -135,6 +143,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
 # Optional: set feature gates
 FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"

+if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
+    FEATURE_GATES="${FEATURE_GATES},Accelerators=true"
+fi
+
 # Optional: Install cluster DNS.
 ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"
 DNS_SERVER_IP="${KUBE_DNS_SERVER_IP:-10.0.0.10}"
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@ -32,6 +32,7 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB}
 NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
 NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
 NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
+NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
 REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
 KUBE_APISERVER_REQUEST_TIMEOUT=300
 PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
@ -56,6 +57,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then
    NODE_OS_DISTRIBUTION="gci"
 fi

+# GPUs supported in GCE do not have compatible drivers in Debian 7, so any
+# requested accelerators are dropped when the nodes run Debian.
+if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then
+    NODE_ACCELERATORS=""
+fi
+
 # By default a cluster will be started with the master on GCI and nodes on
 # containervm. If you are updating the containervm version, update this
 # variable. Also please update corresponding image for node e2e at:
@ -91,6 +97,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
 # Optional: set feature gates
 FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"

+if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
+    FEATURE_GATES="${FEATURE_GATES},Accelerators=true"
+fi
+
 TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}

 # Extra docker options for nodes.
--- a/cluster/gce/util.sh
+++ b/cluster/gce/util.sh
@ -527,10 +527,16 @@ function create-node-template() {
    "${IP_ALIAS_SUBNETWORK:-}" \
    "${IP_ALIAS_SIZE:-}")

+  local accelerator_args=""
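+  # VMs with accelerators cannot be live migrated, so when accelerators are
+  # requested the maintenance policy is set to TERMINATE with restart on failure.
+  # More details: https://cloud.google.com/compute/docs/gpus/add-gpus#create-new-gpu-instance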
+  if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
+      accelerator_args="--maintenance-policy TERMINATE --restart-on-failure --accelerator ${NODE_ACCELERATORS}"
+  fi
  local attempt=1
  while true; do
    echo "Attempt ${attempt} to create ${1}" >&2
-    if ! ${gcloud} compute instance-templates create \
+    if ! ${gcloud} beta compute instance-templates create \
      "$template_name" \
      --project "${PROJECT}" \
      --machine-type "${NODE_SIZE}" \
@ -539,6 +545,7 @@ function create-node-template() {
      --image-project="${NODE_IMAGE_PROJECT}" \
      --image "${NODE_IMAGE}" \
      --tags "${NODE_TAG}" \
+      ${accelerator_args} \
      ${local_ssds} \
      --region "${REGION}" \
      ${network} \