mirror of https://github.com/k3s-io/k3s
Merge pull request #45130 from vishh/gpu-cluster-setup
Automatic merge from submit-queue (batch tested with PRs 44830, 45130)

Adding support for Accelerators to GCE clusters.

```release-note
Create clusters with GPUs in GKE by specifying "type=<gpu-type>,count=<gpu-count>" in the NODE_ACCELERATORS env var.
List of available GPUs: https://cloud.google.com/compute/docs/gpus/#introduction
```
pull/6/head
commit
d4f92711a1
|
@ -32,6 +32,9 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB}
|
|||
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
|
||||
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
|
||||
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
|
||||
# Accelerators to be attached to each node. Format: "type=<accelerator-type>,count=<accelerator-count>".
|
||||
# More information on available GPUs: https://cloud.google.com/compute/docs/gpus/
|
||||
NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
|
||||
REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
|
||||
PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
|
||||
PREEMPTIBLE_MASTER=${PREEMPTIBLE_MASTER:-false}
|
||||
|
@ -55,6 +58,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then
|
|||
NODE_OS_DISTRIBUTION="gci"
|
||||
fi
|
||||
|
||||
# GPUs supported in GCE do not have compatible drivers in Debian 7, so any
# requested accelerators are cleared when nodes run the "debian" distribution.
|
||||
if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then
|
||||
NODE_ACCELERATORS=""
|
||||
fi
|
||||
|
||||
# By default a cluster will be started with the master on GCI and nodes on
|
||||
# containervm. If you are updating the containervm version, update this
|
||||
# variable. Also please update corresponding image for node e2e at:
|
||||
|
@ -135,6 +143,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
|
|||
# Optional: set feature gates
|
||||
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
|
||||
|
||||
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
|
||||
FEATURE_GATES="${FEATURE_GATES},Accelerators=true"
|
||||
fi
|
||||
|
||||
# Optional: Install cluster DNS.
|
||||
ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"
|
||||
DNS_SERVER_IP="${KUBE_DNS_SERVER_IP:-10.0.0.10}"
|
||||
|
|
|
@ -32,6 +32,7 @@ MASTER_DISK_SIZE=${MASTER_DISK_SIZE:-20GB}
|
|||
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
|
||||
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
|
||||
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
|
||||
NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
|
||||
REGISTER_MASTER_KUBELET=${REGISTER_MASTER:-true}
|
||||
KUBE_APISERVER_REQUEST_TIMEOUT=300
|
||||
PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false}
|
||||
|
@ -56,6 +57,11 @@ if [[ "${NODE_OS_DISTRIBUTION}" == "cos" ]]; then
|
|||
NODE_OS_DISTRIBUTION="gci"
|
||||
fi
|
||||
|
||||
# GPUs supported in GCE do not have compatible drivers in Debian 7, so any
# requested accelerators are cleared when nodes run the "debian" distribution.
|
||||
if [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then
|
||||
NODE_ACCELERATORS=""
|
||||
fi
|
||||
|
||||
# By default a cluster will be started with the master on GCI and nodes on
|
||||
# containervm. If you are updating the containervm version, update this
|
||||
# variable. Also please update corresponding image for node e2e at:
|
||||
|
@ -91,6 +97,10 @@ RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"
|
|||
# Optional: set feature gates
|
||||
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"
|
||||
|
||||
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
|
||||
FEATURE_GATES="${FEATURE_GATES},Accelerators=true"
|
||||
fi
|
||||
|
||||
TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}
|
||||
|
||||
# Extra docker options for nodes.
|
||||
|
|
|
@ -527,10 +527,16 @@ function create-node-template() {
|
|||
"${IP_ALIAS_SUBNETWORK:-}" \
|
||||
"${IP_ALIAS_SIZE:-}")
|
||||
|
||||
local accelerator_args=""
|
||||
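# VMs with accelerators cannot be live migrated, so the template must set the
# TERMINATE maintenance policy (with restart on failure) alongside --accelerator.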
|
||||
# More details: https://cloud.google.com/compute/docs/gpus/add-gpus#create-new-gpu-instance
|
||||
if [[ ! -z "${NODE_ACCELERATORS}" ]]; then
|
||||
accelerator_args="--maintenance-policy TERMINATE --restart-on-failure --accelerator ${NODE_ACCELERATORS}"
|
||||
fi
|
||||
local attempt=1
|
||||
while true; do
|
||||
echo "Attempt ${attempt} to create ${1}" >&2
|
||||
if ! ${gcloud} compute instance-templates create \
|
||||
if ! ${gcloud} beta compute instance-templates create \
|
||||
"$template_name" \
|
||||
--project "${PROJECT}" \
|
||||
--machine-type "${NODE_SIZE}" \
|
||||
|
@ -539,6 +545,7 @@ function create-node-template() {
|
|||
--image-project="${NODE_IMAGE_PROJECT}" \
|
||||
--image "${NODE_IMAGE}" \
|
||||
--tags "${NODE_TAG}" \
|
||||
${accelerator_args} \
|
||||
${local_ssds} \
|
||||
--region "${REGION}" \
|
||||
${network} \
|
||||
|
|
Loading…
Reference in New Issue