From 112a013567db1f222ae799f6659c48923768e1d6 Mon Sep 17 00:00:00 2001 From: Justin Santa Barbara Date: Thu, 4 Jun 2015 22:39:10 -0400 Subject: [PATCH] AWS: Support different docker storage mechanism by setting DOCKER_STORAGE For parity with GCE, we really want to support aufs. But we previously supported btrfs, so we want to expose that. Most of the work here is required for aufs, and we let advanced users choose devicemapper/btrfs if they have a setup that works for those configurations. --- cluster/aws/config-default.sh | 3 + cluster/aws/config-test.sh | 3 + cluster/aws/options.md | 19 ++++ cluster/aws/templates/format-disks.sh | 154 +++++++++++++++++++++----- cluster/aws/ubuntu/util.sh | 1 + cluster/aws/util.sh | 1 + 6 files changed, 154 insertions(+), 27 deletions(-) diff --git a/cluster/aws/config-default.sh b/cluster/aws/config-default.sh index 263c4b5661..b63ec1628c 100644 --- a/cluster/aws/config-default.sh +++ b/cluster/aws/config-default.sh @@ -26,6 +26,9 @@ NUM_MINIONS=${NUM_MINIONS:-4} # Because regions are globally named, we want to create in a single region; default to us-east-1 AWS_S3_REGION=${AWS_S3_REGION:-us-east-1} +# Which docker storage mechanism to use. +DOCKER_STORAGE=${DOCKER_STORAGE:-aufs} + INSTANCE_PREFIX="${KUBE_AWS_INSTANCE_PREFIX:-kubernetes}" CLUSTER_ID=${INSTANCE_PREFIX} AWS_SSH_KEY=${AWS_SSH_KEY:-$HOME/.ssh/kube_aws_rsa} diff --git a/cluster/aws/config-test.sh b/cluster/aws/config-test.sh index fedae2affc..3ddf633cf8 100755 --- a/cluster/aws/config-test.sh +++ b/cluster/aws/config-test.sh @@ -22,6 +22,9 @@ NUM_MINIONS=${NUM_MINIONS:-2} # Because regions are globally named, we want to create in a single region; default to us-east-1 AWS_S3_REGION=${AWS_S3_REGION:-us-east-1} +# Which docker storage mechanism to use. 
+DOCKER_STORAGE=${DOCKER_STORAGE:-aufs} + INSTANCE_PREFIX="${KUBE_AWS_INSTANCE_PREFIX:-e2e-test-${USER}}" CLUSTER_ID=${INSTANCE_PREFIX} AWS_SSH_KEY=${AWS_SSH_KEY:-$HOME/.ssh/kube_aws_rsa} diff --git a/cluster/aws/options.md b/cluster/aws/options.md index 86642deb00..2040149411 100644 --- a/cluster/aws/options.md +++ b/cluster/aws/options.md @@ -50,5 +50,24 @@ Please note: Do not set this to "false" unless you... - ... already configured a route for "YOUR_IP/32" to an AWS internet gateway (for the master instance to reach your client directly during setup) +## DOCKER_STORAGE + +Choose the docker storage driver to use. This is an advanced option; most people should leave it as the default aufs +for parity with GCE. + +Supported values: btrfs, aufs, devicemapper, aufs-nolvm + +This will also configure your ephemeral storage in a compatible way, and your Docker containers +will run on this storage if available, as typically the root disk is comparatively small. + +* `btrfs` will combine your ephemeral disks into a btrfs volume. This is a good option if you have a recent kernel + with a reliable btrfs. +* `aufs` uses the aufs driver, but also installs LVM to combine your disks. `aufs-nolvm` will not use LVM, + meaning that only your first ephemeral disk will be used. +* `devicemapper` sets up LVM across all your ephemeral disks and sets Docker to drive it directly. This is a + similar option to btrfs, but without relying on the btrfs filesystem. Sadly, it does not work with most + configurations - see [this docker bug](https://github.com/docker/docker/issues/4036) + +If your machines don't have any ephemeral disks, this will default to the aufs driver on your root disk (with no LVM). 
[![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/aws/options.md?pixel)]() diff --git a/cluster/aws/templates/format-disks.sh b/cluster/aws/templates/format-disks.sh index 32590ad884..3aeb09c7e5 100644 --- a/cluster/aws/templates/format-disks.sh +++ b/cluster/aws/templates/format-disks.sh @@ -42,39 +42,139 @@ for ephemeral_device in $ephemeral_devices; do fi done +# These are set if we should move where docker/kubelet store data +# Note this gets set to the parent directory +move_docker="" +move_kubelet="" + +apt-get update + +docker_storage=${DOCKER_STORAGE:-aufs} + # Format the ephemeral disks if [[ ${#block_devices[@]} == 0 ]]; then - echo "No ephemeral block devices found" + echo "No ephemeral block devices found; will use aufs on root" + docker_storage="aufs" else - echo "Block devices: ${block_devices}" + echo "Block devices: ${block_devices[@]}" - apt-get install --yes btrfs-tools + if [[ ${docker_storage} == "btrfs" ]]; then + apt-get install --yes btrfs-tools - if [[ ${#block_devices[@]} == 1 ]]; then - echo "One ephemeral block device found; formatting with btrfs" - mkfs.btrfs -f ${block_devices[0]} + if [[ ${#block_devices[@]} == 1 ]]; then + echo "One ephemeral block device found; formatting with btrfs" + mkfs.btrfs -f ${block_devices[0]} + else + echo "Found multiple ephemeral block devices, formatting with btrfs as RAID-0" + mkfs.btrfs -f --data raid0 ${block_devices[@]} + fi + mount -t btrfs ${block_devices[0]} /mnt + + mkdir -p /mnt/kubernetes + + move_docker="/mnt" + move_kubelet="/mnt/kubernetes" + elif [[ ${docker_storage} == "aufs-nolvm" ]]; then + if [[ ${#block_devices[@]} != 1 ]]; then + echo "aufs-nolvm selected, but multiple ephemeral devices were found; only the first will be available" + fi + + /bin/umount ${block_devices[0]} + mkfs -t ext4 ${block_devices[0]} + mount -t ext4 ${block_devices[0]} /mnt + + mkdir -p /mnt/kubernetes + + move_docker="/mnt" + move_kubelet="/mnt/kubernetes" + elif [[ 
${docker_storage} == "devicemapper" || ${docker_storage} == "aufs" ]]; then + # We always use LVM, even with one device + # In devicemapper mode, Docker can use LVM directly + # Also, fewer code paths are good + echo "Using LVM2 and ext4" + apt-get install --yes lvm2 + + # Don't output spurious "File descriptor X leaked on vgcreate invocation." + # Known bug: e.g. Ubuntu #591823 + export LVM_SUPPRESS_FD_WARNINGS=1 + + for block_device in "${block_devices[@]}"; do # all devices; a bare ${block_devices} is element 0 only + /bin/umount "${block_device}" + pvcreate "${block_device}" + done + vgcreate vg-ephemeral "${block_devices[@]}" + + if [[ ${docker_storage} == "devicemapper" ]]; then + # devicemapper thin provisioning, managed by docker + # This is the best option, but it is sadly broken on most distros + # Bug: https://github.com/docker/docker/issues/4036 + + # 95% goes to the docker thin-pool + lvcreate -l 95%VG --thinpool docker-thinpool vg-ephemeral + + DOCKER_OPTS="${DOCKER_OPTS} --storage-opt dm.thinpooldev=/dev/mapper/vg--ephemeral-docker--thinpool" + # Note that we don't move docker; docker goes direct to the thinpool + else + # aufs + + # Create a docker lv, use docker on it + # 95% goes to the docker thin-pool + lvcreate -l 95%VG --thinpool docker-thinpool vg-ephemeral + + THINPOOL_SIZE=$(lvs vg-ephemeral/docker-thinpool -o LV_SIZE --noheadings --units M --nosuffix) + lvcreate -V${THINPOOL_SIZE}M -T vg-ephemeral/docker-thinpool -n docker + + mkfs -t ext4 /dev/vg-ephemeral/docker + mkdir -p /mnt/docker + mount -t ext4 /dev/vg-ephemeral/docker /mnt/docker + move_docker="/mnt" + fi + + # Remaining 5% is for kubernetes data + # TODO: Should this be a thin pool? e.g. would we ever want to snapshot this data?
+ lvcreate -l 100%FREE -n kubernetes vg-ephemeral + mkfs -t ext4 /dev/vg-ephemeral/kubernetes + mkdir -p /mnt/kubernetes + mount -t ext4 /dev/vg-ephemeral/kubernetes /mnt/kubernetes + move_kubelet="/mnt/kubernetes" else - echo "Found multiple ephemeral block devices, formatting with btrfs as RAID-0" - mkfs.btrfs -f --data raid0 ${block_devices[@]} + echo "Ignoring unknown DOCKER_STORAGE: ${docker_storage}" fi - mount -t btrfs ${block_devices[0]} /mnt - - # Move docker to /mnt if we have it - if [[ -d /var/lib/docker ]]; then - mv /var/lib/docker /mnt/ - fi - mkdir -p /mnt/docker - ln -s /mnt/docker /var/lib/docker - DOCKER_ROOT="/mnt/docker" - DOCKER_OPTS="${DOCKER_OPTS} -g /mnt/docker" - - # Move /var/lib/kubelet to /mnt if we have it - # (the backing for empty-dir volumes can use a lot of space!) - if [[ -d /var/lib/kubelet ]]; then - mv /var/lib/kubelet /mnt/ - fi - mkdir -p /mnt/kubelet - ln -s /mnt/kubelet /var/lib/kubelet - KUBELET_ROOT="/mnt/kubelet" +fi + + +if [[ ${docker_storage} == "btrfs" ]]; then + DOCKER_OPTS="${DOCKER_OPTS} -s btrfs" +elif [[ ${docker_storage} == "aufs-nolvm" || ${docker_storage} == "aufs" ]]; then + # Install aufs kernel module + apt-get install --yes linux-image-extra-$(uname -r) + + DOCKER_OPTS="${DOCKER_OPTS} -s aufs" +elif [[ ${docker_storage} == "devicemapper" ]]; then + DOCKER_OPTS="${DOCKER_OPTS} -s devicemapper" +else + echo "Ignoring unknown DOCKER_STORAGE: ${docker_storage}" +fi + +if [[ -n "${move_docker}" ]]; then + # Move docker to e.g. /mnt + if [[ -d /var/lib/docker ]]; then + mv /var/lib/docker ${move_docker}/ + fi + mkdir -p ${move_docker}/docker + ln -s ${move_docker}/docker /var/lib/docker + DOCKER_ROOT="${move_docker}/docker" + DOCKER_OPTS="${DOCKER_OPTS} -g ${DOCKER_ROOT}" +fi + +if [[ -n "${move_kubelet}" ]]; then + # Move /var/lib/kubelet to e.g. /mnt + # (the backing for empty-dir volumes can use a lot of space!) 
+ if [[ -d /var/lib/kubelet ]]; then + mv /var/lib/kubelet ${move_kubelet}/ + fi + mkdir -p ${move_kubelet}/kubelet + ln -s ${move_kubelet}/kubelet /var/lib/kubelet + KUBELET_ROOT="${move_kubelet}/kubelet" fi diff --git a/cluster/aws/ubuntu/util.sh b/cluster/aws/ubuntu/util.sh index 35b923274f..33281c66f6 100644 --- a/cluster/aws/ubuntu/util.sh +++ b/cluster/aws/ubuntu/util.sh @@ -31,6 +31,7 @@ function generate-minion-user-data { echo "SALT_MASTER='${MASTER_INTERNAL_IP}'" echo "MINION_IP_RANGE='${MINION_IP_RANGES[$i]}'" echo "DOCKER_OPTS='${EXTRA_DOCKER_OPTS:-}'" + echo "readonly DOCKER_STORAGE='${DOCKER_STORAGE:-}'" grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/common.sh" grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/format-disks.sh" grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/salt-minion.sh" diff --git a/cluster/aws/util.sh b/cluster/aws/util.sh index 116014004c..eae3c5dadc 100644 --- a/cluster/aws/util.sh +++ b/cluster/aws/util.sh @@ -674,6 +674,7 @@ function kube-up { echo "readonly MASTER_IP_RANGE='${MASTER_IP_RANGE:-}'" echo "readonly KUBELET_TOKEN='${KUBELET_TOKEN}'" echo "readonly KUBE_PROXY_TOKEN='${KUBE_PROXY_TOKEN}'" + echo "readonly DOCKER_STORAGE='${DOCKER_STORAGE:-}'" grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/common.sh" grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/format-disks.sh" grep -v "^#" "${KUBE_ROOT}/cluster/aws/templates/create-dynamic-salt-files.sh"