#!/bin/bash
# Copyright 2015 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Run e2e tests using environment variables exported in e2e.sh.
set -o errexit
set -o nounset
set -o pipefail
set -o xtrace
# include shell2junit library
source <(curl -fsS --retry 3 'https://raw.githubusercontent.com/kubernetes/kubernetes/master/third_party/forked/shell2junit/sh2ju.sh')
# Have cmd/e2e run by goe2e.sh generate JUnit report in ${WORKSPACE}/junit*.xml
ARTIFACTS=${WORKSPACE}/_artifacts
mkdir -p ${ARTIFACTS}
# E2E runner stages
STAGE_PRE="PRE-SETUP"
STAGE_SETUP="SETUP"
STAGE_CLEANUP="CLEANUP"
STAGE_KUBEMARK="KUBEMARK"
: ${KUBE_GCS_RELEASE_BUCKET:="kubernetes-release"}
: ${KUBE_GCS_DEV_RELEASE_BUCKET:="kubernetes-release-dev"}
# record_command runs the command and records its output/error messages in junit format
# it expects the first argument to be the class and the second to be the name of the command
# Example:
# record_command PRESETUP curltest curl google.com
# record_command CLEANUP check false
#
# WARNING: Variable changes in the command will NOT be effective after record_command returns.
# This is because the command runs in a subshell.
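# For example (illustrative, hypothetical helper): if a function mutates a
# variable,
#   my_setup() { FOO=bar; }
#   record_command "${STAGE_SETUP}" my_setup my_setup
# then $FOO is still unset in the caller afterwards, because juLog ran the
# command in a subshell.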
function record_command() {
  set +o xtrace
  set +o nounset
  set +o errexit
  local class=$1
  shift
  local name=$1
  shift
  echo "Recording: ${class} ${name}"
  echo "Running command: $@"
  juLog -output="${ARTIFACTS}" -class="${class}" -name="${name}" "$@"
  set -o nounset
  set -o errexit
  set -o xtrace
}
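# Heuristic container check (not exhaustive): inside a Docker container, lines
# in /proc/self/cgroup typically look something like
#   9:cpu,cpuacct:/docker/4d2a...
# so grepping for "docker" is usually enough to detect a containerized run on
# these Jenkins agents; other container runtimes may not match.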
function running_in_docker() {
  grep -q docker /proc/self/cgroup
}
function fetch_output_tars() {
  echo "Using binaries from _output."
  cp _output/release-tars/kubernetes*.tar.gz .
  unpack_binaries
}
function fetch_server_version_tars() {
  local -r server_version="$(gcloud ${CMD_GROUP:-} container get-server-config --project=${PROJECT} --zone=${ZONE} --format='value(defaultClusterVersion)')"
  # Use latest build of the server version's branch for test files.
  fetch_published_version_tars "ci/latest-${server_version:0:3}"
  # Unset cluster api version; we want to use server default for the cluster
  # version.
  unset CLUSTER_API_VERSION
}
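# Illustrative example for fetch_server_version_tars: if GKE reports a
# defaultClusterVersion of "1.4.6", then ${server_version:0:3} is "1.4" and we
# fetch the build pointed at by "ci/latest-1.4". (This sketch assumes a
# single-digit minor version; it only shows how the slice behaves.)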
# Use a published version like "ci/latest" (default), "release/latest",
# "release/latest-1", or "release/stable".
function fetch_published_version_tars() {
  local -r published_version="${1}"
  IFS='/' read -a varr <<< "${published_version}"
  path="${varr[0]}"
  if [[ "${path}" == "release" ]]; then
    local -r bucket="${KUBE_GCS_RELEASE_BUCKET}"
  else
    local -r bucket="${KUBE_GCS_DEV_RELEASE_BUCKET}"
  fi
  build_version=$(gsutil cat "gs://${bucket}/${published_version}.txt")
  echo "Using published version $bucket/$build_version (from ${published_version})"
  fetch_tars_from_gcs "gs://${bucket}/${path}" "${build_version}"
  unpack_binaries
  # Set CLUSTER_API_VERSION for GKE CI
  export CLUSTER_API_VERSION=$(echo ${build_version} | cut -c 2-)
}
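# Illustrative example for fetch_published_version_tars (the version string is
# hypothetical): for published_version="ci/latest" the dev bucket is used, and
# gs://kubernetes-release-dev/ci/latest.txt might contain something like
# "v1.5.0-alpha.2.380+fa8a74a9e38e99"; the tars are then pulled from
# gs://kubernetes-release-dev/ci/<that version>/ and CLUSTER_API_VERSION is set
# to the same string without the leading "v".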
# TODO(ihmccreery) I'm not sure if this is necessary, with the workspace check
# below.
function clean_binaries() {
  echo "Cleaning up binaries."
  rm -rf kubernetes*
}
function fetch_tars_from_gcs() {
  local -r gspath="${1}"
  local -r build_version="${2}"
  echo "Pulling binaries from GCS; using server version ${gspath}/${build_version}."
  gsutil -mq cp "${gspath}/${build_version}/kubernetes.tar.gz" "${gspath}/${build_version}/kubernetes-test.tar.gz" .
}
function unpack_binaries() {
  md5sum kubernetes*.tar.gz
  tar -xzf kubernetes.tar.gz
  tar -xzf kubernetes-test.tar.gz
}
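# Note on unpack_binaries: both tarballs are expected to unpack into a single
# top-level ./kubernetes/ directory (kubernetes-test.tar.gz overlays the test
# binaries and e2e assets into it), which is why the script can simply
# `cd kubernetes` later on.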
# Get the latest GCI image in a family.
function get_latest_gci_image() {
  local -r image_project="$1"
  local -r image_family="$2"
  echo "$(gcloud compute images describe-from-family ${image_family} --project=${image_project} --format='value(name)')"
}
function get_latest_docker_release() {
  # Typical Docker release versions look like v1.11.2-rc1, v1.11.2, etc.
  local -r version_re='.*\"tag_name\":[[:space:]]+\"v([0-9\.rc-]+)\",.*'
  local -r releases="$(curl -fsSL --retry 3 https://api.github.com/repos/docker/docker/releases)"
  # The GitHub API returns releases in descending order of creation time, so
  # the first one is always the latest.
  # TODO: if we can install `jq` on the Jenkins nodes, we won't have to craft
  # regular expressions here.
  while read -r line; do
    if [[ "${line}" =~ ${version_re} ]]; then
      echo "${BASH_REMATCH[1]}"
      return
    fi
  done <<< "${releases}"
  echo "Failed to determine the latest Docker release."
  exit 1
}
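# Illustrative example for get_latest_docker_release (the API response line is
# hypothetical): a release entry such as
#     "tag_name": "v1.12.3",
# matches version_re, and BASH_REMATCH[1] captures "1.12.3".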
function install_google_cloud_sdk_tarball() {
  local -r tarball=$1
  local -r install_dir=$2
  mkdir -p "${install_dir}"
  tar xzf "${tarball}" -C "${install_dir}"
  export CLOUDSDK_CORE_DISABLE_PROMPTS=1
  record_command "${STAGE_PRE}" "install_gcloud" "${install_dir}/google-cloud-sdk/install.sh" --disable-installation-options --bash-completion=false --path-update=false --usage-reporting=false
  export PATH=${install_dir}/google-cloud-sdk/bin:${PATH}
}
# Only call after attempting to bring the cluster up. Don't call after
# bringing the cluster down.
function dump_cluster_logs_and_exit() {
  local -r exit_status=$?
  dump_cluster_logs
  if [[ "${E2E_DOWN,,}" == "true" ]]; then
    # If we tried to bring the cluster up, make a courtesy attempt
    # to bring the cluster down so we're not leaving resources
    # around. Unlike later, don't sleep beforehand, though. (We're
    # just trying to tear down as many resources as we can as fast
    # as possible and don't even know if we brought the master up.)
    go run ./hack/e2e.go ${E2E_OPT:-} -v --down || true
  fi
  exit ${exit_status}
}
# Only call after attempting to bring the cluster up. Don't call after
# bringing the cluster down.
function dump_cluster_logs() {
  if [[ -x "cluster/log-dump.sh" ]]; then
    ./cluster/log-dump.sh "${ARTIFACTS}"
  fi
}
### Pre Set Up ###
if running_in_docker; then
  record_command "${STAGE_PRE}" "download_gcloud" curl -fsSL --retry 3 --keepalive-time 2 -o "${WORKSPACE}/google-cloud-sdk.tar.gz" 'https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz'
  install_google_cloud_sdk_tarball "${WORKSPACE}/google-cloud-sdk.tar.gz" /
  if [[ "${KUBERNETES_PROVIDER}" == 'aws' ]]; then
    pip install awscli
  fi
fi
# Install gcloud from a custom path if provided. Used to test GKE with gcloud
# at HEAD or with a release candidate.
# TODO: figure out how to avoid installing the cloud sdk twice if run inside Docker.
if [[ -n "${CLOUDSDK_BUCKET:-}" ]]; then
# Retry the download a few times to mitigate transient server errors and
# race conditions where the bucket contents change under us as we download.
for n in $(seq 3); do
gsutil -mq cp -r "${CLOUDSDK_BUCKET}" ~ && break || sleep 1
# Delete any temporary files from the download so that we start from
# scratch when we retry.
rm -rf ~/.gsutil
done
rm -rf ~/repo ~/cloudsdk
mv ~/$(basename "${CLOUDSDK_BUCKET}") ~/repo
export CLOUDSDK_COMPONENT_MANAGER_SNAPSHOT_URL=file://${HOME}/repo/components-2.json
install_google_cloud_sdk_tarball ~/repo/google-cloud-sdk.tar.gz ~/cloudsdk
# TODO: is this necessary? this won't work inside Docker currently.
export CLOUDSDK_CONFIG=/var/lib/jenkins/.config/gcloud
fi
# We get the image project and name for GCI dynamically.
if [[ -n "${JENKINS_GCI_IMAGE_FAMILY:-}" ]]; then
GCI_STAGING_PROJECT=container-vm-image-staging
export KUBE_GCE_MASTER_PROJECT="${GCI_STAGING_PROJECT}"
export KUBE_GCE_MASTER_IMAGE="$(get_latest_gci_image "${GCI_STAGING_PROJECT}" "${JENKINS_GCI_IMAGE_FAMILY}")"
export KUBE_MASTER_OS_DISTRIBUTION="gci"
if [[ "${JENKINS_GCI_IMAGE_FAMILY}" == "gci-canary-test" ]]; then
# The family "gci-canary-test" is reserved for a special type of GCI images
# that are used to continuously validate Docker releases.
export KUBE_GCI_DOCKER_VERSION="$(get_latest_docker_release)"
fi
fi
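# Illustrative note for the GCI image selection above (image name is
# hypothetical): with JENKINS_GCI_IMAGE_FAMILY="gci-canary-test",
# describe-from-family might resolve to an image such as
# "gci-canary-60-9592-0-0" in the container-vm-image-staging project, and
# KUBE_GCI_DOCKER_VERSION is then whatever get_latest_docker_release returns
# (e.g. "1.12.3").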
if [[ -f "${KUBEKINS_SERVICE_ACCOUNT_FILE:-}" ]]; then
echo 'Activating service account...' # No harm in doing this multiple times.
gcloud auth activate-service-account --key-file="${KUBEKINS_SERVICE_ACCOUNT_FILE}"
unset GCE_SERVICE_ACCOUNT # Use checked in credentials, not the metadata server
unset KUBEKINS_SERVICE_ACCOUNT_FILE
elif [[ -n "${KUBEKINS_SERVICE_ACCOUNT_FILE:-}" ]]; then
echo "ERROR: cannot access service account file at: ${KUBEKINS_SERVICE_ACCOUNT_FILE}"
fi
function e2e_test() {
  local -r ginkgo_test_args="${1}"
  # Check to make sure the cluster is up before running tests, and fail if it's not.
  go run ./hack/e2e.go ${E2E_OPT:-} -v --isup
  # Jenkins will look at the junit*.xml files for test failures, so don't exit
  # with a nonzero error code if it was only tests that failed.
  go run ./hack/e2e.go ${E2E_OPT:-} -v --test \
    ${ginkgo_test_args:+--test_args="${ginkgo_test_args}"} \
    && exitcode=0 || exitcode=$?
  if [[ "${E2E_PUBLISH_GREEN_VERSION:-}" == "true" && ${exitcode} == 0 ]]; then
    # Use plaintext version file packaged with kubernetes.tar.gz
    echo "Publish version to ci/latest-green.txt: $(cat version)"
    gsutil cp ./version "gs://${KUBE_GCS_DEV_RELEASE_BUCKET}/ci/latest-green.txt"
  fi
  return ${exitcode}
}
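# Illustrative example for e2e_test (the focus value is arbitrary): with
# GINKGO_TEST_ARGS="--ginkgo.focus=\[Conformance\]", the test invocation above
# effectively runs
#   go run ./hack/e2e.go -v --test --test_args="--ginkgo.focus=\[Conformance\]"
# plus whatever flags are in E2E_OPT.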
echo "--------------------------------------------------------------------------------"
echo "Test Environment:"
printenv | sort
echo "--------------------------------------------------------------------------------"
# Set this var instead of exiting; we must do the cluster teardown step. We'll
# return this at the very end.
EXIT_CODE=0
# We get the Kubernetes tarballs unless we are going to use old ones
if [[ "${JENKINS_USE_EXISTING_BINARIES:-}" =~ ^[yY]$ ]]; then
echo "Using existing binaries; not cleaning, fetching, or unpacking new ones."
elif [[ "${KUBE_RUN_FROM_OUTPUT:-}" =~ ^[yY]$ ]]; then
# TODO(spxtr) This should probably be JENKINS_USE_BINARIES_FROM_OUTPUT or
# something, rather than being prepended with KUBE, since it's sort of a
# meta-thing.
clean_binaries
fetch_output_tars
elif [[ "${JENKINS_USE_SERVER_VERSION:-}" =~ ^[yY]$ ]]; then
# This is for test, staging, and prod jobs on GKE, where we want to
# test what's running in GKE by default rather than some CI build.
clean_binaries
fetch_server_version_tars
else
# use JENKINS_PUBLISHED_VERSION, default to 'ci/latest', since that's
# usually what we're testing.
clean_binaries
fetch_published_version_tars "${JENKINS_PUBLISHED_VERSION:-ci/latest}"
fi
# Copy GCE keys so we don't keep cycling them.
# To set this up, you must know the <project>, <zone>, and <instance>
# on which your jenkins jobs are running. Then do:
#
# # SSH from your computer into the instance.
# $ gcloud compute ssh --project="<prj>" --zone="<zone>" <instance>
#
# # Generate a key by ssh'ing from the instance into itself, then exit.
# $ gcloud compute ssh --project="<prj>" --zone="<zone>" <instance>
# $ ^D
#
# # Copy the keys to the desired location (e.g. /var/lib/jenkins/gce_keys/).
# $ sudo mkdir -p /var/lib/jenkins/gce_keys/
# $ sudo cp ~/.ssh/google_compute_engine /var/lib/jenkins/gce_keys/
# $ sudo cp ~/.ssh/google_compute_engine.pub /var/lib/jenkins/gce_keys/
#
# # Give the jenkins user ownership of the keys.
# $ sudo chown -R jenkins /var/lib/jenkins/gce_keys/
# $ sudo chgrp -R jenkins /var/lib/jenkins/gce_keys/
case "${KUBERNETES_PROVIDER}" in
gce|gke|kubemark)
if ! running_in_docker; then
mkdir -p ${WORKSPACE}/.ssh/
cp /var/lib/jenkins/gce_keys/google_compute_engine ${WORKSPACE}/.ssh/
cp /var/lib/jenkins/gce_keys/google_compute_engine.pub ${WORKSPACE}/.ssh/
fi
echo 'Checking existence of private ssh key'
gce_key="${WORKSPACE}/.ssh/google_compute_engine"
if [[ ! -f "${gce_key}" || ! -f "${gce_key}.pub" ]]; then
echo 'google_compute_engine ssh key missing!'
exit 1
fi
echo "Checking presence of public key in ${PROJECT}"
if ! gcloud compute --project="${PROJECT}" project-info describe |
grep "$(cat "${gce_key}.pub")" >/dev/null; then
echo 'Uploading public ssh key to project metadata...'
gcloud compute --project="${PROJECT}" config-ssh
fi
;;
default)
echo "Not copying ssh keys for ${KUBERNETES_PROVIDER}"
;;
esac
cd kubernetes
# Upload build start time and k8s version to GCS, but not on PR Jenkins.
# On PR Jenkins this is done before the build.
if [[ ! "${JOB_NAME}" =~ -pull- ]]; then
JENKINS_BUILD_STARTED=true bash <(curl -fsS --retry 3 --keepalive-time 2 "https://raw.githubusercontent.com/kubernetes/kubernetes/master/hack/jenkins/upload-to-gcs.sh")
fi
# When run inside Docker, we need to make sure all files are world-readable
# (since they will be owned by root on the host).
trap "chmod -R o+r '${ARTIFACTS}'" EXIT SIGINT SIGTERM
export E2E_REPORT_DIR=${ARTIFACTS}
declare -r gcp_list_resources_script="./cluster/gce/list-resources.sh"
declare -r gcp_resources_before="${ARTIFACTS}/gcp-resources-before.txt"
declare -r gcp_resources_cluster_up="${ARTIFACTS}/gcp-resources-cluster-up.txt"
declare -r gcp_resources_after="${ARTIFACTS}/gcp-resources-after.txt"
if [[ ( ${KUBERNETES_PROVIDER} == "gce" || ${KUBERNETES_PROVIDER} == "gke" ) && -x "${gcp_list_resources_script}" ]]; then
  gcp_list_resources="true"
  # Always pull the script from HEAD, overwriting the local one if it exists.
  # We do this to pick up fixes if we are running tests from a branch or tag.
  curl -fsS --retry 3 --keepalive-time 2 "https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/gce/list-resources.sh" > "${gcp_list_resources_script}"
else
  gcp_list_resources="false"
fi
### Set up ###
if [[ "${E2E_UP,,}" == "true" ]]; then
go run ./hack/e2e.go ${E2E_OPT:-} -v --down
fi
if [[ "${gcp_list_resources}" == "true" ]]; then
${gcp_list_resources_script} > "${gcp_resources_before}"
fi
if [[ "${E2E_UP,,}" == "true" ]]; then
# We want to try to gather logs even if kube-up fails, so collect the
# result here and fail after dumping logs if it's nonzero.
go run ./hack/e2e.go ${E2E_OPT:-} -v --up || dump_cluster_logs_and_exit
go run ./hack/e2e.go -v --ctl="version --match-server-version=false"
if [[ "${gcp_list_resources}" == "true" ]]; then
${gcp_list_resources_script} > "${gcp_resources_cluster_up}"
fi
fi
# Allow download & unpack of alternate version of tests, for cross-version & upgrade testing.
#
# JENKINS_PUBLISHED_SKEW_VERSION downloads an alternate version of Kubernetes
# for testing, moving the old one to kubernetes_old.
#
# E2E_UPGRADE_TEST=true triggers a run of the e2e tests, to do something like
# upgrade the cluster, before the main test run. It uses
# GINKGO_UPGRADE_TEST_ARGS for the test run.
#
# JENKINS_USE_SKEW_TESTS=true will run tests from the skewed version rather
# than the original version.
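# Illustrative example for the skew setup below (the version is arbitrary):
# with JENKINS_PUBLISHED_SKEW_VERSION="release/latest-1", the kubernetes/ tree
# fetched above is moved to ../kubernetes_old and the skewed release is
# unpacked as ./kubernetes; unless JENKINS_USE_SKEW_TESTS=true, the e2e run
# then happens from kubernetes_old while --kubectl-path points at the skewed
# kubernetes/cluster/kubectl.sh.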
if [[ -n "${JENKINS_PUBLISHED_SKEW_VERSION:-}" ]]; then
cd ..
mv kubernetes kubernetes_old
fetch_published_version_tars "${JENKINS_PUBLISHED_SKEW_VERSION}"
cd kubernetes
# Upgrade the cluster before running other tests
if [[ "${E2E_UPGRADE_TEST:-}" == "true" ]]; then
# Add a report prefix for the e2e tests so that the tests don't get overwritten when we run
# the rest of the e2es.
E2E_REPORT_PREFIX='upgrade' e2e_test "${GINKGO_UPGRADE_TEST_ARGS:-}" || EXIT_CODE=1
fi
if [[ "${JENKINS_USE_SKEW_TESTS:-}" != "true" ]]; then
# Back out into the old tests now that we've downloaded & maybe upgraded.
cd ../kubernetes_old
# Append kubectl-path of skewed kubectl to test args, since we always
# want that to use the skewed kubectl version:
#
# - for upgrade jobs, we want kubectl to be at the same version as master.
# - for client skew tests, we want to use the skewed kubectl (that's what we're testing).
GINKGO_TEST_ARGS="${GINKGO_TEST_ARGS:-} --kubectl-path=$(pwd)/../kubernetes/cluster/kubectl.sh"
fi
fi
if [[ "${E2E_TEST,,}" == "true" ]]; then
e2e_test "${GINKGO_TEST_ARGS:-}" || EXIT_CODE=1
fi
### Start Kubemark ###
if [[ "${USE_KUBEMARK:-}" == "true" ]]; then
export RUN_FROM_DISTRO=true
NUM_NODES_BKP=${NUM_NODES}
MASTER_SIZE_BKP=${MASTER_SIZE}
./test/kubemark/stop-kubemark.sh
NUM_NODES=${KUBEMARK_NUM_NODES:-$NUM_NODES}
MASTER_SIZE=${KUBEMARK_MASTER_SIZE:-$MASTER_SIZE}
# If start-kubemark fails, we trigger empty set of tests that would trigger storing logs from the base cluster.
./test/kubemark/start-kubemark.sh || dump_cluster_logs_and_exit
# Similarly, if tests fail, we trigger empty set of tests that would trigger storing logs from the base cluster.
# We intentionally overwrite the exit-code from `run-e2e-tests.sh` because we want jenkins to look at the
# junit.xml results for test failures and not process the exit code. This is needed by jenkins to more gracefully
# handle blocking the merge queue as a result of test failure flakes. Infrastructure failures should continue to
# exit non-0.
# TODO: The above comment is no longer accurate. Need to fix this before
# turning xunit off for the postsubmit tests. See: #28200
./test/kubemark/run-e2e-tests.sh --ginkgo.focus="${KUBEMARK_TESTS:-starting\s30\spods}" "${KUBEMARK_TEST_ARGS:-}" || dump_cluster_logs
./test/kubemark/stop-kubemark.sh
NUM_NODES=${NUM_NODES_BKP}
MASTER_SIZE=${MASTER_SIZE_BKP}
unset RUN_FROM_DISTRO
unset NUM_NODES_BKP
unset MASTER_SIZE_BKP
fi
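# Note on the default focus used above: "starting\s30\spods" is a regular
# expression presumably meant to select the kubemark density test (a spec whose
# name contains something like "starting 30 pods per node"); KUBEMARK_TESTS can
# override it with any other ginkgo focus expression.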
### Clean up ###
if [[ "${E2E_DOWN,,}" == "true" ]]; then
# Sleep before deleting the cluster to give the controller manager time to
# delete any cloudprovider resources still around from the last test.
# This is calibrated to allow enough time for 3 attempts to delete the
# resources. Each attempt is allocated 5 seconds for requests to the
# cloudprovider plus the processingRetryInterval from servicecontroller.go
# for the wait between attempts.
sleep 30
go run ./hack/e2e.go ${E2E_OPT:-} -v --down
fi
if [[ "${gcp_list_resources}" == "true" ]]; then
${gcp_list_resources_script} > "${gcp_resources_after}"
fi
# Compare resources if the cluster was either
# * started and destroyed (normal e2e), or
# * neither started nor destroyed (soak test).
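# Illustrative example (resource names are hypothetical): a leak shows up as
# "+" lines in the unified diff below, e.g.
#   [ disks ]
#   +e2e-gce-agent-pr-1234-pd   us-central1-f   10   pd-standard   READY
# Any such added line means a resource exists after the run that wasn't there
# before; with FAIL_ON_GCP_RESOURCE_LEAK=true that fails the job.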
if [[ "${E2E_UP:-}" == "${E2E_DOWN:-}" && -f "${gcp_resources_before}" && -f "${gcp_resources_after}" ]]; then
difference=$(diff -sw -U0 -F'^\[.*\]$' "${gcp_resources_before}" "${gcp_resources_after}") || true
noleak=true
if [[ -n $(echo "${difference}" | tail -n +3 | grep -E "^\+") ]] && [[ "${FAIL_ON_GCP_RESOURCE_LEAK:-}" == "true" ]]; then
noleak=false
fi
if ! ${noleak} ; then
echo "${difference}"
echo "!!! FAIL: Google Cloud Platform resources leaked while running tests!"
EXIT_CODE=1
fi
record_command "${STAGE_CLEANUP}" "gcp_resource_leak_check" ${noleak}
fi
exit ${EXIT_CODE}