Merge pull request #74438 from pjh/gce-windows-log-dump

Support dumping logs from Windows test nodes on GCE
Kubernetes Prow Robot 2019-02-26 18:12:09 -08:00 committed by GitHub
commit 81ec358db4
4 changed files with 157 additions and 28 deletions

View File

@ -267,7 +267,7 @@ function prepare-node-upgrade() {
# TODO(zmerlynn): Get configure-vm script from ${version}. (Must plumb this
# through all create-linux-node-instance-template implementations).
local template_name=$(get-template-name-from-version ${SANITIZED_VERSION})
local template_name=$(get-template-name-from-version ${SANITIZED_VERSION} ${NODE_INSTANCE_PREFIX})
create-linux-node-instance-template "${template_name}"
# The following is echo'd so that callers can get the template name.
echo "Instance template name: ${template_name}"
@ -373,7 +373,7 @@ function do-node-upgrade() {
# Do the actual upgrade.
# NOTE(zmerlynn): If you are changing this gcloud command, update
# test/e2e/cluster_upgrade.go to match this EXACTLY.
local template_name=$(get-template-name-from-version ${SANITIZED_VERSION})
local template_name=$(get-template-name-from-version ${SANITIZED_VERSION} ${NODE_INSTANCE_PREFIX})
local old_templates=()
local updates=()
for group in ${INSTANCE_GROUPS[@]}; do

View File

@ -112,7 +112,10 @@ if [[ "${ENABLE_CLUSTER_AUTOSCALER}" == "true" ]]; then
fi
fi
# These prefixes must not be prefixes of each other, so that they can be used to
# detect mutually exclusive sets of nodes.
NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-minion"}
WINDOWS_NODE_INSTANCE_PREFIX=${WINDOWS_NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-windows-node"}
NODE_TAGS="${NODE_TAG}"
@ -373,9 +376,12 @@ function upload-tars() {
#
# Assumed vars:
# NODE_INSTANCE_PREFIX
# WINDOWS_NODE_INSTANCE_PREFIX
# Vars set:
# NODE_NAMES
# INSTANCE_GROUPS
# WINDOWS_NODE_NAMES
# WINDOWS_INSTANCE_GROUPS
function detect-node-names() {
detect-project
INSTANCE_GROUPS=()
@ -383,6 +389,12 @@ function detect-node-names() {
--project "${PROJECT}" \
--filter "name ~ '${NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \
--format='value(name)' || true))
WINDOWS_INSTANCE_GROUPS=()
WINDOWS_INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list \
--project "${PROJECT}" \
--filter "name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \
--format='value(name)' || true))
NODE_NAMES=()
if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then
for group in "${INSTANCE_GROUPS[@]}"; do
@ -395,6 +407,14 @@ function detect-node-names() {
if [[ -n "${HEAPSTER_MACHINE_TYPE:-}" ]]; then
NODE_NAMES+=("${NODE_INSTANCE_PREFIX}-heapster")
fi
WINDOWS_NODE_NAMES=()
if [[ -n "${WINDOWS_INSTANCE_GROUPS[@]:-}" ]]; then
for group in "${WINDOWS_INSTANCE_GROUPS[@]}"; do
WINDOWS_NODE_NAMES+=($(gcloud compute instance-groups managed \
list-instances "${group}" --zone "${ZONE}" --project "${PROJECT}" \
--format='value(instance)'))
done
fi
echo "INSTANCE_GROUPS=${INSTANCE_GROUPS[*]:-}" >&2
echo "NODE_NAMES=${NODE_NAMES[*]:-}" >&2
@ -1403,6 +1423,7 @@ function build-windows-kube-env {
build-linux-kube-env false $file
cat >>$file <<EOF
WINDOWS_NODE_INSTANCE_PREFIX: $(yaml-quote ${WINDOWS_NODE_INSTANCE_PREFIX})
NODE_BINARY_TAR_URL: $(yaml-quote ${NODE_BINARY_TAR_URL})
NODE_BINARY_TAR_HASH: $(yaml-quote ${NODE_BINARY_TAR_HASH})
K8S_DIR: $(yaml-quote ${WINDOWS_K8S_DIR})
@ -1852,9 +1873,13 @@ function make-gcloud-network-argument() {
}
# $1: version (required)
# $2: Prefix for the template name, i.e. NODE_INSTANCE_PREFIX or
# WINDOWS_NODE_INSTANCE_PREFIX.
function get-template-name-from-version() {
local -r version=${1}
local -r template_prefix=${2}
# trim template name to pass gce name validation
echo "${NODE_INSTANCE_PREFIX}-template-${1}" | cut -c 1-63 | sed 's/[\.\+]/-/g;s/-*$//g'
echo "${template_prefix}-template-${version}" | cut -c 1-63 | sed 's/[\.\+]/-/g;s/-*$//g'
}
# validates the NODE_LOCAL_SSDS_EXT variable
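A quick illustration of the new template_prefix argument to get-template-name-from-version(); the version and prefixes below are made up, and the output shown is what the cut/sed pipeline above produces for them:
# Hypothetical values, for illustration only.
NODE_INSTANCE_PREFIX="e2e-test-minion"
WINDOWS_NODE_INSTANCE_PREFIX="e2e-test-windows-node"
get-template-name-from-version "1.14.0-beta.1" "${NODE_INSTANCE_PREFIX}"
# prints: e2e-test-minion-template-1-14-0-beta-1
get-template-name-from-version "1.14.0-beta.1" "${WINDOWS_NODE_INSTANCE_PREFIX}"
# prints: e2e-test-windows-node-template-1-14-0-beta-1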
@ -2627,9 +2652,8 @@ function create-nodes-template() {
# NOTE: these template names and their format must match
# create-[linux,windows]-nodes() as well as get-template()!
# TODO(pjh): find a better way to manage these (get-template() is annoying).
local linux_template_name="${NODE_INSTANCE_PREFIX}-template"
local windows_template_name="${NODE_INSTANCE_PREFIX}-template-windows"
local windows_template_name="${WINDOWS_NODE_INSTANCE_PREFIX}-template"
create-linux-node-instance-template $linux_template_name
create-windows-node-instance-template $windows_template_name "${scope_flags[*]}"
}
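Concretely, with the prefix defaults set earlier in this change and a hypothetical INSTANCE_PREFIX of "kubernetes", the two template names come out as:
# Hypothetical example only; INSTANCE_PREFIX="kubernetes" is assumed.
#   linux_template_name:   kubernetes-minion-template
#   windows_template_name: kubernetes-windows-node-template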
@ -2700,22 +2724,22 @@ function create-linux-nodes() {
# Assumes:
# - NUM_WINDOWS_MIGS
# - NODE_INSTANCE_PREFIX
# - WINDOWS_NODE_INSTANCE_PREFIX
# - NUM_WINDOWS_NODES
# - PROJECT
# - ZONE
function create-windows-nodes() {
local template_name="${NODE_INSTANCE_PREFIX}-template-windows"
local template_name="${WINDOWS_NODE_INSTANCE_PREFIX}-template"
local -r nodes="${NUM_WINDOWS_NODES}"
local instances_left=${nodes}
for ((i=1; i<=${NUM_WINDOWS_MIGS}; i++)); do
local group_name="${NODE_INSTANCE_PREFIX}-windows-group-$i"
local group_name="${WINDOWS_NODE_INSTANCE_PREFIX}-group-$i"
if [[ $i == ${NUM_WINDOWS_MIGS} ]]; then
# TODO: We don't add a suffix for the last group to keep backward compatibility when there's only one MIG.
# We should change it at some point, but note #18545 when changing this.
group_name="${NODE_INSTANCE_PREFIX}-windows-group"
group_name="${WINDOWS_NODE_INSTANCE_PREFIX}-group"
fi
# Spread the remaining number of nodes evenly
this_mig_size=$((${instances_left} / (${NUM_WINDOWS_MIGS}-${i}+1)))
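A worked example of the even-spread arithmetic above, with made-up counts, assuming the elided remainder of the loop subtracts this_mig_size from instances_left:
# Illustration only: 5 Windows nodes across 3 MIGs.
NUM_WINDOWS_MIGS=3
instances_left=5
for ((i=1; i<=NUM_WINDOWS_MIGS; i++)); do
  this_mig_size=$((instances_left / (NUM_WINDOWS_MIGS - i + 1)))
  instances_left=$((instances_left - this_mig_size))
  echo "MIG ${i}: ${this_mig_size} nodes"   # prints 1, then 2, then 2
done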
@ -2946,6 +2970,7 @@ function remove-replica-from-etcd() {
# Assumed vars:
# MASTER_NAME
# NODE_INSTANCE_PREFIX
# WINDOWS_NODE_INSTANCE_PREFIX
# ZONE
# This function tears down cluster resources 10 at a time to avoid issuing too many
# API calls and exceeding API quota. It is important to bring down the instances before bringing
@ -2954,7 +2979,7 @@ function kube-down() {
local -r batch=200
detect-project
detect-node-names # For INSTANCE_GROUPS
detect-node-names # For INSTANCE_GROUPS and WINDOWS_INSTANCE_GROUPS
echo "Bringing down cluster"
set +e # Do not stop on error
@ -2965,7 +2990,8 @@ function kube-down() {
# change during a cluster upgrade.)
local templates=$(get-template "${PROJECT}")
for group in ${INSTANCE_GROUPS[@]:-}; do
local all_instance_groups=(${INSTANCE_GROUPS[@]} ${WINDOWS_INSTANCE_GROUPS[@]})
for group in ${all_instance_groups[@]:-}; do
if gcloud compute instance-groups managed describe "${group}" --project "${PROJECT}" --zone "${ZONE}" &>/dev/null; then
gcloud compute instance-groups managed delete \
--project "${PROJECT}" \
@ -3087,7 +3113,7 @@ function kube-down() {
local -a minions
minions=( $(gcloud compute instances list \
--project "${PROJECT}" \
--filter="name ~ '${NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \
--filter="(name ~ '${NODE_INSTANCE_PREFIX}-.+' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}-.+') AND zone:(${ZONE})" \
--format='value(name)') )
# If any minions are running, delete them in batches.
while (( "${#minions[@]}" > 0 )); do
@ -3242,15 +3268,19 @@ function set-replica-name() {
REPLICA_NAME="${MASTER_NAME}-${suffix}"
}
# Gets the instance template for given NODE_INSTANCE_PREFIX. It echos the template name so that the function
# output can be used.
# Gets the instance templates in use by the cluster. It echos the template names
# so that the function output can be used.
# Assumed vars:
# NODE_INSTANCE_PREFIX
# WINDOWS_NODE_INSTANCE_PREFIX
#
# $1: project
function get-template() {
local linux_filter="${NODE_INSTANCE_PREFIX}-template(-(${KUBE_RELEASE_VERSION_DASHED_REGEX}|${KUBE_CI_VERSION_DASHED_REGEX}))?"
local windows_filter="${WINDOWS_NODE_INSTANCE_PREFIX}-template(-(${KUBE_RELEASE_VERSION_DASHED_REGEX}|${KUBE_CI_VERSION_DASHED_REGEX}))?"
gcloud compute instance-templates list \
--filter="name ~ '${NODE_INSTANCE_PREFIX}-template(-(${KUBE_RELEASE_VERSION_DASHED_REGEX}|${KUBE_CI_VERSION_DASHED_REGEX}))?'" \
--filter="name ~ '${linux_filter}' OR name ~ '${windows_filter}'" \
--project="${1}" --format='value(name)'
}
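To make the widened filter concrete, a hedged example run of get-template(); the project and template names are hypothetical:
# Hypothetical invocation, illustration only.
get-template "my-gce-project"
# might print, if both node types exist:
#   e2e-test-minion-template
#   e2e-test-windows-node-template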
@ -3259,6 +3289,7 @@ function get-template() {
# Assumed vars:
# MASTER_NAME
# NODE_INSTANCE_PREFIX
# WINDOWS_NODE_INSTANCE_PREFIX
# ZONE
# REGION
# Vars set:
@ -3274,11 +3305,19 @@ function check-resources() {
KUBE_RESOURCE_FOUND="Managed instance groups ${INSTANCE_GROUPS[@]}"
return 1
fi
if [[ -n "${WINDOWS_INSTANCE_GROUPS[@]:-}" ]]; then
KUBE_RESOURCE_FOUND="Managed instance groups ${WINDOWS_INSTANCE_GROUPS[@]}"
return 1
fi
if gcloud compute instance-templates describe --project "${PROJECT}" "${NODE_INSTANCE_PREFIX}-template" &>/dev/null; then
KUBE_RESOURCE_FOUND="Instance template ${NODE_INSTANCE_PREFIX}-template"
return 1
fi
if gcloud compute instance-templates describe --project "${PROJECT}" "${WINDOWS_NODE_INSTANCE_PREFIX}-template" &>/dev/null; then
KUBE_RESOURCE_FOUND="Instance template ${WINDOWS_NODE_INSTANCE_PREFIX}-template"
return 1
fi
if gcloud compute instances describe --project "${PROJECT}" "${MASTER_NAME}" --zone "${ZONE}" &>/dev/null; then
KUBE_RESOURCE_FOUND="Kubernetes master ${MASTER_NAME}"
@ -3294,10 +3333,10 @@ function check-resources() {
local -a minions
minions=( $(gcloud compute instances list \
--project "${PROJECT}" \
--filter="name ~ '${NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \
--filter="(name ~ '${NODE_INSTANCE_PREFIX}-.+' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}-.+') AND zone:(${ZONE})" \
--format='value(name)') )
if (( "${#minions[@]}" > 0 )); then
KUBE_RESOURCE_FOUND="${#minions[@]} matching matching ${NODE_INSTANCE_PREFIX}-.+"
KUBE_RESOURCE_FOUND="${#minions[@]} matching ${NODE_INSTANCE_PREFIX}-.+ or ${WINDOWS_NODE_INSTANCE_PREFIX}-.+"
return 1
fi
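The combined Linux-or-Windows name filter now appears in both kube-down and check-resources; written out standalone with example values, the query is roughly:
# Illustration only; project, prefixes and zone are example values.
gcloud compute instances list \
  --project "my-gce-project" \
  --filter="(name ~ 'e2e-test-minion-.+' OR name ~ 'e2e-test-windows-node-.+') AND zone:(us-central1-b)" \
  --format='value(name)'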

View File

@ -52,6 +52,10 @@ readonly initd_logfiles="docker/log"
readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log"
readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}"
# Log files found in WINDOWS_LOGS_DIR on Windows nodes:
readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log"
# Log files found in other directories on Windows nodes:
readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp"
# Limit the number of concurrent node connections so that we don't run out of
# file descriptors for large clusters.
@ -195,6 +199,66 @@ function save-logs() {
copy-logs-from-node "${node_name}" "${dir}" "${files}"
}
# Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log
# on node $1.
function export-windows-docker-event-log() {
local -r node="${1}"
local -r powershell_cmd="powershell.exe -Command \$log=\$(Get-EventLog -LogName Application -Source Docker); Set-Content '${WINDOWS_LOGS_DIR}\\docker.log' \$log.Message"
# Retry up to 3 times to allow ssh keys to be properly propagated and
# stored.
for retry in {1..3}; do
if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
--command "$powershell_cmd"; then
break
else
sleep 10
fi
done
}
# Save log files and serial console output from Windows node $1 into local
# directory $2.
# This function shouldn't ever trigger errexit.
function save-logs-windows() {
local -r node="${1}"
local -r dest_dir="${2}"
if [[ ! "${gcloud_supported_providers}" =~ "${KUBERNETES_PROVIDER}" ]]; then
echo "Not saving logs for ${node}, Windows log dumping requires gcloud support"
return
fi
export-windows-docker-event-log "${node}"
local remote_files=()
for file in ${windows_node_logfiles[@]}; do
remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" )
done
remote_files+=( "${windows_node_otherfiles[@]}" )
# TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the
# same time.
for remote_file in ${remote_files[@]}; do
# Retry up to 3 times to allow ssh keys to be properly propagated and
# stored.
for retry in {1..3}; do
if gcloud compute scp --recurse --project "${PROJECT}" \
--zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \
> /dev/null; then
break
else
sleep 10
fi
done
done
# Serial port 1 contains the Windows console output.
gcloud compute instances get-serial-port-output --project "${PROJECT}" \
--zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true
}
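A small sketch of exercising the new helper by hand against a single Windows node; it assumes log-dump.sh has been sourced so that PROJECT, ZONE and WINDOWS_LOGS_DIR are set, and the instance name is made up:
# Hypothetical manual run, illustration only.
node="e2e-test-windows-node-group-abcd"
dest="/tmp/win-logs/${node}"
mkdir -p "${dest}"
save-logs-windows "${node}" "${dest}"
ls "${dest}"   # expect kubelet.log, kube-proxy.log, docker.log, serial-1.log, ...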
# Execute a command in container $2 on node $1.
# Uses docker because the container may not ordinarily permit direct execution.
function run-in-docker-container() {
@ -247,8 +311,13 @@ function dump_masters() {
fi
}
# Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be
# specified via $1 or $use_custom_instance_list. If not specified then the nodes
# to dump logs for will be detected using detect-node-names(); if Windows nodes
# are present then they will be detected and their logs will be dumped too.
function dump_nodes() {
local node_names=()
local windows_node_names=()
if [[ -n "${1:-}" ]]; then
echo "Dumping logs for nodes provided as args to dump_nodes() function"
node_names=( "$@" )
@ -264,9 +333,12 @@ function dump_nodes() {
if [[ -n "${NODE_NAMES:-}" ]]; then
node_names=( "${NODE_NAMES[@]}" )
fi
if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" )
fi
fi
if [[ "${#node_names[@]}" == 0 ]]; then
if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then
echo "No nodes found!"
return
fi
@ -276,24 +348,31 @@ function dump_nodes() {
node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}"
fi
nodes_selected_for_logs=()
linux_nodes_selected_for_logs=()
if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then
# We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs.
for index in `shuf -i 0-$(( ${#node_names[*]} - 1 )) -n ${LOGDUMP_ONLY_N_RANDOM_NODES}`
do
nodes_selected_for_logs+=("${node_names[$index]}")
linux_nodes_selected_for_logs+=("${node_names[$index]}")
done
else
nodes_selected_for_logs=( "${node_names[@]}" )
linux_nodes_selected_for_logs=( "${node_names[@]}" )
fi
all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" )
all_selected_nodes+=( "${windows_node_names[@]}" )
proc=${max_dump_processes}
for node_name in "${nodes_selected_for_logs[@]}"; do
for i in "${!all_selected_nodes[@]}"; do
node_name="${all_selected_nodes[$i]}"
node_dir="${report_dir}/${node_name}"
mkdir -p "${node_dir}"
if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then
# Save logs in the background. This speeds up things when there are
# many nodes.
save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" &
else
save-logs-windows "${node_name}" "${node_dir}" &
fi
# We don't want to run more than ${max_dump_processes} at a time, so
# wait once we hit that many nodes. This isn't ideal, since one might
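The Linux/Windows split above leans on array ordering (Linux nodes first, Windows nodes appended); a stripped-down sketch of just that dispatch, with placeholder names:
# Illustration of the index-based dispatch only; node names are made up.
linux_nodes=( "node-a" "node-b" )
windows_nodes=( "win-node-a" )
all_nodes=( "${linux_nodes[@]}" "${windows_nodes[@]}" )
for i in "${!all_nodes[@]}"; do
  if [[ "${i}" -lt "${#linux_nodes[@]}" ]]; then
    echo "would call save-logs for ${all_nodes[$i]}"
  else
    echo "would call save-logs-windows for ${all_nodes[$i]}"
  fi
done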
@ -311,6 +390,9 @@ function dump_nodes() {
}
# Collect names of nodes which didn't run logexporter successfully.
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
#
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
# Assumes:
@ -328,6 +410,8 @@ function find_non_logexported_nodes() {
done
}
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
function dump_nodes_with_logexporter() {
if [[ -n "${use_custom_instance_list}" ]]; then
echo "Dumping logs for nodes provided by log_dump_custom_get_instances() function"
@ -446,10 +530,16 @@ function detect_node_failures() {
fi
detect-node-names
if [ -z "${INSTANCE_GROUPS:-}" ]; then
if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
local all_instance_groups=(${INSTANCE_GROUPS[@]} ${WINDOWS_INSTANCE_GROUPS[@]})
else
local all_instance_groups=(${INSTANCE_GROUPS[@]})
fi
if [ -z "${all_instance_groups:-}" ]; then
return
fi
for group in "${INSTANCE_GROUPS[@]}"; do
for group in "${all_instance_groups[@]}"; do
local creation_timestamp=$(gcloud compute instance-groups managed describe \
"${group}" \
--project "${PROJECT}" \

View File

@ -56,7 +56,7 @@ if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
# In multizone mode we need to add instances for all nodes in the region.
if [[ "${MULTIZONE:-}" == "true" ]]; then
EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format=[no-heading] \
--filter="name ~ '${NODE_INSTANCE_PREFIX}.*' AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter=region=${REGION} --format=csv[no-heading]\(name\) | tr "\n" "," | sed "s/,$//"))" | wc -l)
--filter="(name ~ '${NODE_INSTANCE_PREFIX}.*' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}.*') AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter=region=${REGION} --format=csv[no-heading]\(name\) | tr "\n" "," | sed "s/,$//"))" | wc -l)
echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
fi
else
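The multizone one-liner above folds a zone-list subshell into the instance filter; broken into two steps with hypothetical values, it is roughly:
# Illustration only; project, region and prefixes are example values.
zones=$(gcloud -q compute zones list --project="my-gce-project" \
  --filter="region=us-central1" --format="csv[no-heading](name)" \
  | tr "\n" "," | sed "s/,$//")
gcloud -q compute instances list --project="my-gce-project" --format="[no-heading]" \
  --filter="(name ~ 'e2e-test-minion.*' OR name ~ 'e2e-test-windows-node.*') AND zone:(${zones})" \
  | wc -l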