diff --git a/cluster/gce/upgrade.sh b/cluster/gce/upgrade.sh index 10b79394fa..b346080816 100755 --- a/cluster/gce/upgrade.sh +++ b/cluster/gce/upgrade.sh @@ -267,7 +267,7 @@ function prepare-node-upgrade() { # TODO(zmerlynn): Get configure-vm script from ${version}. (Must plumb this # through all create-linux-node-instance-template implementations). - local template_name=$(get-template-name-from-version ${SANITIZED_VERSION}) + local template_name=$(get-template-name-from-version ${SANITIZED_VERSION} ${NODE_INSTANCE_PREFIX}) create-linux-node-instance-template "${template_name}" # The following is echo'd so that callers can get the template name. echo "Instance template name: ${template_name}" @@ -373,7 +373,7 @@ function do-node-upgrade() { # Do the actual upgrade. # NOTE(zmerlynn): If you are changing this gcloud command, update # test/e2e/cluster_upgrade.go to match this EXACTLY. - local template_name=$(get-template-name-from-version ${SANITIZED_VERSION}) + local template_name=$(get-template-name-from-version ${SANITIZED_VERSION} ${NODE_INSTANCE_PREFIX}) local old_templates=() local updates=() for group in ${INSTANCE_GROUPS[@]}; do diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 99ea09119f..d253a1183f 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -112,7 +112,10 @@ if [[ "${ENABLE_CLUSTER_AUTOSCALER}" == "true" ]]; then fi fi +# These prefixes must not be prefixes of each other, so that they can be used to +# detect mutually exclusive sets of nodes. NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-minion"} +WINDOWS_NODE_INSTANCE_PREFIX=${WINDOWS_NODE_INSTANCE_PREFIX:-"${INSTANCE_PREFIX}-windows-node"} NODE_TAGS="${NODE_TAG}" @@ -373,9 +376,12 @@ function upload-tars() { # # Assumed vars: # NODE_INSTANCE_PREFIX +# WINDOWS_NODE_INSTANCE_PREFIX # Vars set: # NODE_NAMES # INSTANCE_GROUPS +# WINDOWS_NODE_NAMES +# WINDOWS_INSTANCE_GROUPS function detect-node-names() { detect-project INSTANCE_GROUPS=() @@ -383,6 +389,12 @@ function detect-node-names() { --project "${PROJECT}" \ --filter "name ~ '${NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \ --format='value(name)' || true)) + WINDOWS_INSTANCE_GROUPS=() + WINDOWS_INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list \ + --project "${PROJECT}" \ + --filter "name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \ + --format='value(name)' || true)) + NODE_NAMES=() if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then for group in "${INSTANCE_GROUPS[@]}"; do @@ -395,6 +407,14 @@ function detect-node-names() { if [[ -n "${HEAPSTER_MACHINE_TYPE:-}" ]]; then NODE_NAMES+=("${NODE_INSTANCE_PREFIX}-heapster") fi + WINDOWS_NODE_NAMES=() + if [[ -n "${WINDOWS_INSTANCE_GROUPS[@]:-}" ]]; then + for group in "${WINDOWS_INSTANCE_GROUPS[@]}"; do + WINDOWS_NODE_NAMES+=($(gcloud compute instance-groups managed \ + list-instances "${group}" --zone "${ZONE}" --project "${PROJECT}" \ + --format='value(instance)')) + done + fi echo "INSTANCE_GROUPS=${INSTANCE_GROUPS[*]:-}" >&2 echo "NODE_NAMES=${NODE_NAMES[*]:-}" >&2 @@ -1403,6 +1423,7 @@ function build-windows-kube-env { build-linux-kube-env false $file cat >>$file </dev/null; then gcloud compute instance-groups managed delete \ --project "${PROJECT}" \ @@ -3087,7 +3113,7 @@ function kube-down() { local -a minions minions=( $(gcloud compute instances list \ --project "${PROJECT}" \ - --filter="name ~ '${NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \ + --filter="(name ~ '${NODE_INSTANCE_PREFIX}-.+' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}-.+') AND zone:(${ZONE})" \ --format='value(name)') ) # If any minions are running, delete them in batches. while (( "${#minions[@]}" > 0 )); do @@ -3242,15 +3268,19 @@ function set-replica-name() { REPLICA_NAME="${MASTER_NAME}-${suffix}" } -# Gets the instance template for given NODE_INSTANCE_PREFIX. It echos the template name so that the function -# output can be used. +# Gets the instance templates in use by the cluster. It echos the template names +# so that the function output can be used. # Assumed vars: # NODE_INSTANCE_PREFIX +# WINDOWS_NODE_INSTANCE_PREFIX # # $1: project function get-template() { + local linux_filter="${NODE_INSTANCE_PREFIX}-template(-(${KUBE_RELEASE_VERSION_DASHED_REGEX}|${KUBE_CI_VERSION_DASHED_REGEX}))?" + local windows_filter="${WINDOWS_NODE_INSTANCE_PREFIX}-template(-(${KUBE_RELEASE_VERSION_DASHED_REGEX}|${KUBE_CI_VERSION_DASHED_REGEX}))?" + gcloud compute instance-templates list \ - --filter="name ~ '${NODE_INSTANCE_PREFIX}-template(-(${KUBE_RELEASE_VERSION_DASHED_REGEX}|${KUBE_CI_VERSION_DASHED_REGEX}))?'" \ + --filter="name ~ '${linux_filter}' OR name ~ '${windows_filter}'" \ --project="${1}" --format='value(name)' } @@ -3259,6 +3289,7 @@ function get-template() { # Assumed vars: # MASTER_NAME # NODE_INSTANCE_PREFIX +# WINDOWS_NODE_INSTANCE_PREFIX # ZONE # REGION # Vars set: @@ -3274,11 +3305,19 @@ function check-resources() { KUBE_RESOURCE_FOUND="Managed instance groups ${INSTANCE_GROUPS[@]}" return 1 fi + if [[ -n "${WINDOWS_INSTANCE_GROUPS[@]:-}" ]]; then + KUBE_RESOURCE_FOUND="Managed instance groups ${WINDOWS_INSTANCE_GROUPS[@]}" + return 1 + fi if gcloud compute instance-templates describe --project "${PROJECT}" "${NODE_INSTANCE_PREFIX}-template" &>/dev/null; then KUBE_RESOURCE_FOUND="Instance template ${NODE_INSTANCE_PREFIX}-template" return 1 fi + if gcloud compute instance-templates describe --project "${PROJECT}" "${WINDOWS_NODE_INSTANCE_PREFIX}-template" &>/dev/null; then + KUBE_RESOURCE_FOUND="Instance template ${WINDOWS_NODE_INSTANCE_PREFIX}-template" + return 1 + fi if gcloud compute instances describe --project "${PROJECT}" "${MASTER_NAME}" --zone "${ZONE}" &>/dev/null; then KUBE_RESOURCE_FOUND="Kubernetes master ${MASTER_NAME}" @@ -3294,10 +3333,10 @@ function check-resources() { local -a minions minions=( $(gcloud compute instances list \ --project "${PROJECT}" \ - --filter="name ~ '${NODE_INSTANCE_PREFIX}-.+' AND zone:(${ZONE})" \ + --filter="(name ~ '${NODE_INSTANCE_PREFIX}-.+' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}-.+') AND zone:(${ZONE})" \ --format='value(name)') ) if (( "${#minions[@]}" > 0 )); then - KUBE_RESOURCE_FOUND="${#minions[@]} matching matching ${NODE_INSTANCE_PREFIX}-.+" + KUBE_RESOURCE_FOUND="${#minions[@]} matching ${NODE_INSTANCE_PREFIX}-.+ or ${WINDOWS_NODE_INSTANCE_PREFIX}-.+" return 1 fi diff --git a/cluster/log-dump/log-dump.sh b/cluster/log-dump/log-dump.sh index 4dc16c6ae2..d9defea525 100755 --- a/cluster/log-dump/log-dump.sh +++ b/cluster/log-dump/log-dump.sh @@ -52,6 +52,10 @@ readonly initd_logfiles="docker/log" readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log" readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}" readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}" +# Log files found in WINDOWS_LOGS_DIR on Windows nodes: +readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log" +# Log files found in other directories on Windows nodes: +readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp" # Limit the number of concurrent node connections so that we don't run out of # file descriptors for large clusters. @@ -195,6 +199,66 @@ function save-logs() { copy-logs-from-node "${node_name}" "${dir}" "${files}" } +# Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log +# on node $1. +function export-windows-docker-event-log() { + local -r node="${1}" + + local -r powershell_cmd="powershell.exe -Command \$log=\$(Get-EventLog -LogName Application -Source Docker); Set-Content '${WINDOWS_LOGS_DIR}\\docker.log' \$log.Message" + + # Retry up to 3 times to allow ssh keys to be properly propagated and + # stored. + for retry in {1..3}; do + if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \ + --command "$powershell_cmd"; then + break + else + sleep 10 + fi + done +} + +# Save log files and serial console output from Windows node $1 into local +# directory $2. +# This function shouldn't ever trigger errexit. +function save-logs-windows() { + local -r node="${1}" + local -r dest_dir="${2}" + + if [[ ! "${gcloud_supported_providers}" =~ "${KUBERNETES_PROVIDER}" ]]; then + echo "Not saving logs for ${node}, Windows log dumping requires gcloud support" + return + fi + + export-windows-docker-event-log "${node}" + + local remote_files=() + for file in ${windows_node_logfiles[@]}; do + remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" ) + done + remote_files+=( "${windows_node_otherfiles[@]}" ) + + # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the + # same time. + for remote_file in ${remote_files[@]}; do + # Retry up to 3 times to allow ssh keys to be properly propagated and + # stored. + for retry in {1..3}; do + if gcloud compute scp --recurse --project "${PROJECT}" \ + --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \ + > /dev/null; then + break + else + sleep 10 + fi + done + done + + # Serial port 1 contains the Windows console output. + gcloud compute instances get-serial-port-output --project "${PROJECT}" \ + --zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true +} + # Execute a command in container $2 on node $1. # Uses docker because the container may not ordinarily permit direct execution. function run-in-docker-container() { @@ -247,8 +311,13 @@ function dump_masters() { fi } +# Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be +# specified via $1 or $use_custom_instance_list. If not specified then the nodes +# to dump logs for will be detected using detect-node-names(); if Windows nodes +# are present then they will be detected and their logs will be dumped too. function dump_nodes() { local node_names=() + local windows_node_names=() if [[ -n "${1:-}" ]]; then echo "Dumping logs for nodes provided as args to dump_nodes() function" node_names=( "$@" ) @@ -264,9 +333,12 @@ function dump_nodes() { if [[ -n "${NODE_NAMES:-}" ]]; then node_names=( "${NODE_NAMES[@]}" ) fi + if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then + windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" ) + fi fi - if [[ "${#node_names[@]}" == 0 ]]; then + if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then echo "No nodes found!" return fi @@ -276,24 +348,31 @@ function dump_nodes() { node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}" fi - nodes_selected_for_logs=() + linux_nodes_selected_for_logs=() if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then # We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs. for index in `shuf -i 0-$(( ${#node_names[*]} - 1 )) -n ${LOGDUMP_ONLY_N_RANDOM_NODES}` do - nodes_selected_for_logs+=("${node_names[$index]}") + linux_nodes_selected_for_logs+=("${node_names[$index]}") done else - nodes_selected_for_logs=( "${node_names[@]}" ) + linux_nodes_selected_for_logs=( "${node_names[@]}" ) fi + all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" ) + all_selected_nodes+=( "${windows_node_names[@]}" ) proc=${max_dump_processes} - for node_name in "${nodes_selected_for_logs[@]}"; do + for i in "${!all_selected_nodes[@]}"; do + node_name="${all_selected_nodes[$i]}" node_dir="${report_dir}/${node_name}" mkdir -p "${node_dir}" - # Save logs in the background. This speeds up things when there are - # many nodes. - save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" & + if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then + # Save logs in the background. This speeds up things when there are + # many nodes. + save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" & + else + save-logs-windows "${node_name}" "${node_dir}" & + fi # We don't want to run more than ${max_dump_processes} at a time, so # wait once we hit that many nodes. This isn't ideal, since one might @@ -311,6 +390,9 @@ function dump_nodes() { } # Collect names of nodes which didn't run logexporter successfully. +# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter +# does not run on Windows nodes. +# # Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes. # Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes). # Assumes: @@ -328,6 +410,8 @@ function find_non_logexported_nodes() { done } +# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter +# does not run on Windows nodes. function dump_nodes_with_logexporter() { if [[ -n "${use_custom_instance_list}" ]]; then echo "Dumping logs for nodes provided by log_dump_custom_get_instances() function" @@ -446,10 +530,16 @@ function detect_node_failures() { fi detect-node-names - if [ -z "${INSTANCE_GROUPS:-}" ]; then + if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then + local all_instance_groups=(${INSTANCE_GROUPS[@]} ${WINDOWS_INSTANCE_GROUPS[@]}) + else + local all_instance_groups=(${INSTANCE_GROUPS[@]}) + fi + + if [ -z "${all_instance_groups:-}" ]; then return fi - for group in "${INSTANCE_GROUPS[@]}"; do + for group in "${all_instance_groups[@]}"; do local creation_timestamp=$(gcloud compute instance-groups managed describe \ "${group}" \ --project "${PROJECT}" \ diff --git a/cluster/validate-cluster.sh b/cluster/validate-cluster.sh index 3f7cd3d29c..c28dcb0df5 100755 --- a/cluster/validate-cluster.sh +++ b/cluster/validate-cluster.sh @@ -56,7 +56,7 @@ if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then # In multizone mode we need to add instances for all nodes in the region. if [[ "${MULTIZONE:-}" == "true" ]]; then EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format=[no-heading] \ - --filter="name ~ '${NODE_INSTANCE_PREFIX}.*' AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter=region=${REGION} --format=csv[no-heading]\(name\) | tr "\n" "," | sed "s/,$//"))" | wc -l) + --filter="(name ~ '${NODE_INSTANCE_PREFIX}.*' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}.*') AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter=region=${REGION} --format=csv[no-heading]\(name\) | tr "\n" "," | sed "s/,$//"))" | wc -l) echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}" fi else