diff --git a/cluster/log-dump/log-dump.sh b/cluster/log-dump/log-dump.sh index 4dc16c6ae2..d9defea525 100755 --- a/cluster/log-dump/log-dump.sh +++ b/cluster/log-dump/log-dump.sh @@ -52,6 +52,10 @@ readonly initd_logfiles="docker/log" readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log" readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}" readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}" +# Log files found in WINDOWS_LOGS_DIR on Windows nodes: +readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log" +# Log files found in other directories on Windows nodes: +readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp" # Limit the number of concurrent node connections so that we don't run out of # file descriptors for large clusters. @@ -195,6 +199,66 @@ function save-logs() { copy-logs-from-node "${node_name}" "${dir}" "${files}" } +# Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log +# on node $1. +function export-windows-docker-event-log() { + local -r node="${1}" + + local -r powershell_cmd="powershell.exe -Command \$log=\$(Get-EventLog -LogName Application -Source Docker); Set-Content '${WINDOWS_LOGS_DIR}\\docker.log' \$log.Message" + + # Retry up to 3 times to allow ssh keys to be properly propagated and + # stored. + for retry in {1..3}; do + if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \ + --command "$powershell_cmd"; then + break + else + sleep 10 + fi + done +} + +# Save log files and serial console output from Windows node $1 into local +# directory $2. +# This function shouldn't ever trigger errexit. +function save-logs-windows() { + local -r node="${1}" + local -r dest_dir="${2}" + + if [[ ! "${gcloud_supported_providers}" =~ "${KUBERNETES_PROVIDER}" ]]; then + echo "Not saving logs for ${node}, Windows log dumping requires gcloud support" + return + fi + + export-windows-docker-event-log "${node}" + + local remote_files=() + for file in ${windows_node_logfiles[@]}; do + remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" ) + done + remote_files+=( "${windows_node_otherfiles[@]}" ) + + # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the + # same time. + for remote_file in ${remote_files[@]}; do + # Retry up to 3 times to allow ssh keys to be properly propagated and + # stored. + for retry in {1..3}; do + if gcloud compute scp --recurse --project "${PROJECT}" \ + --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \ + > /dev/null; then + break + else + sleep 10 + fi + done + done + + # Serial port 1 contains the Windows console output. + gcloud compute instances get-serial-port-output --project "${PROJECT}" \ + --zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true +} + # Execute a command in container $2 on node $1. # Uses docker because the container may not ordinarily permit direct execution. function run-in-docker-container() { @@ -247,8 +311,13 @@ function dump_masters() { fi } +# Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be +# specified via $1 or $use_custom_instance_list. If not specified then the nodes +# to dump logs for will be detected using detect-node-names(); if Windows nodes +# are present then they will be detected and their logs will be dumped too. function dump_nodes() { local node_names=() + local windows_node_names=() if [[ -n "${1:-}" ]]; then echo "Dumping logs for nodes provided as args to dump_nodes() function" node_names=( "$@" ) @@ -264,9 +333,12 @@ function dump_nodes() { if [[ -n "${NODE_NAMES:-}" ]]; then node_names=( "${NODE_NAMES[@]}" ) fi + if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then + windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" ) + fi fi - if [[ "${#node_names[@]}" == 0 ]]; then + if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then echo "No nodes found!" return fi @@ -276,24 +348,31 @@ function dump_nodes() { node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}" fi - nodes_selected_for_logs=() + linux_nodes_selected_for_logs=() if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then # We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs. for index in `shuf -i 0-$(( ${#node_names[*]} - 1 )) -n ${LOGDUMP_ONLY_N_RANDOM_NODES}` do - nodes_selected_for_logs+=("${node_names[$index]}") + linux_nodes_selected_for_logs+=("${node_names[$index]}") done else - nodes_selected_for_logs=( "${node_names[@]}" ) + linux_nodes_selected_for_logs=( "${node_names[@]}" ) fi + all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" ) + all_selected_nodes+=( "${windows_node_names[@]}" ) proc=${max_dump_processes} - for node_name in "${nodes_selected_for_logs[@]}"; do + for i in "${!all_selected_nodes[@]}"; do + node_name="${all_selected_nodes[$i]}" node_dir="${report_dir}/${node_name}" mkdir -p "${node_dir}" - # Save logs in the background. This speeds up things when there are - # many nodes. - save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" & + if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then + # Save logs in the background. This speeds up things when there are + # many nodes. + save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" & + else + save-logs-windows "${node_name}" "${node_dir}" & + fi # We don't want to run more than ${max_dump_processes} at a time, so # wait once we hit that many nodes. This isn't ideal, since one might @@ -311,6 +390,9 @@ function dump_nodes() { } # Collect names of nodes which didn't run logexporter successfully. +# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter +# does not run on Windows nodes. +# # Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes. # Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes). # Assumes: @@ -328,6 +410,8 @@ function find_non_logexported_nodes() { done } +# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter +# does not run on Windows nodes. function dump_nodes_with_logexporter() { if [[ -n "${use_custom_instance_list}" ]]; then echo "Dumping logs for nodes provided by log_dump_custom_get_instances() function" @@ -446,10 +530,16 @@ function detect_node_failures() { fi detect-node-names - if [ -z "${INSTANCE_GROUPS:-}" ]; then + if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then + local all_instance_groups=(${INSTANCE_GROUPS[@]} ${WINDOWS_INSTANCE_GROUPS[@]}) + else + local all_instance_groups=(${INSTANCE_GROUPS[@]}) + fi + + if [ -z "${all_instance_groups:-}" ]; then return fi - for group in "${INSTANCE_GROUPS[@]}"; do + for group in "${all_instance_groups[@]}"; do local creation_timestamp=$(gcloud compute instance-groups managed describe \ "${group}" \ --project "${PROJECT}" \