Update log-dump.sh for Windows nodes.

Tested:
```
$ PROJECT=${CLOUDSDK_CORE_PROJECT} KUBERNETES_SKIP_CONFIRM=y NUM_NODES=2 \
  NUM_WINDOWS_NODES=2 KUBE_GCE_ENABLE_IP_ALIASES=true go run \
  ./hack/e2e.go -- --up
$ cluster/log-dump/log-dump.sh
$ ls _artifacts
```

Also tested with NUM_NODES=2 NUM_WINDOWS_NODES=0, and with NUM_NODES=0 NUM_WINDOWS_NODES=2.
pull/564/head
Peter Hornyack 2019-02-15 16:54:18 -08:00
parent 3efd4ca1dc
commit 0bb25290c8
1 changed file with 100 additions and 10 deletions

View File

@ -52,6 +52,10 @@ readonly initd_logfiles="docker/log"
readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log" readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log"
readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}" readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}" readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}"
# Log files found in WINDOWS_LOGS_DIR on Windows nodes.
# NOTE: this is a space-separated string (not a bash array); consumers rely on
# word-splitting to iterate over the individual file names.
readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log"
# Log files found in other directories (outside WINDOWS_LOGS_DIR) on Windows
# nodes. Full absolute Windows paths, e.g. the kernel crash dump.
readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp"
# Limit the number of concurrent node connections so that we don't run out of # Limit the number of concurrent node connections so that we don't run out of
# file descriptors for large clusters. # file descriptors for large clusters.
@ -195,6 +199,66 @@ function save-logs() {
copy-logs-from-node "${node_name}" "${dir}" "${files}" copy-logs-from-node "${node_name}" "${dir}" "${files}"
} }
# Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log
# on node $1, by running Get-EventLog on the node over ssh.
# Globals:
#   WINDOWS_LOGS_DIR (read), PROJECT (read), ZONE (read)
# Arguments:
#   $1 - name of the Windows node.
function export-windows-docker-event-log() {
  local -r node="${1}"

  local -r powershell_cmd="powershell.exe -Command \$log=\$(Get-EventLog -LogName Application -Source Docker); Set-Content '${WINDOWS_LOGS_DIR}\\docker.log' \$log.Message"

  # Retry up to 3 times to allow ssh keys to be properly propagated and
  # stored.
  local retry
  for retry in {1..3}; do
    if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
      --command "$powershell_cmd"; then
      return 0
    fi
    # Don't waste 10 seconds sleeping after the final failed attempt.
    if [[ "${retry}" -lt 3 ]]; then
      sleep 10
    fi
  done
  # Best-effort: report the failure but don't propagate a non-zero status,
  # since callers run with errexit-sensitive code paths.
  echo "Failed to export Docker event log from node ${node}" >&2
}
# Save log files and serial console output from Windows node $1 into local
# directory $2.
# This function shouldn't ever trigger errexit.
# Globals:
#   gcloud_supported_providers (read), KUBERNETES_PROVIDER (read)
#   windows_node_logfiles (read), windows_node_otherfiles (read)
#   WINDOWS_LOGS_DIR (read), PROJECT (read), ZONE (read)
# Arguments:
#   $1 - name of the Windows node.
#   $2 - local destination directory for the fetched logs.
function save-logs-windows() {
  local -r node="${1}"
  local -r dest_dir="${2}"

  if [[ ! "${gcloud_supported_providers}" =~ "${KUBERNETES_PROVIDER}" ]]; then
    echo "Not saving logs for ${node}, Windows log dumping requires gcloud support"
    return
  fi

  # Materialize the Docker event log on the node so it can be scp'd below.
  export-windows-docker-event-log "${node}"

  # Build the list of absolute remote paths to fetch.
  # ${windows_node_logfiles} is a space-separated string, so the unquoted
  # expansion below deliberately relies on word-splitting.
  local remote_files=()
  local file
  for file in ${windows_node_logfiles}; do
    remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" )
  done
  remote_files+=( "${windows_node_otherfiles[@]}" )

  # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the
  # same time.
  local remote_file retry
  for remote_file in "${remote_files[@]}"; do
    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    for retry in {1..3}; do
      if gcloud compute scp --recurse --project "${PROJECT}" \
        --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \
        > /dev/null; then
        break
      elif [[ "${retry}" -lt 3 ]]; then
        # Don't sleep after the final failed attempt.
        sleep 10
      fi
    done
  done

  # Serial port 1 contains the Windows console output.
  gcloud compute instances get-serial-port-output --project "${PROJECT}" \
    --zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true
}
# Execute a command in container $2 on node $1. # Execute a command in container $2 on node $1.
# Uses docker because the container may not ordinarily permit direct execution. # Uses docker because the container may not ordinarily permit direct execution.
function run-in-docker-container() { function run-in-docker-container() {
@ -247,8 +311,13 @@ function dump_masters() {
fi fi
} }
# Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be
# specified via $1 or $use_custom_instance_list. If not specified then the nodes
# to dump logs for will be detected using detect-node-names(); if Windows nodes
# are present then they will be detected and their logs will be dumped too.
function dump_nodes() { function dump_nodes() {
local node_names=() local node_names=()
local windows_node_names=()
if [[ -n "${1:-}" ]]; then if [[ -n "${1:-}" ]]; then
echo "Dumping logs for nodes provided as args to dump_nodes() function" echo "Dumping logs for nodes provided as args to dump_nodes() function"
node_names=( "$@" ) node_names=( "$@" )
@ -264,9 +333,12 @@ function dump_nodes() {
if [[ -n "${NODE_NAMES:-}" ]]; then if [[ -n "${NODE_NAMES:-}" ]]; then
node_names=( "${NODE_NAMES[@]}" ) node_names=( "${NODE_NAMES[@]}" )
fi fi
if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" )
fi
fi fi
if [[ "${#node_names[@]}" == 0 ]]; then if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then
echo "No nodes found!" echo "No nodes found!"
return return
fi fi
@ -276,24 +348,31 @@ function dump_nodes() {
node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}" node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}"
fi fi
nodes_selected_for_logs=() linux_nodes_selected_for_logs=()
if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then
# We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs. # We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs.
for index in `shuf -i 0-$(( ${#node_names[*]} - 1 )) -n ${LOGDUMP_ONLY_N_RANDOM_NODES}` for index in `shuf -i 0-$(( ${#node_names[*]} - 1 )) -n ${LOGDUMP_ONLY_N_RANDOM_NODES}`
do do
nodes_selected_for_logs+=("${node_names[$index]}") linux_nodes_selected_for_logs+=("${node_names[$index]}")
done done
else else
nodes_selected_for_logs=( "${node_names[@]}" ) linux_nodes_selected_for_logs=( "${node_names[@]}" )
fi fi
all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" )
all_selected_nodes+=( "${windows_node_names[@]}" )
proc=${max_dump_processes} proc=${max_dump_processes}
for node_name in "${nodes_selected_for_logs[@]}"; do for i in "${!all_selected_nodes[@]}"; do
node_name="${all_selected_nodes[$i]}"
node_dir="${report_dir}/${node_name}" node_dir="${report_dir}/${node_name}"
mkdir -p "${node_dir}" mkdir -p "${node_dir}"
# Save logs in the background. This speeds up things when there are if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then
# many nodes. # Save logs in the background. This speeds up things when there are
save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" & # many nodes.
save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" &
else
save-logs-windows "${node_name}" "${node_dir}" &
fi
# We don't want to run more than ${max_dump_processes} at a time, so # We don't want to run more than ${max_dump_processes} at a time, so
# wait once we hit that many nodes. This isn't ideal, since one might # wait once we hit that many nodes. This isn't ideal, since one might
@ -311,6 +390,9 @@ function dump_nodes() {
} }
# Collect names of nodes which didn't run logexporter successfully. # Collect names of nodes which didn't run logexporter successfully.
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
#
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes. # Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes). # Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
# Assumes: # Assumes:
@ -328,6 +410,8 @@ function find_non_logexported_nodes() {
done done
} }
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
function dump_nodes_with_logexporter() { function dump_nodes_with_logexporter() {
if [[ -n "${use_custom_instance_list}" ]]; then if [[ -n "${use_custom_instance_list}" ]]; then
echo "Dumping logs for nodes provided by log_dump_custom_get_instances() function" echo "Dumping logs for nodes provided by log_dump_custom_get_instances() function"
@ -446,10 +530,16 @@ function detect_node_failures() {
fi fi
detect-node-names detect-node-names
if [ -z "${INSTANCE_GROUPS:-}" ]; then if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
local all_instance_groups=(${INSTANCE_GROUPS[@]} ${WINDOWS_INSTANCE_GROUPS[@]})
else
local all_instance_groups=(${INSTANCE_GROUPS[@]})
fi
if [ -z "${all_instance_groups:-}" ]; then
return return
fi fi
for group in "${INSTANCE_GROUPS[@]}"; do for group in "${all_instance_groups[@]}"; do
local creation_timestamp=$(gcloud compute instance-groups managed describe \ local creation_timestamp=$(gcloud compute instance-groups managed describe \
"${group}" \ "${group}" \
--project "${PROJECT}" \ --project "${PROJECT}" \