k3s/cluster/gce/windows/smoke-test.sh

686 lines
22 KiB
Bash
Executable File

#!/bin/bash
# Copyright 2019 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A small smoke test to run against a just-deployed kube-up cluster with Windows
# nodes. Performs checks such as:
# 1) Verifying that all Windows nodes have status Ready.
# 2) Verifying that no system pods are attempting to run on Windows nodes.
# 3) Verifying pairwise connectivity between most of the following: Linux
# pods, Windows pods, K8s services, and the Internet.
# 4) Verifying that basic DNS resolution works in Windows pods.
#
# This script assumes that it is run from the root of the kubernetes repository.
#
# TODOs:
# - Implement the node-to-pod checks.
# - Capture stdout for each command to a file and only print it when the test
# fails.
# - Move copy-pasted code into reusable functions.
# - Continue running all checks after one fails.
# - Test service connectivity by running a test pod with an http server and
# exposing it as a service (rather than curl-ing from existing system
# services that don't serve http requests).
# - Add test retries for transient errors, such as:
# "error: unable to upgrade connection: Authorization error
# (user=kube-apiserver, verb=create, resource=nodes, subresource=proxy)"
# Override this to use a different kubectl binary.
kubectl=kubectl
linux_deployment_timeout=60
windows_deployment_timeout=180
output_file=/tmp/k8s-smoke-test.out
function check_windows_nodes_are_ready {
# kubectl filtering is the worst.
statuses=$(${kubectl} get nodes -l beta.kubernetes.io/os=windows \
-o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}')
for status in $statuses; do
if [[ $status == "False" ]]; then
echo "ERROR: some Windows node has status != Ready"
echo "kubectl get nodes -l beta.kubernetes.io/os=windows"
${kubectl} get nodes -l beta.kubernetes.io/os=windows
exit 1
fi
done
echo "Verified that all Windows nodes have status Ready"
}
function untaint_windows_nodes {
# Untaint the windows nodes to allow test workloads without tolerations to be
# scheduled onto them.
WINDOWS_NODES=$(${kubectl} get nodes -l beta.kubernetes.io/os=windows -o name)
for node in $WINDOWS_NODES; do
${kubectl} taint node "$node" node.kubernetes.io/os:NoSchedule-
done
}
function check_no_system_pods_on_windows_nodes {
windows_system_pods=$(${kubectl} get pods --namespace kube-system \
-o wide | grep -E "Pending|windows" | wc -w)
if [[ $windows_system_pods -ne 0 ]]; then
echo "ERROR: there are kube-system pods trying to run on Windows nodes"
echo "kubectl get pods --namespace kube-system -o wide"
${kubectl} get pods --namespace kube-system -o wide
exit 1
fi
echo "Verified that all system pods are running on Linux nodes"
}
linux_webserver_deployment=linux-nginx
linux_webserver_pod_label=nginx
linux_webserver_replicas=1
function deploy_linux_webserver_pod {
echo "Writing example deployment to $linux_webserver_deployment.yaml"
cat <<EOF > $linux_webserver_deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: $linux_webserver_deployment
labels:
app: $linux_webserver_pod_label
spec:
replicas: $linux_webserver_replicas
selector:
matchLabels:
app: $linux_webserver_pod_label
template:
metadata:
labels:
app: $linux_webserver_pod_label
spec:
containers:
- name: nginx
image: nginx:1.7.9
nodeSelector:
beta.kubernetes.io/os: linux
EOF
if ! ${kubectl} create -f $linux_webserver_deployment.yaml; then
echo "kubectl create -f $linux_webserver_deployment.yaml failed"
exit 1
fi
timeout=$linux_deployment_timeout
while [[ $timeout -gt 0 ]]; do
echo "Waiting for $linux_webserver_replicas Linux $linux_webserver_pod_label pods to become Ready"
statuses=$(${kubectl} get pods -l app=$linux_webserver_pod_label \
-o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
| grep "True" | wc -w)
if [[ $statuses -eq $linux_webserver_replicas ]]; then
break
else
sleep 10
(( timeout=timeout-10 ))
fi
done
if [[ $timeout -gt 0 ]]; then
echo "All $linux_webserver_pod_label pods became Ready"
else
echo "ERROR: Not all $linux_webserver_pod_label pods became Ready"
echo "kubectl get pods -l app=$linux_webserver_pod_label"
${kubectl} get pods -l app=$linux_webserver_pod_label
cleanup_deployments
exit 1
fi
}
# Returns the name of an arbitrary Linux webserver pod.
function get_linux_webserver_pod_name {
$kubectl get pods -l app=$linux_webserver_pod_label \
-o jsonpath='{.items[0].metadata.name}'
}
# Returns the IP address of an arbitrary Linux webserver pod.
function get_linux_webserver_pod_ip {
$kubectl get pods -l app=$linux_webserver_pod_label \
-o jsonpath='{.items[0].status.podIP}'
}
function undeploy_linux_webserver_pod {
${kubectl} delete deployment $linux_webserver_deployment
}
linux_command_deployment=linux-ubuntu
linux_command_pod_label=ubuntu
linux_command_replicas=1
function deploy_linux_command_pod {
echo "Writing example deployment to $linux_command_deployment.yaml"
cat <<EOF > $linux_command_deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: $linux_command_deployment
labels:
app: $linux_command_pod_label
spec:
replicas: $linux_command_replicas
selector:
matchLabels:
app: $linux_command_pod_label
template:
metadata:
labels:
app: $linux_command_pod_label
spec:
containers:
- name: ubuntu
image: ubuntu
command: ["sleep", "123456"]
nodeSelector:
beta.kubernetes.io/os: linux
EOF
if ! ${kubectl} create -f $linux_command_deployment.yaml; then
echo "kubectl create -f $linux_command_deployment.yaml failed"
exit 1
fi
timeout=$linux_deployment_timeout
while [[ $timeout -gt 0 ]]; do
echo "Waiting for $linux_command_replicas Linux $linux_command_pod_label pods to become Ready"
statuses=$(${kubectl} get pods -l app=$linux_command_pod_label \
-o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
| grep "True" | wc -w)
if [[ $statuses -eq $linux_command_replicas ]]; then
break
else
sleep 10
(( timeout=timeout-10 ))
fi
done
if [[ $timeout -gt 0 ]]; then
echo "All $linux_command_pod_label pods became Ready"
else
echo "ERROR: Not all $linux_command_pod_label pods became Ready"
echo "kubectl get pods -l app=$linux_command_pod_label"
${kubectl} get pods -l app=$linux_command_pod_label
cleanup_deployments
exit 1
fi
}
# Returns the name of an arbitrary Linux command pod.
function get_linux_command_pod_name {
$kubectl get pods -l app=$linux_command_pod_label \
-o jsonpath='{.items[0].metadata.name}'
}
# Returns the IP address of an arbitrary Linux command pod.
function get_linux_command_pod_ip {
$kubectl get pods -l app=$linux_command_pod_label \
-o jsonpath='{.items[0].status.podIP}'
}
# Installs test executables (ping, curl) in the Linux command pod.
# NOTE: this assumes that there is only one Linux "command pod".
# TODO(pjh): fix this.
function prepare_linux_command_pod {
local linux_command_pod
linux_command_pod="$(get_linux_command_pod_name)"
echo "Installing test utilities in Linux command pod, may take a minute"
$kubectl exec "$linux_command_pod" -- apt-get update > /dev/null
$kubectl exec "$linux_command_pod" -- \
apt-get install -y iputils-ping curl > /dev/null
}
function undeploy_linux_command_pod {
${kubectl} delete deployment $linux_command_deployment
}
windows_webserver_deployment=windows-nettest
windows_webserver_pod_label=nettest
windows_webserver_replicas=1
function deploy_windows_webserver_pod {
echo "Writing example deployment to $windows_webserver_deployment.yaml"
cat <<EOF > $windows_webserver_deployment.yaml
# You can run a pod with the e2eteam/nettest:1.0 image (which should listen on
# <podIP>:8080) and create another pod on a different node (linux would be
# easier) to curl the http server:
# curl http://<pod_ip>:8080/read
apiVersion: apps/v1
kind: Deployment
metadata:
name: $windows_webserver_deployment
labels:
app: $windows_webserver_pod_label
spec:
replicas: $windows_webserver_replicas
selector:
matchLabels:
app: $windows_webserver_pod_label
template:
metadata:
labels:
app: $windows_webserver_pod_label
spec:
containers:
- name: nettest
image: e2eteam/nettest:1.0
nodeSelector:
beta.kubernetes.io/os: windows
tolerations:
- effect: NoSchedule
key: node.kubernetes.io/os
operator: Equal
value: windows
EOF
if ! ${kubectl} create -f $windows_webserver_deployment.yaml; then
echo "kubectl create -f $windows_webserver_deployment.yaml failed"
exit 1
fi
timeout=$windows_deployment_timeout
while [[ $timeout -gt 0 ]]; do
echo "Waiting for $windows_webserver_replicas Windows $windows_webserver_pod_label pods to become Ready"
statuses=$(${kubectl} get pods -l app=$windows_webserver_pod_label \
-o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
| grep "True" | wc -w)
if [[ $statuses -eq $windows_webserver_replicas ]]; then
break
else
sleep 10
(( timeout=timeout-10 ))
fi
done
if [[ $timeout -gt 0 ]]; then
echo "All $windows_webserver_pod_label pods became Ready"
else
echo "ERROR: Not all $windows_webserver_pod_label pods became Ready"
echo "kubectl get pods -l app=$windows_webserver_pod_label"
${kubectl} get pods -l app=$windows_webserver_pod_label
cleanup_deployments
exit 1
fi
}
function get_windows_webserver_pod_name {
$kubectl get pods -l app=$windows_webserver_pod_label \
-o jsonpath='{.items[0].metadata.name}'
}
function get_windows_webserver_pod_ip {
$kubectl get pods -l app=$windows_webserver_pod_label \
-o jsonpath='{.items[0].status.podIP}'
}
function undeploy_windows_webserver_pod {
${kubectl} delete deployment $windows_webserver_deployment
}
windows_command_deployment=windows-powershell
windows_command_pod_label=powershell
windows_command_replicas=1
function deploy_windows_command_pod {
echo "Writing example deployment to $windows_command_deployment.yaml"
cat <<EOF > $windows_command_deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: $windows_command_deployment
labels:
app: $windows_command_pod_label
spec:
replicas: $windows_command_replicas
selector:
matchLabels:
app: $windows_command_pod_label
template:
metadata:
labels:
app: $windows_command_pod_label
spec:
containers:
- name: nettest
image: e2eteam/nettest:1.0
nodeSelector:
beta.kubernetes.io/os: windows
tolerations:
- effect: NoSchedule
key: node.kubernetes.io/os
operator: Equal
value: windows
EOF
if ! ${kubectl} create -f $windows_command_deployment.yaml; then
echo "kubectl create -f $windows_command_deployment.yaml failed"
exit 1
fi
timeout=$windows_deployment_timeout
while [[ $timeout -gt 0 ]]; do
echo "Waiting for $windows_command_replicas Windows $windows_command_pod_label pods to become Ready"
statuses=$(${kubectl} get pods -l app=$windows_command_pod_label \
-o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
| grep "True" | wc -w)
if [[ $statuses -eq $windows_command_replicas ]]; then
break
else
sleep 10
(( timeout=timeout-10 ))
fi
done
if [[ $timeout -gt 0 ]]; then
echo "All $windows_command_pod_label pods became Ready"
else
echo "ERROR: Not all $windows_command_pod_label pods became Ready"
echo "kubectl get pods -l app=$windows_command_pod_label"
${kubectl} get pods -l app=$windows_command_pod_label
cleanup_deployments
exit 1
fi
}
function get_windows_command_pod_name {
$kubectl get pods -l app=$windows_command_pod_label \
-o jsonpath='{.items[0].metadata.name}'
}
function get_windows_command_pod_ip {
$kubectl get pods -l app=$windows_command_pod_label \
-o jsonpath='{.items[0].status.podIP}'
}
function undeploy_windows_command_pod {
${kubectl} delete deployment $windows_command_deployment
}
function test_linux_node_to_linux_pod {
echo "TODO: ${FUNCNAME[0]}"
}
function test_linux_node_to_windows_pod {
echo "TODO: ${FUNCNAME[0]}"
}
function test_linux_pod_to_linux_pod {
echo "TEST: ${FUNCNAME[0]}"
local linux_command_pod
linux_command_pod="$(get_linux_command_pod_name)"
local linux_webserver_pod_ip
linux_webserver_pod_ip="$(get_linux_webserver_pod_ip)"
if ! $kubectl exec "$linux_command_pod" -- curl -m 20 \
"http://$linux_webserver_pod_ip" &> $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
# TODO(pjh): this test flakily fails on brand-new clusters, not sure why.
# % Total % Received % Xferd Average Speed Time Time Time Current
# Dload Upload Total Spent Left Speed
# 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
# curl: (6) Could not resolve host:
# command terminated with exit code 6
function test_linux_pod_to_windows_pod {
echo "TEST: ${FUNCNAME[0]}"
local linux_command_pod
linux_command_pod="$(get_linux_command_pod_name)"
local windows_webserver_pod_ip
windows_webserver_pod_ip="$(get_windows_webserver_pod_ip)"
if ! $kubectl exec "$linux_command_pod" -- curl -m 20 \
"http://$windows_webserver_pod_ip:8080/read" &> $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
echo "This test seems to be flaky. TODO(pjh): investigate."
exit 1
fi
}
function test_linux_pod_to_internet {
echo "TEST: ${FUNCNAME[0]}"
local linux_command_pod
linux_command_pod="$(get_linux_command_pod_name)"
local internet_ip="8.8.8.8" # Google DNS
# This is expected to return 404 (not found).
if ! $kubectl exec "$linux_command_pod" -- curl -m 20 \
"http://$internet_ip" > $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
function test_linux_pod_to_k8s_service {
echo "TEST: ${FUNCNAME[0]}"
local linux_command_pod
linux_command_pod="$(get_linux_command_pod_name)"
local service="heapster"
local service_ip
service_ip=$($kubectl get service --namespace kube-system $service \
-o jsonpath='{.spec.clusterIP}')
local service_port
service_port=$($kubectl get service --namespace kube-system $service \
-o jsonpath='{.spec.ports[?(@.protocol=="TCP")].port}')
echo "curl-ing $service address from Linux pod: $service_ip:$service_port"
# curl-ing the heapster service results in an expected 404 response code. The
# curl command does not set a failure return code in this case.
if ! $kubectl exec "$linux_command_pod" -- \
curl -m 20 "http://$service_ip:$service_port" &> $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
function test_windows_node_to_linux_pod {
echo "TODO: ${FUNCNAME[0]}"
}
function test_windows_node_to_windows_pod {
echo "TODO: ${FUNCNAME[0]}"
}
# TODO(pjh): this test failed for me once with
# error: unable to upgrade connection: container not found ("nettest")
# Maybe the container crashed for some reason? Investigate if it happens more.
#
# TODO(pjh): another one-time failure:
# error: unable to upgrade connection: Authorization error
# (user=kube-apiserver, verb=create, resource=nodes, subresource=proxy)
function test_windows_pod_to_linux_pod {
echo "TEST: ${FUNCNAME[0]}"
local windows_command_pod
windows_command_pod="$(get_windows_command_pod_name)"
local linux_webserver_pod_ip
linux_webserver_pod_ip="$(get_linux_webserver_pod_ip)"
if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
"curl -UseBasicParsing http://$linux_webserver_pod_ip" > \
$output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
function test_windows_pod_to_windows_pod {
echo "TEST: ${FUNCNAME[0]}"
local windows_command_pod
windows_command_pod="$(get_windows_command_pod_name)"
local windows_webserver_pod_ip
windows_webserver_pod_ip="$(get_windows_webserver_pod_ip)"
if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
"curl -UseBasicParsing http://$windows_webserver_pod_ip:8080/read" \
> $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
function test_windows_pod_to_internet {
echo "TEST: ${FUNCNAME[0]}"
local windows_command_pod
windows_command_pod="$(get_windows_command_pod_name)"
local internet_ip="8.8.8.8"
# This snippet tests Internet connectivity without depending on DNS by
# attempting to curl Google's well-known DNS IP, 8.8.8.8. On success we expect
# to get back a 404 status code; on failure the response object will have a
# status code of 0 or some other HTTP code.
if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
"\$response = try { \`
(curl -UseBasicParsing http://$internet_ip \`
-ErrorAction Stop).BaseResponse \`
} catch [System.Net.WebException] { \`
\$_.Exception.Response \`
}; \`
\$statusCodeInt = [int]\$response.StatusCode; \`
if (\$statusCodeInt -eq 404) { \`
exit 0 \`
} else { \`
Write-Host \"curl $internet_ip got unexpected status code \$statusCodeInt\"
exit 1 \`
}" > $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
function test_windows_pod_to_k8s_service {
echo "TEST: ${FUNCNAME[0]}"
local windows_command_pod
windows_command_pod="$(get_windows_command_pod_name)"
local service="heapster"
local service_ip
service_ip=$($kubectl get service --namespace kube-system $service \
-o jsonpath='{.spec.clusterIP}')
local service_port
service_port=$($kubectl get service --namespace kube-system $service \
-o jsonpath='{.spec.ports[?(@.protocol=="TCP")].port}')
local service_address="$service_ip:$service_port"
echo "curl-ing $service address from Windows pod: $service_address"
# Performing a web request to the heapster service results in an expected 404
# response; this code snippet filters out the expected 404 from other status
# codes that indicate failure.
if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
"\$response = try { \`
(curl -UseBasicParsing http://$service_address \`
-ErrorAction Stop).BaseResponse \`
} catch [System.Net.WebException] { \`
\$_.Exception.Response \`
}; \`
\$statusCodeInt = [int]\$response.StatusCode; \`
if (\$statusCodeInt -eq 404) { \`
exit 0 \`
} else { \`
Write-Host \"curl $service_address got unexpected status code \$statusCodeInt\"
exit 1 \`
}" > $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
function test_kube_dns_in_windows_pod {
echo "TEST: ${FUNCNAME[0]}"
local windows_command_pod
windows_command_pod="$(get_windows_command_pod_name)"
local service="kube-dns"
local service_ip
service_ip=$($kubectl get service --namespace kube-system $service \
-o jsonpath='{.spec.clusterIP}')
if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
"Resolve-DnsName www.bing.com -server $service_ip" > $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
function test_dns_just_works_in_windows_pod {
echo "TEST: ${FUNCNAME[0]}"
local windows_command_pod
windows_command_pod="$(get_windows_command_pod_name)"
if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
"curl -UseBasicParsing http://www.bing.com" > $output_file; then
cleanup_deployments
echo "Failing output: $(cat $output_file)"
echo "FAILED: ${FUNCNAME[0]}"
exit 1
fi
}
function cleanup_deployments {
undeploy_linux_webserver_pod
undeploy_linux_command_pod
undeploy_windows_webserver_pod
undeploy_windows_command_pod
}
check_windows_nodes_are_ready
untaint_windows_nodes
check_no_system_pods_on_windows_nodes
deploy_linux_webserver_pod
deploy_linux_command_pod
deploy_windows_webserver_pod
deploy_windows_command_pod
prepare_linux_command_pod
echo ""
test_linux_node_to_linux_pod
test_linux_node_to_windows_pod
test_linux_pod_to_linux_pod
test_linux_pod_to_windows_pod
test_linux_pod_to_k8s_service
# Note: test_windows_node_to_k8s_service is not supported at this time.
# https://docs.microsoft.com/en-us/virtualization/windowscontainers/kubernetes/common-problems#my-windows-node-cannot-access-my-services-using-the-service-ip
test_windows_node_to_linux_pod
test_windows_node_to_windows_pod
test_windows_pod_to_linux_pod
test_windows_pod_to_windows_pod
test_windows_pod_to_internet
test_windows_pod_to_k8s_service
test_kube_dns_in_windows_pod
test_dns_just_works_in_windows_pod
echo ""
cleanup_deployments
echo "All tests passed!"
exit 0