#!/bin/bash

# Copyright 2014 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Validates that the cluster is healthy.
# Error codes are:
# 0 - success
# 1 - fatal (cluster is unlikely to work)
# 2 - non-fatal (encountered some errors, but cluster should be working correctly)

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Directory one level above the one that contains this script.
KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/..

# cluster/env.sh is sourced only when it exists.
if [[ -f "${KUBE_ROOT}/cluster/env.sh" ]]; then
  source "${KUBE_ROOT}/cluster/env.sh"
fi

source "${KUBE_ROOT}/hack/lib/util.sh"
source "${KUBE_ROOT}/cluster/kube-util.sh"

# Run kubectl and retry upon failure.
#
# Invokes "${KUBE_ROOT}/cluster/kubectl.sh" with the given arguments, making at
# most 3 attempts in total and sleeping 1 second between attempts.
#
# Globals:   KUBE_ROOT (read)
# Arguments: passed through unchanged to kubectl.sh
# Outputs:   kubectl.sh's own output; retry diagnostics go to stderr
# Returns:   0 as soon as one attempt succeeds, 1 once all attempts have failed
function kubectl_retry() {
  # Local so the counter cannot clobber a caller's variable of the same name.
  local tries=3
  while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
    tries=$((tries-1))
    if [[ ${tries} -le 0 ]]; then
      # "$*" joins the arguments into one word for the message.
      echo "('kubectl $*' failed, giving up)" >&2
      return 1
    fi
    echo "(kubectl failed, will retry ${tries} times)" >&2
    sleep 1
  done
}

# Tunables, overridable from the environment:
# - ALLOWED_NOTREADY_NODES: subtracted from the expected node count to obtain
#   REQUIRED_NUM_NODES (default 0).
# - CLUSTER_READY_ADDITIONAL_TIME_SECONDS: default 30.
ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
CLUSTER_READY_ADDITIONAL_TIME_SECONDS="${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:-30}"

EXPECTED_NUM_NODES="${NUM_NODES}"

if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
  echo "Validating gce cluster, MULTIZONE=${MULTIZONE:-}"
  # In multizone mode we need to add instances for all nodes in the region.
  if [[ "${MULTIZONE:-}" == "true" ]]; then
    # The inner gcloud call builds a comma-separated list of the zone names in
    # REGION; the outer call lists the instances whose name matches
    # NODE_INSTANCE_PREFIX in those zones, and wc -l counts them.
    # The bracketed --format values and the region filter are quoted so the
    # shell neither globs nor word-splits them.
    EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format='[no-heading]' \
      --filter="name ~ '${NODE_INSTANCE_PREFIX}.*' AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter="region=${REGION}" --format='csv[no-heading](name)' | tr "\n" "," | sed "s/,$//"))" | wc -l)
    echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
  fi
fi

# When the master kubelet registers itself, the masters are added to the
# expected node count.
if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
  if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
    NUM_MASTERS=$(get-master-replicas-count)
  else
    NUM_MASTERS=1
  fi
  EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+NUM_MASTERS))
fi

REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))

# Make several attempts to deal with slow cluster birth.
return_value=0
attempt=0
# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
PAUSE_BETWEEN_ITERATIONS_SECONDS=15
MAX_ATTEMPTS=100
# Extra iterations granted once REQUIRED_NUM_NODES nodes are ready:
# CLUSTER_READY_ADDITIONAL_TIME_SECONDS divided by the pause, rounded up.
ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1) / PAUSE_BETWEEN_ITERATIONS_SECONDS))
while true; do
  # Pause between iterations of this large outer loop.
  if [[ ${attempt} -gt 0 ]]; then
    sleep "${PAUSE_BETWEEN_ITERATIONS_SECONDS}"
  fi
  attempt=$((attempt+1))

  # Capture the output of "kubectl get nodes" and gather 2 counts from it:
  # - Total number of nodes.
  # - Number of "ready" nodes.
  #
  # kubectl failures are tolerated here because during cluster bootstrapping
  # for clusters where the master node is registered, the apiserver will become
  # available and then get restarted as the kubelet configures the docker bridge.
  #
  # The exit status is captured with "|| res=$?" so that a failing command
  # substitution does not stop the whole script under errexit.
  res=0
  node=$(kubectl_retry get nodes) || res=$?
  if [[ "${res}" -ne 0 ]]; then
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_red}Failed to get nodes.${color_norm}"
      exit 1
    else
      continue
    fi
  fi

  # One line is subtracted from each count for the header row of the listing.
  # A node counts as ready when its line does not contain "NotReady".
  found=$(($(printf '%s\n' "${node}" | wc -l) - 1))
  ready=$(($(printf '%s\n' "${node}" | grep -cv "NotReady") - 1))

  if (( found == EXPECTED_NUM_NODES )) && (( ready == EXPECTED_NUM_NODES )); then
    break
  elif (( found > EXPECTED_NUM_NODES )); then
    if [[ "${KUBE_USE_EXISTING_MASTER:-}" != "true" ]]; then
      echo -e "${color_red}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
    fi
    break
  else
    # From here on found <= EXPECTED_NUM_NODES, and ready can never exceed
    # found, so no separate "too many ready nodes" case is needed.
    if [[ "${REQUIRED_NUM_NODES}" -le "${ready}" ]]; then
      echo -e "${color_green}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm}"
      # Set only the first time the threshold is reached.
      last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
    fi
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_yellow}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
      # Diagnostic listing only: a failure here must not abort the script
      # under errexit and replace the exit code chosen just below.
      kubectl_retry get nodes || true
      if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then
        exit 1
      else
        return_value=2
        break
      fi
    else
      echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
    fi
  fi
done
echo "Found ${found} node(s)."
kubectl_retry get nodes

# Poll componentstatuses until every reported component is healthy. After the
# initial check, up to 5 retries are made, sleeping 30 seconds before each.
attempt=0
while true; do
  # Render every componentstatus as "<type>:<status>" of its first condition,
  # one per line, and gather 2 counts:
  # - Total number of componentstatuses.
  # - Number of "healthy" components.
  #
  # Each step ends in "|| true": kubectl may fail, and grep -c exits non-zero
  # when it counts 0 matches; neither may abort the script under errexit.
  cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
  componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
  healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true

  # Zero components is treated as "not working yet" as well.
  if (( componentstatuses > healthy )) || (( componentstatuses == 0 )); then
    if (( attempt < 5 )); then
      echo -e "${color_yellow}Cluster not working yet.${color_norm}"
      attempt=$((attempt+1))
      sleep 30
    else
      echo -e "${color_yellow}Validate output:${color_norm}"
      # Diagnostic listing only: keep going on failure so that the message
      # below is still printed before exiting.
      kubectl_retry get cs || true
      echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
      exit 1
    fi
  else
    break
  fi
done

# Print the final component listing (best effort) and report the overall
# result. return_value is 0 on full success and 2 when validation finished
# with non-fatal problems.
echo "Validate output:"
kubectl_retry get cs || true
if [[ "${return_value}" == "0" ]]; then
  echo -e "${color_green}Cluster validation succeeded${color_norm}"
else
  echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
fi

exit "${return_value}"