Add trusty node health monitoring

Upstart monitors the docker, kubelet, and kube-proxy processes. This change adds an upstart job that runs as a daemon to conduct non-PID health monitoring.
2015-09-14 14:14:56 -07:00 · 2015-09-14 14:14:56 -07:00 · 7427387938
parent f03a267089
commit 7427387938
1 changed files with 40 additions and 0 deletions
--- a/cluster/gce/trusty/node.yaml
+++ b/cluster/gce/trusty/node.yaml
@@ -372,5 +372,45 @@ script
 	fi
 end script

+--===============6024533374511606659==
+MIME-Version: 1.0
+Content-Type: text/upstart-job; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="kube-node-health-monitoring.conf"
+
+description "Kubernetes node health monitoring"
+
+start on stopped kube-docker and started kube-proxy
+
+respawn
+
+script
+	# Node health monitor. Polls docker, kubelet, and kube-proxy forever and
+	# kills any component whose health probe fails. Upstart executes this
+	# stanza with "/bin/sh -e", so keep it POSIX sh and guard every command
+	# that may legitimately return non-zero.
+	set -o nounset
+
+	# Wait for a minute to let docker, kubelet, and kube-proxy processes finish initialization.
+	# TODO(andyzheng0831): replace it with a more reliable method if possible.
+	sleep 60
+
+	# Pause between two polling rounds, in seconds.
+	sleep_seconds=10
+	# Upper bound for every single health probe, in seconds.
+	max_seconds=10
+	# Pause after killing a component, in seconds, so that the next round does
+	# not kill it again while it is still coming back up.
+	restart_grace_seconds=30
+
+	# We simply kill the process when there is a failure. Another upstart job will automatically
+	# restart the process.
+	while true; do
+		# A hung daemon makes "docker version" block, hence the timeout wrapper.
+		if ! timeout "${max_seconds}" docker version > /dev/null; then
+			echo "Docker daemon failed!"
+			# pkill exits 1 when no process matched; without the guard "sh -e"
+			# would terminate the whole monitor here.
+			pkill docker || true
+			sleep "${restart_grace_seconds}"
+		fi
+		# curl: -m caps the total time, -f turns HTTP errors into a non-zero
+		# exit status, -s suppresses progress output.
+		if ! curl -m "${max_seconds}" -f -s http://127.0.0.1:10255/healthz > /dev/null; then
+			echo "Kubelet is unhealthy!"
+			pkill kubelet || true
+			sleep "${restart_grace_seconds}"
+		fi
+		if ! curl -m "${max_seconds}" -f -s http://127.0.0.1:10249/healthz > /dev/null; then
+			echo "Kube-proxy is unhealthy!"
+			pkill kube-proxy || true
+			sleep "${restart_grace_seconds}"
+		fi
+		sleep "${sleep_seconds}"
+	done
+end script
+
 --===============6024533374511606659==--