Allow some NotReady nodes in 1000 node clusters

gmarek 2016-02-01 12:37:38 +01:00
parent 3e04a45a95
commit 6aaabc6f46
2 changed files with 48 additions and 2 deletions


@@ -15,6 +15,10 @@
# limitations under the License.
# Validates that the cluster is healthy.
# Error codes are:
# 0 - success
# 1 - fatal (cluster is unlikely to work)
# 2 - non-fatal (encountered some errors, but cluster should be working correctly)
set -o errexit
set -o nounset
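
These exit codes let a caller distinguish a hard failure from a degraded but still usable cluster. A minimal sketch of a hypothetical wrapper; the script path and the messages are assumptions, not part of this change:

    # Hypothetical wrapper; path and messages are illustrative only.
    "${KUBE_ROOT}/cluster/validate-cluster.sh" && rc=0 || rc=$?
    case "${rc}" in
      0) echo "cluster healthy" ;;
      1) echo "fatal: cluster is unlikely to work" >&2; exit 1 ;;
      2) echo "non-fatal problems: cluster should still be working" >&2 ;;
    esac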
@@ -29,11 +33,14 @@ fi
source "${KUBE_ROOT}/cluster/kube-env.sh"
source "${KUBE_ROOT}/cluster/kube-util.sh"
ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
EXPECTED_NUM_NODES="${NUM_NODES}"
if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
  EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+1))
fi
# Make several attempts to deal with slow cluster birth.
return_value=0
attempt=0
while true; do
# The "kubectl get nodes -o template" exports node information.
@@ -59,7 +66,12 @@ while true; do
  if (( attempt > 100 )); then
    echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
    "${KUBE_ROOT}/cluster/kubectl.sh" get nodes
    exit 2
    if [ "$((${EXPECTED_NUM_NODES} - ${found}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then
      exit 1
    else
      return_value=2
      break
    fi
  else
    echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
  fi
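
The new check only fires after the retry budget is exhausted: if more nodes are missing than ALLOWED_NOTREADY_NODES allows, the script exits fatally; otherwise it records the non-fatal code and breaks out of the loop. A worked example with illustrative values matching the 1000-node job:

    EXPECTED_NUM_NODES=1001    # 1000 nodes plus the registered master
    found=999                  # two nodes never registered
    ALLOWED_NOTREADY_NODES=2
    if [ "$((${EXPECTED_NUM_NODES} - ${found}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then
      echo "too many missing nodes: validation would exit 1"
    else
      echo "within tolerance: validation records return_value=2 and continues"
    fi
    # prints: within tolerance: validation records return_value=2 and continues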
@@ -99,4 +111,10 @@ done
echo "Validate output:"
"${KUBE_ROOT}/cluster/kubectl.sh" get cs
echo -e "${color_green}Cluster validation succeeded${color_norm}"
if [ "${return_value}" == "0" ]; then
echo -e "${color_green}Cluster validation succeeded${color_norm}"
else
echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
fi
exit "${return_value}"


@@ -508,6 +508,7 @@ case ${JOB_NAME} in
  # Runs the performance/scalability test on huge 1000-node cluster on GCE.
  # Flannel is used as network provider.
  # Allows a couple of nodes to be NotReady during startup
  kubernetes-e2e-gce-enormous-cluster)
    : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-cluster"}
    : ${E2E_NETWORK:="e2e-enormous-cluster"}
@@ -526,6 +527,32 @@
NODE_SIZE="n1-standard-1"
NODE_DISK_SIZE="50GB"
NUM_NODES="1000"
ALLOWED_NOTREADY_NODES="2"
# Reduce logs verbosity
TEST_CLUSTER_LOG_LEVEL="--v=1"
# Increase resync period to simulate production
TEST_CLUSTER_RESYNC_PERIOD="--min-resync-period=12h"
;;
  # Starts and tears down 1000-node cluster on GCE using flannel networking
  # Requires all 1000 nodes to come up.
  kubernetes-e2e-gce-enormous-startup)
    : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-startup"}
    # TODO: increase a quota for networks in kubernetes-scale and move this test to its own network
    : ${E2E_NETWORK:="e2e-enormous-cluster"}
    : ${E2E_TEST:="false"}
    : ${KUBE_GCE_INSTANCE_PREFIX:="e2e-enormous-startup"}
    : ${PROJECT:="kubernetes-scale"}
    # Override GCE defaults.
    NETWORK_PROVIDER="flannel"
    # Temporarily switch off Heapster, as this will not schedule anywhere.
    # TODO: Think of a solution to enable it.
    ENABLE_CLUSTER_MONITORING="none"
    E2E_ZONE="asia-east1-a"
    MASTER_SIZE="n1-standard-32"
    NODE_SIZE="n1-standard-1"
    NODE_DISK_SIZE="50GB"
    NUM_NODES="1000"
    # Reduce logs verbosity
    TEST_CLUSTER_LOG_LEVEL="--v=1"
    # Increase resync period to simulate production
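
These job definitions lean on the shell's assign-default expansion, so anything already exported by the Jenkins environment wins over the fallbacks written here. A minimal standalone demonstration of that idiom, using one of the values above:

    # ": ${VAR:=default}" assigns the default only when VAR is unset or empty.
    unset E2E_CLUSTER_NAME
    : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-startup"}
    echo "${E2E_CLUSTER_NAME}"    # -> jenkins-gce-enormous-startup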
@@ -900,6 +927,7 @@ export KUBE_GCE_NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-}
export KUBE_OS_DISTRIBUTION=${KUBE_OS_DISTRIBUTION:-}
export GCE_SERVICE_ACCOUNT=$(gcloud auth list 2> /dev/null | grep active | cut -f3 -d' ')
export FAIL_ON_GCP_RESOURCE_LEAK="${FAIL_ON_GCP_RESOURCE_LEAK:-false}"
export ALLOWED_NOTREADY_NODES=${ALLOWED_NOTREADY_NODES:-}
# GKE variables
export CLUSTER_NAME=${E2E_CLUSTER_NAME}
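
The export above forwards ALLOWED_NOTREADY_NODES to the cluster scripts even for jobs that never define it; because the validation script reads it with the ":-0" form, an exported empty string still resolves to the strict default of 0. A small standalone check of that interplay:

    unset ALLOWED_NOTREADY_NODES
    export ALLOWED_NOTREADY_NODES=${ALLOWED_NOTREADY_NODES:-}   # exports "" when no job set it
    echo "${ALLOWED_NOTREADY_NODES:-0}"                         # ":-" also covers empty -> prints 0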