Add an e2e image puller static pod

pull/6/head
Prashanth B 2016-05-18 02:16:32 +00:00
parent 4215fe57a5
commit 730555b09b
13 changed files with 172 additions and 19 deletions

View File

@@ -505,6 +505,7 @@ CA_CERT: $(yaml-quote ${CA_CERT_BASE64:-})
KUBELET_CERT: $(yaml-quote ${KUBELET_CERT_BASE64:-})
KUBELET_KEY: $(yaml-quote ${KUBELET_KEY_BASE64:-})
NETWORK_PROVIDER: $(yaml-quote ${NETWORK_PROVIDER:-})
PREPULL_E2E_IMAGES: $(yaml-quote ${PREPULL_E2E_IMAGES:-})
HAIRPIN_MODE: $(yaml-quote ${HAIRPIN_MODE:-})
OPENCONTRAIL_TAG: $(yaml-quote ${OPENCONTRAIL_TAG:-})
OPENCONTRAIL_KUBERNETES_TAG: $(yaml-quote ${OPENCONTRAIL_KUBERNETES_TAG:-})

View File

@@ -158,3 +158,7 @@ HAIRPIN_MODE="${HAIRPIN_MODE:-promiscuous-bridge}" # promiscuous-bridge, hairpin
# Optional: if set to true, kube-up will configure the cluster to run e2e tests.
E2E_STORAGE_TEST_ENVIRONMENT=${KUBE_E2E_STORAGE_TEST_ENVIRONMENT:-false}
# Optional: if set to true, an image puller is deployed. Only for use in e2e clusters.
# TODO: Pipe this through GKE e2e clusters once we know it helps.
PREPULL_E2E_IMAGES="${PREPULL_E2E_IMAGES:-true}"

View File

@@ -440,6 +440,7 @@ dns_server: '$(echo "$DNS_SERVER_IP" | sed -e "s/'/''/g")'
dns_domain: '$(echo "$DNS_DOMAIN" | sed -e "s/'/''/g")'
admission_control: '$(echo "$ADMISSION_CONTROL" | sed -e "s/'/''/g")'
network_provider: '$(echo "$NETWORK_PROVIDER" | sed -e "s/'/''/g")'
prepull_e2e_images: '$(echo "$PREPULL_E2E_IMAGES" | sed -e "s/'/''/g")'
hairpin_mode: '$(echo "$HAIRPIN_MODE" | sed -e "s/'/''/g")'
opencontrail_tag: '$(echo "$OPENCONTRAIL_TAG" | sed -e "s/'/''/g")'
opencontrail_kubernetes_tag: '$(echo "$OPENCONTRAIL_KUBERNETES_TAG")'

View File

@@ -0,0 +1,44 @@
# e2e-image-puller seeds nodes in an e2e cluster with test images.
apiVersion: v1
kind: Pod
metadata:
  name: e2e-image-puller
  namespace: kube-system
  labels:
    name: e2e-image-puller
spec:
  containers:
  - name: image-puller
    resources:
      requests:
        cpu: 100m
      limits:
        cpu: 100m
    image: gcr.io/google_containers/busybox:1.24
    # TODO: Replace this with a go script that pulls in parallel?
    # Currently it takes ~5m to pull all e2e images, so this is OK, and
    # fewer moving parts is always better.
    # TODO: Replace the hardcoded image list with an autogen list; the list is
    # currently hard-coded for static verification. It was generated via:
    # grep -Iiroh "gcr.io/google_.*" "${KUBE_ROOT}/test/e2e" | \
    #   sed -e "s/[,\")}]//g" | awk '{print $1}' | sort | uniq | tr '\n' ' '
    command:
    - /bin/sh
    - -c
    - "for i in gcr.io/google_containers/busybox gcr.io/google_containers/busybox:1.24 gcr.io/google_containers/dnsutils:e2e gcr.io/google_containers/eptest:0.1 gcr.io/google_containers/fakegitserver:0.1 gcr.io/google_containers/hostexec:1.2 gcr.io/google_containers/iperf:e2e gcr.io/google_containers/jessie-dnsutils:e2e gcr.io/google_containers/liveness:e2e gcr.io/google_containers/mounttest:0.2 gcr.io/google_containers/mounttest:0.5 gcr.io/google_containers/mounttest:0.6 gcr.io/google_containers/mounttest-user:0.3 gcr.io/google_containers/netexec:1.4 gcr.io/google_containers/netexec:1.5 gcr.io/google_containers/nettest:1.7 gcr.io/google_containers/nettest:1.8 gcr.io/google_containers/nginx:1.7.9 gcr.io/google_containers/nginx-slim:0.5 gcr.io/google_containers/n-way-http:1.0 gcr.io/google_containers/pause:2.0 gcr.io/google_containers/pause-amd64:3.0 gcr.io/google_containers/porter:cd5cb5791ebaa8641955f0e8c2a9bed669b1eaab gcr.io/google_containers/portforwardtester:1.0 gcr.io/google_containers/redis:e2e gcr.io/google_containers/resource_consumer:beta2 gcr.io/google_containers/serve_hostname:v1.4 gcr.io/google_containers/servicelb:0.1 gcr.io/google_containers/test-webserver:e2e gcr.io/google_containers/ubuntu:14.04 gcr.io/google_containers/update-demo:kitten gcr.io/google_containers/update-demo:nautilus gcr.io/google_containers/volume-ceph:0.1 gcr.io/google_containers/volume-gluster:0.2 gcr.io/google_containers/volume-iscsi:0.1 gcr.io/google_containers/volume-nfs:0.6 gcr.io/google_containers/volume-rbd:0.1 gcr.io/google_samples/gb-redisslave:v1; do echo $(date '+%X') pulling $i; docker pull $i 1>/dev/null; done;"
    securityContext:
      privileged: true
    volumeMounts:
    - mountPath: /var/run/docker.sock
      name: socket
    - mountPath: /usr/bin/docker
      name: docker
  volumes:
  - hostPath:
      path: /var/run/docker.sock
    name: socket
  - hostPath:
      path: /usr/bin/docker
    name: docker
  # This pod is really fire-and-forget.
  restartPolicy: Never
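The first TODO in the manifest above suggests replacing the shell loop with a Go program that pulls images in parallel. A minimal sketch of that idea (not part of this commit; the image list is abbreviated, and the docker CLI that this pod already mounts from the host is assumed) could look like:

// Hypothetical parallel puller, sketched from the TODO above; not part of this
// commit. It shells out to the same docker binary the pod mounts from the host.
package main

import (
	"log"
	"os/exec"
	"sync"
)

func main() {
	// Abbreviated; the real list is the one hard-coded in the manifest above.
	images := []string{
		"gcr.io/google_containers/busybox:1.24",
		"gcr.io/google_containers/pause:2.0",
		"gcr.io/google_containers/nettest:1.8",
	}
	const workers = 4 // pull a few images concurrently

	var wg sync.WaitGroup
	ch := make(chan string)
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for img := range ch {
				if out, err := exec.Command("docker", "pull", img).CombinedOutput(); err != nil {
					// Failures are logged but not fatal: pre-pulling is best effort.
					log.Printf("failed to pull %s: %v\n%s", img, err, out)
					continue
				}
				log.Printf("pulled %s", img)
			}
		}()
	}
	for _, img := range images {
		ch <- img
	}
	close(ch)
	wg.Wait()
}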

View File

@@ -0,0 +1,12 @@
/etc/kubernetes/manifests/e2e-image-puller.manifest:
  file.managed:
    - source: salt://e2e-image-puller/e2e-image-puller.manifest
    - template: jinja
    - user: root
    - group: root
    - mode: 644
    - makedirs: true
    - dir_mode: 755
    - require:
      - service: docker
      - service: kubelet

View File

@@ -38,6 +38,9 @@ base:
{% endif %}
{% if pillar.get('enable_cluster_registry', '').lower() == 'true' %}
    - kube-registry-proxy
{% endif %}
{% if pillar['prepull_e2e_images'] is defined and pillar['prepull_e2e_images'].lower() == 'true' %}
    - e2e-image-puller
{% endif %}
    - logrotate
    - supervisor

View File

@@ -44,6 +44,12 @@ const (
// running and ready before any e2e tests run. It includes pulling all of
// the pods (as of 5/18/15 this is 8 pods).
podStartupTimeout = 10 * time.Minute
// imagePrePullingTimeout is the time we wait for the e2e-image-puller
// static pods to pull the list of seeded images. If they don't pull
// images within this time we simply log their output and carry on
// with the tests.
imagePrePullingTimeout = 5 * time.Minute
)
var (
@@ -119,7 +125,7 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
// cluster infrastructure pods that are being pulled or started can block
// test pods from running, and tests that ensure all pods are running and
// ready will fail).
if err := framework.WaitForPodsRunningReady(api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout); err != nil {
if err := framework.WaitForPodsRunningReady(api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels); err != nil {
if c, errClient := framework.LoadClient(); errClient != nil {
framework.Logf("Unable to dump cluster information because: %v", errClient)
} else {
@@ -130,6 +136,14 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
framework.Failf("Error waiting for all pods to be running and ready: %v", err)
}
if err := framework.WaitForPodsSuccess(api.NamespaceSystem, framework.ImagePullerLabels, imagePrePullingTimeout); err != nil {
// There is no guarantee that the image pulling will succeed within
// imagePrePullingTimeout, and we don't even run the image puller on all
// platforms (including GKE). We wait for it so we get an indication of
// failures in the logs, and to maximize the benefit of image pre-pulling.
framework.Logf("WARNING: Image pulling pods failed to enter success in %v: %v", imagePrePullingTimeout, err)
}
return nil
}, func(data []byte) {

View File

@@ -209,6 +209,8 @@ func (f *Framework) AfterEach() {
// Print events if the test failed.
if CurrentGinkgoTestDescription().Failed && TestContext.DumpLogsOnFailure {
DumpAllNamespaceInfo(f.Client, f.Namespace.Name)
By(fmt.Sprintf("Dumping a list of prepulled images on each node"))
LogPodsWithLabels(f.Client, api.NamespaceSystem, ImagePullerLabels)
}
summaries := make([]TestDataSummary, 0)

View File

@@ -130,6 +130,10 @@ const (
ClaimProvisionTimeout = 5 * time.Minute
)
// Label allocated to the image puller static pod that runs on each node
// before e2es.
var ImagePullerLabels = map[string]string{"name": "e2e-image-puller"}
// SubResource proxy should have been functional in v1.0.0, but SubResource
// proxy via tunneling is known to be broken in v1.0. See
// https://github.com/kubernetes/kubernetes/pull/15224#issuecomment-146769463
@@ -413,6 +417,48 @@ func hasReplicationControllersForPod(rcs *api.ReplicationControllerList, pod api
return false
}
// WaitForPodsSuccess waits till all pods matching the given label selector
// enter the Success state. The caller is expected to only invoke this method
// once the pods have been created.
func WaitForPodsSuccess(ns string, successPodLabels map[string]string, timeout time.Duration) error {
	c, err := LoadClient()
	if err != nil {
		return err
	}
	successPodSelector := labels.SelectorFromSet(successPodLabels)
	start, badPods := time.Now(), []api.Pod{}
	if wait.PollImmediate(30*time.Second, timeout, func() (bool, error) {
		podList, err := c.Pods(ns).List(api.ListOptions{LabelSelector: successPodSelector})
		if err != nil {
			Logf("Error getting pods in namespace %q: %v", ns, err)
			return false, nil
		}
		if len(podList.Items) == 0 {
			Logf("Waiting for pods to enter Success, but no pods in %q match label %v", ns, successPodLabels)
			return true, nil
		}
		badPods = []api.Pod{}
		for _, pod := range podList.Items {
			if pod.Status.Phase != api.PodSucceeded {
				badPods = append(badPods, pod)
			}
		}
		successPods := len(podList.Items) - len(badPods)
		Logf("%d / %d pods in namespace %q are in Success state (%d seconds elapsed)",
			successPods, len(podList.Items), ns, int(time.Since(start).Seconds()))
		if len(badPods) == 0 {
			return true, nil
		}
		return false, nil
	}) != nil {
		logPodStates(badPods)
		LogPodsWithLabels(c, ns, successPodLabels)
		return fmt.Errorf("Not all pods in namespace %q are successful within %v", ns, timeout)
	}
	return nil
}
// WaitForPodsRunningReady waits up to timeout to ensure that all pods in
// namespace ns are either running and ready, or failed but controlled by a
// replication controller. Also, it ensures that at least minPods are running
@@ -420,11 +466,17 @@ func hasReplicationControllersForPod(rcs *api.ReplicationControllerList, pod api
// that it requires the list of pods on every iteration. This is useful, for
// example, in cluster startup, because the number of pods increases while
// waiting.
func WaitForPodsRunningReady(ns string, minPods int32, timeout time.Duration) error {
// Pods whose labels match ignoreLabels are skipped entirely: they count
// neither towards minPods nor as bad pods. This lets the caller decide
// whether pods that are expected to end in the "Success" state (such as the
// image puller) should be treated as "Ready" or not; callers with nothing to
// exclude pass an empty map.
func WaitForPodsRunningReady(ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string) error {
c, err := LoadClient()
if err != nil {
return err
}
ignoreSelector := labels.SelectorFromSet(ignoreLabels)
start := time.Now()
Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
timeout, minPods, ns)
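To make the new ignoreLabels parameter concrete, the two call styles used in this commit can be contrasted in a small, hypothetical helper (waitForSystemPods is illustrative only; framework.WaitForPodsRunningReady, framework.ImagePullerLabels, and api.NamespaceSystem are the identifiers used in the change above):

package example

import (
	"time"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/test/e2e/framework"
)

// waitForSystemPods is a hypothetical wrapper contrasting the two call styles:
// e2e.go ignores the image puller pods (they end in Success, not Running/Ready),
// while the Mesos, Nodes, and SchedulerPredicates tests pass an empty map.
func waitForSystemPods(minPods int32, timeout time.Duration, skipImagePuller bool) error {
	ignore := map[string]string{}
	if skipImagePuller {
		ignore = framework.ImagePullerLabels
	}
	return framework.WaitForPodsRunningReady(api.NamespaceSystem, minPods, timeout, ignore)
}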
@@ -449,6 +501,10 @@ func WaitForPodsRunningReady(ns string, minPods int32, timeout time.Duration) er
}
nOk, replicaOk, badPods := int32(0), int32(0), []api.Pod{}
for _, pod := range podList.Items {
if len(ignoreLabels) != 0 && ignoreSelector.Matches(labels.Set(pod.Labels)) {
Logf("%v in state %v, ignoring", pod.Name, pod.Status.Phase)
continue
}
if res, err := PodRunningReady(&pod); res && err == nil {
nOk++
if hasReplicationControllersForPod(rcList, pod) {
@@ -535,6 +591,20 @@ func RunKubernetesServiceTestContainer(repoRoot string, ns string) {
}
}
func kubectlLogPod(c *client.Client, pod api.Pod) {
	for _, container := range pod.Spec.Containers {
		logs, err := GetPodLogs(c, pod.Namespace, pod.Name, container.Name)
		if err != nil {
			logs, err = getPreviousPodLogs(c, pod.Namespace, pod.Name, container.Name)
			if err != nil {
				Logf("Failed to get logs of pod %v, container %v, err: %v", pod.Name, container.Name, err)
			}
		}
		By(fmt.Sprintf("Logs of %v/%v:%v on node %v", pod.Namespace, pod.Name, container.Name, pod.Spec.NodeName))
		Logf(logs)
	}
}
func LogFailedContainers(ns string) {
c, err := LoadClient()
if err != nil {
@@ -549,21 +619,23 @@ func LogFailedContainers(ns string) {
Logf("Running kubectl logs on non-ready containers in %v", ns)
for _, pod := range podList.Items {
if res, err := PodRunningReady(&pod); !res || err != nil {
for _, container := range pod.Spec.Containers {
logs, err := GetPodLogs(c, ns, pod.Name, container.Name)
if err != nil {
logs, err = getPreviousPodLogs(c, ns, pod.Name, container.Name)
if err != nil {
Logf("Failed to get logs of pod %v, container %v, err: %v", pod.Name, container.Name, err)
}
}
By(fmt.Sprintf("Logs of %v/%v:%v on node %v", ns, pod.Name, container.Name, pod.Spec.NodeName))
Logf(logs)
}
kubectlLogPod(c, pod)
}
}
}
func LogPodsWithLabels(c *client.Client, ns string, match map[string]string) {
	podList, err := c.Pods(ns).List(api.ListOptions{LabelSelector: labels.SelectorFromSet(match)})
	if err != nil {
		Logf("Error getting pods in namespace %q: %v", ns, err)
		return
	}
	Logf("Running kubectl logs on pods with labels %v in %v", match, ns)
	for _, pod := range podList.Items {
		kubectlLogPod(c, pod)
	}
}
// DeleteNamespaces deletes all namespaces that match the given delete and skip filters.
// Filter is by simple strings.Contains; first skip filter, then delete filter.
// Returns the list of deleted namespaces or an error.

View File

@@ -69,7 +69,7 @@ var _ = framework.KubeDescribe("Mesos", func() {
const ns = "static-pods"
numpods := int32(len(nodelist.Items))
framework.ExpectNoError(framework.WaitForPodsRunningReady(ns, numpods, wait.ForeverTestTimeout),
framework.ExpectNoError(framework.WaitForPodsRunningReady(ns, numpods, wait.ForeverTestTimeout, map[string]string{}),
fmt.Sprintf("number of static pods in namespace %s is %d", ns, numpods))
})

View File

@@ -124,7 +124,7 @@ var _ = framework.KubeDescribe("Networking", func() {
"Rerun it with at least two nodes to get complete coverage.")
}
podNames := LaunchNetTestPodPerNode(f, nodes, svcname, "1.8")
podNames := LaunchNetTestPodPerNode(f, nodes, svcname)
// Clean up the pods
defer func() {
@@ -256,7 +256,7 @@ var _ = framework.KubeDescribe("Networking", func() {
})
})
func LaunchNetTestPodPerNode(f *framework.Framework, nodes *api.NodeList, name, version string) []string {
func LaunchNetTestPodPerNode(f *framework.Framework, nodes *api.NodeList, name string) []string {
podNames := []string{}
totalPods := len(nodes.Items)
@@ -275,7 +275,7 @@ func LaunchNetTestPodPerNode(f *framework.Framework, nodes *api.NodeList, name,
Containers: []api.Container{
{
Name: "webserver",
Image: "gcr.io/google_containers/nettest:" + version,
Image: "gcr.io/google_containers/nettest:1.8",
Args: []string{
"-service=" + name,
//peers >= totalPods should be asserted by the container.

View File

@@ -396,7 +396,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
// the cluster is restored to health.
By("waiting for system pods to successfully restart")
err := framework.WaitForPodsRunningReady(api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout)
err := framework.WaitForPodsRunningReady(api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, map[string]string{})
Expect(err).NotTo(HaveOccurred())
})

View File

@@ -196,7 +196,7 @@ var _ = framework.KubeDescribe("SchedulerPredicates [Serial]", func() {
}
}
err = framework.WaitForPodsRunningReady(api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout)
err = framework.WaitForPodsRunningReady(api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, map[string]string{})
Expect(err).NotTo(HaveOccurred())
for _, node := range nodeList.Items {