From 730555b09bea6be2a591f637116dca19cb85b118 Mon Sep 17 00:00:00 2001
From: Prashanth B
Date: Wed, 18 May 2016 02:16:32 +0000
Subject: [PATCH] Add an e2e image puller static pod

---
 cluster/common.sh                                  |  1 +
 cluster/gce/config-test.sh                         |  4 +
 cluster/gce/configure-vm.sh                        |  1 +
 .../e2e-image-puller.manifest                      | 44 +++++++++
 .../saltbase/salt/e2e-image-puller/init.sls        | 12 +++
 cluster/saltbase/salt/top.sls                      |  3 +
 test/e2e/e2e.go                                    | 16 +++-
 test/e2e/framework/framework.go                    |  2 +
 test/e2e/framework/util.go                         | 96 ++++++++++++++++---
 test/e2e/mesos.go                                  |  2 +-
 test/e2e/networking.go                             |  6 +-
 test/e2e/resize_nodes.go                           |  2 +-
 test/e2e/scheduler_predicates.go                   |  2 +-
 13 files changed, 172 insertions(+), 19 deletions(-)
 create mode 100644 cluster/saltbase/salt/e2e-image-puller/e2e-image-puller.manifest
 create mode 100644 cluster/saltbase/salt/e2e-image-puller/init.sls

diff --git a/cluster/common.sh b/cluster/common.sh
index 983b707bab..d8db74d63b 100755
--- a/cluster/common.sh
+++ b/cluster/common.sh
@@ -505,6 +505,7 @@ CA_CERT: $(yaml-quote ${CA_CERT_BASE64:-})
 KUBELET_CERT: $(yaml-quote ${KUBELET_CERT_BASE64:-})
 KUBELET_KEY: $(yaml-quote ${KUBELET_KEY_BASE64:-})
 NETWORK_PROVIDER: $(yaml-quote ${NETWORK_PROVIDER:-})
+PREPULL_E2E_IMAGES: $(yaml-quote ${PREPULL_E2E_IMAGES:-})
 HAIRPIN_MODE: $(yaml-quote ${HAIRPIN_MODE:-})
 OPENCONTRAIL_TAG: $(yaml-quote ${OPENCONTRAIL_TAG:-})
 OPENCONTRAIL_KUBERNETES_TAG: $(yaml-quote ${OPENCONTRAIL_KUBERNETES_TAG:-})
diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh
index 67bb2ec6e9..eab2cba531 100755
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@@ -158,3 +158,7 @@ HAIRPIN_MODE="${HAIRPIN_MODE:-promiscuous-bridge}" # promiscuous-bridge, hairpin
 
 # Optional: if set to true, kube-up will configure the cluster to run e2e tests.
 E2E_STORAGE_TEST_ENVIRONMENT=${KUBE_E2E_STORAGE_TEST_ENVIRONMENT:-false}
+
+# Optional: if set to true, an image puller is deployed. Only for use in e2e clusters.
+# TODO: Pipe this through GKE e2e clusters once we know it helps.
+PREPULL_E2E_IMAGES="${PREPULL_E2E_IMAGES:-true}"
diff --git a/cluster/gce/configure-vm.sh b/cluster/gce/configure-vm.sh
index 066b6fe9f3..f0f5104381 100755
--- a/cluster/gce/configure-vm.sh
+++ b/cluster/gce/configure-vm.sh
@@ -440,6 +440,7 @@ dns_server: '$(echo "$DNS_SERVER_IP" | sed -e "s/'/''/g")'
 dns_domain: '$(echo "$DNS_DOMAIN" | sed -e "s/'/''/g")'
 admission_control: '$(echo "$ADMISSION_CONTROL" | sed -e "s/'/''/g")'
 network_provider: '$(echo "$NETWORK_PROVIDER" | sed -e "s/'/''/g")'
+prepull_e2e_images: '$(echo "$PREPULL_E2E_IMAGES" | sed -e "s/'/''/g")'
 hairpin_mode: '$(echo "$HAIRPIN_MODE" | sed -e "s/'/''/g")'
 opencontrail_tag: '$(echo "$OPENCONTRAIL_TAG" | sed -e "s/'/''/g")'
 opencontrail_kubernetes_tag: '$(echo "$OPENCONTRAIL_KUBERNETES_TAG")'
diff --git a/cluster/saltbase/salt/e2e-image-puller/e2e-image-puller.manifest b/cluster/saltbase/salt/e2e-image-puller/e2e-image-puller.manifest
new file mode 100644
index 0000000000..09f3809a81
--- /dev/null
+++ b/cluster/saltbase/salt/e2e-image-puller/e2e-image-puller.manifest
@@ -0,0 +1,44 @@
+# e2e-image-puller seeds nodes in an e2e cluster with test images.
+apiVersion: v1
+kind: Pod
+metadata:
+  name: e2e-image-puller
+  namespace: kube-system
+  labels:
+    name: e2e-image-puller
+spec:
+  containers:
+  - name: image-puller
+    resources:
+      requests:
+        cpu: 100m
+      limits:
+        cpu: 100m
+    image: gcr.io/google_containers/busybox:1.24
+    # TODO: Replace this with a go script that pulls in parallel?
+ # Currently it takes ~5m to pull all e2e images, so this is OK, and + # fewer moving parts is always better. + # TODO: Replace the hardcoded image list with an autogen list; the list is + # currently hard-coded for static verification. It was generated via: + # grep -Iiroh "gcr.io/google_.*" "${KUBE_ROOT}/test/e2e" | \ + # sed -e "s/[,\")}]//g" | awk '{print $1}' | sort | uniq | tr '\n' ' ' + command: + - /bin/sh + - -c + - "for i in gcr.io/google_containers/busybox gcr.io/google_containers/busybox:1.24 gcr.io/google_containers/dnsutils:e2e gcr.io/google_containers/eptest:0.1 gcr.io/google_containers/fakegitserver:0.1 gcr.io/google_containers/hostexec:1.2 gcr.io/google_containers/iperf:e2e gcr.io/google_containers/jessie-dnsutils:e2e gcr.io/google_containers/liveness:e2e gcr.io/google_containers/mounttest:0.2 gcr.io/google_containers/mounttest:0.5 gcr.io/google_containers/mounttest:0.6 gcr.io/google_containers/mounttest-user:0.3 gcr.io/google_containers/netexec:1.4 gcr.io/google_containers/netexec:1.5 gcr.io/google_containers/nettest:1.7 gcr.io/google_containers/nettest:1.8 gcr.io/google_containers/nginx:1.7.9 gcr.io/google_containers/nginx-slim:0.5 gcr.io/google_containers/n-way-http:1.0 gcr.io/google_containers/pause:2.0 gcr.io/google_containers/pause-amd64:3.0 gcr.io/google_containers/porter:cd5cb5791ebaa8641955f0e8c2a9bed669b1eaab gcr.io/google_containers/portforwardtester:1.0 gcr.io/google_containers/redis:e2e gcr.io/google_containers/resource_consumer:beta2 gcr.io/google_containers/serve_hostname:v1.4 gcr.io/google_containers/servicelb:0.1 gcr.io/google_containers/test-webserver:e2e gcr.io/google_containers/ubuntu:14.04 gcr.io/google_containers/update-demo:kitten gcr.io/google_containers/update-demo:nautilus gcr.io/google_containers/volume-ceph:0.1 gcr.io/google_containers/volume-gluster:0.2 gcr.io/google_containers/volume-iscsi:0.1 gcr.io/google_containers/volume-nfs:0.6 gcr.io/google_containers/volume-rbd:0.1 gcr.io/google_samples/gb-redisslave:v1; do echo $(date '+%X') pulling $i; docker pull $i 1>/dev/null; done;" + securityContext: + privileged: true + volumeMounts: + - mountPath: /var/run/docker.sock + name: socket + - mountPath: /usr/bin/docker + name: docker + volumes: + - hostPath: + path: /var/run/docker.sock + name: socket + - hostPath: + path: /usr/bin/docker + name: docker + # This pod is really fire-and-forget. 
+  restartPolicy: Never
diff --git a/cluster/saltbase/salt/e2e-image-puller/init.sls b/cluster/saltbase/salt/e2e-image-puller/init.sls
new file mode 100644
index 0000000000..f93102a456
--- /dev/null
+++ b/cluster/saltbase/salt/e2e-image-puller/init.sls
@@ -0,0 +1,12 @@
+/etc/kubernetes/manifests/e2e-image-puller.manifest:
+  file.managed:
+    - source: salt://e2e-image-puller/e2e-image-puller.manifest
+    - template: jinja
+    - user: root
+    - group: root
+    - mode: 644
+    - makedirs: true
+    - dir_mode: 755
+    - require:
+      - service: docker
+      - service: kubelet
diff --git a/cluster/saltbase/salt/top.sls b/cluster/saltbase/salt/top.sls
index 5e4b093e92..cb2f910418 100644
--- a/cluster/saltbase/salt/top.sls
+++ b/cluster/saltbase/salt/top.sls
@@ -38,6 +38,9 @@ base:
 {% endif %}
 {% if pillar.get('enable_cluster_registry', '').lower() == 'true' %}
   - kube-registry-proxy
+{% endif %}
+{% if pillar['prepull_e2e_images'] is defined and pillar['prepull_e2e_images'].lower() == 'true' %}
+  - e2e-image-puller
 {% endif %}
   - logrotate
   - supervisor
diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go
index a212fafa3e..3787d6b9ee 100644
--- a/test/e2e/e2e.go
+++ b/test/e2e/e2e.go
@@ -44,6 +44,12 @@ const (
 	// running and ready before any e2e tests run. It includes pulling all of
 	// the pods (as of 5/18/15 this is 8 pods).
 	podStartupTimeout = 10 * time.Minute
+
+	// imagePrePullingTimeout is the time we wait for the e2e-image-puller
+	// static pods to pull the list of seeded images. If they don't pull
+	// images within this time we simply log their output and carry on
+	// with the tests.
+	imagePrePullingTimeout = 5 * time.Minute
 )
 
 var (
@@ -119,7 +125,7 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 	// cluster infrastructure pods that are being pulled or started can block
 	// test pods from running, and tests that ensure all pods are running and
 	// ready will fail).
-	if err := framework.WaitForPodsRunningReady(api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout); err != nil {
+	if err := framework.WaitForPodsRunningReady(api.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels); err != nil {
 		if c, errClient := framework.LoadClient(); errClient != nil {
 			framework.Logf("Unable to dump cluster information because: %v", errClient)
 		} else {
@@ -130,6 +136,14 @@
 		framework.Failf("Error waiting for all pods to be running and ready: %v", err)
 	}
 
+	if err := framework.WaitForPodsSuccess(api.NamespaceSystem, framework.ImagePullerLabels, imagePrePullingTimeout); err != nil {
+		// There is no guarantee that image pulling finishes within
+		// imagePrePullingTimeout, and the image puller is not even run on all
+		// platforms (GKE, for example). We wait for it so that failures show up
+		// in the logs, and to maximize the benefit of image pre-pulling.
+		framework.Logf("WARNING: Image pulling pods failed to enter success in %v: %v", imagePrePullingTimeout, err)
+	}
+
 	return nil
 
 }, func(data []byte) {
diff --git a/test/e2e/framework/framework.go b/test/e2e/framework/framework.go
index 129a36a225..8e4da5b91c 100644
--- a/test/e2e/framework/framework.go
+++ b/test/e2e/framework/framework.go
@@ -209,6 +209,8 @@ func (f *Framework) AfterEach() {
 	// Print events if the test failed.
 	if CurrentGinkgoTestDescription().Failed && TestContext.DumpLogsOnFailure {
 		DumpAllNamespaceInfo(f.Client, f.Namespace.Name)
+		By("Dumping a list of prepulled images on each node")
+		LogPodsWithLabels(f.Client, api.NamespaceSystem, ImagePullerLabels)
 	}
 
 	summaries := make([]TestDataSummary, 0)
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index c6b30a527d..4a7f6ef71a 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -130,6 +130,10 @@ const (
 	ClaimProvisionTimeout = 5 * time.Minute
 )
 
+// ImagePullerLabels are the labels applied to the image puller static pod that
+// runs on each node before the e2e tests.
+var ImagePullerLabels = map[string]string{"name": "e2e-image-puller"}
+
 // SubResource proxy should have been functional in v1.0.0, but SubResource
 // proxy via tunneling is known to be broken in v1.0. See
 // https://github.com/kubernetes/kubernetes/pull/15224#issuecomment-146769463
@@ -413,6 +417,48 @@ func hasReplicationControllersForPod(rcs *api.ReplicationControllerList, pod api
 	return false
 }
 
+// WaitForPodsSuccess waits until all pods matching the given labels in the given
+// namespace reach the Succeeded phase. The caller is expected to only invoke
+// this method once the pods have been created.
+func WaitForPodsSuccess(ns string, successPodLabels map[string]string, timeout time.Duration) error {
+	c, err := LoadClient()
+	if err != nil {
+		return err
+	}
+	successPodSelector := labels.SelectorFromSet(successPodLabels)
+	start, badPods := time.Now(), []api.Pod{}
+
+	if wait.PollImmediate(30*time.Second, timeout, func() (bool, error) {
+		podList, err := c.Pods(ns).List(api.ListOptions{LabelSelector: successPodSelector})
+		if err != nil {
+			Logf("Error getting pods in namespace %q: %v", ns, err)
+			return false, nil
+		}
+		if len(podList.Items) == 0 {
+			Logf("Waiting for pods to enter Success, but no pods in %q match label %v", ns, successPodLabels)
+			return true, nil
+		}
+		badPods = []api.Pod{}
+		for _, pod := range podList.Items {
+			if pod.Status.Phase != api.PodSucceeded {
+				badPods = append(badPods, pod)
+			}
+		}
+		successPods := len(podList.Items) - len(badPods)
+		Logf("%d / %d pods in namespace %q are in Success state (%d seconds elapsed)",
+			successPods, len(podList.Items), ns, int(time.Since(start).Seconds()))
+		if len(badPods) == 0 {
+			return true, nil
+		}
+		return false, nil
+	}) != nil {
+		logPodStates(badPods)
+		LogPodsWithLabels(c, ns, successPodLabels)
+		return fmt.Errorf("Not all pods in namespace %q are successful within %v", ns, timeout)
+	}
+	return nil
+}
+
 // WaitForPodsRunningReady waits up to timeout to ensure that all pods in
 // namespace ns are either running and ready, or failed but controlled by a
 // replication controller. Also, it ensures that at least minPods are running
@@ -420,11 +466,17 @@ func hasReplicationControllersForPod(rcs *api.ReplicationControllerList, pod api
 // that it requires the list of pods on every iteration. This is useful, for
 // example, in cluster startup, because the number of pods increases while
 // waiting.
-func WaitForPodsRunningReady(ns string, minPods int32, timeout time.Duration) error {
+// Pods whose labels match ignoreLabels are ignored entirely: they count toward
+// neither minPods nor the set of pods that must become running and ready. This
+// lets callers exclude pods, such as the image puller, that are expected to end
+// in the Succeeded phase rather than stay Ready; pass an empty map to consider
+// every pod in the namespace.
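+//
+// An illustrative call (argument values are assumptions; it mirrors the suite
+// setup in e2e.go from this patch):
+//
+//	WaitForPodsRunningReady(api.NamespaceSystem, 8, 10*time.Minute, ImagePullerLabels)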
+func WaitForPodsRunningReady(ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string) error { c, err := LoadClient() if err != nil { return err } + ignoreSelector := labels.SelectorFromSet(ignoreLabels) start := time.Now() Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready", timeout, minPods, ns) @@ -449,6 +501,10 @@ func WaitForPodsRunningReady(ns string, minPods int32, timeout time.Duration) er } nOk, replicaOk, badPods := int32(0), int32(0), []api.Pod{} for _, pod := range podList.Items { + if len(ignoreLabels) != 0 && ignoreSelector.Matches(labels.Set(pod.Labels)) { + Logf("%v in state %v, ignoring", pod.Name, pod.Status.Phase) + continue + } if res, err := PodRunningReady(&pod); res && err == nil { nOk++ if hasReplicationControllersForPod(rcList, pod) { @@ -535,6 +591,20 @@ func RunKubernetesServiceTestContainer(repoRoot string, ns string) { } } +func kubectlLogPod(c *client.Client, pod api.Pod) { + for _, container := range pod.Spec.Containers { + logs, err := GetPodLogs(c, pod.Namespace, pod.Name, container.Name) + if err != nil { + logs, err = getPreviousPodLogs(c, pod.Namespace, pod.Name, container.Name) + if err != nil { + Logf("Failed to get logs of pod %v, container %v, err: %v", pod.Name, container.Name, err) + } + } + By(fmt.Sprintf("Logs of %v/%v:%v on node %v", pod.Namespace, pod.Name, container.Name, pod.Spec.NodeName)) + Logf(logs) + } +} + func LogFailedContainers(ns string) { c, err := LoadClient() if err != nil { @@ -549,21 +619,23 @@ func LogFailedContainers(ns string) { Logf("Running kubectl logs on non-ready containers in %v", ns) for _, pod := range podList.Items { if res, err := PodRunningReady(&pod); !res || err != nil { - for _, container := range pod.Spec.Containers { - logs, err := GetPodLogs(c, ns, pod.Name, container.Name) - if err != nil { - logs, err = getPreviousPodLogs(c, ns, pod.Name, container.Name) - if err != nil { - Logf("Failed to get logs of pod %v, container %v, err: %v", pod.Name, container.Name, err) - } - } - By(fmt.Sprintf("Logs of %v/%v:%v on node %v", ns, pod.Name, container.Name, pod.Spec.NodeName)) - Logf(logs) - } + kubectlLogPod(c, pod) } } } +func LogPodsWithLabels(c *client.Client, ns string, match map[string]string) { + podList, err := c.Pods(ns).List(api.ListOptions{LabelSelector: labels.SelectorFromSet(match)}) + if err != nil { + Logf("Error getting pods in namespace %q: %v", ns, err) + return + } + Logf("Running kubectl logs on pods with labels %v in %v", match, ns) + for _, pod := range podList.Items { + kubectlLogPod(c, pod) + } +} + // DeleteNamespaces deletes all namespaces that match the given delete and skip filters. // Filter is by simple strings.Contains; first skip filter, then delete filter. // Returns the list of deleted namespaces or an error. 
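To make the intended call pattern concrete, here is a minimal, illustrative Go sketch of how the helpers added to util.go above compose. It is not part of the patch; it mirrors the e2e.go hunk earlier in this change, and the import paths, the use of TestContext.MinStartupPods, and the literal timeouts are assumptions for the example rather than requirements.

package example

import (
	"time"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/test/e2e/framework"
)

// waitForPrePulledImages waits for kube-system pods to become Running/Ready
// while ignoring the image puller pods, then gives the pullers a bounded
// window to reach the Succeeded phase. Pre-pulling is best effort:
// WaitForPodsSuccess already dumps the puller pod states and logs on timeout,
// so the failure is only logged here and the suite carries on.
func waitForPrePulledImages() {
	if err := framework.WaitForPodsRunningReady(api.NamespaceSystem,
		int32(framework.TestContext.MinStartupPods), 10*time.Minute, framework.ImagePullerLabels); err != nil {
		framework.Failf("Error waiting for all pods to be running and ready: %v", err)
	}
	if err := framework.WaitForPodsSuccess(api.NamespaceSystem, framework.ImagePullerLabels, 5*time.Minute); err != nil {
		framework.Logf("WARNING: image pre-pulling did not finish: %v", err)
	}
}

The same ignore-labels argument is what the callers updated below (mesos.go, resize_nodes.go, scheduler_predicates.go) pass as an empty map when no pods should be excluded.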
diff --git a/test/e2e/mesos.go b/test/e2e/mesos.go index 73dd3831d8..a156f14306 100644 --- a/test/e2e/mesos.go +++ b/test/e2e/mesos.go @@ -69,7 +69,7 @@ var _ = framework.KubeDescribe("Mesos", func() { const ns = "static-pods" numpods := int32(len(nodelist.Items)) - framework.ExpectNoError(framework.WaitForPodsRunningReady(ns, numpods, wait.ForeverTestTimeout), + framework.ExpectNoError(framework.WaitForPodsRunningReady(ns, numpods, wait.ForeverTestTimeout, map[string]string{}), fmt.Sprintf("number of static pods in namespace %s is %d", ns, numpods)) }) diff --git a/test/e2e/networking.go b/test/e2e/networking.go index cce9235872..c43b8535a3 100644 --- a/test/e2e/networking.go +++ b/test/e2e/networking.go @@ -124,7 +124,7 @@ var _ = framework.KubeDescribe("Networking", func() { "Rerun it with at least two nodes to get complete coverage.") } - podNames := LaunchNetTestPodPerNode(f, nodes, svcname, "1.8") + podNames := LaunchNetTestPodPerNode(f, nodes, svcname) // Clean up the pods defer func() { @@ -256,7 +256,7 @@ var _ = framework.KubeDescribe("Networking", func() { }) }) -func LaunchNetTestPodPerNode(f *framework.Framework, nodes *api.NodeList, name, version string) []string { +func LaunchNetTestPodPerNode(f *framework.Framework, nodes *api.NodeList, name string) []string { podNames := []string{} totalPods := len(nodes.Items) @@ -275,7 +275,7 @@ func LaunchNetTestPodPerNode(f *framework.Framework, nodes *api.NodeList, name, Containers: []api.Container{ { Name: "webserver", - Image: "gcr.io/google_containers/nettest:" + version, + Image: "gcr.io/google_containers/nettest:1.8", Args: []string{ "-service=" + name, //peers >= totalPods should be asserted by the container. diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go index 7f4c1af254..b1929eab65 100644 --- a/test/e2e/resize_nodes.go +++ b/test/e2e/resize_nodes.go @@ -396,7 +396,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() { // the cluster is restored to health. By("waiting for system pods to successfully restart") - err := framework.WaitForPodsRunningReady(api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) + err := framework.WaitForPodsRunningReady(api.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, map[string]string{}) Expect(err).NotTo(HaveOccurred()) }) diff --git a/test/e2e/scheduler_predicates.go b/test/e2e/scheduler_predicates.go index 1d24b85245..47785e7458 100644 --- a/test/e2e/scheduler_predicates.go +++ b/test/e2e/scheduler_predicates.go @@ -196,7 +196,7 @@ var _ = framework.KubeDescribe("SchedulerPredicates [Serial]", func() { } } - err = framework.WaitForPodsRunningReady(api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout) + err = framework.WaitForPodsRunningReady(api.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, map[string]string{}) Expect(err).NotTo(HaveOccurred()) for _, node := range nodeList.Items {