From 05d6c8a6c22c2a0c11ec1019c359ca9f12a31731 Mon Sep 17 00:00:00 2001 From: Anthony Yeh Date: Wed, 23 Aug 2017 15:56:28 -0700 Subject: [PATCH] StatefulSet: Deflake e2e `kubectl exec` commands. We seem to get a lot of flakes due to "connection refused" while running `kubectl exec`. I can't find any reason this would be caused by the test flow, so I'm adding retries to see if that helps. --- test/e2e/framework/statefulset_utils.go | 22 +++++++++++++--------- test/e2e/framework/util.go | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/test/e2e/framework/statefulset_utils.go b/test/e2e/framework/statefulset_utils.go index 1ba6602430..1827573c7f 100644 --- a/test/e2e/framework/statefulset_utils.go +++ b/test/e2e/framework/statefulset_utils.go @@ -134,7 +134,7 @@ func (s *StatefulSetTester) CheckMount(ss *apps.StatefulSet, mountPath string) e func (s *StatefulSetTester) ExecInStatefulPods(ss *apps.StatefulSet, cmd string) error { podList := s.GetPodList(ss) for _, statefulPod := range podList.Items { - stdout, err := RunHostCmd(statefulPod.Namespace, statefulPod.Name, cmd) + stdout, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, 3) Logf("stdout of %v on %v: %v", cmd, statefulPod.Name, stdout) if err != nil { return err @@ -148,7 +148,7 @@ func (s *StatefulSetTester) CheckHostname(ss *apps.StatefulSet) error { cmd := "printf $(hostname)" podList := s.GetPodList(ss) for _, statefulPod := range podList.Items { - hostname, err := RunHostCmd(statefulPod.Namespace, statefulPod.Name, cmd) + hostname, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, 3) if err != nil { return err } @@ -508,7 +508,8 @@ func (s *StatefulSetTester) BreakHttpProbe(ss *apps.StatefulSet) error { if path == "" { return fmt.Errorf("Path expected to be not empty: %v", path) } - cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/", path) + // Ignore 'mv' errors to make this 
idempotent. + cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/ || true", path) return s.ExecInStatefulPods(ss, cmd) } @@ -518,8 +519,9 @@ func (s *StatefulSetTester) BreakPodHttpProbe(ss *apps.StatefulSet, pod *v1.Pod) if path == "" { return fmt.Errorf("Path expected to be not empty: %v", path) } - cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/", path) - stdout, err := RunHostCmd(pod.Namespace, pod.Name, cmd) + // Ignore 'mv' errors to make this idempotent. + cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/ || true", path) + stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, 3) Logf("stdout of %v on %v: %v", cmd, pod.Name, stdout) return err } @@ -530,7 +532,8 @@ func (s *StatefulSetTester) RestoreHttpProbe(ss *apps.StatefulSet) error { if path == "" { return fmt.Errorf("Path expected to be not empty: %v", path) } - cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/", path) + // Ignore 'mv' errors to make this idempotent. + cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/ || true", path) return s.ExecInStatefulPods(ss, cmd) } @@ -540,8 +543,9 @@ func (s *StatefulSetTester) RestorePodHttpProbe(ss *apps.StatefulSet, pod *v1.Po if path == "" { return fmt.Errorf("Path expected to be not empty: %v", path) } - cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/", path) - stdout, err := RunHostCmd(pod.Namespace, pod.Name, cmd) + // Ignore 'mv' errors to make this idempotent. 
+ cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/ || true", path) + stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, 3) Logf("stdout of %v on %v: %v", cmd, pod.Name, stdout) return err } @@ -586,7 +590,7 @@ func (s *StatefulSetTester) ResumeNextPod(ss *apps.StatefulSet) { if resumedPod != "" { Failf("Found multiple paused stateful pods: %v and %v", pod.Name, resumedPod) } - _, err := RunHostCmd(pod.Namespace, pod.Name, "touch /data/statefulset-continue") + _, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, "touch /data/statefulset-continue", StatefulSetPoll, 3) ExpectNoError(err) Logf("Resumed pod %v", pod.Name) resumedPod = pod.Name diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go index 5d4e9c2552..6b659e4efd 100644 --- a/test/e2e/framework/util.go +++ b/test/e2e/framework/util.go @@ -3290,6 +3290,24 @@ func RunHostCmdOrDie(ns, name, cmd string) string { return stdout } +// RunHostCmdWithRetries calls RunHostCmd until it succeeds or maxTries attempts have failed, sleeping for interval between attempts. +// This can be used with idempotent commands to deflake transient connection issues. +func RunHostCmdWithRetries(ns, name, cmd string, interval time.Duration, maxTries int) (string, error) { + tries := 0 + for { + out, err := RunHostCmd(ns, name, cmd) + if err == nil { + return out, nil + } + tries++ + if tries >= maxTries { + return out, fmt.Errorf("RunHostCmd still failed after %d tries: %v", tries, err) + } + Logf("Waiting %v to retry failed RunHostCmd (attempt %d): %v", interval, tries, err) + time.Sleep(interval) + } +} + // LaunchHostExecPod launches a hostexec pod in the given namespace and waits // until it's Running func LaunchHostExecPod(client clientset.Interface, ns, name string) *v1.Pod {