StatefulSet: Deflake e2e `kubectl exec` commands.

We seem to get a lot of flakes due to "connection refused" while running `kubectl exec`. I can't find any reason this would be caused by the test flow, so I'm adding retries to see if that helps.
2017-08-23 15:56:28 -07:00 · 2017-08-23 15:56:28 -07:00 · 05d6c8a6c2
parent ef7b7ebd9c
commit 05d6c8a6c2
2 changed files with 31 additions and 9 deletions
--- a/test/e2e/framework/statefulset_utils.go
+++ b/test/e2e/framework/statefulset_utils.go
@ -134,7 +134,7 @@ func (s *StatefulSetTester) CheckMount(ss *apps.StatefulSet, mountPath string) e
 func (s *StatefulSetTester) ExecInStatefulPods(ss *apps.StatefulSet, cmd string) error {
 	podList := s.GetPodList(ss)
 	for _, statefulPod := range podList.Items {
-		stdout, err := RunHostCmd(statefulPod.Namespace, statefulPod.Name, cmd)
+		stdout, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, 3)
 		Logf("stdout of %v on %v: %v", cmd, statefulPod.Name, stdout)
 		if err != nil {
 			return err
@ -148,7 +148,7 @@ func (s *StatefulSetTester) CheckHostname(ss *apps.StatefulSet) error {
 	cmd := "printf $(hostname)"
 	podList := s.GetPodList(ss)
 	for _, statefulPod := range podList.Items {
-		hostname, err := RunHostCmd(statefulPod.Namespace, statefulPod.Name, cmd)
+		hostname, err := RunHostCmdWithRetries(statefulPod.Namespace, statefulPod.Name, cmd, StatefulSetPoll, 3)
 		if err != nil {
 			return err
 		}
@ -508,7 +508,8 @@ func (s *StatefulSetTester) BreakHttpProbe(ss *apps.StatefulSet) error {
 	if path == "" {
 		return fmt.Errorf("Path expected to be not empty: %v", path)
 	}
-	cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/", path)
+	// Ignore 'mv' errors to make this idempotent.
+	cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/ || true", path)
 	return s.ExecInStatefulPods(ss, cmd)
 }

@ -518,8 +519,9 @@ func (s *StatefulSetTester) BreakPodHttpProbe(ss *apps.StatefulSet, pod *v1.Pod)
 	if path == "" {
 		return fmt.Errorf("Path expected to be not empty: %v", path)
 	}
-	cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/", path)
-	stdout, err := RunHostCmd(pod.Namespace, pod.Name, cmd)
+	// Ignore 'mv' errors to make this idempotent.
+	cmd := fmt.Sprintf("mv -v /usr/share/nginx/html%v /tmp/ || true", path)
+	stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, 3)
 	Logf("stdout of %v on %v: %v", cmd, pod.Name, stdout)
 	return err
 }
@ -530,7 +532,8 @@ func (s *StatefulSetTester) RestoreHttpProbe(ss *apps.StatefulSet) error {
 	if path == "" {
 		return fmt.Errorf("Path expected to be not empty: %v", path)
 	}
-	cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/", path)
+	// Ignore 'mv' errors to make this idempotent.
+	cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/ || true", path)
 	return s.ExecInStatefulPods(ss, cmd)
 }

@ -540,8 +543,9 @@ func (s *StatefulSetTester) RestorePodHttpProbe(ss *apps.StatefulSet, pod *v1.Po
 	if path == "" {
 		return fmt.Errorf("Path expected to be not empty: %v", path)
 	}
-	cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/", path)
-	stdout, err := RunHostCmd(pod.Namespace, pod.Name, cmd)
+	// Ignore 'mv' errors to make this idempotent.
+	cmd := fmt.Sprintf("mv -v /tmp%v /usr/share/nginx/html/ || true", path)
+	stdout, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, cmd, StatefulSetPoll, 3)
 	Logf("stdout of %v on %v: %v", cmd, pod.Name, stdout)
 	return err
 }
@ -586,7 +590,7 @@ func (s *StatefulSetTester) ResumeNextPod(ss *apps.StatefulSet) {
 		if resumedPod != "" {
 			Failf("Found multiple paused stateful pods: %v and %v", pod.Name, resumedPod)
 		}
-		_, err := RunHostCmd(pod.Namespace, pod.Name, "touch /data/statefulset-continue")
+		_, err := RunHostCmdWithRetries(pod.Namespace, pod.Name, "touch /data/statefulset-continue", StatefulSetPoll, 3)
 		ExpectNoError(err)
 		Logf("Resumed pod %v", pod.Name)
 		resumedPod = pod.Name
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@ -3290,6 +3290,24 @@ func RunHostCmdOrDie(ns, name, cmd string) string {
 	return stdout
 }

+// RunHostCmdWithRetries calls RunHostCmd until it succeeds or maxTries attempts have failed, sleeping for interval between attempts.
+// This can be used with idempotent commands to deflake transient connection issues.
+func RunHostCmdWithRetries(ns, name, cmd string, interval time.Duration, maxTries int) (string, error) {
+	tries := 0
+	for {
+		out, err := RunHostCmd(ns, name, cmd)
+		if err == nil {
+			return out, nil
+		}
+		tries++
+		if tries >= maxTries {
+			return out, fmt.Errorf("RunHostCmd still failed after %d tries: %v", tries, err)
+		}
+		Logf("Waiting %v to retry failed RunHostCmd (attempt %d): %v", interval, tries, err)
+		time.Sleep(interval)
+	}
+}
+
 // LaunchHostExecPod launches a hostexec pod in the given namespace and waits
 // until it's Running
 func LaunchHostExecPod(client clientset.Interface, ns, name string) *v1.Pod {