mirror of https://github.com/k3s-io/k3s
Merge pull request #10802 from bprashanth/max_containers_fail
Don't ignore containers restarting during tests
commit 34dd9c7880
@@ -43,6 +43,9 @@ import (
 // NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
 const NodeStartupThreshold = 4 * time.Second
 
+// Maximum container failures this test tolerates before failing.
+var MaxContainerFailures = 0
+
 // podLatencyData encapsulates pod startup latency information.
 type podLatencyData struct {
     // Name of the pod
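The density test pins this knob to 0, so any container restart during the run now fails the test instead of being silently ignored. Because MaxContainerFailures is a package-level variable, a spec that does want to tolerate a few restarts could bump it and restore it afterwards. A minimal hedged sketch (the spec text and the value 2 are hypothetical; only MaxContainerFailures itself comes from this diff):

    It("tolerates a couple of container restarts", func() {
        // Raise the package-level tolerance for this spec only, then restore it.
        old := MaxContainerFailures
        MaxContainerFailures = 2
        defer func() { MaxContainerFailures = old }()
        // ... build an RCConfig with MaxContainerFailures: &MaxContainerFailures
        // and run it, as the density test below does.
    })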
@@ -190,14 +193,14 @@ var _ = Describe("Density", func() {
             fileHndl, err := os.Create(fmt.Sprintf(testContext.OutputDir+"/%s/pod_states.csv", uuid))
             expectNoError(err)
             defer fileHndl.Close()
 
             config := RCConfig{Client: c,
-                Image:         "gcr.io/google_containers/pause:go",
-                Name:          RCName,
-                Namespace:     ns,
-                PollInterval:  itArg.interval,
-                PodStatusFile: fileHndl,
-                Replicas:      totalPods,
+                Image:                "gcr.io/google_containers/pause:go",
+                Name:                 RCName,
+                Namespace:            ns,
+                PollInterval:         itArg.interval,
+                PodStatusFile:        fileHndl,
+                Replicas:             totalPods,
+                MaxContainerFailures: &MaxContainerFailures,
             }
 
             // Create a listener for events.
@@ -162,7 +162,7 @@ func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nod
     }
     sort.Sort(KubeletMetricByLatency(metric))
     var badMetrics []KubeletMetric
-    Logf("Latency metrics for node %v", nodeName)
+    Logf("\nLatency metrics for node %v", nodeName)
     for _, m := range metric {
         if m.Latency > threshold {
             badMetrics = append(badMetrics, m)
@@ -178,6 +178,10 @@ type RCConfig struct {
     // Pointer to a list of pods; if non-nil, will be set to a list of pods
     // created by this RC by RunRC.
     CreatedPods *[]*api.Pod
+
+    // Maximum allowable container failures. If exceeded, RunRC returns an error.
+    // Defaults to replicas*0.1 if unspecified.
+    MaxContainerFailures *int
 }
 
 func Logf(format string, a ...interface{}) {
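MaxContainerFailures is a *int rather than an int so RunRC can distinguish "the caller set no limit" (nil, fall back to a default) from "the caller explicitly wants zero tolerated failures", which is how the density test uses it. A minimal hedged sketch of both cases inside the same e2e test context (names and replica counts are illustrative, and other RCConfig fields are omitted for brevity):

    // Explicit zero: any container restart makes RunRC return an error.
    zero := 0
    strict := RCConfig{Client: c, Name: "strict-rc", Namespace: ns, Replicas: 50,
        MaxContainerFailures: &zero}
    expectNoError(RunRC(strict)) // fails if any container restarts

    // Left nil: RunRC falls back to its default of max(1, 1% of Replicas).
    lenient := RCConfig{Client: c, Name: "lenient-rc", Namespace: ns, Replicas: 50}
    expectNoError(RunRC(lenient)) // tolerates max(1, 0.5) = 1 restart here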
@@ -984,7 +988,15 @@ func Diff(oldPods []*api.Pod, curPods []*api.Pod) PodDiff {
 // It's the caller's responsibility to clean up externally (i.e. use the
 // namespace lifecycle for handling cleanup).
 func RunRC(config RCConfig) error {
-    maxContainerFailures := int(math.Max(1.0, float64(config.Replicas)*.01))
+    // Don't force tests to fail if they don't care about containers restarting.
+    var maxContainerFailures int
+    if config.MaxContainerFailures == nil {
+        maxContainerFailures = int(math.Max(1.0, float64(config.Replicas)*.01))
+    } else {
+        maxContainerFailures = *config.MaxContainerFailures
+    }
+
     label := labels.SelectorFromSet(labels.Set(map[string]string{"name": config.Name}))
 
     By(fmt.Sprintf("%v Creating replication controller %s", time.Now(), config.Name))
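When the field is nil, the fallback above tolerates roughly one percent of the replica count, but never less than one restart. A standalone restatement of that rule with a few worked values (the helper name is mine, not part of the diff):

    package main

    import (
        "fmt"
        "math"
    )

    // defaultMaxContainerFailures restates RunRC's fallback: 1% of the
    // replica count, clamped to a minimum of 1.
    func defaultMaxContainerFailures(replicas int) int {
        return int(math.Max(1.0, float64(replicas)*.01))
    }

    func main() {
        fmt.Println(defaultMaxContainerFailures(30))   // 1 (0.3 clamped up to 1)
        fmt.Println(defaultMaxContainerFailures(100))  // 1
        fmt.Println(defaultMaxContainerFailures(500))  // 5
        fmt.Println(defaultMaxContainerFailures(3000)) // 30
    }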
@@ -1058,6 +1070,8 @@ func RunRC(config RCConfig) error {
         unknown := 0
         inactive := 0
         failedContainers := 0
+        containerRestartNodes := util.NewStringSet()
+
         pods := podStore.List()
         if config.CreatedPods != nil {
             *config.CreatedPods = pods
@@ -1067,6 +1081,7 @@ func RunRC(config RCConfig) error {
                 running++
                 for _, v := range FailedContainers(p) {
                     failedContainers = failedContainers + v.restarts
+                    containerRestartNodes.Insert(p.Spec.NodeName)
                 }
             } else if p.Status.Phase == api.PodPending {
                 if p.Spec.NodeName == "" {
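FailedContainers itself is not part of this diff; from the loop above it evidently yields per-container entries carrying a restarts count for a running pod. A hedged sketch of one plausible shape, assuming the counts come from the pod's container statuses (the type and function names here are mine, not the helper's actual definition):

    // containerFailure is a hypothetical stand-in for whatever FailedContainers returns.
    type containerFailure struct {
        name     string
        restarts int
    }

    // failedContainersSketch collects containers that have restarted at least once.
    func failedContainersSketch(p *api.Pod) []containerFailure {
        var out []containerFailure
        for _, cs := range p.Status.ContainerStatuses {
            if cs.RestartCount > 0 {
                out = append(out, containerFailure{name: cs.Name, restarts: int(cs.RestartCount)})
            }
        }
        return out
    }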
@@ -1088,6 +1103,7 @@ func RunRC(config RCConfig) error {
         }
 
         if failedContainers > maxContainerFailures {
+            dumpNodeDebugInfo(config.Client, containerRestartNodes.List())
             return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
         }
         if len(pods) < len(oldPods) || len(pods) > config.Replicas {
@@ -1137,6 +1153,11 @@ func dumpPodDebugInfo(c *client.Client, pods []*api.Pod) {
 
 func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
     for _, n := range nodeNames {
+        Logf("\nLogging kubelet events for node %v", n)
+        for _, e := range getNodeEvents(c, n) {
+            Logf("source %v message %v reason %v first ts %v last ts %v, involved obj %+v",
+                e.Source, e.Message, e.Reason, e.FirstTimestamp, e.LastTimestamp, e.InvolvedObject)
+        }
         Logf("\nLogging pods the kubelet thinks is on node %v", n)
         podList, err := GetKubeletPods(c, n)
         if err != nil {
@@ -1155,6 +1176,25 @@ func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
     }
 }
 
+// logNodeEvents logs kubelet events from the given node. This includes kubelet
+// restart and node unhealthy events. Note that listing events like this will mess
+// with latency metrics, beware of calling it during a test.
+func getNodeEvents(c *client.Client, nodeName string) []api.Event {
+    events, err := c.Events(api.NamespaceDefault).List(
+        labels.Everything(),
+        fields.Set{
+            "involvedObject.kind":      "Node",
+            "involvedObject.name":      nodeName,
+            "involvedObject.namespace": api.NamespaceAll,
+            "source":                   "kubelet",
+        }.AsSelector())
+    if err != nil {
+        Logf("Unexpected error retrieving node events %v", err)
+        return []api.Event{}
+    }
+    return events.Items
+}
+
 func ScaleRC(c *client.Client, ns, name string, size uint) error {
     By(fmt.Sprintf("%v Scaling replication controller %s in namespace %s to %d", time.Now(), name, ns, size))
     scaler, err := kubectl.ScalerFor("ReplicationController", kubectl.NewScalerClient(c))
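getNodeEvents filters the default-namespace event list down to kubelet-sourced events whose involved object is the node in question. The same field selector can also be rendered as a plain string, which is handy for ad-hoc queries. A hedged sketch (it uses the modern k8s.io/apimachinery fields package rather than the in-tree pkg/fields import this diff relies on, and drops the empty involvedObject.namespace term):

    import "k8s.io/apimachinery/pkg/fields"

    // nodeEventSelector builds the selector string matching getNodeEvents above,
    // e.g. "involvedObject.kind=Node,involvedObject.name=node-1,source=kubelet"
    // (term order may vary). It can be passed wherever a field selector string
    // is accepted, such as kubectl get events --field-selector.
    func nodeEventSelector(nodeName string) string {
        return fields.Set{
            "involvedObject.kind": "Node",
            "involvedObject.name": nodeName,
            "source":              "kubelet",
        }.AsSelector().String()
    }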