From b0bbd5b8d5094153af8c50625136c3ab79d9e817 Mon Sep 17 00:00:00 2001
From: Prashanth Balasubramanian
Date: Mon, 6 Jul 2015 18:11:30 -0700
Subject: [PATCH] Don't ignore containers restarting during tests

---
 test/e2e/density.go       | 17 +++++++++-------
 test/e2e/kubelet_stats.go |  2 +-
 test/e2e/util.go          | 42 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/test/e2e/density.go b/test/e2e/density.go
index 433e892dfb..f21148fa7a 100644
--- a/test/e2e/density.go
+++ b/test/e2e/density.go
@@ -43,6 +43,9 @@ import (
 // NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
 const NodeStartupThreshold = 4 * time.Second
 
+// Maximum container failures this test tolerates before failing.
+var MaxContainerFailures = 0
+
 // podLatencyData encapsulates pod startup latency information.
 type podLatencyData struct {
 	// Name of the pod
@@ -190,14 +193,14 @@ var _ = Describe("Density", func() {
 			fileHndl, err := os.Create(fmt.Sprintf(testContext.OutputDir+"/%s/pod_states.csv", uuid))
 			expectNoError(err)
 			defer fileHndl.Close()
-			config := RCConfig{Client: c,
-				Image:         "gcr.io/google_containers/pause:go",
-				Name:          RCName,
-				Namespace:     ns,
-				PollInterval:  itArg.interval,
-				PodStatusFile: fileHndl,
-				Replicas:      totalPods,
+			config := RCConfig{Client: c,
+				Image:                "gcr.io/google_containers/pause:go",
+				Name:                 RCName,
+				Namespace:            ns,
+				PollInterval:         itArg.interval,
+				PodStatusFile:        fileHndl,
+				Replicas:             totalPods,
+				MaxContainerFailures: &MaxContainerFailures,
 			}
 			// Create a listener for events.
diff --git a/test/e2e/kubelet_stats.go b/test/e2e/kubelet_stats.go
index a973540f47..66434b0825 100644
--- a/test/e2e/kubelet_stats.go
+++ b/test/e2e/kubelet_stats.go
@@ -162,7 +162,7 @@ func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nod
 	}
 	sort.Sort(KubeletMetricByLatency(metric))
 	var badMetrics []KubeletMetric
-	Logf("Latency metrics for node %v", nodeName)
+	Logf("\nLatency metrics for node %v", nodeName)
 	for _, m := range metric {
 		if m.Latency > threshold {
 			badMetrics = append(badMetrics, m)
diff --git a/test/e2e/util.go b/test/e2e/util.go
index d728e3e85d..aa160dc04e 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -178,6 +178,10 @@ type RCConfig struct {
 	// Pointer to a list of pods; if non-nil, will be set to a list of pods
 	// created by this RC by RunRC.
 	CreatedPods *[]*api.Pod
+
+	// Maximum allowable container failures. If exceeded, RunRC returns an error.
+	// Defaults to max(1, replicas*0.01) if unspecified.
+	MaxContainerFailures *int
 }
 
 func Logf(format string, a ...interface{}) {
@@ -984,7 +988,15 @@ func Diff(oldPods []*api.Pod, curPods []*api.Pod) PodDiff {
 // It's the caller's responsibility to clean up externally (i.e. use the
 // namespace lifecycle for handling cleanup).
 func RunRC(config RCConfig) error {
-	maxContainerFailures := int(math.Max(1.0, float64(config.Replicas)*.01))
+
+	// Don't force tests to fail if they don't care about containers restarting.
+	var maxContainerFailures int
+	if config.MaxContainerFailures == nil {
+		maxContainerFailures = int(math.Max(1.0, float64(config.Replicas)*.01))
+	} else {
+		maxContainerFailures = *config.MaxContainerFailures
+	}
+
 	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": config.Name}))
 
 	By(fmt.Sprintf("%v Creating replication controller %s", time.Now(), config.Name))
@@ -1058,6 +1070,8 @@ func RunRC(config RCConfig) error {
 		unknown := 0
 		inactive := 0
 		failedContainers := 0
+		containerRestartNodes := util.NewStringSet()
+
 		pods := podStore.List()
 		if config.CreatedPods != nil {
 			*config.CreatedPods = pods
@@ -1067,6 +1081,7 @@
 				running++
 				for _, v := range FailedContainers(p) {
 					failedContainers = failedContainers + v.restarts
+					containerRestartNodes.Insert(p.Spec.NodeName)
 				}
 			} else if p.Status.Phase == api.PodPending {
 				if p.Spec.NodeName == "" {
@@ -1088,6 +1103,7 @@
 		}
 		if failedContainers > maxContainerFailures {
+			dumpNodeDebugInfo(config.Client, containerRestartNodes.List())
 			return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
 		}
 		if len(pods) < len(oldPods) || len(pods) > config.Replicas {
@@ -1137,6 +1153,11 @@ func dumpPodDebugInfo(c *client.Client, pods []*api.Pod) {
 
 func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
 	for _, n := range nodeNames {
+		Logf("\nLogging kubelet events for node %v", n)
+		for _, e := range getNodeEvents(c, n) {
+			Logf("source %v message %v reason %v first ts %v last ts %v, involved obj %+v",
+				e.Source, e.Message, e.Reason, e.FirstTimestamp, e.LastTimestamp, e.InvolvedObject)
+		}
 		Logf("\nLogging pods the kubelet thinks is on node %v", n)
 		podList, err := GetKubeletPods(c, n)
 		if err != nil {
@@ -1155,6 +1176,25 @@ func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
 	}
 }
 
+// getNodeEvents returns kubelet events from the given node. This includes kubelet
+// restart and node unhealthy events. Note that listing events like this will mess
+// with latency metrics; beware of calling it during a test.
+func getNodeEvents(c *client.Client, nodeName string) []api.Event {
+	events, err := c.Events(api.NamespaceDefault).List(
+		labels.Everything(),
+		fields.Set{
+			"involvedObject.kind":      "Node",
+			"involvedObject.name":      nodeName,
+			"involvedObject.namespace": api.NamespaceAll,
+			"source":                   "kubelet",
+		}.AsSelector())
+	if err != nil {
+		Logf("Unexpected error retrieving node events %v", err)
+		return []api.Event{}
+	}
+	return events.Items
+}
+
 func ScaleRC(c *client.Client, ns, name string, size uint) error {
 	By(fmt.Sprintf("%v Scaling replication controller %s in namespace %s to %d", time.Now(), name, ns, size))
 	scaler, err := kubectl.ScalerFor("ReplicationController", kubectl.NewScalerClient(c))
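
Note for readers of this patch (not part of the diff above): the new RCConfig.MaxContainerFailures field is a pointer so that "unset" and "zero tolerance" can be told apart. Below is a minimal, self-contained Go sketch of how a caller might use it; the rcConfig type and the replica count are stand-ins invented for illustration, and only the nil-versus-explicit-value behavior mirrors the RunRC change above.

package main

import (
	"fmt"
	"math"
)

// rcConfig is a trimmed-down stand-in for the real RCConfig; only the two
// fields needed to illustrate the new default logic are reproduced here.
type rcConfig struct {
	Replicas             int
	MaxContainerFailures *int
}

// allowedFailures mirrors the branch added to RunRC: an explicit value wins,
// otherwise fall back to 1% of replicas with a floor of one failure.
func allowedFailures(c rcConfig) int {
	if c.MaxContainerFailures != nil {
		return *c.MaxContainerFailures
	}
	return int(math.Max(1.0, float64(c.Replicas)*.01))
}

func main() {
	zero := 0

	// Density-test style: any container restart fails the run.
	strict := rcConfig{Replicas: 3000, MaxContainerFailures: &zero}
	fmt.Println(allowedFailures(strict)) // 0

	// Leaving the field nil keeps the old tolerance (1% of replicas, min 1).
	relaxed := rcConfig{Replicas: 3000}
	fmt.Println(allowedFailures(relaxed)) // 30
}

With this in place, the density test pins MaxContainerFailures to 0 so that any container restart fails the run, while tests that leave the field nil keep the previous 1% tolerance.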