Merge pull request #10802 from bprashanth/max_containers_fail

Don't ignore containers restarting during tests
Wojciech Tyczynski 2015-07-07 09:12:09 +02:00
commit 34dd9c7880
3 changed files with 52 additions and 9 deletions


@@ -43,6 +43,9 @@ import (
// NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
const NodeStartupThreshold = 4 * time.Second
+// Maximum container failures this test tolerates before failing.
+var MaxContainerFailures = 0
// podLatencyData encapsulates pod startup latency information.
type podLatencyData struct {
// Name of the pod
@@ -190,14 +193,14 @@ var _ = Describe("Density", func() {
fileHndl, err := os.Create(fmt.Sprintf(testContext.OutputDir+"/%s/pod_states.csv", uuid))
expectNoError(err)
defer fileHndl.Close()
config := RCConfig{Client: c,
-Image:         "gcr.io/google_containers/pause:go",
-Name:          RCName,
-Namespace:     ns,
-PollInterval:  itArg.interval,
-PodStatusFile: fileHndl,
-Replicas:      totalPods,
+Image:                "gcr.io/google_containers/pause:go",
+Name:                 RCName,
+Namespace:            ns,
+PollInterval:         itArg.interval,
+PodStatusFile:        fileHndl,
+Replicas:             totalPods,
+MaxContainerFailures: &MaxContainerFailures,
}
// Create a listener for events.
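A non-nil MaxContainerFailures is how a test opts in to strict restart checking, while leaving it nil keeps RunRC's percentage-based default. Below is a minimal sketch, not part of this diff, of how another e2e test could use the new knob (the RC name and replica count are made up for illustration):

    // Tolerate zero container restarts: pass a pointer to an explicit 0 so
    // RunRC can distinguish "unset" (nil, use the default) from "set to 0".
    strict := 0
    config := RCConfig{
        Client:               c,
        Image:                "gcr.io/google_containers/pause:go",
        Name:                 "strict-rc", // hypothetical name
        Namespace:            ns,
        Replicas:             50, // hypothetical count
        MaxContainerFailures: &strict,
    }
    expectNoError(RunRC(config))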


@@ -162,7 +162,7 @@ func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nod
}
sort.Sort(KubeletMetricByLatency(metric))
var badMetrics []KubeletMetric
Logf("Latency metrics for node %v", nodeName)
Logf("\nLatency metrics for node %v", nodeName)
for _, m := range metric {
if m.Latency > threshold {
badMetrics = append(badMetrics, m)


@@ -178,6 +178,10 @@ type RCConfig struct {
// Pointer to a list of pods; if non-nil, will be set to a list of pods
// created by this RC by RunRC.
CreatedPods *[]*api.Pod
+// Maximum allowable container failures. If exceeded, RunRC returns an error.
+// Defaults to max(1, replicas*0.01) if unspecified.
+MaxContainerFailures *int
}
func Logf(format string, a ...interface{}) {
@@ -984,7 +988,15 @@ func Diff(oldPods []*api.Pod, curPods []*api.Pod) PodDiff {
// It's the caller's responsibility to clean up externally (i.e. use the
// namespace lifecycle for handling cleanup).
func RunRC(config RCConfig) error {
-maxContainerFailures := int(math.Max(1.0, float64(config.Replicas)*.01))
+// Don't force tests to fail if they don't care about containers restarting.
+var maxContainerFailures int
+if config.MaxContainerFailures == nil {
+maxContainerFailures = int(math.Max(1.0, float64(config.Replicas)*.01))
+} else {
+maxContainerFailures = *config.MaxContainerFailures
+}
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": config.Name}))
By(fmt.Sprintf("%v Creating replication controller %s", time.Now(), config.Name))
@@ -1058,6 +1070,8 @@ func RunRC(config RCConfig) error {
unknown := 0
inactive := 0
failedContainers := 0
+containerRestartNodes := util.NewStringSet()
pods := podStore.List()
if config.CreatedPods != nil {
*config.CreatedPods = pods
@@ -1067,6 +1081,7 @@ func RunRC(config RCConfig) error {
running++
for _, v := range FailedContainers(p) {
failedContainers = failedContainers + v.restarts
+containerRestartNodes.Insert(p.Spec.NodeName)
}
} else if p.Status.Phase == api.PodPending {
if p.Spec.NodeName == "" {
@@ -1088,6 +1103,7 @@ func RunRC(config RCConfig) error {
}
if failedContainers > maxContainerFailures {
+dumpNodeDebugInfo(config.Client, containerRestartNodes.List())
return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
}
if len(pods) < len(oldPods) || len(pods) > config.Replicas {
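Recording the node name alongside each restart count means that, when the threshold is crossed, dumpNodeDebugInfo only has to pull events and pod lists from the nodes that actually hosted a restarting container rather than from every node in the cluster.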
@@ -1137,6 +1153,11 @@ func dumpPodDebugInfo(c *client.Client, pods []*api.Pod) {
func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
for _, n := range nodeNames {
Logf("\nLogging kubelet events for node %v", n)
for _, e := range getNodeEvents(c, n) {
Logf("source %v message %v reason %v first ts %v last ts %v, involved obj %+v",
e.Source, e.Message, e.Reason, e.FirstTimestamp, e.LastTimestamp, e.InvolvedObject)
}
Logf("\nLogging pods the kubelet thinks is on node %v", n)
podList, err := GetKubeletPods(c, n)
if err != nil {
@@ -1155,6 +1176,25 @@ func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
}
}
+// getNodeEvents returns the kubelet events from the given node. This includes
+// kubelet restart and node-unhealthy events. Note that listing events like this
+// will mess with latency metrics; beware of calling it during a test.
+func getNodeEvents(c *client.Client, nodeName string) []api.Event {
+events, err := c.Events(api.NamespaceDefault).List(
+labels.Everything(),
+fields.Set{
+"involvedObject.kind":      "Node",
+"involvedObject.name":      nodeName,
+"involvedObject.namespace": api.NamespaceAll,
+"source":                   "kubelet",
+}.AsSelector())
+if err != nil {
+Logf("Unexpected error retrieving node events %v", err)
+return []api.Event{}
+}
+return events.Items
+}
func ScaleRC(c *client.Client, ns, name string, size uint) error {
By(fmt.Sprintf("%v Scaling replication controller %s in namespace %s to %d", time.Now(), name, ns, size))
scaler, err := kubectl.ScalerFor("ReplicationController", kubectl.NewScalerClient(c))
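One note on getNodeEvents above: the field selector narrows the listing server-side to kubelet-sourced events whose involved object is the node itself, and because nodes are cluster-scoped rather than namespaced, involvedObject.namespace is matched against api.NamespaceAll.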