From f68b884a9de717a03bb0d23767d40bd5d24fd9c8 Mon Sep 17 00:00:00 2001 From: gmarek Date: Wed, 10 May 2017 14:50:38 +0200 Subject: [PATCH] Move rest of performance data gathered by tests to Summaries --- test/e2e/framework/framework.go | 46 +++----------- test/e2e/framework/metrics_util.go | 97 +++++++++++++++++++----------- test/e2e/framework/test_context.go | 2 +- test/e2e/framework/util.go | 35 +++++++++++ test/e2e/perf/density.go | 23 +++++-- test/e2e/perf/load.go | 15 +++-- 6 files changed, 134 insertions(+), 84 deletions(-) diff --git a/test/e2e/framework/framework.go b/test/e2e/framework/framework.go index bb1c7aba5f..e0f453f4a5 100644 --- a/test/e2e/framework/framework.go +++ b/test/e2e/framework/framework.go @@ -21,7 +21,6 @@ import ( "bytes" "encoding/json" "fmt" - "io/ioutil" "os" "path" "reflect" @@ -91,6 +90,10 @@ type Framework struct { // configuration for framework's client Options FrameworkOptions + + // Place where various additional data is stored during test run to be printed to ReportDir, + // or stdout if ReportDir is not set once test ends. + TestSummaries []TestDataSummary } type TestDataSummary interface { @@ -307,19 +310,18 @@ func (f *Framework) AfterEach() { LogContainersInPodsWithLabels(f.ClientSet, metav1.NamespaceSystem, ImagePullerLabels, "image-puller", logFunc) } - summaries := make([]TestDataSummary, 0) if TestContext.GatherKubeSystemResourceUsageData != "false" && TestContext.GatherKubeSystemResourceUsageData != "none" && f.gatherer != nil { By("Collecting resource usage data") summary, resourceViolationError := f.gatherer.stopAndSummarize([]int{90, 99, 100}, f.AddonResourceConstraints) defer ExpectNoError(resourceViolationError) - summaries = append(summaries, summary) + f.TestSummaries = append(f.TestSummaries, summary) } if TestContext.GatherLogsSizes { By("Gathering log sizes data") close(f.logsSizeCloseChannel) f.logsSizeWaitGroup.Wait() - summaries = append(summaries, f.logsSizeVerifier.GetSummary()) + f.TestSummaries = append(f.TestSummaries, f.logsSizeVerifier.GetSummary()) } if TestContext.GatherMetricsAfterTest { @@ -334,44 +336,12 @@ func (f *Framework) AfterEach() { if err != nil { Logf("MetricsGrabber failed to grab metrics (skipping metrics gathering): %v", err) } else { - summaries = append(summaries, (*MetricsForE2E)(&received)) + f.TestSummaries = append(f.TestSummaries, (*MetricsForE2E)(&received)) } } } - outputTypes := strings.Split(TestContext.OutputPrintType, ",") - now := time.Now() - for _, printType := range outputTypes { - switch printType { - case "hr": - for i := range summaries { - if TestContext.ReportDir == "" { - Logf(summaries[i].PrintHumanReadable()) - } else { - // TODO: learn to extract test name and append it to the kind instead of timestamp. - filePath := path.Join(TestContext.ReportDir, summaries[i].SummaryKind()+"_"+f.BaseName+"_"+now.Format(time.RFC3339)+".txt") - if err := ioutil.WriteFile(filePath, []byte(summaries[i].PrintHumanReadable()), 0644); err != nil { - Logf("Failed to write file %v with test performance data: %v", filePath, err) - } - } - } - case "json": - for i := range summaries { - if TestContext.ReportDir == "" { - Logf("%v JSON\n%v", summaries[i].SummaryKind(), summaries[i].PrintJSON()) - Logf("Finished") - } else { - // TODO: learn to extract test name and append it to the kind instead of timestamp. 
- filePath := path.Join(TestContext.ReportDir, summaries[i].SummaryKind()+"_"+f.BaseName+"_"+now.Format(time.RFC3339)+".json") - if err := ioutil.WriteFile(filePath, []byte(summaries[i].PrintJSON()), 0644); err != nil { - Logf("Failed to write file %v with test performance data: %v", filePath, err) - } - } - } - default: - Logf("Unknown output type: %v. Skipping.", printType) - } - } + PrintSummaries(f.TestSummaries, f.BaseName) // Check whether all nodes are ready after the test. // This is explicitly done at the very end of the test, to avoid diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go index e8b9e7c8ca..fa11cb7764 100644 --- a/test/e2e/framework/metrics_util.go +++ b/test/e2e/framework/metrics_util.go @@ -150,12 +150,36 @@ type PodStartupLatency struct { Latency LatencyMetric `json:"latency"` } +func (l *PodStartupLatency) SummaryKind() string { + return "PodStartupLatency" +} + +func (l *PodStartupLatency) PrintHumanReadable() string { + return PrettyPrintJSON(*l) +} + +func (l *PodStartupLatency) PrintJSON() string { + return PrettyPrintJSON(*l) +} + type SchedulingLatency struct { Scheduling LatencyMetric `json:"scheduling"` Binding LatencyMetric `json:"binding"` Total LatencyMetric `json:"total"` } +func (l *SchedulingLatency) SummaryKind() string { + return "SchedulingLatency" +} + +func (l *SchedulingLatency) PrintHumanReadable() string { + return PrettyPrintJSON(*l) +} + +func (l *SchedulingLatency) PrintJSON() string { + return PrettyPrintJSON(*l) +} + type SaturationTime struct { TimeToSaturate time.Duration `json:"timeToStaturate"` NumberOfNodes int `json:"numberOfNodes"` @@ -174,9 +198,23 @@ type APIResponsiveness struct { APICalls []APICall `json:"apicalls"` } -func (a APIResponsiveness) Len() int { return len(a.APICalls) } -func (a APIResponsiveness) Swap(i, j int) { a.APICalls[i], a.APICalls[j] = a.APICalls[j], a.APICalls[i] } -func (a APIResponsiveness) Less(i, j int) bool { +func (a *APIResponsiveness) SummaryKind() string { + return "APIResponsiveness" +} + +func (a *APIResponsiveness) PrintHumanReadable() string { + return PrettyPrintJSON(*a) +} + +func (a *APIResponsiveness) PrintJSON() string { + return PrettyPrintJSON(*a) +} + +func (a *APIResponsiveness) Len() int { return len(a.APICalls) } +func (a *APIResponsiveness) Swap(i, j int) { + a.APICalls[i], a.APICalls[j] = a.APICalls[j], a.APICalls[i] +} +func (a *APIResponsiveness) Less(i, j int) bool { return a.APICalls[i].Latency.Perc99 < a.APICalls[j].Latency.Perc99 } @@ -225,17 +263,17 @@ func (a *APIResponsiveness) addMetricRequestCount(resource, verb string, count i a.APICalls = append(a.APICalls, apicall) } -func readLatencyMetrics(c clientset.Interface) (APIResponsiveness, error) { +func readLatencyMetrics(c clientset.Interface) (*APIResponsiveness, error) { var a APIResponsiveness body, err := getMetrics(c) if err != nil { - return a, err + return nil, err } samples, err := extractMetricSamples(body) if err != nil { - return a, err + return nil, err } ignoredResources := sets.NewString("events") @@ -262,7 +300,7 @@ func readLatencyMetrics(c clientset.Interface) (APIResponsiveness, error) { latency := sample.Value quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64) if err != nil { - return a, err + return nil, err } a.addMetricRequestLatency(resource, verb, quantile, time.Duration(int64(latency))*time.Microsecond) case "apiserver_request_count": @@ -272,22 +310,22 @@ func readLatencyMetrics(c clientset.Interface) (APIResponsiveness, error) 
{ } } - return a, err + return &a, err } // Prints top five summary metrics for request types with latency and returns // number of such request types above threshold. -func HighLatencyRequests(c clientset.Interface) (int, error) { +func HighLatencyRequests(c clientset.Interface) (int, *APIResponsiveness, error) { metrics, err := readLatencyMetrics(c) if err != nil { - return 0, err + return 0, metrics, err } sort.Sort(sort.Reverse(metrics)) badMetrics := 0 top := 5 - for _, metric := range metrics.APICalls { + for i := range metrics.APICalls { isBad := false - if metric.Latency.Perc99 > apiCallLatencyThreshold { + if metrics.APICalls[i].Latency.Perc99 > apiCallLatencyThreshold { badMetrics++ isBad = true } @@ -297,23 +335,15 @@ func HighLatencyRequests(c clientset.Interface) (int, error) { if isBad { prefix = "WARNING " } - Logf("%vTop latency metric: %+v", prefix, metric) + Logf("%vTop latency metric: %+v", prefix, metrics.APICalls[i]) } } - - // TODO(random-liu): Remove the log when we migrate to new perfdash - Logf("API calls latencies: %s", PrettyPrintJSON(metrics)) - // Log perf data - PrintPerfData(ApiCallToPerfData(metrics)) - - return badMetrics, nil + return badMetrics, metrics, nil } // Verifies whether 50, 90 and 99th percentiles of PodStartupLatency are // within the threshold. -func VerifyPodStartupLatency(latency PodStartupLatency) error { - Logf("Pod startup latency: %s", PrettyPrintJSON(latency)) - +func VerifyPodStartupLatency(latency *PodStartupLatency) error { if latency.Latency.Perc50 > podStartupThreshold { return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.Latency.Perc50) } @@ -349,7 +379,7 @@ func getMetrics(c clientset.Interface) (string, error) { } // Retrieves scheduler metrics information. -func getSchedulingLatency(c clientset.Interface) (SchedulingLatency, error) { +func getSchedulingLatency(c clientset.Interface) (*SchedulingLatency, error) { result := SchedulingLatency{} // Check if master Node is registered @@ -358,7 +388,7 @@ func getSchedulingLatency(c clientset.Interface) (SchedulingLatency, error) { subResourceProxyAvailable, err := ServerVersionGTE(SubResourcePodProxyVersion, c.Discovery()) if err != nil { - return result, err + return nil, err } var data string @@ -400,13 +430,13 @@ func getSchedulingLatency(c clientset.Interface) (SchedulingLatency, error) { cmd := "curl http://localhost:10251/metrics" sshResult, err := SSH(cmd, GetMasterHost()+":22", TestContext.Provider) if err != nil || sshResult.Code != 0 { - return result, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err) + return &result, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err) } data = sshResult.Stdout } samples, err := extractMetricSamples(data) if err != nil { - return result, err + return nil, err } for _, sample := range samples { @@ -426,23 +456,20 @@ func getSchedulingLatency(c clientset.Interface) (SchedulingLatency, error) { latency := sample.Value quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64) if err != nil { - return result, err + return nil, err } setQuantile(metric, quantile, time.Duration(int64(latency))*time.Microsecond) } - return result, nil + return &result, nil } // Verifies (currently just by logging them) the scheduling latencies. 
-func VerifySchedulerLatency(c clientset.Interface) error { +func VerifySchedulerLatency(c clientset.Interface) (*SchedulingLatency, error) { latency, err := getSchedulingLatency(c) if err != nil { - return err + return nil, err } - Logf("Scheduling latency: %s", PrettyPrintJSON(latency)) - - // TODO: Add some reasonable checks once we know more about the values. - return nil + return latency, nil } func PrettyPrintJSON(metrics interface{}) string { diff --git a/test/e2e/framework/test_context.go b/test/e2e/framework/test_context.go index d393143058..779b5ceaf6 100644 --- a/test/e2e/framework/test_context.go +++ b/test/e2e/framework/test_context.go @@ -162,7 +162,7 @@ func RegisterCommonFlags() { flag.StringVar(&TestContext.GatherKubeSystemResourceUsageData, "gather-resource-usage", "false", "If set to 'true' or 'all' framework will be monitoring resource usage of system all add-ons in (some) e2e tests, if set to 'master' framework will be monitoring master node only, if set to 'none' of 'false' monitoring will be turned off.") flag.BoolVar(&TestContext.GatherLogsSizes, "gather-logs-sizes", false, "If set to true framework will be monitoring logs sizes on all machines running e2e tests.") flag.BoolVar(&TestContext.GatherMetricsAfterTest, "gather-metrics-at-teardown", false, "If set to true framwork will gather metrics from all components after each test.") - flag.StringVar(&TestContext.OutputPrintType, "output-print-type", "hr", "Comma separated list: 'hr' for human readable summaries 'json' for JSON ones.") + flag.StringVar(&TestContext.OutputPrintType, "output-print-type", "json", "Format in which summaries should be printed: 'hr' for human readable, 'json' for JSON ones.") flag.BoolVar(&TestContext.DumpLogsOnFailure, "dump-logs-on-failure", true, "If set to true test will dump data about the namespace in which test was running.") flag.BoolVar(&TestContext.DisableLogDump, "disable-log-dump", false, "If set to true, logs from master and nodes won't be gathered after test run.") flag.BoolVar(&TestContext.DeleteNamespace, "delete-namespace", true, "If true tests will delete namespace after completion. It is only designed to make debugging easier, DO NOT turn it off by default.") diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go index bb66594b4a..3aaa744507 100644 --- a/test/e2e/framework/util.go +++ b/test/e2e/framework/util.go @@ -5526,3 +5526,38 @@ func GetAzureCloud() (*azure.Cloud, error) { } return cloud, nil } + +func PrintSummaries(summaries []TestDataSummary, testBaseName string) { + now := time.Now() + for i := range summaries { + Logf("Printing summary: %v", summaries[i].SummaryKind()) + switch TestContext.OutputPrintType { + case "hr": + if TestContext.ReportDir == "" { + Logf(summaries[i].PrintHumanReadable()) + } else { + // TODO: learn to extract test name and append it to the kind instead of timestamp. + filePath := path.Join(TestContext.ReportDir, summaries[i].SummaryKind()+"_"+testBaseName+"_"+now.Format(time.RFC3339)+".txt") + if err := ioutil.WriteFile(filePath, []byte(summaries[i].PrintHumanReadable()), 0644); err != nil { + Logf("Failed to write file %v with test performance data: %v", filePath, err) + } + } + case "json": + fallthrough + default: + if TestContext.OutputPrintType != "json" { + Logf("Unknown output type: %v. 
Printing JSON", TestContext.OutputPrintType) + } + if TestContext.ReportDir == "" { + Logf("%v JSON\n%v", summaries[i].SummaryKind(), summaries[i].PrintJSON()) + Logf("Finished") + } else { + // TODO: learn to extract test name and append it to the kind instead of timestamp. + filePath := path.Join(TestContext.ReportDir, summaries[i].SummaryKind()+"_"+testBaseName+"_"+now.Format(time.RFC3339)+".json") + if err := ioutil.WriteFile(filePath, []byte(summaries[i].PrintJSON()), 0644); err != nil { + Logf("Failed to write file %v with test performance data: %v", filePath, err) + } + } + } + } +} diff --git a/test/e2e/perf/density.go b/test/e2e/perf/density.go index c7b5cd5bbd..ac95f456a2 100644 --- a/test/e2e/perf/density.go +++ b/test/e2e/perf/density.go @@ -304,6 +304,8 @@ var _ = framework.KubeDescribe("Density", func() { var nodes *v1.NodeList var masters sets.String + testCaseBaseName := "density" + // Gathers data prior to framework namespace teardown AfterEach(func() { saturationThreshold := time.Duration((totalPods / MinPodsPerSecondThroughput)) * time.Second @@ -319,17 +321,25 @@ var _ = framework.KubeDescribe("Density", func() { } framework.Logf("Cluster saturation time: %s", framework.PrettyPrintJSON(saturationData)) + summaries := make([]framework.TestDataSummary, 0, 2) // Verify latency metrics. - highLatencyRequests, err := framework.HighLatencyRequests(c) + highLatencyRequests, metrics, err := framework.HighLatencyRequests(c) framework.ExpectNoError(err) - Expect(highLatencyRequests).NotTo(BeNumerically(">", 0), "There should be no high-latency requests") + if err == nil { + summaries = append(summaries, metrics) + Expect(highLatencyRequests).NotTo(BeNumerically(">", 0), "There should be no high-latency requests") + } // Verify scheduler metrics. // TODO: Reset metrics at the beginning of the test. // We should do something similar to how we do it for APIserver. - if err = framework.VerifySchedulerLatency(c); err != nil { - framework.Logf("Warning: Scheduler latency not calculated, %v", err) + latency, err := framework.VerifySchedulerLatency(c) + framework.ExpectNoError(err) + if err == nil { + summaries = append(summaries, latency) } + + framework.PrintSummaries(summaries, testCaseBaseName) }) options := framework.FrameworkOptions{ @@ -338,7 +348,7 @@ var _ = framework.KubeDescribe("Density", func() { } // Explicitly put here, to delete namespace at the end of the test // (after measuring latency metrics, etc.). - f := framework.NewFramework("density", options, nil) + f := framework.NewFramework(testCaseBaseName, options, nil) f.NamespaceDeletionTimeout = time.Hour BeforeEach(func() { @@ -718,7 +728,8 @@ var _ = framework.KubeDescribe("Density", func() { framework.PrintLatencies(e2eLag, "worst e2e total latencies") // Test whether e2e pod startup time is acceptable. 
- podStartupLatency := framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLag)} + podStartupLatency := &framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLag)} + f.TestSummaries = append(f.TestSummaries, podStartupLatency) framework.ExpectNoError(framework.VerifyPodStartupLatency(podStartupLatency)) framework.LogSuspiciousLatency(startupLag, e2eLag, nodeCount, c) diff --git a/test/e2e/perf/load.go b/test/e2e/perf/load.go index ca2587870b..bd104bd689 100644 --- a/test/e2e/perf/load.go +++ b/test/e2e/perf/load.go @@ -81,13 +81,20 @@ var _ = framework.KubeDescribe("Load capacity", func() { var configs []testutils.RunObjectConfig var secretConfigs []*testutils.SecretConfig + testCaseBaseName := "load" + // Gathers metrics before teardown // TODO add flag that allows to skip cleanup on failure AfterEach(func() { // Verify latency metrics - highLatencyRequests, err := framework.HighLatencyRequests(clientset) - framework.ExpectNoError(err, "Too many instances metrics above the threshold") - Expect(highLatencyRequests).NotTo(BeNumerically(">", 0)) + highLatencyRequests, metrics, err := framework.HighLatencyRequests(clientset) + framework.ExpectNoError(err) + if err == nil { + summaries := make([]framework.TestDataSummary, 0, 1) + summaries = append(summaries, metrics) + framework.PrintSummaries(summaries, testCaseBaseName) + Expect(highLatencyRequests).NotTo(BeNumerically(">", 0), "There should be no high-latency requests") + } }) // We assume a default throughput of 10 pods/second throughput. @@ -106,7 +113,7 @@ var _ = framework.KubeDescribe("Load capacity", func() { ClientQPS: float32(math.Max(50.0, float64(2*throughput))), ClientBurst: int(math.Max(100.0, float64(4*throughput))), } - f := framework.NewFramework("load", options, nil) + f := framework.NewFramework(testCaseBaseName, options, nil) f.NamespaceDeletionTimeout = time.Hour BeforeEach(func() {
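
The overall flow this change settles on: per-test collectors return values satisfying the TestDataSummary methods (SummaryKind, PrintHumanReadable, PrintJSON), the framework accumulates them in Framework.TestSummaries, and a single framework.PrintSummaries helper handles the hr/json output switch (with unknown types falling through to JSON). Below is a minimal standalone Go sketch of that flow; the names used here (summary, podStartupLatency, printSummaries) and the sample values are stand-ins invented for illustration, not identifiers from the patch.

package main

import (
	"encoding/json"
	"fmt"
)

// summary mirrors the three methods the patch relies on from TestDataSummary.
type summary interface {
	SummaryKind() string
	PrintHumanReadable() string
	PrintJSON() string
}

// podStartupLatency is a stand-in for framework.PodStartupLatency.
type podStartupLatency struct {
	Perc50 string `json:"perc50"`
	Perc99 string `json:"perc99"`
}

func (l *podStartupLatency) SummaryKind() string        { return "PodStartupLatency" }
func (l *podStartupLatency) PrintHumanReadable() string { return l.PrintJSON() }
func (l *podStartupLatency) PrintJSON() string {
	b, _ := json.MarshalIndent(l, "", "  ")
	return string(b)
}

// printSummaries mimics the control flow of the new framework.PrintSummaries:
// "hr" prints the human-readable form, anything else falls through to JSON.
// The real helper additionally writes to ReportDir when it is set.
func printSummaries(summaries []summary, outputType string) {
	for _, s := range summaries {
		fmt.Printf("Printing summary: %v\n", s.SummaryKind())
		if outputType == "hr" {
			fmt.Println(s.PrintHumanReadable())
			continue
		}
		fmt.Printf("%v JSON\n%v\n", s.SummaryKind(), s.PrintJSON())
	}
}

func main() {
	// A test would normally append these to Framework.TestSummaries (or pass
	// them to PrintSummaries directly, as density.go and load.go now do).
	collected := []summary{&podStartupLatency{Perc50: "1.2s", Perc99: "4.5s"}}
	printSummaries(collected, "json")
}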