From a968f98dc2394be4f61c67cc702fcb0e4ded6987 Mon Sep 17 00:00:00 2001 From: Wojciech Tyczynski Date: Wed, 18 Nov 2015 17:07:26 +0100 Subject: [PATCH] Expose information about scheduling latency in scalability tests. --- plugin/pkg/scheduler/metrics/metrics.go | 3 + test/e2e/density.go | 7 ++- test/e2e/metrics_util.go | 75 ++++++++++++++++++++++--- 3 files changed, 77 insertions(+), 8 deletions(-) diff --git a/plugin/pkg/scheduler/metrics/metrics.go b/plugin/pkg/scheduler/metrics/metrics.go index a2db2b3323..77bd64bfb3 100644 --- a/plugin/pkg/scheduler/metrics/metrics.go +++ b/plugin/pkg/scheduler/metrics/metrics.go @@ -31,6 +31,7 @@ var ( Subsystem: schedulerSubsystem, Name: "e2e_scheduling_latency_microseconds", Help: "E2e scheduling latency (scheduling algorithm + binding)", + MaxAge: time.Hour, }, ) SchedulingAlgorithmLatency = prometheus.NewSummary( @@ -38,6 +39,7 @@ var ( Subsystem: schedulerSubsystem, Name: "scheduling_algorithm_latency_microseconds", Help: "Scheduling algorithm latency", + MaxAge: time.Hour, }, ) BindingLatency = prometheus.NewSummary( @@ -45,6 +47,7 @@ var ( Subsystem: schedulerSubsystem, Name: "binding_latency_microseconds", Help: "Binding latency", + MaxAge: time.Hour, }, ) ) diff --git a/test/e2e/density.go b/test/e2e/density.go index 297224f2a6..049b78137f 100644 --- a/test/e2e/density.go +++ b/test/e2e/density.go @@ -104,10 +104,15 @@ var _ = Describe("Density [Skipped]", func() { expectNoError(writePerfData(c, fmt.Sprintf(testContext.OutputDir+"/%s", uuid), "after")) - // Verify latency metrics + // Verify latency metrics. highLatencyRequests, err := HighLatencyRequests(c) expectNoError(err) Expect(highLatencyRequests).NotTo(BeNumerically(">", 0), "There should be no high-latency requests") + + // Verify scheduler metrics. + // TODO: Reset metrics at the beginning of the test. + // We should do something similar to how we do it for APIserver. 
+ expectNoError(VerifySchedulerLatency()) }) framework := NewFramework("density") diff --git a/test/e2e/metrics_util.go b/test/e2e/metrics_util.go index 2f25bba7ba..c565c8807a 100644 --- a/test/e2e/metrics_util.go +++ b/test/e2e/metrics_util.go @@ -57,6 +57,12 @@ type PodStartupLatency struct { Latency LatencyMetric `json:"latency"` } +type SchedulingLatency struct { + Scheduling LatencyMetric `json:"scheduling"` + Binding LatencyMetric `json:"binding"` + Total LatencyMetric `json:"total"` +} + type APICall struct { Resource string `json:"resource"` Verb string `json:"verb"` @@ -78,26 +84,31 @@ func (a APIResponsiveness) Less(i, j int) bool { func (a *APIResponsiveness) addMetric(resource, verb string, quantile float64, latency time.Duration) { for i, apicall := range a.APICalls { if apicall.Resource == resource && apicall.Verb == verb { - a.APICalls[i] = setQuantile(apicall, quantile, latency) + a.APICalls[i] = setQuantileAPICall(apicall, quantile, latency) return } } - apicall := setQuantile(APICall{Resource: resource, Verb: verb}, quantile, latency) + apicall := setQuantileAPICall(APICall{Resource: resource, Verb: verb}, quantile, latency) a.APICalls = append(a.APICalls, apicall) } // 0 <= quantile <=1 (e.g. 0.95 is 95%tile, 0.5 is median) // Only 0.5, 0.9 and 0.99 quantiles are supported. -func setQuantile(apicall APICall, quantile float64, latency time.Duration) APICall { +func setQuantileAPICall(apicall APICall, quantile float64, latency time.Duration) APICall { + setQuantile(&apicall.Latency, quantile, latency) + return apicall +} + +// Only 0.5, 0.9 and 0.99 quantiles are supported. 
+func setQuantile(metric *LatencyMetric, quantile float64, latency time.Duration) { switch quantile { case 0.5: - apicall.Latency.Perc50 = latency + metric.Perc50 = latency case 0.9: - apicall.Latency.Perc90 = latency + metric.Perc90 = latency case 0.99: - apicall.Latency.Perc99 = latency + metric.Perc99 = latency } - return apicall } func readLatencyMetrics(c *client.Client) (APIResponsiveness, error) { @@ -233,6 +244,56 @@ func getMetrics(c *client.Client) (string, error) { return string(body), nil } +// Retrieves scheduler metrics information. +func getSchedulingLatency() (SchedulingLatency, error) { + result := SchedulingLatency{} + + cmd := "curl http://localhost:10251/metrics" + sshResult, err := SSH(cmd, getMasterHost()+":22", testContext.Provider) + if err != nil || sshResult.Code != 0 { + return result, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err) + } + samples, err := extractMetricSamples(sshResult.Stdout) + if err != nil { + return result, err + } + + for _, sample := range samples { + var metric *LatencyMetric = nil + switch sample.Metric[model.MetricNameLabel] { + case "scheduler_scheduling_algorithm_latency_microseconds": + metric = &result.Scheduling + case "scheduler_binding_latency_microseconds": + metric = &result.Binding + case "scheduler_e2e_scheduling_latency_microseconds": + metric = &result.Total + } + if metric == nil { + continue + } + + latency := sample.Value + quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64) + if err != nil { + return result, err + } + setQuantile(metric, quantile, time.Duration(int64(latency))*time.Microsecond) + } + return result, nil +} + +// Verifies (currently just by logging them) the scheduling latencies. 
+func VerifySchedulerLatency() error { + latency, err := getSchedulingLatency() + if err != nil { + return err + } + Logf("Scheduling latency: %s", prettyPrintJSON(latency)) + + // TODO: Add some reasonable checks once we know more about the values. + return nil +} + func prettyPrintJSON(metrics interface{}) string { output := &bytes.Buffer{} if err := json.NewEncoder(output).Encode(metrics); err != nil {