mirror of https://github.com/k3s-io/k3s
Expose information about scheduling latency in scalability tests.
parent 4a9b0fc715
commit a968f98dc2

@@ -31,6 +31,7 @@ var (
 			Subsystem: schedulerSubsystem,
 			Name:      "e2e_scheduling_latency_microseconds",
 			Help:      "E2e scheduling latency (scheduling algorithm + binding)",
+			MaxAge:    time.Hour,
 		},
 	)
 	SchedulingAlgorithmLatency = prometheus.NewSummary(

@@ -38,6 +39,7 @@ var (
 			Subsystem: schedulerSubsystem,
 			Name:      "scheduling_algorithm_latency_microseconds",
 			Help:      "Scheduling algorithm latency",
+			MaxAge:    time.Hour,
 		},
 	)
 	BindingLatency = prometheus.NewSummary(

@@ -45,6 +47,7 @@ var (
 			Subsystem: schedulerSubsystem,
 			Name:      "binding_latency_microseconds",
 			Help:      "Binding latency",
+			MaxAge:    time.Hour,
 		},
 	)
 )

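Note, a minimal sketch rather than part of the commit: the only change to these three summaries is the new MaxAge field. A Prometheus Summary computes its quantiles over a sliding time window, and the Go client's default window (prometheus.DefMaxAge, ten minutes) is shorter than a scalability test run, so the window is widened to an hour. The fragment below shows the same pattern with the standard client; the metric variable and the Observe call site are assumptions made for the example.

package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Summary mirroring the ones in the diff; its quantiles are computed over
// the last MaxAge of observations, so an hour-long window keeps the whole
// test run visible when the endpoint is scraped.
var bindingLatency = prometheus.NewSummary(prometheus.SummaryOpts{
	Subsystem: "scheduler",
	Name:      "binding_latency_microseconds",
	Help:      "Binding latency",
	MaxAge:    time.Hour, // default is prometheus.DefMaxAge (10 minutes)
})

func main() {
	prometheus.MustRegister(bindingLatency)

	start := time.Now()
	// ... bind a pod here ...
	bindingLatency.Observe(float64(time.Since(start) / time.Microsecond))
}
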
@@ -104,10 +104,15 @@ var _ = Describe("Density [Skipped]", func() {
 		expectNoError(writePerfData(c, fmt.Sprintf(testContext.OutputDir+"/%s", uuid), "after"))
 
-		// Verify latency metrics
+		// Verify latency metrics.
 		highLatencyRequests, err := HighLatencyRequests(c)
 		expectNoError(err)
 		Expect(highLatencyRequests).NotTo(BeNumerically(">", 0), "There should be no high-latency requests")
+
+		// Verify scheduler metrics.
+		// TODO: Reset metrics at the beginning of the test.
+		// We should do something similar to how we do it for APIserver.
+		expectNoError(VerifySchedulerLatency())
 	})
 
 	framework := NewFramework("density")

@@ -57,6 +57,12 @@ type PodStartupLatency struct {
 	Latency LatencyMetric `json:"latency"`
 }
 
+type SchedulingLatency struct {
+	Scheduling LatencyMetric `json:"scheduling"`
+	Binding    LatencyMetric `json:"binding"`
+	Total      LatencyMetric `json:"total"`
+}
+
 type APICall struct {
 	Resource string `json:"resource"`
 	Verb     string `json:"verb"`

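A small sketch, not part of the diff: the new SchedulingLatency struct is what VerifySchedulerLatency later logs via prettyPrintJSON. LatencyMetric is re-declared below purely for illustration (its real definition and JSON tags live in the e2e metrics utilities); note that time.Duration fields marshal as plain integer nanoseconds.

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Illustrative stand-in for the real LatencyMetric; field names and JSON
// tags here are assumptions for the example.
type LatencyMetric struct {
	Perc50 time.Duration `json:"Perc50"`
	Perc90 time.Duration `json:"Perc90"`
	Perc99 time.Duration `json:"Perc99"`
}

type SchedulingLatency struct {
	Scheduling LatencyMetric `json:"scheduling"`
	Binding    LatencyMetric `json:"binding"`
	Total      LatencyMetric `json:"total"`
}

func main() {
	l := SchedulingLatency{
		Binding: LatencyMetric{Perc50: 1220 * time.Microsecond, Perc99: 4100 * time.Microsecond},
	}
	out, _ := json.Marshal(l)
	// time.Duration has no custom JSON marshalling, so the logged
	// percentiles appear as integer nanoseconds, e.g. "Perc50":1220000.
	fmt.Println(string(out))
}
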
@@ -78,26 +84,31 @@ func (a APIResponsiveness) Less(i, j int) bool {
 func (a *APIResponsiveness) addMetric(resource, verb string, quantile float64, latency time.Duration) {
 	for i, apicall := range a.APICalls {
 		if apicall.Resource == resource && apicall.Verb == verb {
-			a.APICalls[i] = setQuantile(apicall, quantile, latency)
+			a.APICalls[i] = setQuantileAPICall(apicall, quantile, latency)
 			return
 		}
 	}
-	apicall := setQuantile(APICall{Resource: resource, Verb: verb}, quantile, latency)
+	apicall := setQuantileAPICall(APICall{Resource: resource, Verb: verb}, quantile, latency)
 	a.APICalls = append(a.APICalls, apicall)
 }
 
 // 0 <= quantile <=1 (e.g. 0.95 is 95%tile, 0.5 is median)
 // Only 0.5, 0.9 and 0.99 quantiles are supported.
-func setQuantile(apicall APICall, quantile float64, latency time.Duration) APICall {
+func setQuantileAPICall(apicall APICall, quantile float64, latency time.Duration) APICall {
+	setQuantile(&apicall.Latency, quantile, latency)
+	return apicall
+}
+
+// Only 0.5, 0.9 and 0.99 quantiles are supported.
+func setQuantile(metric *LatencyMetric, quantile float64, latency time.Duration) {
 	switch quantile {
 	case 0.5:
-		apicall.Latency.Perc50 = latency
+		metric.Perc50 = latency
 	case 0.9:
-		apicall.Latency.Perc90 = latency
+		metric.Perc90 = latency
 	case 0.99:
-		apicall.Latency.Perc99 = latency
+		metric.Perc99 = latency
 	}
-	return apicall
 }
 
 func readLatencyMetrics(c *client.Client) (APIResponsiveness, error) {

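A sketch of the refactoring above, not part of the diff: setQuantile now writes into any *LatencyMetric, so the quantile-to-percentile mapping can be shared between API-call latencies and the new scheduler latencies, while setQuantileAPICall keeps the old value-based signature for the API-responsiveness path. Types are re-declared here only to keep the example self-contained.

package main

import (
	"fmt"
	"time"
)

// Re-declared here only for the example; the real definitions live in the
// e2e metrics utilities.
type LatencyMetric struct {
	Perc50, Perc90, Perc99 time.Duration
}

type APICall struct {
	Resource string
	Verb     string
	Latency  LatencyMetric
}

// Generic helper: writes one observation into the matching percentile field.
// Only 0.5, 0.9 and 0.99 quantiles are supported.
func setQuantile(metric *LatencyMetric, quantile float64, latency time.Duration) {
	switch quantile {
	case 0.5:
		metric.Perc50 = latency
	case 0.9:
		metric.Perc90 = latency
	case 0.99:
		metric.Perc99 = latency
	}
}

// Thin wrapper preserving the old value-based API for APICall entries.
func setQuantileAPICall(apicall APICall, quantile float64, latency time.Duration) APICall {
	setQuantile(&apicall.Latency, quantile, latency)
	return apicall
}

func main() {
	// The same helper now fills scheduler latencies ...
	var binding LatencyMetric
	setQuantile(&binding, 0.99, 4100*time.Microsecond)

	// ... and API-call latencies, via the wrapper.
	call := setQuantileAPICall(APICall{Resource: "pods", Verb: "POST"}, 0.5, 30*time.Millisecond)

	fmt.Println(binding.Perc99, call.Latency.Perc50)
}
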
@@ -233,6 +244,56 @@ func getMetrics(c *client.Client) (string, error) {
 	return string(body), nil
 }
 
+// Retrieves scheduler metrics information.
+func getSchedulingLatency() (SchedulingLatency, error) {
+	result := SchedulingLatency{}
+
+	cmd := "curl http://localhost:10251/metrics"
+	sshResult, err := SSH(cmd, getMasterHost()+":22", testContext.Provider)
+	if err != nil || sshResult.Code != 0 {
+		return result, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
+	}
+	samples, err := extractMetricSamples(sshResult.Stdout)
+	if err != nil {
+		return result, err
+	}
+
+	for _, sample := range samples {
+		var metric *LatencyMetric = nil
+		switch sample.Metric[model.MetricNameLabel] {
+		case "scheduler_scheduling_algorithm_latency_microseconds":
+			metric = &result.Scheduling
+		case "scheduler_binding_latency_microseconds":
+			metric = &result.Binding
+		case "scheduler_e2e_scheduling_latency_microseconds":
+			metric = &result.Total
+		}
+		if metric == nil {
+			continue
+		}
+
+		latency := sample.Value
+		quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
+		if err != nil {
+			return result, err
+		}
+		setQuantile(metric, quantile, time.Duration(int64(latency))*time.Microsecond)
+	}
+	return result, nil
+}
+
+// Verifies (currently just by logging them) the scheduling latencies.
+func VerifySchedulerLatency() error {
+	latency, err := getSchedulingLatency()
+	if err != nil {
+		return err
+	}
+	Logf("Scheduling latency: %s", prettyPrintJSON(latency))
+
+	// TODO: Add some reasonable checks once we know more about the values.
+	return nil
+}
+
+func prettyPrintJSON(metrics interface{}) string {
+	output := &bytes.Buffer{}
+	if err := json.NewEncoder(output).Encode(metrics); err != nil {
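
A final sketch, not part of the diff: getSchedulingLatency consumes the Prometheus text exposition format served on the scheduler's :10251/metrics endpoint, where each summary appears as one sample line per quantile plus _sum and _count lines. The fragment below parses such a blob with the prometheus/common text parser and prints the quantile samples that setQuantile maps onto Perc50/Perc90/Perc99; the sample text and its values are made up, and extractMetricSamples in the real code may be implemented differently.

package main

import (
	"fmt"
	"strings"

	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
)

// Fragment of the text format the curl in getSchedulingLatency would
// return (values are illustrative).
const blob = `# TYPE scheduler_binding_latency_microseconds summary
scheduler_binding_latency_microseconds{quantile="0.5"} 1220
scheduler_binding_latency_microseconds{quantile="0.9"} 2380
scheduler_binding_latency_microseconds{quantile="0.99"} 4100
scheduler_binding_latency_microseconds_sum 152000
scheduler_binding_latency_microseconds_count 97
`

func main() {
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(blob))
	if err != nil {
		panic(err)
	}
	for name, mf := range families {
		if mf.GetType() != dto.MetricType_SUMMARY {
			continue
		}
		for _, m := range mf.GetMetric() {
			for _, q := range m.GetSummary().GetQuantile() {
				// The quantile label (0.5, 0.9, 0.99) is what setQuantile in the
				// diff maps onto Perc50/Perc90/Perc99; values are microseconds.
				fmt.Printf("%s quantile=%v value=%vus\n", name, q.GetQuantile(), q.GetValue())
			}
		}
	}
}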