From b0dd166fa300e5e2181b344d150b7706abb932aa Mon Sep 17 00:00:00 2001 From: Shyam Jeedigunta Date: Wed, 21 Mar 2018 15:24:56 +0100 Subject: [PATCH 1/2] Capture different parts of pod-startup latency as metrics --- test/e2e/framework/metrics_util.go | 20 ++++++++++++-------- test/e2e/framework/perf_util.go | 27 +++++++++++++++++---------- test/e2e/scalability/density.go | 22 +++++++++++++++------- test/e2e_node/density_test.go | 6 +++--- 4 files changed, 47 insertions(+), 28 deletions(-) diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go index 2753de0d4e..284beafde2 100644 --- a/test/e2e/framework/metrics_util.go +++ b/test/e2e/framework/metrics_util.go @@ -188,7 +188,11 @@ type LatencyMetric struct { } type PodStartupLatency struct { - Latency LatencyMetric `json:"latency"` + CreateToScheduleLatency LatencyMetric `json:"createToScheduleLatency"` + ScheduleToRunLatency LatencyMetric `json:"scheduleToRunLatency"` + RunToWatchLatency LatencyMetric `json:"runToWatchLatency"` + ScheduleToWatchLatency LatencyMetric `json:"scheduleToWatchLatency"` + E2ELatency LatencyMetric `json:"e2eLatency"` } func (l *PodStartupLatency) SummaryKind() string { @@ -398,17 +402,17 @@ func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIRespons return badMetrics, metrics, nil } -// Verifies whether 50, 90 and 99th percentiles of PodStartupLatency are +// Verifies whether 50, 90 and 99th percentiles of e2e PodStartupLatency are // within the threshold. func VerifyPodStartupLatency(latency *PodStartupLatency) error { - if latency.Latency.Perc50 > podStartupThreshold { - return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.Latency.Perc50) + if latency.E2ELatency.Perc50 > podStartupThreshold { + return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.E2ELatency.Perc50) } - if latency.Latency.Perc90 > podStartupThreshold { - return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.Latency.Perc90) + if latency.E2ELatency.Perc90 > podStartupThreshold { + return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.E2ELatency.Perc90) } - if latency.Latency.Perc99 > podStartupThreshold { - return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.Latency.Perc99) + if latency.E2ELatency.Perc99 > podStartupThreshold { + return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.E2ELatency.Perc99) } return nil } diff --git a/test/e2e/framework/perf_util.go b/test/e2e/framework/perf_util.go index 67d9ada570..18f94cb50c 100644 --- a/test/e2e/framework/perf_util.go +++ b/test/e2e/framework/perf_util.go @@ -53,22 +53,29 @@ func ApiCallToPerfData(apicalls *APIResponsiveness) *perftype.PerfData { return perfData } -// PodStartupLatencyToPerfData transforms PodStartupLatency to PerfData. -func PodStartupLatencyToPerfData(latency *PodStartupLatency) *perftype.PerfData { - perfData := &perftype.PerfData{Version: currentApiCallMetricsVersion} - item := perftype.DataItem{ +func latencyToPerfData(l LatencyMetric, name string) perftype.DataItem { + return perftype.DataItem{ Data: map[string]float64{ - "Perc50": float64(latency.Latency.Perc50) / 1000000, // us -> ms - "Perc90": float64(latency.Latency.Perc90) / 1000000, - "Perc99": float64(latency.Latency.Perc99) / 1000000, - "Perc100": float64(latency.Latency.Perc100) / 1000000, + "Perc50": float64(l.Perc50) / 1000000, // us -> ms + "Perc90": float64(l.Perc90) / 1000000, + "Perc99": float64(l.Perc99) / 1000000, + "Perc100": float64(l.Perc100) / 1000000, }, Unit: "ms", Labels: map[string]string{ - "Metric": "pod_startup", + "Metric": name, }, } - perfData.DataItems = append(perfData.DataItems, item) +} + +// PodStartupLatencyToPerfData transforms PodStartupLatency to PerfData. +func PodStartupLatencyToPerfData(latency *PodStartupLatency) *perftype.PerfData { + perfData := &perftype.PerfData{Version: currentApiCallMetricsVersion} + perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.CreateToScheduleLatency, "create_to_schedule")) + perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.ScheduleToRunLatency, "schedule_to_run")) + perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.RunToWatchLatency, "run_to_watch")) + perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.ScheduleToWatchLatency, "schedule_to_watch")) + perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.E2ELatency, "pod_startup")) return perfData } diff --git a/test/e2e/scalability/density.go b/test/e2e/scalability/density.go index 788259eab5..553bc7a316 100644 --- a/test/e2e/scalability/density.go +++ b/test/e2e/scalability/density.go @@ -829,15 +829,23 @@ var _ = SIGDescribe("Density", func() { sort.Sort(framework.LatencySlice(schedToWatchLag)) sort.Sort(framework.LatencySlice(e2eLag)) - framework.PrintLatencies(scheduleLag, "worst schedule latencies") - framework.PrintLatencies(startupLag, "worst run-after-schedule latencies") - framework.PrintLatencies(watchLag, "worst watch latencies") - framework.PrintLatencies(schedToWatchLag, "worst scheduled-to-end total latencies") - framework.PrintLatencies(e2eLag, "worst e2e total latencies") + framework.PrintLatencies(scheduleLag, "worst create-to-schedule latencies") + framework.PrintLatencies(startupLag, "worst schedule-to-run latencies") + framework.PrintLatencies(watchLag, "worst run-to-watch latencies") + framework.PrintLatencies(schedToWatchLag, "worst schedule-to-watch latencies") + framework.PrintLatencies(e2eLag, "worst e2e latencies") + + // Capture latency metrics related to pod-startup. + podStartupLatency := &framework.PodStartupLatency{ + CreateToScheduleLatency: framework.ExtractLatencyMetrics(scheduleLag), + ScheduleToRunLatency: framework.ExtractLatencyMetrics(startupLag), + RunToWatchLatency: framework.ExtractLatencyMetrics(watchLag), + ScheduleToWatchLatency: framework.ExtractLatencyMetrics(schedToWatchLag), + E2ELatency: framework.ExtractLatencyMetrics(e2eLag), + } + f.TestSummaries = append(f.TestSummaries, podStartupLatency) // Test whether e2e pod startup time is acceptable. - podStartupLatency := &framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLag)} - f.TestSummaries = append(f.TestSummaries, podStartupLatency) framework.ExpectNoError(framework.VerifyPodStartupLatency(podStartupLatency)) framework.LogSuspiciousLatency(startupLag, e2eLag, nodeCount, c) diff --git a/test/e2e_node/density_test.go b/test/e2e_node/density_test.go index 592a5f86bb..845fff8cfe 100644 --- a/test/e2e_node/density_test.go +++ b/test/e2e_node/density_test.go @@ -556,14 +556,14 @@ func logAndVerifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyD latencyMetrics, _ := getPodStartLatency(kubeletAddr) framework.Logf("Kubelet Prometheus metrics (not reset):\n%s", framework.PrettyPrintJSON(latencyMetrics)) - podCreateLatency := framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLags)} + podStartupLatency := framework.ExtractLatencyMetrics(e2eLags) // log latency perf data - logPerfData(getLatencyPerfData(podCreateLatency.Latency, testInfo), "latency") + logPerfData(getLatencyPerfData(podStartupLatency, testInfo), "latency") if isVerify { // check whether e2e pod startup time is acceptable. - framework.ExpectNoError(verifyPodStartupLatency(podStartupLimits, podCreateLatency.Latency)) + framework.ExpectNoError(verifyPodStartupLatency(podStartupLimits, podStartupLatency)) // check bactch pod creation latency if podBatchStartupLimit > 0 { From 0f0c754eb433acb501fe64fa46007a922aa68715 Mon Sep 17 00:00:00 2001 From: Shyam Jeedigunta Date: Wed, 21 Mar 2018 15:39:01 +0100 Subject: [PATCH 2/2] Get rid of duplicate VerifyPodStartupLatency util in node density tests --- test/e2e/framework/metrics_util.go | 19 +++++++++---------- test/e2e/scalability/density.go | 8 +++++++- test/e2e_node/density_test.go | 17 +---------------- 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go index 284beafde2..3b6b95c434 100644 --- a/test/e2e/framework/metrics_util.go +++ b/test/e2e/framework/metrics_util.go @@ -43,7 +43,6 @@ const ( // NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node. NodeStartupThreshold = 4 * time.Second - podStartupThreshold time.Duration = 5 * time.Second // We are setting 1s threshold for apicalls even in small clusters to avoid flakes. // The problem is that if long GC is happening in small clusters (where we have e.g. // 1-core master machines) and tests are pretty short, it may consume significant @@ -402,17 +401,17 @@ func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIRespons return badMetrics, metrics, nil } -// Verifies whether 50, 90 and 99th percentiles of e2e PodStartupLatency are -// within the threshold. -func VerifyPodStartupLatency(latency *PodStartupLatency) error { - if latency.E2ELatency.Perc50 > podStartupThreshold { - return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.E2ELatency.Perc50) +// Verifies whether 50, 90 and 99th percentiles of a latency metric are +// within the expected threshold. +func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName string) error { + if actual.Perc50 > threshold.Perc50 { + return fmt.Errorf("too high %v latency 50th percentile: %v", metricName, actual.Perc50) } - if latency.E2ELatency.Perc90 > podStartupThreshold { - return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.E2ELatency.Perc90) + if actual.Perc90 > threshold.Perc90 { + return fmt.Errorf("too high %v latency 90th percentile: %v", metricName, actual.Perc90) } - if latency.E2ELatency.Perc99 > podStartupThreshold { - return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.E2ELatency.Perc99) + if actual.Perc99 > threshold.Perc99 { + return fmt.Errorf("too high %v latency 99th percentile: %v", metricName, actual.Perc99) } return nil } diff --git a/test/e2e/scalability/density.go b/test/e2e/scalability/density.go index 553bc7a316..4f0c197934 100644 --- a/test/e2e/scalability/density.go +++ b/test/e2e/scalability/density.go @@ -50,6 +50,7 @@ import ( ) const ( + PodStartupLatencyThreshold = 5 * time.Second MinSaturationThreshold = 2 * time.Minute MinPodsPerSecondThroughput = 8 DensityPollInterval = 10 * time.Second @@ -846,7 +847,12 @@ var _ = SIGDescribe("Density", func() { f.TestSummaries = append(f.TestSummaries, podStartupLatency) // Test whether e2e pod startup time is acceptable. - framework.ExpectNoError(framework.VerifyPodStartupLatency(podStartupLatency)) + podStartupLatencyThreshold := framework.LatencyMetric{ + Perc50: PodStartupLatencyThreshold, + Perc90: PodStartupLatencyThreshold, + Perc99: PodStartupLatencyThreshold, + } + framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(podStartupLatencyThreshold, podStartupLatency.E2ELatency, "pod startup")) framework.LogSuspiciousLatency(startupLag, e2eLag, nodeCount, c) latencyMeasurementPhase.End() diff --git a/test/e2e_node/density_test.go b/test/e2e_node/density_test.go index 845fff8cfe..9ee85cbe1d 100644 --- a/test/e2e_node/density_test.go +++ b/test/e2e_node/density_test.go @@ -472,21 +472,6 @@ func getPodStartLatency(node string) (framework.KubeletLatencyMetrics, error) { return latencyMetrics, nil } -// verifyPodStartupLatency verifies whether 50, 90 and 99th percentiles of PodStartupLatency are -// within the threshold. -func verifyPodStartupLatency(expect, actual framework.LatencyMetric) error { - if actual.Perc50 > expect.Perc50 { - return fmt.Errorf("too high pod startup latency 50th percentile: %v", actual.Perc50) - } - if actual.Perc90 > expect.Perc90 { - return fmt.Errorf("too high pod startup latency 90th percentile: %v", actual.Perc90) - } - if actual.Perc99 > expect.Perc99 { - return fmt.Errorf("too high pod startup latency 99th percentile: %v", actual.Perc99) - } - return nil -} - // newInformerWatchPod creates an informer to check whether all pods are running. func newInformerWatchPod(f *framework.Framework, mutex *sync.Mutex, watchTimes map[string]metav1.Time, podType string) cache.Controller { ns := f.Namespace.Name @@ -563,7 +548,7 @@ func logAndVerifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyD if isVerify { // check whether e2e pod startup time is acceptable. - framework.ExpectNoError(verifyPodStartupLatency(podStartupLimits, podStartupLatency)) + framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(podStartupLimits, podStartupLatency, "pod startup")) // check bactch pod creation latency if podBatchStartupLimit > 0 {