From b0dd166fa300e5e2181b344d150b7706abb932aa Mon Sep 17 00:00:00 2001
From: Shyam Jeedigunta <shyamjvs@google.com>
Date: Wed, 21 Mar 2018 15:24:56 +0100
Subject: [PATCH 1/2] Capture different parts of pod-startup latency as metrics

---
 test/e2e/framework/metrics_util.go | 20 ++++++++++++--------
 test/e2e/framework/perf_util.go    | 27 +++++++++++++++++----------
 test/e2e/scalability/density.go    | 22 +++++++++++++++-------
 test/e2e_node/density_test.go      |  6 +++---
 4 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go
index 2753de0d4e..284beafde2 100644
--- a/test/e2e/framework/metrics_util.go
+++ b/test/e2e/framework/metrics_util.go
@@ -188,7 +188,11 @@ type LatencyMetric struct {
 }
 
 type PodStartupLatency struct {
-	Latency LatencyMetric `json:"latency"`
+	CreateToScheduleLatency LatencyMetric `json:"createToScheduleLatency"`
+	ScheduleToRunLatency    LatencyMetric `json:"scheduleToRunLatency"`
+	RunToWatchLatency       LatencyMetric `json:"runToWatchLatency"`
+	ScheduleToWatchLatency  LatencyMetric `json:"scheduleToWatchLatency"`
+	E2ELatency              LatencyMetric `json:"e2eLatency"`
 }
 
 func (l *PodStartupLatency) SummaryKind() string {
@@ -398,17 +402,17 @@ func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIRespons
 	return badMetrics, metrics, nil
 }
 
-// Verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
+// Verifies whether 50, 90 and 99th percentiles of e2e PodStartupLatency are
 // within the threshold.
 func VerifyPodStartupLatency(latency *PodStartupLatency) error {
-	if latency.Latency.Perc50 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.Latency.Perc50)
+	if latency.E2ELatency.Perc50 > podStartupThreshold {
+		return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.E2ELatency.Perc50)
 	}
-	if latency.Latency.Perc90 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.Latency.Perc90)
+	if latency.E2ELatency.Perc90 > podStartupThreshold {
+		return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.E2ELatency.Perc90)
 	}
-	if latency.Latency.Perc99 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.Latency.Perc99)
+	if latency.E2ELatency.Perc99 > podStartupThreshold {
+		return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.E2ELatency.Perc99)
 	}
 	return nil
 }
diff --git a/test/e2e/framework/perf_util.go b/test/e2e/framework/perf_util.go
index 67d9ada570..18f94cb50c 100644
--- a/test/e2e/framework/perf_util.go
+++ b/test/e2e/framework/perf_util.go
@@ -53,22 +53,29 @@ func ApiCallToPerfData(apicalls *APIResponsiveness) *perftype.PerfData {
 	return perfData
 }
 
-// PodStartupLatencyToPerfData transforms PodStartupLatency to PerfData.
-func PodStartupLatencyToPerfData(latency *PodStartupLatency) *perftype.PerfData {
-	perfData := &perftype.PerfData{Version: currentApiCallMetricsVersion}
-	item := perftype.DataItem{
+func latencyToPerfData(l LatencyMetric, name string) perftype.DataItem {
+	return perftype.DataItem{
 		Data: map[string]float64{
-			"Perc50":  float64(latency.Latency.Perc50) / 1000000, // us -> ms
-			"Perc90":  float64(latency.Latency.Perc90) / 1000000,
-			"Perc99":  float64(latency.Latency.Perc99) / 1000000,
-			"Perc100": float64(latency.Latency.Perc100) / 1000000,
+			"Perc50":  float64(l.Perc50) / 1000000, // us -> ms
+			"Perc90":  float64(l.Perc90) / 1000000,
+			"Perc99":  float64(l.Perc99) / 1000000,
+			"Perc100": float64(l.Perc100) / 1000000,
 		},
 		Unit: "ms",
 		Labels: map[string]string{
-			"Metric": "pod_startup",
+			"Metric": name,
 		},
 	}
-	perfData.DataItems = append(perfData.DataItems, item)
+}
+
+// PodStartupLatencyToPerfData transforms PodStartupLatency to PerfData.
+func PodStartupLatencyToPerfData(latency *PodStartupLatency) *perftype.PerfData {
+	perfData := &perftype.PerfData{Version: currentApiCallMetricsVersion}
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.CreateToScheduleLatency, "create_to_schedule"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.ScheduleToRunLatency, "schedule_to_run"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.RunToWatchLatency, "run_to_watch"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.ScheduleToWatchLatency, "schedule_to_watch"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.E2ELatency, "pod_startup"))
 	return perfData
 }
 
diff --git a/test/e2e/scalability/density.go b/test/e2e/scalability/density.go
index 788259eab5..553bc7a316 100644
--- a/test/e2e/scalability/density.go
+++ b/test/e2e/scalability/density.go
@@ -829,15 +829,23 @@ var _ = SIGDescribe("Density", func() {
 				sort.Sort(framework.LatencySlice(schedToWatchLag))
 				sort.Sort(framework.LatencySlice(e2eLag))
 
-				framework.PrintLatencies(scheduleLag, "worst schedule latencies")
-				framework.PrintLatencies(startupLag, "worst run-after-schedule latencies")
-				framework.PrintLatencies(watchLag, "worst watch latencies")
-				framework.PrintLatencies(schedToWatchLag, "worst scheduled-to-end total latencies")
-				framework.PrintLatencies(e2eLag, "worst e2e total latencies")
+				framework.PrintLatencies(scheduleLag, "worst create-to-schedule latencies")
+				framework.PrintLatencies(startupLag, "worst schedule-to-run latencies")
+				framework.PrintLatencies(watchLag, "worst run-to-watch latencies")
+				framework.PrintLatencies(schedToWatchLag, "worst schedule-to-watch latencies")
+				framework.PrintLatencies(e2eLag, "worst e2e latencies")
+
+				// Capture latency metrics related to pod-startup.
+				podStartupLatency := &framework.PodStartupLatency{
+					CreateToScheduleLatency: framework.ExtractLatencyMetrics(scheduleLag),
+					ScheduleToRunLatency:    framework.ExtractLatencyMetrics(startupLag),
+					RunToWatchLatency:       framework.ExtractLatencyMetrics(watchLag),
+					ScheduleToWatchLatency:  framework.ExtractLatencyMetrics(schedToWatchLag),
+					E2ELatency:              framework.ExtractLatencyMetrics(e2eLag),
+				}
+				f.TestSummaries = append(f.TestSummaries, podStartupLatency)
 
 				// Test whether e2e pod startup time is acceptable.
-				podStartupLatency := &framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLag)}
-				f.TestSummaries = append(f.TestSummaries, podStartupLatency)
 				framework.ExpectNoError(framework.VerifyPodStartupLatency(podStartupLatency))
 
 				framework.LogSuspiciousLatency(startupLag, e2eLag, nodeCount, c)
diff --git a/test/e2e_node/density_test.go b/test/e2e_node/density_test.go
index 592a5f86bb..845fff8cfe 100644
--- a/test/e2e_node/density_test.go
+++ b/test/e2e_node/density_test.go
@@ -556,14 +556,14 @@ func logAndVerifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyD
 	latencyMetrics, _ := getPodStartLatency(kubeletAddr)
 	framework.Logf("Kubelet Prometheus metrics (not reset):\n%s", framework.PrettyPrintJSON(latencyMetrics))
 
-	podCreateLatency := framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLags)}
+	podStartupLatency := framework.ExtractLatencyMetrics(e2eLags)
 
 	// log latency perf data
-	logPerfData(getLatencyPerfData(podCreateLatency.Latency, testInfo), "latency")
+	logPerfData(getLatencyPerfData(podStartupLatency, testInfo), "latency")
 
 	if isVerify {
 		// check whether e2e pod startup time is acceptable.
-		framework.ExpectNoError(verifyPodStartupLatency(podStartupLimits, podCreateLatency.Latency))
+		framework.ExpectNoError(verifyPodStartupLatency(podStartupLimits, podStartupLatency))
 
 		// check bactch pod creation latency
 		if podBatchStartupLimit > 0 {

From 0f0c754eb433acb501fe64fa46007a922aa68715 Mon Sep 17 00:00:00 2001
From: Shyam Jeedigunta <shyamjvs@google.com>
Date: Wed, 21 Mar 2018 15:39:01 +0100
Subject: [PATCH 2/2] Get rid of duplicate VerifyPodStartupLatency util in node
 density tests

---
 test/e2e/framework/metrics_util.go | 19 +++++++++----------
 test/e2e/scalability/density.go    |  8 +++++++-
 test/e2e_node/density_test.go      | 17 +----------------
 3 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go
index 284beafde2..3b6b95c434 100644
--- a/test/e2e/framework/metrics_util.go
+++ b/test/e2e/framework/metrics_util.go
@@ -43,7 +43,6 @@ const (
 	// NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
 	NodeStartupThreshold = 4 * time.Second
 
-	podStartupThreshold time.Duration = 5 * time.Second
 	// We are setting 1s threshold for apicalls even in small clusters to avoid flakes.
 	// The problem is that if long GC is happening in small clusters (where we have e.g.
 	// 1-core master machines) and tests are pretty short, it may consume significant
@@ -402,17 +401,17 @@ func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIRespons
 	return badMetrics, metrics, nil
 }
 
-// Verifies whether 50, 90 and 99th percentiles of e2e PodStartupLatency are
-// within the threshold.
-func VerifyPodStartupLatency(latency *PodStartupLatency) error {
-	if latency.E2ELatency.Perc50 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.E2ELatency.Perc50)
+// Verifies whether 50, 90 and 99th percentiles of a latency metric are
+// within the expected threshold.
+func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName string) error {
+	if actual.Perc50 > threshold.Perc50 {
+		return fmt.Errorf("too high %v latency 50th percentile: %v", metricName, actual.Perc50)
 	}
-	if latency.E2ELatency.Perc90 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.E2ELatency.Perc90)
+	if actual.Perc90 > threshold.Perc90 {
+		return fmt.Errorf("too high %v latency 90th percentile: %v", metricName, actual.Perc90)
 	}
-	if latency.E2ELatency.Perc99 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.E2ELatency.Perc99)
+	if actual.Perc99 > threshold.Perc99 {
+		return fmt.Errorf("too high %v latency 99th percentile: %v", metricName, actual.Perc99)
 	}
 	return nil
 }
diff --git a/test/e2e/scalability/density.go b/test/e2e/scalability/density.go
index 553bc7a316..4f0c197934 100644
--- a/test/e2e/scalability/density.go
+++ b/test/e2e/scalability/density.go
@@ -50,6 +50,7 @@ import (
 )
 
 const (
+	PodStartupLatencyThreshold = 5 * time.Second
 	MinSaturationThreshold     = 2 * time.Minute
 	MinPodsPerSecondThroughput = 8
 	DensityPollInterval        = 10 * time.Second
@@ -846,7 +847,12 @@ var _ = SIGDescribe("Density", func() {
 				f.TestSummaries = append(f.TestSummaries, podStartupLatency)
 
 				// Test whether e2e pod startup time is acceptable.
-				framework.ExpectNoError(framework.VerifyPodStartupLatency(podStartupLatency))
+				podStartupLatencyThreshold := framework.LatencyMetric{
+					Perc50: PodStartupLatencyThreshold,
+					Perc90: PodStartupLatencyThreshold,
+					Perc99: PodStartupLatencyThreshold,
+				}
+				framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(podStartupLatencyThreshold, podStartupLatency.E2ELatency, "pod startup"))
 
 				framework.LogSuspiciousLatency(startupLag, e2eLag, nodeCount, c)
 				latencyMeasurementPhase.End()
diff --git a/test/e2e_node/density_test.go b/test/e2e_node/density_test.go
index 845fff8cfe..9ee85cbe1d 100644
--- a/test/e2e_node/density_test.go
+++ b/test/e2e_node/density_test.go
@@ -472,21 +472,6 @@ func getPodStartLatency(node string) (framework.KubeletLatencyMetrics, error) {
 	return latencyMetrics, nil
 }
 
-// verifyPodStartupLatency verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
-// within the threshold.
-func verifyPodStartupLatency(expect, actual framework.LatencyMetric) error {
-	if actual.Perc50 > expect.Perc50 {
-		return fmt.Errorf("too high pod startup latency 50th percentile: %v", actual.Perc50)
-	}
-	if actual.Perc90 > expect.Perc90 {
-		return fmt.Errorf("too high pod startup latency 90th percentile: %v", actual.Perc90)
-	}
-	if actual.Perc99 > expect.Perc99 {
-		return fmt.Errorf("too high pod startup latency 99th percentile: %v", actual.Perc99)
-	}
-	return nil
-}
-
 // newInformerWatchPod creates an informer to check whether all pods are running.
 func newInformerWatchPod(f *framework.Framework, mutex *sync.Mutex, watchTimes map[string]metav1.Time, podType string) cache.Controller {
 	ns := f.Namespace.Name
@@ -563,7 +548,7 @@ func logAndVerifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyD
 
 	if isVerify {
 		// check whether e2e pod startup time is acceptable.
-		framework.ExpectNoError(verifyPodStartupLatency(podStartupLimits, podStartupLatency))
+		framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(podStartupLimits, podStartupLatency, "pod startup"))
 
 		// check bactch pod creation latency
 		if podBatchStartupLimit > 0 {