Merge pull request #61478 from shyamjvs/capture-pod-startup-phases-as-metrics

Automatic merge from submit-queue (batch tested with PRs 61378, 60915, 61499, 61507, 61478). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Capture pod startup phases as metrics

Learning from https://github.com/kubernetes/kubernetes/issues/60589, we should also start collecting and graphing the sub-phases of pod-startup latency.
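
For readers unfamiliar with the phases: pod startup decomposes into create-to-schedule, schedule-to-run, and run-to-watch segments, whose sum is the e2e latency. A minimal, self-contained sketch of that arithmetic (the timestamps and durations below are hypothetical, not values from the test):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical timestamps; the density test derives the real ones from
	// pod creation, scheduling, running, and watch-observation events.
	createTime := time.Now()
	scheduleTime := createTime.Add(300 * time.Millisecond) // scheduler binds the pod to a node
	runTime := scheduleTime.Add(2 * time.Second)           // kubelet reports containers running
	watchTime := runTime.Add(150 * time.Millisecond)       // running status observed via watch

	fmt.Println("create-to-schedule:", scheduleTime.Sub(createTime)) // 300ms
	fmt.Println("schedule-to-run:   ", runTime.Sub(scheduleTime))    // 2s
	fmt.Println("run-to-watch:      ", watchTime.Sub(runTime))       // 150ms
	fmt.Println("schedule-to-watch: ", watchTime.Sub(scheduleTime))  // 2.15s
	fmt.Println("e2e pod startup:   ", watchTime.Sub(createTime))    // 2.45s
}
```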

/sig scalability
/kind feature
/priority important-soon
/cc @wojtek-t 

```release-note
NONE
```
Kubernetes Submit Queue 2018-03-22 07:15:33 -07:00 committed by GitHub
commit 1b6b2ee790
4 changed files with 56 additions and 47 deletions


@@ -43,7 +43,6 @@ const (
 	// NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
 	NodeStartupThreshold = 4 * time.Second
-	podStartupThreshold time.Duration = 5 * time.Second
 	// We are setting 1s threshold for apicalls even in small clusters to avoid flakes.
 	// The problem is that if long GC is happening in small clusters (where we have e.g.
 	// 1-core master machines) and tests are pretty short, it may consume significant
@@ -188,7 +187,11 @@ type LatencyMetric struct {
 }
 
 type PodStartupLatency struct {
-	Latency LatencyMetric `json:"latency"`
+	CreateToScheduleLatency LatencyMetric `json:"createToScheduleLatency"`
+	ScheduleToRunLatency LatencyMetric `json:"scheduleToRunLatency"`
+	RunToWatchLatency LatencyMetric `json:"runToWatchLatency"`
+	ScheduleToWatchLatency LatencyMetric `json:"scheduleToWatchLatency"`
+	E2ELatency LatencyMetric `json:"e2eLatency"`
 }
 
 func (l *PodStartupLatency) SummaryKind() string {
@@ -398,17 +401,17 @@ func HighLatencyRequests(c clientset.Interface, nodeCount int) (int, *APIResponsiveness
 	return badMetrics, metrics, nil
 }
 
-// Verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
-// within the threshold.
-func VerifyPodStartupLatency(latency *PodStartupLatency) error {
-	if latency.Latency.Perc50 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.Latency.Perc50)
+// Verifies whether 50, 90 and 99th percentiles of a latency metric are
+// within the expected threshold.
+func VerifyLatencyWithinThreshold(threshold, actual LatencyMetric, metricName string) error {
+	if actual.Perc50 > threshold.Perc50 {
+		return fmt.Errorf("too high %v latency 50th percentile: %v", metricName, actual.Perc50)
 	}
-	if latency.Latency.Perc90 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.Latency.Perc90)
+	if actual.Perc90 > threshold.Perc90 {
+		return fmt.Errorf("too high %v latency 90th percentile: %v", metricName, actual.Perc90)
 	}
-	if latency.Latency.Perc99 > podStartupThreshold {
-		return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.Latency.Perc99)
+	if actual.Perc99 > threshold.Perc99 {
+		return fmt.Errorf("too high %v latency 99th percentile: %v", metricName, actual.Perc99)
 	}
 	return nil
 }
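
For illustration, here is how the new generic helper can be invoked. This is a sketch with made-up values; real measurements come from `framework.ExtractLatencyMetrics` over per-pod latency data:

```go
import (
	"time"

	"k8s.io/kubernetes/test/e2e/framework"
)

// verifyExample shows the calling convention of the new generic verifier.
func verifyExample() {
	threshold := framework.LatencyMetric{
		Perc50: 5 * time.Second,
		Perc90: 5 * time.Second,
		Perc99: 5 * time.Second,
	}
	measured := framework.LatencyMetric{ // illustrative numbers
		Perc50: 800 * time.Millisecond,
		Perc90: 2 * time.Second,
		Perc99: 4 * time.Second,
	}
	// Fails with e.g. "too high pod startup latency 99th percentile: ..."
	// whenever a measured percentile exceeds its threshold counterpart.
	framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(threshold, measured, "pod startup"))
}
```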


@@ -53,22 +53,29 @@ func ApiCallToPerfData(apicalls *APIResponsiveness) *perftype.PerfData {
 	return perfData
 }
 
-// PodStartupLatencyToPerfData transforms PodStartupLatency to PerfData.
-func PodStartupLatencyToPerfData(latency *PodStartupLatency) *perftype.PerfData {
-	perfData := &perftype.PerfData{Version: currentApiCallMetricsVersion}
-	item := perftype.DataItem{
+func latencyToPerfData(l LatencyMetric, name string) perftype.DataItem {
+	return perftype.DataItem{
 		Data: map[string]float64{
-			"Perc50": float64(latency.Latency.Perc50) / 1000000, // us -> ms
-			"Perc90": float64(latency.Latency.Perc90) / 1000000,
-			"Perc99": float64(latency.Latency.Perc99) / 1000000,
-			"Perc100": float64(latency.Latency.Perc100) / 1000000,
+			"Perc50": float64(l.Perc50) / 1000000, // us -> ms
+			"Perc90": float64(l.Perc90) / 1000000,
+			"Perc99": float64(l.Perc99) / 1000000,
+			"Perc100": float64(l.Perc100) / 1000000,
 		},
 		Unit: "ms",
 		Labels: map[string]string{
-			"Metric": "pod_startup",
+			"Metric": name,
 		},
 	}
-	perfData.DataItems = append(perfData.DataItems, item)
+}
+
+// PodStartupLatencyToPerfData transforms PodStartupLatency to PerfData.
+func PodStartupLatencyToPerfData(latency *PodStartupLatency) *perftype.PerfData {
+	perfData := &perftype.PerfData{Version: currentApiCallMetricsVersion}
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.CreateToScheduleLatency, "create_to_schedule"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.ScheduleToRunLatency, "schedule_to_run"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.RunToWatchLatency, "run_to_watch"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.ScheduleToWatchLatency, "schedule_to_watch"))
+	perfData.DataItems = append(perfData.DataItems, latencyToPerfData(latency.E2ELatency, "pod_startup"))
 	return perfData
 }
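
After this refactor, one `PodStartupLatency` summary fans out into five `DataItem`s, labeled `create_to_schedule`, `schedule_to_run`, `run_to_watch`, `schedule_to_watch`, and `pod_startup` (the e2e total). A hedged sketch of consuming the result, where `podStartupLatency` is the struct assembled in the density test below:

```go
// Illustrative consumption of the refactored summary; assumes only the
// perftype.DataItem fields shown in the diff (Data, Unit, Labels).
perfData := framework.PodStartupLatencyToPerfData(podStartupLatency)
for _, item := range perfData.DataItems {
	fmt.Printf("%s (%s): Perc50=%.1f Perc99=%.1f\n",
		item.Labels["Metric"], item.Unit, item.Data["Perc50"], item.Data["Perc99"])
}
```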


@@ -50,6 +50,7 @@ import (
 )
 
 const (
+	PodStartupLatencyThreshold = 5 * time.Second
 	MinSaturationThreshold = 2 * time.Minute
 	MinPodsPerSecondThroughput = 8
 	DensityPollInterval = 10 * time.Second
@@ -829,16 +830,29 @@ var _ = SIGDescribe("Density", func() {
 		sort.Sort(framework.LatencySlice(schedToWatchLag))
 		sort.Sort(framework.LatencySlice(e2eLag))
 
-		framework.PrintLatencies(scheduleLag, "worst schedule latencies")
-		framework.PrintLatencies(startupLag, "worst run-after-schedule latencies")
-		framework.PrintLatencies(watchLag, "worst watch latencies")
-		framework.PrintLatencies(schedToWatchLag, "worst scheduled-to-end total latencies")
-		framework.PrintLatencies(e2eLag, "worst e2e total latencies")
+		framework.PrintLatencies(scheduleLag, "worst create-to-schedule latencies")
+		framework.PrintLatencies(startupLag, "worst schedule-to-run latencies")
+		framework.PrintLatencies(watchLag, "worst run-to-watch latencies")
+		framework.PrintLatencies(schedToWatchLag, "worst schedule-to-watch latencies")
+		framework.PrintLatencies(e2eLag, "worst e2e latencies")
+
+		// Capture latency metrics related to pod-startup.
+		podStartupLatency := &framework.PodStartupLatency{
+			CreateToScheduleLatency: framework.ExtractLatencyMetrics(scheduleLag),
+			ScheduleToRunLatency: framework.ExtractLatencyMetrics(startupLag),
+			RunToWatchLatency: framework.ExtractLatencyMetrics(watchLag),
+			ScheduleToWatchLatency: framework.ExtractLatencyMetrics(schedToWatchLag),
+			E2ELatency: framework.ExtractLatencyMetrics(e2eLag),
+		}
+		f.TestSummaries = append(f.TestSummaries, podStartupLatency)
 
 		// Test whether e2e pod startup time is acceptable.
-		podStartupLatency := &framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLag)}
-		f.TestSummaries = append(f.TestSummaries, podStartupLatency)
-		framework.ExpectNoError(framework.VerifyPodStartupLatency(podStartupLatency))
+		podStartupLatencyThreshold := framework.LatencyMetric{
+			Perc50: PodStartupLatencyThreshold,
+			Perc90: PodStartupLatencyThreshold,
+			Perc99: PodStartupLatencyThreshold,
+		}
+		framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(podStartupLatencyThreshold, podStartupLatency.E2ELatency, "pod startup"))
 
 		framework.LogSuspiciousLatency(startupLag, e2eLag, nodeCount, c)
 		latencyMeasurementPhase.End()


@@ -472,21 +472,6 @@ func getPodStartLatency(node string) (framework.KubeletLatencyMetrics, error) {
 	return latencyMetrics, nil
 }
 
-// verifyPodStartupLatency verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
-// within the threshold.
-func verifyPodStartupLatency(expect, actual framework.LatencyMetric) error {
-	if actual.Perc50 > expect.Perc50 {
-		return fmt.Errorf("too high pod startup latency 50th percentile: %v", actual.Perc50)
-	}
-	if actual.Perc90 > expect.Perc90 {
-		return fmt.Errorf("too high pod startup latency 90th percentile: %v", actual.Perc90)
-	}
-	if actual.Perc99 > expect.Perc99 {
-		return fmt.Errorf("too high pod startup latency 99th percentile: %v", actual.Perc99)
-	}
-	return nil
-}
-
 // newInformerWatchPod creates an informer to check whether all pods are running.
 func newInformerWatchPod(f *framework.Framework, mutex *sync.Mutex, watchTimes map[string]metav1.Time, podType string) cache.Controller {
 	ns := f.Namespace.Name
@@ -556,14 +541,14 @@ func logAndVerifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyData
 	latencyMetrics, _ := getPodStartLatency(kubeletAddr)
 	framework.Logf("Kubelet Prometheus metrics (not reset):\n%s", framework.PrettyPrintJSON(latencyMetrics))
 
-	podCreateLatency := framework.PodStartupLatency{Latency: framework.ExtractLatencyMetrics(e2eLags)}
+	podStartupLatency := framework.ExtractLatencyMetrics(e2eLags)
 
 	// log latency perf data
-	logPerfData(getLatencyPerfData(podCreateLatency.Latency, testInfo), "latency")
+	logPerfData(getLatencyPerfData(podStartupLatency, testInfo), "latency")
 
 	if isVerify {
 		// check whether e2e pod startup time is acceptable.
-		framework.ExpectNoError(verifyPodStartupLatency(podStartupLimits, podCreateLatency.Latency))
+		framework.ExpectNoError(framework.VerifyLatencyWithinThreshold(podStartupLimits, podStartupLatency, "pod startup"))
 
 		// check batch pod creation latency
 		if podBatchStartupLimit > 0 {