Merge pull request #30333 from coufon/node_density_and_performance_test

Automatic merge from submit-queue

Add Time Series Data and Labels in Node density test

This pull request contains:

1. Increase the pod creation latency limit according to test results;
2. Add 'GetResourceSeriesWithLabels' in 'resource_collector.go' to provide resource usage time series data;
3. Modify 'GetBasicCPUStats' in 'resource_collector.go' to make a copy of the CPU usage array before sorting (otherwise the time series data ends up disordered);
4. Add 'ResourceUsageToPerfDataWithLabels' and 'CPUUsageToPerfDataWithLabels' to attach labels to 'PerfData' for the benchmark dashboard (a usage sketch follows below).
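
A minimal sketch of how the new `*WithLabels` helpers are meant to compose with the existing framework plumbing inside a node e2e test; the helper name `logPerfDataWithLabels` and the `"test"` label key are illustrative, not part of this PR:

```go
// logPerfDataWithLabels is a hypothetical wrapper showing the intended
// call pattern for the new *WithLabels functions.
func logPerfDataWithLabels(rc *ResourceCollector, nodeName string) {
	usagePerContainer, err := rc.GetLatest()
	Expect(err).NotTo(HaveOccurred())

	// Wrap the per-container usage in the per-node maps the framework expects.
	usagePerNode := make(framework.ResourceUsagePerNode)
	usagePerNode[nodeName] = usagePerContainer
	cpuSummaryPerNode := make(framework.NodesCPUSummary)
	cpuSummaryPerNode[nodeName] = rc.GetCPUSummary()

	// Extra labels let the benchmark dashboard group results by test.
	labels := map[string]string{"test": "density"} // illustrative key
	framework.PrintPerfData(framework.ResourceUsageToPerfDataWithLabels(usagePerNode, labels))
	framework.PrintPerfData(framework.CPUUsageToPerfDataWithLabels(cpuSummaryPerNode, labels))
}
```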

Kubernetes Submit Queue 2016-08-11 16:27:04 -07:00 committed by GitHub
commit 8c81c8340f
4 changed files with 193 additions and 118 deletions

File: test/e2e/framework/perf_util.go

@@ -57,56 +57,12 @@ const currentKubeletPerfMetricsVersion = "v1"
// ResourceUsageToPerfData transforms ResourceUsagePerNode to PerfData. Notice that this function
// only cares about memory usage, because cpu usage information will be extracted from NodesCPUSummary.
func ResourceUsageToPerfData(usagePerNode ResourceUsagePerNode) *perftype.PerfData {
-items := []perftype.DataItem{}
-for node, usages := range usagePerNode {
-for c, usage := range usages {
-item := perftype.DataItem{
-Data: map[string]float64{
-"memory": float64(usage.MemoryUsageInBytes) / (1024 * 1024),
-"workingset": float64(usage.MemoryWorkingSetInBytes) / (1024 * 1024),
-"rss": float64(usage.MemoryRSSInBytes) / (1024 * 1024),
-},
-Unit: "MB",
-Labels: map[string]string{
-"node": node,
-"container": c,
-"resource": "memory",
-},
-}
-items = append(items, item)
-}
-}
-return &perftype.PerfData{
-Version: currentKubeletPerfMetricsVersion,
-DataItems: items,
-}
+return ResourceUsageToPerfDataWithLabels(usagePerNode, nil)
}
// CPUUsageToPerfData transforms NodesCPUSummary to PerfData.
func CPUUsageToPerfData(usagePerNode NodesCPUSummary) *perftype.PerfData {
-items := []perftype.DataItem{}
-for node, usages := range usagePerNode {
-for c, usage := range usages {
-data := map[string]float64{}
-for perc, value := range usage {
-data[fmt.Sprintf("Perc%02.0f", perc*100)] = value * 1000
-}
-item := perftype.DataItem{
-Data: data,
-Unit: "mCPU",
-Labels: map[string]string{
-"node": node,
-"container": c,
-"resource": "cpu",
-},
-}
-items = append(items, item)
-}
-}
-return &perftype.PerfData{
-Version: currentKubeletPerfMetricsVersion,
-DataItems: items,
-}
+return CPUUsageToPerfDataWithLabels(usagePerNode, nil)
}
// PrintPerfData prints the perfdata in json format with PerfResultTag prefix.
@@ -117,3 +73,73 @@ func PrintPerfData(p *perftype.PerfData) {
Logf("%s %s\n%s", perftype.PerfResultTag, str, perftype.PerfResultEnd)
}
}
// ResourceUsageToPerfDataWithLabels transforms ResourceUsagePerNode to PerfData with additional labels.
// Notice that this function only cares about memory usage, because cpu usage information will be extracted from NodesCPUSummary.
func ResourceUsageToPerfDataWithLabels(usagePerNode ResourceUsagePerNode, labels map[string]string) *perftype.PerfData {
items := []perftype.DataItem{}
for node, usages := range usagePerNode {
for c, usage := range usages {
newLabels := map[string]string{
"node": node,
"container": c,
"resource": "memory",
}
if labels != nil {
for k, v := range labels {
newLabels[k] = v
}
}
item := perftype.DataItem{
Data: map[string]float64{
"memory": float64(usage.MemoryUsageInBytes) / (1024 * 1024),
"workingset": float64(usage.MemoryWorkingSetInBytes) / (1024 * 1024),
"rss": float64(usage.MemoryRSSInBytes) / (1024 * 1024),
},
Unit: "MB",
Labels: newLabels,
}
items = append(items, item)
}
}
return &perftype.PerfData{
Version: currentKubeletPerfMetricsVersion,
DataItems: items,
}
}
// CPUUsageToPerfDataWithLabels transforms NodesCPUSummary to PerfData with additional labels.
func CPUUsageToPerfDataWithLabels(usagePerNode NodesCPUSummary, labels map[string]string) *perftype.PerfData {
items := []perftype.DataItem{}
for node, usages := range usagePerNode {
for c, usage := range usages {
newLabels := map[string]string{
"node": node,
"container": c,
"resource": "cpu",
}
if labels != nil {
for k, v := range labels {
newLabels[k] = v
}
}
data := map[string]float64{}
for perc, value := range usage {
data[fmt.Sprintf("Perc%02.0f", perc*100)] = value * 1000
}
item := perftype.DataItem{
Data: data,
Unit: "mCPU",
Labels: newLabels,
}
items = append(items, item)
}
}
return &perftype.PerfData{
Version: currentKubeletPerfMetricsVersion,
DataItems: items,
}
}
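
One design note on the `labels != nil` guard in both functions above: since ranging over a nil map is a no-op in Go, the guard is defensive rather than required. A self-contained sketch of that language behavior:

```go
package main

import "fmt"

func main() {
	var extra map[string]string // nil map, as passed by the plain wrappers
	merged := map[string]string{"node": "n1", "resource": "memory"}
	for k, v := range extra { // ranging over a nil map iterates zero times
		merged[k] = v
	}
	fmt.Println(merged) // map[node:n1 resource:memory]
}
```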

File: test/e2e_node/density_test.go

@@ -73,7 +73,8 @@ var _ = framework.KubeDescribe("Density [Serial] [Slow]", func() {
})
Context("create a batch of pods", func() {
-densityTests := []DensityTest{
+// TODO(coufon): add more tests and the values are generous, set more precise limits after benchmark
+dTests := []densityTest{
{
podsNr: 10,
interval: 0 * time.Millisecond,
@@ -87,8 +88,8 @@ var _ = framework.KubeDescribe("Density [Serial] [Slow]", func() {
},
// percentile limit of single pod startup latency
podStartupLimits: framework.LatencyMetric{
-Perc50: 10 * time.Second,
-Perc90: 15 * time.Second,
+Perc50: 16 * time.Second,
+Perc90: 18 * time.Second,
Perc99: 20 * time.Second,
},
// upper bound of startup latency of a batch of pods
@@ -96,7 +97,7 @@ var _ = framework.KubeDescribe("Density [Serial] [Slow]", func() {
},
}
-for _, testArg := range densityTests {
+for _, testArg := range dTests {
itArg := testArg
It(fmt.Sprintf("latency/resource should be within limit when create %d pods with %v interval",
itArg.podsNr, itArg.interval), func() {
@@ -185,13 +186,14 @@ var _ = framework.KubeDescribe("Density [Serial] [Slow]", func() {
// verify resource
By("Verifying resource")
-verifyResource(f, testArg, rc)
+verifyResource(f, itArg.cpuLimits, itArg.memLimits, rc)
})
}
})
Context("create a sequence of pods", func() {
-densityTests := []DensityTest{
+// TODO(coufon): add more tests and the values are generous, set more precise limits after benchmark
+dTests := []densityTest{
{
podsNr: 10,
bgPodsNr: 10,
@@ -211,7 +213,7 @@ var _ = framework.KubeDescribe("Density [Serial] [Slow]", func() {
},
}
-for _, testArg := range densityTests {
+for _, testArg := range dTests {
itArg := testArg
It(fmt.Sprintf("latency/resource should be within limit when create %d pods with %d background pods",
itArg.podsNr, itArg.bgPodsNr), func() {
@@ -242,13 +244,13 @@ var _ = framework.KubeDescribe("Density [Serial] [Slow]", func() {
// verify resource
By("Verifying resource")
-verifyResource(f, testArg, rc)
+verifyResource(f, itArg.cpuLimits, itArg.memLimits, rc)
})
}
})
})
-type DensityTest struct {
+type densityTest struct {
// number of pods
podsNr int
// number of background pods
@@ -274,6 +276,7 @@ func createBatchPodWithRateControl(f *framework.Framework, pods []*api.Pod, inte
return createTimes
}
// checkPodDeleted checks whether a pod has been successfully deleted
func checkPodDeleted(f *framework.Framework, podName string) error {
ns := f.Namespace.Name
_, err := f.Client.Pods(ns).Get(podName)
@@ -304,7 +307,7 @@ func getPodStartLatency(node string) (framework.KubeletLatencyMetrics, error) {
return latencyMetrics, nil
}
-// Verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
+// verifyPodStartupLatency verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
// within the threshold.
func verifyPodStartupLatency(expect, actual framework.LatencyMetric) error {
if actual.Perc50 > expect.Perc50 {
@@ -319,6 +322,7 @@ func verifyPodStartupLatency(expect, actual framework.LatencyMetric) error {
return nil
}
// newInformerWatchPod creates an informer to check whether all pods are running.
func newInformerWatchPod(f *framework.Framework, mutex *sync.Mutex, watchTimes map[string]unversioned.Time,
podType string) *controllerframework.Controller {
ns := f.Namespace.Name
@@ -363,7 +367,8 @@ func newInformerWatchPod(f *framework.Framework, mutex *sync.Mutex, watchTimes m
return controller
}
-func verifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyData, testArg DensityTest) {
+// verifyLatency verifies whether pod creation latency satisfies the limit.
+func verifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyData, testArg densityTest) {
framework.PrintLatencies(e2eLags, "worst client e2e total latencies")
// Zhou: do not trust `kubelet' metrics since they are not reset!
@@ -388,35 +393,7 @@ func verifyLatency(batchLag time.Duration, e2eLags []framework.PodLatencyData, t
framework.Logf("Sequential creation throughput is %.1f pods/min", throughputSequential)
}
-func verifyResource(f *framework.Framework, testArg DensityTest, rc *ResourceCollector) {
-nodeName := framework.TestContext.NodeName
-// verify and log memory
-usagePerContainer, err := rc.GetLatest()
-Expect(err).NotTo(HaveOccurred())
-framework.Logf("%s", formatResourceUsageStats(usagePerContainer))
-usagePerNode := make(framework.ResourceUsagePerNode)
-usagePerNode[nodeName] = usagePerContainer
-memPerfData := framework.ResourceUsageToPerfData(usagePerNode)
-framework.PrintPerfData(memPerfData)
-verifyMemoryLimits(f.Client, testArg.memLimits, usagePerNode)
-// verify and log cpu
-cpuSummary := rc.GetCPUSummary()
-framework.Logf("%s", formatCPUSummary(cpuSummary))
-cpuSummaryPerNode := make(framework.NodesCPUSummary)
-cpuSummaryPerNode[nodeName] = cpuSummary
-cpuPerfData := framework.CPUUsageToPerfData(cpuSummaryPerNode)
-framework.PrintPerfData(cpuPerfData)
-verifyCPULimits(testArg.cpuLimits, cpuSummaryPerNode)
-}
// createBatchPodSequential creates pods back-to-back in sequence.
func createBatchPodSequential(f *framework.Framework, pods []*api.Pod) (time.Duration, []framework.PodLatencyData) {
batchStartTime := unversioned.Now()
e2eLags := make([]framework.PodLatencyData, 0)

View File

@@ -53,6 +53,8 @@ const (
cadvisorPort = 8090
// housekeeping interval of Cadvisor (second)
houseKeepingInterval = 1
// TODO(coufon): be consistent with perf_util.go version (not exposed)
currentTimeSeriesVersion = "v1"
)
var (
@@ -69,6 +71,8 @@ type ResourceCollector struct {
stopCh chan struct{}
}
// NewResourceCollector creates a resource collector object which collects
// resource usage periodically from Cadvisor.
func NewResourceCollector(interval time.Duration) *ResourceCollector {
buffers := make(map[string][]*framework.ContainerResourceUsage)
return &ResourceCollector{
@@ -77,8 +81,10 @@ func NewResourceCollector(interval time.Duration) *ResourceCollector {
}
}
// Start starts resource collector and connects to the standalone Cadvisor pod
// then repeatedly runs collectStats.
func (r *ResourceCollector) Start() {
-// Get the cgroup containers for kubelet and docker
+// Get the cgroup container names for kubelet and docker
kubeletContainer, err := getContainerNameForProcess(kubeletProcessName, "")
dockerContainer, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
if err == nil {
@@ -108,10 +114,12 @@ func (r *ResourceCollector) Start() {
go wait.Until(func() { r.collectStats(oldStatsMap) }, r.pollingInterval, r.stopCh)
}
// Stop stops the resource collector from collecting stats. It does not clear the buffer.
func (r *ResourceCollector) Stop() {
close(r.stopCh)
}
// Reset clears the stats buffer of resource collector.
func (r *ResourceCollector) Reset() {
r.lock.Lock()
defer r.lock.Unlock()
@@ -120,6 +128,7 @@ func (r *ResourceCollector) Reset() {
}
}
// GetCPUSummary gets CPU usage in percentile.
func (r *ResourceCollector) GetCPUSummary() framework.ContainersCPUSummary {
result := make(framework.ContainersCPUSummary)
for key, name := range systemContainers {
@@ -129,6 +138,7 @@ func (r *ResourceCollector) GetCPUSummary() framework.ContainersCPUSummary {
return result
}
// LogLatest logs the latest resource usage.
func (r *ResourceCollector) LogLatest() {
summary, err := r.GetLatest()
if err != nil {
@@ -137,6 +147,7 @@ func (r *ResourceCollector) LogLatest() {
framework.Logf("%s", formatResourceUsageStats(summary))
}
// collectStats collects resource usage from Cadvisor.
func (r *ResourceCollector) collectStats(oldStatsMap map[string]*cadvisorapiv2.ContainerStats) {
for _, name := range systemContainers {
ret, err := r.client.Stats(name, r.request)
@@ -162,6 +173,7 @@ func (r *ResourceCollector) collectStats(oldStatsMap map[string]*cadvisorapiv2.C
}
}
// computeContainerResourceUsage computes resource usage based on new data sample.
func computeContainerResourceUsage(name string, oldStats, newStats *cadvisorapiv2.ContainerStats) *framework.ContainerResourceUsage {
return &framework.ContainerResourceUsage{
Name: name,
@@ -174,6 +186,7 @@ func computeContainerResourceUsage(name string, oldStats, newStats *cadvisorapiv
}
}
// GetLatest gets the latest resource usage from stats buffer.
func (r *ResourceCollector) GetLatest() (framework.ResourceUsagePerContainer, error) {
r.lock.RLock()
defer r.lock.RUnlock()
@@ -203,7 +216,13 @@ func (r *ResourceCollector) GetBasicCPUStats(containerName string) map[float64]f
r.lock.RLock()
defer r.lock.RUnlock()
result := make(map[float64]float64, len(percentiles))
-usages := r.buffers[containerName]
+// We must make a copy of the array, otherwise the time series order is changed.
+usages := make([]*framework.ContainerResourceUsage, 0)
+for _, usage := range r.buffers[containerName] {
+usages = append(usages, usage)
+}
sort.Sort(resourceUsageByCPU(usages))
for _, q := range percentiles {
index := int(float64(len(usages))*q) - 1
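
The copy above matters because `sort.Sort` reorders its argument in place, and a plain slice assignment aliases the collector's buffer; without the copy, computing percentiles would scramble the chronological order that `GetResourceSeriesWithLabels` later relies on. A standalone sketch of the aliasing hazard (illustrative, not code from this PR):

```go
package main

import (
	"fmt"
	"sort"
)

func main() {
	buffer := []int{3, 1, 2} // stands in for the time-ordered stats buffer
	alias := buffer          // copies the slice header only; backing array is shared
	sort.Ints(alias)
	fmt.Println(buffer) // [1 2 3] -- original order destroyed

	buffer = []int{3, 1, 2}
	copied := append([]int(nil), buffer...) // copies the elements
	sort.Ints(copied)
	fmt.Println(buffer) // [3 1 2] -- order preserved
}
```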
@@ -274,6 +293,7 @@ func formatCPUSummary(summary framework.ContainersCPUSummary) string {
return strings.Join(summaryStrings, "\n")
}
// createCadvisorPod creates a standalone cadvisor pod for fine-grain resource monitoring.
func createCadvisorPod(f *framework.Framework) {
f.PodClient().CreateSync(&api.Pod{
ObjectMeta: api.ObjectMeta{
@@ -348,6 +368,7 @@ func createCadvisorPod(f *framework.Framework) {
})
}
// deleteBatchPod deletes a batch of pods (synchronous).
func deleteBatchPod(f *framework.Framework, pods []*api.Pod) {
ns := f.Namespace.Name
var wg sync.WaitGroup
@@ -368,6 +389,7 @@ func deleteBatchPod(f *framework.Framework, pods []*api.Pod) {
return
}
// newTestPods creates a list of pods (specification) for test.
func newTestPods(numPods int, imageName, podType string) []*api.Pod {
var pods []*api.Pod
for i := 0; i < numPods; i++ {
@@ -383,8 +405,7 @@ func newTestPods(numPods int, imageName, podType string) []*api.Pod {
Labels: labels,
},
Spec: api.PodSpec{
-// ToDo: restart policy is always
-// check whether pods restart at the end of tests
+// Restart policy is always (default).
Containers: []api.Container{
{
Image: imageName,
@@ -397,7 +418,51 @@ func newTestPods(numPods int, imageName, podType string) []*api.Pod {
return pods
}
-// code for getting container name of docker
// Time series of resource usage
type ResourceSeries struct {
Timestamp []int64 `json:"ts"`
CPUUsageInMilliCores []int64 `json:"cpu"`
MemoryRSSInMegaBytes []int64 `json:"memory"`
Units map[string]string `json:"unit"`
}
// Time series of resource usage per container
type ResourceSeriesPerContainer struct {
Data map[string]*ResourceSeries `json:"data"`
Labels map[string]string `json:"labels"`
Version string `json:"version"`
}
// GetResourceSeriesWithLabels gets the time series of resource usage of each container.
// TODO(coufon): the labels are to be re-defined based on benchmark dashboard.
func (r *ResourceCollector) GetResourceSeriesWithLabels(labels map[string]string) *ResourceSeriesPerContainer {
seriesPerContainer := &ResourceSeriesPerContainer{
Data: map[string]*ResourceSeries{},
Labels: map[string]string{
"node": framework.TestContext.NodeName,
},
Version: currentTimeSeriesVersion,
}
for key, name := range systemContainers {
newSeries := &ResourceSeries{Units: map[string]string{
"cpu": "mCPU",
"memory": "MB",
}}
seriesPerContainer.Data[key] = newSeries
for _, usage := range r.buffers[name] {
newSeries.Timestamp = append(newSeries.Timestamp, usage.Timestamp.UnixNano())
newSeries.CPUUsageInMilliCores = append(newSeries.CPUUsageInMilliCores, int64(usage.CPUUsageInCores*1000))
newSeries.MemoryRSSInMegaBytes = append(newSeries.MemoryRSSInMegaBytes, int64(float64(usage.MemoryUsageInBytes)/(1024*1024)))
}
}
for k, v := range labels {
seriesPerContainer.Labels[k] = v
}
return seriesPerContainer
}
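
Given the JSON tags above, a `ResourceSeriesPerContainer` serializes to roughly the following shape; the container key, timestamps, and values are illustrative. (Note that the `memory` series is populated from `usage.MemoryUsageInBytes` even though the field is named `MemoryRSSInMegaBytes`.)

```json
{
  "data": {
    "kubelet": {
      "ts": [1470958020000000000, 1470958021000000000],
      "cpu": [85, 92],
      "memory": [41, 41],
      "unit": {"cpu": "mCPU", "memory": "MB"}
    }
  },
  "labels": {"node": "test-node"},
  "version": "v1"
}
```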
+// Zhou: code for getting container name of docker, copied from pkg/kubelet/cm/container_manager_linux.go
+// since they are not exposed
const (
kubeletProcessName = "kubelet"
dockerProcessName = "docker"

File: test/e2e_node/resource_usage_test.go

@@ -88,8 +88,6 @@ var _ = framework.KubeDescribe("Resource-usage [Serial] [Slow]", func() {
name := fmt.Sprintf("resource tracking for %d pods per node", podsPerNode)
It(name, func() {
-expectedCPU, expectedMemory := itArg.cpuLimits, itArg.memLimits
// The test collects resource usage from a standalone Cadvisor pod.
// Kubelet's Cadvisor has a housekeeping interval of 10s, which is too long to
// show the resource usage spikes. But changing its interval increases the overhead
@@ -126,36 +124,16 @@ var _ = framework.KubeDescribe("Resource-usage [Serial] [Slow]", func() {
} else {
time.Sleep(reportingPeriod)
}
-logPodsOnNode(f.Client)
+logPods(f.Client)
}
rc.Stop()
By("Reporting overall resource usage")
-logPodsOnNode(f.Client)
+logPods(f.Client)
-usagePerContainer, err := rc.GetLatest()
-Expect(err).NotTo(HaveOccurred())
-// TODO(random-liu): Remove the original log when we migrate to new perfdash
-nodeName := framework.TestContext.NodeName
-framework.Logf("%s", formatResourceUsageStats(usagePerContainer))
-// Log perf result
-usagePerNode := make(framework.ResourceUsagePerNode)
-usagePerNode[nodeName] = usagePerContainer
-framework.PrintPerfData(framework.ResourceUsageToPerfData(usagePerNode))
-verifyMemoryLimits(f.Client, expectedMemory, usagePerNode)
-cpuSummary := rc.GetCPUSummary()
-framework.Logf("%s", formatCPUSummary(cpuSummary))
-// Log perf result
-cpuSummaryPerNode := make(framework.NodesCPUSummary)
-cpuSummaryPerNode[nodeName] = cpuSummary
-framework.PrintPerfData(framework.CPUUsageToPerfData(cpuSummaryPerNode))
-verifyCPULimits(expectedCPU, cpuSummaryPerNode)
+// Log and verify resource usage
+verifyResource(f, itArg.cpuLimits, itArg.memLimits, rc)
})
}
})
@@ -167,6 +145,35 @@ type resourceTest struct {
memLimits framework.ResourceUsagePerContainer
}
// verifyResource verifies whether resource usage satisfies the limit.
func verifyResource(f *framework.Framework, cpuLimits framework.ContainersCPUSummary,
memLimits framework.ResourceUsagePerContainer, rc *ResourceCollector) {
nodeName := framework.TestContext.NodeName
// Obtain memory PerfData
usagePerContainer, err := rc.GetLatest()
Expect(err).NotTo(HaveOccurred())
framework.Logf("%s", formatResourceUsageStats(usagePerContainer))
usagePerNode := make(framework.ResourceUsagePerNode)
usagePerNode[nodeName] = usagePerContainer
// Obtain cpu PerfData
cpuSummary := rc.GetCPUSummary()
framework.Logf("%s", formatCPUSummary(cpuSummary))
cpuSummaryPerNode := make(framework.NodesCPUSummary)
cpuSummaryPerNode[nodeName] = cpuSummary
// Log resource usage
framework.PrintPerfData(framework.ResourceUsageToPerfData(usagePerNode))
framework.PrintPerfData(framework.CPUUsageToPerfData(cpuSummaryPerNode))
// Verify resource usage
verifyMemoryLimits(f.Client, memLimits, usagePerNode)
verifyCPULimits(cpuLimits, cpuSummaryPerNode)
}
func verifyMemoryLimits(c *client.Client, expected framework.ResourceUsagePerContainer, actual framework.ResourceUsagePerNode) {
if expected == nil {
return
@@ -237,7 +244,7 @@ func verifyCPULimits(expected framework.ContainersCPUSummary, actual framework.N
}
}
-func logPodsOnNode(c *client.Client) {
+func logPods(c *client.Client) {
nodeName := framework.TestContext.NodeName
podList, err := framework.GetKubeletRunningPods(c, nodeName)
if err != nil {