mirror of https://github.com/k3s-io/k3s
e2e: query kubelet for resource usage of containers

This change adds simple utility functions to query kubelet for the resource usage of major containers (e.g. /kubelet, /docker-daemon, etc.) for the past time interval (up to 2 minutes). This is intended for spot-checking the recent resource usage on the node.
parent 5450afdabc
commit 378a44a287
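
For orientation (editorial note, not part of the commit): the new helper is meant to be called from an e2e test with a node list and a sampling window, as the test changes below do. A minimal sketch, assuming the e2e client c (*client.Client) and a slice of node names are available; the wrapper name logUsageForNodes is hypothetical:

// Sketch: log a one-time resource usage summary for the given nodes over a
// 60s window; the window must stay within cadvisor's 2-minute stats cache.
func logUsageForNodes(c *client.Client, nodeNames []string) {
	logOneTimeResourceUsageSummary(c, nodeNames, 60*time.Second)
}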
@@ -34,6 +34,8 @@ import (
 const (
 	// Interval to poll /runningpods on a node
 	pollInterval = 1 * time.Second
+	// Interval used to compute cpu usage of a container
+	cpuIntervalInSeconds = 60
 )

 // getPodMatches returns a set of pod names on the given node that matches the
@@ -98,6 +100,7 @@ var _ = Describe("Clean up pods on node", func() {
 		for _, node := range nodes.Items {
 			nodeNames.Insert(node.Name)
 		}
+		logOneTimeResourceUsageSummary(framework.Client, nodeNames.List(), cpuIntervalInSeconds*time.Second)
 	})

 	type DeleteTest struct {
@@ -125,13 +128,13 @@ var _ = Describe("Clean up pods on node", func() {
 				Image:    "gcr.io/google_containers/pause:go",
 				Replicas: totalPods,
 			})).NotTo(HaveOccurred())

 			// Perform a sanity check so that we know all desired pods are
 			// running on the nodes according to kubelet. The timeout is set to
 			// only 30 seconds here because RunRC already waited for all pods to
 			// transition to the running status.
 			Expect(waitTillNPodsRunningOnNodes(framework.Client, nodeNames, rcName, framework.Namespace.Name, totalPods,
 				time.Second*30)).NotTo(HaveOccurred())
+			logOneTimeResourceUsageSummary(framework.Client, nodeNames.List(), cpuIntervalInSeconds*time.Second)

 			By("Deleting the RC")
 			DeleteRC(framework.Client, framework.Namespace.Name, rcName)

@@ -17,18 +17,23 @@ limitations under the License.
 package e2e

 import (
+	"bytes"
+	"encoding/json"
 	"fmt"
 	"io/ioutil"
 	"net/http"
 	"sort"
 	"strconv"
 	"strings"
+	"text/tabwriter"
 	"time"

 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/metrics"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports"
+	cadvisor "github.com/google/cadvisor/info/v1"
 )

 // KubeletMetric stores metrics scraped from the kubelet server's /metric endpoint.
@@ -172,6 +177,143 @@ func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nod
 	return badMetrics, nil
 }

+// getContainerInfo contacts kubelet for the container information. The "Stats"
+// in the returned ContainerInfo is subject to the requirements in statsRequest.
+func getContainerInfo(c *client.Client, nodeName string, req *kubelet.StatsRequest) (map[string]cadvisor.ContainerInfo, error) {
+	reqBody, err := json.Marshal(req)
+	if err != nil {
+		return nil, err
+	}
+	data, err := c.Post().
+		Prefix("proxy").
+		Resource("nodes").
+		Name(fmt.Sprintf("%v:%v", nodeName, ports.KubeletPort)).
+		Suffix("stats/container").
+		SetHeader("Content-Type", "application/json").
+		Body(reqBody).
+		Do().Raw()
+	if err != nil {
+		return nil, err
+	}
+
+	var containers map[string]cadvisor.ContainerInfo
+	err = json.Unmarshal(data, &containers)
+	if err != nil {
+		return nil, err
+	}
+	return containers, nil
+}
+
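An editorial aside, not part of the commit: getContainerInfo can also target a single container. A minimal sketch, assuming c and nodeName are the e2e client and a node name as in the code above:

// Request only the last two one-second samples for the kubelet's cgroup,
// enough to derive an instantaneous CPU rate for that one container.
kubeletInfo, err := getContainerInfo(c, nodeName, &kubelet.StatsRequest{
	ContainerName: "/kubelet",
	NumStats:      2,
	Subcontainers: false,
})
if err != nil {
	Logf("failed to get /kubelet stats from %q: %v", nodeName, err)
}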
+const (
+	// cadvisor records stats about every second.
+	cadvisorStatsPollingIntervalInSeconds float64 = 1.0
+	// cadvisor caches up to 2 minutes of stats (configured by kubelet).
+	maxNumStatsToRequest int = 120
+)
+
+// A list of containers for which we want to collect resource usage.
+var targetContainers = []string{
+	"/",
+	"/docker-daemon",
+	"/kubelet",
+	"/kube-proxy",
+	"/system",
+}
+
+type containerResourceUsage struct {
+	Name                    string
+	Timestamp               time.Time
+	CPUUsageInCores         float64
+	MemoryUsageInBytes      int64
+	MemoryWorkingSetInBytes int64
+	// The interval used to calculate CPUUsageInCores.
+	CPUInterval time.Duration
+}
+
+// getOneTimeResourceUsageOnNode queries the node's /stats/container endpoint
+// and returns the resource usage of targetContainers for the past
+// cpuInterval.
+// The acceptable range of the interval is 2s~120s. Be warned that as the
+// interval (and #containers) increases, the size of kubelet's response
+// could be significant. E.g., the 60s interval stats for ~20 containers is
+// ~1.5MB. Don't hammer the node with frequent, heavy requests.
+// TODO: Implement a constant, lightweight resource monitor, which polls
+// kubelet every few seconds, stores the data, and reports meaningful statistics
+// numbers over a longer period (e.g., max/mean cpu usage in the last hour).
+//
+// cadvisor records cumulative cpu usage in nanoseconds, so we need to have two
+// stats points to compute the cpu usage over the interval. Assuming cadvisor
+// polls every second, we'd need to get N stats points for an N-second interval.
+// Note that this is an approximation and may not be accurate, hence we also
+// write the actual interval used for calculation (based on the timestamps of
+// the stats points) in containerResourceUsage.CPUInterval.
+func getOneTimeResourceUsageOnNode(c *client.Client, nodeName string, cpuInterval time.Duration) (map[string]*containerResourceUsage, error) {
+	numStats := int(float64(cpuInterval.Seconds()) / cadvisorStatsPollingIntervalInSeconds)
+	if numStats < 2 || numStats > maxNumStatsToRequest {
+		return nil, fmt.Errorf("numStats needs to be > 1 and <= %d", maxNumStatsToRequest)
+	}
+	// Get information of all containers on the node.
+	containerInfos, err := getContainerInfo(c, nodeName, &kubelet.StatsRequest{
+		ContainerName: "/",
+		NumStats:      numStats,
+		Subcontainers: true,
+	})
+	if err != nil {
+		return nil, err
+	}
+	// Process container infos that are relevant to us.
+	usageMap := make(map[string]*containerResourceUsage, len(targetContainers))
+	for _, name := range targetContainers {
+		info, ok := containerInfos[name]
+		if !ok {
+			return nil, fmt.Errorf("missing info for container %q on node %q", name, nodeName)
+		}
+		first := info.Stats[0]
+		last := info.Stats[len(info.Stats)-1]
+		usageMap[name] = &containerResourceUsage{
+			Name:                    name,
+			Timestamp:               last.Timestamp,
+			CPUUsageInCores:         float64(last.Cpu.Usage.Total-first.Cpu.Usage.Total) / float64(last.Timestamp.Sub(first.Timestamp).Nanoseconds()),
+			MemoryUsageInBytes:      int64(last.Memory.Usage),
+			MemoryWorkingSetInBytes: int64(last.Memory.WorkingSet),
+			CPUInterval:             last.Timestamp.Sub(first.Timestamp),
+		}
+	}
+	return usageMap, nil
+}
+
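To make the CPU arithmetic above concrete, an editorial worked example (the numbers are made up, not from the commit): two cadvisor samples 60 seconds apart whose cumulative usage differs by 30e9 ns average to 0.5 cores, which is exactly the CPUUsageInCores expression used in the loop.

// Worked example of the CPUUsageInCores formula (illustrative values only).
firstTotal := uint64(100e9)  // first.Cpu.Usage.Total: cumulative cpu ns at the first sample
lastTotal := uint64(130e9)   // last.Cpu.Usage.Total: cumulative cpu ns at the last sample
interval := 60 * time.Second // last.Timestamp.Sub(first.Timestamp)
cores := float64(lastTotal-firstTotal) / float64(interval.Nanoseconds())
fmt.Printf("%.3f cores over %v\n", cores, interval) // prints "0.500 cores over 1m0s"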
+// logOneTimeResourceUsageSummary collects container resource usage for the
+// list of nodes, formats and logs the stats.
+func logOneTimeResourceUsageSummary(c *client.Client, nodeNames []string, cpuInterval time.Duration) {
+	var summary []string
+	for _, nodeName := range nodeNames {
+		stats, err := getOneTimeResourceUsageOnNode(c, nodeName, cpuInterval)
+		if err != nil {
+			summary = append(summary, fmt.Sprintf("Error getting resource usage from node %q, err: %v", nodeName, err))
+		} else {
+			summary = append(summary, formatResourceUsageStats(nodeName, stats))
+		}
+	}
+	Logf("\n%s", strings.Join(summary, "\n"))
+}
+
+func formatResourceUsageStats(nodeName string, containerStats map[string]*containerResourceUsage) string {
+	// Example output:
+	//
+	// Resource usage on node "e2e-test-foo-minion-abcde":
+	// container        cpu(cores)  memory(MB)
+	// "/"              0.363       2942.09
+	// "/docker-daemon" 0.088       521.80
+	// "/kubelet"       0.086       424.37
+	// "/kube-proxy"    0.011       4.66
+	// "/system"        0.007       119.88
+	buf := &bytes.Buffer{}
+	w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0)
+	fmt.Fprintf(w, "container\tcpu(cores)\tmemory(MB)\n")
+	for name, s := range containerStats {
+		fmt.Fprintf(w, "%q\t%.3f\t%.2f\n", name, s.CPUUsageInCores, float64(s.MemoryUsageInBytes)/1000000)
+	}
+	w.Flush()
+	return fmt.Sprintf("Resource usage on node %q:\n%s", nodeName, buf.String())
+}
+
 // Performs a get on a node proxy endpoint given the nodename and rest client.
 func nodeProxyRequest(c *client.Client, node, endpoint string) client.Result {
 	return c.Get().