e2e: query kubelet for resource usage of containers

This change adds simple utility functions to query kubelet for the resource
usage of major containers (e.g. /kubelet, /docker-daemon, etc) for the past
time interval (up to 2 minutes). This is intended for spot checking of the
recent resource usage on the node.
pull/6/head
Yu-Ju Hong 2015-07-09 16:29:33 -07:00
parent 5450afdabc
commit 378a44a287
2 changed files with 146 additions and 1 deletions

View File

@ -34,6 +34,8 @@ import (
const ( const (
// Interval to poll /runningpods on a node // Interval to poll /runningpods on a node
pollInterval = 1 * time.Second pollInterval = 1 * time.Second
// Interval used to compute cpu usage of a container
cpuIntervalInSeconds = 60
) )
// getPodMatches returns a set of pod names on the given node that matches the // getPodMatches returns a set of pod names on the given node that matches the
@ -98,6 +100,7 @@ var _ = Describe("Clean up pods on node", func() {
for _, node := range nodes.Items { for _, node := range nodes.Items {
nodeNames.Insert(node.Name) nodeNames.Insert(node.Name)
} }
logOneTimeResourceUsageSummary(framework.Client, nodeNames.List(), cpuIntervalInSeconds)
}) })
type DeleteTest struct { type DeleteTest struct {
@ -125,13 +128,13 @@ var _ = Describe("Clean up pods on node", func() {
Image: "gcr.io/google_containers/pause:go", Image: "gcr.io/google_containers/pause:go",
Replicas: totalPods, Replicas: totalPods,
})).NotTo(HaveOccurred()) })).NotTo(HaveOccurred())
// Perform a sanity check so that we know all desired pods are // Perform a sanity check so that we know all desired pods are
// running on the nodes according to kubelet. The timeout is set to // running on the nodes according to kubelet. The timeout is set to
// only 30 seconds here because RunRC already waited for all pods to // only 30 seconds here because RunRC already waited for all pods to
// transition to the running status. // transition to the running status.
Expect(waitTillNPodsRunningOnNodes(framework.Client, nodeNames, rcName, framework.Namespace.Name, totalPods, Expect(waitTillNPodsRunningOnNodes(framework.Client, nodeNames, rcName, framework.Namespace.Name, totalPods,
time.Second*30)).NotTo(HaveOccurred()) time.Second*30)).NotTo(HaveOccurred())
logOneTimeResourceUsageSummary(framework.Client, nodeNames.List(), cpuIntervalInSeconds)
By("Deleting the RC") By("Deleting the RC")
DeleteRC(framework.Client, framework.Namespace.Name, rcName) DeleteRC(framework.Client, framework.Namespace.Name, rcName)

View File

@ -17,18 +17,23 @@ limitations under the License.
package e2e package e2e
import ( import (
"bytes"
"encoding/json"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
"text/tabwriter"
"time" "time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client" "github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/metrics" "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/metrics"
"github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports" "github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports"
cadvisor "github.com/google/cadvisor/info/v1"
) )
// KubeletMetric stores metrics scraped from the kubelet server's /metric endpoint. // KubeletMetric stores metrics scraped from the kubelet server's /metric endpoint.
@ -172,6 +177,143 @@ func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nod
return badMetrics, nil return badMetrics, nil
} }
// getContainerInfo contacts kubelet for the container information. The "Stats"
// in the returned ContainerInfo is subject to the requirements in statsRequest.
func getContainerInfo(c *client.Client, nodeName string, req *kubelet.StatsRequest) (map[string]cadvisor.ContainerInfo, error) {
	reqBody, err := json.Marshal(req)
	if err != nil {
		return nil, err
	}
	data, err := c.Post().
		Prefix("proxy").
		Resource("nodes").
		Name(fmt.Sprintf("%v:%v", nodeName, ports.KubeletPort)).
		Suffix("stats/container").
		SetHeader("Content-Type", "application/json").
		Body(reqBody).
		Do().Raw()
	// Bug fix: the request error was previously ignored, so a failed POST
	// surfaced as a confusing unmarshal error (or a nil map). Check it
	// before decoding the response body.
	if err != nil {
		return nil, err
	}
	var containers map[string]cadvisor.ContainerInfo
	err = json.Unmarshal(data, &containers)
	if err != nil {
		return nil, err
	}
	return containers, nil
}
// Limits on how many cadvisor stats samples can be requested from kubelet's
// /stats/container endpoint.
const (
	// cadvisor records stats about every second; used to convert a requested
	// cpu interval into a number of stats samples.
	cadvisorStatsPollingIntervalInSeconds float64 = 1.0
	// cadvisor caches up to 2 minutes of stats (configured by kubelet), i.e.
	// at most 120 one-second samples are available per request.
	maxNumStatsToRequest int = 120
)
// A list of containers for which we want to collect resource usage.
// "/" is the root container, covering the whole node; the others are the
// top-level system containers tracked by kubelet.
var targetContainers = []string{
	"/",
	"/docker-daemon",
	"/kubelet",
	"/kube-proxy",
	"/system",
}
// containerResourceUsage is a one-time snapshot of a container's resource
// usage, derived from the first and last cadvisor stats points in the
// requested window.
type containerResourceUsage struct {
	// Name is the container name (e.g. "/kubelet").
	Name string
	// Timestamp of the most recent stats point used in the calculation.
	Timestamp time.Time
	// Average cpu usage, in cores, over CPUInterval.
	CPUUsageInCores float64
	// Memory usage/working set from the most recent stats point, in bytes.
	MemoryUsageInBytes int64
	MemoryWorkingSetInBytes int64
	// The interval used to calculate CPUUsageInCores.
	CPUInterval time.Duration
}
// getOneTimeResourceUsageOnNode queries the node's /stats/container endpoint
// and returns the resource usage of targetContainers for the past
// cpuInterval.
// The acceptable range of the interval is 2s~120s. Be warned that as the
// interval (and #containers) increases, the size of kubelet's response
// could be significant. E.g., the 60s interval stats for ~20 containers is
// ~1.5MB. Don't hammer the node with frequent, heavy requests.
// TODO: Implement a constant, lightweight resource monitor, which polls
// kubelet every few second, stores the data, and reports meaningful statistics
// numbers over a longer period (e.g., max/mean cpu usage in the last hour).
//
// cadvisor records cumulative cpu usage in nanoseconds, so we need to have two
// stats points to compute the cpu usage over the interval. Assuming cadvisor
// polls every second, we'd need to get N stats points for N-second interval.
// Note that this is an approximation and may not be accurate, hence we also
// write the actual interval used for calculation (based on the timestamps of
// the stats points) in containerResourceUsage.CPUInterval.
func getOneTimeResourceUsageOnNode(c *client.Client, nodeName string, cpuInterval time.Duration) (map[string]*containerResourceUsage, error) {
	// Seconds() already returns float64; no explicit conversion needed.
	numStats := int(cpuInterval.Seconds() / cadvisorStatsPollingIntervalInSeconds)
	if numStats < 2 || numStats > maxNumStatsToRequest {
		return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest)
	}
	// Get information of all containers on the node.
	containerInfos, err := getContainerInfo(c, nodeName, &kubelet.StatsRequest{
		ContainerName: "/",
		NumStats:      numStats,
		Subcontainers: true,
	})
	if err != nil {
		return nil, err
	}
	// Process container infos that are relevant to us.
	usageMap := make(map[string]*containerResourceUsage, len(targetContainers))
	for _, name := range targetContainers {
		info, ok := containerInfos[name]
		if !ok {
			return nil, fmt.Errorf("missing info for container %q on node %q", name, nodeName)
		}
		// Guard against a short reply: computing cpu usage needs at least two
		// stats points, and indexing Stats[0] on an empty slice would panic.
		if len(info.Stats) < 2 {
			return nil, fmt.Errorf("not enough stats points (%d) for container %q on node %q", len(info.Stats), name, nodeName)
		}
		first := info.Stats[0]
		last := info.Stats[len(info.Stats)-1]
		usageMap[name] = &containerResourceUsage{
			Name:      name,
			Timestamp: last.Timestamp,
			// Cumulative usage is in nanoseconds; dividing by the elapsed
			// nanoseconds yields average cores used over the window.
			CPUUsageInCores:         float64(last.Cpu.Usage.Total-first.Cpu.Usage.Total) / float64(last.Timestamp.Sub(first.Timestamp).Nanoseconds()),
			MemoryUsageInBytes:      int64(last.Memory.Usage),
			MemoryWorkingSetInBytes: int64(last.Memory.WorkingSet),
			CPUInterval:             last.Timestamp.Sub(first.Timestamp),
		}
	}
	return usageMap, nil
}
// logOneTimeResourceUsageSummary collects container resource usage for each
// node in nodeNames, formats the per-node stats, and logs a single combined
// summary. Nodes that fail to report are included as error lines rather than
// aborting the whole summary.
//
// NOTE(review): cpuInterval is a time.Duration, so callers must pass a real
// duration (e.g. 60*time.Second). Passing an untyped integer constant such as
// 60 converts to 60 nanoseconds and will trip the numStats validation in
// getOneTimeResourceUsageOnNode — confirm the call sites.
func logOneTimeResourceUsageSummary(c *client.Client, nodeNames []string, cpuInterval time.Duration) {
	summary := make([]string, 0, len(nodeNames))
	for _, node := range nodeNames {
		if usage, err := getOneTimeResourceUsageOnNode(c, node, cpuInterval); err != nil {
			summary = append(summary, fmt.Sprintf("Error getting resource usage from node %q, err: %v", node, err))
		} else {
			summary = append(summary, formatResourceUsageStats(node, usage))
		}
	}
	Logf("\n%s", strings.Join(summary, "\n"))
}
// formatResourceUsageStats formats the per-container usage stats of one node
// into a tab-aligned, human-readable table.
//
// Example output:
//
// Resource usage for node "e2e-test-foo-minion-abcde":
// container        cpu(cores)  memory(MB)
// "/"              0.363       2942.09
// "/docker-daemon" 0.088       521.80
// "/kubelet"       0.086       424.37
// "/kube-proxy"    0.011       4.66
// "/system"        0.007       119.88
func formatResourceUsageStats(nodeName string, containerStats map[string]*containerResourceUsage) string {
	// Go map iteration order is randomized; sort the container names so the
	// report rows come out in a stable, deterministic order.
	names := make([]string, 0, len(containerStats))
	for name := range containerStats {
		names = append(names, name)
	}
	sort.Strings(names)
	buf := &bytes.Buffer{}
	w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0)
	fmt.Fprintf(w, "container\tcpu(cores)\tmemory(MB)\n")
	for _, name := range names {
		s := containerStats[name]
		fmt.Fprintf(w, "%q\t%.3f\t%.2f\n", name, s.CPUUsageInCores, float64(s.MemoryUsageInBytes)/1000000)
	}
	w.Flush()
	return fmt.Sprintf("Resource usage on node %q:\n%s", nodeName, buf.String())
}
// Performs a get on a node proxy endpoint given the nodename and rest client. // Performs a get on a node proxy endpoint given the nodename and rest client.
func nodeProxyRequest(c *client.Client, node, endpoint string) client.Result { func nodeProxyRequest(c *client.Client, node, endpoint string) client.Result {
return c.Get(). return c.Get().