mirror of https://github.com/k3s-io/k3s
e2e: query kubelet for resource usage of containers
This change adds simple utility functions to query kubelet for the resource usage of major containers (e.g. /kubelet, /docker-daemon, etc.) over the past time interval (up to 2 minutes). This is intended for spot-checking recent resource usage on the node.
parent 5450afdabc
commit 378a44a287
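The helpers added below are meant to be called from an e2e test in the same package. A rough usage sketch is shown here; only getOneTimeResourceUsageOnNode, logOneTimeResourceUsageSummary, containerResourceUsage, and Logf come from this change, while the surrounding function and node list are hypothetical.

// Illustrative only; assumed to live in the e2e package alongside the new helpers.
func exampleResourceUsageSpotCheck(c *client.Client, nodeNames []string) {
	// Log a one-time summary for all nodes, computing CPU usage over a 60s interval.
	logOneTimeResourceUsageSummary(c, nodeNames, 60*time.Second)

	// Or inspect a single node's usage programmatically.
	usage, err := getOneTimeResourceUsageOnNode(c, nodeNames[0], 60*time.Second)
	if err != nil {
		Logf("failed to get resource usage: %v", err)
		return
	}
	for name, u := range usage {
		Logf("%q used %.3f cores over %v", name, u.CPUUsageInCores, u.CPUInterval)
	}
}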
@@ -34,6 +34,8 @@ import (
const (
	// Interval to poll /runningpods on a node
	pollInterval = 1 * time.Second
	// Interval used to compute cpu usage of a container
	cpuIntervalInSeconds = 60
)

// getPodMatches returns a set of pod names on the given node that matches the
@@ -98,6 +100,7 @@ var _ = Describe("Clean up pods on node", func() {
	for _, node := range nodes.Items {
		nodeNames.Insert(node.Name)
	}
	logOneTimeResourceUsageSummary(framework.Client, nodeNames.List(), cpuIntervalInSeconds*time.Second)
})

type DeleteTest struct {
@@ -125,13 +128,13 @@ var _ = Describe("Clean up pods on node", func() {
	Image:    "gcr.io/google_containers/pause:go",
	Replicas: totalPods,
})).NotTo(HaveOccurred())

// Perform a sanity check so that we know all desired pods are
// running on the nodes according to kubelet. The timeout is set to
// only 30 seconds here because RunRC already waited for all pods to
// transition to the running status.
Expect(waitTillNPodsRunningOnNodes(framework.Client, nodeNames, rcName, framework.Namespace.Name, totalPods,
	time.Second*30)).NotTo(HaveOccurred())
logOneTimeResourceUsageSummary(framework.Client, nodeNames.List(), cpuIntervalInSeconds*time.Second)

By("Deleting the RC")
DeleteRC(framework.Client, framework.Namespace.Name, rcName)

@@ -17,18 +17,23 @@ limitations under the License.
package e2e

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"sort"
	"strconv"
	"strings"
	"text/tabwriter"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/metrics"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/master/ports"
	cadvisor "github.com/google/cadvisor/info/v1"
)

// KubeletMetric stores metrics scraped from the kubelet server's /metric endpoint.
@@ -172,6 +177,143 @@ func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nod
	return badMetrics, nil
}

// getContainerInfo contacts kubelet for the container information. The "Stats"
// in the returned ContainerInfo is subject to the requirements in statsRequest.
func getContainerInfo(c *client.Client, nodeName string, req *kubelet.StatsRequest) (map[string]cadvisor.ContainerInfo, error) {
	reqBody, err := json.Marshal(req)
	if err != nil {
		return nil, err
	}
	data, err := c.Post().
		Prefix("proxy").
		Resource("nodes").
		Name(fmt.Sprintf("%v:%v", nodeName, ports.KubeletPort)).
		Suffix("stats/container").
		SetHeader("Content-Type", "application/json").
		Body(reqBody).
		Do().Raw()
	if err != nil {
		return nil, err
	}

	var containers map[string]cadvisor.ContainerInfo
	err = json.Unmarshal(data, &containers)
	if err != nil {
		return nil, err
	}
	return containers, nil
}

const (
	// cadvisor records stats about once per second.
	cadvisorStatsPollingIntervalInSeconds float64 = 1.0
	// cadvisor caches up to 2 minutes of stats (configured by kubelet).
	maxNumStatsToRequest int = 120
)

// A list of containers for which we want to collect resource usage.
var targetContainers = []string{
	"/",
	"/docker-daemon",
	"/kubelet",
	"/kube-proxy",
	"/system",
}

type containerResourceUsage struct {
	Name                    string
	Timestamp               time.Time
	CPUUsageInCores         float64
	MemoryUsageInBytes      int64
	MemoryWorkingSetInBytes int64
	// The interval used to calculate CPUUsageInCores.
	CPUInterval time.Duration
}

// getOneTimeResourceUsageOnNode queries the node's /stats/container endpoint
// and returns the resource usage of targetContainers for the past
// cpuInterval.
// The acceptable range of the interval is 2s~120s. Be warned that as the
// interval (and #containers) increases, the size of kubelet's response
// could be significant. E.g., the 60s interval stats for ~20 containers is
// ~1.5MB. Don't hammer the node with frequent, heavy requests.
// TODO: Implement a constant, lightweight resource monitor, which polls
// kubelet every few seconds, stores the data, and reports meaningful statistics
// numbers over a longer period (e.g., max/mean cpu usage in the last hour).
//
// cadvisor records cumulative cpu usage in nanoseconds, so we need two
// stats points to compute the cpu usage over the interval. Assuming cadvisor
// polls every second, we'd need to get N stats points for an N-second interval.
// Note that this is an approximation and may not be accurate, hence we also
// report the actual interval used for the calculation (based on the timestamps
// of the stats points) in containerResourceUsage.CPUInterval.
func getOneTimeResourceUsageOnNode(c *client.Client, nodeName string, cpuInterval time.Duration) (map[string]*containerResourceUsage, error) {
	numStats := int(float64(cpuInterval.Seconds()) / cadvisorStatsPollingIntervalInSeconds)
	if numStats < 2 || numStats > maxNumStatsToRequest {
		return nil, fmt.Errorf("numStats needs to be > 1 and <= %d", maxNumStatsToRequest)
	}
	// Get information of all containers on the node.
	containerInfos, err := getContainerInfo(c, nodeName, &kubelet.StatsRequest{
		ContainerName: "/",
		NumStats:      numStats,
		Subcontainers: true,
	})
	if err != nil {
		return nil, err
	}
	// Process container infos that are relevant to us.
	usageMap := make(map[string]*containerResourceUsage, len(targetContainers))
	for _, name := range targetContainers {
		info, ok := containerInfos[name]
		if !ok {
			return nil, fmt.Errorf("missing info for container %q on node %q", name, nodeName)
		}
		first := info.Stats[0]
		last := info.Stats[len(info.Stats)-1]
		usageMap[name] = &containerResourceUsage{
			Name:                    name,
			Timestamp:               last.Timestamp,
			CPUUsageInCores:         float64(last.Cpu.Usage.Total-first.Cpu.Usage.Total) / float64(last.Timestamp.Sub(first.Timestamp).Nanoseconds()),
			MemoryUsageInBytes:      int64(last.Memory.Usage),
			MemoryWorkingSetInBytes: int64(last.Memory.WorkingSet),
			CPUInterval:             last.Timestamp.Sub(first.Timestamp),
		}
	}
	return usageMap, nil
}

// logOneTimeResourceUsageSummary collects container resource usage for the
// list of nodes, formats and logs the stats.
func logOneTimeResourceUsageSummary(c *client.Client, nodeNames []string, cpuInterval time.Duration) {
	var summary []string
	for _, nodeName := range nodeNames {
		stats, err := getOneTimeResourceUsageOnNode(c, nodeName, cpuInterval)
		if err != nil {
			summary = append(summary, fmt.Sprintf("Error getting resource usage from node %q, err: %v", nodeName, err))
		} else {
			summary = append(summary, formatResourceUsageStats(nodeName, stats))
		}
	}
	Logf("\n%s", strings.Join(summary, "\n"))
}

func formatResourceUsageStats(nodeName string, containerStats map[string]*containerResourceUsage) string {
	// Example output:
	//
	// Resource usage for node "e2e-test-foo-minion-abcde":
	// container        cpu(cores)  memory(MB)
	// "/"              0.363       2942.09
	// "/docker-daemon" 0.088       521.80
	// "/kubelet"       0.086       424.37
	// "/kube-proxy"    0.011       4.66
	// "/system"        0.007       119.88
	buf := &bytes.Buffer{}
	w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0)
	fmt.Fprintf(w, "container\tcpu(cores)\tmemory(MB)\n")
	for name, s := range containerStats {
		fmt.Fprintf(w, "%q\t%.3f\t%.2f\n", name, s.CPUUsageInCores, float64(s.MemoryUsageInBytes)/1000000)
	}
	w.Flush()
	return fmt.Sprintf("Resource usage on node %q:\n%s", nodeName, buf.String())
}

// Performs a get on a node proxy endpoint given the nodename and rest client.
func nodeProxyRequest(c *client.Client, node, endpoint string) client.Result {
	return c.Get().
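For reference, the CPUUsageInCores field computed above divides the growth of cadvisor's cumulative CPU counter (nanoseconds of CPU time) by the wall-clock nanoseconds elapsed between the first and last stats points. A minimal, standalone sketch of that arithmetic follows; the sample numbers and timestamps are made up for illustration.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Two hypothetical cadvisor stats points: cumulative CPU usage in
	// nanoseconds, taken 60 seconds apart.
	firstCPU := uint64(5000000000)  // 5s of CPU time consumed so far
	lastCPU := uint64(35000000000)  // 35s of CPU time consumed so far
	firstTS := time.Date(2015, 5, 21, 12, 0, 0, 0, time.UTC)
	lastTS := firstTS.Add(60 * time.Second)

	// Same formula as CPUUsageInCores: delta CPU-nanoseconds divided by
	// delta wall-clock nanoseconds.
	cores := float64(lastCPU-firstCPU) / float64(lastTS.Sub(firstTS).Nanoseconds())
	fmt.Printf("average usage: %.3f cores over %v\n", cores, lastTS.Sub(firstTS))
	// Output: average usage: 0.500 cores over 1m0s
}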