/*
Copyright 2014 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"math"
	"net/http"
	"sort"
	"strconv"
	"strings"
	"sync"
	"text/tabwriter"
	"time"

	cadvisorapi "github.com/google/cadvisor/info/v1"
	"github.com/prometheus/common/model"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/unversioned"
	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/kubelet"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/master/ports"
	"k8s.io/kubernetes/pkg/util"
	"k8s.io/kubernetes/pkg/util/sets"

	. "github.com/onsi/gomega"
)

// KubeletMetric stores metrics scraped from the kubelet server's /metrics endpoint.
// TODO: Get some more structure around the metrics and this type
type KubeletMetric struct {
	// eg: list, info, create
	Operation string
	// eg: sync_pods, pod_worker
	Method string
	// 0 <= quantile <= 1, e.g. 0.95 is the 95th percentile, 0.5 is the median.
	Quantile float64
	Latency  time.Duration
}

// KubeletMetricByLatency implements sort.Interface for []KubeletMetric based on
// the latency field.
type KubeletMetricByLatency []KubeletMetric

func (a KubeletMetricByLatency) Len() int           { return len(a) }
func (a KubeletMetricByLatency) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a KubeletMetricByLatency) Less(i, j int) bool { return a[i].Latency > a[j].Latency }

// ParseKubeletMetrics parses the raw metrics blob scraped from a kubelet's
// /metrics endpoint into a list of KubeletMetric entries.
func ParseKubeletMetrics(metricsBlob string) ([]KubeletMetric, error) {
	samples, err := extractMetricSamples(metricsBlob)
	if err != nil {
		return nil, err
	}

	acceptedMethods := sets.NewString(
		metrics.PodWorkerLatencyKey,
		metrics.PodWorkerStartLatencyKey,
		metrics.SyncPodsLatencyKey,
		metrics.PodStartLatencyKey,
		metrics.PodStatusLatencyKey,
		metrics.ContainerManagerOperationsKey,
		metrics.DockerOperationsKey,
		metrics.DockerErrorsKey,
	)

	var kms []KubeletMetric
	for _, sample := range samples {
		const prefix = metrics.KubeletSubsystem + "_"
		metricName := string(sample.Metric[model.MetricNameLabel])
		if !strings.HasPrefix(metricName, prefix) {
			// Not a kubelet metric.
			continue
		}

		method := strings.TrimPrefix(metricName, prefix)
		if !acceptedMethods.Has(method) {
			continue
		}

		if method == metrics.DockerErrorsKey {
			Logf("ERROR %v", sample)
		}

		latency := sample.Value
		operation := string(sample.Metric["operation_type"])
		var quantile float64
		if val, ok := sample.Metric[model.QuantileLabel]; ok {
			var err error
			if quantile, err = strconv.ParseFloat(string(val), 64); err != nil {
				continue
			}
		}

		kms = append(kms, KubeletMetric{
			Operation: operation,
			Method:    method,
			Quantile:  quantile,
			Latency:   time.Duration(int64(latency)) * time.Microsecond,
		})
	}
	return kms, nil
}
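
// For reference, a sketch of the kind of Prometheus summary line this parser
// consumes (the values are illustrative; the metric and label names follow the
// kubelet metrics package used above):
//
//	kubelet_pod_worker_latency_microseconds{operation_type="create",quantile="0.99"} 1344
//
// would yield KubeletMetric{Operation: "create",
// Method: "pod_worker_latency_microseconds", Quantile: 0.99, Latency: 1344µs}.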

// HighLatencyKubeletOperations logs and counts the high latency metrics exported by the kubelet server via /metrics.
func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nodeName string) ([]KubeletMetric, error) {
	var metricsBlob string
	var err error
	// If we haven't been given a client, try scraping the nodename directly for a /metrics endpoint.
	if c == nil {
		metricsBlob, err = getKubeletMetricsThroughNode(nodeName)
	} else {
		metricsBlob, err = getKubeletMetricsThroughProxy(c, nodeName)
	}
	if err != nil {
		return []KubeletMetric{}, err
	}
	metric, err := ParseKubeletMetrics(metricsBlob)
	if err != nil {
		return []KubeletMetric{}, err
	}
	sort.Sort(KubeletMetricByLatency(metric))
	var badMetrics []KubeletMetric
	Logf("\nLatency metrics for node %v", nodeName)
	for _, m := range metric {
		if m.Latency > threshold {
			badMetrics = append(badMetrics, m)
			Logf("%+v", m)
		}
	}
	return badMetrics, nil
}
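
// Hypothetical usage sketch (not called anywhere in this file): scrape one
// node through the API-server proxy and fail the test if any kubelet
// operation's reported latency exceeds one second. The threshold is an
// illustrative choice, not a project default.
//
//	badMetrics, err := HighLatencyKubeletOperations(c, 1*time.Second, nodeName)
//	Expect(err).NotTo(HaveOccurred())
//	Expect(badMetrics).To(BeEmpty())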

// getContainerInfo contacts kubelet for the container information. The "Stats"
// in the returned ContainerInfo is subject to the requirements in statsRequest.
func getContainerInfo(c *client.Client, nodeName string, req *kubelet.StatsRequest) (map[string]cadvisorapi.ContainerInfo, error) {
	reqBody, err := json.Marshal(req)
	if err != nil {
		return nil, err
	}
	data, err := c.Post().
		Prefix("proxy").
		Resource("nodes").
		Name(fmt.Sprintf("%v:%v", nodeName, ports.KubeletPort)).
		Suffix("stats/container").
		SetHeader("Content-Type", "application/json").
		Body(reqBody).
		Do().Raw()
	if err != nil {
		// Surface the request error directly instead of letting the
		// Unmarshal below mask it with a less useful message.
		return nil, err
	}

	var containers map[string]cadvisorapi.ContainerInfo
	err = json.Unmarshal(data, &containers)
	if err != nil {
		return nil, err
	}
	return containers, nil
}

const (
	// cadvisor records stats about every second.
	cadvisorStatsPollingIntervalInSeconds float64 = 1.0
	// cadvisor caches up to 2 minutes of stats (configured by kubelet).
	maxNumStatsToRequest int = 120
)

// targetContainers returns a list of containers for which we want to collect
// resource usage.
func targetContainers() []string {
	if providerIs("gce", "gke") {
		return []string{
			"/",
			"/docker-daemon",
			"/kubelet",
			"/system",
		}
	}
	return []string{
		"/",
	}
}

type containerResourceUsage struct {
	Name                    string
	Timestamp               time.Time
	CPUUsageInCores         float64
	MemoryUsageInBytes      int64
	MemoryWorkingSetInBytes int64
	// The interval used to calculate CPUUsageInCores.
	CPUInterval time.Duration
}

func (r *containerResourceUsage) isStrictlyGreaterThan(rhs *containerResourceUsage) bool {
	return r.CPUUsageInCores > rhs.CPUUsageInCores && r.MemoryWorkingSetInBytes > rhs.MemoryWorkingSetInBytes
}

type resourceUsagePerContainer map[string]*containerResourceUsage

// getOneTimeResourceUsageOnNode queries the node's /stats/container endpoint
// and returns the resource usage of all containerNames for the past
// cpuInterval.
// The acceptable range of the interval is 2s~120s. Be warned that as the
// interval (and #containers) increases, the size of kubelet's response
// could be significant. E.g., the 60s interval stats for ~20 containers is
// ~1.5MB. Don't hammer the node with frequent, heavy requests.
//
// cadvisor records cumulative cpu usage in nanoseconds, so we need to have two
// stats points to compute the cpu usage over the interval. Assuming cadvisor
// polls every second, we'd need to get N stats points for an N-second interval.
// Note that this is an approximation and may not be accurate, hence we also
// write the actual interval used for calculation (based on the timestamps of
// the stats points) in containerResourceUsage.CPUInterval.
//
// containerNames is a function returning a collection of container names in
// which the caller is interested. expectMissingContainers controls how a
// missing container is handled: if false, the call fails when a container
// listed by containerNames is missing on the node; if true, the function is
// more forgiving and ignores missing containers (useful e.g. when looking for
// system containers or daemons that may not exist on every node).
func getOneTimeResourceUsageOnNode(
	c *client.Client,
	nodeName string,
	cpuInterval time.Duration,
	containerNames func() []string,
	expectMissingContainers bool,
) (resourceUsagePerContainer, error) {
	numStats := int(cpuInterval.Seconds() / cadvisorStatsPollingIntervalInSeconds)
	if numStats < 2 || numStats > maxNumStatsToRequest {
		return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest)
	}
	// Get information of all containers on the node.
	containerInfos, err := getContainerInfo(c, nodeName, &kubelet.StatsRequest{
		ContainerName: "/",
		NumStats:      numStats,
		Subcontainers: true,
	})
	if err != nil {
		return nil, err
	}
	// Process container infos that are relevant to us.
	containers := containerNames()
	usageMap := make(resourceUsagePerContainer, len(containers))
	for _, name := range containers {
		info, ok := containerInfos[name]
		if !ok {
			if !expectMissingContainers {
				return nil, fmt.Errorf("missing info for container %q on node %q", name, nodeName)
			}
			continue
		}
		first := info.Stats[0]
		last := info.Stats[len(info.Stats)-1]
		usageMap[name] = computeContainerResourceUsage(name, first, last)
	}
	return usageMap, nil
}
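
// Illustrative call (mirrors how logOneTimeResourceUsageSummary uses it
// below; the node name is hypothetical): a 30s cpuInterval with the 1s
// cadvisor polling interval requests 30 stats points, so CPU usage is
// averaged over the ~29s spanned by the first and last cached samples.
//
//	usage, err := getOneTimeResourceUsageOnNode(c, "node-1", 30*time.Second, targetContainers, false)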

func getKubeSystemContainersResourceUsage(c *client.Client) (resourceUsagePerContainer, error) {
	pods, err := c.Pods("kube-system").List(unversioned.ListOptions{})
	if err != nil {
		return resourceUsagePerContainer{}, err
	}
	nodes, err := c.Nodes().List(unversioned.ListOptions{})
	if err != nil {
		return resourceUsagePerContainer{}, err
	}
	containerIDToNameMap := make(map[string]string)
	containerIDs := make([]string, 0)
	for _, pod := range pods.Items {
		for _, container := range pod.Status.ContainerStatuses {
			containerID := strings.TrimPrefix(container.ContainerID, "docker:/")
			containerIDToNameMap[containerID] = pod.Name + "/" + container.Name
			containerIDs = append(containerIDs, containerID)
		}
	}

	mutex := sync.Mutex{}
	wg := sync.WaitGroup{}
	wg.Add(len(nodes.Items))
	errors := make([]error, 0)
	nameToUsageMap := make(resourceUsagePerContainer, len(containerIDToNameMap))
	for _, node := range nodes.Items {
		go func(nodeName string) {
			defer wg.Done()
			nodeUsage, err := getOneTimeResourceUsageOnNode(c, nodeName, 15*time.Second, func() []string { return containerIDs }, true)
			mutex.Lock()
			defer mutex.Unlock()
			if err != nil {
				errors = append(errors, err)
				return
			}
			for k, v := range nodeUsage {
				nameToUsageMap[containerIDToNameMap[k]] = v
			}
		}(node.Name)
	}
	wg.Wait()
	if len(errors) != 0 {
		return resourceUsagePerContainer{}, fmt.Errorf("Errors while gathering usage data: %v", errors)
	}
	return nameToUsageMap, nil
}

// logOneTimeResourceUsageSummary collects container resource usage for the
// given list of nodes, then formats and logs the stats.
func logOneTimeResourceUsageSummary(c *client.Client, nodeNames []string, cpuInterval time.Duration) {
	var summary []string
	for _, nodeName := range nodeNames {
		stats, err := getOneTimeResourceUsageOnNode(c, nodeName, cpuInterval, targetContainers, false)
		if err != nil {
			summary = append(summary, fmt.Sprintf("Error getting resource usage from node %q, err: %v", nodeName, err))
		} else {
			summary = append(summary, formatResourceUsageStats(nodeName, stats))
		}
	}
	Logf("\n%s", strings.Join(summary, "\n"))
}

func formatResourceUsageStats(nodeName string, containerStats resourceUsagePerContainer) string {
	// Example output:
	//
	// Resource usage for node "e2e-test-foo-minion-abcde":
	// container        cpu(cores)  memory(MB)
	// "/"              0.363       2942.09
	// "/docker-daemon" 0.088       521.80
	// "/kubelet"       0.086       424.37
	// "/system"        0.007       119.88
	buf := &bytes.Buffer{}
	w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0)
	fmt.Fprintf(w, "container\tcpu(cores)\tmemory(MB)\n")
	for name, s := range containerStats {
		fmt.Fprintf(w, "%q\t%.3f\t%.2f\n", name, s.CPUUsageInCores, float64(s.MemoryWorkingSetInBytes)/(1024*1024))
	}
	w.Flush()
	return fmt.Sprintf("Resource usage on node %q:\n%s", nodeName, buf.String())
}

type int64arr []int64

func (a int64arr) Len() int           { return len(a) }
func (a int64arr) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a int64arr) Less(i, j int) bool { return a[i] < a[j] }

type usageDataPerContainer struct {
	cpuData        []float64
	memUseData     []int64
	memWorkSetData []int64
}

func computePercentiles(timeSeries map[time.Time]resourceUsagePerContainer, percentilesToCompute []int) map[int]resourceUsagePerContainer {
	if len(timeSeries) == 0 {
		return make(map[int]resourceUsagePerContainer)
	}
	dataMap := make(map[string]*usageDataPerContainer)
	for _, singleStatistic := range timeSeries {
		for name, data := range singleStatistic {
			if dataMap[name] == nil {
				// Allocate with zero length (and enough capacity); using
				// make([]float64, len(timeSeries)) here would prepend
				// len(timeSeries) zeros and skew every percentile downwards.
				dataMap[name] = &usageDataPerContainer{
					cpuData:        make([]float64, 0, len(timeSeries)),
					memUseData:     make([]int64, 0, len(timeSeries)),
					memWorkSetData: make([]int64, 0, len(timeSeries)),
				}
			}
			dataMap[name].cpuData = append(dataMap[name].cpuData, data.CPUUsageInCores)
			dataMap[name].memUseData = append(dataMap[name].memUseData, data.MemoryUsageInBytes)
			dataMap[name].memWorkSetData = append(dataMap[name].memWorkSetData, data.MemoryWorkingSetInBytes)
		}
	}
	for _, v := range dataMap {
		sort.Float64s(v.cpuData)
		sort.Sort(int64arr(v.memUseData))
		sort.Sort(int64arr(v.memWorkSetData))
	}

	result := make(map[int]resourceUsagePerContainer)
	for _, perc := range percentilesToCompute {
		data := make(resourceUsagePerContainer)
		for k, v := range dataMap {
			percentileIndex := int(math.Ceil(float64(len(v.cpuData)*perc)/100)) - 1
			data[k] = &containerResourceUsage{
				Name:                    k,
				CPUUsageInCores:         v.cpuData[percentileIndex],
				MemoryUsageInBytes:      v.memUseData[percentileIndex],
				MemoryWorkingSetInBytes: v.memWorkSetData[percentileIndex],
			}
		}
		result[perc] = data
	}
	return result
}
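
// Worked example for the index math above (illustrative numbers): with 10
// samples per container and perc = 90, percentileIndex =
// ceil(10*90/100) - 1 = 9 - 1 = 8, i.e. the 9th smallest sample is reported
// as the 90th percentile.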

type resourceConstraint struct {
	cpuConstraint    float64
	memoryConstraint int64
}
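
// A sketch of a constraints map as consumed by stopAndPrintData below; keys
// are container names (the <container_name> part of <pod_name>/<container_name>).
// The name and limits here are illustrative, not project defaults:
//
//	constraints := map[string]resourceConstraint{
//		"kubelet": {cpuConstraint: 0.5, memoryConstraint: 100 * 1024 * 1024},
//	}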

type containerResourceGatherer struct {
	usageTimeseries map[time.Time]resourceUsagePerContainer
	stopCh          chan struct{}
	timer           *time.Ticker
	wg              sync.WaitGroup
}

func (g *containerResourceGatherer) startGatheringData(c *client.Client, period time.Duration) {
	g.usageTimeseries = make(map[time.Time]resourceUsagePerContainer)
	g.wg.Add(1)
	g.stopCh = make(chan struct{})
	g.timer = time.NewTicker(period)
	go func() {
		for {
			select {
			case <-g.timer.C:
				now := time.Now()
				data, err := getKubeSystemContainersResourceUsage(c)
				if err != nil {
					// Log and keep polling; bailing out of this goroutine
					// without calling wg.Done() would deadlock
					// stopAndPrintData on wg.Wait().
					Logf("Error while gathering resource usage: %v", err)
					continue
				}
				g.usageTimeseries[now] = data
			case <-g.stopCh:
				g.wg.Done()
				return
			}
		}
	}()
}

func (g *containerResourceGatherer) stopAndPrintData(percentiles []int, constraints map[string]resourceConstraint) {
	close(g.stopCh)
	g.timer.Stop()
	g.wg.Wait()
	if len(percentiles) == 0 {
		Logf("Warning! Empty percentile list for stopAndPrintData.")
		return
	}
	stats := computePercentiles(g.usageTimeseries, percentiles)
	sortedKeys := []string{}
	for name := range stats[percentiles[0]] {
		sortedKeys = append(sortedKeys, name)
	}
	sort.Strings(sortedKeys)
	violatedConstraints := make([]string, 0)
	for _, perc := range percentiles {
		buf := &bytes.Buffer{}
		w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0)
		fmt.Fprintf(w, "container\tcpu(cores)\tmemory(MB)\n")
		for _, name := range sortedKeys {
			usage := stats[perc][name]
			fmt.Fprintf(w, "%q\t%.3f\t%.2f\n", name, usage.CPUUsageInCores, float64(usage.MemoryWorkingSetInBytes)/(1024*1024))
			// Verify the constraints against the 99th percentile of resource usage.
			if perc == 99 {
				// Name has the form: <pod_name>/<container_name>
				containerName := strings.Split(name, "/")[1]
				if constraint, ok := constraints[containerName]; ok {
					if usage.CPUUsageInCores > constraint.cpuConstraint {
						violatedConstraints = append(
							violatedConstraints,
							fmt.Sprintf("Container %v is using %v/%v CPU",
								name,
								usage.CPUUsageInCores,
								constraint.cpuConstraint,
							),
						)
					}
					if usage.MemoryWorkingSetInBytes > constraint.memoryConstraint {
						violatedConstraints = append(
							violatedConstraints,
							fmt.Sprintf("Container %v is using %v/%v MB of memory",
								name,
								float64(usage.MemoryWorkingSetInBytes)/(1024*1024),
								float64(constraint.memoryConstraint)/(1024*1024),
							),
						)
					}
				}
			}
		}
		w.Flush()
		Logf("%v percentile:\n%v", perc, buf.String())
	}
	Expect(violatedConstraints).To(BeEmpty())
}

// nodeProxyRequest performs a get on a node proxy endpoint given the nodename and rest client.
func nodeProxyRequest(c *client.Client, node, endpoint string) client.Result {
	return c.Get().
		Prefix("proxy").
		Resource("nodes").
		Name(fmt.Sprintf("%v:%v", node, ports.KubeletPort)).
		Suffix(endpoint).
		Do()
}

// getKubeletMetricsThroughProxy retrieves metrics from the kubelet server of the given node.
func getKubeletMetricsThroughProxy(c *client.Client, node string) (string, error) {
	metric, err := nodeProxyRequest(c, node, "metrics").Raw()
	if err != nil {
		return "", err
	}
	return string(metric), nil
}

// getKubeletMetricsThroughNode retrieves metrics from the kubelet on the given
// node using a simple GET over http. Currently only used in integration tests.
func getKubeletMetricsThroughNode(nodeName string) (string, error) {
	resp, err := http.Get(fmt.Sprintf("http://%v/metrics", nodeName))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// GetKubeletPods retrieves the list of running pods on the kubelet. The pods
// include necessary information (e.g., UID, name, namespace for
// pods/containers), but do not contain the full spec.
func GetKubeletPods(c *client.Client, node string) (*api.PodList, error) {
	result := &api.PodList{}
	if err := nodeProxyRequest(c, node, "runningpods").Into(result); err != nil {
		return &api.PodList{}, err
	}
	return result, nil
}

func computeContainerResourceUsage(name string, oldStats, newStats *cadvisorapi.ContainerStats) *containerResourceUsage {
	return &containerResourceUsage{
		Name:                    name,
		Timestamp:               newStats.Timestamp,
		CPUUsageInCores:         float64(newStats.Cpu.Usage.Total-oldStats.Cpu.Usage.Total) / float64(newStats.Timestamp.Sub(oldStats.Timestamp).Nanoseconds()),
		MemoryUsageInBytes:      int64(newStats.Memory.Usage),
		MemoryWorkingSetInBytes: int64(newStats.Memory.WorkingSet),
		CPUInterval:             newStats.Timestamp.Sub(oldStats.Timestamp),
	}
}
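
// The CPU figure above is (cumulative CPU ns consumed) / (wall-clock ns
// elapsed), i.e. the average number of cores in use over the interval.
// Worked example with illustrative numbers: 3e9 ns of CPU time consumed over
// a 10s (1e10 ns) window yields 3e9 / 1e10 = 0.3 cores.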

// resourceCollector periodically polls the node, collects stats for a given
// list of containers, and computes and caches resource usage up to
// maxEntriesPerContainer for each container.
type resourceCollector struct {
	lock            sync.RWMutex
	node            string
	containers      []string
	client          *client.Client
	buffers         map[string][]*containerResourceUsage
	pollingInterval time.Duration
	stopCh          chan struct{}
}

func newResourceCollector(c *client.Client, nodeName string, containerNames []string, pollingInterval time.Duration) *resourceCollector {
	buffers := make(map[string][]*containerResourceUsage)
	return &resourceCollector{
		node:            nodeName,
		containers:      containerNames,
		client:          c,
		buffers:         buffers,
		pollingInterval: pollingInterval,
	}
}

// Start starts a goroutine to poll the node every pollingInterval.
func (r *resourceCollector) Start() {
	r.stopCh = make(chan struct{}, 1)
	// Keep the last observed stats for comparison.
	oldStats := make(map[string]*cadvisorapi.ContainerStats)
	go util.Until(func() { r.collectStats(oldStats) }, r.pollingInterval, r.stopCh)
}

// Stop sends a signal to terminate the stats collecting goroutine.
func (r *resourceCollector) Stop() {
	close(r.stopCh)
}

// collectStats gets the latest stats from kubelet's /stats/container, computes
// the resource usage, and pushes it to the buffer.
func (r *resourceCollector) collectStats(oldStats map[string]*cadvisorapi.ContainerStats) {
	infos, err := getContainerInfo(r.client, r.node, &kubelet.StatsRequest{
		ContainerName: "/",
		NumStats:      1,
		Subcontainers: true,
	})
	if err != nil {
		Logf("Error getting container info on %q, err: %v", r.node, err)
		return
	}
	r.lock.Lock()
	defer r.lock.Unlock()
	for _, name := range r.containers {
		info, ok := infos[name]
		if !ok || len(info.Stats) < 1 {
			Logf("Missing info/stats for container %q on node %q", name, r.node)
			return
		}
		if _, ok := oldStats[name]; ok {
			r.buffers[name] = append(r.buffers[name], computeContainerResourceUsage(name, oldStats[name], info.Stats[0]))
		}
		oldStats[name] = info.Stats[0]
	}
}

// LogLatest logs the latest resource usage of each container.
func (r *resourceCollector) LogLatest() {
	r.lock.RLock()
	defer r.lock.RUnlock()
	stats := make(map[string]*containerResourceUsage)
	for _, name := range r.containers {
		contStats, ok := r.buffers[name]
		if !ok || len(contStats) == 0 {
			Logf("Resource usage on node %q is not ready yet", r.node)
			return
		}
		stats[name] = contStats[len(contStats)-1]
	}
	Logf("\n%s", formatResourceUsageStats(r.node, stats))
}

// Reset frees the stats and starts over.
func (r *resourceCollector) Reset() {
	r.lock.Lock()
	defer r.lock.Unlock()
	for _, name := range r.containers {
		r.buffers[name] = []*containerResourceUsage{}
	}
}

type resourceUsageByCPU []*containerResourceUsage

func (r resourceUsageByCPU) Len() int           { return len(r) }
func (r resourceUsageByCPU) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
func (r resourceUsageByCPU) Less(i, j int) bool { return r[i].CPUUsageInCores < r[j].CPUUsageInCores }

// The percentiles to report.
var percentiles = [...]float64{0.05, 0.20, 0.50, 0.70, 0.90, 0.95, 0.99}

// GetBasicCPUStats returns percentiles of the CPU usage in cores for
// containerName. This method examines all data currently in the buffer.
func (r *resourceCollector) GetBasicCPUStats(containerName string) map[float64]float64 {
	r.lock.RLock()
	defer r.lock.RUnlock()
	result := make(map[float64]float64, len(percentiles))
	usages := r.buffers[containerName]
	sort.Sort(resourceUsageByCPU(usages))
	for _, q := range percentiles {
		index := int(float64(len(usages))*q) - 1
		if index < 0 {
			// We don't have enough data.
			result[q] = 0
			continue
		}
		result[q] = usages[index].CPUUsageInCores
	}
	return result
}
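
// Index math above, with illustrative numbers: for 40 buffered samples and
// q = 0.95, index = int(40*0.95) - 1 = 37, i.e. the 38th smallest sample.
// With only 10 samples and q = 0.05, index = int(0.5) - 1 = -1, so the
// "not enough data" branch reports 0 for that percentile.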

// resourceMonitor manages a resourceCollector per node.
type resourceMonitor struct {
	client          *client.Client
	containers      []string
	pollingInterval time.Duration
	collectors      map[string]*resourceCollector
}

func newResourceMonitor(c *client.Client, containerNames []string, pollingInterval time.Duration) *resourceMonitor {
	return &resourceMonitor{
		containers:      containerNames,
		client:          c,
		pollingInterval: pollingInterval,
	}
}
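
// Hypothetical lifecycle sketch (mirrors how the methods below fit together;
// the 10s polling interval is an illustrative choice):
//
//	rm := newResourceMonitor(c, targetContainers(), 10*time.Second)
//	rm.Start()
//	// ... run the workload under test ...
//	rm.LogCPUSummary()
//	rm.Stop()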

func (r *resourceMonitor) Start() {
	nodes, err := r.client.Nodes().List(unversioned.ListOptions{})
	if err != nil {
		Failf("resourceMonitor: unable to get list of nodes: %v", err)
	}
	r.collectors = make(map[string]*resourceCollector)
	for _, node := range nodes.Items {
		collector := newResourceCollector(r.client, node.Name, r.containers, r.pollingInterval)
		r.collectors[node.Name] = collector
		collector.Start()
	}
}

func (r *resourceMonitor) Stop() {
	for _, collector := range r.collectors {
		collector.Stop()
	}
}

func (r *resourceMonitor) Reset() {
	for _, collector := range r.collectors {
		collector.Reset()
	}
}

func (r *resourceMonitor) LogLatest() {
	for _, collector := range r.collectors {
		collector.LogLatest()
	}
}

// containersCPUSummary is indexed by the container name with each entry a
// (percentile, value) map.
type containersCPUSummary map[string]map[float64]float64

// nodesCPUSummary is indexed by the node name with each entry a
// containersCPUSummary map.
type nodesCPUSummary map[string]containersCPUSummary

func (r *resourceMonitor) FormatCPUSummary(summary nodesCPUSummary) string {
	// Example output for a node (the percentiles may differ):
	// CPU usage of containers on node "e2e-test-foo-minion-0vj7":
	// container        5th%  50th% 90th% 95th%
	// "/"              0.051 0.159 0.387 0.455
	// "/docker-daemon" 0.000 0.000 0.146 0.166
	// "/kubelet"       0.036 0.053 0.091 0.154
	// "/system"        0.001 0.001 0.001 0.002
	var summaryStrings []string
	var header []string
	header = append(header, "container")
	for _, p := range percentiles {
		header = append(header, fmt.Sprintf("%.0fth%%", p*100))
	}
	for nodeName, containers := range summary {
		buf := &bytes.Buffer{}
		w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0)
		fmt.Fprintf(w, "%s\n", strings.Join(header, "\t"))
		for _, containerName := range targetContainers() {
			var s []string
			s = append(s, fmt.Sprintf("%q", containerName))
			data, ok := containers[containerName]
			for _, p := range percentiles {
				value := "N/A"
				if ok {
					value = fmt.Sprintf("%.3f", data[p])
				}
				s = append(s, value)
			}
			fmt.Fprintf(w, "%s\n", strings.Join(s, "\t"))
		}
		w.Flush()
		summaryStrings = append(summaryStrings, fmt.Sprintf("CPU usage of containers on node %q:\n%s", nodeName, buf.String()))
	}
	return strings.Join(summaryStrings, "\n")
}

func (r *resourceMonitor) LogCPUSummary() {
	summary := r.GetCPUSummary()
	// Use an explicit %s verb so any % characters in the formatted summary
	// are not misinterpreted as format directives.
	Logf("%s", r.FormatCPUSummary(summary))
}

func (r *resourceMonitor) GetCPUSummary() nodesCPUSummary {
	result := make(nodesCPUSummary)
	for nodeName, collector := range r.collectors {
		result[nodeName] = make(containersCPUSummary)
		for _, containerName := range targetContainers() {
			data := collector.GetBasicCPUStats(containerName)
			result[nodeName][containerName] = data
		}
	}
	return result
}