/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"math"
	"sort"
	"strconv"
	"strings"
	"time"

	"k8s.io/kubernetes/pkg/api"
	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/master/ports"
	"k8s.io/kubernetes/pkg/metrics"
	"k8s.io/kubernetes/pkg/util/sets"

	"github.com/prometheus/common/expfmt"
	"github.com/prometheus/common/model"
)

const (
	// NodeStartupThreshold is a rough estimate of the time allocated for a pod to start on a node.
	NodeStartupThreshold = 4 * time.Second

	podStartupThreshold time.Duration = 5 * time.Second

	listPodLatencySmallThreshold  time.Duration = 1 * time.Second
	listPodLatencyMediumThreshold time.Duration = 1 * time.Second
	listPodLatencyLargeThreshold  time.Duration = 1 * time.Second

	// TODO: Decrease the small threshold to 250ms once tests are fixed.
	apiCallLatencySmallThreshold  time.Duration = 500 * time.Millisecond
	apiCallLatencyMediumThreshold time.Duration = 500 * time.Millisecond
	apiCallLatencyLargeThreshold  time.Duration = 1 * time.Second
)

type MetricsForE2E metrics.MetricsCollection

func (m *MetricsForE2E) filterMetrics() {
	interestingApiServerMetrics := make(metrics.ApiServerMetrics)
	for _, metric := range InterestingApiServerMetrics {
		interestingApiServerMetrics[metric] = (*m).ApiServerMetrics[metric]
	}
	interestingKubeletMetrics := make(map[string]metrics.KubeletMetrics)
	for kubelet, grabbed := range (*m).KubeletMetrics {
		interestingKubeletMetrics[kubelet] = make(metrics.KubeletMetrics)
		for _, metric := range InterestingKubeletMetrics {
			interestingKubeletMetrics[kubelet][metric] = grabbed[metric]
		}
	}
	(*m).ApiServerMetrics = interestingApiServerMetrics
	(*m).KubeletMetrics = interestingKubeletMetrics
}

func (m *MetricsForE2E) PrintHumanReadable() string {
	buf := bytes.Buffer{}
	for _, interestingMetric := range InterestingApiServerMetrics {
		buf.WriteString(fmt.Sprintf("For %v:\n", interestingMetric))
		for _, sample := range (*m).ApiServerMetrics[interestingMetric] {
			buf.WriteString(fmt.Sprintf("\t%v\n", metrics.PrintSample(sample)))
		}
	}
	for kubelet, grabbed := range (*m).KubeletMetrics {
		buf.WriteString(fmt.Sprintf("For %v:\n", kubelet))
		for _, interestingMetric := range InterestingKubeletMetrics {
			buf.WriteString(fmt.Sprintf("\tFor %v:\n", interestingMetric))
			for _, sample := range grabbed[interestingMetric] {
				buf.WriteString(fmt.Sprintf("\t\t%v\n", metrics.PrintSample(sample)))
			}
		}
	}
	return buf.String()
}

func (m *MetricsForE2E) PrintJSON() string {
	m.filterMetrics()
	return PrettyPrintJSON(*m)
}

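// logInterestingMetricsExample is an illustrative sketch only (not called by
// any test, and the name is hypothetical): it assumes a metrics.MetricsCollection
// has already been grabbed elsewhere in the framework and shows how the helpers
// above narrow it down and render it.
func logInterestingMetricsExample(collection metrics.MetricsCollection) {
	received := (MetricsForE2E)(collection)
	// PrintHumanReadable only walks the Interesting* metric names, and
	// PrintJSON filters the collection itself before serializing it.
	Logf("E2E metrics:\n%s", received.PrintHumanReadable())
	Logf("E2E metrics (JSON):\n%s", received.PrintJSON())
}
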
var InterestingApiServerMetrics = []string{
	"apiserver_request_count",
	"apiserver_request_latencies_summary",
	"etcd_helper_cache_entry_count",
	"etcd_helper_cache_hit_count",
	"etcd_helper_cache_miss_count",
	"etcd_request_cache_add_latencies_summary",
	"etcd_request_cache_get_latencies_summary",
	"etcd_request_latencies_summary",
}

var InterestingKubeletMetrics = []string{
	"kubelet_container_manager_latency_microseconds",
	"kubelet_docker_errors",
	"kubelet_docker_operations_latency_microseconds",
	"kubelet_generate_pod_status_latency_microseconds",
	"kubelet_pod_start_latency_microseconds",
	"kubelet_pod_worker_latency_microseconds",
	"kubelet_pod_worker_start_latency_microseconds",
	"kubelet_sync_pods_latency_microseconds",
}

// Dashboard metrics
type LatencyMetric struct {
	Perc50 time.Duration `json:"Perc50"`
	Perc90 time.Duration `json:"Perc90"`
	Perc99 time.Duration `json:"Perc99"`
}

type PodStartupLatency struct {
	Latency LatencyMetric `json:"latency"`
}

type SchedulingLatency struct {
	Scheduling LatencyMetric `json:"scheduling"`
	Binding    LatencyMetric `json:"binding"`
	Total      LatencyMetric `json:"total"`
}

type SaturationTime struct {
	TimeToSaturate time.Duration `json:"timeToStaturate"`
	NumberOfNodes  int           `json:"numberOfNodes"`
	NumberOfPods   int           `json:"numberOfPods"`
	Throughput     float32       `json:"throughput"`
}

type APICall struct {
	Resource string        `json:"resource"`
	Verb     string        `json:"verb"`
	Latency  LatencyMetric `json:"latency"`
}

type APIResponsiveness struct {
	APICalls []APICall `json:"apicalls"`
}

func (a APIResponsiveness) Len() int      { return len(a.APICalls) }
func (a APIResponsiveness) Swap(i, j int) { a.APICalls[i], a.APICalls[j] = a.APICalls[j], a.APICalls[i] }
func (a APIResponsiveness) Less(i, j int) bool {
	return a.APICalls[i].Latency.Perc99 < a.APICalls[j].Latency.Perc99
}

// 0 <= quantile <= 1 (e.g. 0.95 is 95%tile, 0.5 is median)
// Only 0.5, 0.9 and 0.99 quantiles are supported.
func (a *APIResponsiveness) addMetric(resource, verb string, quantile float64, latency time.Duration) {
	for i, apicall := range a.APICalls {
		if apicall.Resource == resource && apicall.Verb == verb {
			a.APICalls[i] = setQuantileAPICall(apicall, quantile, latency)
			return
		}
	}
	apicall := setQuantileAPICall(APICall{Resource: resource, Verb: verb}, quantile, latency)
	a.APICalls = append(a.APICalls, apicall)
}

// 0 <= quantile <= 1 (e.g. 0.95 is 95%tile, 0.5 is median)
// Only 0.5, 0.9 and 0.99 quantiles are supported.
func setQuantileAPICall(apicall APICall, quantile float64, latency time.Duration) APICall {
	setQuantile(&apicall.Latency, quantile, latency)
	return apicall
}

// Only 0.5, 0.9 and 0.99 quantiles are supported.
func setQuantile(metric *LatencyMetric, quantile float64, latency time.Duration) {
	switch quantile {
	case 0.5:
		metric.Perc50 = latency
	case 0.9:
		metric.Perc90 = latency
	case 0.99:
		metric.Perc99 = latency
	}
}

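// Worked example (illustrative): a summary sample such as
//
//	apiserver_request_latencies_summary{resource="pods",verb="LIST",quantile="0.99"} 1250000
//
// is recorded (the value is in microseconds) via
//
//	a.addMetric("pods", "LIST", 0.99, 1250000*time.Microsecond)
//
// which either updates the existing {pods, LIST} entry or appends a new one
// whose Latency.Perc99 becomes 1.25s; quantiles other than 0.5, 0.9 and 0.99
// are silently dropped by setQuantile.
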
func readLatencyMetrics(c *client.Client) (APIResponsiveness, error) {
	var a APIResponsiveness

	body, err := getMetrics(c)
	if err != nil {
		return a, err
	}

	samples, err := extractMetricSamples(body)
	if err != nil {
		return a, err
	}

	ignoredResources := sets.NewString("events")
	// TODO: figure out why we're getting non-capitalized proxy and fix this.
	ignoredVerbs := sets.NewString("WATCHLIST", "PROXY", "proxy", "CONNECT")

	for _, sample := range samples {
		// Example line:
		// apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908
		if sample.Metric[model.MetricNameLabel] != "apiserver_request_latencies_summary" {
			continue
		}

		resource := string(sample.Metric["resource"])
		verb := string(sample.Metric["verb"])
		if ignoredResources.Has(resource) || ignoredVerbs.Has(verb) {
			continue
		}
		latency := sample.Value
		quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
		if err != nil {
			return a, err
		}
		a.addMetric(resource, verb, quantile, time.Duration(int64(latency))*time.Microsecond)
	}

	return a, err
}

// Returns threshold for API call depending on the size of the cluster.
// In general our goal is 1s, but for smaller clusters, we want to enforce
// smaller limits, to allow noticing regressions.
func apiCallLatencyThreshold(numNodes int) time.Duration {
	if numNodes <= 250 {
		return apiCallLatencySmallThreshold
	}
	if numNodes <= 500 {
		return apiCallLatencyMediumThreshold
	}
	return apiCallLatencyLargeThreshold
}

func listPodsLatencyThreshold(numNodes int) time.Duration {
	if numNodes <= 250 {
		return listPodLatencySmallThreshold
	}
	if numNodes <= 500 {
		return listPodLatencyMediumThreshold
	}
	return listPodLatencyLargeThreshold
}

// Prints top five summary metrics for request types with latency and returns
// number of such request types above threshold.
func HighLatencyRequests(c *client.Client) (int, error) {
	nodes, err := c.Nodes().List(api.ListOptions{})
	if err != nil {
		return 0, err
	}
	numNodes := len(nodes.Items)
	metrics, err := readLatencyMetrics(c)
	if err != nil {
		return 0, err
	}
	sort.Sort(sort.Reverse(metrics))
	badMetrics := 0
	top := 5
	for _, metric := range metrics.APICalls {
		threshold := apiCallLatencyThreshold(numNodes)
		if metric.Verb == "LIST" && metric.Resource == "pods" {
			threshold = listPodsLatencyThreshold(numNodes)
		}

		isBad := false
		if metric.Latency.Perc99 > threshold {
			badMetrics++
			isBad = true
		}
		if top > 0 || isBad {
			top--
			prefix := ""
			if isBad {
				prefix = "WARNING "
			}
			Logf("%vTop latency metric: %+v", prefix, metric)
		}
	}

	// TODO(random-liu): Remove the log when we migrate to new perfdash
	Logf("API calls latencies: %s", PrettyPrintJSON(metrics))
	// Log perf data
	PrintPerfData(ApiCallToPerfData(metrics))

	return badMetrics, nil
}

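// Typical call site (illustrative sketch): a performance test treats a
// non-zero return value as a latency regression, roughly
//
//	highLatencyRequests, err := HighLatencyRequests(c)
//	ExpectNoError(err)
//	if highLatencyRequests > 0 {
//		Failf("There should be no high-latency requests, found %d", highLatencyRequests)
//	}
//
// The exact assertion style varies between tests; this only shows the shape of
// the check.
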
// Verifies whether 50, 90 and 99th percentiles of PodStartupLatency are
// within the threshold.
func VerifyPodStartupLatency(latency PodStartupLatency) error {
	Logf("Pod startup latency: %s", PrettyPrintJSON(latency))

	if latency.Latency.Perc50 > podStartupThreshold {
		return fmt.Errorf("too high pod startup latency 50th percentile: %v", latency.Latency.Perc50)
	}
	if latency.Latency.Perc90 > podStartupThreshold {
		return fmt.Errorf("too high pod startup latency 90th percentile: %v", latency.Latency.Perc90)
	}
	if latency.Latency.Perc99 > podStartupThreshold {
		return fmt.Errorf("too high pod startup latency 99th percentile: %v", latency.Latency.Perc99)
	}
	return nil
}

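// Illustrative usage (hedged sketch): given measured per-pod startup samples,
// the percentile extraction defined later in this file feeds directly into
// this check:
//
//	sort.Sort(LatencySlice(e2eLag)) // e2eLag is an assumed []PodLatencyData
//	podStartupLatency := PodStartupLatency{Latency: ExtractLatencyMetrics(e2eLag)}
//	ExpectNoError(VerifyPodStartupLatency(podStartupLatency))
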
// Resets latency metrics in apiserver.
func ResetMetrics(c *client.Client) error {
	Logf("Resetting latency metrics in apiserver...")
	body, err := c.Get().AbsPath("/resetMetrics").DoRaw()
	if err != nil {
		return err
	}
	if string(body) != "metrics reset\n" {
		return fmt.Errorf("Unexpected response: %q", string(body))
	}
	return nil
}

// Retrieves metrics information.
func getMetrics(c *client.Client) (string, error) {
	body, err := c.Get().AbsPath("/metrics").DoRaw()
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// Retrieves scheduler metrics information.
func getSchedulingLatency(c *client.Client) (SchedulingLatency, error) {
	result := SchedulingLatency{}

	// Check if master Node is registered
	nodes, err := c.Nodes().List(api.ListOptions{})
	ExpectNoError(err)

	var data string
	var masterRegistered = false
	for _, node := range nodes.Items {
		if strings.HasSuffix(node.Name, "master") {
			masterRegistered = true
		}
	}
	if masterRegistered {
		rawData, err := c.Get().
			Prefix("proxy").
			Namespace(api.NamespaceSystem).
			Resource("pods").
			Name(fmt.Sprintf("kube-scheduler-%v:%v", TestContext.CloudConfig.MasterName, ports.SchedulerPort)).
			Suffix("metrics").
			Do().Raw()

		ExpectNoError(err)
		data = string(rawData)
	} else {
		// If master is not registered fall back to old method of using SSH.
		cmd := "curl http://localhost:10251/metrics"
		sshResult, err := SSH(cmd, GetMasterHost()+":22", TestContext.Provider)
		if err != nil || sshResult.Code != 0 {
			return result, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
		}
		data = sshResult.Stdout
	}
	samples, err := extractMetricSamples(data)
	if err != nil {
		return result, err
	}

	for _, sample := range samples {
		var metric *LatencyMetric = nil
		switch sample.Metric[model.MetricNameLabel] {
		case "scheduler_scheduling_algorithm_latency_microseconds":
			metric = &result.Scheduling
		case "scheduler_binding_latency_microseconds":
			metric = &result.Binding
		case "scheduler_e2e_scheduling_latency_microseconds":
			metric = &result.Total
		}
		if metric == nil {
			continue
		}

		latency := sample.Value
		quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
		if err != nil {
			return result, err
		}
		setQuantile(metric, quantile, time.Duration(int64(latency))*time.Microsecond)
	}
	return result, nil
}

// Verifies (currently just by logging them) the scheduling latencies.
func VerifySchedulerLatency(c *client.Client) error {
	latency, err := getSchedulingLatency(c)
	if err != nil {
		return err
	}
	Logf("Scheduling latency: %s", PrettyPrintJSON(latency))

	// TODO: Add some reasonable checks once we know more about the values.
	return nil
}

func PrettyPrintJSON(metrics interface{}) string {
	output := &bytes.Buffer{}
	if err := json.NewEncoder(output).Encode(metrics); err != nil {
		Logf("Error building encoder: %v", err)
		return ""
	}
	formatted := &bytes.Buffer{}
	if err := json.Indent(formatted, output.Bytes(), "", " "); err != nil {
		Logf("Error indenting: %v", err)
		return ""
	}
	return string(formatted.Bytes())
}

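// For example (illustrative): since time.Duration serializes as its nanosecond
// count, PrettyPrintJSON(LatencyMetric{Perc50: time.Second}) yields an indented
// JSON object along the lines of {"Perc50": 1000000000, "Perc90": 0, "Perc99": 0}.
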
// extractMetricSamples parses the prometheus metric samples from the input string.
func extractMetricSamples(metricsBlob string) ([]*model.Sample, error) {
	dec, err := expfmt.NewDecoder(strings.NewReader(metricsBlob), expfmt.FmtText)
	if err != nil {
		return nil, err
	}
	decoder := expfmt.SampleDecoder{
		Dec:  dec,
		Opts: &expfmt.DecodeOptions{},
	}

	var samples []*model.Sample
	for {
		var v model.Vector
		if err = decoder.Decode(&v); err != nil {
			if err == io.EOF {
				// Expected loop termination condition.
				return samples, nil
			}
			return nil, err
		}
		samples = append(samples, v...)
	}
}

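// For illustration: a text-format blob such as
//
//	apiserver_request_count{verb="GET"} 4
//	apiserver_request_count{verb="LIST"} 2
//
// decodes into two model.Sample values; each carries the metric name under the
// __name__ label plus the verb label, and Values of 4 and 2 respectively.
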
// PodLatencyData encapsulates pod startup latency information.
type PodLatencyData struct {
	// Name of the pod
	Name string
	// Node this pod was running on
	Node string
	// Latency information related to pod startup time
	Latency time.Duration
}

type LatencySlice []PodLatencyData

func (a LatencySlice) Len() int           { return len(a) }
func (a LatencySlice) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a LatencySlice) Less(i, j int) bool { return a[i].Latency < a[j].Latency }

func ExtractLatencyMetrics(latencies []PodLatencyData) LatencyMetric {
	length := len(latencies)
	perc50 := latencies[int(math.Ceil(float64(length*50)/100))-1].Latency
	perc90 := latencies[int(math.Ceil(float64(length*90)/100))-1].Latency
	perc99 := latencies[int(math.Ceil(float64(length*99)/100))-1].Latency
	return LatencyMetric{Perc50: perc50, Perc90: perc90, Perc99: perc99}
}

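// Note (illustrative): the caller is expected to pass latencies already sorted
// in ascending order (e.g. via sort.Sort(LatencySlice(latencies))). For 10
// samples, the indices picked above are ceil(10*50/100)-1 = 4 for Perc50,
// ceil(10*90/100)-1 = 8 for Perc90, and ceil(10*99/100)-1 = 9 for Perc99.
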
// LogSuspiciousLatency logs metrics/docker errors from all nodes that had slow startup times.
// If latencyDataLag is nil then it will be populated from latencyData.
func LogSuspiciousLatency(latencyData []PodLatencyData, latencyDataLag []PodLatencyData, nodeCount int, c *client.Client) {
	if latencyDataLag == nil {
		latencyDataLag = latencyData
	}
	for _, l := range latencyData {
		if l.Latency > NodeStartupThreshold {
			HighLatencyKubeletOperations(c, 1*time.Second, l.Node)
		}
	}
	Logf("Approx throughput: %v pods/min",
		float64(nodeCount)/(latencyDataLag[len(latencyDataLag)-1].Latency.Minutes()))
}

// testMaximumLatencyValue verifies the highest latency value is less than or equal to
// the given time.Duration. Since the arrays are sorted we are looking at the last
// element which will always be the highest. If the latency is higher than the max, Failf
// is called.
func testMaximumLatencyValue(latencies []PodLatencyData, max time.Duration, name string) {
	highestLatency := latencies[len(latencies)-1]
	if !(highestLatency.Latency <= max) {
		Failf("%s were not all under %s: %#v", name, max.String(), latencies)
	}
}

func PrintLatencies(latencies []PodLatencyData, header string) {
	metrics := ExtractLatencyMetrics(latencies)
	Logf("10%% %s: %v", header, latencies[(len(latencies)*9)/10:])
	Logf("perc50: %v, perc90: %v, perc99: %v", metrics.Perc50, metrics.Perc90, metrics.Perc99)
}