Add a flag that makes tests gather metrics from all running components after the test finishes.

pull/6/head
gmarek 2015-12-23 15:56:56 +01:00
parent c36226bc39
commit 2dcafa3854
8 changed files with 125 additions and 28 deletions
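
In short: passing --gather-metrics-at-teardown=true to the e2e test binary (the exact invocation depends on how the suite is driven) makes the framework's afterEach grab metrics from the apiserver and the kubelets and log the subset of samples listed in InterestingApiServerMetrics and InterestingKubeletMetrics.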

View File

@@ -113,6 +113,7 @@ from-literal
func-dest
fuzz-iters
gather-logs-sizes
gather-metrics-at-teardown
gather-resource-usage
gce-project
gce-service-account

View File

@@ -42,8 +42,6 @@ var KnownApiServerMetrics = map[string][]string{
"etcd_request_latencies_summary": {"operation", "type", "quantile"},
"etcd_request_latencies_summary_count": {"operation", "type"},
"etcd_request_latencies_summary_sum": {"operation", "type"},
"get_token_count": {},
"get_token_fail_count": {},
"rest_client_request_latency_microseconds": {"url", "verb", "quantile"},
"rest_client_request_latency_microseconds_count": {"url", "verb"},
"rest_client_request_latency_microseconds_sum": {"url", "verb"},

View File

@@ -29,38 +29,55 @@ import (
)
var CommonMetrics = map[string][]string{
"process_start_time_seconds": {},
"process_resident_memory_bytes": {},
"process_virtual_memory_bytes": {},
"process_cpu_seconds_total": {},
"process_max_fds": {},
"process_open_fds": {},
"http_request_size_bytes": {"handler", "quantile"},
"http_request_size_bytes_count": {"handler"},
"http_request_size_bytes_sum": {"handler"},
"http_request_duration_microseconds": {"handler", "quantile"},
"http_request_duration_microseconds_count": {"handler"},
"http_request_duration_microseconds_sum": {"handler"},
"http_requests_total": {"handler", "method", "code"},
"http_response_size_bytes": {"handler", "quantile"},
"http_response_size_bytes_count": {"handler"},
"http_response_size_bytes_sum": {"handler"},
"ssh_tunnel_open_fail_count": {},
"ssh_tunnel_open_count": {},
"get_token_count": {},
"get_token_fail_count": {},
"go_gc_duration_seconds": {"quantile"},
"go_gc_duration_seconds_count": {},
"go_gc_duration_seconds_sum": {},
"go_goroutines": {},
"http_request_duration_microseconds": {"handler", "quantile"},
"http_request_duration_microseconds_count": {"handler"},
"http_request_duration_microseconds_sum": {"handler"},
"http_request_size_bytes": {"handler", "quantile"},
"http_request_size_bytes_count": {"handler"},
"http_request_size_bytes_sum": {"handler"},
"http_requests_total": {"handler", "method", "code"},
"http_response_size_bytes": {"handler", "quantile"},
"http_response_size_bytes_count": {"handler"},
"http_response_size_bytes_sum": {"handler"},
"kubernetes_build_info": {"major", "minor", "gitCommit", "gitTreeState", "gitVersion"},
"process_cpu_seconds_total": {},
"process_max_fds": {},
"process_open_fds": {},
"process_resident_memory_bytes": {},
"process_start_time_seconds": {},
"process_virtual_memory_bytes": {},
"ssh_tunnel_open_count": {},
"ssh_tunnel_open_fail_count": {},
}
type Metrics map[string]model.Samples
func PrintSample(sample *model.Sample) string {
buf := make([]string, 0)
// Id is a VERY special label. For 'normal' containers it's useless, but it's necessary
// for 'system' containers (e.g. /docker-daemon, /kubelet, etc.). We can tell which case
// we're dealing with by checking whether the "kubernetes_container_name" label is
// present. It's hacky, but it works...
_, normalContainer := sample.Metric["kubernetes_container_name"]
for k, v := range sample.Metric {
if strings.HasPrefix(string(k), "__") || KubeletMetricsLabelsToSkip.Has(string(k)) {
continue
}
if string(k) == "id" && normalContainer {
continue
}
buf = append(buf, fmt.Sprintf("%v=%v", string(k), v))
}
return fmt.Sprintf("[%v] = %v", strings.Join(buf, ","), sample.Value)
}
func NewMetrics() Metrics {
result := make(Metrics)
for metric := range CommonMetrics {

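For illustration, a minimal standalone sketch of what PrintSample produces for a 'normal' container sample. The label names and values below are invented, and the import paths assume the vendored Prometheus common/model package:

package main

import (
	"fmt"

	"github.com/prometheus/common/model"

	"k8s.io/kubernetes/pkg/metrics"
)

func main() {
	// Hypothetical sample for a "normal" (non-system) container; every label
	// value is made up for illustration.
	sample := &model.Sample{
		Metric: model.Metric{
			"__name__":                  "container_cpu_user_seconds_total", // dropped: "__" prefix
			"id":                        "/docker/abc123",                   // dropped: normal container
			"image":                     "nginx",                            // dropped: in KubeletMetricsLabelsToSkip
			"kubernetes_container_name": "nginx",
			"kubernetes_pod_name":       "nginx-xyz",
		},
		Value: 12.5,
	}
	// Prints something like (Go map iteration order is random, so label order varies):
	//   [kubernetes_container_name=nginx,kubernetes_pod_name=nginx-xyz] = 12.5
	fmt.Println(metrics.PrintSample(sample))
}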
View File

@@ -61,8 +61,6 @@ var KnownKubeletMetrics = map[string][]string{
"container_spec_memory_swap_limit_bytes": {"id", "image", "kubernetes_container_name", "kubernetes_namespace", "kubernetes_pod_name", "name"},
"container_start_time_seconds": {"id", "image", "kubernetes_container_name", "kubernetes_namespace", "kubernetes_pod_name", "name"},
"container_tasks_state": {"id", "image", "kubernetes_container_name", "kubernetes_namespace", "kubernetes_pod_name", "name", "state"},
"get_token_count": {},
"get_token_fail_count": {},
"kubelet_container_manager_latency_microseconds": {"operation_type", "quantile"},
"kubelet_container_manager_latency_microseconds_count": {"operation_type"},
"kubelet_container_manager_latency_microseconds_sum": {"operation_type"},
@@ -98,6 +96,12 @@ var KnownKubeletMetrics = map[string][]string{
"rest_client_request_status_codes": {"code", "host", "method"},
}
var KubeletMetricsLabelsToSkip = sets.NewString(
"kubernetes_namespace",
"image",
"name",
)
type KubeletMetrics Metrics
func NewKubeletMetrics() KubeletMetrics {

View File

@@ -90,6 +90,7 @@ func init() {
flag.BoolVar(&testContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. This serves to cleanup test namespaces from failed/interrupted e2e runs in a long-lived cluster.")
flag.BoolVar(&testContext.GatherKubeSystemResourceUsageData, "gather-resource-usage", false, "If set to true framework will be monitoring resource usage of system add-ons in (some) e2e tests.")
flag.BoolVar(&testContext.GatherLogsSizes, "gather-logs-sizes", false, "If set to true framework will be monitoring logs sizes on all machines running e2e tests.")
flag.BoolVar(&testContext.GatherMetricsAfterTest, "gather-metrics-at-teardown", false, "If set to true framework will gather metrics from all components after each test.")
}
func TestE2E(t *testing.T) {

View File

@@ -26,6 +26,7 @@ import (
"k8s.io/kubernetes/pkg/api"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/metrics"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
@@ -152,6 +153,38 @@ func (f *Framework) afterEach() {
close(f.logsSizeCloseChannel)
f.logsSizeWaitGroup.Wait()
}
if testContext.GatherMetricsAfterTest {
// TODO: enable Scheduler and ControllerManager metrics grabbing once the master's Kubelet is registered.
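// Note: the four boolean arguments appear to select which components are scraped. The
// two 'false' values keep the scheduler and controller-manager disabled (per the TODO
// above), while kubelet and apiserver metrics are collected and read back below via
// received.KubeletMetrics and received.ApiServerMetrics.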
grabber, err := metrics.NewMetricsGrabber(f.Client, true, false, false, true)
if err != nil {
Logf("Failed to create MetricsGrabber. Skipping metrics gathering.")
} else {
received, err := grabber.Grab(nil)
if err != nil {
Logf("MetricsGrabber failed to grab metrics. Skipping metrics gathering.")
} else {
buf := bytes.Buffer{}
for interestingMetric := range InterestingApiServerMetrics {
buf.WriteString(fmt.Sprintf("For %v:\n", interestingMetric))
for _, sample := range received.ApiServerMetrics[interestingMetric] {
buf.WriteString(fmt.Sprintf("\t%v\n", metrics.PrintSample(sample)))
}
}
for kubelet, grabbed := range received.KubeletMetrics {
buf.WriteString(fmt.Sprintf("For %v:\n", kubelet))
for interestingMetric := range InterestingKubeletMetrics {
buf.WriteString(fmt.Sprintf("\tFor %v:\n", interestingMetric))
for _, sample := range grabbed[interestingMetric] {
buf.WriteString(fmt.Sprintf("\t\t%v\n", metrics.PrintSample(sample)))
}
}
}
Logf("%v", buf.String())
}
}
}
// Paranoia-- prevent reuse!
f.Namespace = nil
f.Client = nil

View File

@@ -46,6 +46,48 @@ const (
apiCallLatencyLargeThreshold time.Duration = 1 * time.Second
)
var InterestingApiServerMetrics = sets.NewString(
"apiserver_request_count",
"apiserver_request_latencies_bucket",
"etcd_helper_cache_entry_count",
"etcd_helper_cache_hit_count",
"etcd_helper_cache_miss_count",
"etcd_request_cache_add_latencies_summary",
"etcd_request_cache_get_latencies_summary",
"etcd_request_latencies_summary",
"go_gc_duration_seconds",
"go_goroutines",
"process_cpu_seconds_total",
"process_open_fds",
"process_resident_memory_bytes",
"process_start_time_seconds",
"process_virtual_memory_bytes",
)
var InterestingKubeletMetrics = sets.NewString(
"container_cpu_system_seconds_total",
"container_cpu_user_seconds_total",
"container_fs_io_time_weighted_seconds_total",
"container_memory_usage_bytes",
"container_spec_cpu_shares",
"container_start_time_seconds",
"go_gc_duration_seconds",
"go_goroutines",
"kubelet_container_manager_latency_microseconds",
"kubelet_docker_errors",
"kubelet_docker_operations_latency_microseconds",
"kubelet_generate_pod_status_latency_microseconds",
"kubelet_pod_start_latency_microseconds",
"kubelet_pod_worker_latency_microseconds",
"kubelet_pod_worker_start_latency_microseconds",
"kubelet_sync_pods_latency_microseconds",
"process_cpu_seconds_total",
"process_open_fds",
"process_resident_memory_bytes",
"process_start_time_seconds",
"process_virtual_memory_bytes",
)
// Dashboard metrics
type LatencyMetric struct {
Perc50 time.Duration `json:"Perc50"`

View File

@@ -153,6 +153,7 @@ type TestContextType struct {
// It will read the data every 30 seconds from all Nodes and print summary during afterEach.
GatherKubeSystemResourceUsageData bool
GatherLogsSizes bool
GatherMetricsAfterTest bool
}
var testContext TestContextType