Merge pull request #53446 from sjenning/network-plugin-metrics

Automatic merge from submit-queue (batch tested with PRs 53454, 53446, 52935, 53443, 52917). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

kubelet: add latency metrics to network plugin manager

This PR adds latency metrics to the network plugin operations, namely `GetPodNetworkStatus()`, `SetUpPod()`, and `TearDownPod()`.

I recently had to debug an issue where a PLEG relist hang was occurring due to a hang in a CNI plugin and it would have been really nice to have these.  Between these new metrics and `docker_operations_latency_microseconds`, we will be able to account for nearly all the time-consuming routines in the PLEG relist.

@derekwaynecarr @smarterclayton @eparis @vishh 

```release-note
Metrics were added to network plugin to report latency of CNI operations
```
/sig node
pull/6/head
Kubernetes Submit Queue 2017-10-05 05:06:25 -07:00 committed by GitHub
commit eaaa93c70c
4 changed files with 96 additions and 0 deletions

View File

@ -15,6 +15,7 @@ go_library(
"//pkg/kubelet/apis/kubeletconfig:go_default_library",
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/network/hostport:go_default_library",
"//pkg/kubelet/network/metrics:go_default_library",
"//pkg/util/sysctl:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
@ -42,6 +43,7 @@ filegroup(
"//pkg/kubelet/network/hairpin:all-srcs",
"//pkg/kubelet/network/hostport:all-srcs",
"//pkg/kubelet/network/kubenet:all-srcs",
"//pkg/kubelet/network/metrics:all-srcs",
"//pkg/kubelet/network/testing:all-srcs",
],
tags = ["automanaged"],

View File

@ -0,0 +1,22 @@
# Bazel build definitions for this package.
load("@io_bazel_rules_go//go:def.bzl", "go_library")
# Go library built from metrics.go; publicly visible and depending on the
# vendored Prometheus client library.
go_library(
name = "go_default_library",
srcs = ["metrics.go"],
visibility = ["//visibility:public"],
deps = ["//vendor/github.com/prometheus/client_golang/prometheus:go_default_library"],
)
# Every file under this package (recursive glob); private to this package.
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
# Public wrapper around :package-srcs so other packages can reference
# these sources.
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -0,0 +1,61 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
)
// Exported metric names for network plugin operations.
const (
	// NetworkPluginOperationsKey names the operation count metric.
	NetworkPluginOperationsKey = "network_plugin_operations"

	// NetworkPluginOperationsLatencyKey names the operation latency metric.
	NetworkPluginOperationsLatencyKey = "network_plugin_operations_latency_microseconds"
)

// kubeletSubsystem is the subsystem the metrics are reported under; the
// "kubelet" value is kept for backward compatibility.
const kubeletSubsystem = "kubelet"
// NetworkPluginOperationsLatency is a summary vector of network plugin
// operation latencies, in microseconds, labelled by "operation_type". It is
// declared under the kubelet subsystem with the name given by
// NetworkPluginOperationsLatencyKey.
var NetworkPluginOperationsLatency = prometheus.NewSummaryVec(
	prometheus.SummaryOpts{
		Subsystem: kubeletSubsystem,
		Name:      NetworkPluginOperationsLatencyKey,
		Help:      "Latency in microseconds of network plugin operations. Broken down by operation type.",
	},
	[]string{"operation_type"},
)
// registerMetrics ensures the registration in Register runs at most once.
var registerMetrics sync.Once

// Register registers the metrics defined in this package with Prometheus.
// It may be called any number of times; only the first call performs the
// registration.
func Register() {
	registerAll := func() {
		prometheus.MustRegister(NetworkPluginOperationsLatency)
	}
	registerMetrics.Do(registerAll)
}
// SinceInMicroseconds reports how much time has passed since start, expressed
// as a whole number of microseconds (any sub-microsecond remainder is
// discarded by the integer division before conversion to float64).
func SinceInMicroseconds(start time.Time) float64 {
	elapsed := time.Since(start)
	return float64(elapsed / time.Microsecond)
}

View File

@ -21,6 +21,7 @@ import (
"net"
"strings"
"sync"
"time"
"github.com/golang/glog"
"k8s.io/api/core/v1"
@ -32,6 +33,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/network/hostport"
"k8s.io/kubernetes/pkg/kubelet/network/metrics"
utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
utilexec "k8s.io/utils/exec"
)
@ -304,6 +306,7 @@ type PluginManager struct {
}
func NewPluginManager(plugin NetworkPlugin) *PluginManager {
metrics.Register()
return &PluginManager{
plugin: plugin,
pods: make(map[string]*podLock),
@ -371,7 +374,13 @@ func (pm *PluginManager) podUnlock(fullPodName string) {
}
}
// recordOperation observes the time elapsed since start, in microseconds, on
// the network plugin latency metric under the given operation label. It is
// meant to be deferred at the top of an operation with time.Now() as start.
func recordOperation(operation string, start time.Time) {
	observer := metrics.NetworkPluginOperationsLatency.WithLabelValues(operation)
	observer.Observe(metrics.SinceInMicroseconds(start))
}
func (pm *PluginManager) GetPodNetworkStatus(podNamespace, podName string, id kubecontainer.ContainerID) (*PodNetworkStatus, error) {
defer recordOperation("get_pod_network_status", time.Now())
fullPodName := kubecontainer.BuildPodFullName(podName, podNamespace)
pm.podLock(fullPodName).Lock()
defer pm.podUnlock(fullPodName)
@ -385,6 +394,7 @@ func (pm *PluginManager) GetPodNetworkStatus(podNamespace, podName string, id ku
}
func (pm *PluginManager) SetUpPod(podNamespace, podName string, id kubecontainer.ContainerID, annotations map[string]string) error {
defer recordOperation("set_up_pod", time.Now())
fullPodName := kubecontainer.BuildPodFullName(podName, podNamespace)
pm.podLock(fullPodName).Lock()
defer pm.podUnlock(fullPodName)
@ -398,6 +408,7 @@ func (pm *PluginManager) SetUpPod(podNamespace, podName string, id kubecontainer
}
func (pm *PluginManager) TearDownPod(podNamespace, podName string, id kubecontainer.ContainerID) error {
defer recordOperation("tear_down_pod", time.Now())
fullPodName := kubecontainer.BuildPodFullName(podName, podNamespace)
pm.podLock(fullPodName).Lock()
defer pm.podUnlock(fullPodName)