add dynamic config metrics

This PR exports config-related metrics from the Kubelet.
The Gauges for active, assigned, and last-known-good config can be used
to identify config versions and produce aggregate counts across several
nodes. The error-reporting Gauge can be used to determine whether a node
is experiencing a config-related error, and to produce an aggregate
count of nodes in an error state.
pull/8/head
Michael Taufen 2018-05-10 15:04:23 -07:00
parent 10b8665a1c
commit fd3432ef05
7 changed files with 298 additions and 4 deletions

View File

@ -11,6 +11,7 @@ go_library(
importpath = "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/status", importpath = "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/status",
deps = [ deps = [
"//pkg/kubelet/kubeletconfig/util/log:go_default_library", "//pkg/kubelet/kubeletconfig/util/log:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//pkg/util/node:go_default_library", "//pkg/util/node:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",

View File

@ -25,6 +25,7 @@ import (
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes" clientset "k8s.io/client-go/kubernetes"
utillog "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/util/log" utillog "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/util/log"
"k8s.io/kubernetes/pkg/kubelet/metrics"
nodeutil "k8s.io/kubernetes/pkg/util/node" nodeutil "k8s.io/kubernetes/pkg/util/node"
) )
@ -176,6 +177,24 @@ func (s *nodeConfigStatus) Sync(client clientset.Interface, nodeName string) {
status.Error = s.errorOverride status.Error = s.errorOverride
} }
// update metrics based on the status we will sync
metrics.SetConfigError(len(status.Error) > 0)
err = metrics.SetAssignedConfig(status.Assigned)
if err != nil {
err = fmt.Errorf("failed to update Assigned config metric, error: %v", err)
return
}
err = metrics.SetActiveConfig(status.Active)
if err != nil {
err = fmt.Errorf("failed to update Active config metric, error: %v", err)
return
}
err = metrics.SetLastKnownGoodConfig(status.LastKnownGood)
if err != nil {
err = fmt.Errorf("failed to update LastKnownGood config metric, error: %v", err)
return
}
// apply the status to a copy of the node so we don't modify the object in the informer's store // apply the status to a copy of the node so we don't modify the object in the informer's store
newNode := oldNode.DeepCopy() newNode := oldNode.DeepCopy()
newNode.Status.Config = status newNode.Status.Config = status

View File

@ -10,9 +10,12 @@ go_library(
srcs = ["metrics.go"], srcs = ["metrics.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/metrics", importpath = "k8s.io/kubernetes/pkg/kubelet/metrics",
deps = [ deps = [
"//pkg/features:go_default_library",
"//pkg/kubelet/container:go_default_library", "//pkg/kubelet/container:go_default_library",
"//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/prometheus/client_golang/prometheus:go_default_library", "//vendor/github.com/prometheus/client_golang/prometheus:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
], ],
) )

View File

@ -17,11 +17,15 @@ limitations under the License.
package metrics package metrics
import ( import (
"fmt"
"sync" "sync"
"time" "time"
"github.com/golang/glog" "github.com/golang/glog"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
corev1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
) )
@ -47,6 +51,17 @@ const (
// Metrics keys of device plugin operations // Metrics keys of device plugin operations
DevicePluginRegistrationCountKey = "device_plugin_registration_count" DevicePluginRegistrationCountKey = "device_plugin_registration_count"
DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds" DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds"
// Metric keys for node config
AssignedConfigKey = "node_config_assigned"
ActiveConfigKey = "node_config_active"
LastKnownGoodConfigKey = "node_config_last_known_good"
ConfigErrorKey = "node_config_error"
ConfigSourceLabelKey = "node_config_source"
ConfigSourceLabelValueLocal = "local"
ConfigUIDLabelKey = "node_config_uid"
ConfigResourceVersionLabelKey = "node_config_resource_version"
KubeletConfigKeyLabelKey = "node_config_kubelet_key"
) )
var ( var (
@ -150,6 +165,40 @@ var (
}, },
[]string{"resource_name"}, []string{"resource_name"},
) )
// Metrics for node config
AssignedConfig = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: AssignedConfigKey,
Help: "The node's understanding of intended config. The count is always 1.",
},
[]string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
)
ActiveConfig = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: ActiveConfigKey,
Help: "The config source the node is actively using. The count is always 1.",
},
[]string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
)
LastKnownGoodConfig = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: LastKnownGoodConfigKey,
Help: "The config source the node will fall back to when it encounters certain errors. The count is always 1.",
},
[]string{ConfigSourceLabelKey, ConfigUIDLabelKey, ConfigResourceVersionLabelKey, KubeletConfigKeyLabelKey},
)
ConfigError = prometheus.NewGauge(
prometheus.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: ConfigErrorKey,
Help: "This metric is true (1) if the node is experiencing a configuration-related error, false (0) otherwise.",
},
)
) )
var registerMetrics sync.Once var registerMetrics sync.Once
@ -172,6 +221,12 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu
prometheus.MustRegister(EvictionStatsAge) prometheus.MustRegister(EvictionStatsAge)
prometheus.MustRegister(DevicePluginRegistrationCount) prometheus.MustRegister(DevicePluginRegistrationCount)
prometheus.MustRegister(DevicePluginAllocationLatency) prometheus.MustRegister(DevicePluginAllocationLatency)
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {
prometheus.MustRegister(AssignedConfig)
prometheus.MustRegister(ActiveConfig)
prometheus.MustRegister(LastKnownGoodConfig)
prometheus.MustRegister(ConfigError)
}
for _, collector := range collectors { for _, collector := range collectors {
prometheus.MustRegister(collector) prometheus.MustRegister(collector)
} }
@ -232,3 +287,88 @@ func (pc *podAndContainerCollector) Collect(ch chan<- prometheus.Metric) {
prometheus.GaugeValue, prometheus.GaugeValue,
float64(runningContainers)) float64(runningContainers))
} }
// configMapAPIPathFmt is the format of the API path used as the source label
// for ConfigMap-backed config; the arguments are namespace and name.
const configMapAPIPathFmt = "/api/v1/namespaces/%s/configmaps/%s"

// configLabels translates a node config source into the label set used by the
// node config gauges. A nil source yields the "local" label set, a source with
// a ConfigMap reference yields labels derived from that reference, and any
// other source is rejected with an error.
func configLabels(source *corev1.NodeConfigSource) (map[string]string, error) {
	switch {
	case source == nil:
		// prometheus requires every label declared on the metric to be present,
		// so the labels that do not apply to local config are left empty
		return map[string]string{
			// same value as the ConfigSourceLabelValueLocal constant
			ConfigSourceLabelKey:          "local",
			ConfigUIDLabelKey:             "",
			ConfigResourceVersionLabelKey: "",
			KubeletConfigKeyLabelKey:      "",
		}, nil
	case source.ConfigMap != nil:
		cm := source.ConfigMap
		return map[string]string{
			ConfigSourceLabelKey:          fmt.Sprintf(configMapAPIPathFmt, cm.Namespace, cm.Name),
			ConfigUIDLabelKey:             string(cm.UID),
			ConfigResourceVersionLabelKey: cm.ResourceVersion,
			KubeletConfigKeyLabelKey:      cm.KubeletConfigKey,
		}, nil
	default:
		return nil, fmt.Errorf("unrecognized config source type, all source subfields were nil")
	}
}
// assignedConfigLabels holds the label set most recently exported on
// AssignedConfig, so that the next update can delete the old timeseries
// instead of leaking it.
var assignedConfigLabels = map[string]string{}

// SetAssignedConfig exports the given source as the node's assigned config.
// The previously exported timeseries is removed and a new one, labeled from
// source, is set to a constant 1. An error is returned, and nothing is
// changed, if labels cannot be computed from source.
func SetAssignedConfig(source *corev1.NodeConfigSource) error {
	// derive the label set for the new timeseries
	next, err := configLabels(source)
	if err != nil {
		return err
	}
	// drop the timeseries exported by the previous call; With creates a new
	// one for each distinct label set, so stale ones would otherwise pile up
	AssignedConfig.Delete(assignedConfigLabels)
	// remember the new label set and expose it with a constant count of 1
	assignedConfigLabels = next
	AssignedConfig.With(next).Set(1)
	return nil
}
// activeConfigLabels holds the label set most recently exported on
// ActiveConfig, so that the next update can delete the old timeseries
// instead of leaking it.
var activeConfigLabels = map[string]string{}

// SetActiveConfig exports the given source as the node's active config.
// The previously exported timeseries is removed and a new one, labeled from
// source, is set to a constant 1. An error is returned, and nothing is
// changed, if labels cannot be computed from source.
func SetActiveConfig(source *corev1.NodeConfigSource) error {
	// derive the label set for the new timeseries
	next, err := configLabels(source)
	if err != nil {
		return err
	}
	// drop the timeseries exported by the previous call; With creates a new
	// one for each distinct label set, so stale ones would otherwise pile up
	ActiveConfig.Delete(activeConfigLabels)
	// remember the new label set and expose it with a constant count of 1
	activeConfigLabels = next
	ActiveConfig.With(next).Set(1)
	return nil
}
// lastKnownGoodConfigLabels holds the label set most recently exported on
// LastKnownGoodConfig, so that the next update can delete the old timeseries
// instead of leaking it.
var lastKnownGoodConfigLabels = map[string]string{}

// SetLastKnownGoodConfig exports the given source as the node's
// last-known-good config. The previously exported timeseries is removed and a
// new one, labeled from source, is set to a constant 1. An error is returned,
// and nothing is changed, if labels cannot be computed from source.
func SetLastKnownGoodConfig(source *corev1.NodeConfigSource) error {
	// derive the label set for the new timeseries
	next, err := configLabels(source)
	if err != nil {
		return err
	}
	// drop the timeseries exported by the previous call; With creates a new
	// one for each distinct label set, so stale ones would otherwise pile up
	LastKnownGoodConfig.Delete(lastKnownGoodConfigLabels)
	// remember the new label set and expose it with a constant count of 1
	lastKnownGoodConfigLabels = next
	LastKnownGoodConfig.With(next).Set(1)
	return nil
}
// SetConfigError records whether the node is experiencing a config-related
// error: the ConfigError gauge is set to 1 when err is true and 0 otherwise.
func SetConfigError(err bool) {
	value := 0.0
	if err {
		value = 1
	}
	ConfigError.Set(value)
}

View File

@ -137,6 +137,7 @@ go_test(
"//pkg/kubelet/types:go_default_library", "//pkg/kubelet/types:go_default_library",
"//pkg/security/apparmor:go_default_library", "//pkg/security/apparmor:go_default_library",
"//test/e2e/framework:go_default_library", "//test/e2e/framework:go_default_library",
"//test/e2e/framework/metrics:go_default_library",
"//test/e2e_node/services:go_default_library", "//test/e2e_node/services:go_default_library",
"//test/utils/image:go_default_library", "//test/utils/image:go_default_library",
"//vendor/github.com/blang/semver:go_default_library", "//vendor/github.com/blang/semver:go_default_library",
@ -147,6 +148,7 @@ go_test(
"//vendor/github.com/onsi/gomega:go_default_library", "//vendor/github.com/onsi/gomega:go_default_library",
"//vendor/github.com/onsi/gomega/gstruct:go_default_library", "//vendor/github.com/onsi/gomega/gstruct:go_default_library",
"//vendor/github.com/onsi/gomega/types:go_default_library", "//vendor/github.com/onsi/gomega/types:go_default_library",
"//vendor/github.com/prometheus/common/model:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/equality:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/equality:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
@ -164,7 +166,6 @@ go_test(
] + select({ ] + select({
"@io_bazel_rules_go//go/platform:linux": [ "@io_bazel_rules_go//go/platform:linux": [
"//test/e2e/common:go_default_library", "//test/e2e/common:go_default_library",
"//test/e2e/framework/metrics:go_default_library",
"//test/e2e_node/system:go_default_library", "//test/e2e_node/system:go_default_library",
"//test/utils:go_default_library", "//test/utils:go_default_library",
"//vendor/github.com/kardianos/osext:go_default_library", "//vendor/github.com/kardianos/osext:go_default_library",

View File

@ -18,6 +18,7 @@ package e2e_node
import ( import (
"fmt" "fmt"
"reflect"
"strings" "strings"
"time" "time"
@ -27,12 +28,17 @@ import (
apiequality "k8s.io/apimachinery/pkg/api/equality" apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors" apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig" "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
controller "k8s.io/kubernetes/pkg/kubelet/kubeletconfig" controller "k8s.io/kubernetes/pkg/kubelet/kubeletconfig"
"k8s.io/kubernetes/pkg/kubelet/kubeletconfig/status" "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/status"
"k8s.io/kubernetes/pkg/kubelet/metrics"
frameworkmetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
"k8s.io/kubernetes/test/e2e/framework" "k8s.io/kubernetes/test/e2e/framework"
"github.com/prometheus/common/model"
. "github.com/onsi/ginkgo" . "github.com/onsi/ginkgo"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
) )
@ -45,8 +51,6 @@ type expectNodeConfigStatus struct {
// If true, expect Status.Config.Active == Status.Config.LastKnownGood, // If true, expect Status.Config.Active == Status.Config.LastKnownGood,
// otherwise expect Status.Config.Active == Status.Config.Assigned. // otherwise expect Status.Config.Active == Status.Config.Assigned.
lkgActive bool lkgActive bool
// If true, skip checking Status.Config.LastKnownGood == this.lastKnownGood in the status.
skipLkg bool
} }
type nodeConfigTestCase struct { type nodeConfigTestCase struct {
@ -809,6 +813,8 @@ func (tc *nodeConfigTestCase) run(f *framework.Framework, fn func(f *framework.F
tc.checkNodeConfigSource(f) tc.checkNodeConfigSource(f)
// check status // check status
tc.checkConfigStatus(f) tc.checkConfigStatus(f)
// check that the Kubelet's config-related metrics are correct
tc.checkConfigMetrics(f)
// check expectConfig // check expectConfig
if tc.expectConfig != nil { if tc.expectConfig != nil {
tc.checkConfig(f) tc.checkConfig(f)
@ -929,7 +935,7 @@ func expectConfigStatus(tc *nodeConfigTestCase, actual *apiv1.NodeConfigStatus)
errs = append(errs, spew.Sprintf("expected Assigned %#v but got %#v", expectAssigned, actual.Assigned)) errs = append(errs, spew.Sprintf("expected Assigned %#v but got %#v", expectAssigned, actual.Assigned))
} }
// check LastKnownGood matches tc.expectConfigStatus.lastKnownGood // check LastKnownGood matches tc.expectConfigStatus.lastKnownGood
if !tc.expectConfigStatus.skipLkg && !apiequality.Semantic.DeepEqual(tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood) { if !apiequality.Semantic.DeepEqual(tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood) {
errs = append(errs, spew.Sprintf("expected LastKnownGood %#v but got %#v", tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood)) errs = append(errs, spew.Sprintf("expected LastKnownGood %#v but got %#v", tc.expectConfigStatus.lastKnownGood, actual.LastKnownGood))
} }
// check Active matches Assigned or LastKnownGood, depending on tc.expectConfigStatus.lkgActive // check Active matches Assigned or LastKnownGood, depending on tc.expectConfigStatus.lkgActive
@ -1016,6 +1022,111 @@ func (tc *nodeConfigTestCase) checkEvent(f *framework.Framework) {
}, timeout, interval).Should(BeNil()) }, timeout, interval).Should(BeNil())
} }
// checkConfigMetrics builds the assigned, active, last-known-good and error
// metric samples expected for this test case, then polls the Kubelet's metrics
// until they match exactly (ignoring timestamps) or the timeout expires.
func (tc *nodeConfigTestCase) checkConfigMetrics(f *framework.Framework) {
	const (
		timeout                = time.Minute
		interval               = time.Second
		assignedConfigKey      = metrics.KubeletSubsystem + "_" + metrics.AssignedConfigKey
		activeConfigKey        = metrics.KubeletSubsystem + "_" + metrics.ActiveConfigKey
		lastKnownGoodConfigKey = metrics.KubeletSubsystem + "_" + metrics.LastKnownGoodConfigKey
		configErrorKey         = metrics.KubeletSubsystem + "_" + metrics.ConfigErrorKey
	)

	// sampleFor builds the one sample expected for the config metric called
	// name. A source without a ConfigMap reference (including a nil source) is
	// reported with the "local" label set; otherwise the labels are taken from
	// the ConfigMap reference. The value is always 1.
	sampleFor := func(name model.LabelValue, source *apiv1.NodeConfigSource) *model.Sample {
		if source == nil || source.ConfigMap == nil {
			return &model.Sample{
				Metric: model.Metric{
					model.MetricNameLabel:                 name,
					metrics.ConfigSourceLabelKey:          metrics.ConfigSourceLabelValueLocal,
					metrics.ConfigUIDLabelKey:             "",
					metrics.ConfigResourceVersionLabelKey: "",
					metrics.KubeletConfigKeyLabelKey:      "",
				},
				Value: 1,
			}
		}
		cm := source.ConfigMap
		return &model.Sample{
			Metric: model.Metric{
				model.MetricNameLabel:                 name,
				metrics.ConfigSourceLabelKey:          model.LabelValue(fmt.Sprintf("/api/v1/namespaces/%s/configmaps/%s", cm.Namespace, cm.Name)),
				metrics.ConfigUIDLabelKey:             model.LabelValue(cm.UID),
				metrics.ConfigResourceVersionLabelKey: model.LabelValue(cm.ResourceVersion),
				metrics.KubeletConfigKeyLabelKey:      model.LabelValue(cm.KubeletConfigKey),
			},
			Value: 1,
		}
	}

	// assigned: the test case's config source, completed with the UID and
	// ResourceVersion of the test case's ConfigMap when it is a remote source
	assignedSource := tc.configSource.DeepCopy()
	if assignedSource != nil && assignedSource.ConfigMap != nil {
		assignedSource.ConfigMap.UID = tc.configMap.UID
		assignedSource.ConfigMap.ResourceVersion = tc.configMap.ResourceVersion
	}

	// last-known-good: taken directly from the expected status
	lastKnownGoodSource := tc.expectConfigStatus.lastKnownGood

	// active: follows last-known-good when lkgActive is set, assigned otherwise
	activeSource := assignedSource
	if tc.expectConfigStatus.lkgActive {
		activeSource = lastKnownGoodSource
	}

	// error: 1 when the test case expects an error message, 0 otherwise
	errorValue := model.SampleValue(0)
	if len(tc.expectConfigStatus.err) > 0 {
		errorValue = 1
	}

	// the complete set of metrics we expect the Kubelet to report
	expect := frameworkmetrics.KubeletMetrics(map[string]model.Samples{
		assignedConfigKey:      {sampleFor(assignedConfigKey, assignedSource)},
		activeConfigKey:        {sampleFor(activeConfigKey, activeSource)},
		lastKnownGoodConfigKey: {sampleFor(lastKnownGoodConfigKey, lastKnownGoodSource)},
		configErrorKey: {&model.Sample{
			Metric: model.Metric{model.MetricNameLabel: configErrorKey},
			Value:  errorValue,
		}},
	})

	// the metric names to keep when reading the Kubelet's metrics
	wantNames := sets.NewString(
		assignedConfigKey,
		activeConfigKey,
		lastKnownGoodConfigKey,
		configErrorKey,
	)

	// poll until the reported metrics match the expectation
	Eventually(func() error {
		actual, err := getKubeletMetrics(wantNames)
		if err != nil {
			return err
		}
		// zero the timestamps so the comparison does not depend on scrape time
		for _, samples := range actual {
			for i := range samples {
				samples[i].Timestamp = 0
			}
		}
		if reflect.DeepEqual(expect, actual) {
			return nil
		}
		return fmt.Errorf("checkConfigMetrics: case: %s: expect metrics %s but got %s", tc.desc, spew.Sprintf("%#v", expect), spew.Sprintf("%#v", actual))
	}, timeout, interval).Should(BeNil())
}
// constructs the expected SelfLink for a config map // constructs the expected SelfLink for a config map
func configMapAPIPath(cm *apiv1.ConfigMap) string { func configMapAPIPath(cm *apiv1.ConfigMap) string {
return fmt.Sprintf("/api/v1/namespaces/%s/configmaps/%s", cm.Namespace, cm.Name) return fmt.Sprintf("/api/v1/namespaces/%s/configmaps/%s", cm.Namespace, cm.Name)

View File

@ -45,6 +45,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/remote" "k8s.io/kubernetes/pkg/kubelet/remote"
"k8s.io/kubernetes/test/e2e/framework" "k8s.io/kubernetes/test/e2e/framework"
"k8s.io/kubernetes/test/e2e/framework/metrics" "k8s.io/kubernetes/test/e2e/framework/metrics"
frameworkmetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
. "github.com/onsi/ginkgo" . "github.com/onsi/ginkgo"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
@ -334,6 +335,24 @@ func logKubeletMetrics(metricKeys ...string) {
} }
} }
// getKubeletMetrics grabs the metrics from the local Kubelet (port 10255 on the
// test node) and returns only those whose names are in filterMetricNames.
// An error from grabbing the metrics is returned unchanged.
func getKubeletMetrics(filterMetricNames sets.String) (frameworkmetrics.KubeletMetrics, error) {
	all, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
	if err != nil {
		return nil, err
	}

	// keep only the requested metric names
	wanted := metrics.NewKubeletMetrics()
	for name, samples := range all {
		if filterMetricNames.Has(name) {
			wanted[name] = samples
		}
	}
	return wanted, nil
}
// runCommand runs the cmd and returns the combined stdout and stderr, or an // runCommand runs the cmd and returns the combined stdout and stderr, or an
// error if the command failed. // error if the command failed.
func runCommand(cmd ...string) (string, error) { func runCommand(cmd ...string) (string, error) {