2019-01-12 04:58:27 +00:00
/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"fmt"
"sync"
"time"
2019-12-12 01:27:03 +00:00
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
2019-01-12 04:58:27 +00:00
corev1 "k8s.io/api/core/v1"
2019-04-07 17:07:55 +00:00
"k8s.io/apimachinery/pkg/types"
2019-01-12 04:58:27 +00:00
utilfeature "k8s.io/apiserver/pkg/util/feature"
2020-08-10 17:43:49 +00:00
"k8s.io/klog/v2"
2019-01-12 04:58:27 +00:00
"k8s.io/kubernetes/pkg/features"
)
2019-09-27 21:51:53 +00:00
// This const block defines the metric names for the kubelet metrics.
2019-01-12 04:58:27 +00:00
const (
	// KubeletSubsystem is the Subsystem value set on the metric options in this file.
	KubeletSubsystem = "kubelet"

	// NodeNameKey is the name of the node-name gauge and NodeLabelKey is the
	// label it is broken down by.
	NodeNameKey  = "node_name"
	NodeLabelKey = "node"

	// Metric names for pod worker, cgroup manager, PLEG, eviction, preemption,
	// volume stats and running pod/container metrics.
	PodWorkerDurationKey         = "pod_worker_duration_seconds"
	PodStartDurationKey          = "pod_start_duration_seconds"
	CgroupManagerOperationsKey   = "cgroup_manager_duration_seconds"
	PodWorkerStartDurationKey    = "pod_worker_start_duration_seconds"
	PLEGRelistDurationKey        = "pleg_relist_duration_seconds"
	PLEGDiscardEventsKey         = "pleg_discard_events"
	PLEGRelistIntervalKey        = "pleg_relist_interval_seconds"
	PLEGLastSeenKey              = "pleg_last_seen_seconds"
	EvictionsKey                 = "evictions"
	EvictionStatsAgeKey          = "eviction_stats_age_seconds"
	PreemptionsKey               = "preemptions"
	VolumeStatsCapacityBytesKey  = "volume_stats_capacity_bytes"
	VolumeStatsAvailableBytesKey = "volume_stats_available_bytes"
	VolumeStatsUsedBytesKey      = "volume_stats_used_bytes"
	VolumeStatsInodesKey         = "volume_stats_inodes"
	VolumeStatsInodesFreeKey     = "volume_stats_inodes_free"
	VolumeStatsInodesUsedKey     = "volume_stats_inodes_used"
	RunningPodsKey               = "running_pods"
	RunningContainersKey         = "running_containers"

	// Metrics keys of remote runtime operations
	RuntimeOperationsKey         = "runtime_operations_total"
	RuntimeOperationsDurationKey = "runtime_operations_duration_seconds"
	RuntimeOperationsErrorsKey   = "runtime_operations_errors_total"

	// Metrics keys of device plugin operations
	DevicePluginRegistrationCountKey  = "device_plugin_registration_total"
	DevicePluginAllocationDurationKey = "device_plugin_alloc_duration_seconds"

	// Metrics keys of pod resources operations
	PodResourcesEndpointRequestsTotalKey          = "pod_resources_endpoint_requests_total"
	PodResourcesEndpointRequestsListKey           = "pod_resources_endpoint_requests_list"
	PodResourcesEndpointRequestsGetAllocatableKey = "pod_resources_endpoint_requests_get_allocatable"
	PodResourcesEndpointErrorsListKey             = "pod_resources_endpoint_errors_list"
	PodResourcesEndpointErrorsGetAllocatableKey   = "pod_resources_endpoint_errors_get_allocatable"

	// Metric keys for node config, followed by the label keys (and the
	// "local" source label value) used on the node config gauges.
	AssignedConfigKey             = "node_config_assigned"
	ActiveConfigKey               = "node_config_active"
	LastKnownGoodConfigKey        = "node_config_last_known_good"
	ConfigErrorKey                = "node_config_error"
	ConfigSourceLabelKey          = "node_config_source"
	ConfigSourceLabelValueLocal   = "local"
	ConfigUIDLabelKey             = "node_config_uid"
	ConfigResourceVersionLabelKey = "node_config_resource_version"
	KubeletConfigKeyLabelKey      = "node_config_kubelet_key"

	// Metrics keys for RuntimeClass
	RunPodSandboxDurationKey = "run_podsandbox_duration_seconds"
	RunPodSandboxErrorsKey   = "run_podsandbox_errors_total"

	// Metrics to keep track of total number of Pods and Containers started
	StartedPodsTotalKey             = "started_pods_total"
	StartedPodsErrorsTotalKey       = "started_pods_errors_total"
	StartedContainersTotalKey       = "started_containers_total"
	StartedContainersErrorsTotalKey = "started_containers_errors_total"

	// Metrics to track ephemeral container usage by this kubelet
	ManagedEphemeralContainersKey = "managed_ephemeral_containers"

	// Values used in metric labels
	Container          = "container"
	InitContainer      = "init_container"
	EphemeralContainer = "ephemeral_container"
)
var (
2019-09-27 21:51:53 +00:00
// NodeName is a Gauge that tracks the ode's name. The count is always 1.
NodeName = metrics . NewGaugeVec (
& metrics . GaugeOpts {
Subsystem : KubeletSubsystem ,
Name : NodeNameKey ,
Help : "The node's name. The count is always 1." ,
StabilityLevel : metrics . ALPHA ,
2019-04-07 17:07:55 +00:00
} ,
[ ] string { NodeLabelKey } ,
)
2021-03-18 22:40:29 +00:00
// ContainersPerPodCount is a Histogram that tracks the number of containers per pod.
2019-09-27 21:51:53 +00:00
ContainersPerPodCount = metrics . NewHistogram (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : "containers_per_pod_count" ,
Help : "The number of containers per pod." ,
2021-03-18 22:40:29 +00:00
Buckets : metrics . ExponentialBuckets ( 1 , 2 , 5 ) ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
} ,
)
// PodWorkerDuration is a Histogram that tracks the duration (in seconds) in takes to sync a single pod.
// Broken down by the operation type.
PodWorkerDuration = metrics . NewHistogramVec (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : PodWorkerDurationKey ,
Help : "Duration in seconds to sync a single pod. Broken down by operation type: create, update, or sync" ,
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { "operation_type" } ,
)
2019-09-27 21:51:53 +00:00
// PodStartDuration is a Histogram that tracks the duration (in seconds) it takes for a single pod to go from pending to running.
PodStartDuration = metrics . NewHistogram (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : PodStartDurationKey ,
Help : "Duration in seconds for a single pod to go from pending to running." ,
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
} ,
)
// CgroupManagerDuration is a Histogram that tracks the duration (in seconds) it takes for cgroup manager operations to complete.
// Broken down by method.
CgroupManagerDuration = metrics . NewHistogramVec (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : CgroupManagerOperationsKey ,
Help : "Duration in seconds for cgroup manager operations. Broken down by method." ,
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { "operation_type" } ,
)
2019-09-27 21:51:53 +00:00
// PodWorkerStartDuration is a Histogram that tracks the duration (in seconds) it takes from seeing a pod to starting a worker.
PodWorkerStartDuration = metrics . NewHistogram (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : PodWorkerStartDurationKey ,
Help : "Duration in seconds from seeing a pod to starting a worker." ,
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
} ,
)
// PLEGRelistDuration is a Histogram that tracks the duration (in seconds) it takes for relisting pods in the Kubelet's
// Pod Lifecycle Event Generator (PLEG).
PLEGRelistDuration = metrics . NewHistogram (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : PLEGRelistDurationKey ,
Help : "Duration in seconds for relisting pods in PLEG." ,
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
} ,
)
2020-03-26 21:07:15 +00:00
// PLEGDiscardEvents is a Counter that tracks the number of discarding events in the Kubelet's Pod Lifecycle Event Generator (PLEG).
PLEGDiscardEvents = metrics . NewCounter (
2019-09-27 21:51:53 +00:00
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : PLEGDiscardEventsKey ,
Help : "The number of discard events in PLEG." ,
StabilityLevel : metrics . ALPHA ,
2019-04-07 17:07:55 +00:00
} ,
)
2020-03-26 21:07:15 +00:00
2019-09-27 21:51:53 +00:00
// PLEGRelistInterval is a Histogram that tracks the intervals (in seconds) between relisting in the Kubelet's
// Pod Lifecycle Event Generator (PLEG).
PLEGRelistInterval = metrics . NewHistogram (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : PLEGRelistIntervalKey ,
Help : "Interval in seconds between relisting in PLEG." ,
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
} ,
)
2020-03-26 21:07:15 +00:00
// PLEGLastSeen is a Gauge giving the Unix timestamp when the Kubelet's
// Pod Lifecycle Event Generator (PLEG) was last seen active.
PLEGLastSeen = metrics . NewGauge (
& metrics . GaugeOpts {
Subsystem : KubeletSubsystem ,
Name : PLEGLastSeenKey ,
Help : "Timestamp in seconds when PLEG was last seen active." ,
StabilityLevel : metrics . ALPHA ,
} ,
)
2019-09-27 21:51:53 +00:00
// RuntimeOperations is a Counter that tracks the cumulative number of remote runtime operations.
// Broken down by operation type.
RuntimeOperations = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : RuntimeOperationsKey ,
Help : "Cumulative number of runtime operations by operation type." ,
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
2019-09-27 21:51:53 +00:00
[ ] string { "operation_type" } ,
2019-01-12 04:58:27 +00:00
)
2019-09-27 21:51:53 +00:00
// RuntimeOperationsDuration is a Histogram that tracks the duration (in seconds) for remote runtime operations to complete.
// Broken down by operation type.
RuntimeOperationsDuration = metrics . NewHistogramVec (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : RuntimeOperationsDurationKey ,
Help : "Duration in seconds of runtime operations. Broken down by operation type." ,
2020-12-01 01:06:26 +00:00
Buckets : metrics . ExponentialBuckets ( .005 , 2.5 , 14 ) ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { "operation_type" } ,
)
2019-09-27 21:51:53 +00:00
// RuntimeOperationsErrors is a Counter that tracks the cumulative number of remote runtime operations errors.
// Broken down by operation type.
RuntimeOperationsErrors = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : RuntimeOperationsErrorsKey ,
Help : "Cumulative number of runtime operation errors by operation type." ,
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { "operation_type" } ,
)
2019-09-27 21:51:53 +00:00
// Evictions is a Counter that tracks the cumulative number of pod evictions initiated by the kubelet.
// Broken down by eviction signal.
Evictions = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : EvictionsKey ,
Help : "Cumulative number of pod evictions by eviction signal" ,
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
2019-09-27 21:51:53 +00:00
[ ] string { "eviction_signal" } ,
2019-01-12 04:58:27 +00:00
)
2019-09-27 21:51:53 +00:00
// EvictionStatsAge is a Histogram that tracks the time (in seconds) between when stats are collected and when a pod is evicted
// based on those stats. Broken down by eviction signal.
EvictionStatsAge = metrics . NewHistogramVec (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : EvictionStatsAgeKey ,
Help : "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal" ,
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { "eviction_signal" } ,
)
2019-12-12 01:27:03 +00:00
// Preemptions is a Counter that tracks the cumulative number of pod preemptions initiated by the kubelet.
// Broken down by preemption signal. A preemption is only recorded for one resource, the sum of all signals
// is the number of preemptions on the given node.
Preemptions = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : PreemptionsKey ,
Help : "Cumulative number of pod preemptions by preemption resource" ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "preemption_signal" } ,
)
2019-09-27 21:51:53 +00:00
// DevicePluginRegistrationCount is a Counter that tracks the cumulative number of device plugin registrations.
// Broken down by resource name.
DevicePluginRegistrationCount = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : DevicePluginRegistrationCountKey ,
Help : "Cumulative number of device plugin registrations. Broken down by resource name." ,
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { "resource_name" } ,
)
2019-09-27 21:51:53 +00:00
// DevicePluginAllocationDuration is a Histogram that tracks the duration (in seconds) to serve a device plugin allocation request.
// Broken down by resource name.
DevicePluginAllocationDuration = metrics . NewHistogramVec (
& metrics . HistogramOpts {
Subsystem : KubeletSubsystem ,
Name : DevicePluginAllocationDurationKey ,
Help : "Duration in seconds to serve a device plugin Allocation request. Broken down by resource name." ,
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
2019-04-07 17:07:55 +00:00
} ,
[ ] string { "resource_name" } ,
)
2019-01-12 04:58:27 +00:00
2020-12-01 01:06:26 +00:00
// PodResourcesEndpointRequestsTotalCount is a Counter that tracks the cumulative number of requests to the PodResource endpoints.
// Broken down by server API version.
PodResourcesEndpointRequestsTotalCount = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : PodResourcesEndpointRequestsTotalKey ,
Help : "Cumulative number of requests to the PodResource endpoint. Broken down by server api version." ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "server_api_version" } ,
)
2021-03-18 22:40:29 +00:00
// PodResourcesEndpointRequestsListCount is a Counter that tracks the number of requests to the PodResource List() endpoint.
// Broken down by server API version.
PodResourcesEndpointRequestsListCount = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : PodResourcesEndpointRequestsListKey ,
Help : "Number of requests to the PodResource List endpoint. Broken down by server api version." ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "server_api_version" } ,
)
// PodResourcesEndpointRequestsGetAllocatableCount is a Counter that tracks the number of requests to the PodResource GetAllocatableResources() endpoint.
// Broken down by server API version.
PodResourcesEndpointRequestsGetAllocatableCount = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : PodResourcesEndpointRequestsGetAllocatableKey ,
Help : "Number of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version." ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "server_api_version" } ,
)
// PodResourcesEndpointErrorsListCount is a Counter that tracks the number of errors returned by he PodResource List() endpoint.
// Broken down by server API version.
PodResourcesEndpointErrorsListCount = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : PodResourcesEndpointErrorsListKey ,
Help : "Number of requests to the PodResource List endpoint which returned error. Broken down by server api version." ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "server_api_version" } ,
)
// PodResourcesEndpointErrorsGetAllocatableCount is a Counter that tracks the number of errors returned by the PodResource GetAllocatableResources() endpoint.
// Broken down by server API version.
PodResourcesEndpointErrorsGetAllocatableCount = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : PodResourcesEndpointErrorsGetAllocatableKey ,
Help : "Number of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version." ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "server_api_version" } ,
)
2019-01-12 04:58:27 +00:00
// Metrics for node config
2019-09-27 21:51:53 +00:00
// AssignedConfig is a Gauge that is set 1 if the Kubelet has a NodeConfig assigned.
AssignedConfig = metrics . NewGaugeVec (
& metrics . GaugeOpts {
2021-07-02 08:43:15 +00:00
Subsystem : KubeletSubsystem ,
Name : AssignedConfigKey ,
Help : "The node's understanding of intended config. The count is always 1." ,
DeprecatedVersion : "1.22.0" ,
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { ConfigSourceLabelKey , ConfigUIDLabelKey , ConfigResourceVersionLabelKey , KubeletConfigKeyLabelKey } ,
)
2019-09-27 21:51:53 +00:00
// ActiveConfig is a Gauge that is set to 1 if the Kubelet has an active NodeConfig.
ActiveConfig = metrics . NewGaugeVec (
& metrics . GaugeOpts {
2021-07-02 08:43:15 +00:00
Subsystem : KubeletSubsystem ,
Name : ActiveConfigKey ,
Help : "The config source the node is actively using. The count is always 1." ,
DeprecatedVersion : "1.22.0" ,
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { ConfigSourceLabelKey , ConfigUIDLabelKey , ConfigResourceVersionLabelKey , KubeletConfigKeyLabelKey } ,
)
2019-09-27 21:51:53 +00:00
// LastKnownGoodConfig is a Gauge that is set to 1 if the Kubelet has a NodeConfig it can fall back to if there
// are certain errors.
LastKnownGoodConfig = metrics . NewGaugeVec (
& metrics . GaugeOpts {
2021-07-02 08:43:15 +00:00
Subsystem : KubeletSubsystem ,
Name : LastKnownGoodConfigKey ,
Help : "The config source the node will fall back to when it encounters certain errors. The count is always 1." ,
DeprecatedVersion : "1.22.0" ,
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
[ ] string { ConfigSourceLabelKey , ConfigUIDLabelKey , ConfigResourceVersionLabelKey , KubeletConfigKeyLabelKey } ,
)
2019-09-27 21:51:53 +00:00
// ConfigError is a Gauge that is set to 1 if the node is experiencing a configuration-related error.
ConfigError = metrics . NewGauge (
& metrics . GaugeOpts {
2021-07-02 08:43:15 +00:00
Subsystem : KubeletSubsystem ,
Name : ConfigErrorKey ,
Help : "This metric is true (1) if the node is experiencing a configuration-related error, false (0) otherwise." ,
DeprecatedVersion : "1.22.0" ,
StabilityLevel : metrics . ALPHA ,
2019-01-12 04:58:27 +00:00
} ,
)
2019-09-27 21:51:53 +00:00
// RunPodSandboxDuration is a Histogram that tracks the duration (in seconds) it takes to run Pod Sandbox operations.
2020-03-26 21:07:15 +00:00
// Broken down by RuntimeClass.Handler.
2019-09-27 21:51:53 +00:00
RunPodSandboxDuration = metrics . NewHistogramVec (
& metrics . HistogramOpts {
2019-04-07 17:07:55 +00:00
Subsystem : KubeletSubsystem ,
Name : RunPodSandboxDurationKey ,
2020-03-26 21:07:15 +00:00
Help : "Duration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler." ,
2019-04-07 17:07:55 +00:00
// Use DefBuckets for now, will customize the buckets if necessary.
2019-12-12 01:27:03 +00:00
Buckets : metrics . DefBuckets ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
2019-04-07 17:07:55 +00:00
} ,
[ ] string { "runtime_handler" } ,
)
2019-09-27 21:51:53 +00:00
// RunPodSandboxErrors is a Counter that tracks the cumulative number of Pod Sandbox operations errors.
2020-03-26 21:07:15 +00:00
// Broken down by RuntimeClass.Handler.
2019-09-27 21:51:53 +00:00
RunPodSandboxErrors = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : RunPodSandboxErrorsKey ,
2020-03-26 21:07:15 +00:00
Help : "Cumulative number of the run_podsandbox operation errors by RuntimeClass.Handler." ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
2019-04-07 17:07:55 +00:00
} ,
[ ] string { "runtime_handler" } ,
)
2019-09-27 21:51:53 +00:00
2021-07-02 08:43:15 +00:00
// RunningPodCount is a gauge that tracks the number of Pods currently with a running sandbox
// It is used to expose the kubelet internal state: how many pods have running containers in the container runtime, and mainly for debugging purpose.
2019-09-27 21:51:53 +00:00
RunningPodCount = metrics . NewGauge (
& metrics . GaugeOpts {
Subsystem : KubeletSubsystem ,
2021-03-18 22:40:29 +00:00
Name : RunningPodsKey ,
2021-07-02 08:43:15 +00:00
Help : "Number of pods that have a running pod sandbox" ,
2019-09-27 21:51:53 +00:00
StabilityLevel : metrics . ALPHA ,
} ,
)
// RunningContainerCount is a gauge that tracks the number of containers currently running
RunningContainerCount = metrics . NewGaugeVec (
& metrics . GaugeOpts {
Subsystem : KubeletSubsystem ,
2021-03-18 22:40:29 +00:00
Name : RunningContainersKey ,
2019-09-27 21:51:53 +00:00
Help : "Number of containers currently running" ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "container_state" } ,
)
2021-07-02 08:43:15 +00:00
// StartedPodsTotal is a counter that tracks pod sandbox creation operations
StartedPodsTotal = metrics . NewCounter (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : StartedPodsTotalKey ,
Help : "Cumulative number of pods started" ,
StabilityLevel : metrics . ALPHA ,
} ,
)
// StartedPodsErrorsTotal is a counter that tracks the number of errors creating pod sandboxes
StartedPodsErrorsTotal = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : StartedPodsErrorsTotalKey ,
Help : "Cumulative number of errors when starting pods" ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "message" } ,
)
// StartedContainersTotal is a counter that tracks the number of container creation operations
StartedContainersTotal = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : StartedContainersTotalKey ,
Help : "Cumulative number of containers started" ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "container_type" } ,
)
// StartedContainersTotal is a counter that tracks the number of errors creating containers
StartedContainersErrorsTotal = metrics . NewCounterVec (
& metrics . CounterOpts {
Subsystem : KubeletSubsystem ,
Name : StartedContainersErrorsTotalKey ,
Help : "Cumulative number of errors when starting containers" ,
StabilityLevel : metrics . ALPHA ,
} ,
[ ] string { "container_type" , "code" } ,
)
// ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet.
ManagedEphemeralContainers = metrics . NewGauge (
& metrics . GaugeOpts {
Subsystem : KubeletSubsystem ,
Name : ManagedEphemeralContainersKey ,
Help : "Current number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0." ,
StabilityLevel : metrics . ALPHA ,
} ,
)
2019-01-12 04:58:27 +00:00
)
var registerMetrics sync . Once
2019-09-27 21:51:53 +00:00
// Register registers all metrics.
2020-12-01 01:06:26 +00:00
func Register ( collectors ... metrics . StableCollector ) {
2019-01-12 04:58:27 +00:00
// Register the metrics.
registerMetrics . Do ( func ( ) {
2019-09-27 21:51:53 +00:00
legacyregistry . MustRegister ( NodeName )
legacyregistry . MustRegister ( PodWorkerDuration )
legacyregistry . MustRegister ( PodStartDuration )
legacyregistry . MustRegister ( CgroupManagerDuration )
legacyregistry . MustRegister ( PodWorkerStartDuration )
legacyregistry . MustRegister ( ContainersPerPodCount )
legacyregistry . MustRegister ( PLEGRelistDuration )
legacyregistry . MustRegister ( PLEGDiscardEvents )
legacyregistry . MustRegister ( PLEGRelistInterval )
2020-03-26 21:07:15 +00:00
legacyregistry . MustRegister ( PLEGLastSeen )
2019-09-27 21:51:53 +00:00
legacyregistry . MustRegister ( RuntimeOperations )
legacyregistry . MustRegister ( RuntimeOperationsDuration )
legacyregistry . MustRegister ( RuntimeOperationsErrors )
legacyregistry . MustRegister ( Evictions )
legacyregistry . MustRegister ( EvictionStatsAge )
2019-12-12 01:27:03 +00:00
legacyregistry . MustRegister ( Preemptions )
2019-09-27 21:51:53 +00:00
legacyregistry . MustRegister ( DevicePluginRegistrationCount )
legacyregistry . MustRegister ( DevicePluginAllocationDuration )
legacyregistry . MustRegister ( RunningContainerCount )
legacyregistry . MustRegister ( RunningPodCount )
2021-07-02 08:43:15 +00:00
legacyregistry . MustRegister ( ManagedEphemeralContainers )
legacyregistry . MustRegister ( StartedPodsTotal )
legacyregistry . MustRegister ( StartedPodsErrorsTotal )
legacyregistry . MustRegister ( StartedContainersTotal )
legacyregistry . MustRegister ( StartedContainersErrorsTotal )
2020-03-26 21:07:15 +00:00
legacyregistry . MustRegister ( RunPodSandboxDuration )
legacyregistry . MustRegister ( RunPodSandboxErrors )
2019-01-12 04:58:27 +00:00
if utilfeature . DefaultFeatureGate . Enabled ( features . DynamicKubeletConfig ) {
2019-09-27 21:51:53 +00:00
legacyregistry . MustRegister ( AssignedConfig )
legacyregistry . MustRegister ( ActiveConfig )
legacyregistry . MustRegister ( LastKnownGoodConfig )
legacyregistry . MustRegister ( ConfigError )
2019-01-12 04:58:27 +00:00
}
for _ , collector := range collectors {
2019-12-12 01:27:03 +00:00
legacyregistry . CustomMustRegister ( collector )
2019-01-12 04:58:27 +00:00
}
} )
}
2019-12-12 01:27:03 +00:00
// GetGather returns the legacy registry's default gatherer. It is used by
// test cases outside the current package.
func GetGather() metrics.Gatherer {
	var gatherer metrics.Gatherer = legacyregistry.DefaultGatherer
	return gatherer
}
2019-04-07 17:07:55 +00:00
// SinceInSeconds reports the time elapsed since start as a floating-point
// number of seconds. The result is negative when start lies in the future.
func SinceInSeconds(start time.Time) float64 {
	elapsed := time.Since(start)
	return elapsed.Seconds()
}
2019-01-12 04:58:27 +00:00
// configMapAPIPathFmt is the format string for the API path reported as the
// config source label of a ConfigMap source; its verbs are the ConfigMap's
// namespace and name, in that order.
const configMapAPIPathFmt = "/api/v1/namespaces/%s/configmaps/%s"

// configLabels computes the label set for the node config gauges from source.
//
// A nil source yields the "local" source label with the remaining labels
// empty. A source with a ConfigMap yields the ConfigMap's API path, UID,
// resource version and kubelet config key. Any other source returns an error
// and a nil map.
func configLabels(source *corev1.NodeConfigSource) (map[string]string, error) {
	if source == nil {
		return map[string]string{
			// prometheus requires all of the labels that can be set on the metric
			ConfigSourceLabelKey:          ConfigSourceLabelValueLocal,
			ConfigUIDLabelKey:             "",
			ConfigResourceVersionLabelKey: "",
			KubeletConfigKeyLabelKey:      "",
		}, nil
	}
	if source.ConfigMap != nil {
		return map[string]string{
			ConfigSourceLabelKey:          fmt.Sprintf(configMapAPIPathFmt, source.ConfigMap.Namespace, source.ConfigMap.Name),
			ConfigUIDLabelKey:             string(source.ConfigMap.UID),
			ConfigResourceVersionLabelKey: source.ConfigMap.ResourceVersion,
			KubeletConfigKeyLabelKey:      source.ConfigMap.KubeletConfigKey,
		}, nil
	}
	return nil, fmt.Errorf("unrecognized config source type, all source subfields were nil")
}
// track labels across metric updates, so we can delete old label sets and prevent leaks
2019-09-27 21:51:53 +00:00
var assignedConfigLabels map [ string ] string
2019-01-12 04:58:27 +00:00
2019-09-27 21:51:53 +00:00
// SetAssignedConfig tracks labels according to the assigned NodeConfig. It also tracks labels
// across metric updates so old labels can be safely deleted.
2019-01-12 04:58:27 +00:00
func SetAssignedConfig ( source * corev1 . NodeConfigSource ) error {
// compute the timeseries labels from the source
labels , err := configLabels ( source )
if err != nil {
return err
}
// clean up the old timeseries (WithLabelValues creates a new one for each distinct label set)
2019-09-27 21:51:53 +00:00
if ! AssignedConfig . Delete ( assignedConfigLabels ) {
2021-03-18 22:40:29 +00:00
klog . InfoS ( "Failed to delete metric for labels. This may result in ambiguity from multiple metrics concurrently indicating different assigned configs." , "labels" , assignedConfigLabels )
2019-09-27 21:51:53 +00:00
}
2019-01-12 04:58:27 +00:00
// record the new timeseries
assignedConfigLabels = labels
// expose the new timeseries with a constant count of 1
AssignedConfig . With ( assignedConfigLabels ) . Set ( 1 )
return nil
}
// track labels across metric updates, so we can delete old label sets and prevent leaks
2019-09-27 21:51:53 +00:00
var activeConfigLabels map [ string ] string
2019-01-12 04:58:27 +00:00
2019-09-27 21:51:53 +00:00
// SetActiveConfig tracks labels according to the NodeConfig that is currently used by the Kubelet.
// It also tracks labels across metric updates so old labels can be safely deleted.
2019-01-12 04:58:27 +00:00
func SetActiveConfig ( source * corev1 . NodeConfigSource ) error {
// compute the timeseries labels from the source
labels , err := configLabels ( source )
if err != nil {
return err
}
// clean up the old timeseries (WithLabelValues creates a new one for each distinct label set)
2019-09-27 21:51:53 +00:00
if ! ActiveConfig . Delete ( activeConfigLabels ) {
2021-03-18 22:40:29 +00:00
klog . InfoS ( "Failed to delete metric for labels. This may result in ambiguity from multiple metrics concurrently indicating different active configs." , "labels" , activeConfigLabels )
2019-09-27 21:51:53 +00:00
}
2019-01-12 04:58:27 +00:00
// record the new timeseries
activeConfigLabels = labels
// expose the new timeseries with a constant count of 1
ActiveConfig . With ( activeConfigLabels ) . Set ( 1 )
return nil
}
// track labels across metric updates, so we can delete old label sets and prevent leaks
2019-09-27 21:51:53 +00:00
var lastKnownGoodConfigLabels map [ string ] string
2019-01-12 04:58:27 +00:00
2019-09-27 21:51:53 +00:00
// SetLastKnownGoodConfig tracks labels according to the NodeConfig that was successfully applied last.
// It also tracks labels across metric updates so old labels can be safely deleted.
2019-01-12 04:58:27 +00:00
func SetLastKnownGoodConfig ( source * corev1 . NodeConfigSource ) error {
// compute the timeseries labels from the source
labels , err := configLabels ( source )
if err != nil {
return err
}
// clean up the old timeseries (WithLabelValues creates a new one for each distinct label set)
2019-09-27 21:51:53 +00:00
if ! LastKnownGoodConfig . Delete ( lastKnownGoodConfigLabels ) {
2021-03-18 22:40:29 +00:00
klog . InfoS ( "Failed to delete metric for labels. This may result in ambiguity from multiple metrics concurrently indicating different last known good configs." , "labels" , lastKnownGoodConfigLabels )
2019-09-27 21:51:53 +00:00
}
2019-01-12 04:58:27 +00:00
// record the new timeseries
lastKnownGoodConfigLabels = labels
// expose the new timeseries with a constant count of 1
LastKnownGoodConfig . With ( lastKnownGoodConfigLabels ) . Set ( 1 )
return nil
}
2019-09-27 21:51:53 +00:00
// SetConfigError sets a the ConfigError metric to 1 in case any errors were encountered.
2019-01-12 04:58:27 +00:00
func SetConfigError ( err bool ) {
if err {
ConfigError . Set ( 1 )
} else {
ConfigError . Set ( 0 )
}
}
2019-04-07 17:07:55 +00:00
2019-09-27 21:51:53 +00:00
// SetNodeName sets the NodeName Gauge to 1.
2019-04-07 17:07:55 +00:00
func SetNodeName ( name types . NodeName ) {
NodeName . WithLabelValues ( string ( name ) ) . Set ( 1 )
}