mirror of https://github.com/k3s-io/k3s
350 lines
12 KiB
Go
350 lines
12 KiB
Go
|
// Copyright 2020 Google Inc. All Rights Reserved.
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
package metrics
|
||
|
|
||
|
import (
|
||
|
"strconv"
|
||
|
|
||
|
"github.com/google/cadvisor/container"
|
||
|
info "github.com/google/cadvisor/info/v1"
|
||
|
"github.com/prometheus/client_golang/prometheus"
|
||
|
|
||
|
"k8s.io/klog/v2"
|
||
|
)
|
||
|
|
||
|
// baseLabelsNames lists the label names that collectMachineInfo puts in front
// of every machine metric's own labels. The order matches the values it
// supplies: MachineID, SystemUUID, BootID.
var baseLabelsNames = []string{"machine_id", "system_uuid", "boot_id"}
|
||
|
|
||
|
// Names of the extra labels that individual machine metrics may carry in
// addition to the base labels.
const (
	prometheusModeLabelName     = "mode"
	prometheusTypeLabelName     = "type"
	prometheusLevelLabelName    = "level"
	prometheusNodeLabelName     = "node_id"
	prometheusCoreLabelName     = "core_id"
	prometheusThreadLabelName   = "thread_id"
	prometheusPageSizeLabelName = "page_size"
)

// Values of the "mode" label used by the machine_nvm_capacity metric.
const (
	nvmMemoryMode    = "memory_mode"
	nvmAppDirectMode = "app_direct_mode"
)

// Property selectors understood by getMemoryByType.
const (
	memoryByTypeDimmCountKey    = "DimmCount"
	memoryByTypeDimmCapacityKey = "Capacity"
)

// emptyLabelValue is the label value used when a label does not apply, e.g.
// the core_id of a cache that belongs to a whole node (see getCaches).
const emptyLabelValue = ""
|
||
|
|
||
|
// machineMetric describes a multi-dimensional metric used for exposing a
|
||
|
// certain type of machine statistic.
|
||
|
type machineMetric struct {
|
||
|
name string
|
||
|
help string
|
||
|
valueType prometheus.ValueType
|
||
|
extraLabels []string
|
||
|
condition func(machineInfo *info.MachineInfo) bool
|
||
|
getValues func(machineInfo *info.MachineInfo) metricValues
|
||
|
}
|
||
|
|
||
|
func (metric *machineMetric) desc(baseLabels []string) *prometheus.Desc {
|
||
|
return prometheus.NewDesc(metric.name, metric.help, append(baseLabels, metric.extraLabels...), nil)
|
||
|
}
|
||
|
|
||
|
// PrometheusMachineCollector implements prometheus.Collector.
|
||
|
type PrometheusMachineCollector struct {
|
||
|
infoProvider infoProvider
|
||
|
errors prometheus.Gauge
|
||
|
machineMetrics []machineMetric
|
||
|
}
|
||
|
|
||
|
// NewPrometheusMachineCollector returns a new PrometheusCollector.
|
||
|
func NewPrometheusMachineCollector(i infoProvider, includedMetrics container.MetricSet) *PrometheusMachineCollector {
|
||
|
c := &PrometheusMachineCollector{
|
||
|
|
||
|
infoProvider: i,
|
||
|
errors: prometheus.NewGauge(prometheus.GaugeOpts{
|
||
|
Namespace: "machine",
|
||
|
Name: "scrape_error",
|
||
|
Help: "1 if there was an error while getting machine metrics, 0 otherwise.",
|
||
|
}),
|
||
|
machineMetrics: []machineMetric{
|
||
|
{
|
||
|
name: "machine_cpu_physical_cores",
|
||
|
help: "Number of physical CPU cores.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return metricValues{{value: float64(machineInfo.NumPhysicalCores), timestamp: machineInfo.Timestamp}}
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_cpu_cores",
|
||
|
help: "Number of logical CPU cores.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return metricValues{{value: float64(machineInfo.NumCores), timestamp: machineInfo.Timestamp}}
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_cpu_sockets",
|
||
|
help: "Number of CPU sockets.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return metricValues{{value: float64(machineInfo.NumSockets), timestamp: machineInfo.Timestamp}}
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_memory_bytes",
|
||
|
help: "Amount of memory installed on the machine.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return metricValues{{value: float64(machineInfo.MemoryCapacity), timestamp: machineInfo.Timestamp}}
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_dimm_count",
|
||
|
help: "Number of RAM DIMM (all types memory modules) value labeled by dimm type.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
extraLabels: []string{prometheusTypeLabelName},
|
||
|
condition: func(machineInfo *info.MachineInfo) bool { return len(machineInfo.MemoryByType) != 0 },
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return getMemoryByType(machineInfo, memoryByTypeDimmCountKey)
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_dimm_capacity_bytes",
|
||
|
help: "Total RAM DIMM capacity (all types memory modules) value labeled by dimm type.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
extraLabels: []string{prometheusTypeLabelName},
|
||
|
condition: func(machineInfo *info.MachineInfo) bool { return len(machineInfo.MemoryByType) != 0 },
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return getMemoryByType(machineInfo, memoryByTypeDimmCapacityKey)
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_nvm_capacity",
|
||
|
help: "NVM capacity value labeled by NVM mode (memory mode or app direct mode).",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
extraLabels: []string{prometheusModeLabelName},
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return metricValues{
|
||
|
{value: float64(machineInfo.NVMInfo.MemoryModeCapacity), labels: []string{nvmMemoryMode}, timestamp: machineInfo.Timestamp},
|
||
|
{value: float64(machineInfo.NVMInfo.AppDirectModeCapacity), labels: []string{nvmAppDirectMode}, timestamp: machineInfo.Timestamp},
|
||
|
}
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_nvm_avg_power_budget_watts",
|
||
|
help: "NVM power budget.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return metricValues{{value: float64(machineInfo.NVMInfo.AvgPowerBudget), timestamp: machineInfo.Timestamp}}
|
||
|
},
|
||
|
},
|
||
|
},
|
||
|
}
|
||
|
|
||
|
if includedMetrics.Has(container.CPUTopologyMetrics) {
|
||
|
c.machineMetrics = append(c.machineMetrics, []machineMetric{
|
||
|
{
|
||
|
name: "machine_cpu_cache_capacity_bytes",
|
||
|
help: "Cache size in bytes assigned to NUMA node and CPU core.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
extraLabels: []string{prometheusNodeLabelName, prometheusCoreLabelName, prometheusTypeLabelName, prometheusLevelLabelName},
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return getCaches(machineInfo)
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_thread_siblings_count",
|
||
|
help: "Number of CPU thread siblings.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
extraLabels: []string{prometheusNodeLabelName, prometheusCoreLabelName, prometheusThreadLabelName},
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return getThreadsSiblingsCount(machineInfo)
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_node_memory_capacity_bytes",
|
||
|
help: "Amount of memory assigned to NUMA node.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
extraLabels: []string{prometheusNodeLabelName},
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return getNodeMemory(machineInfo)
|
||
|
},
|
||
|
},
|
||
|
{
|
||
|
name: "machine_node_hugepages_count",
|
||
|
help: "Numer of hugepages assigned to NUMA node.",
|
||
|
valueType: prometheus.GaugeValue,
|
||
|
extraLabels: []string{prometheusNodeLabelName, prometheusPageSizeLabelName},
|
||
|
getValues: func(machineInfo *info.MachineInfo) metricValues {
|
||
|
return getHugePagesCount(machineInfo)
|
||
|
},
|
||
|
},
|
||
|
}...)
|
||
|
}
|
||
|
return c
|
||
|
}
|
||
|
|
||
|
// Describe describes all the machine metrics ever exported by cadvisor. It
|
||
|
// implements prometheus.PrometheusCollector.
|
||
|
func (collector *PrometheusMachineCollector) Describe(ch chan<- *prometheus.Desc) {
|
||
|
collector.errors.Describe(ch)
|
||
|
for _, metric := range collector.machineMetrics {
|
||
|
ch <- metric.desc([]string{})
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Collect fetches information about machine and delivers them as
|
||
|
// Prometheus metrics. It implements prometheus.PrometheusCollector.
|
||
|
func (collector *PrometheusMachineCollector) Collect(ch chan<- prometheus.Metric) {
|
||
|
collector.errors.Set(0)
|
||
|
collector.collectMachineInfo(ch)
|
||
|
collector.errors.Collect(ch)
|
||
|
}
|
||
|
|
||
|
func (collector *PrometheusMachineCollector) collectMachineInfo(ch chan<- prometheus.Metric) {
|
||
|
machineInfo, err := collector.infoProvider.GetMachineInfo()
|
||
|
if err != nil {
|
||
|
collector.errors.Set(1)
|
||
|
klog.Warningf("Couldn't get machine info: %s", err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
baseLabelsValues := []string{machineInfo.MachineID, machineInfo.SystemUUID, machineInfo.BootID}
|
||
|
|
||
|
for _, metric := range collector.machineMetrics {
|
||
|
if metric.condition != nil && !metric.condition(machineInfo) {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
for _, metricValue := range metric.getValues(machineInfo) {
|
||
|
labelValues := make([]string, len(baseLabelsValues))
|
||
|
copy(labelValues, baseLabelsValues)
|
||
|
if len(metric.extraLabels) != 0 {
|
||
|
labelValues = append(labelValues, metricValue.labels...)
|
||
|
}
|
||
|
|
||
|
prometheusMetric := prometheus.MustNewConstMetric(metric.desc(baseLabelsNames),
|
||
|
metric.valueType, metricValue.value, labelValues...)
|
||
|
|
||
|
if metricValue.timestamp.IsZero() {
|
||
|
ch <- prometheusMetric
|
||
|
} else {
|
||
|
ch <- prometheus.NewMetricWithTimestamp(metricValue.timestamp, prometheusMetric)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func getMemoryByType(machineInfo *info.MachineInfo, property string) metricValues {
|
||
|
mValues := make(metricValues, 0, len(machineInfo.MemoryByType))
|
||
|
for memoryType, memoryInfo := range machineInfo.MemoryByType {
|
||
|
propertyValue := 0.0
|
||
|
switch property {
|
||
|
case memoryByTypeDimmCapacityKey:
|
||
|
propertyValue = float64(memoryInfo.Capacity)
|
||
|
case memoryByTypeDimmCountKey:
|
||
|
propertyValue = float64(memoryInfo.DimmCount)
|
||
|
default:
|
||
|
klog.Warningf("Incorrect propery name for MemoryByType, property %s", property)
|
||
|
return metricValues{}
|
||
|
}
|
||
|
mValues = append(mValues, metricValue{value: propertyValue, labels: []string{memoryType}, timestamp: machineInfo.Timestamp})
|
||
|
}
|
||
|
return mValues
|
||
|
}
|
||
|
|
||
|
func getThreadsSiblingsCount(machineInfo *info.MachineInfo) metricValues {
|
||
|
mValues := make(metricValues, 0, machineInfo.NumCores)
|
||
|
for _, node := range machineInfo.Topology {
|
||
|
nodeID := strconv.Itoa(node.Id)
|
||
|
|
||
|
for _, core := range node.Cores {
|
||
|
coreID := strconv.Itoa(core.Id)
|
||
|
siblingsCount := len(core.Threads)
|
||
|
|
||
|
for _, thread := range core.Threads {
|
||
|
mValues = append(mValues,
|
||
|
metricValue{
|
||
|
value: float64(siblingsCount),
|
||
|
labels: []string{nodeID, coreID, strconv.Itoa(thread)},
|
||
|
timestamp: machineInfo.Timestamp,
|
||
|
})
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return mValues
|
||
|
}
|
||
|
|
||
|
func getNodeMemory(machineInfo *info.MachineInfo) metricValues {
|
||
|
mValues := make(metricValues, 0, len(machineInfo.Topology))
|
||
|
for _, node := range machineInfo.Topology {
|
||
|
nodeID := strconv.Itoa(node.Id)
|
||
|
mValues = append(mValues,
|
||
|
metricValue{
|
||
|
value: float64(node.Memory),
|
||
|
labels: []string{nodeID},
|
||
|
timestamp: machineInfo.Timestamp,
|
||
|
})
|
||
|
}
|
||
|
return mValues
|
||
|
}
|
||
|
|
||
|
func getHugePagesCount(machineInfo *info.MachineInfo) metricValues {
|
||
|
mValues := make(metricValues, 0)
|
||
|
for _, node := range machineInfo.Topology {
|
||
|
nodeID := strconv.Itoa(node.Id)
|
||
|
|
||
|
for _, hugePage := range node.HugePages {
|
||
|
mValues = append(mValues,
|
||
|
metricValue{
|
||
|
value: float64(hugePage.NumPages),
|
||
|
labels: []string{nodeID, strconv.FormatUint(hugePage.PageSize, 10)},
|
||
|
timestamp: machineInfo.Timestamp,
|
||
|
})
|
||
|
}
|
||
|
}
|
||
|
return mValues
|
||
|
}
|
||
|
|
||
|
func getCaches(machineInfo *info.MachineInfo) metricValues {
|
||
|
mValues := make(metricValues, 0)
|
||
|
for _, node := range machineInfo.Topology {
|
||
|
nodeID := strconv.Itoa(node.Id)
|
||
|
|
||
|
for _, core := range node.Cores {
|
||
|
coreID := strconv.Itoa(core.Id)
|
||
|
|
||
|
for _, cache := range core.Caches {
|
||
|
mValues = append(mValues,
|
||
|
metricValue{
|
||
|
value: float64(cache.Size),
|
||
|
labels: []string{nodeID, coreID, cache.Type, strconv.Itoa(cache.Level)},
|
||
|
timestamp: machineInfo.Timestamp,
|
||
|
})
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for _, cache := range node.Caches {
|
||
|
mValues = append(mValues,
|
||
|
metricValue{
|
||
|
value: float64(cache.Size),
|
||
|
labels: []string{nodeID, emptyLabelValue, cache.Type, strconv.Itoa(cache.Level)},
|
||
|
timestamp: machineInfo.Timestamp,
|
||
|
})
|
||
|
}
|
||
|
}
|
||
|
return mValues
|
||
|
}
|