Expose cpu bugs and flags as info metrics. (#1788)
* Expose cpu bugs and flags as info metrics with a regexp filter.
* Automatically enable CPU info metrics when using flags or bugs feature.

Signed-off-by: domgoer <domdoumc@gmail.com>

pull/1801/head
parent
f4b89c79a2
commit
503e4fc848
|
@ -18,6 +18,7 @@ package collector
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
@ -32,16 +33,23 @@ type cpuCollector struct {
|
||||||
fs procfs.FS
|
fs procfs.FS
|
||||||
cpu *prometheus.Desc
|
cpu *prometheus.Desc
|
||||||
cpuInfo *prometheus.Desc
|
cpuInfo *prometheus.Desc
|
||||||
|
cpuFlagsInfo *prometheus.Desc
|
||||||
|
cpuBugsInfo *prometheus.Desc
|
||||||
cpuGuest *prometheus.Desc
|
cpuGuest *prometheus.Desc
|
||||||
cpuCoreThrottle *prometheus.Desc
|
cpuCoreThrottle *prometheus.Desc
|
||||||
cpuPackageThrottle *prometheus.Desc
|
cpuPackageThrottle *prometheus.Desc
|
||||||
logger log.Logger
|
logger log.Logger
|
||||||
cpuStats []procfs.CPUStat
|
cpuStats []procfs.CPUStat
|
||||||
cpuStatsMutex sync.Mutex
|
cpuStatsMutex sync.Mutex
|
||||||
|
|
||||||
|
cpuFlagsIncludeRegexp *regexp.Regexp
|
||||||
|
cpuBugsIncludeRegexp *regexp.Regexp
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
|
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
|
||||||
|
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
|
||||||
|
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
@ -54,7 +62,7 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to open procfs: %w", err)
|
return nil, fmt.Errorf("failed to open procfs: %w", err)
|
||||||
}
|
}
|
||||||
return &cpuCollector{
|
c := &cpuCollector{
|
||||||
fs: fs,
|
fs: fs,
|
||||||
cpu: nodeCPUSecondsDesc,
|
cpu: nodeCPUSecondsDesc,
|
||||||
cpuInfo: prometheus.NewDesc(
|
cpuInfo: prometheus.NewDesc(
|
||||||
|
@ -62,6 +70,16 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
|
||||||
"CPU information from /proc/cpuinfo.",
|
"CPU information from /proc/cpuinfo.",
|
||||||
[]string{"package", "core", "cpu", "vendor", "family", "model", "model_name", "microcode", "stepping", "cachesize"}, nil,
|
[]string{"package", "core", "cpu", "vendor", "family", "model", "model_name", "microcode", "stepping", "cachesize"}, nil,
|
||||||
),
|
),
|
||||||
|
cpuFlagsInfo: prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "flag_info"),
|
||||||
|
"The `flags` field of CPU information from /proc/cpuinfo.",
|
||||||
|
[]string{"flag"}, nil,
|
||||||
|
),
|
||||||
|
cpuBugsInfo: prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "bug_info"),
|
||||||
|
"The `bugs` field of CPU information from /proc/cpuinfo.",
|
||||||
|
[]string{"bug"}, nil,
|
||||||
|
),
|
||||||
cpuGuest: prometheus.NewDesc(
|
cpuGuest: prometheus.NewDesc(
|
||||||
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "guest_seconds_total"),
|
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "guest_seconds_total"),
|
||||||
"Seconds the cpus spent in guests (VMs) for each mode.",
|
"Seconds the cpus spent in guests (VMs) for each mode.",
|
||||||
|
@ -78,7 +96,34 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
|
||||||
[]string{"package"}, nil,
|
[]string{"package"}, nil,
|
||||||
),
|
),
|
||||||
logger: logger,
|
logger: logger,
|
||||||
}, nil
|
}
|
||||||
|
err = c.compileIncludeFlags(flagsInclude, bugsInclude)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("fail to compile --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include, the values of them must be regular expressions: %w", err)
|
||||||
|
}
|
||||||
|
return c, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *cpuCollector) compileIncludeFlags(flagsIncludeFlag, bugsIncludeFlag *string) error {
|
||||||
|
if (*flagsIncludeFlag != "" || *bugsIncludeFlag != "") && !*enableCPUInfo {
|
||||||
|
*enableCPUInfo = true
|
||||||
|
level.Info(c.logger).Log("msg", "--collector.cpu.info has been set to `true` because you set the following flags, like --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include")
|
||||||
|
}
|
||||||
|
|
||||||
|
var err error
|
||||||
|
if *flagsIncludeFlag != "" {
|
||||||
|
c.cpuFlagsIncludeRegexp, err = regexp.Compile(*flagsIncludeFlag)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if *bugsIncludeFlag != "" {
|
||||||
|
c.cpuBugsIncludeRegexp, err = regexp.Compile(*bugsIncludeFlag)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update implements Collector and exposes cpu related metrics from /proc/stat and /sys/.../cpu/.
|
// Update implements Collector and exposes cpu related metrics from /proc/stat and /sys/.../cpu/.
|
||||||
|
@ -117,6 +162,31 @@ func (c *cpuCollector) updateInfo(ch chan<- prometheus.Metric) error {
|
||||||
cpu.Microcode,
|
cpu.Microcode,
|
||||||
cpu.Stepping,
|
cpu.Stepping,
|
||||||
cpu.CacheSize)
|
cpu.CacheSize)
|
||||||
|
|
||||||
|
if err := updateFieldInfo(cpu.Flags, c.cpuFlagsIncludeRegexp, c.cpuFlagsInfo, ch); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := updateFieldInfo(cpu.Bugs, c.cpuBugsIncludeRegexp, c.cpuBugsInfo, ch); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func updateFieldInfo(valueList []string, filter *regexp.Regexp, desc *prometheus.Desc, ch chan<- prometheus.Metric) error {
|
||||||
|
if filter == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, val := range valueList {
|
||||||
|
if !filter.MatchString(val) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ch <- prometheus.MustNewConstMetric(desc,
|
||||||
|
prometheus.GaugeValue,
|
||||||
|
1,
|
||||||
|
val,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -184,12 +184,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0
|
||||||
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
|
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
|
||||||
# TYPE node_cooling_device_max_state gauge
|
# TYPE node_cooling_device_max_state gauge
|
||||||
node_cooling_device_max_state{name="0",type="Processor"} 3
|
node_cooling_device_max_state{name="0",type="Processor"} 3
|
||||||
|
# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo.
|
||||||
|
# TYPE node_cpu_bug_info gauge
|
||||||
|
node_cpu_bug_info{bug="cpu_meltdown"} 1
|
||||||
|
node_cpu_bug_info{bug="mds"} 1
|
||||||
|
node_cpu_bug_info{bug="spectre_v1"} 1
|
||||||
|
node_cpu_bug_info{bug="spectre_v2"} 1
|
||||||
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
|
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
|
||||||
# TYPE node_cpu_core_throttles_total counter
|
# TYPE node_cpu_core_throttles_total counter
|
||||||
node_cpu_core_throttles_total{core="0",package="0"} 5
|
node_cpu_core_throttles_total{core="0",package="0"} 5
|
||||||
node_cpu_core_throttles_total{core="0",package="1"} 0
|
node_cpu_core_throttles_total{core="0",package="1"} 0
|
||||||
node_cpu_core_throttles_total{core="1",package="0"} 0
|
node_cpu_core_throttles_total{core="1",package="0"} 0
|
||||||
node_cpu_core_throttles_total{core="1",package="1"} 9
|
node_cpu_core_throttles_total{core="1",package="1"} 9
|
||||||
|
# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo.
|
||||||
|
# TYPE node_cpu_flag_info gauge
|
||||||
|
node_cpu_flag_info{flag="aes"} 1
|
||||||
|
node_cpu_flag_info{flag="avx"} 1
|
||||||
|
node_cpu_flag_info{flag="avx2"} 1
|
||||||
|
node_cpu_flag_info{flag="constant_tsc"} 1
|
||||||
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
|
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
|
||||||
# TYPE node_cpu_guest_seconds_total counter
|
# TYPE node_cpu_guest_seconds_total counter
|
||||||
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01
|
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01
|
||||||
|
|
|
@ -232,12 +232,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0
|
||||||
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
|
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
|
||||||
# TYPE node_cooling_device_max_state gauge
|
# TYPE node_cooling_device_max_state gauge
|
||||||
node_cooling_device_max_state{name="0",type="Processor"} 3
|
node_cooling_device_max_state{name="0",type="Processor"} 3
|
||||||
|
# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo.
|
||||||
|
# TYPE node_cpu_bug_info gauge
|
||||||
|
node_cpu_bug_info{bug="cpu_meltdown"} 1
|
||||||
|
node_cpu_bug_info{bug="mds"} 1
|
||||||
|
node_cpu_bug_info{bug="spectre_v1"} 1
|
||||||
|
node_cpu_bug_info{bug="spectre_v2"} 1
|
||||||
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
|
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
|
||||||
# TYPE node_cpu_core_throttles_total counter
|
# TYPE node_cpu_core_throttles_total counter
|
||||||
node_cpu_core_throttles_total{core="0",package="0"} 5
|
node_cpu_core_throttles_total{core="0",package="0"} 5
|
||||||
node_cpu_core_throttles_total{core="0",package="1"} 0
|
node_cpu_core_throttles_total{core="0",package="1"} 0
|
||||||
node_cpu_core_throttles_total{core="1",package="0"} 0
|
node_cpu_core_throttles_total{core="1",package="0"} 0
|
||||||
node_cpu_core_throttles_total{core="1",package="1"} 9
|
node_cpu_core_throttles_total{core="1",package="1"} 9
|
||||||
|
# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo.
|
||||||
|
# TYPE node_cpu_flag_info gauge
|
||||||
|
node_cpu_flag_info{flag="aes"} 1
|
||||||
|
node_cpu_flag_info{flag="avx"} 1
|
||||||
|
node_cpu_flag_info{flag="avx2"} 1
|
||||||
|
node_cpu_flag_info{flag="constant_tsc"} 1
|
||||||
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
|
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
|
||||||
# TYPE node_cpu_guest_seconds_total counter
|
# TYPE node_cpu_guest_seconds_total counter
|
||||||
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01
|
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01
|
||||||
|
|
|
@ -107,6 +107,8 @@ fi
|
||||||
--collector.qdisc.fixtures="collector/fixtures/qdisc/" \
|
--collector.qdisc.fixtures="collector/fixtures/qdisc/" \
|
||||||
--collector.netclass.ignored-devices="(bond0|dmz|int)" \
|
--collector.netclass.ignored-devices="(bond0|dmz|int)" \
|
||||||
--collector.cpu.info \
|
--collector.cpu.info \
|
||||||
|
--collector.cpu.info.flags-include="^(aes|avx.?|constant_tsc)$" \
|
||||||
|
--collector.cpu.info.bugs-include="^(cpu_meltdown|spectre_.*|mds)$" \
|
||||||
--web.listen-address "127.0.0.1:${port}" \
|
--web.listen-address "127.0.0.1:${port}" \
|
||||||
--log.level="debug" > "${tmpdir}/node_exporter.log" 2>&1 &
|
--log.level="debug" > "${tmpdir}/node_exporter.log" 2>&1 &
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue