cpu: Metric 'package_throttles_total' is per package. (#657)

* cpu: Metric 'package_throttles_total' is per package.

'package_throttles_total' is per package, not per CPU. This also greatly
reduces the total number of CPU time series (especially on multi-core CPUs).

* cpu: Better handling of a cpulist edge-case.

* cpu: Extract the package number from the directory name.

Do not rely on the range index.

* cpu: Add package_throttle_count for node0 cpu1

This file must be ignored by the cpu collector, which reads the package
throttle count only from the first CPU listed in the node's cpulist.
pull/604/merge
Karsten Weiss 7 years ago committed by Ben Kochie
parent abb58a31e2
commit b0d5c00832

@ -17,8 +17,11 @@ package collector
import ( import (
"fmt" "fmt"
"io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"regexp"
"strings"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log" "github.com/prometheus/common/log"
@ -29,6 +32,10 @@ const (
cpuCollectorNamespace = "cpu" cpuCollectorNamespace = "cpu"
) )
var (
digitRegexp = regexp.MustCompile("[0-9]+")
)
type cpuCollector struct { type cpuCollector struct {
cpu *prometheus.Desc cpu *prometheus.Desc
cpuFreq *prometheus.Desc cpuFreq *prometheus.Desc
@ -65,6 +72,7 @@ func NewCPUCollector() (Collector, error) {
"Maximum cpu thread frequency in hertz.", "Maximum cpu thread frequency in hertz.",
[]string{"cpu"}, nil, []string{"cpu"}, nil,
), ),
// FIXME: This should be a per core metric, not per cpu!
cpuCoreThrottle: prometheus.NewDesc( cpuCoreThrottle: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, cpuCollectorNamespace, "core_throttles_total"), prometheus.BuildFQName(Namespace, cpuCollectorNamespace, "core_throttles_total"),
"Number of times this cpu core has been throttled.", "Number of times this cpu core has been throttled.",
@ -73,7 +81,7 @@ func NewCPUCollector() (Collector, error) {
cpuPackageThrottle: prometheus.NewDesc( cpuPackageThrottle: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, cpuCollectorNamespace, "package_throttles_total"), prometheus.BuildFQName(Namespace, cpuCollectorNamespace, "package_throttles_total"),
"Number of times this cpu package has been throttled.", "Number of times this cpu package has been throttled.",
[]string{"cpu"}, nil, []string{"node"}, nil,
), ),
}, nil }, nil
} }
@ -98,6 +106,7 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error {
var value uint64 var value uint64
// cpu loop
for _, cpu := range cpus { for _, cpu := range cpus {
_, cpuname := filepath.Split(cpu) _, cpuname := filepath.Split(cpu)
@ -106,17 +115,17 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error {
} else { } else {
// sysfs cpufreq values are kHz, thus multiply by 1000 to export base units (hz). // sysfs cpufreq values are kHz, thus multiply by 1000 to export base units (hz).
// See https://www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt // See https://www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt
if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_cur_freq")); err != nil { if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_cur_freq")); err != nil {
return err return err
} }
ch <- prometheus.MustNewConstMetric(c.cpuFreq, prometheus.GaugeValue, float64(value)*1000.0, cpuname) ch <- prometheus.MustNewConstMetric(c.cpuFreq, prometheus.GaugeValue, float64(value)*1000.0, cpuname)
if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_min_freq")); err != nil { if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_min_freq")); err != nil {
return err return err
} }
ch <- prometheus.MustNewConstMetric(c.cpuFreqMin, prometheus.GaugeValue, float64(value)*1000.0, cpuname) ch <- prometheus.MustNewConstMetric(c.cpuFreqMin, prometheus.GaugeValue, float64(value)*1000.0, cpuname)
if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_max_freq")); err != nil { if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_max_freq")); err != nil {
return err return err
} }
ch <- prometheus.MustNewConstMetric(c.cpuFreqMax, prometheus.GaugeValue, float64(value)*1000.0, cpuname) ch <- prometheus.MustNewConstMetric(c.cpuFreqMax, prometheus.GaugeValue, float64(value)*1000.0, cpuname)
@ -124,17 +133,44 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error {
if _, err := os.Stat(filepath.Join(cpu, "thermal_throttle")); os.IsNotExist(err) { if _, err := os.Stat(filepath.Join(cpu, "thermal_throttle")); os.IsNotExist(err) {
log.Debugf("CPU %q is missing thermal_throttle", cpu) log.Debugf("CPU %q is missing thermal_throttle", cpu)
} else { continue
if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle/core_throttle_count")); err != nil { }
if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err != nil {
return err return err
} }
ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, prometheus.CounterValue, float64(value), cpuname) ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, prometheus.CounterValue, float64(value), cpuname)
}
pkgs, err := filepath.Glob(sysFilePath("bus/node/devices/node[0-9]*"))
if err != nil {
return err
}
if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle/package_throttle_count")); err != nil { // package/node loop
for _, pkg := range pkgs {
if _, err := os.Stat(filepath.Join(pkg, "cpulist")); os.IsNotExist(err) {
log.Debugf("package %q is missing cpulist", pkg)
continue
}
cpulist, err := ioutil.ReadFile(filepath.Join(pkg, "cpulist"))
if err != nil {
log.Debugf("could not read cpulist of package %q", pkg)
return err return err
} }
ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), cpuname) // cpulist example of one package/node with HT: "0-11,24-35"
line := strings.Split(string(cpulist), "\n")[0]
firstCPU := strings.FieldsFunc(line, func(r rune) bool {
return r == '-' || r == ','
})[0]
if _, err := os.Stat(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); os.IsNotExist(err) {
log.Debugf("Package %q CPU %q is missing package_throttle", pkg, firstCPU)
continue
}
if value, err = readUintFromFile(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); err != nil {
return err
} }
pkgno := digitRegexp.FindAllString(pkg, 1)[0]
ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), pkgno)
} }
return nil return nil

@ -299,9 +299,7 @@ node_cpu_frequency_min_hertz{cpu="cpu1"} 8e+08
node_cpu_frequency_min_hertz{cpu="cpu3"} 1e+06 node_cpu_frequency_min_hertz{cpu="cpu3"} 1e+06
# HELP node_cpu_package_throttles_total Number of times this cpu package has been throttled. # HELP node_cpu_package_throttles_total Number of times this cpu package has been throttled.
# TYPE node_cpu_package_throttles_total counter # TYPE node_cpu_package_throttles_total counter
node_cpu_package_throttles_total{cpu="cpu0"} 30 node_cpu_package_throttles_total{node="0"} 30
node_cpu_package_throttles_total{cpu="cpu1"} 30
node_cpu_package_throttles_total{cpu="cpu2"} 6
# HELP node_disk_bytes_read The total number of bytes read successfully. # HELP node_disk_bytes_read The total number of bytes read successfully.
# TYPE node_disk_bytes_read counter # TYPE node_disk_bytes_read counter
node_disk_bytes_read{device="dm-0"} 5.13708655616e+11 node_disk_bytes_read{device="dm-0"} 5.13708655616e+11

@ -116,6 +116,42 @@ Lines: 1
1000 1000
Mode: 644 Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/bus/node
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/bus/node/devices
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/bus/node/devices/node0
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/bus/node/devices/node0/cpu0
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/bus/node/devices/node0/cpu0/thermal_throttle
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/bus/node/devices/node0/cpu0/thermal_throttle/package_throttle_count
Lines: 1
30
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/bus/node/devices/node0/cpu1
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/bus/node/devices/node0/cpu1/thermal_throttle
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/bus/node/devices/node0/cpu1/thermal_throttle/package_throttle_count
Lines: 1
30
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/bus/node/devices/node0/cpulist
Lines: 1
0-3
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class Directory: sys/class
Mode: 755 Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Loading…
Cancel
Save