From b0d5c00832eebf39beb7bc0b4b939ac429f6cd00 Mon Sep 17 00:00:00 2001 From: Karsten Weiss Date: Thu, 7 Sep 2017 23:24:18 +0200 Subject: [PATCH] cpu: Metric 'package_throttles_total' is per package. (#657) * cpu: Metric 'package_throttles_total' is per package. 'package_throttles_total' is per package, not per cpu. This also reduces the total number of cpu time series a lot (esp for multi core cpus). * cpu: Better handling of a cpulist edge-case. * cpu: Extract the package number from the directory name. Do not rely on the range index. * cpu: Add package_throttle_count for node0 cpu1 This file must be ignored by the cpu collector. --- collector/cpu_linux.go | 62 ++++++++++++++++++++++++------- collector/fixtures/e2e-output.txt | 4 +- collector/fixtures/sys.ttar | 36 ++++++++++++++++++ 3 files changed, 86 insertions(+), 16 deletions(-) diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index ebf47c72..dfd1667c 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -17,8 +17,11 @@ package collector import ( "fmt" + "io/ioutil" "os" "path/filepath" + "regexp" + "strings" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" @@ -29,6 +32,10 @@ const ( cpuCollectorNamespace = "cpu" ) +var ( + digitRegexp = regexp.MustCompile("[0-9]+") +) + type cpuCollector struct { cpu *prometheus.Desc cpuFreq *prometheus.Desc @@ -65,6 +72,7 @@ func NewCPUCollector() (Collector, error) { "Maximum cpu thread frequency in hertz.", []string{"cpu"}, nil, ), + // FIXME: This should be a per core metric, not per cpu! cpuCoreThrottle: prometheus.NewDesc( prometheus.BuildFQName(Namespace, cpuCollectorNamespace, "core_throttles_total"), "Number of times this cpu core has been throttled.", @@ -73,7 +81,7 @@ func NewCPUCollector() (Collector, error) { cpuPackageThrottle: prometheus.NewDesc( prometheus.BuildFQName(Namespace, cpuCollectorNamespace, "package_throttles_total"), "Number of times this cpu package has been throttled.", - []string{"cpu"}, nil, + []string{"node"}, nil, ), }, nil } @@ -98,6 +106,7 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { var value uint64 + // cpu loop for _, cpu := range cpus { _, cpuname := filepath.Split(cpu) @@ -106,17 +115,17 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { } else { // sysfs cpufreq values are kHz, thus multiply by 1000 to export base units (hz). // See https://www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt - if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_cur_freq")); err != nil { + if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_cur_freq")); err != nil { return err } ch <- prometheus.MustNewConstMetric(c.cpuFreq, prometheus.GaugeValue, float64(value)*1000.0, cpuname) - if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_min_freq")); err != nil { + if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_min_freq")); err != nil { return err } ch <- prometheus.MustNewConstMetric(c.cpuFreqMin, prometheus.GaugeValue, float64(value)*1000.0, cpuname) - if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq/scaling_max_freq")); err != nil { + if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_max_freq")); err != nil { return err } ch <- prometheus.MustNewConstMetric(c.cpuFreqMax, prometheus.GaugeValue, float64(value)*1000.0, cpuname) @@ -124,17 +133,44 @@ func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error { if _, err := os.Stat(filepath.Join(cpu, "thermal_throttle")); os.IsNotExist(err) { log.Debugf("CPU %q is missing thermal_throttle", cpu) - } else { - if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle/core_throttle_count")); err != nil { - return err - } - ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, prometheus.CounterValue, float64(value), cpuname) + continue + } + if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err != nil { + return err + } + ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle, prometheus.CounterValue, float64(value), cpuname) + } - if value, err = readUintFromFile(filepath.Join(cpu, "thermal_throttle/package_throttle_count")); err != nil { - return err - } - ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), cpuname) + pkgs, err := filepath.Glob(sysFilePath("bus/node/devices/node[0-9]*")) + if err != nil { + return err + } + + // package/node loop + for _, pkg := range pkgs { + if _, err := os.Stat(filepath.Join(pkg, "cpulist")); os.IsNotExist(err) { + log.Debugf("package %q is missing cpulist", pkg) + continue + } + cpulist, err := ioutil.ReadFile(filepath.Join(pkg, "cpulist")) + if err != nil { + log.Debugf("could not read cpulist of package %q", pkg) + return err + } + // cpulist example of one package/node with HT: "0-11,24-35" + line := strings.Split(string(cpulist), "\n")[0] + firstCPU := strings.FieldsFunc(line, func(r rune) bool { + return r == '-' || r == ',' + })[0] + if _, err := os.Stat(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); os.IsNotExist(err) { + log.Debugf("Package %q CPU %q is missing package_throttle", pkg, firstCPU) + continue + } + if value, err = readUintFromFile(filepath.Join(pkg, "cpu"+firstCPU, "thermal_throttle", "package_throttle_count")); err != nil { + return err } + pkgno := digitRegexp.FindAllString(pkg, 1)[0] + ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle, prometheus.CounterValue, float64(value), pkgno) } return nil diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index bd1b2b24..93a6a6c7 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -299,9 +299,7 @@ node_cpu_frequency_min_hertz{cpu="cpu1"} 8e+08 node_cpu_frequency_min_hertz{cpu="cpu3"} 1e+06 # HELP node_cpu_package_throttles_total Number of times this cpu package has been throttled. # TYPE node_cpu_package_throttles_total counter -node_cpu_package_throttles_total{cpu="cpu0"} 30 -node_cpu_package_throttles_total{cpu="cpu1"} 30 -node_cpu_package_throttles_total{cpu="cpu2"} 6 +node_cpu_package_throttles_total{node="0"} 30 # HELP node_disk_bytes_read The total number of bytes read successfully. # TYPE node_disk_bytes_read counter node_disk_bytes_read{device="dm-0"} 5.13708655616e+11 diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 710e04a8..829425ab 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -116,6 +116,42 @@ Lines: 1 1000 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0/cpu0 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0/cpu0/thermal_throttle +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/bus/node/devices/node0/cpu0/thermal_throttle/package_throttle_count +Lines: 1 +30 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0/cpu1 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/bus/node/devices/node0/cpu1/thermal_throttle +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/bus/node/devices/node0/cpu1/thermal_throttle/package_throttle_count +Lines: 1 +30 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/bus/node/devices/node0/cpulist +Lines: 1 +0-3 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -