From b4fa10ca9d06087d5443673299e23f3433668e39 Mon Sep 17 00:00:00 2001
From: Ben Kochie
Date: Tue, 16 Aug 2016 17:10:23 +0200
Subject: [PATCH] Add collector for Linux EDAC

Collect "Error detection and correction" metrics from memory controllers.

* Supported on Linux only.
* Add basic fixtures.
* Enabled by default.
---
 README.md                                     |   1 +
 collector/edac_linux.go                       | 151 ++++++++++++++++++
 .../sys/devices/system/edac/mc/mc0/ce_count   |   1 +
 .../system/edac/mc/mc0/ce_noinfo_count        |   1 +
 .../system/edac/mc/mc0/csrow0/ce_count        |   1 +
 .../system/edac/mc/mc0/csrow0/ue_count        |   1 +
 .../sys/devices/system/edac/mc/mc0/ue_count   |   1 +
 .../system/edac/mc/mc0/ue_noinfo_count        |   1 +
 node_exporter.go                              |   2 +-
 9 files changed, 159 insertions(+), 1 deletion(-)
 create mode 100644 collector/edac_linux.go
 create mode 100644 collector/fixtures/sys/devices/system/edac/mc/mc0/ce_count
 create mode 100644 collector/fixtures/sys/devices/system/edac/mc/mc0/ce_noinfo_count
 create mode 100644 collector/fixtures/sys/devices/system/edac/mc/mc0/csrow0/ce_count
 create mode 100644 collector/fixtures/sys/devices/system/edac/mc/mc0/csrow0/ue_count
 create mode 100644 collector/fixtures/sys/devices/system/edac/mc/mc0/ue_count
 create mode 100644 collector/fixtures/sys/devices/system/edac/mc/mc0/ue_noinfo_count

diff --git a/README.md b/README.md
index ba0b74bb..558ffab7 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Name | Description | OS
 conntrack | Shows conntrack statistics (does nothing if no `/proc/sys/net/netfilter/` present). | Linux
 cpu | Exposes CPU statistics | Darwin, Dragonfly, FreeBSD
 diskstats | Exposes disk I/O statistics from `/proc/diskstats`. | Linux
+edac | Exposes error detection and correction statistics. | Linux
 entropy | Exposes available entropy. | Linux
 filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux
 filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD
diff --git a/collector/edac_linux.go b/collector/edac_linux.go
new file mode 100644
index 00000000..8b6f6ab8
--- /dev/null
+++ b/collector/edac_linux.go
@@ -0,0 +1,151 @@
+// Copyright 2015 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !noedac
+
+package collector
+
+import (
+	"fmt"
+	"path"
+	"path/filepath"
+	"regexp"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+const (
+	edacSubsystem = "edac"
+)
+
+var (
+	edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`)
+	edacMemCsrowRE      = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/csrow([0-9]*)`)
+)
+
+type edacMCMetric struct {
+	metricName    string
+	metricType    prometheus.ValueType
+	metricHelp    string
+	memController string
+	value         float64
+}
+
+type edacCollector struct {
+	ceCount      *prometheus.Desc
+	ueCount      *prometheus.Desc
+	csRowCECount *prometheus.Desc
+	csRowUECount *prometheus.Desc
+}
+
+func init() {
+	Factories["edac"] = NewEdacCollector
+}
+
+// NewEdacCollector returns a new Collector exposing per-controller and
+// per-csrow correctable/uncorrectable memory error counters (edac stats).
+func NewEdacCollector() (Collector, error) {
+	return &edacCollector{
+		ceCount: prometheus.NewDesc(
+			prometheus.BuildFQName(Namespace, edacSubsystem, "correctable_errors_total"),
+			"Total correctable memory errors.",
+			[]string{"controller"}, nil,
+		),
+		ueCount: prometheus.NewDesc(
+			prometheus.BuildFQName(Namespace, edacSubsystem, "uncorrectable_errors_total"),
+			"Total uncorrectable memory errors.",
+			[]string{"controller"}, nil,
+		),
+		csRowCECount: prometheus.NewDesc(
+			prometheus.BuildFQName(Namespace, edacSubsystem, "csrow_correctable_errors_total"),
+			"Total correctable memory errors for this csrow.",
+			[]string{"controller", "csrow"}, nil,
+		),
+		csRowUECount: prometheus.NewDesc(
+			prometheus.BuildFQName(Namespace, edacSubsystem, "csrow_uncorrectable_errors_total"),
+			"Total uncorrectable memory errors for this csrow.",
+			[]string{"controller", "csrow"}, nil,
+		),
+	}, nil
+}
+
+func (c *edacCollector) Update(ch chan<- prometheus.Metric) (err error) {
+	memControllers, err := filepath.Glob(sysFilePath("devices/system/edac/mc/mc[0-9]*"))
+	if err != nil {
+		return err
+	}
+	for _, controller := range memControllers {
+		controllerMatch := edacMemControllerRE.FindStringSubmatch(controller)
+		if controllerMatch == nil {
+			return fmt.Errorf("controller string didn't match regexp: %s", controller)
+		}
+		controllerNumber := controllerMatch[1]
+
+		value, err := readUintFromFile(path.Join(controller, "ce_count"))
+		if err != nil {
+			return fmt.Errorf("couldn't get ce_count for controller %s: %s", controllerNumber, err)
+		}
+		ch <- prometheus.MustNewConstMetric(
+			c.ceCount, prometheus.CounterValue, float64(value), controllerNumber)
+
+		value, err = readUintFromFile(path.Join(controller, "ce_noinfo_count"))
+		if err != nil {
+			return fmt.Errorf("couldn't get ce_noinfo_count for controller %s: %s", controllerNumber, err)
+		}
+		ch <- prometheus.MustNewConstMetric(
+			c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown")
+
+		value, err = readUintFromFile(path.Join(controller, "ue_count"))
+		if err != nil {
+			return fmt.Errorf("couldn't get ue_count for controller %s: %s", controllerNumber, err)
+		}
+		ch <- prometheus.MustNewConstMetric(
+			c.ueCount, prometheus.CounterValue, float64(value), controllerNumber)
+
+		value, err = readUintFromFile(path.Join(controller, "ue_noinfo_count"))
+		if err != nil {
+			return fmt.Errorf("couldn't get ue_noinfo_count for controller %s: %s", controllerNumber, err)
+		}
+		ch <- prometheus.MustNewConstMetric(
+			c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, "unknown")
+
+		// Walk this controller's csrow directories; the *_noinfo_count values above use csrow="unknown".
+		csrows, err := filepath.Glob(filepath.Join(controller, "csrow[0-9]*"))
+		if err != nil {
+			return err
+		}
+		for _, csrow := range csrows {
+			csrowMatch := edacMemCsrowRE.FindStringSubmatch(csrow)
+			if csrowMatch == nil {
+				return fmt.Errorf("csrow string didn't match regexp: %s", csrow)
+			}
+			csrowNumber := csrowMatch[1]
+
+			value, err = readUintFromFile(path.Join(csrow, "ce_count"))
+			if err != nil {
+				return fmt.Errorf("couldn't get ce_count for controller/csrow %s/%s: %s", controllerNumber, csrowNumber, err)
+			}
+			ch <- prometheus.MustNewConstMetric(
+				c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)
+
+			value, err = readUintFromFile(path.Join(csrow, "ue_count"))
+			if err != nil {
+				return fmt.Errorf("couldn't get ue_count for controller/csrow %s/%s: %s", controllerNumber, csrowNumber, err)
+			}
+			ch <- prometheus.MustNewConstMetric(
+				c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)
+		}
+	}
+
+	return nil
+}
diff --git a/collector/fixtures/sys/devices/system/edac/mc/mc0/ce_count b/collector/fixtures/sys/devices/system/edac/mc/mc0/ce_count
new file mode 100644
index 00000000..d00491fd
--- /dev/null
+++ b/collector/fixtures/sys/devices/system/edac/mc/mc0/ce_count
@@ -0,0 +1 @@
+1
diff --git a/collector/fixtures/sys/devices/system/edac/mc/mc0/ce_noinfo_count b/collector/fixtures/sys/devices/system/edac/mc/mc0/ce_noinfo_count
new file mode 100644
index 00000000..0cfbf088
--- /dev/null
+++ b/collector/fixtures/sys/devices/system/edac/mc/mc0/ce_noinfo_count
@@ -0,0 +1 @@
+2
diff --git a/collector/fixtures/sys/devices/system/edac/mc/mc0/csrow0/ce_count b/collector/fixtures/sys/devices/system/edac/mc/mc0/csrow0/ce_count
new file mode 100644
index 00000000..00750edc
--- /dev/null
+++ b/collector/fixtures/sys/devices/system/edac/mc/mc0/csrow0/ce_count
@@ -0,0 +1 @@
+3
diff --git a/collector/fixtures/sys/devices/system/edac/mc/mc0/csrow0/ue_count b/collector/fixtures/sys/devices/system/edac/mc/mc0/csrow0/ue_count
new file mode 100644
index 00000000..b8626c4c
--- /dev/null
+++ b/collector/fixtures/sys/devices/system/edac/mc/mc0/csrow0/ue_count
@@ -0,0 +1 @@
+4
diff --git a/collector/fixtures/sys/devices/system/edac/mc/mc0/ue_count b/collector/fixtures/sys/devices/system/edac/mc/mc0/ue_count
new file mode 100644
index 00000000..7ed6ff82
--- /dev/null
+++ b/collector/fixtures/sys/devices/system/edac/mc/mc0/ue_count
@@ -0,0 +1 @@
+5
diff --git a/collector/fixtures/sys/devices/system/edac/mc/mc0/ue_noinfo_count b/collector/fixtures/sys/devices/system/edac/mc/mc0/ue_noinfo_count
new file mode 100644
index 00000000..1e8b3149
--- /dev/null
+++ b/collector/fixtures/sys/devices/system/edac/mc/mc0/ue_noinfo_count
@@ -0,0 +1 @@
+6
diff --git a/node_exporter.go b/node_exporter.go
index e737232c..09a80c26 100644
--- a/node_exporter.go
+++ b/node_exporter.go
@@ -32,7 +32,7 @@ import (
 )
 
 const (
-	defaultCollectors = "conntrack,cpu,diskstats,entropy,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,zfs"
+	defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,zfs"
 )
 
 var (