Add collector for Linux EDAC

Collect "Error detection and correction" metrics from memory
controllers.
* Supported on Linux only.
* Add basic fixtures.
* Enabled by default.
pull/324/head
Ben Kochie 2016-08-16 17:10:23 +02:00
parent e9cea11553
commit b4fa10ca9d
9 changed files with 159 additions and 1 deletions

View File

@ -21,6 +21,7 @@ Name | Description | OS
conntrack | Shows conntrack statistics (does nothing if no `/proc/sys/net/netfilter/` present). | Linux
cpu | Exposes CPU statistics | Darwin, Dragonfly, FreeBSD
diskstats | Exposes disk I/O statistics from `/proc/diskstats`. | Linux
edac | Exposes error detection and correction statistics. | Linux
entropy | Exposes available entropy. | Linux
filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux
filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD

151
collector/edac_linux.go Normal file
View File

@ -0,0 +1,151 @@
// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !noedac
package collector
import (
"fmt"
"path"
"path/filepath"
"regexp"
"github.com/prometheus/client_golang/prometheus"
)
// edacSubsystem is the subsystem component handed to
// prometheus.BuildFQName when building every EDAC metric name.
const edacSubsystem = "edac"
// edacMemControllerRE captures the digits that follow "mc" in a memory
// controller path such as ".../devices/system/edac/mc/mc0".
var edacMemControllerRE = regexp.MustCompile(`.*devices/system/edac/mc/mc([0-9]*)`)

// edacMemCsrowRE captures the digits that follow "csrow" in a csrow path
// such as ".../devices/system/edac/mc/mc0/csrow3".
var edacMemCsrowRE = regexp.MustCompile(`.*devices/system/edac/mc/mc[0-9]*/csrow([0-9]*)`)
// edacMCMetric bundles a metric name, its Prometheus value type, its help
// text, a memory controller identifier and a sample value.
//
// NOTE(review): nothing in the visible part of this file references this
// type — confirm whether it is still needed.
type edacMCMetric struct {
	metricName                string
	metricType                prometheus.ValueType
	metricHelp, memController string
	value                     float64
}
// edacCollector holds the metric descriptors used when exporting EDAC
// counters: two controller-level descriptors (labelled by controller) and
// two csrow-level descriptors (labelled by controller and csrow).
type edacCollector struct {
	ceCount, ueCount           *prometheus.Desc
	csRowCECount, csRowUECount *prometheus.Desc
}
// init registers the EDAC collector constructor in the Factories map
// under the key "edac".
func init() {
Factories["edac"] = NewEdacCollector
}
// NewEdacCollector returns a new Collector exposing EDAC stats.
//
// It takes no arguments and only builds the four metric descriptors; the
// returned error is always nil.
func NewEdacCollector() (Collector, error) {
	// describe builds a descriptor in the EDAC subsystem with the given
	// variable labels and no constant labels.
	describe := func(name, help string, labels ...string) *prometheus.Desc {
		fqName := prometheus.BuildFQName(Namespace, edacSubsystem, name)
		return prometheus.NewDesc(fqName, help, labels, nil)
	}

	collector := new(edacCollector)
	collector.ceCount = describe(
		"correctable_errors_total",
		"Total correctable memory errors.",
		"controller",
	)
	collector.ueCount = describe(
		"uncorrectable_errors_total",
		"Total uncorrectable memory errors.",
		"controller",
	)
	collector.csRowCECount = describe(
		"csrow_correctable_errors_total",
		"Total correctable memory errors for this csrow.",
		"controller", "csrow",
	)
	collector.csRowUECount = describe(
		"csrow_uncorrectable_errors_total",
		"Total uncorrectable memory errors for this csrow.",
		"controller", "csrow",
	)
	return collector, nil
}
// Update reads the EDAC error counters from sysfs and sends them on ch as
// counter metrics.
//
// For every memory controller directory matching
// devices/system/edac/mc/mc[0-9]* it emits:
//   - ce_count and ue_count as the controller-wide correctable and
//     uncorrectable totals, labelled with the controller number;
//   - ce_noinfo_count and ue_noinfo_count as csrow-level counters whose
//     csrow label is "unknown";
//   - ce_count and ue_count of every csrow[0-9]* subdirectory, labelled
//     with the controller and csrow numbers.
//
// It returns the first error hit while globbing, matching a path against
// the expected layout, or reading a counter file. Metrics already sent on
// ch before the failure are not retracted.
func (c *edacCollector) Update(ch chan<- prometheus.Metric) error {
	// Both *_noinfo_count samples must carry the same csrow label value so
	// that the correctable and uncorrectable series line up in queries.
	const unknownCsrow = "unknown"

	memControllers, err := filepath.Glob(sysFilePath("devices/system/edac/mc/mc[0-9]*"))
	if err != nil {
		return err
	}
	for _, controller := range memControllers {
		controllerMatch := edacMemControllerRE.FindStringSubmatch(controller)
		if controllerMatch == nil {
			return fmt.Errorf("controller string didn't match regexp: %s", controller)
		}
		controllerNumber := controllerMatch[1]

		value, err := readUintFromFile(path.Join(controller, "ce_count"))
		if err != nil {
			return fmt.Errorf("couldn't get ce_count for controller %s: %s", controllerNumber, err)
		}
		ch <- prometheus.MustNewConstMetric(
			c.ceCount, prometheus.CounterValue, float64(value), controllerNumber)

		value, err = readUintFromFile(path.Join(controller, "ce_noinfo_count"))
		if err != nil {
			return fmt.Errorf("couldn't get ce_noinfo_count for controller %s: %s", controllerNumber, err)
		}
		ch <- prometheus.MustNewConstMetric(
			c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, unknownCsrow)

		value, err = readUintFromFile(path.Join(controller, "ue_count"))
		if err != nil {
			return fmt.Errorf("couldn't get ue_count for controller %s: %s", controllerNumber, err)
		}
		ch <- prometheus.MustNewConstMetric(
			c.ueCount, prometheus.CounterValue, float64(value), controllerNumber)

		value, err = readUintFromFile(path.Join(controller, "ue_noinfo_count"))
		if err != nil {
			return fmt.Errorf("couldn't get ue_noinfo_count for controller %s: %s", controllerNumber, err)
		}
		ch <- prometheus.MustNewConstMetric(
			c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, unknownCsrow)

		// For each controller, walk the csrow directories.
		csrows, err := filepath.Glob(filepath.Join(controller, "csrow[0-9]*"))
		if err != nil {
			return err
		}
		for _, csrow := range csrows {
			csrowMatch := edacMemCsrowRE.FindStringSubmatch(csrow)
			if csrowMatch == nil {
				return fmt.Errorf("csrow string didn't match regexp: %s", csrow)
			}
			csrowNumber := csrowMatch[1]

			value, err = readUintFromFile(path.Join(csrow, "ce_count"))
			if err != nil {
				return fmt.Errorf("couldn't get ce_count for controller/csrow %s/%s: %s", controllerNumber, csrowNumber, err)
			}
			ch <- prometheus.MustNewConstMetric(
				c.csRowCECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)

			value, err = readUintFromFile(path.Join(csrow, "ue_count"))
			if err != nil {
				return fmt.Errorf("couldn't get ue_count for controller/csrow %s/%s: %s", controllerNumber, csrowNumber, err)
			}
			ch <- prometheus.MustNewConstMetric(
				c.csRowUECount, prometheus.CounterValue, float64(value), controllerNumber, csrowNumber)
		}
	}
	// Every failure path returns early above, so reaching here is success.
	return nil
}

View File

@ -0,0 +1 @@
1

View File

@ -0,0 +1 @@
5

View File

@ -32,7 +32,7 @@ import (
)
const (
defaultCollectors = "conntrack,cpu,diskstats,entropy,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,zfs"
defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,zfs"
)
var (