From 9f2aa24e12bb95cba49c903d82740d6299b5ce35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonas=20Gro=C3=9Fe=20Sundrup?= <cherti@letopolis.de>
Date: Mon, 7 Sep 2015 15:49:30 +0200
Subject: [PATCH] Add collector for metrics of linux software raids

---
 README.md                 |   1 +
 collector/fixtures/mdstat |  26 ++++
 collector/mdadm.go        | 279 ++++++++++++++++++++++++++++++++++++++
 collector/mdadm_test.go   |  33 +++++
 node_exporter.go          |   2 +-
 5 files changed, 340 insertions(+), 1 deletion(-)
 create mode 100644 collector/fixtures/mdstat
 create mode 100644 collector/mdadm.go
 create mode 100644 collector/mdadm_test.go

diff --git a/README.md b/README.md
index f2dd5fe6..378d00ee 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ netstat | Exposes network statistics from `/proc/net/netstat`. This is the same
 stat | Exposes various statistics from `/proc/stat`. This includes CPU usage, boot time, forks and interrupts.
 textfile | Exposes statistics read from local disk. The `--collector.textfile.directory` flag must be set.
 time | Exposes the current system time.
+mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if `/proc/mdstat` is not present).
 
 
 ### Disabled by default
diff --git a/collector/fixtures/mdstat b/collector/fixtures/mdstat
new file mode 100644
index 00000000..da5c691c
--- /dev/null
+++ b/collector/fixtures/mdstat
@@ -0,0 +1,26 @@
+Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10]
+md3 : active raid6 sda1[8] sdh1[7] sdg1[6] sdf1[5] sde1[11] sdd1[3] sdc1[10] sdb1[9]
+      5853468288 blocks super 1.2 level 6, 64k chunk, algorithm 2 [8/8] [UUUUUUUU]
+      
+md127 : active raid1 sdi2[0] sdj2[1]
+      312319552 blocks [2/2] [UU]
+      
+md0 : active raid1 sdi1[0] sdj1[1]
+      248896 blocks [2/2] [UU]
+      
+md4 : inactive raid1 sda3[0] sdb3[1]
+      4883648 blocks [2/2] [UU]
+
+md6 : active raid1 sdb2[2] sda2[0]
+      195310144 blocks [2/1] [U_]
+      [=>...................]  recovery =  8.5% (16775552/195310144) finish=17.0min speed=259783K/sec
+
+md8 : active raid1 sdb1[1] sda1[0]
+      195310144 blocks [2/2] [UU]
+      [=>...................]  resync =  8.5% (16775552/195310144) finish=17.0min speed=259783K/sec
+
+md7 : active raid6 sdb1[0] sde1[3] sdd1[2] sdc1[1]
+      7813735424 blocks super 1.2 level 6, 512k chunk, algorithm 2 [4/3] [U_UU]
+      bitmap: 0/30 pages [0KB], 65536KB chunk
+
+unused devices: <none>
diff --git a/collector/mdadm.go b/collector/mdadm.go
new file mode 100644
index 00000000..78f33d56
--- /dev/null
+++ b/collector/mdadm.go
@@ -0,0 +1,279 @@
+// +build !nomdadm
+
+package collector
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/log"
+)
+
+var (
+	statusfile   = "/proc/mdstat"
+	statuslineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[[U_]+\]`)
+	buildlineRE  = regexp.MustCompile(`\((\d+)/\d+\)`)
+)
+
+type mdStatus struct {
+	mdName       string // name of the md-device (first field of its deviceline)
+	isActive     bool   // true if the deviceline reports the state "active"
+	disksActive  int64  // number of active disks of the device
+	disksTotal   int64  // total number of disks of the device
+	blocksTotal  int64  // total number of blocks on the device
+	blocksSynced int64  // number of blocks synced on the device
+}
+
+type mdadmCollector struct{}
+
+func init() {
+	Factories["mdadm"] = NewMdadmCollector
+}
+
+func evalStatusline(statusline string) (active, total, size int64, err error) {
+	matches := statuslineRE.FindStringSubmatch(statusline)
+
+	// +1 to make it more obvious that the whole string containing the info is also returned as matches[0].
+	if len(matches) < 3+1 {
+		return 0, 0, 0, fmt.Errorf("too few matches found in statusline: %s", statusline)
+	} else {
+		if len(matches) > 3+1 {
+			return 0, 0, 0, fmt.Errorf("too many matches found in statusline: %s", statusline)
+		}
+	}
+
+	size, err = strconv.ParseInt(matches[1], 10, 64)
+	if err != nil {
+		return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline)
+	}
+
+	total, err = strconv.ParseInt(matches[2], 10, 64)
+	if err != nil {
+		return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline)
+	}
+	active, err = strconv.ParseInt(matches[3], 10, 64)
+	if err != nil {
+		return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline)
+	}
+
+	return active, total, size, nil
+}
+
+// Gets the size that has already been synced out of the sync-line.
+func evalBuildline(buildline string) (int64, error) {
+	matches := buildlineRE.FindStringSubmatch(buildline)
+
+	// +1 to make it more obvious that the whole string containing the info is also returned as matches[0].
+	if len(matches) < 1+1 {
+		return 0, fmt.Errorf("too few matches found in buildline: %s", buildline)
+	}
+
+	if len(matches) > 1+1 {
+		return 0, fmt.Errorf("too many matches found in buildline: %s", buildline)
+	}
+
+	syncedSize, err := strconv.ParseInt(matches[1], 10, 64)
+
+	if err != nil {
+		return 0, fmt.Errorf("%s in buildline: %s", err, buildline)
+	}
+
+	return syncedSize, nil
+}
+
+// Parses an mdstat-file and returns a struct with the relevant infos.
+func parseMdstat(mdStatusFilePath string) ([]mdStatus, error) {
+	content, err := ioutil.ReadFile(mdStatusFilePath)
+	if err != nil {
+		return []mdStatus{}, fmt.Errorf("error parsing %s: %s", statusfile, err)
+	}
+
+	mdStatusFile := string(content)
+
+	lines := strings.Split(mdStatusFile, "\n")
+	var currentMD string
+
+	// Each md has at least the deviceline, statusline and one empty line afterwards
+	// so we will have probably something of the order len(lines)/3 devices
+	// so we use that for preallocation.
+	estimateMDs := len(lines) / 3
+	mdStates := make([]mdStatus, 0, estimateMDs)
+
+	for i, l := range lines {
+		if l == "" {
+			// Skip entirely empty lines.
+			continue
+		}
+
+		if l[0] == ' ' {
+			// Those lines are not the beginning of a md-section.
+			continue
+		}
+
+		if strings.HasPrefix(l, "Personalities") || strings.HasPrefix(l, "unused") {
+			// We aren't interested in lines with general info.
+			continue
+		}
+
+		mainLine := strings.Split(l, " ")
+		if len(mainLine) < 3 {
+			return mdStates, fmt.Errorf("error parsing mdline: %s", l)
+		}
+		currentMD = mainLine[0]               // name of md-device
+		isActive := (mainLine[2] == "active") // activity status of said md-device
+
+		if len(lines) <= i+3 {
+			return mdStates, fmt.Errorf("error parsing %s: entry for %s has fewer lines than expected", statusfile, currentMD)
+		}
+
+		active, total, size, err := evalStatusline(lines[i+1]) // parse statusline, always present
+
+		if err != nil {
+			return mdStates, fmt.Errorf("error parsing %s: %s", statusfile, err)
+		}
+
+		// Now get the number of synced blocks.
+		var syncedBlocks int64
+
+		// Get the line number of the syncing-line.
+		var j int
+		if strings.Contains(lines[i+2], "bitmap") { // then skip the bitmap line
+			j = i + 3
+		} else {
+			j = i + 2
+		}
+
+		// If device is syncing at the moment, get the number of currently synced bytes,
+		// otherwise that number equals the size of the device.
+		if strings.Contains(lines[j], "recovery") || strings.Contains(lines[j], "resync") {
+			syncedBlocks, err = evalBuildline(lines[j])
+			if err != nil {
+				return mdStates, fmt.Errorf("error parsing %s: %s", statusfile, err)
+			}
+		} else {
+			syncedBlocks = size
+		}
+
+		mdStates = append(mdStates, mdStatus{currentMD, isActive, active, total, size, syncedBlocks})
+
+	}
+
+	return mdStates, nil
+}
+
+// NewMdadmCollector returns a pointer to an empty mdadmCollector, as only throwaway-metrics are used.
+func NewMdadmCollector() (Collector, error) {
+	return &mdadmCollector{}, nil
+}
+
+var (
+	isActiveDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "is_active"),
+		"Indicator whether the md-device is active or not.",
+		[]string{"device"},
+		nil,
+	)
+
+	disksActiveDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "disks_active"),
+		"Number of active disks of device.",
+		[]string{"device"},
+		nil,
+	)
+
+	disksTotalDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "disks"),
+		"Total number of disks of device.",
+		[]string{"device"},
+		nil,
+	)
+
+	blocksTotalDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "blocks"),
+		"Total number of blocks on device.",
+		[]string{"device"},
+		nil,
+	)
+
+	blocksSyncedDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(Namespace, "md", "blocks_synced"),
+		"Number of blocks synced on device.",
+		[]string{"device"},
+		nil,
+	)
+)
+
+func (c *mdadmCollector) Update(ch chan<- prometheus.Metric) (err error) {
+	// take care we don't crash on non-existent statusfiles
+	_, err = os.Stat(statusfile)
+	if os.IsNotExist(err) {
+		// no such file or directory, nothing to do, just return
+		return nil
+	}
+
+	if err != nil { // now things get weird, better to return
+		return err
+	}
+
+	// First parse mdstat-file...
+	mdstate, err := parseMdstat(statusfile)
+	if err != nil {
+		return fmt.Errorf("error parsing %s: %s", statusfile, err)
+	}
+
+	// ... and then plug the result into the metrics to be exported.
+	var isActiveFloat float64
+	for _, mds := range mdstate {
+
+		log.Debugf("collecting metrics for device %s", mds.mdName)
+
+		if mds.isActive {
+			isActiveFloat = 1
+		} else {
+			isActiveFloat = 0
+		}
+
+		ch <- prometheus.MustNewConstMetric(
+			isActiveDesc,
+			prometheus.GaugeValue,
+			isActiveFloat,
+			mds.mdName,
+		)
+
+		ch <- prometheus.MustNewConstMetric(
+			disksActiveDesc,
+			prometheus.GaugeValue,
+			float64(mds.disksActive),
+			mds.mdName,
+		)
+
+		ch <- prometheus.MustNewConstMetric(
+			disksTotalDesc,
+			prometheus.GaugeValue,
+			float64(mds.disksTotal),
+			mds.mdName,
+		)
+
+		ch <- prometheus.MustNewConstMetric(
+			blocksTotalDesc,
+			prometheus.GaugeValue,
+			float64(mds.blocksTotal),
+			mds.mdName,
+		)
+
+		ch <- prometheus.MustNewConstMetric(
+			blocksSyncedDesc,
+			prometheus.GaugeValue,
+			float64(mds.blocksSynced),
+			mds.mdName,
+		)
+
+	}
+
+	return nil
+}
diff --git a/collector/mdadm_test.go b/collector/mdadm_test.go
new file mode 100644
index 00000000..1c83733e
--- /dev/null
+++ b/collector/mdadm_test.go
@@ -0,0 +1,33 @@
+package collector
+
+import (
+	"testing"
+)
+
+// TestMdadm checks that parseMdstat yields the reference values for the mdstat fixture.
+func TestMdadm(t *testing.T) {
+	mdStates, err := parseMdstat("fixtures/mdstat")
+	if err != nil {
+		t.Fatalf("parsing of reference-file failed entirely: %s", err)
+	}
+
+	refs := map[string]mdStatus{
+		"md3":   mdStatus{"md3", true, 8, 8, 5853468288, 5853468288},
+		"md127": mdStatus{"md127", true, 2, 2, 312319552, 312319552},
+		"md0":   mdStatus{"md0", true, 2, 2, 248896, 248896},
+		"md4":   mdStatus{"md4", false, 2, 2, 4883648, 4883648},
+		"md6":   mdStatus{"md6", true, 1, 2, 195310144, 16775552},
+		"md8":   mdStatus{"md8", true, 2, 2, 195310144, 16775552},
+		"md7":   mdStatus{"md7", true, 3, 4, 7813735424, 7813735424},
+	}
+
+	for _, md := range mdStates {
+		if md != refs[md.mdName] {
+			t.Errorf("failed parsing md-device %s correctly: want %v, got %v", md.mdName, refs[md.mdName], md)
+		}
+	}
+
+	if len(mdStates) != len(refs) {
+		t.Errorf("expected number of parsed md-device to be %d, but was %d", len(refs), len(mdStates))
+	}
+}
diff --git a/node_exporter.go b/node_exporter.go
index 872e5021..5e3d834b 100644
--- a/node_exporter.go
+++ b/node_exporter.go
@@ -28,7 +28,7 @@ var (
 	memProfile        = flag.String("debug.memprofile-file", "", "Write memory profile to this file upon receipt of SIGUSR1.")
 	listenAddress     = flag.String("web.listen-address", ":9100", "Address on which to expose metrics and web interface.")
 	metricsPath       = flag.String("web.telemetry-path", "/metrics", "Path under which to expose metrics.")
-	enabledCollectors = flag.String("collectors.enabled", "diskstats,filefd,filesystem,loadavg,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname", "Comma-separated list of collectors to use.")
+	enabledCollectors = flag.String("collectors.enabled", "diskstats,filefd,filesystem,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname", "Comma-separated list of collectors to use.")
 	printCollectors   = flag.Bool("collectors.print", false, "If true, print available collectors and exit.")
 	authUser          = flag.String("auth.user", "", "Username for basic auth.")
 	authPass          = flag.String("auth.pass", "", "Password for basic auth.")