From 6cdaf0e9698e02e7d28a03b134114503a0d56fa1 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Fri, 23 May 2014 13:13:25 +0100 Subject: [PATCH 1/3] Change stats to follow name guidelines. Diskstats: Split out metrics, keep 'device' label Meminfo: Split out metrics, one each. Convert kB to bytes. Netstats: Split out metrics, keep 'device' label. Interrupts: Stays the same. Not perfect, but should be rarely used. Loadavg: Make it clear it's the 1m loadavg Last seen: Not clear this belongs in the node exporter, as it's more a user thing than a machine thing. Changed to absolute time rather than relative. All stats now have appropriate counter/gauge type. --- collector/native_collector.go | 202 ++++++++++++++++++---------------- 1 file changed, 110 insertions(+), 92 deletions(-) diff --git a/collector/native_collector.go b/collector/native_collector.go index bc917f7e..625b6ee8 100644 --- a/collector/native_collector.go +++ b/collector/native_collector.go @@ -9,6 +9,7 @@ import ( "io/ioutil" "os" "os/exec" + "regexp" "strconv" "strings" "time" @@ -24,26 +25,39 @@ const ( procDiskStats = "/proc/diskstats" ) +type diskStat struct { + name string + metric prometheus.Metric + documentation string +} + var ( - diskStatsHeader = []string{ - "reads_completed", "reads_merged", - "sectors_read", "read_time_ms", - "writes_completed", "writes_merged", - "sectors_written", "write_time_ms", - "io_now", "io_time_ms", "io_time_weighted", + // Docs from https://www.kernel.org/doc/Documentation/iostats.txt + diskStatsMetrics = []diskStat{ + {"reads_completed", prometheus.NewCounter(), "# of reads completed"}, + {"reads_merged", prometheus.NewCounter(), "# of reads merged"}, + {"sectors_read", prometheus.NewCounter(), "# of sectors read"}, + {"read_time_ms", prometheus.NewCounter(), "# of milliseconds spent reading"}, + {"writes_completed", prometheus.NewCounter(), "# of writes completed"}, + {"writes_merged", prometheus.NewCounter(), "# of writes merges"}, + {"sectors_written", prometheus.NewCounter(), "# of sectors written"}, + {"write_time_ms", prometheus.NewCounter(), "# of milliseconds spent writing"}, + {"io_now", prometheus.NewGauge(), "# of I/Os currently in progress"}, + {"io_time_ms", prometheus.NewCounter(), "# of milliseconds spent doing I/Os"}, + {"io_time_weighted", prometheus.NewCounter(), "weighted # of milliseconds spent doing I/Os"}, } + lastSeen = prometheus.NewGauge() + load1 = prometheus.NewGauge() + attributes = prometheus.NewGauge() + memInfoMetrics = map[string]prometheus.Gauge{} + netStatsMetrics = map[string]prometheus.Gauge{} + interruptsMetric = prometheus.NewCounter() ) type nativeCollector struct { - loadAvg prometheus.Gauge - attributes prometheus.Gauge - lastSeen prometheus.Gauge - memInfo prometheus.Gauge - interrupts prometheus.Counter - netStats prometheus.Counter - diskStats prometheus.Counter - name string - config Config + registry prometheus.Registry + name string + config Config } func init() { @@ -54,89 +68,71 @@ func init() { // load, seconds since last login and a list of tags as specified by config. func NewNativeCollector(config Config, registry prometheus.Registry) (Collector, error) { c := nativeCollector{ - name: "native_collector", - config: config, - loadAvg: prometheus.NewGauge(), - attributes: prometheus.NewGauge(), - lastSeen: prometheus.NewGauge(), - memInfo: prometheus.NewGauge(), - interrupts: prometheus.NewCounter(), - netStats: prometheus.NewCounter(), - diskStats: prometheus.NewCounter(), + name: "native_collector", + config: config, + registry: registry, } registry.Register( - "node_load", - "node_exporter: system load.", + "node_load1", + "1m load average", prometheus.NilLabels, - c.loadAvg, + load1, ) registry.Register( - "node_last_login_seconds", - "node_exporter: seconds since last login.", + "node_last_login_time", + "The time of the last login", prometheus.NilLabels, - c.lastSeen, + lastSeen, ) registry.Register( "node_attributes", - "node_exporter: system attributes.", + "node_exporter attributes", prometheus.NilLabels, - c.attributes, - ) - - registry.Register( - "node_mem", - "node_exporter: memory details.", - prometheus.NilLabels, - c.memInfo, + attributes, ) registry.Register( "node_interrupts", - "node_exporter: interrupt details.", + "Interrupt details from /proc/interrupts", prometheus.NilLabels, - c.interrupts, + interruptsMetric, ) - registry.Register( - "node_net", - "node_exporter: network stats.", - prometheus.NilLabels, - c.netStats, - ) - - registry.Register( - "node_disk", - "node_exporter: disk stats.", - prometheus.NilLabels, - c.diskStats, - ) + for _, v := range diskStatsMetrics { + registry.Register( + "node_disk_"+v.name, + v.documentation, + prometheus.NilLabels, + v.metric, + ) + } return &c, nil } func (c *nativeCollector) Name() string { return c.name } func (c *nativeCollector) Update() (updates int, err error) { - last, err := getSecondsSinceLastLogin() + last, err := getLastLoginTime() if err != nil { return updates, fmt.Errorf("Couldn't get last seen: %s", err) } updates++ - debug(c.Name(), "Set node_last_login_seconds: %f", last) - c.lastSeen.Set(nil, last) + debug(c.Name(), "Set node_last_login_time: %f", last) + lastSeen.Set(nil, last) - load, err := getLoad() + load, err := getLoad1() if err != nil { return updates, fmt.Errorf("Couldn't get load: %s", err) } updates++ debug(c.Name(), "Set node_load: %f", load) - c.loadAvg.Set(nil, load) + load1.Set(nil, load) debug(c.Name(), "Set node_attributes{%v}: 1", c.config.Attributes) - c.attributes.Set(c.config.Attributes, 1) + attributes.Set(c.config.Attributes, 1) memInfo, err := getMemInfo() if err != nil { @@ -144,12 +140,17 @@ func (c *nativeCollector) Update() (updates int, err error) { } debug(c.Name(), "Set node_mem: %#v", memInfo) for k, v := range memInfo { - updates++ - fv, err := strconv.ParseFloat(v, 64) - if err != nil { - return updates, fmt.Errorf("Invalid value in meminfo: %s", err) + if _, ok := memInfoMetrics[k]; !ok { + memInfoMetrics[k] = prometheus.NewGauge() + c.registry.Register( + "node_memory_"+k, + k+" from /proc/meminfo", + prometheus.NilLabels, + memInfoMetrics[k], + ) } - c.memInfo.Set(map[string]string{"type": k}, fv) + updates++ + memInfoMetrics[k].Set(nil, v) } interrupts, err := getInterrupts() @@ -169,7 +170,7 @@ func (c *nativeCollector) Update() (updates int, err error) { "info": interrupt.info, "devices": interrupt.devices, } - c.interrupts.Set(labels, fv) + interruptsMetric.Set(labels, fv) } } @@ -180,17 +181,22 @@ func (c *nativeCollector) Update() (updates int, err error) { for direction, devStats := range netStats { for dev, stats := range devStats { for t, value := range stats { + key := direction + "_" + t + if _, ok := netStatsMetrics[key]; !ok { + netStatsMetrics[key] = prometheus.NewGauge() + c.registry.Register( + "node_network_"+key, + t+" "+direction+" from /proc/net/dev", + prometheus.NilLabels, + netStatsMetrics[key], + ) + } updates++ v, err := strconv.ParseFloat(value, 64) if err != nil { - return updates, fmt.Errorf("Invalid value %s in interrupts: %s", value, err) + return updates, fmt.Errorf("Invalid value %s in netstats: %s", value, err) } - labels := map[string]string{ - "device": dev, - "direction": direction, - "type": t, - } - c.netStats.Set(labels, v) + netStatsMetrics[key].Set(map[string]string{"device": dev}, v) } } } @@ -206,14 +212,20 @@ func (c *nativeCollector) Update() (updates int, err error) { if err != nil { return updates, fmt.Errorf("Invalid value %s in diskstats: %s", value, err) } - labels := map[string]string{"device": dev, "type": k} - c.diskStats.Set(labels, v) + labels := map[string]string{"device": dev} + counter, ok := diskStatsMetrics[k].metric.(prometheus.Counter) + if ok { + counter.Set(labels, v) + } else { + var gauge = diskStatsMetrics[k].metric.(prometheus.Gauge) + gauge.Set(labels, v) + } } } return updates, err } -func getLoad() (float64, error) { +func getLoad1() (float64, error) { data, err := ioutil.ReadFile(procLoad) if err != nil { return 0, err @@ -230,7 +242,7 @@ func parseLoad(data string) (float64, error) { return load, nil } -func getSecondsSinceLastLogin() (float64, error) { +func getLastLoginTime() (float64, error) { who := exec.Command("who", "/var/log/wtmp", "-l", "-u", "-s") output, err := who.StdoutPipe() @@ -277,10 +289,10 @@ func getSecondsSinceLastLogin() (float64, error) { return 0, err } - return float64(time.Now().Sub(last).Seconds()), nil + return float64(last.Unix()), nil } -func getMemInfo() (map[string]string, error) { +func getMemInfo() (map[string]float64, error) { file, err := os.Open(procMemInfo) if err != nil { return nil, err @@ -288,23 +300,29 @@ func getMemInfo() (map[string]string, error) { return parseMemInfo(file) } -func parseMemInfo(r io.ReadCloser) (map[string]string, error) { +func parseMemInfo(r io.ReadCloser) (map[string]float64, error) { defer r.Close() - memInfo := map[string]string{} + memInfo := map[string]float64{} scanner := bufio.NewScanner(r) + re := regexp.MustCompile("\\((.*)\\)") for scanner.Scan() { line := scanner.Text() parts := strings.Fields(string(line)) - key := "" + fv, err := strconv.ParseFloat(parts[1], 64) + if err != nil { + return nil, fmt.Errorf("Invalid value in meminfo: %s", err) + } switch len(parts) { case 2: // no unit - key = parts[0][:len(parts[0])-1] // remove trailing : from key - case 3: // has unit - key = fmt.Sprintf("%s_%s", parts[0][:len(parts[0])-1], parts[2]) + case 3: // has unit, we presume kB + fv *= 1024 default: return nil, fmt.Errorf("Invalid line in %s: %s", procMemInfo, line) } - memInfo[key] = parts[1] + key := parts[0][:len(parts[0])-1] // remove trailing : from key + // Active(anon) -> Active_anon + key = re.ReplaceAllString(key, "_${1}") + memInfo[key] = fv } return memInfo, nil @@ -409,7 +427,7 @@ func parseNetDevLine(parts []string, header []string) (map[string]string, error) return devStats, nil } -func getDiskStats() (map[string]map[string]string, error) { +func getDiskStats() (map[string]map[int]string, error) { file, err := os.Open(procDiskStats) if err != nil { return nil, err @@ -417,19 +435,19 @@ func getDiskStats() (map[string]map[string]string, error) { return parseDiskStats(file) } -func parseDiskStats(r io.ReadCloser) (map[string]map[string]string, error) { +func parseDiskStats(r io.ReadCloser) (map[string]map[int]string, error) { defer r.Close() - diskStats := map[string]map[string]string{} + diskStats := map[string]map[int]string{} scanner := bufio.NewScanner(r) for scanner.Scan() { parts := strings.Fields(string(scanner.Text())) - if len(parts) != len(diskStatsHeader)+3 { // we strip major, minor and dev + if len(parts) != len(diskStatsMetrics)+3 { // we strip major, minor and dev return nil, fmt.Errorf("Invalid line in %s: %s", procDiskStats, scanner.Text()) } dev := parts[2] - diskStats[dev] = map[string]string{} + diskStats[dev] = map[int]string{} for i, v := range parts[3:] { - diskStats[dev][diskStatsHeader[i]] = v + diskStats[dev][i] = v } } return diskStats, nil From ffc811b3370bd1d2b6e754bdad5542095c6d5e6c Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Fri, 23 May 2014 15:38:44 +0100 Subject: [PATCH 2/3] Expand docs per code review. --- collector/native_collector.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/collector/native_collector.go b/collector/native_collector.go index 625b6ee8..80dd8851 100644 --- a/collector/native_collector.go +++ b/collector/native_collector.go @@ -34,17 +34,17 @@ type diskStat struct { var ( // Docs from https://www.kernel.org/doc/Documentation/iostats.txt diskStatsMetrics = []diskStat{ - {"reads_completed", prometheus.NewCounter(), "# of reads completed"}, - {"reads_merged", prometheus.NewCounter(), "# of reads merged"}, - {"sectors_read", prometheus.NewCounter(), "# of sectors read"}, - {"read_time_ms", prometheus.NewCounter(), "# of milliseconds spent reading"}, - {"writes_completed", prometheus.NewCounter(), "# of writes completed"}, - {"writes_merged", prometheus.NewCounter(), "# of writes merges"}, - {"sectors_written", prometheus.NewCounter(), "# of sectors written"}, - {"write_time_ms", prometheus.NewCounter(), "# of milliseconds spent writing"}, - {"io_now", prometheus.NewGauge(), "# of I/Os currently in progress"}, - {"io_time_ms", prometheus.NewCounter(), "# of milliseconds spent doing I/Os"}, - {"io_time_weighted", prometheus.NewCounter(), "weighted # of milliseconds spent doing I/Os"}, + {"reads_completed", prometheus.NewCounter(), "The total number of reads completed successfully."}, + {"reads_merged", prometheus.NewCounter(), "The number of reads merged. Reads and writes which are adjacent to each other may be merged for efficiency. Thus two 4K reads may become one 8K read before it is ultimately handed to the disk, and so it will be counted (and queued) as only one I/O. This metric lets you know how often this was done."}, + {"sectors_read", prometheus.NewCounter(), "The total number of sectors read successfully."}, + {"read_time_ms", prometheus.NewCounter(), "the total number of milliseconds spent by all reads (as measured from __make_request() to end_that_request_last())."}, + {"writes_completed", prometheus.NewCounter(), "The total number of writes completed successfully."}, + {"writes_merged", prometheus.NewCounter(), "The number of writes merged. Reads and writes which are adjacent to each other may be merged for efficiency. Thus two 4K reads may become one 8K read before it is ultimately handed to the disk, and so it will be counted (and queued) as only one I/O. This metric lets you know how often this was done."}, + {"sectors_written", prometheus.NewCounter(), "The total number of sectors written successfully."}, + {"write_time_ms", prometheus.NewCounter(), "This is the total number of milliseconds spent by all writes (as measured from __make_request() to end_that_request_last())."}, + {"io_now", prometheus.NewGauge(), "The number of I/Os currently in progress. Incremented as requests are given to appropriate struct request_queue and decremented as they finish."}, + {"io_time_ms", prometheus.NewCounter(), "Milliseconds spent doing I/Os. This metric increases so long as node_disk_io_now is nonzero."}, + {"io_time_weighted", prometheus.NewCounter(), "The weighted # of milliseconds spent doing I/Os. This metric is incremented at each I/O start, I/O completion, I/O merge, or read of these stats by the number of I/Os in progress (node_disk_io_now) times the number of milliseconds spent doing I/O since the last update of this field. This can provide an easy measure of both I/O completion time and the backlog that may be accumulating."}, } lastSeen = prometheus.NewGauge() load1 = prometheus.NewGauge() @@ -82,7 +82,7 @@ func NewNativeCollector(config Config, registry prometheus.Registry) (Collector, registry.Register( "node_last_login_time", - "The time of the last login", + "The time of the last login.", prometheus.NilLabels, lastSeen, ) From 964cdbfcc9f19561dcf8602aac3bf46ecf3b1e01 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Fri, 23 May 2014 15:53:31 +0100 Subject: [PATCH 3/3] Trim down a bit per feedback. --- collector/native_collector.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/collector/native_collector.go b/collector/native_collector.go index 80dd8851..9faa23a3 100644 --- a/collector/native_collector.go +++ b/collector/native_collector.go @@ -35,16 +35,16 @@ var ( // Docs from https://www.kernel.org/doc/Documentation/iostats.txt diskStatsMetrics = []diskStat{ {"reads_completed", prometheus.NewCounter(), "The total number of reads completed successfully."}, - {"reads_merged", prometheus.NewCounter(), "The number of reads merged. Reads and writes which are adjacent to each other may be merged for efficiency. Thus two 4K reads may become one 8K read before it is ultimately handed to the disk, and so it will be counted (and queued) as only one I/O. This metric lets you know how often this was done."}, + {"reads_merged", prometheus.NewCounter(), "The number of reads merged. See https://www.kernel.org/doc/Documentation/iostats.txt"}, {"sectors_read", prometheus.NewCounter(), "The total number of sectors read successfully."}, - {"read_time_ms", prometheus.NewCounter(), "the total number of milliseconds spent by all reads (as measured from __make_request() to end_that_request_last())."}, + {"read_time_ms", prometheus.NewCounter(), "the total number of milliseconds spent by all reads."}, {"writes_completed", prometheus.NewCounter(), "The total number of writes completed successfully."}, - {"writes_merged", prometheus.NewCounter(), "The number of writes merged. Reads and writes which are adjacent to each other may be merged for efficiency. Thus two 4K reads may become one 8K read before it is ultimately handed to the disk, and so it will be counted (and queued) as only one I/O. This metric lets you know how often this was done."}, + {"writes_merged", prometheus.NewCounter(), "The number of writes merged. See https://www.kernel.org/doc/Documentation/iostats.txt"}, {"sectors_written", prometheus.NewCounter(), "The total number of sectors written successfully."}, - {"write_time_ms", prometheus.NewCounter(), "This is the total number of milliseconds spent by all writes (as measured from __make_request() to end_that_request_last())."}, - {"io_now", prometheus.NewGauge(), "The number of I/Os currently in progress. Incremented as requests are given to appropriate struct request_queue and decremented as they finish."}, - {"io_time_ms", prometheus.NewCounter(), "Milliseconds spent doing I/Os. This metric increases so long as node_disk_io_now is nonzero."}, - {"io_time_weighted", prometheus.NewCounter(), "The weighted # of milliseconds spent doing I/Os. This metric is incremented at each I/O start, I/O completion, I/O merge, or read of these stats by the number of I/Os in progress (node_disk_io_now) times the number of milliseconds spent doing I/O since the last update of this field. This can provide an easy measure of both I/O completion time and the backlog that may be accumulating."}, + {"write_time_ms", prometheus.NewCounter(), "This is the total number of milliseconds spent by all writes."}, + {"io_now", prometheus.NewGauge(), "The number of I/Os currently in progress."}, + {"io_time_ms", prometheus.NewCounter(), "Milliseconds spent doing I/Os."}, + {"io_time_weighted", prometheus.NewCounter(), "The weighted # of milliseconds spent doing I/Os. See https://www.kernel.org/doc/Documentation/iostats.txt"}, } lastSeen = prometheus.NewGauge() load1 = prometheus.NewGauge()