diff --git a/AUTHORS.md b/AUTHORS.md index 0fafa505..dbb55edc 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -32,6 +32,7 @@ The following individuals have contributed code to this repository * Ken Herner * Matt Layher * Matthias Rampke +* Robert Clark * Siavash Safi * Stephen Shirley * Steve Durrheimer diff --git a/README.md b/README.md index d70928fc..0be58655 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ entropy | Exposes available entropy. | Linux filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD hwmon | Expose hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux +infiniband | Exposes network statistics specific to InfiniBand configurations. | Linux loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux meminfo | Exposes memory statistics. 
| Darwin, Dragonfly, FreeBSD, Linux diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 44acd871..e0f2fbed 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -689,6 +689,38 @@ node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp2"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84 +# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down +# TYPE node_infiniband_link_downed_total counter +node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 +node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state +# TYPE node_infiniband_link_error_recovery_total counter +node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 +node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) +# TYPE node_infiniband_multicast_packets_received_total counter +node_infiniband_multicast_packets_received_total{device="mlx4_0",port="1"} 93 +node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_multicast_packets_transmitted_total Number of multicast packets transmitted (including errors) +# TYPE node_infiniband_multicast_packets_transmitted_total counter +node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 +node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_port_data_received_bytes Number of data octets received on all links +# TYPE node_infiniband_port_data_received_bytes counter 
+node_infiniband_port_data_received_bytes{device="mlx4_0",port="1"} 4.631917e+06 +node_infiniband_port_data_received_bytes{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_port_data_transmitted_bytes Number of data octets transmitted on all links +# TYPE node_infiniband_port_data_transmitted_bytes counter +node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="1"} 3.73344e+06 +node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) +# TYPE node_infiniband_unicast_packets_received_total counter +node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148 +node_infiniband_unicast_packets_received_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_unicast_packets_transmitted_total Number of unicast packets transmitted (including errors) +# TYPE node_infiniband_unicast_packets_transmitted_total counter +node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="1"} 61239 +node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 # HELP node_intr Total number of interrupts serviced. 
# TYPE node_intr counter node_intr 8.885917e+06 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets new file mode 100644 index 00000000..c67f579c --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets @@ -0,0 +1 @@ +93 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets new file mode 100644 index 00000000..b6a7d89c --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets @@ -0,0 +1 @@ +16 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data new file mode 100644 index 00000000..496ea27d --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data @@ -0,0 +1 @@ +4631917 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data new file mode 100644 index 00000000..85ea8ebf 
--- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data @@ -0,0 +1 @@ +3733440 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets new file mode 100644 index 00000000..2406651b --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets @@ -0,0 +1 @@ +61148 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets new file mode 100644 index 00000000..6279bd6a --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets @@ -0,0 +1 @@ +61239 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets 
b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets @@ -0,0 +1 @@ +0 diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets @@ -0,0 +1 @@ +0 diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go new file mode 100644 index 00000000..34ce4dab --- /dev/null +++ b/collector/infiniband_linux.go @@ -0,0 +1,177 @@ +// Copyright 2017 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux +// +build !noinfiniband + +package collector + +import ( + "errors" + "path/filepath" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" +) + +const infinibandPath = "class/infiniband" + +var ( + errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected") + errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected") +) + +type infinibandCollector struct { + metricDescs map[string]*prometheus.Desc + counters map[string]infinibandMetric +} + +type infinibandMetric struct { + File string + Help string +} + +func init() { + Factories["infiniband"] = NewInfiniBandCollector +} + +func NewInfiniBandCollector() (Collector, error) { + var i infinibandCollector + + // Maps each exported metric name to the counter filename it is read from and its help text.
+ i.counters = map[string]infinibandMetric{ + "link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"}, + "link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"}, + "multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"}, + "multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"}, + "port_data_received_bytes": {"port_rcv_data", "Number of data octets received on all links"}, + "port_data_transmitted_bytes": {"port_xmit_data", "Number of data octets transmitted on all links"}, + "unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"}, + "unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"}, + } + + subsystem := "infiniband" + i.metricDescs = make(map[string]*prometheus.Desc) + + for metricName, infinibandMetric := range i.counters { + i.metricDescs[metricName] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, metricName), + infinibandMetric.Help, + []string{"device", "port"}, + nil, + ) + } + + return &i, nil +} + +// infinibandDevices retrieves a list of InfiniBand devices. +func infinibandDevices(infinibandPath string) ([]string, error) { + devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*")) + if err != nil { + return nil, err + } + + if len(devices) < 1 { + log.Debugf("Unable to detect InfiniBand devices") + err = errInfinibandNoDevicesFound + return nil, err + } + + // Extract just the filenames which equate to the device names. + for i, device := range devices { + devices[i] = filepath.Base(device) + } + + return devices, nil +} + +// infinibandPorts retrieves a list of ports for the given InfiniBand device.
+func infinibandPorts(infinibandPath, device string) ([]string, error) { + ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*")) + if err != nil { + return nil, err + } + + if len(ports) < 1 { + log.Debugf("Unable to detect ports for %s", device) + err = errInfinibandNoPortsFound + return nil, err + } + + // Extract just the filenames, which equate to the port numbers. + for i, port := range ports { + ports[i] = filepath.Base(port) + } + + return ports, nil +} + +func readMetric(directory, metricFile string) (uint64, error) { + metric, err := readUintFromFile(filepath.Join(directory, metricFile)) + if err != nil { + log.Debugf("Error reading %q file", metricFile) + return 0, err + } + + return metric, nil +} + +func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) (err error) { + devices, err := infinibandDevices(sysFilePath(infinibandPath)) + + // If no devices are found, InfiniBand is likely not installed and the collector is skipped + // without error; any other error raised while finding devices is returned to the caller. + switch err { + case nil: + case errInfinibandNoDevicesFound: + return nil + default: + return err + } + + for _, device := range devices { + ports, err := infinibandPorts(sysFilePath(infinibandPath), device) + + // If no ports are found for the specified device, skip to the next device. + switch err { + case nil: + case errInfinibandNoPortsFound: + continue + default: + return err + } + + for _, port := range ports { + portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port)) + + // Add metrics for the InfiniBand counters. NOTE(review): port_rcv_data/port_xmit_data may be reported in 4-octet units rather than bytes; confirm against the kernel sysfs ABI before relying on the *_bytes names.
+ for metricName, infinibandMetric := range c.counters { + metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File) + if err != nil { + return err + } + + ch <- prometheus.MustNewConstMetric( + c.metricDescs[metricName], + prometheus.CounterValue, + float64(metric), + device, + port, + ) + } + } + } + + return nil +} diff --git a/collector/infiniband_linux_test.go b/collector/infiniband_linux_test.go new file mode 100644 index 00000000..991102d6 --- /dev/null +++ b/collector/infiniband_linux_test.go @@ -0,0 +1,40 @@ +// Copyright 2017 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package collector + +import ( + "testing" +) + +func TestInfiniBandDevices(t *testing.T) { + devices, err := infinibandDevices("fixtures/sys/class/infiniband") + if err != nil { + t.Fatal(err) + } + + if l := len(devices); l != 1 { + t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l) + } +} + +func TestInfiniBandPorts(t *testing.T) { + ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0") + if err != nil { + t.Fatal(err) + } + + if l := len(ports); l != 2 { + t.Fatalf("Retrieved an unexpected number of InfiniBand ports: %d", l) + } +} diff --git a/end-to-end-test.sh b/end-to-end-test.sh index 39e55413..26a191ba 100755 --- a/end-to-end-test.sh +++ b/end-to-end-test.sh @@ -11,6 +11,7 @@ collectors=$(cat << COLLECTORS entropy filefd hwmon + infiniband ksmd loadavg mdadm diff --git a/node_exporter.go b/node_exporter.go index b6a7f0e1..3e87bc43 100644 --- a/node_exporter.go +++ b/node_exporter.go @@ -32,7 +32,7 @@ import ( ) const ( - defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs" + defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,infiniband,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs" ) var (