Merge pull request #450 from roclark/add-infiniband
infiniband: Add new collector for InfiniBand statistics
pull/451/head^2
commit
38cd07ebb9
|
@ -32,6 +32,7 @@ The following individuals have contributed code to this repository
|
|||
* Ken Herner <ken@modulus.io>
|
||||
* Matt Layher <mdlayher@gmail.com>
|
||||
* Matthias Rampke <matthias@rampke.de>
|
||||
* Robert Clark <robert.d.clark@hpe.com>
|
||||
* Siavash Safi <siavash.safi@gmail.com>
|
||||
* Stephen Shirley <kormat@gmail.com>
|
||||
* Steve Durrheimer <s.durrheimer@gmail.com>
|
||||
|
|
|
@ -28,6 +28,7 @@ entropy | Exposes available entropy. | Linux
|
|||
filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux
|
||||
filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD
|
||||
hwmon | Exposes hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux
|
||||
infiniband | Exposes network statistics specific to InfiniBand configurations. | Linux
|
||||
loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris
|
||||
mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux
|
||||
meminfo | Exposes memory statistics. | Darwin, Dragonfly, FreeBSD, Linux
|
||||
|
|
|
@ -689,6 +689,38 @@ node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp2"} 84
|
|||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84
|
||||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84
|
||||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84
|
||||
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
|
||||
# TYPE node_infiniband_link_downed_total counter
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
|
||||
# TYPE node_infiniband_link_error_recovery_total counter
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
|
||||
# TYPE node_infiniband_multicast_packets_received_total counter
|
||||
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="1"} 93
|
||||
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_multicast_packets_transmitted_total Number of multicast packets transmitted (including errors)
|
||||
# TYPE node_infiniband_multicast_packets_transmitted_total counter
|
||||
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
|
||||
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_received_bytes Number of data octets received on all links
|
||||
# TYPE node_infiniband_port_data_received_bytes counter
|
||||
node_infiniband_port_data_received_bytes{device="mlx4_0",port="1"} 4.631917e+06
|
||||
node_infiniband_port_data_received_bytes{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_transmitted_bytes Number of data octets transmitted on all links
|
||||
# TYPE node_infiniband_port_data_transmitted_bytes counter
|
||||
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="1"} 3.73344e+06
|
||||
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
|
||||
# TYPE node_infiniband_unicast_packets_received_total counter
|
||||
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148
|
||||
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_unicast_packets_transmitted_total Number of unicast packets transmitted (including errors)
|
||||
# TYPE node_infiniband_unicast_packets_transmitted_total counter
|
||||
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="1"} 61239
|
||||
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_intr Total number of interrupts serviced.
|
||||
# TYPE node_intr counter
|
||||
node_intr 8.885917e+06
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
93
|
|
@ -0,0 +1 @@
|
|||
16
|
|
@ -0,0 +1 @@
|
|||
4631917
|
|
@ -0,0 +1 @@
|
|||
3733440
|
|
@ -0,0 +1 @@
|
|||
61148
|
|
@ -0,0 +1 @@
|
|||
61239
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1,177 @@
|
|||
// Copyright 2017 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build linux
|
||||
// +build !noinfiniband
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/log"
|
||||
)
|
||||
|
||||
// infinibandPath is the InfiniBand class directory; it is handed to
// sysFilePath to obtain the full path that is scanned for devices.
const infinibandPath = "class/infiniband"

// Sentinel errors returned by the discovery helpers so that callers can tell
// "nothing was found" apart from any other failure.
var (
	// errInfinibandNoDevicesFound is returned when no device entries match.
	errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")
	// errInfinibandNoPortsFound is returned when a device has no port entries.
	errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
)
|
||||
|
||||
type infinibandCollector struct {
|
||||
metricDescs map[string]*prometheus.Desc
|
||||
counters map[string]infinibandMetric
|
||||
}
|
||||
|
||||
// infinibandMetric describes a single exported counter. The field order is
// significant: the counter table is built with positional composite literals.
type infinibandMetric struct {
	File string // name of the counter file inside a port's "counters" directory
	Help string // help text attached to the exported metric
}
|
||||
|
||||
func init() {
|
||||
Factories["infiniband"] = NewInfiniBandCollector
|
||||
}
|
||||
|
||||
func NewInfiniBandCollector() (Collector, error) {
|
||||
var i infinibandCollector
|
||||
|
||||
// Filenames of all InfiniBand counter metrics including a detailed description.
|
||||
i.counters = map[string]infinibandMetric{
|
||||
"link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"},
|
||||
"link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"},
|
||||
"multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"},
|
||||
"multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"},
|
||||
"port_data_received_bytes": {"port_rcv_data", "Number of data octets received on all links"},
|
||||
"port_data_transmitted_bytes": {"port_xmit_data", "Number of data octets transmitted on all links"},
|
||||
"unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"},
|
||||
"unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"},
|
||||
}
|
||||
|
||||
subsystem := "infiniband"
|
||||
i.metricDescs = make(map[string]*prometheus.Desc)
|
||||
|
||||
for metricName, infinibandMetric := range i.counters {
|
||||
i.metricDescs[metricName] = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(Namespace, subsystem, metricName),
|
||||
infinibandMetric.Help,
|
||||
[]string{"device", "port"},
|
||||
nil,
|
||||
)
|
||||
}
|
||||
|
||||
return &i, nil
|
||||
}
|
||||
|
||||
// infinibandDevices retrieves a list of InfiniBand devices.
|
||||
func infinibandDevices(infinibandPath string) ([]string, error) {
|
||||
devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(devices) < 1 {
|
||||
log.Debugf("Unable to detect InfiniBand devices")
|
||||
err = errInfinibandNoDevicesFound
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Extract just the filenames which equate to the device names.
|
||||
for i, device := range devices {
|
||||
devices[i] = filepath.Base(device)
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// Retrieve a list of ports for the InfiniBand device.
|
||||
func infinibandPorts(infinibandPath, device string) ([]string, error) {
|
||||
ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(ports) < 1 {
|
||||
log.Debugf("Unable to detect ports for %s", device)
|
||||
err = errInfinibandNoPortsFound
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Extract just the filenames which equates to the port numbers.
|
||||
for i, port := range ports {
|
||||
ports[i] = filepath.Base(port)
|
||||
}
|
||||
|
||||
return ports, nil
|
||||
}
|
||||
|
||||
func readMetric(directory, metricFile string) (uint64, error) {
|
||||
metric, err := readUintFromFile(filepath.Join(directory, metricFile))
|
||||
if err != nil {
|
||||
log.Debugf("Error reading %q file", metricFile)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return metric, nil
|
||||
}
|
||||
|
||||
func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) (err error) {
|
||||
devices, err := infinibandDevices(sysFilePath(infinibandPath))
|
||||
|
||||
// If no devices are found or another error is raised while attempting to find devices,
|
||||
// InfiniBand is likely not installed and the collector should be skipped.
|
||||
switch err {
|
||||
case nil:
|
||||
case errInfinibandNoDevicesFound:
|
||||
return nil
|
||||
default:
|
||||
return err
|
||||
}
|
||||
|
||||
for _, device := range devices {
|
||||
ports, err := infinibandPorts(sysFilePath(infinibandPath), device)
|
||||
|
||||
// If no ports are found for the specified device, skip to the next device.
|
||||
switch err {
|
||||
case nil:
|
||||
case errInfinibandNoPortsFound:
|
||||
continue
|
||||
default:
|
||||
return err
|
||||
}
|
||||
|
||||
for _, port := range ports {
|
||||
portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port))
|
||||
|
||||
// Add metrics for the InfiniBand counters.
|
||||
for metricName, infinibandMetric := range c.counters {
|
||||
metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.metricDescs[metricName],
|
||||
prometheus.CounterValue,
|
||||
float64(metric),
|
||||
device,
|
||||
port,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
// Copyright 2017 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestInfiniBandDevices(t *testing.T) {
|
||||
devices, err := infinibandDevices("fixtures/sys/class/infiniband")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if l := len(devices); l != 1 {
|
||||
t.Fatal("Retrieved an unexpected number of InfiniBand devices: %d", l)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInfiniBandPorts(t *testing.T) {
|
||||
ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if l := len(ports); l != 2 {
|
||||
t.Fatal("Retrieved an unexpected number of InfiniBand ports: %d", l)
|
||||
}
|
||||
}
|
|
@ -11,6 +11,7 @@ collectors=$(cat << COLLECTORS
|
|||
entropy
|
||||
filefd
|
||||
hwmon
|
||||
infiniband
|
||||
ksmd
|
||||
loadavg
|
||||
mdadm
|
||||
|
|
|
@ -32,7 +32,7 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs"
|
||||
defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,infiniband,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs"
|
||||
)
|
||||
|
||||
var (
|
||||
|
|
Loading…
Reference in New Issue