Add new collector for InfiniBand statistics
Add new metrics for the InfiniBand network protocol, including the number of packets sent and received, the number of times the link has gone down, and the number of times the link has recovered from an error state.
Signed-Off-By: Robert Clark <robert.d.clark@hpe.com>
pull/450/head
parent
ba635842fc
commit
4866adcb71
|
@ -654,6 +654,38 @@ node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp2"} 84
|
|||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84
|
||||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84
|
||||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84
|
||||
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
|
||||
# TYPE node_infiniband_link_downed_total counter
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
|
||||
# TYPE node_infiniband_link_error_recovery_total counter
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
|
||||
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
|
||||
# TYPE node_infiniband_multicast_packets_received_total counter
|
||||
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="1"} 93
|
||||
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_multicast_packets_transmitted_total Number of multicast packets transmitted (including errors)
|
||||
# TYPE node_infiniband_multicast_packets_transmitted_total counter
|
||||
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
|
||||
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_received_bytes Number of data octets received on all links
|
||||
# TYPE node_infiniband_port_data_received_bytes counter
|
||||
node_infiniband_port_data_received_bytes{device="mlx4_0",port="1"} 4.631917e+06
|
||||
node_infiniband_port_data_received_bytes{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_port_data_transmitted_bytes Number of data octets transmitted on all links
|
||||
# TYPE node_infiniband_port_data_transmitted_bytes counter
|
||||
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="1"} 3.73344e+06
|
||||
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
|
||||
# TYPE node_infiniband_unicast_packets_received_total counter
|
||||
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148
|
||||
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_infiniband_unicast_packets_transmitted_total Number of unicast packets transmitted (including errors)
|
||||
# TYPE node_infiniband_unicast_packets_transmitted_total counter
|
||||
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="1"} 61239
|
||||
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||
# HELP node_intr Total number of interrupts serviced.
|
||||
# TYPE node_intr counter
|
||||
node_intr 8.885917e+06
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
93
|
|
@ -0,0 +1 @@
|
|||
16
|
|
@ -0,0 +1 @@
|
|||
4631917
|
|
@ -0,0 +1 @@
|
|||
3733440
|
|
@ -0,0 +1 @@
|
|||
61148
|
|
@ -0,0 +1 @@
|
|||
61239
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1 @@
|
|||
0
|
|
@ -0,0 +1,177 @@
|
|||
// Copyright 2017 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build linux
|
||||
// +build !noinfiniband
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/log"
|
||||
)
|
||||
|
||||
// infinibandPath is the InfiniBand class directory; it is resolved through
// sysFilePath before being read.
const infinibandPath = "class/infiniband"
|
||||
|
||||
// Sentinel errors that let Update tell "nothing to report" apart from a real failure.
var (
	// errInfinibandNoDevicesFound is returned by infinibandDevices when the
	// device glob matches nothing.
	errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")

	// errInfinibandNoPortsFound is returned by infinibandPorts when a device
	// has no entries under its ports directory.
	errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
)
|
||||
|
||||
type infinibandCollector struct {
|
||||
metricDescs map[string]*prometheus.Desc
|
||||
counters map[string]infinibandMetric
|
||||
}
|
||||
|
||||
// infinibandMetric describes a single counter: File is the name of the file
// the value is read from and Help is the description exported with the metric.
// The field order matters because values are built with positional literals.
type infinibandMetric struct {
	File, Help string
}
|
||||
|
||||
func init() {
|
||||
Factories["infiniband"] = NewInfiniBandCollector
|
||||
}
|
||||
|
||||
func NewInfiniBandCollector() (Collector, error) {
|
||||
var i infinibandCollector
|
||||
|
||||
// Filenames of all InfiniBand counter metrics including a detailed description.
|
||||
i.counters = map[string]infinibandMetric{
|
||||
"link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"},
|
||||
"link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"},
|
||||
"multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"},
|
||||
"multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"},
|
||||
"port_data_received_bytes": {"port_rcv_data", "Number of data octets received on all links"},
|
||||
"port_data_transmitted_bytes": {"port_xmit_data", "Number of data octets transmitted on all links"},
|
||||
"unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"},
|
||||
"unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"},
|
||||
}
|
||||
|
||||
subsystem := "infiniband"
|
||||
i.metricDescs = make(map[string]*prometheus.Desc)
|
||||
|
||||
for metricName, infinibandMetric := range i.counters {
|
||||
i.metricDescs[metricName] = prometheus.NewDesc(
|
||||
prometheus.BuildFQName(Namespace, subsystem, metricName),
|
||||
infinibandMetric.Help,
|
||||
[]string{"device", "port"},
|
||||
nil,
|
||||
)
|
||||
}
|
||||
|
||||
return &i, nil
|
||||
}
|
||||
|
||||
// infinibandDevices retrieves a list of InfiniBand devices.
|
||||
func infinibandDevices(infinibandPath string) ([]string, error) {
|
||||
devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(devices) < 1 {
|
||||
log.Debugf("Unable to detect InfiniBand devices")
|
||||
err = errInfinibandNoDevicesFound
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Extract just the filenames which equate to the device names.
|
||||
for i, device := range devices {
|
||||
devices[i] = filepath.Base(device)
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// Retrieve a list of ports for the InfiniBand device.
|
||||
func infinibandPorts(infinibandPath, device string) ([]string, error) {
|
||||
ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*"))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(ports) < 1 {
|
||||
log.Debugf("Unable to detect ports for %s", device)
|
||||
err = errInfinibandNoPortsFound
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Extract just the filenames which equates to the port numbers.
|
||||
for i, port := range ports {
|
||||
ports[i] = filepath.Base(port)
|
||||
}
|
||||
|
||||
return ports, nil
|
||||
}
|
||||
|
||||
func readMetric(directory, metricFile string) (uint64, error) {
|
||||
metric, err := readUintFromFile(filepath.Join(directory, metricFile))
|
||||
if err != nil {
|
||||
log.Debugf("Error reading %q file", metricFile)
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return metric, nil
|
||||
}
|
||||
|
||||
func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) (err error) {
|
||||
devices, err := infinibandDevices(sysFilePath(infinibandPath))
|
||||
|
||||
// If no devices are found or another error is raised while attempting to find devices,
|
||||
// InfiniBand is likely not installed and the collector should be skipped.
|
||||
switch err {
|
||||
case nil:
|
||||
case errInfinibandNoDevicesFound:
|
||||
return nil
|
||||
default:
|
||||
return err
|
||||
}
|
||||
|
||||
for _, device := range devices {
|
||||
ports, err := infinibandPorts(sysFilePath(infinibandPath), device)
|
||||
|
||||
// If no ports are found for the specified device, skip to the next device.
|
||||
switch err {
|
||||
case nil:
|
||||
case errInfinibandNoPortsFound:
|
||||
continue
|
||||
default:
|
||||
return err
|
||||
}
|
||||
|
||||
for _, port := range ports {
|
||||
portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port))
|
||||
|
||||
// Add metrics for the InfiniBand counters.
|
||||
for metricName, infinibandMetric := range c.counters {
|
||||
metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.metricDescs[metricName],
|
||||
prometheus.CounterValue,
|
||||
float64(metric),
|
||||
device,
|
||||
port,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -10,6 +10,7 @@ collectors=$(cat << COLLECTORS
|
|||
entropy
|
||||
filefd
|
||||
hwmon
|
||||
infiniband
|
||||
ksmd
|
||||
loadavg
|
||||
mdadm
|
||||
|
|
Loading…
Reference in New Issue