Browse Source

infiniband: Handle iWARP* RDMA modules N/A (#974)

* infiniband: Add not connected i40iw0/ports/1 fixtures
* infiniband: Handle issue when iWARP* RDMA modules are not available

This is related to #966 and handles the following error:

Jun 07 13:33:24 hostname node_exporter[81888]: time="2018-06-07T13:33:24+02:00" level=error msg="ERROR: infiniband
collector failed after 0.000929s: strconv.ParseUint: parsing \"N/A (no PMA)\": invalid syntax" source="collector.go:132"

Signed-off-by: Mario Trangoni <mjtrangoni@gmail.com>
pull/1084/head
Mario Trangoni 6 years ago committed by Ben Kochie
parent
commit
3659260b66
  1. 4
      collector/fixtures/e2e-64k-page-output.txt
  2. 4
      collector/fixtures/e2e-output.txt
  3. 97
      collector/fixtures/sys.ttar
  4. 10
      collector/infiniband_linux.go
  5. 2
      collector/infiniband_linux_test.go

4
collector/fixtures/e2e-64k-page-output.txt

@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
# TYPE node_infiniband_link_downed_total counter # TYPE node_infiniband_link_downed_total counter
node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
# TYPE node_infiniband_link_error_recovery_total counter # TYPE node_infiniband_link_error_recovery_total counter
node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
# TYPE node_infiniband_port_data_received_bytes_total counter # TYPE node_infiniband_port_data_received_bytes_total counter
node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
# TYPE node_infiniband_port_data_transmitted_bytes_total counter # TYPE node_infiniband_port_data_transmitted_bytes_total counter
node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)

4
collector/fixtures/e2e-output.txt

@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1
node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
# TYPE node_infiniband_link_downed_total counter # TYPE node_infiniband_link_downed_total counter
node_infiniband_link_downed_total{device="i40iw0",port="1"} 0
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
# TYPE node_infiniband_link_error_recovery_total counter # TYPE node_infiniband_link_error_recovery_total counter
node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links
# TYPE node_infiniband_port_data_received_bytes_total counter # TYPE node_infiniband_port_data_received_bytes_total counter
node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07
node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links
# TYPE node_infiniband_port_data_transmitted_bytes_total counter # TYPE node_infiniband_port_data_transmitted_bytes_total counter
node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07
node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)

97
collector/fixtures/sys.ttar

@ -109,6 +109,103 @@ Mode: 644
Directory: sys/class/infiniband Directory: sys/class/infiniband
Mode: 755 Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/infiniband/i40iw0
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/infiniband/i40iw0/ports
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/infiniband/i40iw0/ports/1
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/infiniband/i40iw0/ports/1/counters
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/excessive_buffer_overrun_errors
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/link_downed
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/link_error_recovery
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/local_link_integrity_errors
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_constraint_errors
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_data
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_errors
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_packets
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_remote_physical_errors
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_switch_relay_errors
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_constraint_errors
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_data
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_discards
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_packets
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_wait
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/symbol_error
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/infiniband/i40iw0/ports/1/counters/VL15_dropped
Lines: 1
N/A (no PMA)
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/infiniband/mlx4_0 Directory: sys/class/infiniband/mlx4_0
Mode: 755 Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

10
collector/infiniband_linux.go

@ -20,6 +20,7 @@ import (
"errors" "errors"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log" "github.com/prometheus/common/log"
@ -144,6 +145,15 @@ func infinibandPorts(infinibandPath, device string) ([]string, error) {
func readMetric(directory, metricFile string) (uint64, error) { func readMetric(directory, metricFile string) (uint64, error) {
metric, err := readUintFromFile(filepath.Join(directory, metricFile)) metric, err := readUintFromFile(filepath.Join(directory, metricFile))
if err != nil { if err != nil {
// Ugly workaround for handling #966, when counters report
// `N/A (no PMA)` (i.e. not available) instead of a number.
// This was already patched and submitted, see
// https://www.spinics.net/lists/linux-rdma/msg68596.html
// Remove this as soon as the fix lands in the enterprise distros.
if strings.Contains(err.Error(), "N/A (no PMA)") {
log.Debugf("%q value is N/A", metricFile)
return 0, nil
}
log.Debugf("Error reading %q file", metricFile) log.Debugf("Error reading %q file", metricFile)
return 0, err return 0, err
} }

2
collector/infiniband_linux_test.go

@ -23,7 +23,7 @@ func TestInfiniBandDevices(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
if l := len(devices); l != 1 { if l := len(devices); l != 2 {
t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l) t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l)
} }
} }

Loading…
Cancel
Save