Browse Source

Add mellanox_hca_temp text collector example (#1128)

* deleted_libraries: Upgrade to Python 3

Python 2.7 will not be maintained past 2020. Therefore upgrade
text_collector_examples/deleted_libraries.py to Python 3.

* Add mellanox_hca_temp text collector example

mellanox_hca_temp is a script that reads Mellanox HCA temperature using
the Mellanox mget_temp_ext tool.

Signed-off-by: Benjamin Drung <benjamin.drung@cloud.ionos.com>
pull/1143/head
Benjamin Drung 6 years ago committed by Ben Kochie
parent
commit
2d5fcdeef4
  1. 10
      text_collector_examples/deleted_libraries.py
  2. 59
      text_collector_examples/mellanox_hca_temp

10
text_collector_examples/deleted_libraries.py

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
"""
Script to count the number of deleted libraries that are linked by running
processes and expose a summary as Prometheus metrics.
@ -20,7 +20,7 @@ def main():
try:
with open(path, 'rb') as file:
for line in file:
part = line.strip().split()
part = line.decode().strip().split()
if len(part) == 7:
library = part[5]
@ -42,9 +42,9 @@ def main():
num_processes_per_library = {}
for process, library_count in processes_linking_deleted_libraries.iteritems():
for process, library_count in processes_linking_deleted_libraries.items():
libraries_seen = set()
for library, count in library_count.iteritems():
for library, count in library_count.items():
if library in libraries_seen:
continue
@ -59,7 +59,7 @@ def main():
print('# HELP {0} {1}'.format(metric_name, description))
print('# TYPE {0} gauge'.format(metric_name))
for library, count in num_processes_per_library.iteritems():
for library, count in num_processes_per_library.items():
dir_path, basename = os.path.split(library)
basename = basename.replace('"', '\\"')
dir_path = dir_path.replace('"', '\\"')

59
text_collector_examples/mellanox_hca_temp

@ -0,0 +1,59 @@
#!/bin/bash
set -eu

# Script to read Mellanox HCA temperature using the Mellanox mget_temp_ext tool
#
# For every directory found under /sys/class/infiniband this script calls
# "mget_temp_ext -d <name>" and prints one node_infiniband_hca_temp_celsius
# gauge sample to stdout. Diagnostics are written to stderr.
#
# Exit status is 1 when the script is not run as root, when mget_temp_ext
# cannot be found, or when no device directory was found.

# Copyright 2018 The Prometheus Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author: Jan Phillip Greimann <jan.greimann@cloud.ionos.com>

# check if root
if [ "$EUID" -ne 0 ]; then
  echo "${0##*/}: Please run as root!" >&2
  exit 1
fi

# check if programs are installed
if ! command -v mget_temp_ext >/dev/null 2>&1; then
  echo "${0##*/}: mget_temp_ext is not installed. Aborting." >&2
  exit 1
fi

cat <<EOF
# HELP node_infiniband_hca_temp_celsius Celsius temperature of Mellanox InfiniBand HCA.
# TYPE node_infiniband_hca_temp_celsius gauge
EOF

# Only a plain decimal number is accepted as a sample value. An empty or
# non-numeric reading would produce a malformed metric line and make the
# whole output unparsable, so such readings are reported and skipped.
number_re='^[+-]?[0-9]+([.][0-9]+)?$'

# run for each found Mellanox device
for dev in /sys/class/infiniband/*; do
  # Anything that is not a directory is skipped; this includes the
  # unexpanded pattern itself when the glob matches nothing.
  if test ! -d "$dev"; then
    continue
  fi
  device="${dev##*/}"

  # get temperature
  if ! temperature="$(mget_temp_ext -d "${device}")"; then
    echo "${0##*/}: Failed to get temperature from InfiniBand HCA '${device}'!" >&2
    continue
  fi

  # strip all whitespace (including the trailing newline) from the reading
  temperature="${temperature//[[:space:]]/}"

  if [[ ! "${temperature}" =~ ${number_re} ]]; then
    echo "${0##*/}: Unexpected temperature reading '${temperature}' from InfiniBand HCA '${device}'!" >&2
    continue
  fi

  # output
  echo "node_infiniband_hca_temp_celsius{hca_device=\"${device}\"} ${temperature}"
done

# if device is empty, no device was found
if [ -z "${device-}" ]; then
  echo "${0##*/}: No InfiniBand HCA device found!" >&2
  exit 1
fi
Loading…
Cancel
Save