textfile example storcli enhancements (#1145)
* storcli.py: Remove IntEnum This removes an external dependency. Moved VD state to VD info labels * storcli.py: Fix BBU health detection BBU Status is 0 for a healthy cache vault and 32 for a healthy BBU. * storcli.py: Strip all strings from PD Strip all strings that we get from PDs. They often contain whitespace... * storcli.py: Add formatting options Add help text explaining how this document was formatted * storcli.py: Add DG to pd_info label Add disk group to pd_info. That way we can relate to PDs in the same DG. For example to check if all disks in one RAID use the same interface... * storcli.py: Fix promtool issues Fix linting issues reported by promtool check-metrics * storcli.py: Exit if storcli reports issues storcli reports if the command was a success. We should not continue if there are issues. * storcli.py: Try to parse metrics to float This will sanitize the values we hand over to node_exporter - eliminating any unforeseen values we read out... * storcli.py: Refactor code to implement handle_sas_controller() Move code into methods so that we can now also support HBA queries. * storcli.py: Sort imports "...like a good python developer" - Daniel Swarbrick * storcli.py: Replace external dateutil library with internal datetime Removes external dependency... * storcli.py: Also collect temperature on megaraid cards We have already collected them on mpt3sas cards... * storcli.py: Clean up old code Removed dead code that is not used any more. * storcli.py: strip() all information for labels They often contain whitespace... * storcli.py: Try to catch KeyErrors generally If some key we expect is not there, we will want to still print whatever we have collected so far... * storcli.py: Increment version number We have made some changes here and there. The general look of the data has not been changed. 
* storcli.py: Fix CodeSpell issue Split string to avoid issues with Codespell due to Celcius in JSON Key Signed-off-by: Christopher Blum <zeichenanonym@web.de> pull/1150/head
parent
29d4629f55
commit
1b98db9fa7
|
@ -12,21 +12,23 @@ Advanced Software Options (ASO) not exposed as metrics currently.
|
||||||
|
|
||||||
JSON key abbreviations used by StorCLI are documented in the standard command
|
JSON key abbreviations used by StorCLI are documented in the standard command
|
||||||
output, i.e. when you omit the trailing 'J' from the command.
|
output, i.e. when you omit the trailing 'J' from the command.
|
||||||
|
|
||||||
|
Formatting done with YAPF:
|
||||||
|
$ yapf -i --style '{COLUMN_LIMIT: 99}' storcli.py
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
from datetime import datetime
|
||||||
import argparse
|
import argparse
|
||||||
|
import collections
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import subprocess
|
|
||||||
import shlex
|
import shlex
|
||||||
from dateutil.parser import parse
|
import subprocess
|
||||||
import collections
|
|
||||||
from enum import IntEnum
|
|
||||||
|
|
||||||
DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
|
DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
|
||||||
Prometheus metrics."""
|
Prometheus metrics."""
|
||||||
VERSION = '0.0.2'
|
VERSION = '0.0.3'
|
||||||
|
|
||||||
storcli_path = ''
|
storcli_path = ''
|
||||||
metric_prefix = 'megaraid_'
|
metric_prefix = 'megaraid_'
|
||||||
|
@ -34,59 +36,55 @@ metric_list = {}
|
||||||
metric_list = collections.defaultdict(list)
|
metric_list = collections.defaultdict(list)
|
||||||
|
|
||||||
|
|
||||||
class VD_State(IntEnum):
|
|
||||||
Optl = 0 # Optimal
|
|
||||||
Dgrd = 1 # Degraded
|
|
||||||
Pdgd = 2 # Partially Degraded
|
|
||||||
OfLn = 3 # Offline
|
|
||||||
Rec = 4 # Recovery
|
|
||||||
Cac = 5 # CacheCade
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
""" main """
|
""" main """
|
||||||
global storcli_path
|
global storcli_path
|
||||||
storcli_path = args.storcli_path
|
storcli_path = args.storcli_path
|
||||||
data = json.loads(get_storcli_json('/cALL show all J'))
|
data = get_storcli_json('/cALL show all J')
|
||||||
|
|
||||||
# All the information is collected underneath the Controllers key
|
try:
|
||||||
data = data['Controllers']
|
# All the information is collected underneath the Controllers key
|
||||||
|
data = data['Controllers']
|
||||||
|
|
||||||
# try:
|
for controller in data:
|
||||||
# overview = status['Response Data']['System Overview']
|
response = controller['Response Data']
|
||||||
# except KeyError:
|
if response['Version']['Driver Name'] == 'megaraid_sas':
|
||||||
# pass
|
handle_megaraid_controller(response)
|
||||||
|
elif response['Version']['Driver Name'] == 'mpt3sas':
|
||||||
|
handle_sas_controller(response)
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
|
||||||
for controller in data:
|
|
||||||
response = controller['Response Data']
|
|
||||||
if response['Version']['Driver Name'] == 'megaraid_sas':
|
|
||||||
handle_megaraid_controller(response)
|
|
||||||
elif response['Version']['Driver Name'] == 'mpt3sas':
|
|
||||||
handle_sas_controller(response)
|
|
||||||
|
|
||||||
# print_dict_to_exporter({'controller_info': [1]}, controller_info_list)
|
|
||||||
# print_dict_to_exporter({'virtual_disk_info': [1]}, vd_info_list)
|
|
||||||
# print_dict_to_exporter({'physical_disk_info': [1]}, pd_info_list)
|
|
||||||
# print_all_metrics(vd_metric_list)
|
|
||||||
print_all_metrics(metric_list)
|
print_all_metrics(metric_list)
|
||||||
|
|
||||||
|
|
||||||
def handle_sas_controller(response):
|
def handle_sas_controller(response):
|
||||||
pass
|
(controller_index, baselabel) = get_basic_controller_info(response)
|
||||||
|
add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'OK'))
|
||||||
|
add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
|
||||||
|
try:
|
||||||
|
# The number of physical disks is half of the number of items in this dict
|
||||||
|
# Every disk is listed twice - once for basic info, again for detailed info
|
||||||
|
add_metric('physical_drives', baselabel,
|
||||||
|
len(response['Physical Device Information'].keys()) / 2)
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
# Split up string to not trigger CodeSpell issues
|
||||||
|
add_metric('temperature', baselabel,
|
||||||
|
int(response['HwCfg']['ROC temperature(Degree Celc' + 'ius)']))
|
||||||
|
for key, basic_disk_info in response['Physical Device Information'].items():
|
||||||
|
if 'Detailed Information' in key:
|
||||||
|
continue
|
||||||
|
create_metrcis_of_physical_drive(basic_disk_info[0],
|
||||||
|
response['Physical Device Information'], controller_index)
|
||||||
|
|
||||||
|
|
||||||
def handle_megaraid_controller(response):
|
def handle_megaraid_controller(response):
|
||||||
controller_index = response['Basics']['Controller']
|
(controller_index, baselabel) = get_basic_controller_info(response)
|
||||||
baselabel = 'controller="{}"'.format(controller_index)
|
|
||||||
|
|
||||||
controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format(
|
# BBU Status Optimal value is 0 for cachevault and 32 for BBU
|
||||||
response['Basics']['Model'],
|
add_metric('battery_backup_healthy', baselabel,
|
||||||
response['Basics']['Serial Number'],
|
int(response['Status']['BBU Status'] in [0, 32]))
|
||||||
response['Version']['Firmware Version'],
|
|
||||||
)
|
|
||||||
add_metric('controller_info', controller_info_label, 1)
|
|
||||||
|
|
||||||
add_metric('battery_backup_healthy', baselabel, int(response['Status']['BBU Status'] == 0))
|
|
||||||
add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded'))
|
add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded'))
|
||||||
add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed'))
|
add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed'))
|
||||||
add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal'))
|
add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal'))
|
||||||
|
@ -96,10 +94,13 @@ def handle_megaraid_controller(response):
|
||||||
add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
|
add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
|
||||||
add_metric('scheduled_patrol_read', baselabel,
|
add_metric('scheduled_patrol_read', baselabel,
|
||||||
int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))
|
int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))
|
||||||
|
add_metric('temperature', baselabel, int(response['HwCfg']['ROC temperature(Degree Celsius)']))
|
||||||
|
|
||||||
time_difference_seconds = -1
|
time_difference_seconds = -1
|
||||||
system_time = parse(response['Basics'].get('Current System Date/time'))
|
system_time = datetime.strptime(response['Basics'].get('Current System Date/time'),
|
||||||
controller_time = parse(response['Basics'].get('Current Controller Date/Time'))
|
"%m/%d/%Y, %H:%M:%S")
|
||||||
|
controller_time = datetime.strptime(response['Basics'].get('Current Controller Date/Time'),
|
||||||
|
"%m/%d/%Y, %H:%M:%S")
|
||||||
if system_time and controller_time:
|
if system_time and controller_time:
|
||||||
time_difference_seconds = abs(system_time - controller_time).seconds
|
time_difference_seconds = abs(system_time - controller_time).seconds
|
||||||
add_metric('time_difference', baselabel, time_difference_seconds)
|
add_metric('time_difference', baselabel, time_difference_seconds)
|
||||||
|
@ -112,58 +113,84 @@ def handle_megaraid_controller(response):
|
||||||
volume_group = vd_position.split('/')[1]
|
volume_group = vd_position.split('/')[1]
|
||||||
vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group,
|
vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group,
|
||||||
volume_group)
|
volume_group)
|
||||||
vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}"'.format(
|
vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}",state="{}"'.format(
|
||||||
virtual_drive.get('Name'), virtual_drive.get('Cache'), virtual_drive.get('TYPE'))
|
str(virtual_drive.get('Name')).strip(),
|
||||||
|
str(virtual_drive.get('Cache')).strip(),
|
||||||
|
str(virtual_drive.get('TYPE')).strip(),
|
||||||
|
str(virtual_drive.get('State')).strip())
|
||||||
add_metric('vd_info', vd_info_label, 1)
|
add_metric('vd_info', vd_info_label, 1)
|
||||||
add_metric('vd_status', vd_baselabel, int(VD_State[virtual_drive.get('State')]))
|
|
||||||
|
|
||||||
if response['Physical Drives'] > 0:
|
if response['Physical Drives'] > 0:
|
||||||
data = json.loads(get_storcli_json('/cALL/eALL/sALL show all J'))
|
data = get_storcli_json('/cALL/eALL/sALL show all J')
|
||||||
drive_info = data['Controllers'][controller_index]['Response Data']
|
drive_info = data['Controllers'][controller_index]['Response Data']
|
||||||
for physical_drive in response['PD LIST']:
|
for physical_drive in response['PD LIST']:
|
||||||
enclosure = physical_drive.get('EID:Slt').split(':')[0]
|
create_metrcis_of_physical_drive(physical_drive, drive_info, controller_index)
|
||||||
slot = physical_drive.get('EID:Slt').split(':')[1]
|
|
||||||
|
|
||||||
pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format(
|
|
||||||
controller_index, enclosure, slot)
|
|
||||||
pd_info_label = pd_baselabel + ',disk_id="{}",interface="{}",media="{}",model="{}"'.format(
|
|
||||||
physical_drive.get('DID'), physical_drive.get('Intf'), physical_drive.get('Med'),
|
|
||||||
physical_drive.get('Model').strip())
|
|
||||||
|
|
||||||
drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str(
|
def get_basic_controller_info(response):
|
||||||
slot)
|
controller_index = response['Basics']['Controller']
|
||||||
try:
|
baselabel = 'controller="{}"'.format(controller_index)
|
||||||
info = drive_info[drive_identifier + ' - Detailed Information']
|
|
||||||
state = info[drive_identifier + ' State']
|
|
||||||
attributes = info[drive_identifier + ' Device attributes']
|
|
||||||
settings = info[drive_identifier + ' Policies/Settings']
|
|
||||||
|
|
||||||
add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
|
controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format(
|
||||||
add_metric('pd_media_errors_total', pd_baselabel, state['Media Error Count'])
|
str(response['Basics']['Model']).strip(),
|
||||||
add_metric('pd_other_errors_total', pd_baselabel, state['Other Error Count'])
|
str(response['Basics']['Serial Number']).strip(),
|
||||||
add_metric('pd_predictive_errors_total', pd_baselabel,
|
str(response['Version']['Firmware Version']).strip(),
|
||||||
state['Predictive Failure Count'])
|
)
|
||||||
add_metric('pd_smart_alerted', pd_baselabel,
|
add_metric('controller_info', controller_info_label, 1)
|
||||||
int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
|
|
||||||
add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
|
return (controller_index, baselabel)
|
||||||
add_metric('pd_device_speed_gbps', pd_baselabel,
|
|
||||||
attributes['Device Speed'].split('.')[0])
|
|
||||||
add_metric('pd_commissioned_spare', pd_baselabel,
|
def create_metrcis_of_physical_drive(physical_drive, detailed_info_array, controller_index):
|
||||||
int(settings['Commissioned Spare'] == 'Yes'))
|
enclosure = physical_drive.get('EID:Slt').split(':')[0]
|
||||||
add_metric('pd_emergency_spare', pd_baselabel,
|
slot = physical_drive.get('EID:Slt').split(':')[1]
|
||||||
int(settings['Emergency Spare'] == 'Yes'))
|
|
||||||
pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision'])
|
pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format(controller_index, enclosure,
|
||||||
except KeyError:
|
slot)
|
||||||
pass
|
pd_info_label = pd_baselabel + \
|
||||||
add_metric('pd_info', pd_info_label, 1)
|
',disk_id="{}",interface="{}",media="{}",model="{}",DG="{}"'.format(
|
||||||
|
str(physical_drive.get('DID')).strip(),
|
||||||
|
str(physical_drive.get('Intf')).strip(),
|
||||||
|
str(physical_drive.get('Med')).strip(),
|
||||||
|
str(physical_drive.get('Model')).strip(),
|
||||||
|
str(physical_drive.get('DG')).strip())
|
||||||
|
|
||||||
|
drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str(
|
||||||
|
slot)
|
||||||
|
if enclosure == ' ':
|
||||||
|
drive_identifier = 'Drive /c' + str(controller_index) + '/s' + str(slot)
|
||||||
|
try:
|
||||||
|
info = detailed_info_array[drive_identifier + ' - Detailed Information']
|
||||||
|
state = info[drive_identifier + ' State']
|
||||||
|
attributes = info[drive_identifier + ' Device attributes']
|
||||||
|
settings = info[drive_identifier + ' Policies/Settings']
|
||||||
|
|
||||||
|
add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
|
||||||
|
add_metric('pd_media_errors', pd_baselabel, state['Media Error Count'])
|
||||||
|
add_metric('pd_other_errors', pd_baselabel, state['Other Error Count'])
|
||||||
|
add_metric('pd_predictive_errors', pd_baselabel, state['Predictive Failure Count'])
|
||||||
|
add_metric('pd_smart_alerted', pd_baselabel,
|
||||||
|
int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
|
||||||
|
add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
|
||||||
|
add_metric('pd_device_speed_gbps', pd_baselabel, attributes['Device Speed'].split('.')[0])
|
||||||
|
add_metric('pd_commissioned_spare', pd_baselabel,
|
||||||
|
int(settings['Commissioned Spare'] == 'Yes'))
|
||||||
|
add_metric('pd_emergency_spare', pd_baselabel, int(settings['Emergency Spare'] == 'Yes'))
|
||||||
|
pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision'].strip())
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
add_metric('pd_info', pd_info_label, 1)
|
||||||
|
|
||||||
|
|
||||||
def add_metric(name, labels, value):
|
def add_metric(name, labels, value):
|
||||||
global metric_list
|
global metric_list
|
||||||
metric_list[name].append({
|
try:
|
||||||
'labels': labels,
|
metric_list[name].append({
|
||||||
'value': value,
|
'labels': labels,
|
||||||
})
|
'value': float(value),
|
||||||
|
})
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def print_all_metrics(metrics):
|
def print_all_metrics(metrics):
|
||||||
|
@ -184,8 +211,11 @@ def get_storcli_json(storcli_args):
|
||||||
proc = subprocess.Popen(
|
proc = subprocess.Popen(
|
||||||
storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
output_json = proc.communicate()[0]
|
output_json = proc.communicate()[0]
|
||||||
|
data = json.loads(output_json.decode("utf-8"))
|
||||||
|
|
||||||
return output_json.decode("utf-8")
|
if data["Controllers"][0]["Command Status"]["Status"] != "Success":
|
||||||
|
SystemExit(1)
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in New Issue