@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python 3
"""
"""
Script to parse StorCLI ' s JSON output and expose
Script to parse StorCLI ' s JSON output and expose
MegaRAID health as Prometheus metrics .
MegaRAID health as Prometheus metrics .
@ -19,110 +19,181 @@ import argparse
import json
import json
import os
import os
import subprocess
import subprocess
import shlex
from dateutil . parser import parse
import collections
from enum import IntEnum
# Summary text shown by ``--help``.
DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
Prometheus metrics."""
VERSION = '0.0.2'

# Path to the StorCLI binary; main() overwrites it from the command line.
storcli_path = ''
# Prefix prepended to every exported metric name.
metric_prefix = 'megaraid_'
# Maps a metric name to the list of measurements recorded by add_metric().
metric_list = collections.defaultdict(list)
class VD_State(IntEnum):
    """Integer codes for the virtual-drive state abbreviations.

    Optl: Optimal
    Dgrd: Degraded
    Pdgd: Partially Degraded
    OfLn: Offline
    Rec:  Recovery
    Cac:  CacheCade
    """

    Optl = 0
    Dgrd = 1
    Pdgd = 2
    OfLn = 3
    Rec = 4
    Cac = 5
def main(args):
    """Query StorCLI for all controllers and print their health metrics.

    Args:
        args: Parsed command-line arguments; only ``args.storcli_path`` is read.
    """
    global storcli_path
    storcli_path = args.storcli_path

    data = json.loads(get_storcli_json('/cALL show all J'))

    # All the information is collected underneath the Controllers key
    for controller in data['Controllers']:
        response = controller['Response Data']
        # A controller entry without driver information is skipped rather
        # than aborting the whole run with a KeyError.
        driver_name = response.get('Version', {}).get('Driver Name')
        if driver_name == 'megaraid_sas':
            handle_megaraid_controller(response)
        elif driver_name == 'mpt3sas':
            handle_sas_controller(response)

    print_all_metrics(metric_list)
def handle_sas_controller(response):
    """Placeholder for controllers using the ``mpt3sas`` driver; records nothing."""
    return None
def handle_megaraid_controller(response):
    """Record controller, virtual-drive and physical-drive metrics.

    Args:
        response: The ``Response Data`` dict of one controller, as returned
            by ``/cALL show all J``.

    Raises:
        KeyError: If one of the mandatory controller-level keys is missing.
    """
    basics = response['Basics']
    status = response['Status']
    controller_index = basics['Controller']
    baselabel = 'controller="{}"'.format(controller_index)

    controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format(
        basics['Model'],
        basics['Serial Number'],
        response['Version']['Firmware Version'],
    )
    add_metric('controller_info', controller_info_label, 1)

    controller_status = status['Controller Status']
    add_metric('battery_backup_healthy', baselabel, int(status['BBU Status'] == 0))
    add_metric('degraded', baselabel, int(controller_status == 'Degraded'))
    add_metric('failed', baselabel, int(controller_status == 'Failed'))
    add_metric('healthy', baselabel, int(controller_status == 'Optimal'))
    add_metric('drive_groups', baselabel, response['Drive Groups'])
    add_metric('virtual_drives', baselabel, response['Virtual Drives'])
    add_metric('physical_drives', baselabel, response['Physical Drives'])
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
    add_metric('scheduled_patrol_read', baselabel,
               int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))

    # -1 means "unknown": one of the two timestamps was not reported.
    time_difference_seconds = -1
    system_time = basics.get('Current System Date/time')
    controller_time = basics.get('Current Controller Date/Time')
    # Only parse when both values are present; parse() cannot handle None.
    if system_time and controller_time:
        drift = abs(parse(system_time) - parse(controller_time))
        # total_seconds() keeps whole days, which ``.seconds`` silently drops.
        time_difference_seconds = int(drift.total_seconds())
    add_metric('time_difference', baselabel, time_difference_seconds)

    for virtual_drive in response['VD LIST']:
        _add_virtual_drive_metrics(controller_index, virtual_drive)

    if response['Physical Drives'] > 0:
        data = json.loads(get_storcli_json('/cALL/eALL/sALL show all J'))
        # NOTE(review): assumes the controller index equals the controller's
        # position in the Controllers list - confirm for multi-controller hosts.
        drive_info = data['Controllers'][controller_index]['Response Data']
        for physical_drive in response['PD LIST']:
            _add_physical_drive_metrics(controller_index, physical_drive, drive_info)


def _add_virtual_drive_metrics(controller_index, virtual_drive):
    """Record the info and status metrics of one ``VD LIST`` entry."""
    vd_position = virtual_drive.get('DG/VD')
    # -1 marks a drive group / volume group that could not be determined.
    drive_group, volume_group = -1, -1
    if vd_position and '/' in vd_position:
        drive_group, volume_group = vd_position.split('/')[:2]
    vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(
        controller_index, drive_group, volume_group)
    vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}"'.format(
        virtual_drive.get('Name'), virtual_drive.get('Cache'), virtual_drive.get('TYPE'))
    add_metric('vd_info', vd_info_label, 1)

    # A state abbreviation missing from VD_State is reported as -1 instead of
    # raising KeyError and losing every metric collected so far.
    state = VD_State.__members__.get(virtual_drive.get('State'))
    add_metric('vd_status', vd_baselabel, -1 if state is None else int(state))


def _add_physical_drive_metrics(controller_index, physical_drive, drive_info):
    """Record the info and error-counter metrics of one ``PD LIST`` entry."""
    enclosure, slot = physical_drive.get('EID:Slt').split(':')[:2]
    pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format(
        controller_index, enclosure, slot)
    pd_info_label = pd_baselabel + ',disk_id="{}",interface="{}",media="{}",model="{}"'.format(
        physical_drive.get('DID'), physical_drive.get('Intf'), physical_drive.get('Med'),
        physical_drive.get('Model').strip())

    drive_identifier = 'Drive /c{}/e{}/s{}'.format(controller_index, enclosure, slot)
    try:
        info = drive_info[drive_identifier + ' - Detailed Information']
        state = info[drive_identifier + ' State']
        attributes = info[drive_identifier + ' Device attributes']
        settings = info[drive_identifier + ' Policies/Settings']

        add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
        add_metric('pd_media_errors_total', pd_baselabel, state['Media Error Count'])
        add_metric('pd_other_errors_total', pd_baselabel, state['Other Error Count'])
        add_metric('pd_predictive_errors_total', pd_baselabel,
                   state['Predictive Failure Count'])
        add_metric('pd_smart_alerted', pd_baselabel,
                   int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
        # Only the text before the first '.' of the reported speed is exported.
        add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
        add_metric('pd_device_speed_gbps', pd_baselabel,
                   attributes['Device Speed'].split('.')[0])
        add_metric('pd_commissioned_spare', pd_baselabel,
                   int(settings['Commissioned Spare'] == 'Yes'))
        add_metric('pd_emergency_spare', pd_baselabel,
                   int(settings['Emergency Spare'] == 'Yes'))
        pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision'])
    except KeyError:
        # Best effort: detailed data is optional, so a missing key only stops
        # the remaining detail metrics; pd_info below is still recorded.
        pass
    add_metric('pd_info', pd_info_label, 1)
def add_metric(name, labels, value):
    """Append one measurement for metric *name* to the module-level ``metric_list``.

    Args:
        name: Metric name without the exporter prefix.
        labels: Label pairs already formatted as a string.
        value: The sample value.
    """
    measurement = dict(labels=labels, value=value)
    metric_list[name].append(measurement)
def print_all_metrics(metrics):
    """Print every collected metric in the Prometheus text exposition format.

    Args:
        metrics: Mapping of metric name (without prefix) to a list of
            measurements, each a dict with a ``labels`` string and a ``value``.
    """
    for metric, measurements in metrics.items():
        # Prefix and name must be joined without a separator to form one
        # valid metric identifier.
        full_name = metric_prefix + metric
        print('# HELP {} MegaRAID {}'.format(full_name, metric.replace('_', ' ')))
        print('# TYPE {} gauge'.format(full_name))
        for measurement in measurements:
            # Doubled braces emit the literal { } around the label pairs.
            print('{}{{{}}} {}'.format(
                full_name, measurement['labels'], measurement['value']))
def get_storcli_json(storcli_args):
    """Get storcli output in JSON format.

    Args:
        storcli_args: Argument string handed to StorCLI, for example
            ``'/cALL show all J'``.

    Returns:
        StorCLI's standard output decoded as UTF-8 text.

    Raises:
        SystemExit: With exit code 1 when the module-level ``storcli_path``
            is not an executable file.
    """
    # Check if storcli is installed and executable
    if not (os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK)):
        raise SystemExit(1)
    # Split only the arguments so a binary path containing spaces stays intact.
    storcli_cmd = [storcli_path] + shlex.split(storcli_args)
    proc = subprocess.Popen(
        storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output_json = proc.communicate()[0]
    return output_json.decode("utf-8")
if __name__ == "__main__":
    # Build the command line: the StorCLI location plus a --version flag.
    PARSER = argparse.ArgumentParser(
        description=DESCRIPTION, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    PARSER.add_argument(
        '--storcli_path', default='/opt/MegaRAID/storcli/storcli64', help='path to StorCLi binary')
    PARSER.add_argument('--version', action='version', version='%(prog)s {}'.format(VERSION))
    ARGS = PARSER.parse_args()

    main(ARGS)