diff --git a/bin/check_megacli b/bin/check_megacli index 7b5cbd5..edc1653 100644 --- a/bin/check_megacli +++ b/bin/check_megacli @@ -2,6 +2,7 @@ # Example script using pymegacli which is suitable for invocation by nagios +from __future__ import print_function import argparse import os import sys @@ -31,11 +32,39 @@ def main(): action='append', help='Subsystems to check. If not passed, defaults to all' ) + parser.add_argument( + '--led', + action='store_true', + help='Use libstoragemgmt (megaraid plugin) to toggle the fault led on unhealthy disks' + ) + parser.set_defaults(led=False) args = parser.parse_args() if os.geteuid() != 0: parser.error('Must run as root!') + if args.led: + try: + import lsm + try: + import lsm.plugin.megaraid + HAVE_LSM_MEGARAID = True + except: + HAVE_LSM_MEGARAID = False + print( + "--led specified but libstoragemgmt megaraid plugin was not found, install like so on Fedora/CentOS... :\n" + "\tyum install libstoragemgmt-megaraid-plugin", + file=sys.stderr + ) + except ImportError: + HAVE_LSM_MEGARAID = False + print( + "--led specified but libstoragemgmt module was not found, install like so on Fedora/CentOS... :\n" + "\tyum install libstoragemgmt libstoragemgmt-megaraid-plugin", + file=sys.stderr + ) + + checks = {} for check in CHECKS: checks[check] = (not args.check or check in args.check) @@ -52,12 +81,14 @@ def main(): def check_component(component): if component.healthy: messages[OK].append('%s is healthy' % component.identifier) + return OK else: for message in component.health_messages: messages[CRITICAL].append('%s %s' % ( component.identifier, message )) + return CRITICAL controllers = list(connection.controllers) if not controllers: @@ -66,10 +97,19 @@ def main(): for controller in controllers: if checks['PD']: for disk in controller.PDs: - check_component(disk) + disk_health = check_component(disk) + if args.led and HAVE_LSM_MEGARAID: + try: + if CRITICAL == disk_health: + lsm.LocalDisk.fault_led_on(disk.devnode) + else: + lsm.LocalDisk.fault_led_off(disk.devnode) + except: + print('Couldn\'t manipulate enclosure LED', file=sys.stderr) + if checks['LD']: for logical_device in controller.LDs: - check_component(disk) + check_component(logical_device) if 'WriteBack' not in logical_device['Current Cache Policy']: messages[WARNING].append('%s has cache policy %s, which does not include WriteBack' % ( logical_device.identifier, @@ -82,16 +122,16 @@ def main(): messages[WARNING].append('%s is in learn cycle' % bbu.identifier) if messages[CRITICAL]: - print 'CRITICAL: %s' % '; '.join(messages[CRITICAL]) + print('CRITICAL: %s' % '; '.join(messages[CRITICAL])) return CRITICAL elif messages[WARNING]: - print 'WARNING: %s' % '; '.join(messages[WARNING]) + print('WARNING: %s' % '; '.join(messages[WARNING])) return WARNING elif messages[UNKNOWN]: - print 'UNKNOWN: %s' % '; '.join(messages[UNKNOWN]) + print('UNKNOWN: %s' % '; '.join(messages[UNKNOWN])) return UNKNOWN else: - print 'OK: %s' % '; '.join(messages[OK]) + print('OK: %s' % '; '.join(messages[OK])) return OK diff --git a/pymegacli/components.py b/pymegacli/components.py index f46d7a1..d0d7ede 100644 --- a/pymegacli/components.py +++ b/pymegacli/components.py @@ -1,6 +1,8 @@ import pipes import subprocess import re +import os +import glob from .parser import BlockParser from .parser import bail_on @@ -131,20 +133,29 @@ class Disk(Component): 'Predictive Failure Count', ) ERROR_BOOL_KEYS = ('Drive has flagged a S.M.A.R.T alert', ) - REQUIRED_FIELDS = ('Enclosure Device ID', 'Slot Number') + REQUIRED_FIELDS = ('Enclosure Device ID', 'Slot Number', 'WWN') PARSER = BlockParser(rules=[ once_per_block(colon_field('Enclosure Device ID', int_or_na)), rule(colon_field('Slot Number', int)), + rule(colon_field('WWN', str)), rule(colon_field('Other Error Count', int)), rule(colon_field('Predictive Failure Count', int)), rule(colon_field('Media Error Count', int)), rule(colon_field('Drive has flagged a S.M.A.R.T alert', yesnobool)), ], default_constructor=colon_field(None, str)) - def __init__(self, enclosure_id, slot_number, parent, props=None): + def __init__(self, enclosure_id, slot_number, wwn, parent, props=None): self.enclosure_id = enclosure_id self.slot_number = slot_number + self.wwn = wwn.lower() + disk_by_id_path_glob = "/dev/disk/by-id/wwn-0x%s?" % self.wwn[:-1] + disk_by_id_path_glob_results = glob.glob(disk_by_id_path_glob) + if len(disk_by_id_path_glob_results) == 1: + self.linux_disk_by_id = disk_by_id_path_glob_results[0] + else: + self.linux_disk_by_id = None + self.thresholds = dict( (k, 0) for k @@ -157,7 +168,14 @@ def set_threshold(self, key, value): @property def identifier(self): - return 'PhysDrv [%d:%d]' % (self.enclosure_id, self.slot_number) + return 'PhysDrv [%d:%d] WWN: %s Dev: %s' % (self.enclosure_id, self.slot_number, self.wwn, self.devnode) + + @property + def devnode(self): + if self.linux_disk_by_id: + return os.path.realpath(self.linux_disk_by_id) + else: + return None @property def health_status(self):