On 8/25/2014 1:03 PM, Les Mikesell wrote:
I just had an IBM in a remote location with a hardware raid1 have both drives go bad. With local machines I probably would have caught it from the drive light before the 2nd one died... What is the state of the art in linux software monitoring for this? Long ago when that box was set up I think the best I could have done was a Java GUI tool that IBM had for their servers - and that seemed like overkill for a simple monitor. Is there anything more lightweight that knows about the underlying drives in a hardware raid set on IBM's - and also recent HP servers?
IF megacli64 works for this raid controller, then I tweaked some python scripts I found online and use these two scripts.. these live in /root/bin as they are only for root's use.
here's the typical output of the first script...
[root@server1 bin]# lsi-raidinfo -- Controllers -- -- ID | Model c0 | LSI MegaRAID SAS 9261-8i
-- Volumes -- -- ID | Type | Size | Status | InProgress volume c0u0 | RAID10 1x2 | 2727G | Optimal | None volume c0u1 | RAID60 1x8 | 16370G | Optimal | None volume c0u2 | RAID60 1x8 | 16370G | Optimal | None
-- Disks -- -- Encl:Slot | vol-span-unit | Model | Status disk 8:0 | 0-0-0 | Z291VTS5ST33000650NS 0003 | Online, Spun Up disk 8:1 | 0-0-1 | Z291VTRPST33000650NS 0003 | Online, Spun Up disk 8:2 | 1-0-0 | Z291VTKWST33000650NS 0003 | Online, Spun Up disk 8:3 | 1-0-1 | Z291VT9YST33000650NS 0003 | Online, Spun Up disk 8:4 | 1-0-2 | Z291VTT6ST33000650NS 0003 | Online, Spun Up disk 8:5 | 1-0-3 | Z291VT6CST33000650NS 0003 | Online, Spun Up disk 8:6 | 1-0-4 | Z291VTLAST33000650NS 0003 | Online, Spun Up disk 8:7 | 1-0-5 | Z291VTK1ST33000650NS 0003 | Online, Spun Up disk 8:8 | 1-0-6 | Z291VTNGST33000650NS 0003 | Online, Spun Up disk 8:9 | 1-0-7 | Z291VTRAST33000650NS 0003 | Online, Spun Up disk 8:10 | 2-0-0 | Z291VV05ST33000650NS 0003 | Online, Spun Up disk 8:11 | 2-0-1 | Z291VTW1ST33000650NS 0003 | Online, Spun Up disk 8:12 | 2-0-2 | Z291VTRLST33000650NS 0003 | Online, Spun Up disk 8:13 | 2-0-3 | Z291VTRXST33000650NS 0003 | Online, Spun Up disk 8:14 | 2-0-4 | Z291VSZGST33000650NS 0003 | Online, Spun Up disk 8:15 | 2-0-5 | Z291VSW1ST33000650NS 0003 | Online, Spun Up disk 8:16 | 2-0-6 | Z291VTB5ST33000650NS 0003 | Online, Spun Up disk 8:17 | 2-0-7 | Z291VSX8ST33000650NS 0003 | Online, Spun Up disk 8:18 | x-x-x | Z291VTS7ST33000650NS 0003 | Hotspare, Spun down disk 8:19 | x-x-x | Z291VT3HST33000650NS 0003 | Hotspare, Spun down
first script parses megacli64's gawdawful output format....
/root/bin/lsi-raidinfo: #!/usr/bin/python
# megaclisas-status 0.6 # renamed lsi-raidinfo # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Pulse 2; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. # # Copyright (C) 2007-2009 Adam Cecile (Le_Vert)
## modified by johnpuskar@gmail.com 08/14/11 # fixed for LSI 9285-8e on Openfiler
## modified by pierce@hogranch.com 2012-01-05 # fixed for newer version of megacli output on RHEL6/CentOS6 # output format extended to show raid span-unit and rebuild % complete
import os import re import sys
if len(sys.argv) > 2: print 'Usage: lsi-raidinfo [-d]' sys.exit(1)
# if argument -d, only print disk info printarray = True printcontroller = True if len(sys.argv) > 1: if sys.argv[1] == '-d': printarray = False printcontroller = False else: print 'Usage: lsi-raidinfo [-d]' sys.exit(1)
# Get command output def getOutput(cmd): output = os.popen(cmd) lines = [] for line in output: if not re.match(r'^$',line.strip()): lines.append(line.strip()) return lines
def returnControllerNumber(output): for line in output: if re.match(r'^Controller Count.*$',line.strip()): return int(line.split(':')[1].strip().strip('.'))
def returnControllerModel(output): for line in output: if re.match(r'^Product Name.*$',line.strip()): return line.split(':')[1].strip()
def returnArrayNumber(output): i = 0 for line in output: if re.match(r'^Virtual (Drive|Disk).*$',line.strip()): i += 1 return i
def returnArrayInfo(output,controllerid,arrayid): id = 'c'+str(controllerid)+'u'+str(arrayid) # print 'DEBUG: id = '+str(id) operationlinennumber = False linenumber = 0 units = 1 type = 'JBOD' span = 0 size = 0 for line in output: if re.match(r'^RAID Level.*$',line.strip()): type = line.strip().split(':')[1].strip() type = 'RAID' + type.split(',')[0].split('-')[1].strip() # print 'debug: type = '+str(type) if re.match(r'^Number.*$',line.strip()): units = line.strip().split(':')[1].strip() if re.match(r'^Span Depth.*$',line.strip()): span = line.strip().split(':')[1].strip() if re.match(r'^Size.*$',line.strip()): # Size reported in MB if re.match(r'^.*MB$',line.strip().split(':')[1]): size = line.strip().split(':')[1].strip('MB').strip() size = str(int(round((float(size) / 1000))))+'G' # Size reported in TB elif re.match(r'^.*TB$',line.strip().split(':')[1]): size = line.strip().split(':')[1].strip('TB').strip() size = str(int(round((float(size) * 1000))))+'G' # Size reported in GB (default) else: size = line.strip().split(':')[1].strip('GB').strip() size = str(int(round((float(size)))))+'G' if re.match(r'^State.*$',line.strip()): state = line.strip().split(':')[1].strip() if re.match(r'^Ongoing Progresses.*$',line.strip()): operationlinennumber = linenumber linenumber += 1 if operationlinennumber: inprogress = output[operationlinennumber+1] else: inprogress = 'None' if span > 1: type = type+'0' type = type + ' ' + str(span) + 'x' + str(units) return [id,type,size,state,inprogress]
def returnDiskInfo(output,controllerid,currentarrayid): arrayid = False oldarrayid = False olddiskid = False table = [] state = 'Offline' model = 'Unknown' enclnum = 'Unknown' slotnum = 'Unknown' enclsl = 'Unknown'
firstDisk = True for line in output: if re.match(r'Firmware state: .*$',line.strip()): state = line.split(':')[1].strip() if re.match(r'Rebuild',state): cmd2 = '/opt/MegaRAID/MegaCli/MegaCli64 pdrbld showprog physdrv['+str(enclnum)+':'+str(slotnum)+'] a'+str(controllerid)+' nolog' ll = getOutput(cmd2) state += ' completed ' + re.sub(r'Rebuild Progress.*Completed', '', ll[0]).strip(); if re.match(r'Slot Number: .*$',line.strip()): slotnum = line.split(':')[1].strip() if re.match(r'Inquiry Data: .*$',line.strip()): model = line.split(':')[1].strip() model = re.sub(' +', ' ', model) model = re.sub('Hotspare Information', '', model).strip() #remove bogus output from firmware 12.12 if re.match(r"(Drive|Disk)'s postion: .*$",line.strip()): spans = line.split(',') span = re.sub(r"(Drive|Disk).*DiskGroup:", '', spans[0]).strip()+'-' span += spans[1].split(':')[1].strip()+'-' span += spans[2].split(':')[1].strip() if re.match(r'Enclosure Device ID: [0-9]+$',line.strip()): if firstDisk == True: firstDisk = False else: enclsl = str(enclnum)+':'+str(slotnum) table.append([str(enclsl), span, model, state]) span = 'x-x-x' enclnum = line.split(':')[1].strip() # Last disk of last array enclsl = str(enclnum)+':'+str(slotnum) table.append([str(enclsl), span, model, state]) arraytable = [] for disk in table: arraytable.append(disk) return arraytable
cmd = '/opt/MegaRAID/MegaCli/MegaCli64 adpcount nolog' output = getOutput(cmd) controllernumber = returnControllerNumber(output)
bad = False
# List available controller if printcontroller: print '-- Controllers --' print '-- ID | Model' controllerid = 0 while controllerid < controllernumber: cmd = '/opt/MegaRAID/MegaCli/MegaCli64 adpallinfo a'+str(controllerid)+' nolog' output = getOutput(cmd) controllermodel = returnControllerModel(output) print 'c'+str(controllerid)+' | '+controllermodel controllerid += 1 print ''
if printarray: controllerid = 0 print '-- Volumes --' print '-- ID | Type | Size | Status | InProgress' # print 'controller number'+str(controllernumber) while controllerid < controllernumber: arrayid = 0 cmd = '/opt/MegaRAID/MegaCli/MegaCli64 ldinfo lall a'+str(controllerid)+' nolog' output = getOutput(cmd) arraynumber = returnArrayNumber(output) # print 'array number'+str(arraynumber) while arrayid < arraynumber: cmd = '/opt/MegaRAID/MegaCli/MegaCli64 ldinfo l'+str(arrayid)+' a'+str(controllerid)+' nolog' # print 'DEBUG: running '+str(cmd) output = getOutput(cmd) # print 'DEBUG: output '+str(output) arrayinfo = returnArrayInfo(output,controllerid,arrayid) print 'volume '+arrayinfo[0]+' | '+arrayinfo[1]+' | '+arrayinfo[2]+' | '+arrayinfo[3]+' | '+arrayinfo[4] if not arrayinfo[3] == 'Optimal': bad = True arrayid += 1 controllerid += 1 print ''
print '-- Disks --' print '-- Encl:Slot | vol-span-unit | Model | Status'
controllerid = 0 while controllerid < controllernumber: arrayid = 0 cmd = '/opt/MegaRAID/MegaCli/MegaCli64 ldinfo lall a'+str(controllerid)+' nolog' output = getOutput(cmd) arraynumber = returnArrayNumber(output) while arrayid<arraynumber: #grab disk arrayId info cmd = '/opt/MegaRAID/MegaCli/MegaCli64 pdlist a'+str(controllerid)+' nolog' #print 'debug: running '+str(cmd) output = getOutput(cmd) arraydisk = returnDiskInfo(output,controllerid,arrayid)
for array in arraydisk: print 'disk '+array[0]+' | '+array[1]+' | '+array[2]+' | '+array[3] arrayid += 1 controllerid += 1
if bad: print '\nThere is at least one disk/array in a NOT OPTIMAL state.' sys.exit(1) ******************************************************************************************************
second script checks the output of that first one and summarizes errors only.
/root/bin/lsi-checkraid:
#!/usr/bin/python
# created by johnpuskar@gmail.com on 08/14/11 # rev 01
import os import re import sys
if len(sys.argv) > 1: print 'Usage: accepts stdin from lsi-raidinfo' sys.exit(1)
blnBadDisk = False infile = sys.stdin for line in infile: # print 'DEBUG!! checking line:'+str(line) if re.match(r'disk .*$',line.strip()): if re.match(r'^((?!Online, Spun Up|Online, Spun down|Hotspare, Spun Up|Hotspare, Spun down|Unconfigured(good), Spun Up).)*$',line .strip()): blnBadDisk = True badLine = line # print 'DEBUG!! bad disk found!' if re.match(r'volume ',line.strip()): if re.match(r'^((?!Optimal).)*$',line.strip()): # print 'DEBUG!! bad vol found!' blnBadDisk = True badLine = line
if blnBadDisk == True: print 'RAID ERROR' # print badLine else: print 'RAID CLEAN'
******************************************************************************************************
finally, this script uses those and sends email alerts. its run from crontab hourly as root.
/root/bin/lsi-emailalerts
#!/bin/sh
MAILTOADDR=root HOST=$(hostname -s| tr [a-z] [A-Z])
#get megaraid status info /root/bin/lsi-raidinfo | tee /tmp/lsi-raidinfo.txt | /root/bin/lsi-checkraid > /tmp/lsi-checkraid.txt
#check megaraid status info if grep -qE "RAID ERROR" /tmp/lsi-checkraid.txt ; then cat /tmp/lsi-raidinfo.txt | mailx -s "$HOST Warning: failed disk or degraded array" $MAILTOADDR fi
#rm -f /tmp/lsi-raidinfo.txt #rm -f /tmp/lsi-checkraid.txt exit 0
******************************************************************************************************