[CentOS] Hardware raid health?

Mon Aug 25 20:39:45 UTC 2014
John R Pierce <pierce at hogranch.com>

On 8/25/2014 1:03 PM, Les Mikesell wrote:
> I just had an IBM in a remote location with a hardware raid1 have both
> drives go bad.  With local machines I probably would have caught it
> from the drive light before the 2nd one died...  What is the state of
> the art in linux software monitoring for this?   Long ago when that
> box was set up I think the best I could have done was a Java GUI tool
> that IBM had for their servers - and that seemed like overkill for a
> simple monitor.    Is there anything more lightweight that knows about
> the underlying drives in a hardware raid set on IBM's - and also
> recent HP servers?
>


If megacli64 works for this RAID controller: I tweaked some Python 
scripts I found online, and I use the two scripts below.  They live in 
/root/bin because they are only for root's use.

here's the typical output of the first script...

[root at server1 bin]# lsi-raidinfo
-- Controllers --
-- ID | Model
c0 | LSI MegaRAID SAS 9261-8i

-- Volumes --
-- ID | Type | Size | Status | InProgress
volume c0u0 | RAID10 1x2 | 2727G | Optimal | None
volume c0u1 | RAID60 1x8 | 16370G | Optimal | None
volume c0u2 | RAID60 1x8 | 16370G | Optimal | None

-- Disks --
-- Encl:Slot | vol-span-unit | Model | Status
disk 8:0 | 0-0-0 | Z291VTS5ST33000650NS 0003 | Online, Spun Up
disk 8:1 | 0-0-1 | Z291VTRPST33000650NS 0003 | Online, Spun Up
disk 8:2 | 1-0-0 | Z291VTKWST33000650NS 0003 | Online, Spun Up
disk 8:3 | 1-0-1 | Z291VT9YST33000650NS 0003 | Online, Spun Up
disk 8:4 | 1-0-2 | Z291VTT6ST33000650NS 0003 | Online, Spun Up
disk 8:5 | 1-0-3 | Z291VT6CST33000650NS 0003 | Online, Spun Up
disk 8:6 | 1-0-4 | Z291VTLAST33000650NS 0003 | Online, Spun Up
disk 8:7 | 1-0-5 | Z291VTK1ST33000650NS 0003 | Online, Spun Up
disk 8:8 | 1-0-6 | Z291VTNGST33000650NS 0003 | Online, Spun Up
disk 8:9 | 1-0-7 | Z291VTRAST33000650NS 0003 | Online, Spun Up
disk 8:10 | 2-0-0 | Z291VV05ST33000650NS 0003 | Online, Spun Up
disk 8:11 | 2-0-1 | Z291VTW1ST33000650NS 0003 | Online, Spun Up
disk 8:12 | 2-0-2 | Z291VTRLST33000650NS 0003 | Online, Spun Up
disk 8:13 | 2-0-3 | Z291VTRXST33000650NS 0003 | Online, Spun Up
disk 8:14 | 2-0-4 | Z291VSZGST33000650NS 0003 | Online, Spun Up
disk 8:15 | 2-0-5 | Z291VSW1ST33000650NS 0003 | Online, Spun Up
disk 8:16 | 2-0-6 | Z291VTB5ST33000650NS 0003 | Online, Spun Up
disk 8:17 | 2-0-7 | Z291VSX8ST33000650NS 0003 | Online, Spun Up
disk 8:18 | x-x-x | Z291VTS7ST33000650NS 0003 | Hotspare, Spun down
disk 8:19 | x-x-x | Z291VT3HST33000650NS 0003 | Hotspare, Spun down


first script parses megacli64's gawdawful output format....

/root/bin/lsi-raidinfo:
#!/usr/bin/python

# megaclisas-status 0.6
# renamed lsi-raidinfo
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Pulse 2; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
# Copyright (C) 2007-2009 Adam Cecile (Le_Vert)

## modified by johnpuskar at gmail.com 08/14/11
# fixed for LSI 9285-8e on Openfiler

## modified by pierce at hogranch.com 2012-01-05
# fixed for newer version of megacli output on RHEL6/CentOS6
# output format extended to show raid span-unit and rebuild % complete

import os
import re
import sys

# Command line: no argument prints every section, -d prints the disk
# table only; anything else is a usage error.
cliargs = sys.argv[1:]
if len(cliargs) > 1 or (cliargs and cliargs[0] != '-d'):
    print('Usage: lsi-raidinfo [-d]')
    sys.exit(1)

# With -d the controller and volume sections are suppressed.
printarray = not cliargs
printcontroller = not cliargs

# Get command output
def getOutput(cmd):
    """Run a shell command and return its non-blank output lines.

    cmd is passed to os.popen, so it is executed through the shell.
    Returns a list holding one whitespace-stripped string for every
    non-empty line the command wrote to standard output.
    """
    pipe = os.popen(cmd)
    try:
        stripped = [line.strip() for line in pipe]
    finally:
        # Close the pipe explicitly so the child process is reaped here
        # rather than whenever the object happens to be collected.
        pipe.close()
    return [line for line in stripped if line]

def returnControllerNumber(output):
    """Return the adapter count found in `MegaCli64 adpcount` output.

    output is a list of output lines.  The count is read from the first
    line starting with "Controller Count" (e.g. "Controller Count: 1.").
    Returns 0 when no such line is present, so callers can always use
    the result as a loop bound.
    """
    for rawline in output:
        line = rawline.strip()
        if line.startswith('Controller Count'):
            # The value carries a trailing full stop: "1." -> 1
            return int(line.split(':')[1].strip().strip('.'))
    return 0

def returnControllerModel(output):
    """Return the controller model found in `MegaCli64 adpallinfo` output.

    output is a list of output lines.  The model is everything after the
    first colon of the first line starting with "Product Name".
    Returns 'Unknown' when no such line is present, so the result can
    always be concatenated into the report.
    """
    for rawline in output:
        line = rawline.strip()
        if line.startswith('Product Name'):
            # Split once only, so a model name that itself contains a
            # colon is not truncated.
            return line.split(':', 1)[1].strip()
    return 'Unknown'

def returnArrayNumber(output):
    """Count the lines of output that start a "Virtual Drive"/"Virtual Disk" entry.

    output is a list of `MegaCli64 ldinfo` output lines; each line is
    stripped before it is tested.
    """
    header = re.compile(r'^Virtual (Drive|Disk).*$')
    return len([entry for entry in output if header.match(entry.strip())])

def returnArrayInfo(output,controllerid,arrayid):
    """Summarise one logical drive from `MegaCli64 ldinfo` output.

    output        -- list of output lines for a single logical drive
    controllerid  -- adapter index, used only to build the volume id
    arrayid       -- logical drive index, used only to build the volume id

    Returns [id, type, size, state, inprogress], all strings:
      id          'c<controllerid>u<arrayid>'
      type        'RAID<level> <span depth>x<drives>' ('JBOD ...' when no
                  "RAID Level" line is present); a '0' is appended to the
                  level only when the span depth is greater than 1
      size        size rounded to whole gigabytes, e.g. '2727G'
      state       value of the "State" line, 'Unknown' when absent
      inprogress  the line following "Ongoing Progresses", else 'None'
    """
    volumeid = 'c' + str(controllerid) + 'u' + str(arrayid)
    raidtype = 'JBOD'
    units = 1
    span = 0
    size = '0G'
    state = 'Unknown'
    progressline = None

    for linenumber, rawline in enumerate(output):
        line = rawline.strip()
        if re.match(r'^RAID Level.*$', line):
            # "RAID Level : Primary-1, Secondary-0, ..." -> "RAID1"
            primary = line.split(':')[1].split(',')[0]
            raidtype = 'RAID' + primary.split('-')[1].strip()
        if re.match(r'^Number.*$', line):
            units = line.split(':')[1].strip()
        if re.match(r'^Span Depth.*$', line):
            span = line.split(':')[1].strip()
        if re.match(r'^Size.*$', line):
            value = line.split(':')[1]
            if value.endswith('MB'):
                gigabytes = float(value.strip('MB').strip()) / 1000
            elif value.endswith('TB'):
                gigabytes = float(value.strip('TB').strip()) * 1000
            else:
                # Size reported in GB (default)
                gigabytes = float(value.strip('GB').strip())
            size = str(int(round(gigabytes))) + 'G'
        if re.match(r'^State.*$', line):
            state = line.split(':')[1].strip()
        if re.match(r'^Ongoing Progresses.*$', line):
            progressline = linenumber

    # The operation in progress is described on the line after the
    # "Ongoing Progresses" header.  Test against None (index 0 is a valid
    # position) and make sure that following line really exists.
    inprogress = 'None'
    if progressline is not None and progressline + 1 < len(output):
        inprogress = output[progressline + 1]

    # The span depth is parsed as text, so compare it numerically;
    # comparing the string itself with 1 marked every volume as spanned.
    try:
        spanned = int(span) > 1
    except ValueError:
        spanned = False
    if spanned:
        raidtype = raidtype + '0'
    raidtype = raidtype + ' ' + str(span) + 'x' + str(units)
    return [volumeid, raidtype, size, state, inprogress]

def returnDiskInfo(output,controllerid,currentarrayid):
    """Parse `MegaCli64 pdlist` output into one row per physical disk.

    output          -- list of pdlist output lines
    controllerid    -- adapter index; only used when rebuild progress has
                       to be queried for a rebuilding disk
    currentarrayid  -- not used; kept so existing callers keep working

    A disk starts at each "Enclosure Device ID: <number>" line.  Returns a
    list of [encl:slot, vol-span-unit, model, state] string lists in the
    order the disks appear, or an empty list when output describes no
    disk.  A disk without a position line gets 'x-x-x' as vol-span-unit.
    """
    table = []
    seendisk = False
    enclnum = 'Unknown'
    slotnum = 'Unknown'
    span = 'x-x-x'
    model = 'Unknown'
    state = 'Offline'

    for rawline in output:
        line = rawline.strip()
        if re.match(r'Enclosure Device ID: [0-9]+$', line):
            # A new disk begins: flush the previous one first.
            if seendisk:
                table.append([str(enclnum) + ':' + str(slotnum), span, model, state])
            seendisk = True
            enclnum = line.split(':')[1].strip()
            # Reset the per-disk fields so values of the previous disk
            # cannot leak into this one when a line is missing.
            slotnum = 'Unknown'
            span = 'x-x-x'
            model = 'Unknown'
            state = 'Offline'
        if re.match(r'Slot Number: .*$', line):
            slotnum = line.split(':')[1].strip()
        if re.match(r"(Drive|Disk)'s postion: .*$", line):
            # "Drive's postion: DiskGroup: 0, Span: 0, Arm: 1" -> "0-0-1"
            # ("postion" is matched as spelled in this pattern.)
            spans = line.split(',')
            span = re.sub(r"(Drive|Disk).*DiskGroup:", '', spans[0]).strip() + '-'
            span += spans[1].split(':')[1].strip() + '-'
            span += spans[2].split(':')[1].strip()
        if re.match(r'Firmware state: .*$', line):
            state = line.split(':')[1].strip()
            if re.match(r'Rebuild', state):
                cmd2 = ('/opt/MegaRAID/MegaCli/MegaCli64 pdrbld showprog physdrv['
                        + str(enclnum) + ':' + str(slotnum) + '] a'
                        + str(controllerid) + ' nolog')
                progress = getOutput(cmd2)
                # Only decorate the state when the progress query
                # actually produced output.
                if progress:
                    state += ' completed ' + re.sub(
                        r'Rebuild Progress.*Completed', '', progress[0]).strip()
        if re.match(r'Inquiry Data: .*$', line):
            model = line.split(':')[1].strip()
            model = re.sub(' +', ' ', model)
            # remove bogus output from firmware 12.12
            model = re.sub('Hotspare Information', '', model).strip()

    # Last disk of the listing
    if seendisk:
        table.append([str(enclnum) + ':' + str(slotnum), span, model, state])
    return table

# Every query below is issued through this binary.
megacli = '/opt/MegaRAID/MegaCli/MegaCli64'

cmd = megacli + ' adpcount nolog'
output = getOutput(cmd)
# Fall back to 0 so the loops below are simply skipped when the adapter
# count could not be determined.
controllernumber = returnControllerNumber(output) or 0

# Set when a volume is not 'Optimal'; decides the exit status at the end.
bad = False

# List available controllers
if printcontroller:
    print('-- Controllers --')
    print('-- ID | Model')
    for controllerid in range(controllernumber):
        cmd = megacli + ' adpallinfo a' + str(controllerid) + ' nolog'
        output = getOutput(cmd)
        controllermodel = returnControllerModel(output)
        # str() keeps the report going even if no model was found.
        print('c' + str(controllerid) + ' | ' + str(controllermodel))
    print('')

# List the logical drives (volumes) of every controller
if printarray:
    print('-- Volumes --')
    print('-- ID | Type | Size | Status | InProgress')
    for controllerid in range(controllernumber):
        cmd = megacli + ' ldinfo lall a' + str(controllerid) + ' nolog'
        output = getOutput(cmd)
        arraynumber = returnArrayNumber(output)
        for arrayid in range(arraynumber):
            cmd = (megacli + ' ldinfo l' + str(arrayid)
                   + ' a' + str(controllerid) + ' nolog')
            output = getOutput(cmd)
            arrayinfo = returnArrayInfo(output, controllerid, arrayid)
            print('volume ' + ' | '.join([str(field) for field in arrayinfo]))
            if not arrayinfo[3] == 'Optimal':
                bad = True
    print('')

print('-- Disks --')
print('-- Encl:Slot | vol-span-unit | Model | Status')

# pdlist reports every physical disk of a controller in one go, so it is
# queried exactly once per controller, independent of how many volumes
# are configured on it.
for controllerid in range(controllernumber):
    cmd = megacli + ' pdlist a' + str(controllerid) + ' nolog'
    output = getOutput(cmd)
    for disk in returnDiskInfo(output, controllerid, 0):
        print('disk ' + disk[0] + ' | ' + disk[1] + ' | ' + disk[2] + ' | ' + disk[3])

if bad:
    print('\nThere is at least one disk/array in a NOT OPTIMAL state.')
    sys.exit(1)
******************************************************************************************************

second script checks the output of that first one and summarizes errors 
only.

/root/bin/lsi-checkraid:

#!/usr/bin/python

# created by johnpuskar at gmail.com on 08/14/11
# rev 01

import os
import re
import sys

# Status texts that mark a 'disk' line of the lsi-raidinfo report as healthy.
# NOTE(review): 'Unconfigured(good), Spun down' is not listed and is
# therefore reported as an error - confirm whether that is intended.
GOOD_DISK_STATES = (
    'Online, Spun Up',
    'Online, Spun down',
    'Hotspare, Spun Up',
    'Hotspare, Spun down',
    'Unconfigured(good), Spun Up',
)


def findBadLines(lines):
    """Return the lsi-raidinfo report lines that describe a problem.

    lines is any iterable of report lines (e.g. sys.stdin).  A line
    starting with 'disk ' is bad when it contains none of
    GOOD_DISK_STATES; a line starting with 'volume ' is bad when it does
    not contain 'Optimal'.  All other lines are ignored.  The returned
    lines are whitespace-stripped and in input order.
    """
    badlines = []
    for rawline in lines:
        line = rawline.strip()
        if line.startswith('disk '):
            if not any(state in line for state in GOOD_DISK_STATES):
                badlines.append(line)
        if line.startswith('volume '):
            if 'Optimal' not in line:
                badlines.append(line)
    return badlines


if __name__ == '__main__':
    if len(sys.argv) > 1:
        print('Usage: accepts stdin from lsi-raidinfo')
        sys.exit(1)

    if findBadLines(sys.stdin):
        print('RAID ERROR')
    else:
        print('RAID CLEAN')

******************************************************************************************************

Finally, this script uses those two and sends email alerts.  It's run 
hourly from crontab as root.

/root/bin/lsi-emailalerts

#!/bin/sh
# Mail the full lsi-raidinfo report to $MAILTOADDR whenever lsi-checkraid
# flags a failed disk or a degraded array.

MAILTOADDR=root
# Upper-cased short hostname for the mail subject.  The character classes
# are quoted so the shell cannot expand them as filename patterns.
HOST=$(hostname -s | tr '[:lower:]' '[:upper:]')

#get megaraid status info
# The report and the verdict are held in variables instead of fixed-name
# files under /tmp, so nothing predictable is written to a world-writable
# directory by this root job and no cleanup is needed.
REPORT=$(/root/bin/lsi-raidinfo)
VERDICT=$(printf '%s\n' "$REPORT" | /root/bin/lsi-checkraid)

#check megaraid status info
case $VERDICT in
  *"RAID ERROR"*)
    printf '%s\n' "$REPORT" |
      mailx -s "$HOST Warning: failed disk or degraded array" "$MAILTOADDR"
    ;;
esac

exit 0

******************************************************************************************************




-- 
john r pierce                                      37N 122W
somewhere on the middle of the left coast