This page contains scripts, links to other sites and "howto" docs that the staff at Open Technologies Inc. commonly use. You are free to use this material as you wish, but we take NO responsibility for your proper or improper use of this material. See Limitation of Liability below.
-----------------------------------------------------------------------------
#!/bin/sh
#
# Description: Basic script to monitor filesystem levels, memory and swap
#              usage, cpu and workload levels, and looks for warning and
#              errors in /var/adm/messages. The script sends an email to
#              the defined user/group. Emails are only sent once a day for
#              repetitive alerts, but sent more often if they are cleared.
#              Also checks to see if the system was recently rebooted.
#
# Created by: Phillip Fiedler, Open Technologies, Inc.
# Created on: 08-17-09, Version 0.1
# Updated on: 08-17-09, Version 0.2
# Updated on: 08-24-09, Version 0.3
# Updated on: 08-26-09, Version 0.4
# Updated on: 09-01-09, Version 0.5
#
# Portability: only backticks, "[ ]" tests and plain functions are used, so
#              the script does not depend on bash/ksh extensions.
#              NOTE(review): the tools called (prtconf, swap -s, zpool,
#              /var/adm/messages) look Solaris-specific - confirm before
#              running elsewhere.
#===========================================================================
# Location to run the script (where the "alerts" directory is created)
runpath=/opt/scripts
# Email address for the alerts
notify=otadmin@open-tech.com
# Send alert when idle CPU is less than X percent for a minute
idleLevel=10
# Send alert when maximum file system is greater than X percent
maxSize=85
# Send alert when free memory is less than X percent
memFree=5
# Send alert when free swap space is less than X percent
swapFree=20
# Send alert when 5 min CPU workload is greater than or equal to X value
workLevel=5
#===========================================================================

# Log file scanned for warnings and errors
messagesLog=/var/adm/messages

if [ ! -d "$runpath" ]; then mkdir -p "$runpath"; fi
# Every alert file below is a relative path, so refuse to continue from the
# wrong directory.
cd "$runpath" || exit 1
if [ ! -d alerts ]; then mkdir alerts; fi

now=`date '+%y%m%d%H%M'`
# Suppression deadline written into the alert files. TZ=CST-19 is a zone 19
# hours east of UTC, i.e. about a day ahead of US Central time; on a host in
# another time zone the suppression window is not 24 hours.
tomorrow=`TZ=CST-19 date +%y%m%d%H%M`

host=`hostname`
hostTag="$host.`domainname`"

# A literal newline, used to build multi-line mail bodies without relying on
# printf escape processing.
nl='
'

#---------------------------------------------------------------------------
# Helpers
#---------------------------------------------------------------------------

# is_uint VALUE
# Succeeds when VALUE is a non-empty string of decimal digits, so it is safe
# to hand to the integer operators of "[ ]".
is_uint() {
    case "$1" in
        ''|*[!0-9]*) return 1 ;;
    esac
    return 0
}

# send_alert SUBJECT BODY
# Mails BODY to $notify. The body is passed as printf *data*, never as the
# format string, so "%" and "\" in it are delivered unchanged.
send_alert() {
    # $notify is intentionally unquoted so that a space-separated list of
    # recipients still works.
    # shellcheck disable=SC2086
    printf '%s\n' "$2" | mailx -s "$1" $notify
}

# arm_alert FILE
# Records the suppression deadline so the same alert is not mailed again
# until it has passed.
arm_alert() {
    printf '%s\n' "$tomorrow" > "$1"
}

# alert_ready FILE
# Removes FILE when its deadline has passed (or when its content is not a
# number, which would otherwise suppress the alert forever), then succeeds
# only if no alert file remains, i.e. a new alert may be sent.
alert_ready() {
    if [ -f "$1" ]; then
        stamp=`cat "$1"`
        if is_uint "$stamp"; then
            if [ "$now" -ge "$stamp" ]; then rm -f "$1"; fi
        else
            rm -f "$1"
        fi
    fi
    [ ! -f "$1" ]
}

# pool_report
# Prints one or more lines for every zpool that is not ONLINE; prints
# nothing when all pools are healthy.
pool_report() {
    zpool status | grep "state:" | awk '{print $2}' | while read poolState; do
        if [ "$poolState" != "ONLINE" ]; then
            echo "One of the pools in $host is '$poolState' "
        fi
    done
    # -H/-o select the name and health columns explicitly instead of relying
    # on the health column being the sixth one.
    zpool list -H -o name,health | grep -v "ONLINE" |
        while read poolName poolHealth; do
            printf 'Here is the pool that has a problem:\n%s\n' \
                "$poolName $poolHealth"
        done
}

#===========================================================================
# Description: Used to check CPU idle/usage
alertFile=./alerts/cpu-check.alert
if alert_ready "$alertFile"; then
    # The last line of "vmstat 60 2" covers the 60 second interval; field 22
    # of that line is taken as the idle percentage.
    cpuidle=`/usr/bin/vmstat 60 2 | /usr/bin/tail -1 | awk '{print $22}'`
    if is_uint "$cpuidle"; then
        if [ "$cpuidle" -le "$idleLevel" ]; then
            send_alert "$hostTag - Low CPU Idle" \
                "CPU Idle for the past minute is at or below $idleLevel %, currently at $cpuidle"
            arm_alert "$alertFile"
        fi
    fi
fi

#===========================================================================
# Description: Used to check filesystem usage/full level
alertFile=./alerts/fs-check.alert
# Suppression is decided once, before the loop, so that every filesystem
# over the limit is reported in the same run.
if alert_ready "$alertFile"; then
    # sed keeps only lines that carry an "NN%" capacity followed by a mount
    # point and rewrites them as "NN /mount". This drops the df header line
    # and strips the "%" without depending on awk's FS handling.
    df -h | grep -v "cdrom" | grep -v "mnt" | grep -v "swap" |
        sed -n 's|.* \([0-9][0-9]*\)% *\(/.*\)$|\1 \2|p' |
        while read size fs; do
            if [ "$size" -ge "$maxSize" ]; then
                send_alert "$hostTag - Filesystem: $fs is over $maxSize percent" \
                    "Filesystem: $fs is at $size percent full"
                # Written from inside the loop because a variable set here
                # would not survive the pipeline's subshell.
                arm_alert "$alertFile"
            fi
        done
fi

#===========================================================================
# Description: Used to check free memory
alertFile=./alerts/mem-check.alert
if alert_ready "$alertFile"; then
    # Third field of the prtconf "Mem..." line; multiplied by 1024 below to
    # bring it to the unit of the vmstat column
    # (presumably Megabytes -> Kbytes; verify on the target host).
    totalmem=`/usr/sbin/prtconf | awk '/Mem/ {print $3; exit}'`
    # Field 5 of the last vmstat line is used as the free memory figure.
    freemem=`/usr/bin/vmstat 1 2 | /usr/bin/tail -1 | awk '{print $5}'`
    if is_uint "$totalmem" && is_uint "$freemem" && [ "$totalmem" -gt 0 ]; then
        totalmem=`echo "$totalmem * 1024" | bc`
        pctfree=`echo "$freemem * 100 / $totalmem" | bc`
        if is_uint "$pctfree"; then
            if [ "$pctfree" -le "$memFree" ]; then
                send_alert "$hostTag - Free memory low" \
                    "Total Memory: $totalmem, Free Memory: $freemem, Percent Free: $pctfree"
                arm_alert "$alertFile"
            fi
        fi
    fi
fi

#===========================================================================
# Description: Used to check free swap space
alertFile=./alerts/swap-check.alert
if alert_ready "$alertFile"; then
    # Sample once so both figures come from the same report.
    swapLine=`/usr/sbin/swap -s`
    # Field 9 is the used figure and field 11 the remaining (available)
    # figure; both carry a trailing "k" that is cut off here.
    usedswap=`echo "$swapLine" | awk '{print $9}' | sed 's/k.*//'`
    availswap=`echo "$swapLine" | awk '{print $11}' | sed 's/k.*//'`
    if is_uint "$usedswap" && is_uint "$availswap"; then
        totalswap=`echo "$usedswap + $availswap" | bc`
        # Guard the division below against a zero total.
        if is_uint "$totalswap" && [ "$totalswap" != "0" ]; then
            pctfree=`echo "100 - $usedswap * 100 / $totalswap" | bc`
            if is_uint "$pctfree"; then
                if [ "$pctfree" -le "$swapFree" ]; then
                    send_alert "$hostTag - Low swap space" \
                        "Total Swap: $totalswap, Used Swap: $usedswap, Available Swap: $availswap, Percent Free: $pctfree"
                    arm_alert "$alertFile"
                fi
            fi
        fi
    fi
fi

#===========================================================================
# Description: Used to check for warnings and errors in /var/adm/messages
# This check has its own alert file; sharing the CPU check's file would let
# either alert silence the other.
alertFile=./alerts/messages-check.alert
if alert_ready "$alertFile"; then
    emailBody=""
    # Only lines stamped with today's date ("Mon DD", day space-padded).
    today=`date '+%b %e'`
    found=`grep -i "^$today" "$messagesLog" | grep warn`
    if [ -n "$found" ]; then
        emailBody="WARNING(s) in ${messagesLog}${nl}${found}${nl}"
    fi
    found=`grep -i "^$today" "$messagesLog" | grep error`
    if [ -n "$found" ]; then
        emailBody="${emailBody}${nl}ERROR(s) in ${messagesLog}${nl}${found}${nl}"
    fi
    if [ -n "$emailBody" ]; then
        send_alert "$hostTag - Warn/Error message alert(s)" "$emailBody"
        arm_alert "$alertFile"
    fi
fi

#===========================================================================
# Description: Used to check 5 minute workload on the server
alertFile=./alerts/workload-check.alert
if alert_ready "$alertFile"; then
    # Anchor on the "load average:" label instead of a fixed field number,
    # because the number of words before it changes with the uptime format.
    # The second of the three averages is the 5 minute one named in the
    # workLevel setting.
    workload=`/usr/bin/w -u |
        sed -n 's/.*load average: *[^,]*, *\([0-9.]*\).*/\1/p'`
    # "[ -ge ]" compares integers only, so compare the whole-number part.
    workloadInt=`echo "$workload" | sed 's/\..*//'`
    if is_uint "$workloadInt"; then
        if [ "$workloadInt" -ge "$workLevel" ]; then
            send_alert "$hostTag - Workload over threshold" \
                "Five minute workload was: $workload"
            arm_alert "$alertFile"
        fi
    fi
fi

#===========================================================================
# Description: Used to check if the server was recently rebooted
bootFile=./alerts/uptime-check.boot
bootNow=`who -b`
# An empty result means "who -b" failed; leave the recorded boot time alone.
if [ -n "$bootNow" ]; then
    if [ -f "$bootFile" ]; then
        lastboot=`cat "$bootFile"`
        if [ "$lastboot" != "$bootNow" ]; then
            printf '%s\n' "$bootNow" > "$bootFile"
            send_alert "$hostTag - System reboot detected" \
                "System last rebooted on $bootNow"
        fi
    else
        # First run: remember the boot time without alerting.
        printf '%s\n' "$bootNow" > "$bootFile"
    fi
fi

#===========================================================================
# Description: Used to check if zfs pools generated errors
# Skipped quietly on hosts that have no zpool command.
if type zpool > /dev/null 2>&1; then
    # The report is captured in a variable, so no scratch file can be left
    # behind and leak into the next run.
    poolReport=`pool_report`
    if [ -n "$poolReport" ]; then
        send_alert "$hostTag detected a pool error" \
            "In ${host}${nl} ${poolReport}"
    fi
fi

exit 0
--------------------------------------------------------------------------------------
LIMITATION OF LIABILITY
TO THE FULL EXTENT PERMITTED BY LAW, HOST IS NOT LIABLE TO YOU OR ANY OTHER INDIVIDUAL OR ENTITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, PUNITIVE, SPECIAL OR CONSEQUENTIAL DAMAGES RELATED TO OR ARISING OUT OF ANY USE OF, ACCESS TO, OR INABILITY TO ACCESS THIS WEBSITE, CONTENT, SERVICES, OR OF ANY OTHER LINKED WEBSITE OR EXTERNAL RESOURCE INCLUDING, WITHOUT LIMITATION, ANY LOST PROFITS, LOST SALES, LOST REVENUE, LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOSS OF
PROGRAMS OR OTHER DATA EVEN IF OPEN TECHNOLOGIES INC. IS EXPRESSLY ADVISED OR AWARE OF THE POSSIBILITY OF SUCH DAMAGES OR LOSSES. YOU ASSUME ALL RISK FOR ANY DAMAGE TO YOUR COMPUTER SYSTEM OR LOSS OF DATA THAT RESULTS FROM OBTAINING ANY CONTENT FROM THE WEBSITE, INCLUDING ANY DAMAGES RESULTING FROM COMPUTER VIRUSES, WORMS, OR OTHER ITEMS OF A DESTRUCTIVE NATURE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR LIMITATION OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, SO THE ABOVE LIMITATION MAY NOT APPLY TO YOU.