Platform Automated Shutdown Scripts

From Define Wiki
Revision as of 11:51, 7 December 2012 by Michael (talk | contribs) (Created page with "3 Stage Shutdown in the event of an overheat. Systems will shutdown if any of the following are breached: # Individual node breaches threshold # More than 10 nodes per rack b...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Three-stage shutdown in the event of an overheat. Systems will shut down if any of the following thresholds are breached:

  1. Individual node breaches threshold
  2. More than 10 nodes per rack breach a threshold (vig_rack_temp_chk.sh)
  3. Front/Rear Air Temp breaches threshold

Located on PDD: HPC Software Information/Platform/Overheat_Scripts

PDD Link: <file>\\srv-vfs2\PDD_DATA\Product Development\High Performance Computing\HPC Software Information\Platform\Overheat_Scripts|Overheat_Scripts on PDD</file>

Scripts on headnode (Rack Test)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh
#!/bin/bash
#
# vig_rack_temp_chk.sh - rack-level overheat check, run on the headnode.
#
# For every rack named in ALL_RACKS, the first temperature sensor reported by
# each node's BMC is read with ipmitool and compared with the value stored in
# threshold.temp. When more than RACK_THRESHOLD nodes of one rack are above
# that value, the whole rack is powered off through vig_ipmi.sh.
#
# Exit status: 1 if the working directory or the threshold cannot be read.

# Global Params
# Enable output (1 = log every reading and decision to stdout). This controls
# logging only; a rack shutdown is carried out whatever the value is.
DEBUG=1
# Power action handed to vig_ipmi.sh via -p
POWER_MODE=soft

# The .hosts.ipmi.cabN lists and vig_ipmi.sh are addressed relative to this dir
WORK_DIR=/opt/kusu/etc/vig_temp_chk

# Succeeds when $1 is a (possibly negative) whole number, i.e. safe for [ -gt ]
is_int() {
        [[ $1 =~ ^-?[0-9]+$ ]]
}

cd "$WORK_DIR" || {
        echo "$0: cannot cd to $WORK_DIR" >&2
        exit 1
}

DATE=$(date)

# First line of threshold.temp: the first "=" becomes a blank and the second
# whitespace-separated field is taken as the limit.
THRESHOLD=$(awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$WORK_DIR/threshold.temp")
if ! is_int "$THRESHOLD"; then
        echo "$0: no integer threshold found in $WORK_DIR/threshold.temp" >&2
        exit 1
fi

# Racks to check; each name must match one of the node arrays below
ALL_RACKS=(RACK1 RACK2 RACK4 RACK5 RACK6)

# BMC addresses per rack, one whitespace-separated entry per node
RACK1=($(cat .hosts.ipmi.cab1))
RACK2=($(cat .hosts.ipmi.cab2))
RACK4=($(cat .hosts.ipmi.cab4))
RACK5=($(cat .hosts.ipmi.cab5))
RACK6=($(cat .hosts.ipmi.cab6))

# A rack is shut down when MORE than this many of its nodes are too hot.
# NOTE(review): the accompanying description says "more than 10 nodes" and the
# script used to carry an unused THRESHOLD_TEMP_NODES=10; confirm whether 12
# is the intended limit.
RACK_THRESHOLD=12

for RACK in "${ALL_RACKS[@]}"; do

        # Rack number for log lines and for the -cN flag of vig_ipmi.sh, taken
        # from the rack name itself so it cannot drift from ALL_RACKS.
        RACK_NUM=${RACK#RACK}

        # Indirect expansion: "RACK4[@]" expands to the elements of RACK4
        NODES_REF="${RACK}[@]"

        RACK_NUM_NODES_ABOVE_THRESHOLD=0

        for NODE in "${!NODES_REF}"; do
                # get the node temperature via ipmitool: 10th field of the
                # first line printed by "sdr type Temperature"
                NODE_TEMP=$(ipmitool -U ADMIN -P ADMIN -H "$NODE" sdr type Temperature | head -n 1 | awk '{print $10}')

                # report the temperature
                if [ "$DEBUG" -eq "1" ]; then
                        echo -en "$DATE RACK${RACK_NUM} NODE: $NODE TEMP: $NODE_TEMP "
                fi

                # An unreachable BMC or a sensor without a reading yields an
                # empty or non-numeric value; do not report that as "OK".
                if ! is_int "$NODE_TEMP"; then
                        if [ "$DEBUG" -eq "1" ]; then
                                echo "TEMP UNREADABLE. Node not counted"
                        fi
                        continue
                fi

                if [ "$NODE_TEMP" -gt "$THRESHOLD" ]; then
                        RACK_NUM_NODES_ABOVE_THRESHOLD=$((RACK_NUM_NODES_ABOVE_THRESHOLD + 1))

                        if [ "$DEBUG" -eq "1" ]; then
                                echo "THRESHOLD ($THRESHOLD) EXCEEDED, NODES_ABOVE: $RACK_NUM_NODES_ABOVE_THRESHOLD "
                        fi
                elif [ "$DEBUG" -eq "1" ]; then
                        echo "Temperature OK. No action"
                fi
        done

        if [ "$DEBUG" -eq "1" ]; then
                echo -en "$DATE RACK${RACK_NUM} NODES ABOVE THRESHOLD: ${RACK_NUM_NODES_ABOVE_THRESHOLD} "
        fi

        if [ "$RACK_NUM_NODES_ABOVE_THRESHOLD" -gt "$RACK_THRESHOLD" ]; then
                if [ "$DEBUG" -eq "1" ]; then
                        echo "TOO MANY NODES ABOVE THRESHOLD: SHUTTING RACK${RACK_NUM} DOWN!"
                fi
                # Must run even when DEBUG is off: this is the protective action
                ./vig_ipmi.sh "-c${RACK_NUM}" -s 0 -p "$POWER_MODE"
        elif [ "$DEBUG" -eq "1" ]; then
                echo "Threshold not exceeded. No action"
        fi

done
Scripts on headnode (Rittal LCP Check)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh
#!/bin/bash
#
# vig_snmp_chk.sh - front/rear air temperature check of the LCP units, run on
# the headnode.
#
# Each unit in LCP_UNITS is queried with snmpwalk. Six values are taken from
# the reply; values 0, 2 and 4 are averaged as the front temperature and
# values 1, 3 and 5 as the rear temperature. When an average is above its
# threshold file value, the rack(s) mapped to that unit in RACKS_TO_SHUT are
# powered off through vig_ipmi.sh.
#
# Exit status: 1 if the working directory or a threshold cannot be read.

# 1 = log readings and decisions to stdout. This controls logging only; a
# shutdown is carried out whatever the value is.
DEBUG=1
LCP_UNITS=(10.24.5.11 10.24.5.12 10.24.5.13)
# RACKS_TO_SHUT[n] lists the rack number(s) shut down for LCP_UNITS[n]
RACKS_TO_SHUT=(1 "2 3" 4)
# SNMP v1 community string
COM_STRING="gild82urdu"
DATE=$(date)
# Power action handed to vig_ipmi.sh via -p
POWER_MODE=soft

WORK_DIR=/opt/kusu/etc/vig_temp_chk

# Succeeds when $1 is a (possibly negative) whole number, i.e. safe for [ -gt ]
is_int() {
        [[ $1 =~ ^-?[0-9]+$ ]]
}

# Prints the limit stored in threshold file $1: on its first line the first
# "=" becomes a blank and the second whitespace-separated field is printed.
read_threshold() {
        awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$1"
}

# Powers off every rack number contained in $1 (e.g. "2 3").
shutdown_racks() {
        local flag
        # Unquoted on purpose: one entry may name several racks
        for flag in $1; do
                if [ "$DEBUG" -eq "1" ]; then
                        echo "./vig_ipmi.sh -c${flag} -s 0 -p $POWER_MODE"
                fi
                ./vig_ipmi.sh "-c${flag}" -s 0 -p "$POWER_MODE"
        done
}

FRONT_THRESHOLD=$(read_threshold "$WORK_DIR/front_air_threshold.temp")
REAR_THRESHOLD=$(read_threshold "$WORK_DIR/rear_air_threshold.temp")

if ! is_int "$FRONT_THRESHOLD" || ! is_int "$REAR_THRESHOLD"; then
        echo "$0: front/rear air thresholds must be integers (got '$FRONT_THRESHOLD' / '$REAR_THRESHOLD')" >&2
        exit 1
fi

# vig_ipmi.sh is called relative to this directory
cd "$WORK_DIR" || {
        echo "$0: cannot cd to $WORK_DIR" >&2
        exit 1
}

# Iterating over the indices keeps LCP_UNITS and RACKS_TO_SHUT aligned even
# when a unit is skipped.
for i in "${!LCP_UNITS[@]}"; do

        UNIT=${LCP_UNITS[$i]}

        # Second column of the line matching "5.13" and of the 5 lines after it
        UNIT_TEMPS=($(snmpwalk -v1 -Oq -c "$COM_STRING" "$UNIT" .1.3.6.1.4.1.2606.4.2.4.5.2.1.5 | grep 5.13 -A 5 | awk '{print $2}'))

        if [ "$DEBUG" -eq "1" ]; then
                echo -en "${DATE} UNIT $UNIT TEMPS: ${UNIT_TEMPS[*]} "
        fi

        # An unreachable unit or a short reply must not be averaged or
        # reported as "OK".
        if [ "${#UNIT_TEMPS[@]}" -lt 6 ]; then
                echo "SENSOR DATA INCOMPLETE (${#UNIT_TEMPS[@]} of 6 readings). No action"
                continue
        fi

        # bc with its default scale truncates the division to a whole number
        FRONT_AVERAGE=$(echo "( ${UNIT_TEMPS[0]} + ${UNIT_TEMPS[2]} + ${UNIT_TEMPS[4]} ) / 3 " | bc)
        REAR_AVERAGE=$(echo "( ${UNIT_TEMPS[1]} + ${UNIT_TEMPS[3]} + ${UNIT_TEMPS[5]} ) / 3 " | bc)

        echo -en "F_Thrsh: $FRONT_THRESHOLD F_Avg: $FRONT_AVERAGE "

        if ! is_int "$FRONT_AVERAGE"; then
                echo -en "F_Avg UNREADABLE. No action "
        elif [ "$FRONT_AVERAGE" -gt "$FRONT_THRESHOLD" ]; then
                if [ "$DEBUG" -eq "1" ]; then
                        echo "F_Thrsh EXCEEDED, SHUTDOWN"
                fi
                # Must run even when DEBUG is off: this is the protective action
                shutdown_racks "${RACKS_TO_SHUT[$i]}"
        elif [ "$DEBUG" -eq "1" ]; then
                echo -en "F_Thrsh OK. No action "
        fi

        echo -en "R_Thrsh: $REAR_THRESHOLD R_Avg: $REAR_AVERAGE "

        if ! is_int "$REAR_AVERAGE"; then
                echo "R_Avg UNREADABLE. No action "
        elif [ "$REAR_AVERAGE" -gt "$REAR_THRESHOLD" ]; then
                if [ "$DEBUG" -eq "1" ]; then
                        echo "R_Thrsh EXCEEDED, SHUTDOWN "
                fi
                # Must run even when DEBUG is off: this is the protective action
                shutdown_racks "${RACKS_TO_SHUT[$i]}"
        elif [ "$DEBUG" -eq "1" ]; then
                echo "R_Thrsh OK. No action "
        fi

done
Scripts on headnode (IPMI Wrapper)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_ipmi.sh
#!/bin/bash
#
# vig_ipmi.sh - send one "ipmitool power" action to every BMC in a node list.
#
# Usage: vig_ipmi.sh (-a | -c1 .. -c6) -p <action> [-s <seconds>]
#   -a          use the list ./.hosts.ipmi.all
#   -cN         use the list ./.hosts.ipmi.cabN (N = 1..6)
#   -p action   word passed to "ipmitool ... power <action>"
#   -s seconds  pause after each node (default 0)
#
# The list file is looked up relative to the current directory, so callers
# must cd to the directory holding the .hosts.ipmi.* files first.
# Must be run as root. Exit status: 1 on a usage or setup error (255 when
# called without any argument).

sleep=0
nodelist=
power=

if [ "$UID" -ne 0 ]
then
    echo "User must be root!!!"
    exit 1
fi

if [ $# -eq 0 ]
then
        echo "Args ya fool!"
        # Same status bash reported for the former "exit -1"
        exit 255
fi

while [ $# -gt 0 ]
do
  case "$1" in
    -a)
      nodelist=all
      shift 1
    ;;

    -c[1-6])
      # -c4 -> cab4
      nodelist="cab${1#-c}"
      shift 1
    ;;

    -s|-p)
      # Without this check "shift 2" fails on a trailing option and the
      # loop never terminates.
      if [ $# -lt 2 ]
      then
        echo "Option $1 requires a value" >&2
        exit 1
      fi
      if [ "$1" = "-s" ]
      then
        sleep=$2
      else
        power=$2
      fi
      shift 2
    ;;

    *)
      # Unknown arguments used to be left in place, looping forever
      echo "Unknown argument: $1" >&2
      exit 1
    ;;
  esac
done

if [ -z "$nodelist" ]
then
  echo "No node list selected (use -a or -c1 .. -c6)" >&2
  exit 1
fi

if [ -z "$power" ]
then
  echo "No power action given (use -p <action>)" >&2
  exit 1
fi

HOSTS_FILE="./.hosts.ipmi.$nodelist"
if [ ! -r "$HOSTS_FILE" ]
then
  echo "Cannot read node list $HOSTS_FILE" >&2
  exit 1
fi

# One whitespace-separated BMC address per node
NODES=($(cat "$HOSTS_FILE"))

for node in "${NODES[@]}"; do
        echo -en "[$node]: "
        ipmitool -U ADMIN -P ADMIN -H "$node" power "$power"
        sleep "$sleep"
done
Scripts on headnode (cron)
# File: ipmi_monitoring/headnode/etc/cron.d/vig_rack_temp_chk
# Fields: minute hour day-of-month month day-of-week user command.
# Every job runs as root and appends its stdout to a log file under /var/log;
# stderr is not redirected by these entries.
# Run the rack and storage rack temp checks every 5 minutes
*/5 * * * *    root    /opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh >> /var/log/vig_rack_temp_chk.log
*/5 * * * *    root    /opt/kusu/etc/vig_temp_chk/vig_storage_rack_temp_chk.sh >> /var/log/vig_storage_rack_temp_chk.log
# The SNMP check uses */6, i.e. minutes 0, 6, ..., 54 of every hour
# NOTE(review): differs from the 5-minute interval above - confirm it is intended
*/6 * * * *    root    /opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh >> /var/log/vig_snmp_chk.log
Scripts on compute node (Shutdown)
# File: ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/check_temp.sh
#!/bin/bash

#############################
#
# Viglen Temperature Checker
#
# Runs on a compute node: reads the first temperature sensor reported by the
# local ipmitool and requests a soft power-off of the node when the reading
# is above the value stored in threshold.temp.
#
# Exit status: 1 if the threshold or the temperature cannot be read.
#
#############################


# Enable output (1 = log the reading and the decision to stdout)
DEBUG=1

THRESHOLD_FILE=/opt/kusu/etc/vig_temp_chk/threshold.temp

# Succeeds when $1 is a (possibly negative) whole number, i.e. safe for [ -gt ]
is_int() {
        [[ $1 =~ ^-?[0-9]+$ ]]
}

DATE=$(date)

# First line of the threshold file: the first "=" becomes a blank and the
# second whitespace-separated field is taken as the limit.
THRESHOLD=$(awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$THRESHOLD_FILE")
# 10th field of the first line printed by "sdr type Temperature"
CURRENT_TEMP=$(ipmitool sdr type Temperature | head -n 1 | awk '{print $10}')

if [ "$DEBUG" -eq "1" ]
then
        echo -en "${DATE} CURRENT TEMP: ${CURRENT_TEMP} THRESHOLD: ${THRESHOLD} "
fi

# A missing threshold file or a sensor without a reading gives an empty or
# non-numeric value; comparing it would fail and be logged as "Temperature OK".
if ! is_int "$THRESHOLD" || ! is_int "$CURRENT_TEMP"
then
        if [ "$DEBUG" -eq "1" ]
        then
                echo "TEMPERATURE OR THRESHOLD UNREADABLE. Not shutting down"
        fi
        echo "$0: cannot compare temperature '${CURRENT_TEMP}' with threshold '${THRESHOLD}'" >&2
        exit 1
fi

if [ "${CURRENT_TEMP}" -gt "${THRESHOLD}" ]
then
        if [ "$DEBUG" -eq "1" ]
        then
                echo "THRESHOLD EXCEEDED, INITIATING SOFT SHUTDOWN"
        fi
        ipmitool chassis power soft
else
        if [ "$DEBUG" -eq "1" ]
        then
                echo "Temperature OK. Not shutting down"
        fi
fi
File List from the archive
ipmi_monitoring/
ipmi_monitoring/headnode/
ipmi_monitoring/headnode/opt/
ipmi_monitoring/headnode/opt/kusu/
ipmi_monitoring/headnode/opt/kusu/etc/
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab6
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/storage_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_ipmi.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab3
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.all
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab2
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/front_air_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_storage_rack_temp_chk.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab4
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/rear_air_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh.orig
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab1
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab5
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh
ipmi_monitoring/headnode/etc/
ipmi_monitoring/headnode/etc/cron.d/
ipmi_monitoring/headnode/etc/cron.d/vig_rack_temp_chk
ipmi_monitoring/compute/
ipmi_monitoring/compute/opt/
ipmi_monitoring/compute/opt/kusu/
ipmi_monitoring/compute/opt/kusu/etc/
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/check_temp.sh
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/threshold.temp
ipmi_monitoring/compute/etc/
ipmi_monitoring/compute/etc/cron.d/
ipmi_monitoring/compute/etc/cron.d/viglen_temp_check
Use logrotate for the logs
  • Contents of file: /etc/logrotate.d/viglen_logs
# Daily rotation, 12 old copies kept, for the Viglen monitoring logs.
# copytruncate copies the live log and then truncates it in place; per
# logrotate(8) the create directive has no effect while copytruncate is set,
# so "create 600 root root" is informational only here.
/var/log/vig_pdu_load.log {
        daily
        create 600 root root
        copytruncate
        rotate 12
}

/var/log/vig_snmp_chk.log {
        daily
        create 600 root root
        copytruncate
        rotate 12
}

/var/log/vig_rack_temp_chk.log {
        daily
        create 600 root root
        copytruncate
        rotate 12
}

# Written by the vig_storage_rack_temp_chk.sh cron job; without this entry
# the file would grow without bound.
/var/log/vig_storage_rack_temp_chk.log {
        daily
        create 600 root root
        copytruncate
        rotate 12
}

/var/log/vig_node-envcheck.log {
        daily
        create 600 root root
        copytruncate
        rotate 12
}
  • Verify logrotate works
logrotate -d /etc/logrotate.conf