Difference between revisions of "Platform Automated Shutdown Scripts"

From Define Wiki
Jump to navigation Jump to search
(Created page with "3 Stage Shutdown in the event of an overheat. Systems will shutdown if any of the following are breached: # Individual node breaches threshold # More than 10 nodes per rack b...")
 
(No difference)

Latest revision as of 11:51, 7 December 2012

3-stage shutdown in the event of an overheat. Systems will shut down if any of the following thresholds is breached:

  1. Individual node breaches threshold
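  2. More than 10 nodes per rack breach the node threshold (vig_rack_temp_chk.sh; note that the script as listed below sets RACK_THRESHOLD=12, i.e. it acts only when more than 12 nodes are over the threshold)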
  3. Front/Rear Air Temp breaches threshold

Located on PDD: HPC Software Information/Platform/Overheat_Scripts

PDD Link: <file>\\srv-vfs2\PDD_DATA\Product Development\High Performance Computing\HPC Software Information\Platform\Overheat_Scripts|Overheat_Scripts on PDD</file>

Scripts on headnode (Rack Test)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh
#!/bin/bash
#
# vig_rack_temp_chk.sh - per-rack overheat check, run from cron on the headnode.
#
# For every rack in RACK_NUMBERS the script reads the first temperature sensor
# line reported by each node's BMC (ipmitool), counts the nodes whose reading
# is above the per-node threshold, and powers the whole rack off through
# ./vig_ipmi.sh when that count is strictly greater than RACK_THRESHOLD.
#
# Files:
#   $WORK_DIR/threshold.temp      first line "<name>=<integer>": per-node limit
#   $WORK_DIR/.hosts.ipmi.cab<N>  whitespace-separated BMC addresses of rack N
#
# Exit status: 1 when the working directory or the threshold is unusable.

# Global Params
# Enable output (1 = log every reading and decision to stdout)
DEBUG=1
POWER_MODE=soft
WORK_DIR=/opt/kusu/etc/vig_temp_chk

# Racks checked by this script; rack N uses .hosts.ipmi.cab<N>.
# Rack 3 is deliberately absent, exactly as in the original rack list.
RACK_NUMBERS=(1 2 4 5 6)

# A rack is shut down when MORE than this many nodes are over the threshold.
# NOTE(review): the original also defined THRESHOLD_TEMP_NODES=10 but never
# used it; 12 is the value that was actually in effect. Confirm which is meant.
RACK_THRESHOLD=12

INTEGER_RE='^-?[0-9]+$'

# Print the arguments with echo (options such as -en allowed) when DEBUG=1.
log_debug() {
        if [ "$DEBUG" -eq 1 ]; then
                echo "$@"
        fi
}

# Succeeds when $1 is a (possibly negative) decimal integer.
is_integer() {
        [[ $1 =~ $INTEGER_RE ]]
}

cd "$WORK_DIR" || { echo "$0: cannot cd to $WORK_DIR" >&2; exit 1; }

DATE=$(date)

# First line of the file with the first "=" turned into a blank; field 2 is
# the value (same parsing as the former cat | head | sed | awk pipeline).
THRESHOLD=$(awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$WORK_DIR/threshold.temp")

if ! is_integer "$THRESHOLD"; then
        # Without a usable threshold every comparison below would fail and the
        # script would silently never act, so stop loudly instead.
        echo "$DATE $0: invalid threshold '$THRESHOLD' in $WORK_DIR/threshold.temp" >&2
        exit 1
fi

for RACK_NUM in "${RACK_NUMBERS[@]}"; do

        HOSTS_FILE=".hosts.ipmi.cab${RACK_NUM}"
        RACK_NUM_NODES_ABOVE_THRESHOLD=0
        NODES=()

        if [ -r "$HOSTS_FILE" ]; then
                # Split the file on blanks/newlines into an array (no globbing).
                # read returns non-zero at end of file; the array is still filled.
                IFS=$' \t\n' read -r -d '' -a NODES < "$HOSTS_FILE"
        else
                echo "$DATE RACK${RACK_NUM}: cannot read $WORK_DIR/$HOSTS_FILE" >&2
        fi

        for NODE in "${NODES[@]}"; do
                # get the node temperature via ipmitool: 10th field of the
                # first sensor line
                NODE_TEMP=$(ipmitool -U ADMIN -P ADMIN -H "$NODE" sdr type Temperature \
                        | head -n 1 | awk '{print $10}')

                # report the temperature
                log_debug -en "$DATE RACK${RACK_NUM} NODE: $NODE TEMP: $NODE_TEMP "

                if ! is_integer "$NODE_TEMP"; then
                        # Unreachable BMC or unexpected output: do not count the
                        # node, but do not report it as healthy either.
                        log_debug "NO VALID READING. No action"
                        echo "$DATE RACK${RACK_NUM} NODE: $NODE no valid temperature reading" >&2
                        continue
                fi

                if [ "$NODE_TEMP" -gt "$THRESHOLD" ]; then
                        RACK_NUM_NODES_ABOVE_THRESHOLD=$((RACK_NUM_NODES_ABOVE_THRESHOLD + 1))
                        log_debug "THRESHOLD ($THRESHOLD) EXCEEDED, NODES_ABOVE: $RACK_NUM_NODES_ABOVE_THRESHOLD "
                else
                        log_debug "Temperature OK. No action"
                fi
        done

        log_debug -en "$DATE RACK${RACK_NUM} NODES ABOVE THRESHOLD: ${RACK_NUM_NODES_ABOVE_THRESHOLD} "

        if [ "$RACK_NUM_NODES_ABOVE_THRESHOLD" -gt "$RACK_THRESHOLD" ]; then
                log_debug "TOO MANY NODES ABOVE THRESHOLD: SHUTTING RACK${RACK_NUM} DOWN!"
                # The shutdown itself must not depend on DEBUG: previously this
                # call sat inside the debug-only branch, so DEBUG=0 disabled it.
                ./vig_ipmi.sh "-c${RACK_NUM}" -s 0 -p "$POWER_MODE"
        else
                log_debug "Threshold not exceeded. No action"
        fi

done
Scripts on headnode (Rittal LCP Check)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh
#!/bin/bash
#
# vig_snmp_chk.sh - LCP air-temperature check, run from cron on the headnode.
#
# For each unit in LCP_UNITS the script walks SENSOR_OID over SNMP, keeps the
# first line matching "5.13" plus the five lines after it, and treats values
# 0/2/4 as front and 1/3/5 as rear readings. When the front or the rear
# average is above its threshold, the racks mapped to that unit in
# RACKS_TO_SHUT are powered off through ./vig_ipmi.sh.
#
# Files:
#   $WORK_DIR/front_air_threshold.temp  first line "<name>=<integer>"
#   $WORK_DIR/rear_air_threshold.temp   first line "<name>=<integer>"
#
# Exit status: 1 when the working directory or a threshold is unusable.

# Enable output (1 = log readings and decisions to stdout)
DEBUG=1
POWER_MODE=soft
WORK_DIR=/opt/kusu/etc/vig_temp_chk

LCP_UNITS=(10.24.5.11 10.24.5.12 10.24.5.13)
# RACKS_TO_SHUT[n] lists (space separated) the rack numbers that are shut down
# when LCP_UNITS[n] breaches a threshold.
RACKS_TO_SHUT=(1 "2 3" 4)
# NOTE(review): the SNMP community string is hard-coded here; consider reading
# it from a root-only file.
COM_STRING="gild82urdu"
SENSOR_OID=.1.3.6.1.4.1.2606.4.2.4.5.2.1.5

INTEGER_RE='^-?[0-9]+$'
NUMBER_RE='^-?[0-9]+(\.[0-9]+)?$'

# Print the arguments with echo (options such as -en allowed) when DEBUG=1.
log_debug() {
        if [ "$DEBUG" -eq 1 ]; then
                echo "$@"
        fi
}

# Print the value from the first line of a "<name>=<value>" file.
read_threshold() {
        awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$1"
}

# Succeeds when every argument is a decimal number.
all_numeric() {
        local value
        for value in "$@"; do
                [[ $value =~ $NUMBER_RE ]] || return 1
        done
        return 0
}

# Power off every rack named in the space-separated list $1.
shutdown_racks() {
        local rack
        # Word splitting of $1 is intentional: one entry may name several racks.
        for rack in $1; do
                log_debug "./vig_ipmi.sh -c${rack} -s 0 -p $POWER_MODE"
                ./vig_ipmi.sh "-c${rack}" -s 0 -p "$POWER_MODE"
        done
}

DATE=$(date)
FRONT_THRESHOLD=$(read_threshold "$WORK_DIR/front_air_threshold.temp")
REAR_THRESHOLD=$(read_threshold "$WORK_DIR/rear_air_threshold.temp")

if ! [[ $FRONT_THRESHOLD =~ $INTEGER_RE && $REAR_THRESHOLD =~ $INTEGER_RE ]]; then
        # With a broken threshold every comparison below would fail and the
        # script would silently never act, so stop loudly instead.
        echo "$DATE $0: invalid threshold (front='$FRONT_THRESHOLD' rear='$REAR_THRESHOLD')" >&2
        exit 1
fi

cd "$WORK_DIR" || { echo "$0: cannot cd to $WORK_DIR" >&2; exit 1; }

# Iterate by index so the unit and its RACKS_TO_SHUT entry always stay paired,
# even when a unit is skipped.
for i in "${!LCP_UNITS[@]}"; do

        UNIT=${LCP_UNITS[$i]}

        # Intentional word splitting: one array element per reported value.
        UNIT_TEMPS=( $(snmpwalk -v1 -Oq -c "$COM_STRING" "$UNIT" "$SENSOR_OID" \
                | grep -A 5 5.13 | awk '{print $2}') )

        log_debug -en "${DATE} UNIT $UNIT TEMPS: ${UNIT_TEMPS[*]} "

        if [ "${#UNIT_TEMPS[@]}" -lt 6 ] || ! all_numeric "${UNIT_TEMPS[@]:0:6}"; then
                # SNMP failure or unexpected output: previously this fed empty
                # values to bc and to the comparisons below.
                log_debug "NO VALID READING. No action"
                echo "$DATE UNIT $UNIT: expected 6 numeric readings, got '${UNIT_TEMPS[*]}'" >&2
                continue
        fi

        # bc runs with scale 0 here, so both averages are truncated integers.
        FRONT_AVERAGE=$(echo "( ${UNIT_TEMPS[0]} + ${UNIT_TEMPS[2]} + ${UNIT_TEMPS[4]} ) / 3 " | bc)
        REAR_AVERAGE=$(echo "( ${UNIT_TEMPS[1]} + ${UNIT_TEMPS[3]} + ${UNIT_TEMPS[5]} ) / 3 " | bc)

        echo -en "F_Thrsh: $FRONT_THRESHOLD F_Avg: $FRONT_AVERAGE "

        if [ "$FRONT_AVERAGE" -gt "$FRONT_THRESHOLD" ]; then
                log_debug "F_Thrsh EXCEEDED, SHUTDOWN"
                # The shutdown must not depend on DEBUG: previously this loop
                # sat inside the debug-only branch, so DEBUG=0 disabled it.
                shutdown_racks "${RACKS_TO_SHUT[$i]}"
        else
                log_debug -en "F_Thrsh OK. No action "
        fi

        echo -en "R_Thrsh: $REAR_THRESHOLD R_Avg: $REAR_AVERAGE "

        if [ "$REAR_AVERAGE" -gt "$REAR_THRESHOLD" ]; then
                log_debug "R_Thrsh EXCEEDED, SHUTDOWN "
                shutdown_racks "${RACKS_TO_SHUT[$i]}"
        else
                log_debug "R_Thrsh OK. No action "
        fi

done
Scripts on headnode (IPMI Wrapper)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_ipmi.sh
#!/bin/bash
#
# vig_ipmi.sh - send an ipmitool "power" command to every node of a host list.
#
# Usage: vig_ipmi.sh (-a | -c<N>) -p <action> [-s <seconds>]
#   -a           use host list ./.hosts.ipmi.all
#   -c<N>        use host list ./.hosts.ipmi.cab<N> (N is a number)
#   -p <action>  passed on as "ipmitool ... power <action>"
#   -s <seconds> pause after each node (default 0)
#
# Host lists are looked up relative to the current directory, so callers must
# cd to the directory holding the .hosts.ipmi.* files first. Must run as root.
# Exits non-zero on permission or usage errors.

pause=0
nodelist=
power=

# Print a short usage summary on stderr.
usage() {
        echo "Usage: $0 (-a | -c<N>) -p <action> [-s <seconds>]" >&2
}

if [ "$UID" -ne 0 ]; then
    echo "User must be root!!!"
    exit 1
fi

if [ $# -eq 0 ]; then
        echo "Args ya fool!"
        # was "exit -1"; a plain non-zero status is used instead
        exit 1
fi

while [ $# -gt 0 ]; do
  case "$1" in
    -a)
      nodelist=all
      shift
      ;;

    -s | -p)
      # Without this check "shift 2" fails when the value is missing, nothing
      # is consumed, and the loop never ends.
      if [ $# -lt 2 ]; then
        echo "Option $1 requires a value" >&2
        usage
        exit 1
      fi
      if [ "$1" = "-s" ]; then
        pause=$2
      else
        power=$2
      fi
      shift 2
      ;;

    -c*)
      # Any cabinet number is accepted, not only 1-6.
      cabinet=${1#-c}
      case "$cabinet" in
        '' | *[!0-9]*)
          echo "Invalid cabinet option: $1" >&2
          usage
          exit 1
          ;;
      esac
      nodelist="cab${cabinet}"
      shift
      ;;

    *)
      # The original had no default arm: an unknown argument was never
      # shifted away and the loop spun forever.
      echo "Unknown argument: $1" >&2
      usage
      exit 1
      ;;
  esac
done

if [ -z "$nodelist" ] || [ -z "$power" ]; then
        echo "A host list (-a or -c<N>) and a power action (-p) are required" >&2
        usage
        exit 1
fi

hosts_file="./.hosts.ipmi.$nodelist"
if [ ! -r "$hosts_file" ]; then
        echo "Cannot read host list $hosts_file" >&2
        exit 1
fi

# Split the file on blanks/newlines into an array (no globbing). read returns
# non-zero at end of file; the array is still filled.
NODES=()
IFS=$' \t\n' read -r -d '' -a NODES < "$hosts_file"

for node in "${NODES[@]}"; do
        echo -en "[$node]: "
        ipmitool -U ADMIN -P ADMIN -H "$node" power "$power"
        sleep "$pause"
done
Scripts on headnode (cron)
# File: ipmi_monitoring/headnode/etc/cron.d/vig_rack_temp_chk
# Overheat checks run as root from cron on the headnode.
# Only stdout is appended to the log files; stderr is not redirected here.
# Run rack temp check every 5 minutes
*/5 * * * *    root    /opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh >> /var/log/vig_rack_temp_chk.log
# Storage rack temp check, same 5 minute schedule
*/5 * * * *    root    /opt/kusu/etc/vig_temp_chk/vig_storage_rack_temp_chk.sh >> /var/log/vig_storage_rack_temp_chk.log
# SNMP (LCP) check at minutes 0,6,12,...,54 of every hour
*/6 * * * *    root    /opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh >> /var/log/vig_snmp_chk.log
Scripts on compute node (Shutdown)
# File: ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/check_temp.sh
#!/bin/bash

#############################
#
# Viglen Temperature Checker
#
# Runs on a compute node: reads the first temperature sensor line reported by
# the local ipmitool, compares it with the limit in THRESHOLD_FILE (first
# line "<name>=<integer>") and requests a soft power-off through ipmitool
# when the reading is above the limit.
#
# Exit status: 1 when the reading or the threshold is not an integer.
#
#############################


# Enable output (1 = log the reading and the decision to stdout)
DEBUG=1
THRESHOLD_FILE=/opt/kusu/etc/vig_temp_chk/threshold.temp
INTEGER_RE='^-?[0-9]+$'

# Print the arguments with echo (options such as -en allowed) when DEBUG=1.
log_debug() {
        if [ "$DEBUG" -eq 1 ]; then
                echo "$@"
        fi
}

DATE=$(date)

# First line of the file with the first "=" turned into a blank; field 2 is
# the value (same parsing as the former cat | head | sed | awk pipeline).
THRESHOLD=$(awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$THRESHOLD_FILE")
# 10th field of the first sensor line
CURRENT_TEMP=$(ipmitool sdr type Temperature | head -n 1 | awk '{print $10}')

log_debug -en "${DATE} CURRENT TEMP: ${CURRENT_TEMP} THRESHOLD: ${THRESHOLD} "

if ! [[ $THRESHOLD =~ $INTEGER_RE && $CURRENT_TEMP =~ $INTEGER_RE ]]; then
        # Previously an empty or non-numeric value made the comparison fail
        # with a "[" error and was logged as "Temperature OK".
        log_debug "INVALID READING OR THRESHOLD. Not shutting down"
        echo "${DATE} $0: cannot compare temperature '${CURRENT_TEMP}' with threshold '${THRESHOLD}'" >&2
        exit 1
fi

if [ "$CURRENT_TEMP" -gt "$THRESHOLD" ]; then
        log_debug "THRESHOLD EXCEEDED, INITIATING SOFT SHUTDOWN"
        ipmitool chassis power soft
else
        log_debug "Temperature OK. Not shutting down"
fi
File List from the archive
ipmi_monitoring/
ipmi_monitoring/headnode/
ipmi_monitoring/headnode/opt/
ipmi_monitoring/headnode/opt/kusu/
ipmi_monitoring/headnode/opt/kusu/etc/
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab6
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/storage_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_ipmi.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab3
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.all
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab2
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/front_air_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_storage_rack_temp_chk.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab4
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/rear_air_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh.orig
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab1
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab5
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh
ipmi_monitoring/headnode/etc/
ipmi_monitoring/headnode/etc/cron.d/
ipmi_monitoring/headnode/etc/cron.d/vig_rack_temp_chk
ipmi_monitoring/compute/
ipmi_monitoring/compute/opt/
ipmi_monitoring/compute/opt/kusu/
ipmi_monitoring/compute/opt/kusu/etc/
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/check_temp.sh
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/threshold.temp
ipmi_monitoring/compute/etc/
ipmi_monitoring/compute/etc/cron.d/
ipmi_monitoring/compute/etc/cron.d/viglen_temp_check
Use logrotate for the logs
  • Contents of file: /etc/logrotate.d/viglen_logs
# Rotate the Viglen monitoring logs daily and keep 12 old copies.
# vig_storage_rack_temp_chk.log is included because the headnode cron job
# appends to it as well.
/var/log/vig_pdu_load.log
/var/log/vig_snmp_chk.log
/var/log/vig_rack_temp_chk.log
/var/log/vig_storage_rack_temp_chk.log
/var/log/vig_node-envcheck.log
{
        daily
        # do not report an error for a log that has not been written yet
        missingok
        # has no effect while copytruncate is set (the old file stays in place)
        create 600 root root
        # copy then truncate in place, so jobs appending with >> keep working
        copytruncate
        rotate 12
}
  • Verify logrotate works (-d is debug mode: it prints what would be done without changing any logs)
logrotate -d /etc/logrotate.conf