Difference between revisions of "Platform Automated Shutdown Scripts"
Jump to navigation
Jump to search
(Created page with "3 Stage Shutdown in the event of an overheat. Systems will shutdown if any of the following are breached: # Individual node breaches threshold # More than 10 nodes per rack b...") |
(No difference)
|
Latest revision as of 11:51, 7 December 2012
Three-stage shutdown in the event of an overheat. Systems will shut down if any of the following thresholds is breached:
- Individual node breaches threshold
- More than 10 nodes per rack breach a threshold (vig_rack_temp_chk.sh)
- Front/Rear Air Temp breaches threshold
Located on PDD: HPC Software Information/Platform/Overheat_Scripts
PDD Link: <file>\\srv-vfs2\PDD_DATA\Product Development\High Performance Computing\HPC Software Information\Platform\Overheat_Scripts|Overheat_Scripts on PDD</file>
Scripts on headnode (Rack Test)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh
#!/bin/bash
#
# Rack overheat check, run on the head node.
#
# For every rack listed in RACK_NUMBERS, reads each node's temperature with
# ipmitool and counts the nodes whose reading is above THRESHOLD.  When that
# count is greater than RACK_THRESHOLD the whole rack is powered off through
# ./vig_ipmi.sh using POWER_MODE.
#
# Files read (relative to WORK_DIR):
#   threshold.temp    per-node threshold: second field of the first line,
#                     with the first "=" treated as a space
#   .hosts.ipmi.cabN  whitespace-separated IPMI addresses of rack N
#
# Progress messages go to stdout only while DEBUG is 1; the shutdown itself
# is performed regardless of DEBUG.

# Global Params
# Enable output
DEBUG=1
POWER_MODE=soft
WORK_DIR=/opt/kusu/etc/vig_temp_chk
# Rack numbers to check; N selects .hosts.ipmi.cabN and "vig_ipmi.sh -cN".
RACK_NUMBERS=(1 2 4 5 6)
# NOTE(review): the accompanying description says "more than 10 nodes per
# rack"; the value actually in use is 12 -- confirm which one is intended.
RACK_THRESHOLD=12
DATE=$(date)

# Print a message fragment without a trailing newline when DEBUG is on.
log_part() {
  if [ "$DEBUG" -eq "1" ]; then
    printf '%s' "$1"
  fi
}

# Print a message followed by a newline when DEBUG is on.
log_line() {
  if [ "$DEBUG" -eq "1" ]; then
    printf '%s\n' "$1"
  fi
}

# Succeed when $1 is a (possibly negative) whole number.
is_integer() {
  [[ "$1" =~ ^-?[0-9]+$ ]]
}

# The host lists and vig_ipmi.sh are addressed relative to this directory.
cd "$WORK_DIR" || {
  echo "$DATE cannot cd to $WORK_DIR" >&2
  exit 1
}

THRESHOLD=$(awk 'NR == 1 { sub(/=/, " "); print $2; exit }' threshold.temp)
if ! is_integer "$THRESHOLD"; then
  # Without a usable threshold every comparison below would fail and be
  # reported as "OK", so stop instead.
  echo "$DATE no numeric threshold in $WORK_DIR/threshold.temp" >&2
  exit 1
fi

for i in "${RACK_NUMBERS[@]}"; do
  HOSTS_FILE=".hosts.ipmi.cab${i}"
  if [ ! -r "$HOSTS_FILE" ]; then
    echo "$DATE RACK${i}: cannot read $WORK_DIR/$HOSTS_FILE, rack skipped" >&2
    continue
  fi
  # Word splitting is intended: one address per whitespace-separated word.
  NODES=( $(cat "$HOSTS_FILE") )

  RACK_NUM_NODES_ABOVE_THRESHOLD=0
  for NODE in "${NODES[@]}"; do
    # get the node temperature via ipmitool: 10th field of the first sensor line
    NODE_TEMP=$(ipmitool -U ADMIN -P ADMIN -H "$NODE" sdr type Temperature \
      | head -n 1 | awk '{print $10}')

    # report the temperature
    log_part "$DATE RACK${i} NODE: $NODE TEMP: $NODE_TEMP "

    if ! is_integer "$NODE_TEMP"; then
      # No usable reading (node unreachable, sensor without a value, ...):
      # do not count the node, and do not claim the temperature is fine.
      log_line "Temperature unreadable. No action"
      continue
    fi

    if [ "$NODE_TEMP" -gt "$THRESHOLD" ]; then
      RACK_NUM_NODES_ABOVE_THRESHOLD=$((RACK_NUM_NODES_ABOVE_THRESHOLD + 1))
      log_line "THRESHOLD ($THRESHOLD) EXCEEDED, NODES_ABOVE: $RACK_NUM_NODES_ABOVE_THRESHOLD "
    else
      log_line "Temperature OK. No action"
    fi
  done

  log_part "$DATE RACK${i} NODES ABOVE THRESHOLD: ${RACK_NUM_NODES_ABOVE_THRESHOLD} "

  if [ "$RACK_NUM_NODES_ABOVE_THRESHOLD" -gt "$RACK_THRESHOLD" ]; then
    log_line "TOO MANY NODES ABOVE THRESHOLD: SHUTTING RACK${i} DOWN!"
    # Must run whether or not DEBUG is enabled.
    ./vig_ipmi.sh "-c${i}" -s 0 -p "$POWER_MODE"
  else
    log_line "Threshold not exceeded. No action"
  fi
done
Scripts on headnode (Rittal LCP Check)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh
#!/bin/bash
#
# LCP air-temperature check, run on the head node.
#
# For every unit in LCP_UNITS, walks TEMP_OID over SNMP and takes six values.
# Values 0/2/4 are averaged as the front temperature and values 1/3/5 as the
# rear temperature.  When either average is above its threshold, every rack
# listed in the matching RACKS_TO_SHUT entry is powered off through
# ./vig_ipmi.sh using POWER_MODE.
#
# Files read (relative to WORK_DIR):
#   front_air_threshold.temp, rear_air_threshold.temp
#       threshold: second field of the first line, with the first "="
#       treated as a space
#
# Progress messages go to stdout only while DEBUG is 1; the shutdown itself
# is performed regardless of DEBUG.

DEBUG=1
LCP_UNITS=(10.24.5.11 10.24.5.12 10.24.5.13)
# RACKS_TO_SHUT[n] holds the rack number(s) powered off for LCP_UNITS[n];
# one entry may name several racks separated by spaces.
RACKS_TO_SHUT=(1 "2 3" 4)
# NOTE(review): SNMP community string is hard-coded in this script.
COM_STRING="gild82urdu"
TEMP_OID=.1.3.6.1.4.1.2606.4.2.4.5.2.1.5
WORK_DIR=/opt/kusu/etc/vig_temp_chk
POWER_MODE=soft
DATE=$(date)

# Print a message fragment without a trailing newline when DEBUG is on.
log_part() {
  if [ "$DEBUG" -eq "1" ]; then
    printf '%s' "$1"
  fi
}

# Print a message followed by a newline when DEBUG is on.
log_line() {
  if [ "$DEBUG" -eq "1" ]; then
    printf '%s\n' "$1"
  fi
}

# Succeed when $1 is a (possibly negative) whole number.
is_integer() {
  [[ "$1" =~ ^-?[0-9]+$ ]]
}

# Print the threshold stored in file $1.
read_threshold() {
  awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$1"
}

# Power off every rack associated with LCP unit index $1.
shutdown_racks() {
  local flag
  # Unquoted on purpose: an entry such as "2 3" names two racks.
  for flag in ${RACKS_TO_SHUT[$1]}; do
    log_line "./vig_ipmi.sh -c${flag} -s 0 -p $POWER_MODE"
    ./vig_ipmi.sh "-c${flag}" -s 0 -p "$POWER_MODE"
  done
}

# vig_ipmi.sh and its host lists are addressed relative to this directory.
cd "$WORK_DIR" || {
  echo "$DATE cannot cd to $WORK_DIR" >&2
  exit 1
}

FRONT_THRESHOLD=$(read_threshold front_air_threshold.temp)
REAR_THRESHOLD=$(read_threshold rear_air_threshold.temp)
if ! is_integer "$FRONT_THRESHOLD" || ! is_integer "$REAR_THRESHOLD"; then
  # Without usable thresholds every comparison below would fail and be
  # reported as "OK", so stop instead.
  echo "$DATE front/rear air threshold missing or not numeric in $WORK_DIR" >&2
  exit 1
fi

for i in "${!LCP_UNITS[@]}"; do
  UNIT=${LCP_UNITS[$i]}

  # NOTE(review): "5.13" is a regular expression here, so the dot matches any
  # character -- confirm it only ever selects the intended OID line.
  UNIT_TEMPS=( $(snmpwalk -v1 -Oq -c "$COM_STRING" "$UNIT" "$TEMP_OID" \
    | grep 5.13 -A 5 | awk '{print $2}') )

  log_part "${DATE} UNIT $UNIT TEMPS: ${UNIT_TEMPS[*]} "

  if [ "${#UNIT_TEMPS[@]}" -lt 6 ]; then
    # A failed or short SNMP reply must not be mistaken for "temperature OK".
    log_line "SNMP read incomplete (${#UNIT_TEMPS[@]} of 6 values). No action"
    continue
  fi

  # bc is kept for the averages so the arithmetic matches the original for
  # whatever numeric format the unit reports; the result is a whole number.
  FRONT_AVERAGE=$(echo "( ${UNIT_TEMPS[0]} + ${UNIT_TEMPS[2]} + ${UNIT_TEMPS[4]} ) / 3 " | bc)
  REAR_AVERAGE=$(echo "( ${UNIT_TEMPS[1]} + ${UNIT_TEMPS[3]} + ${UNIT_TEMPS[5]} ) / 3 " | bc)

  if ! is_integer "$FRONT_AVERAGE" || ! is_integer "$REAR_AVERAGE"; then
    log_line "SNMP values not numeric. No action"
    continue
  fi

  log_part "F_Thrsh: $FRONT_THRESHOLD F_Avg: $FRONT_AVERAGE "
  if [ "$FRONT_AVERAGE" -gt "$FRONT_THRESHOLD" ]; then
    log_line "F_Thrsh EXCEEDED, SHUTDOWN"
    # Must run whether or not DEBUG is enabled.
    shutdown_racks "$i"
  else
    log_part "F_Thrsh OK. No action "
  fi

  log_part "R_Thrsh: $REAR_THRESHOLD R_Avg: $REAR_AVERAGE "
  if [ "$REAR_AVERAGE" -gt "$REAR_THRESHOLD" ]; then
    log_line "R_Thrsh EXCEEDED, SHUTDOWN "
    # Must run whether or not DEBUG is enabled.
    shutdown_racks "$i"
  else
    log_line "R_Thrsh OK. No action "
  fi
done
Scripts on headnode (IPMI Wrapper)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_ipmi.sh
#!/bin/bash
#
# IPMI power wrapper: runs "ipmitool ... power <command>" against every
# address in a host list.
#
# Usage: vig_ipmi.sh (-a | -c1 .. -c6) -p <command> [-s <seconds>]
#   -a            use ./.hosts.ipmi.all
#   -c1 .. -c6    use ./.hosts.ipmi.cabN
#   -p <command>  argument handed to "ipmitool power"
#   -s <seconds>  pause after each node (default 0)
#
# The host list is looked up relative to the current directory, so callers
# change into the script directory first.  Must be run as root.
# Exit status: 1 when not root or the arguments are unusable, 255 when called
# without arguments.

delay=0
nodelist=
power=

if [ "$UID" -ne 0 ]; then
  echo "User must be root!!!"
  exit 1
fi

if [ $# -eq 0 ]; then
  echo "Args ya fool!"
  # Same status the original "exit -1" produced.
  exit 255
fi

while [ $# -gt 0 ]; do
  case "$1" in
    -a)
      nodelist=all
      shift 1
      ;;
    -c[1-6])
      nodelist="cab${1#-c}"
      shift 1
      ;;
    -s | -p)
      # "shift 2" fails without shifting when the value is missing, which
      # would keep this loop running forever.
      if [ $# -lt 2 ]; then
        echo "Option $1 needs a value" >&2
        exit 1
      fi
      if [ "$1" = "-s" ]; then
        delay=$2
      else
        power=$2
      fi
      shift 2
      ;;
    *)
      # An argument that is never shifted would also loop forever.
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done

if [ -z "$nodelist" ]; then
  echo "No node list selected (use -a or -c1 .. -c6)" >&2
  exit 1
fi
if [ -z "$power" ]; then
  echo "No power command given (use -p)" >&2
  exit 1
fi

hosts_file="./.hosts.ipmi.$nodelist"
if [ ! -r "$hosts_file" ]; then
  echo "Cannot read $hosts_file" >&2
  exit 1
fi

# Word splitting is intended: one address per whitespace-separated word.
NODES=( $(cat "$hosts_file") )

for node in "${NODES[@]}"; do
  printf '%s' "[$node]: "
  ipmitool -U ADMIN -P ADMIN -H "$node" power "$power"
  sleep "$delay"
done
Scripts on headnode (cron)
# File: ipmi_monitoring/headnode/etc/cron.d/vig_rack_temp_chk
# Run rack temp check every 5 minutes
*/5 * * * * root /opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh >> /var/log/vig_rack_temp_chk.log
*/5 * * * * root /opt/kusu/etc/vig_temp_chk/vig_storage_rack_temp_chk.sh >> /var/log/vig_storage_rack_temp_chk.log
*/6 * * * * root /opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh >> /var/log/vig_snmp_chk.log
Scripts on compute node (Shutdown)
# File: ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/check_temp.sh
#!/bin/bash
#############################
#
# Viglen Temperature Checker
#
# Reads the local temperature with ipmitool (10th field of the first
# "sdr type Temperature" line) and issues "ipmitool chassis power soft"
# when it is above the threshold stored in THRESHOLD_FILE (second field of
# the first line, with the first "=" treated as a space).
#
# Progress messages go to stdout only while DEBUG is 1.
# Exits 1 without taking action when the threshold or the reading is not a
# whole number.
#
#############################
# Enable output
DEBUG=1
THRESHOLD_FILE=/opt/kusu/etc/vig_temp_chk/threshold.temp
DATE=$(date)

# Print a message fragment without a trailing newline when DEBUG is on.
log_part() {
  if [ "$DEBUG" -eq "1" ]; then
    printf '%s' "$1"
  fi
}

# Print a message followed by a newline when DEBUG is on.
log_line() {
  if [ "$DEBUG" -eq "1" ]; then
    printf '%s\n' "$1"
  fi
}

# Succeed when $1 is a (possibly negative) whole number.
is_integer() {
  [[ "$1" =~ ^-?[0-9]+$ ]]
}

THRESHOLD=$(awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$THRESHOLD_FILE")
CURRENT_TEMP=$(ipmitool sdr type Temperature | head -n 1 | awk '{print $10}')

log_part "${DATE} CURRENT TEMP: ${CURRENT_TEMP} THRESHOLD: ${THRESHOLD} "

if ! is_integer "$THRESHOLD" || ! is_integer "$CURRENT_TEMP"; then
  # A failed read would make the comparison below error out and be reported
  # as "Temperature OK"; report it for what it is.
  log_line "Temperature or threshold unreadable. Not shutting down"
  exit 1
fi

if [ "$CURRENT_TEMP" -gt "$THRESHOLD" ]; then
  log_line "THRESHOLD EXCEEDED, INITIATING SOFT SHUTDOWN"
  ipmitool chassis power soft
else
  log_line "Temperature OK. Not shutting down"
fi
File List from the archive
ipmi_monitoring/
ipmi_monitoring/headnode/
ipmi_monitoring/headnode/opt/
ipmi_monitoring/headnode/opt/kusu/
ipmi_monitoring/headnode/opt/kusu/etc/
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab6
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/storage_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_ipmi.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab3
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.all
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab2
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/front_air_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_storage_rack_temp_chk.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab4
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/rear_air_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh.orig
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab1
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab5
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh
ipmi_monitoring/headnode/etc/
ipmi_monitoring/headnode/etc/cron.d/
ipmi_monitoring/headnode/etc/cron.d/vig_rack_temp_chk
ipmi_monitoring/compute/
ipmi_monitoring/compute/opt/
ipmi_monitoring/compute/opt/kusu/
ipmi_monitoring/compute/opt/kusu/etc/
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/check_temp.sh
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/threshold.temp
ipmi_monitoring/compute/etc/
ipmi_monitoring/compute/etc/cron.d/
ipmi_monitoring/compute/etc/cron.d/viglen_temp_check
Use logrotate for the logs
- Contents of file: /etc/logrotate.d/viglen_logs
/var/log/vig_pdu_load.log {
daily
create 600 root root
copytruncate
rotate 12
}
/var/log/vig_snmp_chk.log {
daily
create 600 root root
copytruncate
rotate 12
}
/var/log/vig_rack_temp_chk.log {
daily
create 600 root root
copytruncate
rotate 12
}
/var/log/vig_node-envcheck.log {
daily
create 600 root root
copytruncate
rotate 12
}
- Verify logrotate works
logrotate -d /etc/logrotate.conf