Platform Automated Shutdown Scripts
Jump to navigation
Jump to search
Three-stage shutdown in the event of an overheat. Systems will shut down if any of the following thresholds is breached:
- Individual node breaches threshold
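- More than 10 nodes per rack breach a threshold (vig_rack_temp_chk.sh; note that the script as listed below acts when more than RACK_THRESHOLD=12 nodes are above the threshold)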
- Front/Rear Air Temp breaches threshold
Located on PDD: HPC Software Information/Platform/Overheat_Scripts
PDD Link: <file>\\srv-vfs2\PDD_DATA\Product Development\High Performance Computing\HPC Software Information\Platform\Overheat_Scripts|Overheat_Scripts on PDD</file>
Scripts on headnode (Rack Test)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh
#!/bin/bash
#
# vig_rack_temp_chk.sh - per-rack node overheat check (runs on the headnode).
#
# For each rack in RACK_NUMBERS, every node listed in .hosts.ipmi.cab<N> is
# queried over IPMI; the reading used is field 10 of the first line printed by
# "ipmitool sdr type Temperature". Nodes reading above the value stored in
# threshold.temp are counted, and when more than RACK_THRESHOLD nodes of one
# rack are too hot the whole rack is powered down through ./vig_ipmi.sh.
#
# Progress lines go to stdout; problems (unreadable files, thresholds or
# sensors) go to stderr.

# 1 = print progress lines. This only controls logging: the shutdown is
# issued whether or not logging is enabled.
DEBUG=1
# Power action handed to "vig_ipmi.sh -p".
POWER_MODE=soft
# Holds threshold.temp, the .hosts.ipmi.cab<N> node lists and vig_ipmi.sh.
WORK_DIR=/opt/kusu/etc/vig_temp_chk
# Rack numbers to check; each one selects .hosts.ipmi.cab<N> and
# "vig_ipmi.sh -c<N>". Rack 3 is not in this list.
RACK_NUMBERS=(1 2 4 5 6)
# A rack is shut down when MORE than this many of its nodes are too hot.
# NOTE(review): the page summary says "more than 10 nodes"; the value the
# script actually used (12) is kept here - confirm which one is intended.
RACK_THRESHOLD=12
# NOTE(review): BMC credentials were hard-coded as ADMIN/ADMIN. They remain
# the defaults but can now be overridden from the environment.
IPMI_USER="${IPMI_USER:-ADMIN}"
IPMI_PASSWORD="${IPMI_PASSWORD:-ADMIN}"
# One timestamp for the whole run, so all lines of a run share it.
DATE=$(date)

# Print a progress line on stdout when DEBUG is 1.
debug_log() {
  if [ "$DEBUG" -eq 1 ]; then
    printf '%s\n' "$*"
  fi
}

# Succeed when $1 is a (possibly negative) decimal integer.
is_integer() {
  [[ "$1" =~ ^-?[0-9]+$ ]]
}

# Print the threshold stored in file $1: on the first line the first "=" is
# turned into a blank and the second whitespace-separated field is printed
# (e.g. "NAME=60" gives "60").
read_threshold() {
  awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$1"
}

# Print the temperature reading of the node whose BMC address is $1.
node_temperature() {
  ipmitool -U "$IPMI_USER" -P "$IPMI_PASSWORD" -H "$1" sdr type Temperature \
    | awk 'NR == 1 { print $10; exit }'
}

# Check every node of rack number $1 and power the rack down when too many
# nodes are above THRESHOLD. Returns 1 when the node list cannot be read.
check_rack() {
  local rack_num=$1
  local hosts_file=".hosts.ipmi.cab${rack_num}"
  local -a nodes=()
  local node temp
  local hot_nodes=0

  if [[ ! -r "$hosts_file" ]]; then
    printf 'RACK%s: cannot read %s/%s, rack not checked\n' \
      "$rack_num" "$WORK_DIR" "$hosts_file" >&2
    return 1
  fi
  # Split the file on any whitespace (one entry per word), without globbing.
  # read returns non-zero at end of file; the array is filled regardless.
  read -r -d '' -a nodes < "$hosts_file"

  for node in "${nodes[@]}"; do
    temp=$(node_temperature "$node")
    # A BMC that does not answer yields an empty or non-numeric reading.
    # Report it instead of letting the comparison fail and look like "OK".
    if ! is_integer "$temp"; then
      printf '%s RACK%s NODE: %s TEMP: unreadable ("%s"), node not counted\n' \
        "$DATE" "$rack_num" "$node" "$temp" >&2
      continue
    fi
    # "[" compares as plain decimal, so a reading like "08" is not
    # misinterpreted as octal.
    if [ "$temp" -gt "$THRESHOLD" ]; then
      hot_nodes=$((hot_nodes + 1))
      debug_log "$DATE RACK${rack_num} NODE: $node TEMP: $temp THRESHOLD ($THRESHOLD) EXCEEDED, NODES_ABOVE: $hot_nodes "
    else
      debug_log "$DATE RACK${rack_num} NODE: $node TEMP: $temp Temperature OK. No action"
    fi
  done

  if [ "$hot_nodes" -gt "$RACK_THRESHOLD" ]; then
    debug_log "$DATE RACK${rack_num} NODES ABOVE THRESHOLD: ${hot_nodes} TOO MANY NODES ABOVE THRESHOLD: SHUTTING RACK${rack_num} DOWN!"
    # Must run regardless of DEBUG: this is the protective action itself.
    ./vig_ipmi.sh "-c${rack_num}" -s 0 -p "$POWER_MODE"
  else
    debug_log "$DATE RACK${rack_num} NODES ABOVE THRESHOLD: ${hot_nodes} Threshold not exceeded. No action"
  fi
}

main() {
  local rack_num

  cd "$WORK_DIR" || {
    printf 'cannot cd to %s\n' "$WORK_DIR" >&2
    exit 1
  }

  THRESHOLD=$(read_threshold "${WORK_DIR}/threshold.temp")
  # Without a usable threshold every comparison would fail and no rack would
  # ever be shut down, so stop loudly instead.
  if ! is_integer "$THRESHOLD"; then
    printf 'no integer threshold found in %s/threshold.temp (got "%s")\n' \
      "$WORK_DIR" "$THRESHOLD" >&2
    exit 1
  fi

  for rack_num in "${RACK_NUMBERS[@]}"; do
    check_rack "$rack_num"
  done
}

main "$@"
Scripts on headnode (Rittal LCP Check)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh
#!/bin/bash
#
# vig_snmp_chk.sh - front/rear air temperature check (runs on the headnode).
#
# Each unit in LCP_UNITS is walked over SNMP. Six readings are taken from the
# walk; readings 1, 3, 5 are averaged as the front value and readings 2, 4, 6
# as the rear value. When either average is above its threshold
# (front_air_threshold.temp / rear_air_threshold.temp), the racks mapped to
# that unit in RACKS_TO_SHUT are powered down through ./vig_ipmi.sh.
#
# Progress lines go to stdout; problems go to stderr.

# 1 = print progress lines. This only controls logging: the shutdown is
# issued whether or not logging is enabled.
DEBUG=1
# Units to query, and for each one (same position) the rack numbers to power
# down; an entry may hold several rack numbers separated by blanks.
LCP_UNITS=(10.24.5.11 10.24.5.12 10.24.5.13)
RACKS_TO_SHUT=(1 "2 3" 4)
# NOTE(review): SNMP community string is stored in clear text in this script.
COM_STRING="gild82urdu"
# Subtree walked on every unit.
TEMP_OID=.1.3.6.1.4.1.2606.4.2.4.5.2.1.5
# Power action handed to "vig_ipmi.sh -p".
POWER_MODE=soft
# Holds the *_air_threshold.temp files and vig_ipmi.sh.
WORK_DIR=/opt/kusu/etc/vig_temp_chk
# One timestamp for the whole run.
DATE=$(date)

# Print a progress line on stdout when DEBUG is 1.
debug_log() {
  if [ "$DEBUG" -eq 1 ]; then
    printf '%s\n' "$*"
  fi
}

# Succeed when $1 is a (possibly negative) decimal integer.
is_integer() {
  [[ "$1" =~ ^-?[0-9]+$ ]]
}

# Succeed when $1 is a decimal number, with or without a fractional part.
is_number() {
  [[ "$1" =~ ^-?[0-9]+([.][0-9]+)?$ ]]
}

# Print the threshold stored in file $1: on the first line the first "=" is
# turned into a blank and the second whitespace-separated field is printed.
read_threshold() {
  awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$1"
}

# Print the readings of unit $1, one per line: the value column of the line
# matching "5.13" and of the five lines after it.
# NOTE(review): "5.13" is used as a regular expression (the dot matches any
# character), exactly as before - confirm it selects only the intended row.
unit_readings() {
  snmpwalk -v1 -Oq -c "$COM_STRING" "$1" "$TEMP_OID" \
    | grep -A 5 -- '5.13' \
    | awk '{print $2}'
}

# Print the truncated mean of three values. bc is kept so that readings with
# a fractional part are still accepted.
average3() {
  printf '( %s + %s + %s ) / 3\n' "$1" "$2" "$3" | bc
}

# Power down every rack number contained in the blank-separated list $1.
shutdown_racks() {
  local rack
  local -a racks=()
  read -r -a racks <<< "$1"
  for rack in "${racks[@]}"; do
    debug_log "./vig_ipmi.sh -c${rack} -s 0 -p $POWER_MODE"
    ./vig_ipmi.sh "-c${rack}" -s 0 -p "$POWER_MODE"
  done
}

# Check unit $1 and, when overheated, power down the racks listed in $2.
# Returns 1 when the unit gave no usable readings.
check_unit() {
  local unit=$1 racks=$2
  local -a temps=()
  local reading front_avg rear_avg
  local line=""
  local overheated=0

  # Split on whitespace without globbing; read returns non-zero at end of
  # input but the array is filled regardless.
  read -r -d '' -a temps < <(unit_readings "$unit")

  # A unit that does not answer must not be reported as "OK".
  if (( ${#temps[@]} < 6 )); then
    printf '%s UNIT %s: expected 6 readings, got %s - unit not checked\n' \
      "$DATE" "$unit" "${#temps[@]}" >&2
    return 1
  fi
  for reading in "${temps[@]:0:6}"; do
    if ! is_number "$reading"; then
      printf '%s UNIT %s: non-numeric reading "%s" - unit not checked\n' \
        "$DATE" "$unit" "$reading" >&2
      return 1
    fi
  done

  front_avg=$(average3 "${temps[0]}" "${temps[2]}" "${temps[4]}")
  rear_avg=$(average3 "${temps[1]}" "${temps[3]}" "${temps[5]}")
  if ! is_integer "$front_avg" || ! is_integer "$rear_avg"; then
    printf '%s UNIT %s: could not compute averages (front "%s", rear "%s")\n' \
      "$DATE" "$unit" "$front_avg" "$rear_avg" >&2
    return 1
  fi

  if [ "$DEBUG" -eq 1 ]; then
    line="${DATE} UNIT $unit TEMPS: ${temps[*]} "
  fi

  # The threshold/average pairs are printed even with logging off, as before.
  line+="F_Thrsh: $FRONT_THRESHOLD F_Avg: $front_avg "
  if [ "$front_avg" -gt "$FRONT_THRESHOLD" ]; then
    overheated=1
    if [ "$DEBUG" -eq 1 ]; then line+="F_Thrsh EXCEEDED, SHUTDOWN "; fi
  else
    if [ "$DEBUG" -eq 1 ]; then line+="F_Thrsh OK. No action "; fi
  fi

  line+="R_Thrsh: $REAR_THRESHOLD R_Avg: $rear_avg "
  if [ "$rear_avg" -gt "$REAR_THRESHOLD" ]; then
    overheated=1
    if [ "$DEBUG" -eq 1 ]; then line+="R_Thrsh EXCEEDED, SHUTDOWN "; fi
  else
    if [ "$DEBUG" -eq 1 ]; then line+="R_Thrsh OK. No action "; fi
  fi
  printf '%s\n' "$line"

  # Issued once per unit even when both averages are too high, and never
  # dependent on DEBUG: this is the protective action itself.
  if (( overheated )); then
    shutdown_racks "$racks"
  fi
}

main() {
  local idx

  cd "$WORK_DIR" || {
    printf 'cannot cd to %s\n' "$WORK_DIR" >&2
    exit 1
  }

  FRONT_THRESHOLD=$(read_threshold "${WORK_DIR}/front_air_threshold.temp")
  REAR_THRESHOLD=$(read_threshold "${WORK_DIR}/rear_air_threshold.temp")
  # Without usable thresholds no comparison can succeed, so stop loudly.
  if ! is_integer "$FRONT_THRESHOLD" || ! is_integer "$REAR_THRESHOLD"; then
    printf 'bad air thresholds in %s (front "%s", rear "%s")\n' \
      "$WORK_DIR" "$FRONT_THRESHOLD" "$REAR_THRESHOLD" >&2
    exit 1
  fi

  # Iterate by position so a skipped unit cannot shift the unit-to-rack mapping.
  for idx in "${!LCP_UNITS[@]}"; do
    check_unit "${LCP_UNITS[idx]}" "${RACKS_TO_SHUT[idx]}"
  done
}

main "$@"
Scripts on headnode (IPMI Wrapper)
# File: ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_ipmi.sh
#!/bin/bash
#
# vig_ipmi.sh - send an IPMI power command to every node of a node list.
#
# Usage: vig_ipmi.sh (-a | -c<1-6>) -p <power-action> [-s <seconds>]
#   -a           use the node list ./.hosts.ipmi.all
#   -c<N>        use the node list ./.hosts.ipmi.cab<N>, N = 1..6
#   -p <action>  word handed to "ipmitool ... power" (e.g. soft)
#   -s <secs>    pause after each node (default 0)
#
# The node list is looked up relative to the current directory, so callers
# must cd to the directory holding the .hosts.ipmi.* files first.
# Must be run as root.

# NOTE(review): BMC credentials were hard-coded as ADMIN/ADMIN. They remain
# the defaults but can now be overridden from the environment.
IPMI_USER="${IPMI_USER:-ADMIN}"
IPMI_PASSWORD="${IPMI_PASSWORD:-ADMIN}"

# Print the usage line on stderr.
usage() {
  printf 'Usage: %s (-a | -c<1-6>) -p <power-action> [-s <seconds>]\n' \
    "${0##*/}" >&2
}

# Print an error plus the usage line on stderr and exit with status 2.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  usage
  exit 2
}

main() {
  local nodelist=""
  local power=""
  local pause=0
  local hosts_file node
  local -a nodes=()

  if [ "$UID" -ne 0 ]; then
    echo "User must be root!!!"
    exit 1
  fi

  if [ $# -eq 0 ]; then
    echo "Args ya fool!"
    usage
    # Same status the original "exit -1" produced.
    exit 255
  fi

  while [ $# -gt 0 ]; do
    case "$1" in
      -a)
        nodelist=all
        shift
        ;;
      -c[1-6])
        nodelist="cab${1#-c}"
        shift
        ;;
      -s | -p)
        # Without this check "shift 2" fails on a trailing option, nothing
        # is consumed and the loop never ends.
        [ $# -ge 2 ] || die "option $1 requires a value"
        if [ "$1" = "-s" ]; then
          pause=$2
        else
          power=$2
        fi
        shift 2
        ;;
      *)
        # Unknown words used to be left in place, which looped forever.
        die "unknown argument: $1"
        ;;
    esac
  done

  [ -n "$nodelist" ] || die "no node list selected (use -a or -c<1-6>)"
  [ -n "$power" ] || die "no power action given (use -p)"

  hosts_file="./.hosts.ipmi.${nodelist}"
  if [[ ! -r "$hosts_file" ]]; then
    printf '%s: cannot read node list %s\n' "${0##*/}" "$hosts_file" >&2
    exit 1
  fi
  # Split the file on any whitespace (one entry per word), without globbing.
  # read returns non-zero at end of file; the array is filled regardless.
  read -r -d '' -a nodes < "$hosts_file"

  for node in "${nodes[@]}"; do
    printf '[%s]: ' "$node"
    ipmitool -U "$IPMI_USER" -P "$IPMI_PASSWORD" -H "$node" power "$power"
    sleep "$pause"
  done
}

main "$@"
Scripts on headnode (cron)
# File: ipmi_monitoring/headnode/etc/cron.d/vig_rack_temp_chk
# Overheat checks run as root; each job appends its stdout to its own log
# under /var/log.
# Fields: minute hour day-of-month month day-of-week user command
#
# Node temperature checks for the racks and the storage rack: every 5 minutes.
*/5 * * * * root /opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh >> /var/log/vig_rack_temp_chk.log
*/5 * * * * root /opt/kusu/etc/vig_temp_chk/vig_storage_rack_temp_chk.sh >> /var/log/vig_storage_rack_temp_chk.log
# SNMP air temperature check: minutes 0, 6, 12, ... 54 of every hour.
*/6 * * * * root /opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh >> /var/log/vig_snmp_chk.log
Scripts on compute node (Shutdown)
# File: ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/check_temp.sh
#!/bin/bash
#############################
#
# Viglen Temperature Checker
#
# Runs on a compute node. Reads the local temperature (field 10 of the first
# line printed by "ipmitool sdr type Temperature") and requests a soft
# chassis power-off when it is above the value stored in threshold.temp.
#
# Progress lines go to stdout; problems go to stderr with exit status 1.
#
#############################

# 1 = print progress lines. Only controls logging, never the shutdown.
DEBUG=1
THRESHOLD_FILE=/opt/kusu/etc/vig_temp_chk/threshold.temp
DATE=$(date)

# Print a progress line on stdout when DEBUG is 1.
debug_log() {
  if [ "$DEBUG" -eq 1 ]; then
    printf '%s\n' "$*"
  fi
}

# Succeed when $1 is a (possibly negative) decimal integer.
is_integer() {
  [[ "$1" =~ ^-?[0-9]+$ ]]
}

main() {
  local threshold current_temp

  # First line of the file: the first "=" becomes a blank and the second
  # whitespace-separated field is the threshold (e.g. "NAME=60" gives "60").
  threshold=$(awk 'NR == 1 { sub(/=/, " "); print $2; exit }' "$THRESHOLD_FILE")
  current_temp=$(ipmitool sdr type Temperature | awk 'NR == 1 { print $10; exit }')

  # An empty or non-numeric value used to make the comparison fail and the
  # script report "Temperature OK"; report the real problem instead.
  if ! is_integer "$threshold"; then
    printf '%s no integer threshold found in %s (got "%s")\n' \
      "$DATE" "$THRESHOLD_FILE" "$threshold" >&2
    exit 1
  fi
  if ! is_integer "$current_temp"; then
    printf '%s temperature unreadable (got "%s"), no action taken\n' \
      "$DATE" "$current_temp" >&2
    exit 1
  fi

  # "[" compares as plain decimal, so a reading like "08" is not
  # misinterpreted as octal.
  if [ "$current_temp" -gt "$threshold" ]; then
    debug_log "${DATE} CURRENT TEMP: ${current_temp} THRESHOLD: ${threshold} THRESHOLD EXCEEDED, INITIATING SOFT SHUTDOWN"
    ipmitool chassis power soft
  else
    debug_log "${DATE} CURRENT TEMP: ${current_temp} THRESHOLD: ${threshold} Temperature OK. Not shutting down"
  fi
}

main "$@"
File List from the archive
ipmi_monitoring/
ipmi_monitoring/headnode/
ipmi_monitoring/headnode/opt/
ipmi_monitoring/headnode/opt/kusu/
ipmi_monitoring/headnode/opt/kusu/etc/
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab6
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/storage_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_ipmi.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab3
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.all
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab2
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/front_air_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_storage_rack_temp_chk.sh
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab4
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/rear_air_threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/threshold.temp
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_rack_temp_chk.sh.orig
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab1
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/.hosts.ipmi.cab5
ipmi_monitoring/headnode/opt/kusu/etc/vig_temp_chk/vig_snmp_chk.sh
ipmi_monitoring/headnode/etc/
ipmi_monitoring/headnode/etc/cron.d/
ipmi_monitoring/headnode/etc/cron.d/vig_rack_temp_chk
ipmi_monitoring/compute/
ipmi_monitoring/compute/opt/
ipmi_monitoring/compute/opt/kusu/
ipmi_monitoring/compute/opt/kusu/etc/
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/check_temp.sh
ipmi_monitoring/compute/opt/kusu/etc/vig_temp_chk/threshold.temp
ipmi_monitoring/compute/etc/
ipmi_monitoring/compute/etc/cron.d/
ipmi_monitoring/compute/etc/cron.d/viglen_temp_check
Use logrotate for the logs
- Contents of file: /etc/logrotate.d/viglen_logs
# Daily rotation of the monitoring logs, keeping 12 rotated copies each.
# copytruncate copies the live file and then truncates it in place, so jobs
# that append with ">>" keep writing to the same file.
# NOTE: per logrotate(8), "create" has no effect when copytruncate is used;
# the directive is kept as it was in the original file.
/var/log/vig_pdu_load.log {
    daily
    create 600 root root
    copytruncate
    rotate 12
}
/var/log/vig_snmp_chk.log {
    daily
    create 600 root root
    copytruncate
    rotate 12
}
/var/log/vig_rack_temp_chk.log {
    daily
    create 600 root root
    copytruncate
    rotate 12
}
# Added: this log is appended to every 5 minutes by the cron.d job for
# vig_storage_rack_temp_chk.sh and was not rotated before.
/var/log/vig_storage_rack_temp_chk.log {
    daily
    create 600 root root
    copytruncate
    rotate 12
}
/var/log/vig_node-envcheck.log {
    daily
    create 600 root root
    copytruncate
    rotate 12
}
- Verify logrotate works
logrotate -d /etc/logrotate.conf