Difference between revisions of "GIGAIO:Lab Setup"
Jump to navigation
Jump to search
(Created page with " <nowiki> [root@vcontroller test]# cat test.sh #!/bin/bash #SBATCH -N 1 # number of nodes #SBATCH --gres=gpu:k80:8 hostname lspci | grep NVIDIA nvidia-smi sleep 18 </nowik...") |
(No difference)
|
Revision as of 22:11, 8 September 2020
[root@vcontroller test]# cat test.sh
#!/bin/bash
# Slurm batch job used to exercise the lab setup: requests one node with
# eight k80 GPUs, then prints the host name and the NVIDIA devices it sees.
#SBATCH -N 1 # number of nodes
#SBATCH --gres=gpu:k80:8

hostname
# List the NVIDIA PCI devices visible on the allocated node.
lspci | grep NVIDIA
nvidia-smi
# Keep the job alive briefly before it finishes.
sleep 18
[root@vcontroller sbin]# cat slurm_resume
#!/usr/bin/python3
import sys
import subprocess
import json
def run_command(command):
    """Run an external command and return its captured standard output.

    The command string is split on whitespace into an argument vector and
    executed directly, without a shell, so quoting is not interpreted and
    individual arguments cannot contain whitespace.

    The argument vector, the raw stdout bytes and the raw stderr bytes are
    printed for logging purposes.

    Args:
        command: the command line as a single string.

    Returns:
        ``str(stdout)`` of the captured bytes, i.e. the ``b'...'`` repr form in
        which a line break appears as the two characters backslash + ``n``.
        Callers in this file split on that sequence, so the format is kept.
    """
    # split() without an argument collapses runs of whitespace; split(' ')
    # would turn doubled or trailing spaces into empty-string arguments.
    cmd_arr = command.split()
    process = subprocess.Popen(cmd_arr, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(cmd_arr)
    stdout, stderr = process.communicate()
    print(stdout)
    print(stderr)
    return str(stdout)
# Switch port ids used by this setup.
# NOTE(review): this list is not referenced anywhere else in the visible
# script; main() uses the literal port ids 9 and 17 - confirm whether it
# should be driven from here.
enabled_ports = [9, 17]
# Node names main() acts on; for any other name it exits with status 0.
nodelist = ["node0001", "node0002"]
def _first_job_on_node(squeue_output, nodename):
    """Return the job id from the first squeue line mentioning nodename, or None.

    Accepts output whose line breaks are either real newlines or the literal
    two-character sequence backslash + ``n`` (the form produced by calling
    ``str()`` on bytes).
    """
    for line in squeue_output.replace("\\n", "\n").split("\n"):
        if nodename in line:
            return line.split('|')[0]
    return None


def _requested_gpu_count(job_info):
    """Return the digits following ``TresPerNode=gpu:k80:`` as a string, or None."""
    import re  # local import: the file's top-level import block is left untouched

    match = re.search(r'TresPerNode=gpu:k80:(\d+)', job_info)
    return match.group(1) if match else None


def _unbound_ports(fmtool_output):
    """Extract the unbound-port list from ``fmtool -s`` output, or None if unparsable.

    The JSON document is expected between the markers ``Response:`` and
    ``Success``; the list is read from
    body.fabric.switches[0].binding.unbound.
    """
    try:
        payload = fmtool_output.split('Response:')[1].split('Success')[0]
        # Literal backslash-n sequences are not valid JSON whitespace.
        document = json.loads(payload.replace("\\n", " "))
        return document["body"]["fabric"]["switches"][0]["binding"]["unbound"]
    except (IndexError, KeyError, TypeError, ValueError) as err:
        print("Could not parse fmtool output: " + repr(err))
        return None


def main():
    """Bind GPU switch ports to the node named in argv[1], then reboot it.

    Steps:
      1. Map the node name to a partition id (only nodes in ``nodelist``).
      2. Look up the job assigned to the node via ``squeue``.
      3. Read the number of k80 GPUs the job requested via ``scontrol``.
      4. Query the switch for unbound ports via ``fmtool -s``.
      5. Bind one port (8 GPUs) or both ports 9 and 17 (16 GPUs).
      6. Reboot the node over ssh.

    Exit status: 0 when the node is not handled or has no job; 1 when the
    node name argument is missing, the GPU request is missing/unsupported,
    the switch state cannot be parsed, or the required ports are not unbound.

    NOTE(review): the failure paths only exit; the job is not cancelled even
    though the original comments mention doing so.
    """
    if len(sys.argv) < 2:
        print("Usage: " + sys.argv[0] + " <nodename>")
        sys.exit(1)

    nodename = str(sys.argv[1])
    print(str(sys.argv))

    if nodename not in nodelist:
        sys.exit(0)
    print("Nodename is " + nodename)

    part_id = {'node0001': '0', 'node0002': '1'}.get(nodename)
    if part_id is None:
        sys.exit(0)

    # Find job id assigned to this node
    cmd_out = run_command("squeue -o %A|%N")
    print(cmd_out)
    jobid = _first_job_on_node(cmd_out, nodename)
    if jobid is None:
        sys.exit(0)
    print("Job id is " + jobid)

    # Find num of gpus requested by this job
    job_info = run_command("scontrol show job " + str(jobid))
    num_req_gpus = _requested_gpu_count(job_info)
    if num_req_gpus is None:
        print("No TresPerNode=gpu:k80 request found for job " + jobid)
        sys.exit(1)
    print("Num of req gpus is " + num_req_gpus)
    if num_req_gpus not in ("8", "16"):
        print("Invalid number of gpus requested by job " + num_req_gpus)
        # TODO: cancel job
        sys.exit(1)

    # Get current state of ports
    fmtool_output = run_command("fmtool -s virgo12a")
    print("Getting unbound ports")
    unbound = _unbound_ports(fmtool_output)
    if unbound is None:
        sys.exit(1)
    if len(unbound) == 0:
        print("No unbound ports available")
        # TODO: scancel jobid assigned to this node
        sys.exit(1)
    print("Unbound ports")
    print(unbound)

    # If the requested ports are unbound, bind them; otherwise give up.
    bind_prefix = 'fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:'
    if num_req_gpus == "8":
        # One port is enough; prefer 9, fall back to 17.
        if 9 in unbound:
            run_command(bind_prefix + '9 virgo12a')
        elif 17 in unbound:
            run_command(bind_prefix + '17 virgo12a')
        else:
            print("9 or 17 port not present in unbound")
            sys.exit(1)
    else:
        # 16 GPUs: both ports must be free.
        if 9 in unbound and 17 in unbound:
            run_command(bind_prefix + '9 virgo12a')
            run_command(bind_prefix + '17 virgo12a')
        else:
            print("9 and 17 port not present in unbound")
            sys.exit(1)

    # Reboot the node
    run_command("ssh " + nodename + " -t reboot")
# Run only when executed as a script, so that importing this file (e.g. to
# reuse run_command) does not trigger port binding or a node reboot.
if __name__ == "__main__":
    main()
[root@vcontroller sbin]# cat slurm_suspend
#!/bin/bash
# Example SuspendProgram
#
# Logs the invocation, then unbinds switch ports 9 and 17 on virgo12a.
# Always exits 0, even if an unbind fails.
#
# NOTE(review): both ports are unbound regardless of which node is named in
# the arguments - confirm this is intended when the two ports are bound to
# different nodes.

readonly LOG_FILE=/var/log/power_save.log
readonly SWITCH=virgo12a

echo "$(date) Suspend invoked $0 $*" >>"$LOG_FILE"
echo "Power Save Module"

for port_id in 9 17; do
  # Capture stderr as well so failed unbinds are visible in the log.
  fmtool -U "switch:${SWITCH},port_id:${port_id}" "$SWITCH" >>"$LOG_FILE" 2>&1
done

exit 0