GIGAIO:Lab Setup
Revision as of 22:13, 8 September 2020
[root@vcontroller test]# cat test.sh
#!/bin/bash
#SBATCH -N 1             # number of nodes
#SBATCH --gres=gpu:k80:8
hostname
lspci | grep NVIDIA
nvidia-smi
sleep 18
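To exercise this job script, one option is to submit it with sbatch and watch the job's state and pending reason, which the bind/resume scripts below key on. A minimal sketch using standard Slurm commands (nothing beyond this page's test.sh is assumed):

# Submit the test job and show job id, state, pending reason and node list
sbatch test.sh
squeue -o "%i|%T|%r|%N"

# Once the job runs, slurm-<jobid>.out should list the NVIDIA devices
# that were attached to the node over the fabric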
[root@vcontroller sbin]# cat slurm_resume
#!/usr/bin/python3
import sys
import subprocess
import json

def run_command(command):
    cmd_arr = command.split(' ')
    process = subprocess.Popen(cmd_arr, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(cmd_arr)
    stdout, stderr = process.communicate()
    print(stdout)
    print(stderr)
    # Decode the bytes so the '\n' splits below operate on real newlines
    return stdout.decode()

enabled_ports = [9, 17]
nodelist = ["node0001", "node0002"]

def main():
    # Slurm passes the node to resume as the first argument
    nodename = str(sys.argv[1])
    print(str(sys.argv))
    if nodename in nodelist:
        print("Nodename is " + nodename)
    else:
        exit(0)
    if nodename == 'node0001':
        part_id = '0'
    elif nodename == 'node0002':
        part_id = '1'
    else:
        exit(0)
    # Find job id assigned to this node
    cmd_out = run_command("squeue -o %A|%N")
    print(cmd_out)
    jobid = '-1'
    for line in cmd_out.split('\n'):
        if nodename in line:
            jobid = line.split('|')[0]
            break
    if jobid == '-1':
        exit(0)
    print("Job id is " + jobid)
    # Find num of gpus requested by this job
    cmd_output = run_command("scontrol show job " + str(jobid))
    num_req_gpus = cmd_output.split('TresPerNode=gpu:k80:')[1].split('\n')[0]
    print("Num of req gpus is " + num_req_gpus)
    if num_req_gpus == "8" or num_req_gpus == "16":
        pass
    else:
        print("Invalid number of gpus requested by job " + num_req_gpus)
        # Cancel job
        exit(1)
    # Get current state of ports
    cmd_output = run_command("fmtool -s virgo12a")
    print("Getting unbound ports")
    cmd_json = (cmd_output.split('Response:')[1].split('Success')[0]).replace('\n', ' ')
    #print(cmd_json)
    cmd_obj = json.loads(cmd_json)
    unbound = cmd_obj["body"]["fabric"]["switches"][0]["binding"]["unbound"]
    if len(unbound) == 0:
        print("No unbound ports available")
        #scancel jobid assigned to this node
        exit(1)
    print("Unbound ports")
    print(unbound)
    # If num of requested gpus are unbound, then bind them, else scancel the jobid
    if num_req_gpus == "8":
        if 9 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:9 virgo12a')
        elif 17 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:17 virgo12a')
        else:
            print("9 or 17 port not present in unbound")
            exit(1)
    elif num_req_gpus == "16":
        if 9 in unbound and 17 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:9 virgo12a')
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:17 virgo12a')
        else:
            print("9 and 17 port not present in unbound")
            exit(1)
    # Reboot the node
    run_command("ssh " + nodename + " -t reboot")

main()
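slurmctld invokes the configured ResumeProgram with the selected node name(s) as its argument when a CLOUD node is picked for a pending job. Assuming the script is installed as /usr/sbin/slurm_resume (as configured in slurm.conf below), it can also be exercised by hand:

# Resume node0001 the same way slurmctld would invoke the ResumeProgram
/usr/sbin/slurm_resume node0001

# Check that the node came back and shows the expected GPU gres count
scontrol show node node0001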
[root@vcontroller sbin]# cat slurm_suspend
#!/bin/bash
# Example SuspendProgram
echo "`date` Suspend invoked $0 $*" >>/var/log/power_save.log
echo "Power Save Module"
fmtool -U switch:virgo12a,port_id:9 virgo12a >>/var/log/power_save.log
fmtool -U switch:virgo12a,port_id:17 virgo12a >>/var/log/power_save.log
exit 0
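The suspend path can be checked manually with the same fmtool calls the script uses; a sketch (virgo12a is this lab's switch name):

# Unbind both GPU ports by hand, then confirm they appear as unbound
fmtool -U switch:virgo12a,port_id:9 virgo12a
fmtool -U switch:virgo12a,port_id:17 virgo12a
fmtool -s virgo12a

# The suspend script also appends its output here
tail /var/log/power_save.log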
[root@vcontroller ~]# cat slurm.py
import subprocess
import json

def run_command(command):
    cmd_arr = command.split(' ')
    process = subprocess.Popen(cmd_arr, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(cmd_arr)
    stdout, stderr = process.communicate()
    print(stdout)
    print(stderr)
    # Decode the bytes so the '\n' splits below operate on real newlines
    return stdout.decode()

enabled_ports = [9, 17]

def main():
    # Find if there is a pending job waiting on an unavailable node
    cmd_output = run_command("squeue -t PD -o %i|%r -h")
    job_arr = cmd_output.split('\n')
    if len(job_arr) == 0:
        print("No jobs present in squeue")
        exit(1)
    job_id = "-1"
    for job in job_arr:
        if "ReqNodeNotAvail" in job:
            job_id = job.split('|')[0]
            print("Job id is " + str(job_id))
            break
    if job_id == "-1":
        print("No PENDING jobs present in squeue")
        exit(1)

    # Find how many gpus it requests
    cmd_output = run_command("scontrol show job " + str(job_id))
    num_req_gpus = cmd_output.split('TresPerNode=gpu:k80:')[1].split('\n')[0]
    print("Num of req gpus is " + num_req_gpus)
    if num_req_gpus == "8" or num_req_gpus == "16":
        pass
    else:
        print("Invalid number of gpus requested by job " + num_req_gpus)
        # Cancel job
        exit(1)

    # Get current state of ports
    cmd_output = run_command("fmtool -s virgo12a")
    cmd_json = cmd_output.split('Response:')[1].split('Success')[0]
    cmd_obj = json.loads(cmd_json)
    unbound = cmd_obj["body"]["fabric"]["switches"][0]["binding"]["unbound"]
    if len(unbound) == 0:
        print("No unbound ports available")
        exit(1)
    print("Unbound ports")
    print(unbound)

    # Find available slurm node and corresponding partition id
    cmd_output = run_command('sinfo')
    cmd_arr = cmd_output.split('\n')[1:]
    part_id = '-1'
    nodename = 'dontexist'
    for node in cmd_arr:
        if 'idle' in node:
            if 'n0001' in node:
                part_id = '0'
                nodename = 'n0001'
            elif 'n0002' in node:
                part_id = '1'
                nodename = 'n0002'
            break
    if part_id == '-1':
        print("No idle slurm nodes available")
        exit(1)
    print("Partition ID selected : " + part_id)

    # Check if requested gpus are available
    # Bind gpus to available node
    if num_req_gpus == "8":
        if 9 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:9 virgo12a')
        elif 17 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:17 virgo12a')
        else:
            print("9 or 17 port not present in unbound")
            exit(1)
    else:
        if 9 in unbound and 17 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:9 virgo12a')
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:17 virgo12a')
        else:
            print("9 and 17 port not present in unbound")
            exit(1)

    # Update gres number with scontrol for slurm node
    run_command('scontrol update Nodename=' + nodename + ' gres=gpu:k80:' + str(num_req_gpus))
    # scontrol reboot
    #run_command('scontrol reboot ' + nodename)

main()
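slurm.py is the standalone variant of the bind workflow (find a pending job, bind ports, update the node's gres). Nothing on this page states how it is triggered; one assumption would be to run it periodically from cron on the controller, for example (hypothetical, not part of this setup):

# Hypothetical cron entry: poll for pending jobs every minute and rebind
# fabric ports as needed, logging to a local file
* * * * * /usr/bin/python3 /root/slurm.py >> /var/log/slurm_fabric.log 2>&1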
<nowiki>
[root@vcontroller slurm]# cat slurm.conf
#
# Ansible managed
#
#GresTypes=gpu
ClusterName=cluster
ControlMachine=vcontroller,
Include /etc/slurm/slurm-nodes.conf
Include /etc/slurm/slurm-partitions.conf
Include /etc/slurm/slurm-user.conf
#Include /etc/slurm/gres.conf
#Include /etc/slurm/slurm-health.conf

TopologyPlugin=topology/tree

SwitchType=switch/none
TaskPlugin=task/none
MpiDefault=none
ReturnToService=2

# Accounting.
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=vcontroller
AccountingStorageUser=slurm
#AccountingStorageEnforce=qos,limits
#AccountingStoragePass=
#AccountingStoragePort=

# Logging
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdLogFile=/var/log/slurm/slurmd.log

StateSaveLocation=/vscaler/local/var/spool/slurm
AuthType=auth/munge
ControlAddr=10.6.44.152

#Name=gpu Type=k80 File=/dev/nvidia[0-15]

DebugFlags=Gres
RebootProgram = "/sbin/shutdown -r now"
GresTypes=gpu

#SelectType=select/cons_res
#SelectTypeParameters=CR_CORE_Memory

SuspendProgram=/usr/sbin/slurm_suspend
ResumeProgram=/usr/sbin/slurm_resume
SuspendTime=15
NodeName=node0001 Weight=8 Feature=cloud State=CLOUD Gres=gpu:k80:16
NodeName=node0002 Weight=8 Feature=cloud State=CLOUD Gres=gpu:k80:16

ResumeTimeout=300
</nowiki>
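After changing slurm.conf (the included slurm-nodes.conf / slurm-partitions.conf files are not shown on this page), the power-save and gres settings can be sanity-checked from the controller; a sketch using standard Slurm commands:

# Push the new configuration to slurmctld and confirm the power-save knobs
scontrol reconfigure
scontrol show config | grep -i -E 'Suspend|Resume'

# Show configured gres and state per node
sinfo -N -o "%N %G %T"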