GIGAIO:Lab Setup


[root@vcontroller test]# cat test.sh 
#!/bin/bash
#SBATCH -N 1 # number of nodes
#SBATCH --gres=gpu:k80:8

hostname
lspci | grep NVIDIA
nvidia-smi

sleep 18
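
test.sh requests one node with eight K80 GPUs via --gres, so submitting it against an idle CLOUD node is what exercises the suspend/resume path below. A minimal submit-and-poll sketch, assuming sbatch and squeue are on the controller's PATH (the helper name submit_and_wait is only for illustration):

# Sketch: submit test.sh and poll until the job is running or has left the
# queue, i.e. until slurm_resume has bound the requested GPUs and the node
# has come back up.
import subprocess
import time

def submit_and_wait(script="test.sh", poll_seconds=10):
    # --parsable makes sbatch print only the job id (optionally ";clustername")
    out = subprocess.run(["sbatch", "--parsable", script],
                         capture_output=True, text=True, check=True).stdout
    job_id = out.strip().split(';')[0]
    print("Submitted job " + job_id)
    while True:
        state = subprocess.run(["squeue", "-j", job_id, "-h", "-o", "%T"],
                               capture_output=True, text=True).stdout.strip()
        if state in ("", "RUNNING", "COMPLETED"):  # empty output means the job already left the queue
            print("Job " + job_id + " is " + (state or "done"))
            return
        print("Job " + job_id + " is " + state)
        time.sleep(poll_seconds)

submit_and_wait()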


[root@vcontroller sbin]# cat slurm_resume 
#!/usr/bin/python3

import sys

import subprocess
import json
def run_command(command):
    cmd_arr = command.split(' ') 
    process = subprocess.Popen(cmd_arr, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(cmd_arr)
    stdout, stderr = process.communicate()
    print(stdout)
    print(stderr)
    return str(stdout)

enabled_ports = [9, 17]
nodelist = ["node0001", "node0002"]
def main():

    nodename = str(sys.argv[1])
    print(str(sys.argv))
    
    if nodename in nodelist:
        print("Nodename is " + nodename)
    else:
        exit(0)
    

    if nodename == 'node0001':
        part_id = '0'
    elif nodename == 'node0002':
        part_id = '1'
    else:
        exit(0)

    # Find job id assigned to this node
    cmd_out = run_command("squeue -o %A|%N")
    
    print(cmd_out)
    jobid = '-1'
    # run_command returns str() of the bytes output, so newlines appear as the
    # two characters \n rather than real line breaks
    for line in cmd_out.split("\\n"):
        if nodename in line:
            jobid = line.split('|')[0]
            break
    if jobid == '-1':
        exit(0)

    print("Job id is " + jobid)

    # Find num of gpus requested by this job
    cmd_output = run_command("scontrol show job " + str(jobid))
    num_req_gpus = cmd_output.split('TresPerNode=gpu:k80:')[1].split('\\n')[0]
   
    print("Num of req gpus is " + str(num_req_gpus).split('\\n')[0])
    if num_req_gpus == "8" or num_req_gpus == "16":
        pass
    else:
        print("Invalid number of gpus requested by job " + num_req_gpus)
        # Cancel job    
        exit(1)




    # Get current state of ports
    cmd_output = run_command("fmtool -s virgo12a")
    print("Getting unbound ports")

    cmd_json = (cmd_output.split('Response:')[1].split('Success')[0]).replace("\\n"," ")
    #print(cmd_json)
    cmd_obj = json.loads(cmd_json)
    unbound = cmd_obj["body"]["fabric"]["switches"][0]["binding"]["unbound"]
    
    if len(unbound) == 0:
        print("No unbound ports available")
        #scancel jobid assigned to this node
        exit(1)
    print("Unbound ports")
    print(unbound)
    
    # If num of requested gpus are unbound, then bind them, else scancel the jobid

    if num_req_gpus == "8":
        if 9 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:9 virgo12a')
            pass
        elif 17 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:17 virgo12a')
            pass
        else:
            print("9 or 17 port not present in unbound")
            exit(1)
    elif num_req_gpus == "16":
        if 9 in unbound and 17 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:9 virgo12a')
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:17 virgo12a')
            pass
        else:
            print("9 and 17 port not present in unbound")
            exit(1)


        

    # Reboot the node
    run_command("ssh " + nodename + " -t reboot")
main()
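
slurm_resume reads a single node name from sys.argv[1], which is fine while only one CLOUD node resumes at a time. Slurm hands ResumeProgram/SuspendProgram the affected nodes as a hostlist expression (for example node[0001-0002]), so a more general entry point would expand that argument first. A sketch using scontrol show hostnames, which prints one hostname per line:

# Sketch: expand the hostlist expression passed by slurmctld into individual
# node names before running the existing bind-and-reboot logic per node.
import subprocess
import sys

def expand_hostlist(expr):
    out = subprocess.run(["scontrol", "show", "hostnames", expr],
                         capture_output=True, text=True, check=True).stdout
    return [line for line in out.splitlines() if line]

for nodename in expand_hostlist(sys.argv[1]):
    print("Resuming " + nodename)
    # ... same per-node logic as main() above ...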


[root@vcontroller sbin]# cat slurm_suspend 
#!/bin/bash
# Example SuspendProgram
echo "`date` Suspend invoked $0 $*" >>/var/log/power_save.log
echo "Power Save Module"
fmtool -U switch:virgo12a,port_id:9 virgo12a >>/var/log/power_save.log
fmtool -U switch:virgo12a,port_id:17 virgo12a >>/var/log/power_save.log
exit 0


[root@vcontroller ~]# cat slurm.py 
import subprocess
import json
def run_command(command):
    cmd_arr = command.split(' ') 
    process = subprocess.Popen(cmd_arr, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(cmd_arr)
    stdout, stderr = process.communicate()
    print(stdout)
    print(stderr)
    return str(stdout)

enabled_ports = [9, 17]
def main():
    # Find if there is a job available
    cmd_output = run_command("squeue -t PD -o %i|%r -h")
    job_arr = cmd_output.split('\n')
    if len(job_arr) == 0:
        print("No jobs present in squeue")
        exit(1)
    job_id = "-1"
    for job in job_arr:
        if "ReqNodeNotAvail" in job:
            job_id = job.split('|')[0]
            print("Job id is " + str(job_id))
            break
    if job_id == "-1":
        print("No PENDING jobs present in squeue")
        exit(1)
    # Find how many gpus it requests
    cmd_output = run_command("scontrol show job " + str(job_id))
    num_req_gpus = cmd_output.split('TresPerNode=gpu:k80:')[1].split('\n')[0]
   
    print("Num of req gpus is " + str(num_req_gpus).split('\n')[0])
    if num_req_gpus == "8" or num_req_gpus == "16":
        pass
    else:
        print("Invalid number of gpus requested by job " + num_req_gpus)
        # Cancel job    
        exit(1)

    # Get current state of ports
    cmd_output = run_command("fmtool -s virgo12a")
    cmd_json = cmd_output.split('Response:')[1].split('Success')[0]
    cmd_obj = json.loads(cmd_json)
    unbound = cmd_obj["body"]["fabric"]["switches"][0]["binding"]["unbound"]
    
    if len(unbound) == 0:
        print("No unbound ports available")
        exit(1)
    print("Unbound ports")
    print(unbound)
    # Find available slurm node and corresponding partition id
    cmd_output = run_command('sinfo')
    cmd_arr = cmd_output.split('\n')[1:]
    part_id = '-1'
    nodename = 'dontexist'
    for node in cmd_arr:
        if 'idle' in node:
            if 'n0001' in node:
                part_id = '0'
                nodename = 'n0001'
            elif 'n0002' in node:
                part_id = '1'
                nodename = 'n0002'
            break

    if part_id == '-1':
        print("No idle slurm nodes available")
        exit(1)
    print("Partition ID selected : " + part_id)
    # Check if requested gpus are available
    # Bind gpus to available node
    if num_req_gpus == "8":
        if 9 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:9 virgo12a')
            pass
        elif 17 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:17 virgo12a')
            pass
        else:
            print("9 or 17 port not present in unbound")
            exit(1)
    else:
        if 9 in unbound and 17 in unbound:
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:9 virgo12a')
            run_command('fmtool -B switch:virgo12a,part_id:' + part_id + ',port_id:17 virgo12a')
            pass
        else:
            print("9 and 17 port not present in unbound")
            exit(1)

    
    # Update gres number with scontrol for slurm node
    run_command('scontrol update Nodename=' + nodename + ' gres=gpu:k80:' + str(num_req_gpus))
    # scontrol reboot
    #run_command('scontrol reboot '+ nodename)
    

main()
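
Both slurm_resume and slurm.py wrap subprocess.Popen and return str(stdout), i.e. the printable form of a bytes object; under Python 3 that is why slurm_resume splits on the two characters \n rather than a real newline. A possible cleanup (a sketch, not what is currently deployed) is to have run_command return decoded text so callers can split on real newlines:

# Sketch of run_command using subprocess.run with text=True: the return value
# is already a str with real newlines, so callers can use split('\n') directly.
import subprocess

def run_command(command):
    cmd_arr = command.split(' ')
    result = subprocess.run(cmd_arr, capture_output=True, text=True)
    print(cmd_arr)
    print(result.stdout)
    print(result.stderr)
    return result.stdout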



[root@vcontroller slurm]# cat slurm.conf
#
# Ansible managed
#
#GresTypes=gpu
ClusterName=cluster
ControlMachine=vcontroller,
Include /etc/slurm/slurm-nodes.conf
Include /etc/slurm/slurm-partitions.conf
Include /etc/slurm/slurm-user.conf
#Include /etc/slurm/gres.conf
#Include /etc/slurm/slurm-health.conf

TopologyPlugin=topology/tree

SwitchType=switch/none
TaskPlugin=task/none
MpiDefault=none
ReturnToService=2

# Accounting.
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=vcontroller
AccountingStorageUser=slurm
#AccountingStorageEnforce=qos,limits
#AccountingStoragePass=
#AccountingStoragePort=

# Logging
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdLogFile=/var/log/slurm/slurmd.log

StateSaveLocation=/vscaler/local/var/spool/slurm
AuthType=auth/munge
ControlAddr=10.6.44.152

#Name=gpu Type=k80 File=/dev/nvidia[0-15]

DebugFlags=Gres
RebootProgram = "/sbin/shutdown -r now"
GresTypes=gpu

#SelectType=select/cons_res
#SelectTypeParameters=CR_CORE_Memory

SuspendProgram=/usr/sbin/slurm_suspend
ResumeProgram=/usr/sbin/slurm_resume
SuspendTime=15
NodeName=node0001 Weight=8 Feature=cloud State=CLOUD Gres=gpu:k80:16
NodeName=node0002 Weight=8 Feature=cloud State=CLOUD Gres=gpu:k80:16

ResumeTimeout=300
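
The power-saving pieces of slurm.conf tie the scripts together: an idle CLOUD node is suspended after SuspendTime=15 seconds via /usr/sbin/slurm_suspend, resumed via /usr/sbin/slurm_resume, and ResumeTimeout=300 gives the resume path five minutes to bind the fabric GPUs and reboot the node. A quick sanity-check sketch that slurmctld is running with these values (scontrol show config prints the merged configuration as "Key = value" lines):

# Sketch: print the suspend/resume related settings slurmctld actually loaded.
import subprocess

out = subprocess.run(["scontrol", "show", "config"],
                     capture_output=True, text=True, check=True).stdout
for line in out.splitlines():
    key = line.split('=')[0].strip()
    if key in ("SuspendProgram", "ResumeProgram", "SuspendTime",
               "ResumeTimeout", "ReturnToService"):
        print(line.strip())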