Difference between revisions of "Bright: WLM Slurm"
Jump to navigation
Jump to search
(Created page with "== Information on the hosts == * Use <tt>sinfo</tt> <syntaxhighlight> [root@head1 ~]# sinfo PARTITION AVAIL TIMELIMIT NODES STATE NODELIST defq* up infinite 1 down* st...") |
|||
| Line 6: | Line 6: | ||
defq* up infinite 1 down* storage1 | defq* up infinite 1 down* storage1 | ||
defq* up infinite 16 idle gpuNode[01-16] | defq* up infinite 16 idle gpuNode[01-16] | ||
| + | </syntaxhighlight> | ||
| + | |||
| + | == Control Nodes == | ||
| + | <syntaxhighlight> | ||
| + | # Drain a node | ||
| + | scontrol update nodename=gpuNode02 state=drain reason=knackered | ||
| + | # Resume after draining | ||
| + | scontrol update nodename=gpuNode02 state=resume | ||
| + | # Stop jobs running | ||
| + | scontrol update nodename=gpuNode02 state=down | ||
| + | </syntaxhighlight> | ||
| + | |||
| + | == Handy Comparison of Slurm / PBS / SGE == | ||
| + | <syntaxhighlight> | ||
| + | |||
| + | Action Slurm Torque/PBS Maui SGE | ||
| + | ------------------------------------------------------------------------------------------------------------ | ||
| + | Get information about the job scontrol show job "jobid" qstat -f "jobid" checkjob | ||
| + | Display the queue info squeue qstat showq qstat | ||
| + | Delete a job scancel "jobid" qdel qdel | ||
| + | Submit a job srun/sbatch/salloc testjob qsub testjob msub qsub | ||
| + | Submit an interactive job salloc -N 4 -p active sh qsub -I qlogin | ||
| + | Display all job info squeue -al qstat -f | ||
| + | scontrol show job | ||
| + | Display job scontrol show job "jobID" qstat -f "jobID" | ||
| + | Display free processors srun --test-only -p normal -n 1 -t 10:00 sh showbf | ||
| + | Display the expected start time squeue --start -j "jobid" showstart "jobid" | ||
| + | Display blocked jobs squeue --start mdiag -b/showq -b | ||
| + | Display queues/partitions scontrol show partition qstat -Qf | ||
| + | Display queue sinfo -h qstat -q | ||
| + | sinfo -o "%P %l %c %D " | ||
| + | Graphical Frontend sview xpbs qmon | ||
| + | |||
| + | |||
| + | Node control | ||
| + | |||
| + | Action Slurm Torque/PBS Maui | ||
| + | ------------------------------------------------------------------------------------------------ | ||
| + | |||
| + | Display node info scontrol show node "node" pbsnodes "node" checknode "node" | ||
| + | Drain node scontrol update NodeName=gpu-1-4 pbsnodes -oN "Timeout" tcc-1-4 | ||
| + | State=DRAIN Reason=Timeout | ||
| + | Clear node scontrol update NodeName=gpu-1-4 pbsnodes -cN "" tcc-1-4 | ||
| + | State=RESUME | ||
| + | List down Nodes sinfo --list-reasons pbsnodes -ln | ||
| + | sinfo --long -R | ||
| + | sinfo -RlN | ||
| + | pbsnodes -l all: | ||
| + | alias pn='sinfo --format="%25N %.3D %9P %11T %.4c %14C %.8z %.8m %.4d %.8w %10f %20E"' | ||
| + | |||
| + | pbsnodes -ln | ||
| + | alias pl='sinfo --states=down,drain,fail,no_respond,maint,unk --format="%12n %20f %20H %12u %32E"' | ||
| + | |||
| + | showstart: | ||
| + | alias lj='sacct -o user,jobid,jobname,state,node,start' | ||
</syntaxhighlight> | </syntaxhighlight> | ||
Latest revision as of 22:57, 2 October 2014
Information on the hosts
- Use sinfo
[root@head1 ~]# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
defq* up infinite 1 down* storage1
defq* up infinite 16 idle gpuNode[01-16]
Control Nodes
# Drain a node
scontrol update nodename=gpuNode02 state=drain reason=knackered
# Resume after draining
scontrol update nodename=gpuNode02 state=resume
# Stop jobs running
scontrol update nodename=gpuNode02 state=down
Handy Comparison of Slurm / PBS / SGE
Action Slurm Torque/PBS Maui SGE
------------------------------------------------------------------------------------------------------------
Get information about the job scontrol show job "jobid" qstat -f "jobid" checkjob
Display the queue info squeue qstat showq qstat
Delete a job scancel "jobid" qdel qdel
Submit a job srun/sbatch/salloc testjob qsub testjob msub qsub
Submit an interactive job salloc -N 4 -p active sh qsub -I qlogin
Display all job info squeue -al qstat -f
scontrol show job
Display job scontrol show job "jobID" qstat -f "jobID"
Display free processors srun --test-only -p normal -n 1 -t 10:00 sh showbf
Display the expected start time squeue --start -j "jobid" showstart "jobid"
Display blocked jobs squeue --start mdiag -b/showq -b
Display queues/partitions scontrol show partition qstat -Qf
Display queue sinfo -h qstat -q
sinfo -o "%P %l %c %D "
Graphical Frontend sview xpbs qmon
Node control
Action Slurm Torque/PBS Maui
------------------------------------------------------------------------------------------------
Display node info scontrol show node "node" pbsnodes "node" checknode "node"
Drain node scontrol update NodeName=gpu-1-4 pbsnodes -oN "Timeout" tcc-1-4
State=DRAIN Reason=Timeout
Clear node scontrol update NodeName=gpu-1-4 pbsnodes -cN "" tcc-1-4
State=RESUME
List down Nodes sinfo --list-reasons pbsnodes -ln
sinfo --long -R
sinfo -RlN
pbsnodes -l all:
alias pn='sinfo --format="%25N %.3D %9P %11T %.4c %14C %.8z %.8m %.4d %.8w %10f %20E"'
pbsnodes -ln
alias pl='sinfo --states=down,drain,fail,no_respond,maint,unk --format="%12n %20f %20H %12u %32E"'
showstart:
alias lj='sacct -o user,jobid,jobname,state,node,start'