SLURM
Basic commands
Much of the information here is borrowed from: https://www.rc.fas.harvard.edu/resources/documentation/convenient-slurm-commands/
Submitting jobs
# submit jobs via script
sbatch myscript.sh
# submit a job directly from standard input, without a script file
sbatch << EOF
#!/bin/sh
sleep 30
EOF
Listing jobs
squeue
# list all current jobs for a user
squeue -u <username>
# List all running jobs for a user
squeue -u <username> -t RUNNING
# List all pending jobs for a user
squeue -u <username> -t PENDING
# squeue aliases I find useful
# SQ: squeue with a custom column layout, restricted to the current user.
# The definition is single-quoted, so $USER is expanded when the alias is used,
# not when it is defined.
alias SQ='squeue -o "%.8i %.20j %.10P %.7u %.5D %.4C %.11M %.11l %.3t %.11m %R" -u $USER'
# SQ_long: same layout as SQ with two extra trailing columns (%V and %o).
alias SQ_long='squeue -o "%.8i %.20j %.10P %.7u %.5D %.4C %.11M %.11l %.3t %.11m %R %V %o" -u $USER' # also shows submission time and the command that was run
# Total number of jobs for the current user: counts the lines printed by SQ.
# Builds on the SQ alias above; -h is passed so the header line is not counted.
alias no_jobs="SQ -h | wc -l"
Info on currently running jobs
scontrol
# List detailed information for a job (useful for troubleshooting)
scontrol show job -d <jobid>
# Alias I find useful: shorthand for the detailed job listing shown above.
# Usage: SCONTR <jobid>
alias SCONTR='scontrol show job -d'
sstat
# List status info for a currently running job
sstat --format=JobID,AveCPU,AvePages,AveRSS,AveVMSize --allsteps -j <jobid>
# Alias I find useful: sstat with a fixed set of columns, covering all steps.
# The alias does not include -j, so pass the job yourself.
# Usage: SR -j <jobid>
alias SR='sstat --format="JobID,NTasks,AveCPU,AvePages,AveRSS,AveVMSize,MaxRSSNode" --allsteps'
Info on completed jobs
seff
# Provides useful info on a completed job, including the memory used and what percentage of your allocated memory that amounts to.
seff <jobid>
sacct
# To view memory and runtime statistics for your own jobs (add -u <username> to query another user's jobs)
sacct --format=JobID,JobName,MaxRSS,Elapsed,state,ReqMem,MaxVMSize,AveVMSize --units=M
# To get statistics on completed jobs by jobID
sacct --format=JobID,JobName,MaxRSS,Elapsed,state,ReqMem,MaxVMSize,AveVMSize --units=M -j <jobid>
# shows resources allocated
sacct --allocations
# jobs between select dates; valid formats for day-time: HH:MM[:SS] [AM|PM], MMDD[YY] or MM/DD[/YY] or MM.DD[.YY], MM/DD[/YY]-HH:MM[:SS], YYYY-MM-DD[THH:MM[:SS]]
sacct --starttime 2020-04-17T14:00:00 --endtime 2020-04-25T23:59:59 --allocations
sacct -S $(date -d '1 month ago' +%D-%R) -E $(date -d '2 weeks ago' +%D-%R)
# sacct alias I find useful: a wider column set than the examples above,
# reported in megabytes (--units=M).
# Usage: SC                (all jobs sacct selects by default)
#        SC -j <jobid>     (a single job)
# NOTE: the %40 suffix on AllocTRES presumably sets that column's width -- see
# the sacct --format documentation.
alias SC='sacct --format="JobID,JobName,Ntasks,MaxRSS,Elapsed,state,NodeList,ReqMem,MaxVMSize,AveVMSize,Partition,AllocTRES%40" --units=M'
Useful flags and their meaning
Source
CPU
| Flag |
Meaning |
| NCPUs |
Number of CPUs used by the job |
| NNodes |
Number of nodes used by the job |
| UserCPU |
User CPU time used by the job |
| SystemCPU |
System CPU time used by the job |
| TotalCPU |
Total CPU time used by the job; sum of UserCPU and SystemCPU |
| CPUTime |
Elapsed*NCPUs (total CPU time a perfectly efficient job would use) |
Memory
| Flag |
Meaning |
| ReqMem |
Amount of memory requested; suffixed with 'c' if per CPU, 'n' if per node |
| AveRSS |
Average memory use for all tasks |
| MaxRSS |
Maximum memory use of any task |
Time
| Flag |
Meaning |
| Submit |
When the job was submitted |
| Start |
When the job started |
| End |
When the job ended |
| TimeLimit |
How much time the job was allocated |
| Elapsed |
How much time the job used |
I/O
| Flag |
Meaning |
| AveDiskRead |
Average number of bytes read for all tasks |
| MaxDiskRead |
Maximum number of bytes read for any task |
| AveDiskWrite |
Average number of bytes written for all tasks |
| MaxDiskWrite |
Maximum number of bytes written for any task |
| AvePages |
Average number of page faults for all tasks |
| MaxPages |
Maximum number of page faults for any task |
Controlling jobs
scancel
# To cancel one job
scancel <jobid>
# To cancel all the jobs for a user
scancel -u <username>
# To cancel all the pending jobs for a user
scancel -t PENDING -u <username>
# To cancel one or more jobs by name
scancel --name myJobName
scontrol
# To hold a particular job from being scheduled
scontrol hold <jobid>
# To release a particular job to be scheduled
scontrol release <jobid>
# To requeue (cancel and rerun) a particular job
scontrol requeue <jobid>
Interactive jobs
srun
# Interactive-session shortcuts, all defined in one `alias` call.
#   SRUN_SIMPLE  - bash under srun with no explicit resource flags
#   SRUN_EXPRESS - 1 task, 4 CPUs, 4096 mem-per-cpu on the "express" partition
#   SRUN_MEDIUM  - 1 task, 4 CPUs, 4096 mem-per-cpu on the "medium" partition
alias \
  SRUN_SIMPLE='srun --pty /bin/bash' \
  SRUN_EXPRESS='srun --ntasks=1 --cpus-per-task=4 --mem-per-cpu=4096 --partition=express --pty /bin/bash' \
  SRUN_MEDIUM='srun --ntasks=1 --cpus-per-task=4 --mem-per-cpu=4096 --partition=medium --pty /bin/bash'
#######################################
# Start an interactive bash session through srun with caller-chosen resources.
# Arguments:
#   $1 - value passed to --cpus-per-task
#   $2 - value passed to --mem-per-cpu
#   $3 - value passed to --partition
# Outputs:
#   Usage text on stderr when the argument count is not exactly 3.
# Returns:
#   2 on a wrong argument count; otherwise the exit status of srun.
#######################################
SRUN_CUSTOM() {
  if [ "$#" -ne 3 ]; then
    # Misuse is an error: report it on stderr and fail, so that callers and
    # `&&` chains do not mistake a usage message for a successful session.
    printf '%s\n' "Usage:" "SRUN_CUSTOM cpus-per-task mem-per-cpu partition" >&2
    return 2
  fi

  srun --ntasks=1 --cpus-per-task="$1" --mem-per-cpu="$2" --partition="$3" --pty /bin/bash
}