Commit
Merge pull request #2 from HPC/unit-tests
Overhaul parallel-slurm to use unit tests and allow resumable jobs
Showing 9 changed files with 350 additions and 91 deletions.
@@ -0,0 +1,13 @@
#!/bin/bash -x
#SBATCH --nodes 2
#SBATCH --ntasks 5
#SBATCH --output submit.out

# Overwrite instead of appending to output file.
echo -n > submit.out

parallel_opts=$(~/parallel-slurm/parallel_opts.sh)
module load parallel

# Print the name of each host that GNU Parallel is running on.
parallel $parallel_opts -n0 hostname ::: $(seq $SLURM_NTASKS)
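As a usage sketch (the file name is not shown in this diff, so submit_hostname.sh below is only a placeholder), the script would be submitted with sbatch and the per-task hostnames collected in submit.out:

# Hypothetical file name; substitute the actual script name from the repository.
sbatch submit_hostname.sh
# After the job finishes, submit.out should contain the hostname of the node
# each of the five tasks ran on, spread across the two allocated nodes.
cat submit.out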
@@ -0,0 +1,21 @@
#!/bin/bash
#SBATCH --ntasks 5
#SBATCH --output submit.out

#SBATCH --dependency singleton
#SBATCH --job-name unambiguous-name-for-resumable-job
# Kill job after 15 seconds to show resuming feature.
#SBATCH --time 0:15

parallel_opts=$(~/parallel-slurm/parallel_opts.sh)
module load parallel

# Run a failure-prone program.
echo "Started SLURM job $SLURM_JOB_ID"
parallel $parallel_opts \
    --joblog joblog \
    --resume-failed \
    --line-buffer \
    ./script_that_sometimes_fails.sh \
    ::: $(seq $SLURM_NTASKS)
echo "Completed SLURM job $SLURM_JOB_ID in $(sacct -nXj $SLURM_JOB_ID -o elapsed)"
@@ -0,0 +1,40 @@
#!/bin/bash

seed=$SLURM_JOB_ID
ID=$1

prng () {
    # Use the linear congruential generator algorithm:
    # https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods
    #
    # We seed b with the SLURM_JOB_ID so that we independently have
    # the same seed for all tasks for a given job.

    x_n=0
    a=1
    b=$seed
    m=$SLURM_NTASKS
    # Recur as many times as the task id to generate different numbers
    # for each SLURM task.
    for i in $(seq 1 $ID)
    do
        x_n=$(( $(( a * $((x_n + b)) )) % m))
    done
    echo $x_n
}

main () {
    # Randomly fail half of the tasks.
    random_int=$(prng)
    echo -n "Task $ID started (seed $seed, random number $random_int) ... "
    sleep "$random_int"
    if (( $random_int % 4 == 0 ))
    then
        echo "succeeded!"
        exit 0
    fi
    echo "failed!"
    exit 1
}

[ "$0" != "${BASH_SOURCE[0]}" ] || main "$@"
@@ -0,0 +1,116 @@
#!/usr/bin/env bash

# GNU Parallel setup for SLURM
#
# Author: Pariksheet Nanda <hpc@uconn.edu> 2016-2017,2019
#
# License: Public Domain / CC0
#
# To the extent possible under law, Pariksheet Nanda has waived all
# copyright and related or neighboring rights to GNU Parallel setup
# for SLURM.

# This directive applies to the entire script.
# shellcheck disable=2039
true

is_slurm_env () {
    if [[ -n "$SLURM_JOB_ID" ]]
    then # yes
        return 0
    else
        return 1
    fi
}

# Helper to expand hostnames
has_clustershell () {
    if python -m ClusterShell.CLI.Nodeset -h &> /dev/null
    then
        return 0
    else
        return 1
    fi
}

install_clustershell () {
    python -m pip install --user clustershell
}

setup_on_cluster () {
    # Allow export of environment using `--env` option
    if [[ ! -e ~/.parallel/ignored_vars ]]; then
        # Create an empty ignored_vars file to pass all the environment
        # variables to the SSH instance
        mkdir -p ~/.parallel
        touch ~/.parallel/ignored_vars
    fi
}

# Expand tasks from "2,5(x1),3(x2)" to "2 5 3 3 "
expand_slurm_tasks_per_node () {
    [[ -z "${SLURM_TASKS_PER_NODE}" ]] && return

    local tasks
    # shellcheck disable=2207
    tasks=( $(echo "${SLURM_TASKS_PER_NODE}" | tr ',' ' ') )

    local num count
    for val in ${tasks[*]}; do
        num="${val/(*)/}"
        if [[ -z "${val%%*)}" ]]; then
            count=$(echo "$val" | sed -E 's#[0-9]+\(x([0-9]+)\)#\1#')
        else
            count=1
        fi
        # shellcheck disable=2046
        printf "$num%.0s " $(seq $count)
    done
}

# Make list in the form of "cpu/host"
cpu_host_array () {
    local nodeset hosts cpus
    nodeset="python -m ClusterShell.CLI.Nodeset"

    # shellcheck disable=2207
    hosts=( $($nodeset -e "${SLURM_NODELIST}") )
    # shellcheck disable=2207
    cpus=( $(expand_slurm_tasks_per_node) )
    for ((i=0; i<${#hosts[*]}; ++i)); do
        echo "${cpus[i]}/${hosts[i]}"
    done
}

prefix () {
    echo "${SLURM_JOB_NAME%.*}"
}

machinefile () {
    echo "$(prefix).sshloginfile"
}

write_machinefile () {
    cpu_host_array > "$(machinefile)"
}

parallel_opts () {
    local machinefile
    machinefile=$(machinefile)

    echo "
    --env _
    --sshdelay 0.1
    --sshloginfile $machinefile
    --workdir .
    "
}

main () {
    is_slurm_env && setup_on_cluster
    ! has_clustershell && install_clustershell
    write_machinefile
    parallel_opts
}

[[ "$0" != "${BASH_SOURCE[0]}" ]] || main "$@"