Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
ENH: Give user control over resuming and add example
  • Loading branch information
pan14001 committed Apr 23, 2019
1 parent b2f7467 commit d362129
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 4 deletions.
23 changes: 23 additions & 0 deletions examples/02-submit-resumable.slurm
@@ -0,0 +1,23 @@
#!/bin/bash -x
#SBATCH --nodes 2
#SBATCH --ntasks 5
#SBATCH --output submit.out

# Example: a resumable GNU Parallel job.  Submitting this same script again
# continues the work recorded in the file "joblog" instead of starting over.
#
# Per the sbatch documentation, a "singleton" dependency delays this job until
# previously launched jobs with the same job name and user have terminated,
# so the job name below must be unique to this piece of work.
#SBATCH --dependency singleton
#SBATCH --job-name unambiguous-name-for-resumable-job
# Kill job after 15 seconds to show resuming feature.
#SBATCH --time 0:15

# Overwrite instead of appending to output file.
echo -n > submit.out

# Capture the option list printed by parallel_opts.sh.  This path assumes the
# parallel-slurm repository is checked out at ~/parallel-slurm.
parallel_opts=$(~/parallel-slurm/parallel_opts.sh)
module load parallel

# Run a failure prone program.
#
# $parallel_opts is left unquoted on purpose so that it word-splits into
# separate command-line options.  --joblog names the log file that --resume
# reads to decide which tasks still need to run; one input value
# (1..SLURM_NTASKS) is generated per task.
# NOTE(review): per the GNU Parallel manual, --resume continues from the last
# unfinished job and does not retry jobs that ran and exited non-zero;
# --resume-failed does.  Confirm which behavior this example intends.
parallel $parallel_opts \
--joblog joblog \
--resume \
--line-buffer \
./script_that_sometimes_fails.sh \
::: $(seq $SLURM_NTASKS)
40 changes: 40 additions & 0 deletions examples/script_that_sometimes_fails.sh
@@ -0,0 +1,40 @@
#!/bin/bash

prng () {
  # Print a pseudo-random integer in the range [0, SLURM_NTASKS) on stdout.
  #
  # Uses the linear congruential generator algorithm:
  # https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods
  #
  #   x_{n+1} = (a * x_n + b) mod m
  #
  # Set it with:
  #   a = 1
  #   b = SLURM_JOB_ID
  #   m = SLURM_NTASKS
  #
  # We seed b with the SLURM_JOB_ID so that we independently have
  # the same seed for all tasks for a given job.
  #
  # Globals:  SLURM_JOB_ID, SLURM_NTASKS, SLURM_PROCID (read; unset counts
  #           as 0); x_n (written, left global so the result can also be
  #           read when prng is called outside a command substitution).
  # Outputs:  the generated integer on stdout; a diagnostic on stderr when
  #           SLURM_NTASKS is not a positive integer.
  # Returns:  0 on success, 1 when the modulus is unusable.
  local a=1
  local b=${SLURM_JOB_ID:-0}
  local m=${SLURM_NTASKS:-0}
  local steps=${SLURM_PROCID:-0}
  local i

  # A zero modulus would abort the arithmetic with "division by 0".
  if (( m <= 0 )); then
    echo "prng: SLURM_NTASKS must be a positive integer" >&2
    return 1
  fi

  x_n=0
  # Recur as many times as the task id to generate different numbers
  # for each SLURM task.  A C-style loop is required here: brace ranges
  # such as {1..$n} are not expanded when a bound is a variable.
  for (( i = 1; i <= steps; i++ )); do
    x_n=$(( (a * x_n + b) % m ))
  done

  # Emit the value so that callers can capture it with $(prng).
  echo "$x_n"
}

main () {
  # Simulate an unreliable task: announce the start, sleep for a
  # pseudo-random number of seconds, then fail when that number is odd.
  #
  # Globals:  SLURM_PROCID (read, used to label the messages).
  # Outputs:  progress messages on stdout.
  # Exits:    1 when the task "fails" or no random number could be
  #           generated; otherwise returns 0.
  local random_int

  echo "Task $SLURM_PROCID started..."

  # Split from the declaration so a prng failure is not masked by `local`.
  random_int=$(prng) || exit 1
  sleep "$random_int"

  # Randomly fail half of the tasks: odd numbers fail, even numbers pass.
  if (( random_int % 2 )); then
    echo "Task $SLURM_PROCID failed!"
    exit 1
  fi
  echo "Task $SLURM_PROCID succeeded!"
}

# Run main only when this file is executed directly; do nothing when it
# is sourced (in that case $0 differs from this file's own path).
if [ "$0" = "${BASH_SOURCE[0]}" ]; then
  main "$@"
fi
5 changes: 1 addition & 4 deletions parallel_opts.sh
Expand Up @@ -95,17 +95,14 @@ write_machinefile () {
}

# Print a newline-separated list of command-line options on stdout.
# The example job script captures the output of parallel_opts.sh and passes
# it to `parallel`; presumably this function produces that output -- confirm.
#
# Relies on the `machinefile` and `prefix` commands, which are not visible in
# this excerpt; the output of `machinefile` is used as the --sshloginfile
# argument and the output of `prefix` as the stem of the joblog file name.
#
# NOTE(review): this listing looks like a rendered diff in which removed and
# added lines are interleaved (note the two `local` declarations).  The
# joblog-related lines and --resume may be the ones the commit deletes --
# confirm against the actual file before relying on them.
parallel_opts () {
local machinefile joblog
local machinefile
# Declared separately from the assignment so that a failure of the
# `machinefile` command is not hidden by `local`'s own exit status.
machinefile=$(machinefile)
joblog=$(prefix).joblog

echo "
--env _
--sshdelay 0.1
--sshloginfile $machinefile
--workdir .
--joblog $joblog
--resume
"
}

Expand Down

0 comments on commit d362129

Please sign in to comment.