Skip to content

Commit

Permalink
ENH: Validate example 02 on the cluster
Browse files Browse the repository at this point in the history
  • Loading branch information
pan14001 committed Apr 23, 2019
1 parent 427f794 commit 1bdd3b8
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 18 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,9 @@ Note that if you resubmit the job you will not see any output. This
is because of the `--joblog` and `--resume` options; the job remembers
that the work was complete and does not needlessly re-run the program.
To re-run the program you would need to delete the *.joblog file.

Run the resumable example with:

```
rm -f submit.out joblog; touch submit.out; for _ in {1..5} ; do sbatch --partition debug -N1 02-submit-resumable.slurm ; done; tail -f submit.out
```
9 changes: 3 additions & 6 deletions examples/02-submit-resumable.slurm
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/bin/bash -x
#SBATCH --nodes 2
#!/bin/bash
#SBATCH --ntasks 5
#SBATCH --output submit.out

Expand All @@ -8,16 +7,14 @@
# Kill job after 15 seconds to show resuming feature.
#SBATCH --time 0:15

# Overwrite instead of appending to output file.
echo -n > submit.out

# Capture whatever options the parallel_opts.sh helper prints; they are
# handed to GNU Parallel below.
parallel_opts=$(~/parallel-slurm/parallel_opts.sh)
module load parallel

# Run a failure prone program, once for each number in 1..SLURM_NTASKS
# (each number becomes the script's first argument).
#
# --joblog records every run in the file "joblog"; --resume and
# --resume-failed consult that file when the job is submitted again, so
# already-completed work is not needlessly repeated.
# --line-buffer asks parallel to pass output through line by line.
#
# $parallel_opts is left unquoted so the shell splits it into separate
# arguments -- presumably the helper prints several options; confirm
# against parallel_opts.sh.
echo "Started SLURM job $SLURM_JOB_ID"
parallel $parallel_opts \
--joblog joblog \
--resume \
--resume-failed \
--line-buffer \
./script_that_sometimes_fails.sh \
::: $(seq $SLURM_NTASKS)
24 changes: 12 additions & 12 deletions examples/script_that_sometimes_fails.sh
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
#!/bin/bash

# Seed shared by every task of one SLURM job: prng reads it as its
# increment and main prints it in the task's status line.
seed=$SLURM_JOB_ID
# First command-line argument; used as this task's number by prng and
# main.
ID=$1

prng () {
# Print a pseudo-random integer in the range 0..m-1 on stdout.
#
# Use the linear congruential generator algorithm:
# https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods
#
# Set it with:
# a = 1
# b = SLURM_JOB_ID
# m = SLURM_NTASKS
#
# We seed b with the SLURM_JOB_ID so that we independently have
# the same seed for all tasks for a given job.
#
# The step below is written as a * (x_n + b) mod m rather than the
# textbook a * x_n + b; with a = 1 the two forms are the same.
#
# NOTE(review): x_n, a, b and m are not declared local, so they are
# left set in whichever shell runs this function.

x_n=0
a=1
b=$SLURM_JOB_ID
b=$seed
m=$SLURM_NTASKS
# Recur as many times as the task id to generate different numbers
# for each SLURM task.
#
# NOTE(review): two loop headers appear back to back here; this looks
# like the old and the new line of a diff shown together, and only one
# can be present in a runnable script. Bash performs brace expansion
# before parameter expansion, so {1..$SLURM_PROCID} never becomes a
# numeric range and that loop body would run once; the seq form does
# iterate $ID times.
for _ in {1..$SLURM_PROCID}
for i in $(seq 1 $ID)
do
x_n=$(( $(( a * $((x_n + b)) )) % m))
done
echo $x_n
}

main () {
# Entry point for one task: obtain a number from prng, sleep for that
# many seconds, then report the outcome and exit with status 0 or 1.
#
# Randomly fail half of the tasks.
# NOTE(review): the `% 4 == 0` test below succeeds only when the number
# is a multiple of 4, which does not obviously match "half" -- confirm
# the intended failure rate.
#
# NOTE(review): this body appears to show the old and the new lines of
# a diff together: there are two `if` headers, and each bare quoted
# "Task ..." line sits next to an echo that replaces it. A bare quoted
# string is executed as a command name, not printed. Those lines also
# read SLURM_PROC_ID, whereas prng reads SLURM_PROCID.
"Task $SLURM_PROC_ID started..."
random_int=$(prng)
# -n keeps the cursor on the same line so the result printed later
# completes it.
echo -n "Task $ID started (seed $seed, random number $random_int) ... "
sleep "$random_int"
if _=$(( random_int % 2 ))
if (( $random_int % 4 == 0 ))
then
"Task $SLURM_PROC_ID failed!"
exit 1
echo "succeeded!"
exit 0
fi
"Task $SLURM_PROC_ID succeeded!"
echo "failed!"
exit 1
}

# Call main only when this file is executed directly; when it is
# sourced, $0 differs from ${BASH_SOURCE[0]} and only the definitions
# above are loaded.
[ "$0" != "${BASH_SOURCE[0]}" ] || main "$@"

0 comments on commit 1bdd3b8

Please sign in to comment.