diff --git a/README.md b/README.md index 9e83b4d..b287328 100644 --- a/README.md +++ b/README.md @@ -57,3 +57,9 @@ Note that if you resubmit the job you will not see any output. This is because of the `--joblog` and `--resume` options; the job remembers that the work was complete and does not needlessly re-run the program. To re-run the program you would need to delete the *.joblog file. + +Run the resumable example with: + +``` +rm -f submit.out joblog; touch submit.out; for _ in {1..5} ; do sbatch --partition debug -N1 02-submit-resumable.slurm ; done; tail -f submit.out +``` diff --git a/examples/02-submit-resumable.slurm b/examples/02-submit-resumable.slurm index 8a42875..34812be 100644 --- a/examples/02-submit-resumable.slurm +++ b/examples/02-submit-resumable.slurm @@ -1,5 +1,4 @@ -#!/bin/bash -x -#SBATCH --nodes 2 +#!/bin/bash #SBATCH --ntasks 5 #SBATCH --output submit.out @@ -8,16 +7,14 @@ # Kill job after 15 seconds to show resuming feature. #SBATCH --time 0:15 -# Overwrite instead of appending to output file. -echo -n > submit.out - parallel_opts=$(~/parallel-slurm/parallel_opts.sh) module load parallel # Run a failure prone program. +echo "Started SLURM job $SLURM_JOB_ID" parallel $parallel_opts \ --joblog joblog \ - --resume \ + --resume-failed \ --line-buffer \ ./script_that_sometimes_fails.sh \ ::: $(seq $SLURM_NTASKS) diff --git a/examples/script_that_sometimes_fails.sh b/examples/script_that_sometimes_fails.sh index fad09e3..9617495 100755 --- a/examples/script_that_sometimes_fails.sh +++ b/examples/script_that_sometimes_fails.sh @@ -1,40 +1,40 @@ #!/bin/bash +seed=$SLURM_JOB_ID +ID=$1 + prng () { # Use the linear conguential generator algorithm: # https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods # - # Set it with: - # a = 1 - # b = SLURM_JOB_ID - # m = SLURM_NTASKS - # # We seed b with the SLURM_JOB_ID so that we independently have # the same seed for all tasks for a given job. x_n=0 a=1 - b=$SLURM_JOB_ID + b=$seed m=$SLURM_NTASKS # Recur as many times as the task id to generate different numbers # for each SLURM task. - for _ in {1..$SLURM_PROCID} + for i in $(seq 1 $ID) do x_n=$(( $(( a * $((x_n + b)) )) % m)) done + echo $x_n } main () { # Randomly fail half of the tasks. - "Task $SLURM_PROC_ID started..." random_int=$(prng) + echo -n "Task $ID started (seed $seed, random number $random_int) ... " sleep "$random_int" - if _=$(( random_int % 2 )) + if (( $random_int % 4 == 0 )) then - "Task $SLURM_PROC_ID failed!" - exit 1 + echo "succeeded!" + exit 0 fi - "Task $SLURM_PROC_ID succeeded!" + echo "failed!" + exit 1 } [ "$0" != "${BASH_SOURCE[0]}" ] || main "$@"