diff --git a/examples/02-submit-resumable.slurm b/examples/02-submit-resumable.slurm index 8a42875..a94a684 100644 --- a/examples/02-submit-resumable.slurm +++ b/examples/02-submit-resumable.slurm @@ -1,5 +1,4 @@ -#!/bin/bash -x -#SBATCH --nodes 2 +#!/bin/bash #SBATCH --ntasks 5 #SBATCH --output submit.out @@ -8,16 +7,15 @@ # Kill job after 15 seconds to show resuming feature. #SBATCH --time 0:15 -# Overwrite instead of appending to output file. -echo -n > submit.out - parallel_opts=$(~/parallel-slurm/parallel_opts.sh) module load parallel # Run a failure prone program. +echo "Started SLURM job $SLURM_JOB_ID" parallel $parallel_opts \ --joblog joblog \ - --resume \ + --resume-failed \ --line-buffer \ ./script_that_sometimes_fails.sh \ ::: $(seq $SLURM_NTASKS) +echo "Completed SLURM job $SLURM_JOB_ID in $(sacct -nXj $SLURM_JOB_ID -o elapsed)" diff --git a/examples/script_that_sometimes_fails.sh b/examples/script_that_sometimes_fails.sh index e344ab5..9617495 100755 --- a/examples/script_that_sometimes_fails.sh +++ b/examples/script_that_sometimes_fails.sh @@ -1,40 +1,40 @@ #!/bin/bash +seed=$SLURM_JOB_ID +ID=$1 + prng () { # Use the linear conguential generator algorithm: # https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods # - # Set it with: - # a = 1 - # b = SLURM_JOB_ID - # m = SLURM_NTASKS - # # We seed b with the SLURM_JOB_ID so that we independently have # the same seed for all tasks for a given job. x_n=0 a=1 - b=$SLURM_JOB_ID + b=$seed m=$SLURM_NTASKS # Recur as many times as the task id to generate different numbers # for each SLURM task. - for _ in 1..$SLURM_PROCID + for i in $(seq 1 $ID) do x_n=$(( $(( a * $((x_n + b)) )) % m)) done + echo $x_n } main () { # Randomly fail half of the tasks. - "Task $SLURM_PROC_ID started..." random_int=$(prng) + echo -n "Task $ID started (seed $seed, random number $random_int) ... " sleep "$random_int" - if _=$(( random_int % 2 )) + if (( $random_int % 4 == 0 )) then - "Task $SLURM_PROC_ID failed!" - exit 1 + echo "succeeded!" + exit 0 fi - "Task $SLURM_PROC_ID succeeded!" + echo "failed!" + exit 1 } [ "$0" != "${BASH_SOURCE[0]}" ] || main "$@"