From 427f794877d4005a636f6d739c14f01a16d650c8 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 12:56:23 -0400 Subject: [PATCH 1/3] BUG: Fix brace expansion --- examples/script_that_sometimes_fails.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/script_that_sometimes_fails.sh b/examples/script_that_sometimes_fails.sh index e344ab5..fad09e3 100755 --- a/examples/script_that_sometimes_fails.sh +++ b/examples/script_that_sometimes_fails.sh @@ -18,7 +18,7 @@ prng () { m=$SLURM_NTASKS # Recur as many times as the task id to generate different numbers # for each SLURM task. - for _ in 1..$SLURM_PROCID + for _ in {1..$SLURM_PROCID} do x_n=$(( $(( a * $((x_n + b)) )) % m)) done From 1bdd3b8cac9bcd1c229b18772692691d1fb34b4a Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 13:33:48 -0400 Subject: [PATCH 2/3] ENH: Validate example 02 on the cluster --- README.md | 6 ++++++ examples/02-submit-resumable.slurm | 9 +++------ examples/script_that_sometimes_fails.sh | 24 ++++++++++++------------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 9e83b4d..b287328 100644 --- a/README.md +++ b/README.md @@ -57,3 +57,9 @@ Note that if you resubmit the job you will not see any output. This is because of the `--joblog` and `--resume` options; the job remembers that the work was complete and does not needlessly re-run the program. To re-run the program you would need to delete the *.joblog file. + +Run the resumable example with: + +``` +rm -f submit.out joblog; touch submit.out; for _ in {1..5} ; do sbatch --partition debug -N1 02-submit-resumable.slurm ; done; tail -f submit.out +``` diff --git a/examples/02-submit-resumable.slurm b/examples/02-submit-resumable.slurm index 8a42875..34812be 100644 --- a/examples/02-submit-resumable.slurm +++ b/examples/02-submit-resumable.slurm @@ -1,5 +1,4 @@ -#!/bin/bash -x -#SBATCH --nodes 2 +#!/bin/bash #SBATCH --ntasks 5 #SBATCH --output submit.out @@ -8,16 +7,14 @@ # Kill job after 15 seconds to show resuming feature. #SBATCH --time 0:15 -# Overwrite instead of appending to output file. -echo -n > submit.out - parallel_opts=$(~/parallel-slurm/parallel_opts.sh) module load parallel # Run a failure prone program. +echo "Started SLURM job $SLURM_JOB_ID" parallel $parallel_opts \ --joblog joblog \ - --resume \ + --resume-failed \ --line-buffer \ ./script_that_sometimes_fails.sh \ ::: $(seq $SLURM_NTASKS) diff --git a/examples/script_that_sometimes_fails.sh b/examples/script_that_sometimes_fails.sh index fad09e3..9617495 100755 --- a/examples/script_that_sometimes_fails.sh +++ b/examples/script_that_sometimes_fails.sh @@ -1,40 +1,40 @@ #!/bin/bash +seed=$SLURM_JOB_ID +ID=$1 + prng () { # Use the linear conguential generator algorithm: # https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods # - # Set it with: - # a = 1 - # b = SLURM_JOB_ID - # m = SLURM_NTASKS - # # We seed b with the SLURM_JOB_ID so that we independently have # the same seed for all tasks for a given job. x_n=0 a=1 - b=$SLURM_JOB_ID + b=$seed m=$SLURM_NTASKS # Recur as many times as the task id to generate different numbers # for each SLURM task. - for _ in {1..$SLURM_PROCID} + for i in $(seq 1 $ID) do x_n=$(( $(( a * $((x_n + b)) )) % m)) done + echo $x_n } main () { # Randomly fail half of the tasks. - "Task $SLURM_PROC_ID started..." random_int=$(prng) + echo -n "Task $ID started (seed $seed, random number $random_int) ... " sleep "$random_int" - if _=$(( random_int % 2 )) + if (( $random_int % 4 == 0 )) then - "Task $SLURM_PROC_ID failed!" - exit 1 + echo "succeeded!" + exit 0 fi - "Task $SLURM_PROC_ID succeeded!" + echo "failed!" + exit 1 } [ "$0" != "${BASH_SOURCE[0]}" ] || main "$@" From f16aa34929cdcab3c229e16db3e005e8e73e40b7 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 13:40:06 -0400 Subject: [PATCH 3/3] ENH: Add output from example 02 --- README.md | 34 ++++++++++++++++++++++++++++-- examples/02-submit-resumable.slurm | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b287328..d213480 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,36 @@ To re-run the program you would need to delete the *.joblog file. Run the resumable example with: +```sh +rm -f submit.out joblog; touch submit.out; for _ in {1..5} ; do sbatch --partition debug 02-submit-resumable.slurm; done; tail -f submit.out ``` -rm -f submit.out joblog; touch submit.out; for _ in {1..5} ; do sbatch --partition debug -N1 02-submit-resumable.slurm ; done; tail -f submit.out -``` + +```sh +# Inside your submit.out +Submitted batch job 2320058 +Submitted batch job 2320059 +Submitted batch job 2320060 +Submitted batch job 2320061 +Submitted batch job 2320062 +Started SLURM job 2320058 +Task 5 started (seed 2320058, random number 0) ... succeeded! +Task 2 started (seed 2320058, random number 1) ... failed! +Task 4 started (seed 2320058, random number 2) ... failed! +Task 1 started (seed 2320058, random number 3) ... failed! +Task 3 started (seed 2320058, random number 4) ... succeeded! +Completed SLURM job 2320058 in 00:00:05 +Started SLURM job 2320059 +Task 4 started (seed 2320059, random number 1) ... failed! +Task 2 started (seed 2320059, random number 3) ... failed! +Task 1 started (seed 2320059, random number 4) ... succeeded! +Completed SLURM job 2320059 in 00:00:04 +Started SLURM job 2320060 +Task 2 started (seed 2320060, random number 0) ... succeeded! +Task 4 started (seed 2320060, random number 0) ... succeeded! +Completed SLURM job 2320060 in 00:00:00 +Started SLURM job 2320061 +Completed SLURM job 2320061 in 00:00:01 +Started SLURM job 2320062 +Completed SLURM job 2320062 in 00:00:00 +^C +``` \ No newline at end of file diff --git a/examples/02-submit-resumable.slurm b/examples/02-submit-resumable.slurm index 34812be..a94a684 100644 --- a/examples/02-submit-resumable.slurm +++ b/examples/02-submit-resumable.slurm @@ -18,3 +18,4 @@ parallel $parallel_opts \ --line-buffer \ ./script_that_sometimes_fails.sh \ ::: $(seq $SLURM_NTASKS) +echo "Completed SLURM job $SLURM_JOB_ID in $(sacct -nXj $SLURM_JOB_ID -o elapsed)"