From d362129e91418a505ef83fe87a06733fea6522eb Mon Sep 17 00:00:00 2001
From: Pariksheet Nanda
Date: Tue, 23 Apr 2019 12:52:59 -0400
Subject: [PATCH] ENH: Give user control over resuming and add example

---
Review notes (this area is ignored when the patch is applied):
  * examples/script_that_sometimes_fails.sh: prng now prints its result,
    loops with a real counter and takes the task id as $1; main echoes
    its messages, tests parity with (( )) and uses the argument supplied
    by "parallel ... ::: $(seq $SLURM_NTASKS)" as the task id.
  * Indentation and blank context lines were reconstructed from a
    whitespace-collapsed copy, and the blob ids on the "index" lines
    were not recomputed; check both against the tree before applying.

 examples/02-submit-resumable.slurm      | 23 +++++++++++
 examples/script_that_sometimes_fails.sh | 54 +++++++++++++++++++++++++
 parallel_opts.sh                        |  5 +--
 3 files changed, 78 insertions(+), 4 deletions(-)
 create mode 100644 examples/02-submit-resumable.slurm
 create mode 100755 examples/script_that_sometimes_fails.sh

diff --git a/examples/02-submit-resumable.slurm b/examples/02-submit-resumable.slurm
new file mode 100644
index 0000000..8a42875
--- /dev/null
+++ b/examples/02-submit-resumable.slurm
@@ -0,0 +1,23 @@
+#!/bin/bash -x
+#SBATCH --nodes 2
+#SBATCH --ntasks 5
+#SBATCH --output submit.out
+
+#SBATCH --dependency singleton
+#SBATCH --job-name unambiguous-name-for-resumable-job
+# Kill job after 15 seconds to show resuming feature.
+#SBATCH --time 0:15
+
+# Overwrite instead of appending to output file.
+echo -n > submit.out
+
+parallel_opts=$(~/parallel-slurm/parallel_opts.sh)
+module load parallel
+
+# Run a failure prone program.
+parallel $parallel_opts \
+    --joblog joblog \
+    --resume \
+    --line-buffer \
+    ./script_that_sometimes_fails.sh \
+    ::: $(seq $SLURM_NTASKS)
diff --git a/examples/script_that_sometimes_fails.sh b/examples/script_that_sometimes_fails.sh
new file mode 100755
index 0000000..e344ab5
--- /dev/null
+++ b/examples/script_that_sometimes_fails.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+prng () {
+    # Print a pseudo-random integer in [0, m) using the linear
+    # congruential generator algorithm:
+    # https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods
+    #
+    # Set it with:
+    #   a = 1
+    #   b = SLURM_JOB_ID
+    #   m = SLURM_NTASKS
+    #
+    # We seed b with the SLURM_JOB_ID so that we independently have
+    # the same seed for all tasks for a given job.
+    #
+    # Arguments: $1 - task id; the recurrence is applied that many times.
+    # Outputs:   the generated integer on stdout.
+    local x_n=0 a=1 i
+    local b=${SLURM_JOB_ID:-0}
+    local m=${SLURM_NTASKS:-1}
+    local steps=${1:-0}
+
+    # A zero modulus would make the arithmetic below fail.
+    (( m > 0 )) || m=1
+    # Recur as many times as the task id to generate different numbers
+    # for each SLURM task.
+    for (( i = 0; i < steps; i++ ))
+    do
+        x_n=$(( (a * x_n + b) % m ))
+    done
+    echo "$x_n"
+}
+
+main () {
+    # Randomly fail half of the tasks.
+    #
+    # Arguments: $1 - task id (the value GNU parallel substitutes from
+    #                 its ::: list); falls back to SLURM_PROCID.
+    local task_id=${1:-${SLURM_PROCID:-0}}
+    local random_int
+    echo "Task $task_id started..."
+    random_int=$(prng "$task_id")
+    sleep "$random_int"
+    # Odd numbers count as failures.
+    if (( random_int % 2 ))
+    then
+        echo "Task $task_id failed!"
+        exit 1
+    fi
+    echo "Task $task_id succeeded!"
+}
+
+# Run main only when executed, not when sourced.
+[ "$0" != "${BASH_SOURCE[0]}" ] || main "$@"
diff --git a/parallel_opts.sh b/parallel_opts.sh
index 90d3685..fedde1a 100755
--- a/parallel_opts.sh
+++ b/parallel_opts.sh
@@ -95,17 +95,14 @@ write_machinefile () {
 }
 
 parallel_opts () {
-    local machinefile joblog
+    local machinefile
     machinefile=$(machinefile)
-    joblog=$(prefix).joblog
 
     echo "
     --env _
     --sshdelay 0.1
     --sshloginfile $machinefile
     --workdir .
-    --joblog $joblog
-    --resume
 "
 }
 