From cc283861f6582437c345360d2bfcf366211e55f4 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 16 Apr 2019 12:53:58 -0400 Subject: [PATCH 01/27] STY: Pass shellcheck --- parallel-slurm-setup.sh | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index 15e884d..2e44860 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -1,6 +1,8 @@ +#!/usr/bin/env bash + # GNU Parallel setup for SLURM # -# Author: Pariksheet Nanda 2016-2017 +# Author: Pariksheet Nanda 2016-2017,2019 # # License: Public Domain / CC0 # @@ -8,8 +10,11 @@ # copyright and related or neighboring rights to GNU Parallel setup # for SLURM. +# This directive applies to the entire script. +# shellcheck disable=2039 +true + # Load the modules -source /etc/profile.d/modules.sh module load parallel # Allow export of environment using `--env` option @@ -24,15 +29,15 @@ fi prefix=${SLURM_JOB_NAME%.*} machine_file=${prefix}.sshloginfile -function expand_slurm_tasks_per_node () { +expand_slurm_tasks_per_node () { local tasks - tasks=( $(echo "${SLURM_TASKS_PER_NODE}" | tr ',' ' ') ) + mapfile -t tasks < "$(echo "${SLURM_TASKS_PER_NODE}" | tr ',' ' ')" local num count for val in ${tasks[*]}; do num="${val/(*)/}" if [[ -z "${val%%*)}" ]]; then - count=$(echo $val | sed -E 's#[0-9]+\(x([0-9]+)\)#\1#') + count=$(echo "$val" | sed -E 's#[0-9]+\(x([0-9]+)\)#\1#') else count=1 fi @@ -41,22 +46,22 @@ function expand_slurm_tasks_per_node () { } # Make list in the form of "cpu/host" -function cpu_host_array () { +cpu_host_array () { local hostlist hosts cpus # The SLURM `hostlist` executable is part of the Python PIP # "python-hostlist" package hostlist=/apps2/python/2.7.6-gcc/bin/hostlist - hosts=( $($hostlist -e ${SLURM_NODELIST}) ) - cpus=( $(expand_slurm_tasks_per_node) ) + mapfile -t hosts < "$($hostlist -e "${SLURM_NODELIST}")" + mapfile -t cpus < "$(expand_slurm_tasks_per_node)" for ((i=0; 
i<${#hosts[*]}; ++i)); do - echo ${cpus[i]}/${hosts[i]} + echo "${cpus[i]}/${hosts[i]}" done } -arr=( $(cpu_host_array) ) -printf "%s\n" ${arr[*]} > $machine_file +mapfile -t arr < "$(cpu_host_array)" +printf "%s\n" "${arr[*]}" > "$machine_file" # Pass default options to GNU Parallel -parallel="parallel +export parallel_opts=" --env _ --sshdelay 0.1 --sshloginfile $machine_file From 98a0873a9780c092f8b7c43add944ae781201536 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 16 Apr 2019 13:26:31 -0400 Subject: [PATCH 02/27] MAINT: Move all code into functions --- parallel-slurm-setup.sh | 59 ++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index 2e44860..1f1b63c 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -14,20 +14,27 @@ # shellcheck disable=2039 true -# Load the modules -module load parallel +is_slurm_env () { + if [[ -n "$SLURM_JOB_ID" ]] + then # yes + echo 0 + else + echo 1 + fi +} -# Allow export of environment using `--env` option -if [[ ! -e ~/.parallel/ignored_vars ]]; then - # Create an empty ignored_vars file to pass all the environment - # variables to the SSH instance - mkdir -p ~/.parallel - touch ~/.parallel/ignored_vars -fi +setup_on_cluster () { + # Load the modules + module load parallel -# Create the machine file for this job -prefix=${SLURM_JOB_NAME%.*} -machine_file=${prefix}.sshloginfile + # Allow export of environment using `--env` option + if [[ ! 
-e ~/.parallel/ignored_vars ]]; then + # Create an empty ignored_vars file to pass all the environment + # variables to the SSH instance + mkdir -p ~/.parallel + touch ~/.parallel/ignored_vars + fi +} expand_slurm_tasks_per_node () { local tasks @@ -57,11 +64,19 @@ cpu_host_array () { echo "${cpus[i]}/${hosts[i]}" done } -mapfile -t arr < "$(cpu_host_array)" -printf "%s\n" "${arr[*]}" > "$machine_file" -# Pass default options to GNU Parallel -export parallel_opts=" +generate_machinefile () { + # Create the machine file for this job + prefix=${SLURM_JOB_NAME%.*} + machine_file=${prefix}.sshloginfile + + mapfile -t arr < "$(cpu_host_array)" + printf "%s\n" "${arr[*]}" > "$machine_file" +} + +generate_parallel_opts () { + # Pass default options to GNU Parallel + export parallel_opts=" --env _ --sshdelay 0.1 --sshloginfile $machine_file @@ -69,3 +84,15 @@ export parallel_opts=" " # --joblog ${prefix}.joblog # --resume +} + +main () { + if is_slurm_env + then + setup_on_cluster + fi + generate_machinefile + generate_parallel_opts +} + +[[ "$0" != "${BASH_SOURCE[0]}" ]] || main "$@" From 139ebc296a251d52ec5d088a7ca4e04dd10e3fef Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 16 Apr 2019 13:26:47 -0400 Subject: [PATCH 03/27] TST: Cover generate_parallel_opts and is_slurm_env --- tests/parallel.bats | 13 +++++++++++++ tests/slurm_fixture.bats | 14 ++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 tests/parallel.bats create mode 100644 tests/slurm_fixture.bats diff --git a/tests/parallel.bats b/tests/parallel.bats new file mode 100644 index 0000000..870e81a --- /dev/null +++ b/tests/parallel.bats @@ -0,0 +1,13 @@ +#!/usr/bin/env bats + +# shellcheck disable=1083 +true + +# shellcheck source=../parallel-slurm-setup.sh +. 
"${BATS_TEST_DIRNAME}/../parallel-slurm-setup.sh" + +@test 'parallel_opts env var is generated' { + [[ -z "${parallel_opts}" ]] + generate_parallel_opts + [[ -n "${parallel_opts}" ]] +} diff --git a/tests/slurm_fixture.bats b/tests/slurm_fixture.bats new file mode 100644 index 0000000..6531924 --- /dev/null +++ b/tests/slurm_fixture.bats @@ -0,0 +1,14 @@ +#!/usr/bin/env bats + +# shellcheck disable=1083 +true + +# shellcheck source=../parallel-slurm-setup.sh +. "${BATS_TEST_DIRNAME}/../parallel-slurm-setup.sh" + +@test 'SLURM environment is recognized' { + unset SLURM_JOB_ID + ! is_slurm_env + set SLURM_JOB_ID + is_slurm_env +} From 5dd8169e9697a44351a81469397c7ddde4ebc714 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Wed, 17 Apr 2019 12:23:19 -0400 Subject: [PATCH 04/27] DEV: Use descriptive function names --- parallel-slurm-setup.sh | 8 ++++---- tests/parallel.bats | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index 1f1b63c..d39a4f7 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -65,7 +65,7 @@ cpu_host_array () { done } -generate_machinefile () { +write_machinefile () { # Create the machine file for this job prefix=${SLURM_JOB_NAME%.*} machine_file=${prefix}.sshloginfile @@ -74,7 +74,7 @@ generate_machinefile () { printf "%s\n" "${arr[*]}" > "$machine_file" } -generate_parallel_opts () { +export_parallel_opts () { # Pass default options to GNU Parallel export parallel_opts=" --env _ @@ -91,8 +91,8 @@ main () { then setup_on_cluster fi - generate_machinefile - generate_parallel_opts + write_machinefile + export_parallel_opts } [[ "$0" != "${BASH_SOURCE[0]}" ]] || main "$@" diff --git a/tests/parallel.bats b/tests/parallel.bats index 870e81a..1f968fa 100644 --- a/tests/parallel.bats +++ b/tests/parallel.bats @@ -8,6 +8,6 @@ true @test 'parallel_opts env var is generated' { [[ -z "${parallel_opts}" ]] - generate_parallel_opts + export_parallel_opts [[ -n 
"${parallel_opts}" ]] } From c549306ee8b62efc9229db38d10219fbba78783e Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Wed, 17 Apr 2019 12:49:37 -0400 Subject: [PATCH 05/27] TST: Cover expand_slurm_tasks_per_node --- parallel-slurm-setup.sh | 6 +++++- tests/slurm_fixture.bats | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index d39a4f7..2abe718 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -36,9 +36,12 @@ setup_on_cluster () { fi } +# Expand tasks from "2,5(x1),3(x2)" to "2 5 3 3 " expand_slurm_tasks_per_node () { + [[ -z "${SLURM_TASKS_PER_NODE}" ]] && return + local tasks - mapfile -t tasks < "$(echo "${SLURM_TASKS_PER_NODE}" | tr ',' ' ')" + tasks=( $(echo "${SLURM_TASKS_PER_NODE}" | tr ',' ' ') ) local num count for val in ${tasks[*]}; do @@ -48,6 +51,7 @@ expand_slurm_tasks_per_node () { else count=1 fi + # shellcheck disable=2046 printf "$num%.0s " $(seq $count) done } diff --git a/tests/slurm_fixture.bats b/tests/slurm_fixture.bats index 6531924..1e74065 100644 --- a/tests/slurm_fixture.bats +++ b/tests/slurm_fixture.bats @@ -6,6 +6,24 @@ true # shellcheck source=../parallel-slurm-setup.sh . "${BATS_TEST_DIRNAME}/../parallel-slurm-setup.sh" +@test 'SLURM host list is expanded' { + unset SLURM_TASKS_PER_NODE + result=$(expand_slurm_tasks_per_node) + [[ -z "${result}" ]] + + SLURM_TASKS_PER_NODE="2(x3)" + result=$(expand_slurm_tasks_per_node) + echo "${result}" + [[ -n "${result}" ]] + [[ ${result} == "2 2 2 " ]] + + SLURM_TASKS_PER_NODE="2,5(x1),3(x2)" + result=$(expand_slurm_tasks_per_node) + echo "${result}" + [[ -n "${result}" ]] + [[ ${result} == "2 5 3 3 " ]] +} + @test 'SLURM environment is recognized' { unset SLURM_JOB_ID ! 
is_slurm_env From cf502c9ac6ed7aa0118d6f062c727086c3b39cc8 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Wed, 17 Apr 2019 12:58:06 -0400 Subject: [PATCH 06/27] TST: Add failing test for generating machinefile --- tests/slurm_fixture.bats | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/slurm_fixture.bats b/tests/slurm_fixture.bats index 1e74065..79dd01d 100644 --- a/tests/slurm_fixture.bats +++ b/tests/slurm_fixture.bats @@ -6,6 +6,19 @@ true # shellcheck source=../parallel-slurm-setup.sh . "${BATS_TEST_DIRNAME}/../parallel-slurm-setup.sh" +@test 'SLURM tasks are paired with nodes' { + unset SLURM_NODELIST + result=$(cpu_host_array) + [[ -z "${result}" ]] + + SLURM_TASKS_PER_NODE="2(x3)" + SLURM_NODELIST="cn[100-102]" + result=$(cpu_host_array) + echo "${result}" + [[ -n "${result}" ]] + [[ ${result} == "2/cn100 2/cn101 2/cn102" ]] +} + @test 'SLURM host list is expanded' { unset SLURM_TASKS_PER_NODE result=$(expand_slurm_tasks_per_node) From d5e6b468b6e90796ff888934ea0f0f804c261756 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 12:00:49 -0400 Subject: [PATCH 07/27] DEV: Silence shellcheck warning --- parallel-slurm-setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index 2abe718..ca41cd8 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -41,6 +41,7 @@ expand_slurm_tasks_per_node () { [[ -z "${SLURM_TASKS_PER_NODE}" ]] && return local tasks + # shellcheck disable=2207 tasks=( $(echo "${SLURM_TASKS_PER_NODE}" | tr ',' ' ') ) local num count From 359852ba2f52883b1d5287a8a2903e8903d5225f Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 12:01:08 -0400 Subject: [PATCH 08/27] TST: Expect newlines from cpu_host_array() --- tests/slurm_fixture.bats | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/slurm_fixture.bats b/tests/slurm_fixture.bats index 79dd01d..016b662 100644 --- 
a/tests/slurm_fixture.bats +++ b/tests/slurm_fixture.bats @@ -16,7 +16,9 @@ true result=$(cpu_host_array) echo "${result}" [[ -n "${result}" ]] - [[ ${result} == "2/cn100 2/cn101 2/cn102" ]] + [[ "${result}" == "2/cn100 +2/cn101 +2/cn102" ]] } @test 'SLURM host list is expanded' { From 8e515564ec59d6bb06d262f714c4511870887127 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 12:01:46 -0400 Subject: [PATCH 09/27] MAINT: Replace python-hostlist with better maintained clustershell --- parallel-slurm-setup.sh | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index ca41cd8..ba90f90 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -23,6 +23,20 @@ is_slurm_env () { fi } +# Helper to expand hostnames +has_clustershell () { + if python -m ClusterShell.CLI.Nodeset -h &> /dev/null + then + echo 0 + else + echo 1 + fi +} + +install_clustershell () { + python -m pip install --user clustershell +} + setup_on_cluster () { # Load the modules module load parallel @@ -59,12 +73,13 @@ expand_slurm_tasks_per_node () { # Make list in the form of "cpu/host" cpu_host_array () { - local hostlist hosts cpus -# The SLURM `hostlist` executable is part of the Python PIP -# "python-hostlist" package - hostlist=/apps2/python/2.7.6-gcc/bin/hostlist - mapfile -t hosts < "$($hostlist -e "${SLURM_NODELIST}")" - mapfile -t cpus < "$(expand_slurm_tasks_per_node)" + local nodeset hosts cpus + nodeset="python -m ClusterShell.CLI.Nodeset" + + # shellcheck disable=2207 + hosts=( $($nodeset -e "${SLURM_NODELIST}") ) + # shellcheck disable=2207 + cpus=( $(expand_slurm_tasks_per_node) ) for ((i=0; i<${#hosts[*]}; ++i)); do echo "${cpus[i]}/${hosts[i]}" done From d3449653cebbfedf2ac0077a6da803c460c17cdf Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 12:38:43 -0400 Subject: [PATCH 10/27] MAINT: Rewrite environment setup as one liner --- 
parallel-slurm-setup.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index ba90f90..dfcaf87 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -107,10 +107,7 @@ export_parallel_opts () { } main () { - if is_slurm_env - then - setup_on_cluster - fi + is_slurm_env && setup_on_cluster write_machinefile export_parallel_opts } From b3d0885c450d6b56039a14fce5bdae0bf7e79f4f Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 12:39:17 -0400 Subject: [PATCH 11/27] BUG: Automatically install clustershell dependency --- parallel-slurm-setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index dfcaf87..139ecb8 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -108,6 +108,7 @@ export_parallel_opts () { main () { is_slurm_env && setup_on_cluster + ! has_clustershell && install_clustershell write_machinefile export_parallel_opts } From cf2ffbcc1166bb11d720d4537e1a14d1cce02458 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 12:40:08 -0400 Subject: [PATCH 12/27] TST: Cover generation of machinefile --- parallel-slurm-setup.sh | 25 ++++++++++++++++--------- tests/slurm_fixture.bats | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/parallel-slurm-setup.sh b/parallel-slurm-setup.sh index 139ecb8..536cc24 100644 --- a/parallel-slurm-setup.sh +++ b/parallel-slurm-setup.sh @@ -85,25 +85,32 @@ cpu_host_array () { done } -write_machinefile () { - # Create the machine file for this job - prefix=${SLURM_JOB_NAME%.*} - machine_file=${prefix}.sshloginfile +prefix () { + echo "${SLURM_JOB_NAME%.*}" +} + +machinefile () { + echo "$(prefix).sshloginfile" +} - mapfile -t arr < "$(cpu_host_array)" - printf "%s\n" "${arr[*]}" > "$machine_file" +write_machinefile () { + cpu_host_array > "$(machinefile)" } export_parallel_opts () { + local machinefile 
joblog + machinefile=$(machinefile) + joblog=$(prefix).joblog + # Pass default options to GNU Parallel export parallel_opts=" --env _ --sshdelay 0.1 - --sshloginfile $machine_file + --sshloginfile $machinefile --workdir . + --joblog $joblog + --resume " -# --joblog ${prefix}.joblog -# --resume } main () { diff --git a/tests/slurm_fixture.bats b/tests/slurm_fixture.bats index 016b662..9c54664 100644 --- a/tests/slurm_fixture.bats +++ b/tests/slurm_fixture.bats @@ -6,6 +6,30 @@ true # shellcheck source=../parallel-slurm-setup.sh . "${BATS_TEST_DIRNAME}/../parallel-slurm-setup.sh" +@test 'machinefile is generated from environment' { + SLURM_JOB_NAME="submit.slurm" + SLURM_NODELIST="cn[100-102]" + SLURM_TASKS_PER_NODE="2(x3)" + machinefile=$(machinefile) + write_machinefile + [[ $(cat "$machinefile") == "2/cn100 +2/cn101 +2/cn102" ]] + rm -f "$machinefile" +} + +@test 'prefix is generated from SLURM_JOB_NAME' { + unset SLURM_JOB_NAME + result=$(prefix) + [[ -z ${result} ]] + + # shellcheck disable=2034 + SLURM_JOB_NAME="submit.slurm" + result=$(prefix) + echo "${result}" + [[ ${result} == "submit" ]] +} + @test 'SLURM tasks are paired with nodes' { unset SLURM_NODELIST result=$(cpu_host_array) From 38e06b5c295906580c8667336a3a77078670d0b1 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 12:43:33 -0400 Subject: [PATCH 13/27] MAINT: Clear file without adding leading newline --- submit.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submit.slurm b/submit.slurm index f98278c..13e2192 100644 --- a/submit.slurm +++ b/submit.slurm @@ -4,7 +4,7 @@ #SBATCH --output=submit.out # Overwrite instead of appending to output file. -echo > submit.out +echo -n > submit.out # Print the name of each host that GNU Parallel is running on. 
source parallel-slurm-setup.sh From 89450f7fa633a1d7246509c0e54e33ea61e4f905 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 12:44:39 -0400 Subject: [PATCH 14/27] API: Use $parallel_opts, no need to source script anymore --- README.md | 6 +++--- submit.slurm | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2d74b25..3434db5 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ the SLURM scheduler. Namely one has to: - Export the environment, including the current directory. The `parallel-slurm-setup.sh` takes care of both these and provides an -environmental variable `$parallel` for you to run the parallel +environmental variable `$parallel_opts` for you to run the parallel executable with some sensible options. ## Usage @@ -27,8 +27,8 @@ Add the following 2 lines to your SLURM job submission file ``` sh # Inside your SLURM submission file -source ~/parallel-slurm/parallel-slurm-setup.sh -$parallel YOUR_PROGRAM ... +~/parallel-slurm/parallel-slurm-setup.sh +parallel $parallel_opts ... YOUR_PROGRAM ... ``` ## Example diff --git a/submit.slurm b/submit.slurm index 13e2192..c2605fe 100644 --- a/submit.slurm +++ b/submit.slurm @@ -8,4 +8,5 @@ echo -n > submit.out # Print the name of each host that GNU Parallel is running on. source parallel-slurm-setup.sh -$parallel -n0 hostname ::: $(seq $SLURM_NTASKS) +# shellcheck disable=2086 +parallel $parallel_opts -n0 hostname ::: $(seq $SLURM_NTASKS) From 395abcb50d1c60774419d1b4e0d616bb67041651 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Fri, 19 Apr 2019 13:24:40 -0400 Subject: [PATCH 15/27] API: Directly echo GNU parallel options - One cannot export environmental variables in bash without sourcing the script, but sourcing the script is needlessly disruptive for the environment. - Use the return keyword in bash for predicate functions instead of echoing values and corrupting the GNU parallel options output. 
- Fix failing test detection of the SLURM environment. - Document the additional --joblog and --resume options. --- README.md | 13 +++++++++---- parallel-slurm-setup.sh => parallel_opts.sh | 20 ++++++++------------ submit.slurm | 5 +++-- tests/parallel.bats | 6 +++--- tests/slurm_fixture.bats | 6 +++--- 5 files changed, 26 insertions(+), 24 deletions(-) rename parallel-slurm-setup.sh => parallel_opts.sh (91%) mode change 100644 => 100755 diff --git a/README.md b/README.md index 3434db5..47dd4cf 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,8 @@ the SLURM scheduler. Namely one has to: and CPU counts that have been assigned by SLURM. - Export the environment, including the current directory. -The `parallel-slurm-setup.sh` takes care of both these and provides an -environmental variable `$parallel_opts` for you to run the parallel -executable with some sensible options. +The `parallel_opts.sh` takes care of both these job setup steps and +echoes sensible default options to GNU parallel. ## Usage @@ -27,7 +26,7 @@ Add the following 2 lines to your SLURM job submission file ``` sh # Inside your SLURM submission file -~/parallel-slurm/parallel-slurm-setup.sh +parallel_opts=$(~/parallel-slurm/parallel_opts.sh) parallel $parallel_opts ... YOUR_PROGRAM ... ``` @@ -37,6 +36,7 @@ See the `submit.slurm` example file. Run it using: ``` sh # From the command-line +cd ~/parallel-slurm sbatch submit.slurm ``` @@ -51,3 +51,8 @@ cn327 cn328 cn327 ``` + +Note that if you resubmit the job you will not see any output. This +is because of the `--joblog` and `--resume` options; the job remembers +that the work was complete and does not needlessly re-run the program. +To re-run the program you would need to delete the *.joblog file. 
\ No newline at end of file diff --git a/parallel-slurm-setup.sh b/parallel_opts.sh old mode 100644 new mode 100755 similarity index 91% rename from parallel-slurm-setup.sh rename to parallel_opts.sh index 536cc24..90d3685 --- a/parallel-slurm-setup.sh +++ b/parallel_opts.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # GNU Parallel setup for SLURM -# +# # Author: Pariksheet Nanda 2016-2017,2019 # # License: Public Domain / CC0 @@ -17,9 +17,9 @@ true is_slurm_env () { if [[ -n "$SLURM_JOB_ID" ]] then # yes - echo 0 + return 0 else - echo 1 + return 1 fi } @@ -27,9 +27,9 @@ is_slurm_env () { has_clustershell () { if python -m ClusterShell.CLI.Nodeset -h &> /dev/null then - echo 0 + return 0 else - echo 1 + return 1 fi } @@ -38,9 +38,6 @@ install_clustershell () { } setup_on_cluster () { - # Load the modules - module load parallel - # Allow export of environment using `--env` option if [[ ! -e ~/.parallel/ignored_vars ]]; then # Create an empty ignored_vars file to pass all the environment @@ -97,13 +94,12 @@ write_machinefile () { cpu_host_array > "$(machinefile)" } -export_parallel_opts () { +parallel_opts () { local machinefile joblog machinefile=$(machinefile) joblog=$(prefix).joblog - # Pass default options to GNU Parallel - export parallel_opts=" + echo " --env _ --sshdelay 0.1 --sshloginfile $machinefile @@ -117,7 +113,7 @@ main () { is_slurm_env && setup_on_cluster ! has_clustershell && install_clustershell write_machinefile - export_parallel_opts + parallel_opts } [[ "$0" != "${BASH_SOURCE[0]}" ]] || main "$@" diff --git a/submit.slurm b/submit.slurm index c2605fe..75acab1 100644 --- a/submit.slurm +++ b/submit.slurm @@ -6,7 +6,8 @@ # Overwrite instead of appending to output file. echo -n > submit.out +parallel_opts=$(./parallel_opts.sh) +module load parallel + # Print the name of each host that GNU Parallel is running on. 
-source parallel-slurm-setup.sh -# shellcheck disable=2086 parallel $parallel_opts -n0 hostname ::: $(seq $SLURM_NTASKS) diff --git a/tests/parallel.bats b/tests/parallel.bats index 1f968fa..d09701b 100644 --- a/tests/parallel.bats +++ b/tests/parallel.bats @@ -3,11 +3,11 @@ # shellcheck disable=1083 true -# shellcheck source=../parallel-slurm-setup.sh -. "${BATS_TEST_DIRNAME}/../parallel-slurm-setup.sh" +# shellcheck source=../parallel_opts.sh +. "${BATS_TEST_DIRNAME}/../parallel_opts.sh" @test 'parallel_opts env var is generated' { [[ -z "${parallel_opts}" ]] - export_parallel_opts + parallel_opts=$(parallel_opts) [[ -n "${parallel_opts}" ]] } diff --git a/tests/slurm_fixture.bats b/tests/slurm_fixture.bats index 9c54664..47c8711 100644 --- a/tests/slurm_fixture.bats +++ b/tests/slurm_fixture.bats @@ -3,8 +3,8 @@ # shellcheck disable=1083 true -# shellcheck source=../parallel-slurm-setup.sh -. "${BATS_TEST_DIRNAME}/../parallel-slurm-setup.sh" +# shellcheck source=../parallel_opts.sh +. "${BATS_TEST_DIRNAME}/../parallel_opts.sh" @test 'machinefile is generated from environment' { SLURM_JOB_NAME="submit.slurm" @@ -66,6 +66,6 @@ true @test 'SLURM environment is recognized' { unset SLURM_JOB_ID ! 
is_slurm_env - set SLURM_JOB_ID + SLURM_JOB_ID=12345 is_slurm_env } From b02f16f98e21f80a72bfd52d8938c395fc67063b Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 11:48:55 -0400 Subject: [PATCH 16/27] DOC: SLURM scripts probably need the module load command --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 47dd4cf..0abe467 100644 --- a/README.md +++ b/README.md @@ -22,11 +22,12 @@ cd git clone https://github.uconn.edu/HPC/parallel-slurm.git ``` -Add the following 2 lines to your SLURM job submission file +Add the following 3 lines to your SLURM job submission file ``` sh # Inside your SLURM submission file parallel_opts=$(~/parallel-slurm/parallel_opts.sh) +module load parallel parallel $parallel_opts ... YOUR_PROGRAM ... ``` From f6ca55b732d42e1be8bdaef9d00c39062bc8573f Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 11:49:30 -0400 Subject: [PATCH 17/27] DOC: Explain why we need cd --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0abe467..b9292bc 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Clone this Git repository e.g. in your home directory: ``` sh # From the command-line -cd +cd # Go to home directory git clone https://github.uconn.edu/HPC/parallel-slurm.git ``` From 4a934ee874e73419e4fcc78a0ae0fb508e31f3c8 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 11:49:47 -0400 Subject: [PATCH 18/27] DEV: Add trailing newline --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b9292bc..9e83b4d 100644 --- a/README.md +++ b/README.md @@ -56,4 +56,4 @@ cn327 Note that if you resubmit the job you will not see any output. This is because of the `--joblog` and `--resume` options; the job remembers that the work was complete and does not needlessly re-run the program. 
-To re-run the program you would need to delete the *.joblog file. \ No newline at end of file +To re-run the program you would need to delete the *.joblog file. From cbcd2b4c042d29542978ff4f664800aca069acd2 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 11:51:53 -0400 Subject: [PATCH 19/27] MAINT: Use absolute path to move around submission scripts --- submit.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submit.slurm b/submit.slurm index 75acab1..9d484fc 100644 --- a/submit.slurm +++ b/submit.slurm @@ -6,7 +6,7 @@ # Overwrite instead of appending to output file. echo -n > submit.out -parallel_opts=$(./parallel_opts.sh) +parallel_opts=$(~/parallel-slurm/parallel_opts.sh) module load parallel # Print the name of each host that GNU Parallel is running on. From 2202a4ff55173a961347d0a095016a2dbf828108 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 11:57:22 -0400 Subject: [PATCH 20/27] MAINT: Move *.slurm file into examples/ directory --- submit.slurm => examples/01-submit-hostname.slurm | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename submit.slurm => examples/01-submit-hostname.slurm (100%) diff --git a/submit.slurm b/examples/01-submit-hostname.slurm similarity index 100% rename from submit.slurm rename to examples/01-submit-hostname.slurm From b2f7467dc7caca6d98199fd231ca344329e9b6cd Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 12:04:09 -0400 Subject: [PATCH 21/27] MAINT: Remove = symbols for readability --- examples/01-submit-hostname.slurm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/01-submit-hostname.slurm b/examples/01-submit-hostname.slurm index 9d484fc..3749b27 100644 --- a/examples/01-submit-hostname.slurm +++ b/examples/01-submit-hostname.slurm @@ -1,7 +1,7 @@ #!/bin/bash -x -#SBATCH --nodes=2 -#SBATCH --ntasks=5 -#SBATCH --output=submit.out +#SBATCH --nodes 2 +#SBATCH --ntasks 5 +#SBATCH --output submit.out # 
Overwrite instead of appending to output file. echo -n > submit.out From d362129e91418a505ef83fe87a06733fea6522eb Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 12:52:59 -0400 Subject: [PATCH 22/27] ENH: Give user control over resuming and add example --- examples/02-submit-resumable.slurm | 23 ++++++++++++++ examples/script_that_sometimes_fails.sh | 40 +++++++++++++++++++++++++ parallel_opts.sh | 5 +--- 3 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 examples/02-submit-resumable.slurm create mode 100755 examples/script_that_sometimes_fails.sh diff --git a/examples/02-submit-resumable.slurm b/examples/02-submit-resumable.slurm new file mode 100644 index 0000000..8a42875 --- /dev/null +++ b/examples/02-submit-resumable.slurm @@ -0,0 +1,23 @@ +#!/bin/bash -x +#SBATCH --nodes 2 +#SBATCH --ntasks 5 +#SBATCH --output submit.out + +#SBATCH --dependency singleton +#SBATCH --job-name unambiguous-name-for-resumable-job +# Kill job after 15 seconds to show resuming feature. +#SBATCH --time 0:15 + +# Overwrite instead of appending to output file. +echo -n > submit.out + +parallel_opts=$(~/parallel-slurm/parallel_opts.sh) +module load parallel + +# Run a failure prone program. +parallel $parallel_opts \ + --joblog joblog \ + --resume \ + --line-buffer \ + ./script_that_sometimes_fails.sh \ + ::: $(seq $SLURM_NTASKS) diff --git a/examples/script_that_sometimes_fails.sh b/examples/script_that_sometimes_fails.sh new file mode 100755 index 0000000..e344ab5 --- /dev/null +++ b/examples/script_that_sometimes_fails.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +prng () { + # Use the linear conguential generator algorithm: + # https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods + # + # Set it with: + # a = 1 + # b = SLURM_JOB_ID + # m = SLURM_NTASKS + # + # We seed b with the SLURM_JOB_ID so that we independently have + # the same seed for all tasks for a given job. 
+ + x_n=0 + a=1 + b=$SLURM_JOB_ID + m=$SLURM_NTASKS + # Recur as many times as the task id to generate different numbers + # for each SLURM task. + for _ in 1..$SLURM_PROCID + do + x_n=$(( $(( a * $((x_n + b)) )) % m)) + done +} + +main () { + # Randomly fail half of the tasks. + "Task $SLURM_PROC_ID started..." + random_int=$(prng) + sleep "$random_int" + if _=$(( random_int % 2 )) + then + "Task $SLURM_PROC_ID failed!" + exit 1 + fi + "Task $SLURM_PROC_ID succeeded!" +} + +[ "$0" != "${BASH_SOURCE[0]}" ] || main "$@" diff --git a/parallel_opts.sh b/parallel_opts.sh index 90d3685..fedde1a 100755 --- a/parallel_opts.sh +++ b/parallel_opts.sh @@ -95,17 +95,14 @@ write_machinefile () { } parallel_opts () { - local machinefile joblog + local machinefile machinefile=$(machinefile) - joblog=$(prefix).joblog echo " --env _ --sshdelay 0.1 --sshloginfile $machinefile --workdir . - --joblog $joblog - --resume " } From 427f794877d4005a636f6d739c14f01a16d650c8 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 12:56:23 -0400 Subject: [PATCH 23/27] BUG: Fix brace expansion --- examples/script_that_sometimes_fails.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/script_that_sometimes_fails.sh b/examples/script_that_sometimes_fails.sh index e344ab5..fad09e3 100755 --- a/examples/script_that_sometimes_fails.sh +++ b/examples/script_that_sometimes_fails.sh @@ -18,7 +18,7 @@ prng () { m=$SLURM_NTASKS # Recur as many times as the task id to generate different numbers # for each SLURM task. 
- for _ in 1..$SLURM_PROCID + for _ in {1..$SLURM_PROCID} do x_n=$(( $(( a * $((x_n + b)) )) % m)) done From 1bdd3b8cac9bcd1c229b18772692691d1fb34b4a Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 13:33:48 -0400 Subject: [PATCH 24/27] ENH: Validate example 02 on the cluster --- README.md | 6 ++++++ examples/02-submit-resumable.slurm | 9 +++------ examples/script_that_sometimes_fails.sh | 24 ++++++++++++------------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 9e83b4d..b287328 100644 --- a/README.md +++ b/README.md @@ -57,3 +57,9 @@ Note that if you resubmit the job you will not see any output. This is because of the `--joblog` and `--resume` options; the job remembers that the work was complete and does not needlessly re-run the program. To re-run the program you would need to delete the *.joblog file. + +Run the resumable example with: + +``` +rm -f submit.out joblog; touch submit.out; for _ in {1..5} ; do sbatch --partition debug -N1 02-submit-resumable.slurm ; done; tail -f submit.out +``` diff --git a/examples/02-submit-resumable.slurm b/examples/02-submit-resumable.slurm index 8a42875..34812be 100644 --- a/examples/02-submit-resumable.slurm +++ b/examples/02-submit-resumable.slurm @@ -1,5 +1,4 @@ -#!/bin/bash -x -#SBATCH --nodes 2 +#!/bin/bash #SBATCH --ntasks 5 #SBATCH --output submit.out @@ -8,16 +7,14 @@ # Kill job after 15 seconds to show resuming feature. #SBATCH --time 0:15 -# Overwrite instead of appending to output file. -echo -n > submit.out - parallel_opts=$(~/parallel-slurm/parallel_opts.sh) module load parallel # Run a failure prone program. 
+echo "Started SLURM job $SLURM_JOB_ID" parallel $parallel_opts \ --joblog joblog \ - --resume \ + --resume-failed \ --line-buffer \ ./script_that_sometimes_fails.sh \ ::: $(seq $SLURM_NTASKS) diff --git a/examples/script_that_sometimes_fails.sh b/examples/script_that_sometimes_fails.sh index fad09e3..9617495 100755 --- a/examples/script_that_sometimes_fails.sh +++ b/examples/script_that_sometimes_fails.sh @@ -1,40 +1,40 @@ #!/bin/bash +seed=$SLURM_JOB_ID +ID=$1 + prng () { # Use the linear conguential generator algorithm: # https://en.wikipedia.org/wiki/Random_number_generation#Computational_methods # - # Set it with: - # a = 1 - # b = SLURM_JOB_ID - # m = SLURM_NTASKS - # # We seed b with the SLURM_JOB_ID so that we independently have # the same seed for all tasks for a given job. x_n=0 a=1 - b=$SLURM_JOB_ID + b=$seed m=$SLURM_NTASKS # Recur as many times as the task id to generate different numbers # for each SLURM task. - for _ in {1..$SLURM_PROCID} + for i in $(seq 1 $ID) do x_n=$(( $(( a * $((x_n + b)) )) % m)) done + echo $x_n } main () { # Randomly fail half of the tasks. - "Task $SLURM_PROC_ID started..." random_int=$(prng) + echo -n "Task $ID started (seed $seed, random number $random_int) ... " sleep "$random_int" - if _=$(( random_int % 2 )) + if (( $random_int % 4 == 0 )) then - "Task $SLURM_PROC_ID failed!" - exit 1 + echo "succeeded!" + exit 0 fi - "Task $SLURM_PROC_ID succeeded!" + echo "failed!" 
+ exit 1 } [ "$0" != "${BASH_SOURCE[0]}" ] || main "$@" From f16aa34929cdcab3c229e16db3e005e8e73e40b7 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 23 Apr 2019 13:40:06 -0400 Subject: [PATCH 25/27] ENH: Add output from example 02 --- README.md | 34 ++++++++++++++++++++++++++++-- examples/02-submit-resumable.slurm | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b287328..d213480 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,36 @@ To re-run the program you would need to delete the *.joblog file. Run the resumable example with: +```sh +rm -f submit.out joblog; touch submit.out; for _ in {1..5} ; do sbatch --partition debug 02-submit-resumable.slurm; done; tail -f submit.out ``` -rm -f submit.out joblog; touch submit.out; for _ in {1..5} ; do sbatch --partition debug -N1 02-submit-resumable.slurm ; done; tail -f submit.out -``` + +```sh +# Inside your submit.out +Submitted batch job 2320058 +Submitted batch job 2320059 +Submitted batch job 2320060 +Submitted batch job 2320061 +Submitted batch job 2320062 +Started SLURM job 2320058 +Task 5 started (seed 2320058, random number 0) ... succeeded! +Task 2 started (seed 2320058, random number 1) ... failed! +Task 4 started (seed 2320058, random number 2) ... failed! +Task 1 started (seed 2320058, random number 3) ... failed! +Task 3 started (seed 2320058, random number 4) ... succeeded! +Completed SLURM job 2320058 in 00:00:05 +Started SLURM job 2320059 +Task 4 started (seed 2320059, random number 1) ... failed! +Task 2 started (seed 2320059, random number 3) ... failed! +Task 1 started (seed 2320059, random number 4) ... succeeded! +Completed SLURM job 2320059 in 00:00:04 +Started SLURM job 2320060 +Task 2 started (seed 2320060, random number 0) ... succeeded! +Task 4 started (seed 2320060, random number 0) ... succeeded! 
+Completed SLURM job 2320060 in 00:00:00 +Started SLURM job 2320061 +Completed SLURM job 2320061 in 00:00:01 +Started SLURM job 2320062 +Completed SLURM job 2320062 in 00:00:00 +^C +``` \ No newline at end of file diff --git a/examples/02-submit-resumable.slurm b/examples/02-submit-resumable.slurm index 34812be..a94a684 100644 --- a/examples/02-submit-resumable.slurm +++ b/examples/02-submit-resumable.slurm @@ -18,3 +18,4 @@ parallel $parallel_opts \ --line-buffer \ ./script_that_sometimes_fails.sh \ ::: $(seq $SLURM_NTASKS) +echo "Completed SLURM job $SLURM_JOB_ID in $(sacct -nXj $SLURM_JOB_ID -o elapsed)" From 01744ce26c104c92203c08169b5e05ebc063ee30 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 7 May 2019 13:36:18 -0400 Subject: [PATCH 26/27] DOC: Tweak wording --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9e83b4d..197ba36 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,11 @@ the SLURM scheduler. Namely one has to: - Export the environment, including the current directory. The `parallel_opts.sh` takes care of both these job setup steps and -echoes sensible default options to GNU parallel. +adds sensible default options to GNU parallel. ## Usage -Clone this Git repository e.g. in your home directory: +Clone this Git repository into your home directory: ``` sh # From the command-line From 46a9ad9e86b46540a8e49da48889076dee1741d1 Mon Sep 17 00:00:00 2001 From: Pariksheet Nanda Date: Tue, 7 May 2019 13:36:33 -0400 Subject: [PATCH 27/27] DOC: Properly document the 2 examples --- README.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 197ba36..668441b 100644 --- a/README.md +++ b/README.md @@ -31,21 +31,28 @@ module load parallel parallel $parallel_opts ... YOUR_PROGRAM ... ``` -## Example +## Examples -See the `submit.slurm` example file. 
Run it using: +See the `*.slurm` example files. Run each of them using `sbatch` as +explained below: + +### Example 01: Hostname + +This minimal example simply outputs the compute node names in +`submit.out`. ``` sh # From the command-line -cd ~/parallel-slurm -sbatch submit.slurm +cd ~/parallel-slurm/examples +sbatch 01-submit-hostname.slurm +touch submit.out && tail -f submit.out +# Hit Ctrl+C to exit ``` -You should see the output of the compute node names in submit.out. -For example: +The last few lines of your output should show on which nodes your 5 +CPUs were allocated and the `hostname` command was run; for example: ``` sh -# Inside your submit.out cn328 cn327 cn327 @@ -53,7 +60,56 @@ cn328 cn327 ``` -Note that if you resubmit the job you will not see any output. This -is because of the `--joblog` and `--resume` options; the job remembers -that the work was complete and does not needlessly re-run the program. -To re-run the program you would need to delete the *.joblog file. +### Example 02: Resumable + +A typical problem that parallel tasks need to deal with is recovering +from failure. Tasks can fail when they hit the SLURM job time limit. +Or they can fail due to the stochastic nature of a simulation +intermittently not converging; in other words re-running the job can +produce success. + +This example shows how to automatically resume jobs and retry only +failed tasks. This works using the `--joblog` and `--resume-failed` options +to GNU Parallel. Using `--resume-failed` tells GNU Parallel to ignore +successfully completed jobs and re-run failed ones. The joblog records which tasks completed so that they are +not needlessly re-run. If for some reason you +need to re-run a completed task you would need to delete the +`joblog` file. 
+ +To run the example: + +``` sh +# From the command-line +cd ~/parallel-slurm/examples +rm -f joblog submit.out +for i in {1..5}; do sbatch 02-submit-resumable.slurm; done +touch submit.out && tail -f submit.out +# Hit Ctrl+C to exit +``` + +The output shows some tasks intermittently failing and some +succeeding. However, all of them always succeed by the 5th job. + +``` +Started SLURM job 2339006 +Task 5 started (seed 2339006, random number 0) ... succeeded! +Task 1 started (seed 2339006, random number 1) ... failed! +Task 2 started (seed 2339006, random number 2) ... failed! +Task 3 started (seed 2339006, random number 3) ... failed! +Task 4 started (seed 2339006, random number 4) ... succeeded! +Completed SLURM job 2339006 in 00:00:05 +Started SLURM job 2339007 +Task 3 started (seed 2339007, random number 1) ... failed! +Task 1 started (seed 2339007, random number 2) ... failed! +Task 2 started (seed 2339007, random number 4) ... succeeded! +Completed SLURM job 2339007 in 00:00:05 +Started SLURM job 2339008 +Task 1 started (seed 2339008, random number 3) ... failed! +Task 3 started (seed 2339008, random number 4) ... succeeded! +Completed SLURM job 2339008 in 00:00:05 +Started SLURM job 2339009 +Task 1 started (seed 2339009, random number 4) ... succeeded! +Completed SLURM job 2339009 in 00:00:04 +Started SLURM job 2339010 +Completed SLURM job 2339010 in 00:00:00 +```