diff --git a/README.md b/README.md index 3f81348..2c5d8dc 100644 --- a/README.md +++ b/README.md @@ -131,8 +131,9 @@ IDs. The record ID is read by the program to find the corresponding input parameters, the program calculates the result from the parameters and saves the result in a directory. -Once all records are completed, the results from the directory are -aggregated into a single results file. +Once all records are completed, you can aggregate the results from the +directory into a single results file. Aggregating the files does not +require a SLURM job. GNU Parallel automatically feeds record IDs to task workers as the task workers complete records and become available. @@ -142,16 +143,52 @@ task workers complete records and become available. module purge module load python/2.7.6-gcc-unicode cd ~/parallel-slurm/examples -rm -rf joblog submit.out results/ results.csv -for i in {1..5}; do sbatch 03-submit-param-sweep.slurm; done +rm -rf joblog submit.out results/ +for i in {1..3}; do sbatch 03-submit-param-sweep.slurm; done touch submit.out && tail -f submit.out # Hit Ctrl+C to exit ``` Output: +``` +Started SLURM job 2346922 +Running 60 of total 60 simulations. +1: Fitting model to parameters: x = 0.1, y = -0.1, z = control ... +1: ... done! Saved result 4.196 to results/01.dat +2: Fitting model to parameters: x = 0.1, y = -0.1, z = positive ... +2: ... done! Saved result 1.574 to results/02.dat +3: Fitting model to parameters: x = 0.1, y = -0.1, z = negative ... +3: ... done! Saved result 12.589 to results/03.dat +... +44: Fitting model to parameters: x = 0.8, y = -0.1, z = positive ... +44: ... done! Saved result 1.278 to results/44.dat +slurmstepd: *** JOB 2346922 ON cn338 CANCELLED AT 2019-05-13T18:52:16 DUE TO TIME LIMIT *** +Started SLURM job 2346923 +Running 15 of total 60 simulations. +45: Fitting model to parameters: x = 0.8, y = -0.1, z = negative ... +45: ... done! Saved result 10.226 to results/45.dat +... 
+59: Fitting model to parameters: x = 1.0, y = 0.1, z = positive ... +59: ... done! Saved result 1.250 to results/59.dat +60: Fitting model to parameters: x = 1.0, y = 0.1, z = negative ... +60: ... done! Saved result 10.000 to results/60.dat +Completed SLURM job 2346923 in 00:00:32 +Started SLURM job 2346924 +Nothing to run; all 60 simulations complete. +Completed SLURM job 2346924 in 00:00:01 ``` +```console +$ head -n 4 joblog; echo "..."; tail -n 3 joblog +Seq Host Starttime JobRuntime Send Receive Exitval Signal Command +1 cn236 1557787112.519 5.414 0 119 0 0 python model.py 1 +3 cn236 1557787112.750 5.250 0 120 0 0 python model.py 3 +2 cn236 1557787117.940 5.276 0 429 0 0 python model.py 2 +... +58 cn338 1557787220.703 5.284 0 120 0 0 python model.py 58 +59 cn338 1557787220.815 5.267 0 121 0 0 python model.py 59 +60 cn338 1557787225.850 5.263 0 121 0 0 python model.py 60 ``` ## Next Steps diff --git a/examples/03-submit-param-sweep.slurm b/examples/03-submit-param-sweep.slurm index 99583d3..19d323e 100644 --- a/examples/03-submit-param-sweep.slurm +++ b/examples/03-submit-param-sweep.slurm @@ -1,11 +1,10 @@ #!/bin/bash -#SBATCH --ntasks 5 +#SBATCH --ntasks 3 #SBATCH --output submit.out - #SBATCH --dependency singleton #SBATCH --job-name example-03-parameter-sweep -# Kill job after 5 minutes seconds to show resuming feature. -#SBATCH --time 5:00 +# Kill job after 1 minute to show resuming feature. +#SBATCH --time 1:00 parallel_opts=$(~/parallel-slurm/parallel_opts.sh) module load parallel @@ -22,6 +21,7 @@ echo "Started SLURM job $SLURM_JOB_ID" if [[ n_remaining -eq 0 ]] then echo "Nothing to run; all $n_sim simulations complete." 
+ echo "Completed SLURM job $SLURM_JOB_ID in $(sacct -nXj $SLURM_JOB_ID -o elapsed)" exit 0 fi diff --git a/examples/model.py b/examples/model.py index edd1691..bf95307 100755 --- a/examples/model.py +++ b/examples/model.py @@ -38,7 +38,7 @@ def run(index): for param, value in record.items()]))) value = {'positive': 0.8, 'negative': 0.1, 'control': 0.3} result = record['x'] ** record['y'] / value[record['z']] - time.sleep(1) + time.sleep(5) path_result = os.path.join(RESULTS_DIR, '{:02d}.dat'.format(index)) if not os.path.exists(RESULTS_DIR): os.mkdir(RESULTS_DIR)