Skip to content
Permalink
Newer
Older
100755 141 lines (127 sloc) 3.23 KB
Jun 22, 2017
1
#!/bin/env bash
2
3
# Create an H2O cluster in SLURM.
4
#
5
# Usage: ./dietslurm [optional_path_to_h2o.jar]
6
#
7
# Requirements: Make sure you have loaded an R module with the "h2o"
# package installed. In the future we might support spawning h2o
# clusters for other languages in addition to R; for now we allow
# specifying a path to an h2o.jar that may have been installed by
# other languages.
12
#
13
# This script follows these style guides:
14
# - https://github.com/progrium/bashstyle
15
# - https://github.com/bahamas10/bash-style-guide
16
17
# Abort unless we are inside a SLURM job.
#
# Globals: SLURM_JOB_ID (read; SLURM `sbatch` generated environmental
#   variable)
# Arguments:
#   None
# Outputs: Error message on stderr when not inside a SLURM job
# Returns: None (exits the shell with status 1 when SLURM_JOB_ID is unset
#   or empty)
error_if_not_slurm_job() {
  # The `:-` default keeps this test safe even if a caller enables `set -u`.
  if [[ -z "${SLURM_JOB_ID:-}" ]]; then
    # Diagnostics belong on stderr so they never pollute captured stdout.
    echo "Error: You must run this script inside a SLURM job." >&2
    exit 1
  fi
}
31
32
# SLURM job node IP addresses.
#
# Globals: SLURM_JOB_NODELIST (read; SLURM `sbatch` generated environmental
#   variable)
# Arguments:
#   None
# Outputs: IP addresses, one per line, of nodes allocated to job
# Returns: Non-zero if the node list cannot be expanded or expands to
#   nothing; otherwise the status of the getent/awk pipeline
slurm_ips() {
  local expanded
  local -a nodes=()

  # Quote the node list: it may contain brackets (e.g. node[1-3]) that the
  # shell would otherwise treat as a glob pattern.
  expanded=$(nodeset -e "$SLURM_JOB_NODELIST") || return

  # Split the expanded host names on whitespace without globbing them.
  # `read -d ''` reports reaching EOF with a non-zero status, hence `|| true`.
  IFS=$' \t\n' read -r -d '' -a nodes <<< "$expanded" || true

  # Without any host argument `getent hosts` would list the whole database.
  if (( ${#nodes[@]} == 0 )); then
    return 1
  fi

  getent hosts "${nodes[@]}" | awk '{print $1}'
}
41
42
# Locate the h2o.jar bundled with the R "h2o" package.
#
# Globals: (R module must be loaded in PATH so that `Rscript` is found)
# Arguments:
#   None
# Returns: Path to h2o.jar installed by R "h2o" package (printed on stdout)
r_h2o_jarpath() {
  local r_code

  # message() normally writes to stderr; the sink() call redirects the
  # message stream to stdout so callers can capture the path.
  r_code='sink(stdout(), type = "message")
message(system.file(package = "h2o", "java", "h2o.jar"))'

  # "-" makes Rscript read the program from stdin.
  Rscript - <<< "$r_code"
}
54
55
# Launch h2o cluster in current SLURM allocation.
#
# Globals: (SLURM `sbatch` generated environmental variables)
#   OPENBLAS_NUM_THREADS (exported)
# Arguments:
#   $1 (optional) path to h2o.jar; when omitted or empty, the jar installed
#      by the R "h2o" package is used
# Outputs: Error messages on stderr; the srun/java output is written to
#   h2o_cluster.out in the current directory
# Returns: None (exits the shell with status 1 if a sanity check fails)
launch_r_h2o_cluster() {
  local h2o_jar ips_mask cluster_name
  local -a ips=()

  # Sanity checks
  #
  # Use the jar given by the caller, else the R installed h2o package jar.
  h2o_jar=${1:-}
  if [[ -n "$h2o_jar" ]]; then
    if ! [[ -e "$h2o_jar" ]]; then
      echo "Error: Could not find h2o.jar at '$h2o_jar'." >&2
      exit 1
    fi
  else
    # Tolerate a failing lookup (e.g. Rscript missing) so the message below
    # is printed instead of `set -e` aborting without explanation.
    h2o_jar=$(r_h2o_jarpath) || h2o_jar=""
    if ! [[ -e "$h2o_jar" ]]; then
      echo "\
Error: Could not find h2o.jar.
Have you installed the h2o package in R?" >&2
      exit 1
    fi
  fi
  error_if_not_slurm_job

  # slurm_ips prints one address per line; read them into an array without
  # word splitting or globbing.
  mapfile -t ips < <(slurm_ips)
  if (( ${#ips[@]} == 0 )); then
    # Otherwise the mask below would degenerate to the invalid "/32".
    echo "Error: Could not determine the IP addresses of the job nodes." >&2
    exit 1
  fi
  # Restrict h2o to exactly the job's nodes: "ip1/32,ip2/32,...".
  ips_mask="$(join_by '/32,' "${ips[@]}")/32"
  cluster_name="h2o_cluster"
  # Limit OpenBLAS to a single thread per
  # https://github.com/xianyi/OpenBLAS/wiki/faq#multi-threaded
  export OPENBLAS_NUM_THREADS=1
  # Single-quote the command so that `free` is evaluated remotely on each
  # node. The jar path, cluster name and network mask are handed over as
  # positional parameters so that values containing whitespace stay intact.
  srun \
    --label \
    --kill-on-bad-exit=1 \
    --overcommit \
    --job-name="$cluster_name" \
    sh -c '
      java \
        -Xmx$(free -g | awk "NR==2 {print \$2}")g \
        -jar "$1" \
        -name "$2" \
        -network "$3"
    ' sh "$h2o_jar" "$cluster_name" "$ips_mask" \
    &> "${cluster_name}.out" &

  wait_till_h2o_running
}
97
98
# Wait until h2o is running on all nodes.
#
# Currently this only sleeps for a fixed time and then lists the user's java
# processes on every node of the job.
#
# Globals: SLURM_JOB_NODELIST, USER (read)
# Arguments:
#   None
# Outputs: Whatever clush/pgrep print for the java processes found
# Returns: Exit status of clush
wait_till_h2o_running() {
  # FIXME: Check the output instead of using sleep :/
  sleep 5
  # Quote the expansions: the node list may contain glob characters such as
  # brackets, which the shell must pass to clush untouched.
  clush -w "$SLURM_JOB_NODELIST" pgrep -fl -u "$USER" '^java'
  # wait
}
110
111
# Python style string join, adapted from https://stackoverflow.com/a/17841619
#
# Globals: None
# Arguments:
#   $1 join string e.g. ", "
#   $@ (except $1) strings to be joined
# Outputs: Joined string on stdout, without a trailing newline; nothing when
#   there are no strings to join
# Returns: 0
join_by() {
  local delim=${1-} joined item

  # Only a delimiter (or nothing at all) was given: nothing to join.
  if (( $# < 2 )); then
    return 0
  fi
  shift
  joined=$1
  shift

  # Build the result explicitly instead of relying on `echo -n` (which
  # swallows elements such as "-n" or "-e") and on `${@/#/$delim}` (whose
  # unquoted replacement treats "&" specially on newer bash versions).
  for item in "$@"; do
    joined+="${delim}${item}"
  done
  printf '%s' "$joined"
}
126
127
# Main program to launch h2o cluster.
#
# Globals: None
# Arguments:
#   $@ command-line arguments, forwarded unchanged to launch_r_h2o_cluster
#      (the usage at the top of this file documents an optional path to
#      h2o.jar as the first argument)
# Returns: None
main() {
  # Make any errors fatal and propagate pipe error codes.
  set -eo pipefail

  # Forward the arguments instead of dropping them.
  launch_r_h2o_cluster "$@"
}
139
140
# Boilerplate: run `main()` only when this file is executed directly; when it
# is sourced, $0 differs from the source file name and nothing is run.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
You can’t perform that action at this time.