Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 199 lines (182 sloc) 4.68 KB
#!/usr/bin/env bash
# Create an H2O cluster in SLURM.
#
# Usage: ./dietslurm [-j|--jar PATH_TO_H2O.JAR] [-n|--name CLUSTER_NAME]
#                    [-o|--output OUTPUT_FILE] [-t|--timeout SECONDS]
#
# Requirements: an h2o.jar file. If the R "h2o" package is installed and the
# R module is loaded, the path to the jar is detected automatically; otherwise
# you must provide it with -j, --jar PATH_TO_H2O.JAR.
# Abort unless we are running inside a SLURM job.
#
# Globals: SLURM_JOB_ID (read)
# Arguments:
#   None
# Outputs: an error message on stderr when SLURM_JOB_ID is unset or empty
# Returns: None; exits the shell with status 1 when SLURM_JOB_ID is unset or
#   empty
error_if_not_slurm_job() {
  # ${VAR:-} keeps the test safe even if the caller enabled `set -u`.
  if [[ -z "${SLURM_JOB_ID:-}" ]]; then
    # Diagnostics belong on stderr so they never pollute captured stdout.
    printf '%s\n' "Error: You must run this script inside a SLURM job." >&2
    exit 1
  fi
}
# SLURM job node IP addresses.
#
# Globals: SLURM_JOB_NODELIST (read)
# Arguments:
#   None
# Outputs: first field of each `getent hosts` result line, one per line
#   (the IP addresses of the nodes allocated to the job)
# Returns: exit status of the getent | awk pipeline
slurm_ips() {
  # The node list must be quoted: it may contain glob characters (for example
  # "node[01-04]") that the shell would otherwise try to match against files
  # in the current directory before nodeset ever sees it.
  # The expanded host names, in contrast, are split into separate getent
  # arguments on purpose.
  # shellcheck disable=SC2046
  getent hosts $(nodeset -e "$SLURM_JOB_NODELIST") | awk '{print $1}'
}
# Ask R where the "h2o" package keeps its bundled h2o.jar.
#
# Globals: (Rscript must be reachable through PATH)
# Arguments:
#   None
# Returns: Whatever Rscript prints for the R snippet below, i.e. the result of
#   system.file() for java/h2o.jar inside the "h2o" package
r_h2o_jarpath() {
  local r_snippet
  # message() normally writes to stderr; the sink() call redirects messages to
  # stdout so that callers can capture the path.
  r_snippet='sink(stdout(), type = "message")
message(system.file(package = "h2o", "java", "h2o.jar"))'
  Rscript - <<< "$r_snippet"
}
# Launch h2o cluster in current SLURM allocation.
#
# Globals: SLURM_JOB_NUM_NODES (read). SLURM_CPUS_ON_NODE is referenced inside
#   the remote command and is expanded by the shell srun starts, not here.
# Arguments:
#   (Optional) -j, --jar      Path to h2o.jar (default: searches R package)
#   (Optional) -n, --name     Cluster name (default: h2o_cluster)
#   (Optional) -o, --output   Output file (default: CLUSTER_NAME.out)
#   (Optional) -t, --timeout  Wait seconds to confirm launch (default: 30)
# Outputs: error messages on stderr; the combined stdout/stderr of the srun
#   step is written to the output file
# Returns: None on success; exits with status 1 when h2o.jar cannot be found,
#   when not inside a SLURM job, when no node IP address can be resolved, or
#   when the cluster is not confirmed within the timeout
launch_r_h2o_cluster() {
  local h2o_jar ips ips_raw ips_mask cluster_name file_out timeout \
    elapsed_sec increment key waiter_pid launched

  # Option parsing: every recognised option consumes the value following it.
  while [[ $# -gt 1 ]]; do
    key="$1"
    case "$key" in
      -j|--jar)
        h2o_jar="$2"
        shift
        ;;
      -n|--name)
        cluster_name="$2"
        shift
        ;;
      -o|--output)
        file_out="$2"
        shift
        ;;
      -t|--timeout)
        timeout="$2"
        shift
        ;;
      *)
        # Unknown option: silently ignored.
        ;;
    esac
    shift
  done

  # Sanity checks
  if [[ -z "$h2o_jar" ]]; then
    # Use the R installed h2o package jar. A failing Rscript must not abort
    # the script under `set -e` before the explanation below is printed.
    h2o_jar=$(r_h2o_jarpath) || h2o_jar=""
    if ! [[ -e "$h2o_jar" ]]; then
      printf '%s\n' \
        "Error: Could not find h2o.jar." \
        "Have you installed the h2o package in R?" >&2
      exit 1
    fi
  elif ! [[ -e "$h2o_jar" ]]; then
    printf '%s\n' \
      "Error: Path to provided h2o.jar does not exist: $h2o_jar" >&2
    exit 1
  fi
  error_if_not_slurm_job

  # Restrict the cluster to the IP addresses of the allocated nodes.
  if ! ips_raw=$(slurm_ips) || [[ -z "$ips_raw" ]]; then
    printf '%s\n' \
      "Error: Could not determine the IP addresses of the job nodes." >&2
    exit 1
  fi
  # Word splitting is intended here: one address per whitespace-separated
  # word, and addresses contain no glob characters.
  # shellcheck disable=SC2206
  ips=( $ips_raw )
  ips_mask="$(join_by '/32,' "${ips[@]}")/32"

  cluster_name=${cluster_name:-h2o_cluster}
  file_out=${file_out:-${cluster_name}.out}
  timeout=${timeout:-30}

  # Create the output file up front so that the `tail -f` started by
  # wait_till_h2o_running cannot race with the redirection of the
  # backgrounded srun and fail on a missing file.
  : > "$file_out"

  # Quote the shell command so that it executes remotely: the escaped \$(...)
  # and \$SLURM_CPUS_ON_NODE are evaluated by the `sh` that srun starts. The
  # Java heap size is the second field of the second line of `free -g`.
  srun \
    --nodes "$SLURM_JOB_NUM_NODES" \
    --ntasks "$SLURM_JOB_NUM_NODES" \
    --cpu_bind=boards \
    --label \
    --kill-on-bad-exit=1 \
    --job-name="$cluster_name" \
    sh -c "\
      java \
      -Xmx\$(free -g | awk 'NR==2 {print \$2}')g \
      -jar $h2o_jar \
      -name $cluster_name \
      -network $ips_mask \
      -nthreads \$SLURM_CPUS_ON_NODE \
      " &> "$file_out" &

  # Watch the output file in the background and poll once per second until
  # the watcher has finished or the timeout has elapsed. Tracking the watcher
  # by PID avoids depending on its job number.
  wait_till_h2o_running "${file_out}" &
  waiter_pid=$!
  elapsed_sec=0
  increment=1
  launched=false
  while (( elapsed_sec <= timeout )); do
    if ! kill -0 "$waiter_pid" 2>/dev/null; then
      launched=true
      break
    fi
    sleep "$increment"
    elapsed_sec=$(( elapsed_sec + increment ))
  done
  if [[ "$launched" != true ]]; then
    printf '%s\n' "\
Error: Timed out after ${timeout} seconds waiting for h2o cluster to launch. \
See output in ${file_out}" >&2
    exit 1
  fi
}
# Wait until h2o is running on all nodes.
#
# Follows the output file and counts lines containing
# "Cloud of size N formed", where N is SLURM_JOB_NUM_NODES. Once N such lines
# have been seen, the `tail -f` on the file is killed and the function
# returns.
#
# Globals: SLURM_JOB_NUM_NODES (read)
# Arguments:
#   $1 Output file from h2o cluster.
# Returns: None
wait_till_h2o_running() {
  local file_out magic_string node_count line
  file_out="$1"
  magic_string="Cloud of size ${SLURM_JOB_NUM_NODES} formed"
  node_count=0
  # Wait until we see the number of tasks. See https://superuser.com/a/449307
  # `IFS= read -r` keeps every log line intact (no backslash processing, no
  # whitespace trimming). The counter lives in the pipeline's subshell, which
  # is fine because it is only used inside the loop.
  tail -f "${file_out}" | while IFS= read -r line; do
    # Wait for all nodes to register as connected.
    if [[ "${line}" == *"${magic_string}"* ]]; then
      node_count=$(( node_count + 1 ))
      if (( node_count >= SLURM_JOB_NUM_NODES )); then
        # The pattern mirrors the tail command line above; keep them in sync.
        pkill -f "tail -f ${file_out}"
        # Stop reading right away instead of waiting for tail to go away.
        break
      fi
    fi
  done
}
# Python style string join (idea from https://stackoverflow.com/a/17841619).
#
# Globals: None
# Arguments:
#   $1 join string e.g. ", "
#   $@ (except $1) strings to be joined
# Outputs: the joined string on stdout, without a trailing newline; nothing
#   when there are no strings to join
# Returns: 0
join_by() {
  local delim item
  (( $# > 0 )) || return 0
  delim=$1
  shift
  # Nothing to join: print nothing instead of failing on a second `shift`.
  (( $# > 0 )) || return 0
  # printf '%s' prints every item verbatim, including ones that look like
  # echo options ("-n", "-e"), and the explicit loop keeps delimiters such as
  # "&" literal on every bash version.
  printf '%s' "$1"
  shift
  for item in "$@"; do
    printf '%s%s' "$delim" "$item"
  done
}
# Main program to launch h2o cluster.
#
# Globals: None
# Arguments:
#   Command-line options, forwarded unchanged to launch_r_h2o_cluster
#   (-j/--jar, -n/--name, -o/--output, -t/--timeout).
# Returns: None
main() {
  # Make any errors fatal and propagate pipe error codes.
  set -eo pipefail
  # Forward the arguments; without "$@" every option given on the command
  # line would be silently dropped.
  launch_r_h2o_cluster "$@"
}
# Run `main()` only when this file is executed directly; when it is sourced,
# just define the functions above.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
You can’t perform that action at this time.