#!/bin/env bash
# Create an H2O cluster in SLURM.
#
# Usage: ./dietslurm [optional_path_to_h2o.jar]
#
# Requirements: Make sure you have loaded an r module with the "h2o"
# package installed. In the future we might support spawning h2o
# clusters for other languages in addition to R; for now we allow
# specifying a path to h2o.jar that may be installed by other
# languages.
#
# This script follows these style guides:
# - https://github.com/progrium/bashstyle
# - https://github.com/bahamas10/bash-style-guide
# Test if we are inside a SLURM job.
#
# Globals: (SLURM `sbatch` generated environmental variables)
# Arguments:
# None
# Returns: None
error_if_not_slurm_job() {
  # SLURM sets SLURM_JOB_ID inside an allocation; presence means we are
  # running under sbatch/salloc, so nothing to do.
  if [[ -n "$SLURM_JOB_ID" ]]; then
    return
  fi
  # Not inside a job: report and abort the whole script.
  echo "Error: You must run this script inside a SLURM job."
  exit 1
}
# SLURM job node IP addresses.
#
# Globals: (SLURM `sbatch` generated environmental variables)
# Arguments:
# None
# Returns: IP addresses, one per line, of nodes allocated to job
slurm_ips() {
  # Expand the compact SLURM nodelist (e.g. "node[01-03]") to hostnames,
  # resolve each one, and print its address (first getent column), one per
  # line.  $SLURM_JOB_NODELIST is quoted: unquoted, the brackets in a
  # nodelist are glob characters and pathname expansion could mangle it.
  # The $(nodeset ...) substitution is intentionally left unquoted so each
  # expanded hostname becomes a separate getent argument.
  getent hosts $(nodeset -e "$SLURM_JOB_NODELIST") | awk '{print $1}'
}
# Path to h2o.jar installed by R "h2o" package.
#
# Globals: (R module must be loaded in PATH)
# Arguments:
# None
# Returns: Path to h2o.jar installed by R "h2o" package
r_h2o_jarpath() {
  # Ask R where the "h2o" package installed its bundled h2o.jar.
  # The quoted delimiter makes the heredoc literal (the R code contains no
  # shell expansions anyway).  sink() reroutes message() output — which
  # normally goes to stderr — onto stdout so callers can capture the path
  # with command substitution.
  Rscript - <<'EOF'
sink(stdout(), type = "message")
message(system.file(package = "h2o", "java", "h2o.jar"))
EOF
}
# Launch h2o cluster in current SLURM allocation.
#
# Globals: (SLURM `sbatch` generated environmental variables)
# Arguments:
# None
# Returns: None
launch_r_h2o_cluster() {
  local h2o_jar ips ips_mask cluster_name
  # Sanity checks
  #
  # Use the R installed h2o package jar.
  h2o_jar=$(r_h2o_jarpath)
  if ! [[ -e "$h2o_jar" ]]; then
    echo "\
Error: Could not find h2o.jar.
Have you installed the h2o package in R?"
    exit 1
  fi
  error_if_not_slurm_job
  # Word-splitting of the command substitution is intentional: one array
  # element per node IP.
  ips=($(slurm_ips))
  # Build a CIDR whitelist like "ip1/32,ip2/32,..." for h2o's -network flag.
  ips_mask="$(join_by '/32,' "${ips[@]}")/32"
  cluster_name="h2o_cluster"
  # h2o's JVM is already multi-threaded, so pin OpenBLAS to a single thread
  # to avoid oversubscribing CPUs, per
  # https://github.com/xianyi/OpenBLAS/wiki/faq#multi-threaded
  export OPENBLAS_NUM_THREADS=1
  # Quote the command so that it executes remotely.  The escaped \$(...) is
  # evaluated on each node, sizing the JVM heap (-Xmx) from that node's
  # `free -g` output (row 2, column 2 — total memory in GB; NOTE(review):
  # this claims the node's entire RAM for the heap — confirm intended).
  srun \
    --label \
    --kill-on-bad-exit=1 \
    --overcommit \
    --job-name="$cluster_name" \
    sh -c "\
java \
-Xmx\$(free -g | awk 'NR==2 {print \$2}')g \
-jar $h2o_jar \
-name $cluster_name \
-network $ips_mask \
" &> "${cluster_name}.out" &
  wait_till_h2o_running
}
# Wait until h2o is running on all nodes.
#
# Globals: (SLURM `sbatch` generated environmental variables)
# Arguments:
# None
# Returns: None
wait_till_h2o_running() {
  # FIXME: Check the output instead of using sleep :/
  # (e.g. poll h2o's log/REST endpoint; 5s is a JVM-startup heuristic.)
  sleep 5
  # Crude liveness report: list the java process on every allocated node.
  # Both expansions are quoted — a nodelist like "node[01-03]" contains
  # glob characters and must not undergo pathname expansion.
  clush -w "$SLURM_JOB_NODELIST" pgrep -fl -u "$USER" ^java
  # wait
}
# Python style string join from https://stackoverflow.com/a/17841619
#
# Globals: None
# Arguments:
# $1 join string e.g. ", "
# $@ (except $1) strings to be joined
# Returns: Joined string
# Python style string join from https://stackoverflow.com/a/17841619
#
# Globals: None
# Arguments:
# $1 join string e.g. ", "
# $@ (except $1) strings to be joined
# Returns: Joined string on stdout; always exits 0
join_by() {
  local delim
  delim=${1-}
  shift || return 0   # called with no arguments at all -> empty result
  # printf instead of `echo -n`: portable, and safe for strings that look
  # like echo options ("-n", "-e") or contain backslashes.
  printf '%s' "${1-}"
  # With zero or one string left, a bare failing `shift` would return
  # nonzero and kill callers running under `set -e`; return cleanly.
  shift || return 0
  # Prefix every remaining argument with the delimiter.
  printf '%s' "${@/#/$delim}"
}
# Main program to launch h2o cluster.
#
# Globals: None
# Arguments:
# None
# Returns: None
main() {
  # Fail fast: abort on any command error and surface failures from any
  # stage of a pipeline.  (-u is deliberately omitted: the SLURM checks
  # rely on unset variables expanding to empty.)
  set -o errexit
  set -o pipefail
  launch_r_h2o_cluster
}
# Run main() only when this file is executed directly, not when sourced.
if [[ "$0" == "$BASH_SOURCE" ]]; then
  main "$@"
fi