Skip to content
Permalink
Browse files

ENH: h2o cluster finally launches to use all CPUs

Still need to fix the failing `jobs` check: once the background function
job completes, the check still considers `srun` to be a running job.
  • Loading branch information
pan14001 committed Jun 23, 2017
1 parent e281064 commit cc8544b3412600713db0b2ea689ae1ef2096fb34
Showing with 80 additions and 29 deletions.
  1. +80 −29 dietslurm-network.sh
@@ -4,15 +4,9 @@
#
# Usage: ./dietslurm [optional_path_to_h2o.jar]
#
# Requirements: Make sure you have loaded an r module with the "h2o"
# package installed. In the future we might support spawning h2o
# clusters for other languages in addition to R; for now we allow
# specifying a path to h2o.jar that may be installed by other
# languages.
#
# This script follows these style guides:
# - https://github.com/progrium/bashstyle
# - https://github.com/bahamas10/bash-style-guide
# Requirements: h2o.jar file; if you have the R "h2o" package installed and the
# r module loaded, the path will be automatically detected, otherwise you must
# provide it with -j, --jar PATH_TO_H2O.JAR.

# Test if we are inside a SLURM job.
#
@@ -56,56 +50,113 @@ EOF
#
# Globals: (SLURM `sbatch` generated environmental variables)
# Arguments:
# (Optional) -j, --jar Path to h2o.jar (default: searches R package)
# (Optional) -n, --name Cluster name (default: h2o_cluster)
# (Optional) -o, --output Output file (default: CLUSTER_NAME.out)
# (Optional) -t, --timeout Seconds to wait to confirm launch (default: 30)
# Returns: None, but
# exports OPENBLAS_NUM_THREADS
# writes to the output file (CLUSTER_NAME.out unless -o is given)
launch_r_h2o_cluster() {
  # Launch one h2o JVM per node of the current SLURM job and wait until the
  # cluster reports that it has formed.
  #
  # Options (each takes exactly one value):
  #   -j, --jar PATH      Path to h2o.jar (default: ask r_h2o_jarpath).
  #   -n, --name NAME     Cluster name (default: h2o_cluster).
  #   -o, --output FILE   File receiving srun/h2o output
  #                       (default: NAME.out).
  #   -t, --timeout SECS  Seconds to wait for the launch (default: 30).
  #
  # Exits the shell with status 1 when the jar cannot be found, the timeout
  # is not a non-negative integer, or the cluster does not come up in time.
  # On success the srun job is left running in the background.
  local h2o_jar="" cluster_name="" file_out="" timeout=""
  local key ips ips_mask elapsed_sec increment waiter_pid launched

  # Option parsing. The loop stops when fewer than two words remain, so a
  # trailing option without a value is silently ignored.
  while [[ $# -gt 1 ]]; do
    key="$1"
    case "$key" in
      -j|--jar)
        h2o_jar="$2"
        shift
        ;;
      -n|--name)
        cluster_name="$2"
        shift
        ;;
      -o|--output)
        file_out="$2"
        shift
        ;;
      -t|--timeout)
        timeout="$2"
        shift
        ;;
      *)
        # Unknown option: skipped on purpose.
        ;;
    esac
    shift
  done

  # Sanity checks
  #
  if [[ -z "$h2o_jar" ]]; then
    # Use the R installed h2o package jar.
    h2o_jar=$(r_h2o_jarpath)
    if ! [[ -e "$h2o_jar" ]]; then
      echo "\
Error: Could not find h2o.jar.
Have you installed the h2o package in R?" >&2
      exit 1
    fi
  elif ! [[ -e "$h2o_jar" ]]; then
    echo "\
Error: Path to provided h2o.jar does not exist: $h2o_jar" >&2
    exit 1
  fi
  timeout=${timeout:-30}
  if ! [[ "$timeout" =~ ^[0-9]+$ ]]; then
    echo "\
Error: Timeout must be a non-negative integer number of seconds: $timeout" >&2
    exit 1
  fi
  error_if_not_slurm_job

  # Word splitting is intentional: slurm_ips prints whitespace-separated IPs.
  # shellcheck disable=SC2207
  ips=($(slurm_ips))
  ips_mask="$(join_by '/32,' "${ips[@]}")/32"
  cluster_name=${cluster_name:-h2o_cluster}
  file_out=${file_out:-${cluster_name}.out}
  # NOTE(review): presumably keeps OpenBLAS single-threaded because h2o does
  # its own threading; see
  # https://github.com/xianyi/OpenBLAS/wiki/faq#multi-threaded
  export OPENBLAS_NUM_THREADS=1

  # The sh script is single-quoted so that $(free ...) and
  # $SLURM_CPUS_ON_NODE are evaluated on each compute node, not here.
  # Local values are handed over as positional parameters so that paths or
  # names containing spaces survive intact.
  srun \
    --nodes "$SLURM_JOB_NUM_NODES" \
    --ntasks "$SLURM_JOB_NUM_NODES" \
    --cpu_bind=boards \
    --label \
    --kill-on-bad-exit=1 \
    --overcommit \
    --job-name="$cluster_name" \
    sh -c '
      java \
        -Xmx"$(free -g | awk "NR==2 {print \$2}")g" \
        -jar "$1" \
        -name "$2" \
        -network "$3" \
        -nthreads "$SLURM_CPUS_ON_NODE"
    ' sh "$h2o_jar" "$cluster_name" "$ips_mask" \
    &> "$file_out" &

  # Watch the output file in the background and poll that watcher by PID.
  # A jobspec such as %+ cannot be used here: once the watcher finishes, %+
  # refers to the still-running srun job and the check never fails.
  wait_till_h2o_running "$file_out" &
  waiter_pid=$!
  elapsed_sec=0
  increment=1
  launched=0
  while (( elapsed_sec <= timeout )); do
    if ! kill -0 "$waiter_pid" 2>/dev/null; then
      launched=1
      break
    fi
    sleep "$increment"
    elapsed_sec=$(( elapsed_sec + increment ))
  done
  if (( ! launched )); then
    # Best effort: stop the watcher; it may already be gone.
    kill "$waiter_pid" 2>/dev/null
    echo "\
Error: Timed out after ${timeout} seconds waiting for h2o cluster to launch. \
See output in ${file_out}" >&2
    exit 1
  fi
}

# Wait until h2o is running on all nodes.
#
# Blocks until the h2o output file contains the line announcing that a cloud
# of SLURM_JOB_NUM_NODES members has formed. There is no timeout here; the
# caller is expected to impose one.
#
# Globals: SLURM_JOB_NUM_NODES (read)
# Arguments:
#   $1  Output file from h2o cluster.
# Returns: 0 once the message has been seen.
wait_till_h2o_running() {
  local file_out magic_string
  file_out="$1"
  magic_string="Cloud of size ${SLURM_JOB_NUM_NODES} formed"

  # Poll the file rather than piping `tail -f` into a reader: when this
  # function runs in the background, $$ is still the top-level shell's PID,
  # so `pkill -P $$ tail` never matches the tail process and the pipeline
  # would never end. grep errors are silenced because the file may not
  # exist yet on the first few polls.
  until grep -qF -- "$magic_string" "$file_out" 2>/dev/null; do
    sleep 1
  done
}

# Python style string join from https://stackoverflow.com/a/17841619

0 comments on commit cc8544b

Please sign in to comment.
You can’t perform that action at this time.