Skip to content
Permalink
Browse files

ENH: Initial commit

  • Loading branch information
pan14001 committed Jun 22, 2017
0 parents commit ea7ff69c23bd21a493c084dc704394b3156db159
Showing with 344 additions and 0 deletions.
  1. +3 −0 .gitignore
  2. +86 −0 README.md
  3. +1 −0 dietslurm
  4. +76 −0 dietslurm-flatfile.sh
  5. +138 −0 dietslurm-network.sh
  6. +17 −0 h2o.slurm
  7. +23 −0 tests/slurm.bats
@@ -0,0 +1,3 @@
h2o.flatfile
h2o.out
h2o_cluster.out
@@ -0,0 +1,86 @@
## dietslurm

Run h2o on a SLURM cluster using the h2o R package.

To use h2o on a SLURM cluster,
one needs to spawn multiple instances of java
across several compute nodes,
specifying details like memory, IP addresses, etc.

While the h2o docs suggest launching the cluster from
[inside R](http://docs.h2o.ai/h2o/latest-stable/h2o-docs/faq.html),
it makes suboptimal use of memory and CPUs,
and does a poor job of validating the cluster resources.
We instead use `bash` to create the h2o cluster, which has the following advantages:

1. Language independent:
one can use the script to launch h2o clusters
for languages besides R.

2. Use SLURM's `srun` process manager
to log resource usage
which helps troubleshoot performance issues, etc.

3. Additional checks and unit tests.

The name "dietslurm" is a play on
the [slurm drink](https://en.wikipedia.org/wiki/Fry_and_the_Slurm_Factory#Slurm)
from Futurama,
and adding water or "h2o"
to make a diet version.
Thanks to Luke Malinowski for the name.

## Installation

Install the "h2o" R package for your preferred version of R.
See "Install in R" installation instructions in the
[h2o docs](http://docs.h2o.ai/h2o/latest-stable/h2o-docs/downloading.html).

At the time of writing,
on the Storrs HPC cluster
we have installed the "h2o" package for r/3.3.3

## Usage

See the example submission script [h2o.slurm](h2o.slurm)

## Tests

We use the [bats](https://github.com/sstephenson/bats) unit test suite.

Run the unit tests on the cluster with:

``` bash
module load bats
tests/*.bats
```

If you have done any unit testing before,
the format is similar
and the documentation is simple and concise.
Learn to use bats from reading the man pages, or GitHub.
The GitHub link includes links to projects using bats
as well as instructions for setting up your favorite text editor for syntax highlighting, etc.

``` bash
man bats
man 7 bats
# Or see https://github.com/sstephenson/bats
```

## Hacking

1. Read the dietslurm script.
Many stylistic elements can be intuitively determined
by reading and copying what's already there.
Some of the style guides below are not strictly followed.
Use your noodle ;)

2. As far as possible follow
[Jeff Lindsay](https://github.com/progrium/bashstyle)

3. When in doubt, follow
[Greg Wooledge](http://mywiki.wooledge.org/BashGuide/Practices)

4. Only for documenting functions, follow the
[Google style guide](https://google.github.io/styleguide/shell.xml?showone=Function_Comments#Function_Comments)
@@ -0,0 +1,76 @@
#!/bin/bash -x

# Create an H2O cluster in SLURM.
#
# Usage: ./dietslurm.sh
#
# Requirements: Make sure you have loaded an r module with the "h2o"
# package installed. In the future one might add functions to this
# script to spawn h2o clusters for other languages.
#
# This script follows these style guides:
# - https://github.com/progrium/bashstyle
# - https://github.com/bahamas10/bash-style-guide

# Abort the script as soon as a command exits with a non-zero status.
set -o errexit

# Print the IP address of every node allocated to the current SLURM job.
#
# Globals:
#   SLURM_JOB_NODELIST (read) - compact node list, e.g. "cn[01-03]"
# Arguments:
#   None
# Outputs:
#   One address per line on stdout (the first field of each `getent hosts`
#   result line).
slurm_ips() {
  # The node list is quoted because its bracket syntax is also a valid glob
  # pattern: unquoted, a matching file name in the working directory would
  # silently replace it. The output of `nodeset -e` is left unquoted on
  # purpose so that each host name becomes a separate argument to getent.
  # shellcheck disable=SC2046
  getent hosts $(nodeset -e "$SLURM_JOB_NODELIST") | awk '{print $1}'
}

# Print the path of the h2o.jar shipped with the R "h2o" package.
#
# Globals:
#   None (Rscript must be found in PATH)
# Arguments:
#   None
# Outputs:
#   The jar path on stdout; an empty line when R cannot find the file.
r_h2o_jarpath() {
  # writeLines() prints to stdout. The previous message(...) 2>&1 form merged
  # all of R's stderr into the result, so any warning corrupted the path.
  Rscript -e 'writeLines(system.file(package = "h2o", "java", "h2o.jar"))'
}

# Launch an h2o cluster inside the current SLURM job, using a flatfile of
# node IP addresses for node discovery.
#
# Globals:
#   SLURM_JOB_ID (read) - must be non-empty, i.e. we are inside a job
#   HOSTNAME     (read) - passed to srun's --nodelist
# Arguments:
#   None
# Outputs:
#   Error messages on stderr.
#   Writes h2o.flatfile (one IP per line) and h2o_cluster.out (srun/java
#   output) in the current directory.
# Returns:
#   Exits the shell with status 1 when a sanity check fails.
launch_r_h2o_cluster() {
  local h2o_jar flatfile cluster_name

  # Use the installed h2o package jar. Declaration and assignment are split,
  # and a failing lookup is mapped to an empty path, so that the friendly
  # error below is always reached (even under `set -e`).
  h2o_jar=$(r_h2o_jarpath) || h2o_jar=""
  if [[ ! -e "$h2o_jar" ]]; then
    printf '%s\n' \
      'Error: Could not find h2o.jar.' \
      'Have you installed the h2o package in R?' >&2
    exit 1
  fi
  if [[ -z "${SLURM_JOB_ID:-}" ]]; then
    printf '%s\n' 'Error: You must run this script inside a SLURM job.' >&2
    exit 1
  fi

  flatfile="h2o.flatfile"
  # An empty flatfile would leave h2o without any node to talk to.
  if ! slurm_ips > "$flatfile" || [[ ! -s "$flatfile" ]]; then
    printf '%s\n' \
      'Error: Could not determine the IP addresses of the job nodes.' >&2
    exit 1
  fi

  cluster_name="h2o_cluster"
  # We quote the command so that it executes remotely: the jar, cluster name
  # and flatfile are expanded here, while \$(free ...) and \$2 are escaped so
  # that the heap size is computed by the `sh` that srun starts.
  srun \
    --label \
    --kill-on-bad-exit=1 \
    --overcommit \
    --nodelist="$HOSTNAME" \
    --job-name="$cluster_name" \
    sh -c "\
      /usr/bin/java \
      -Xmx\$(free -g | awk 'NR==2 {print \$2}')g \
      -jar $h2o_jar \
      -name ${cluster_name} \
      -flatfile ${flatfile} \
      " &> "${cluster_name}.out" &

  # Wait until h2o is running on all nodes.
  # FIXME: a fixed delay is only a guess; nothing verifies the cluster is up.
  sleep 5
}

# Python-style string join, adapted from https://stackoverflow.com/a/17841619
#
# Globals:
#   None
# Arguments:
#   $1  delimiter, e.g. ", "
#   $@  (except $1) strings to be joined
# Outputs:
#   The joined string on stdout, without a trailing newline; nothing when no
#   strings are given.
join_by() {
  local delim=${1-}
  local item

  # Nothing to join: avoid shifting past the end of the argument list.
  (( $# > 1 )) || return 0
  shift

  # printf instead of `echo -n`, which would swallow items such as "-n".
  printf '%s' "$1"
  shift
  for item in "$@"; do
    printf '%s%s' "$delim" "$item"
  done
}

# Script entry point: start the h2o cluster for the current SLURM job.
#
# Arguments:
#   None are used.
# Returns:
#   The status of launch_r_h2o_cluster.
main() {
  launch_r_h2o_cluster
}

# Python-like boilerplate: run main() only when this file is executed
# directly, not when it is sourced. An `if` is used rather than
# `[[ ... ]] && main` so that sourcing the file leaves exit status 0 instead
# of the failed comparison's status 1.
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
@@ -0,0 +1,138 @@
#!/bin/env bash

# Create an H2O cluster in SLURM.
#
# Usage: ./dietslurm [optional_path_to_h2o.jar]
#
# Requirements: Make sure you have loaded an r module with the "h2o"
# package installed. In the future we might support spawning h2o
# clusters for other languages in addition to R; for now we allow
# specifying a path to h2o.jar that may be installed by other
# languages.
#
# This script follows these style guides:
# - https://github.com/progrium/bashstyle
# - https://github.com/bahamas10/bash-style-guide

# Abort unless we are inside a SLURM job.
#
# Globals:
#   SLURM_JOB_ID (read) - treated as "not in a job" when empty or unset
# Arguments:
#   None
# Outputs:
#   An error message on stderr when not inside a job.
# Returns:
#   None; exits the shell with status 1 when not inside a job.
error_if_not_slurm_job() {
  if [[ -z "${SLURM_JOB_ID:-}" ]]; then
    # Diagnostics belong on stderr so they cannot be mistaken for output.
    printf '%s\n' 'Error: You must run this script inside a SLURM job.' >&2
    exit 1
  fi
}

# SLURM job node IP addresses.
#
# Globals:
#   SLURM_JOB_NODELIST (read) - compact node list, e.g. "cn[01-03]"
# Arguments:
#   None
# Returns: IP addresses, one per line, of nodes allocated to job
slurm_ips() {
  # Quote the node list: its brackets form a glob pattern, so an unquoted
  # expansion could be replaced by a matching file name in the current
  # directory. The expanded host names are deliberately left unquoted so
  # that each one is passed to getent as its own argument.
  # shellcheck disable=SC2046
  getent hosts $(nodeset -e "$SLURM_JOB_NODELIST") | awk '{print $1}'
}

# Path to h2o.jar installed by R "h2o" package.
#
# Globals: (R module must be loaded in PATH)
# Arguments:
#   None
# Returns: Path to h2o.jar installed by R "h2o" package on stdout
#   (an empty line when R cannot find the file)
r_h2o_jarpath() {
  # Print with writeLines() so the path arrives on stdout. Redirecting
  # message() with 2>&1 also captured every R warning as part of the path.
  Rscript -e 'writeLines(system.file(package = "h2o", "java", "h2o.jar"))'
}

# Launch h2o cluster in current SLURM allocation.
#
# Globals:
#   OPENBLAS_NUM_THREADS (exported, set to 1)
#   (SLURM `sbatch` generated environmental variables, read by the helpers)
# Arguments:
#   $1  (optional) path to an h2o.jar; when omitted or empty, the jar
#       installed by the R "h2o" package is used
# Outputs:
#   Error messages on stderr; srun/java output goes to h2o_cluster.out in
#   the current directory.
# Returns: None; exits the shell with status 1 when a sanity check fails
launch_r_h2o_cluster() {
  local h2o_jar ips_raw ips_mask cluster_name
  local -a ips

  # Sanity checks
  if [[ -n "${1:-}" ]]; then
    # Caller-supplied jar, e.g. one installed for another language.
    h2o_jar=$1
    if [[ ! -e "$h2o_jar" ]]; then
      printf 'Error: Could not find h2o.jar at %s\n' "$h2o_jar" >&2
      exit 1
    fi
  else
    # Use the R installed h2o package jar. A failing lookup is mapped to an
    # empty path so that `set -e` cannot abort before the message is printed.
    h2o_jar=$(r_h2o_jarpath) || h2o_jar=""
    if [[ ! -e "$h2o_jar" ]]; then
      printf '%s\n' \
        'Error: Could not find h2o.jar.' \
        'Have you installed the h2o package in R?' >&2
      exit 1
    fi
  fi
  error_if_not_slurm_job

  # Without any address the mask below would degenerate to a bare "/32".
  ips_raw=$(slurm_ips) || ips_raw=""
  if [[ -z "$ips_raw" ]]; then
    printf '%s\n' \
      'Error: Could not resolve the IP addresses of the job nodes.' >&2
    exit 1
  fi
  mapfile -t ips <<< "$ips_raw"
  # One /32 entry per node IP, comma separated, for h2o's -network option.
  ips_mask="$(join_by '/32,' "${ips[@]}")/32"
  cluster_name="h2o_cluster"
  # Run OpenBLAS with a single thread per process, see
  # https://github.com/xianyi/OpenBLAS/wiki/faq#multi-threaded
  export OPENBLAS_NUM_THREADS=1
  # Quote the command so that it executes remotely: the jar, name and network
  # mask are expanded here, while \$(free ...) and \$2 are escaped so that the
  # heap size is computed by the `sh` that srun starts.
  srun \
    --label \
    --kill-on-bad-exit=1 \
    --overcommit \
    --job-name="$cluster_name" \
    sh -c "\
      java \
      -Xmx\$(free -g | awk 'NR==2 {print \$2}')g \
      -jar $h2o_jar \
      -name $cluster_name \
      -network $ips_mask \
      " &> "${cluster_name}.out" &

  wait_till_h2o_running
}

# Wait until h2o is running on all nodes.
#
# Globals:
#   SLURM_JOB_NODELIST (read) - nodes on which to look for java
#   USER               (read) - owner of the processes to list
# Arguments:
#   None
# Outputs:
#   Whatever `clush` prints for the `pgrep` run on each node.
# Returns: None
wait_till_h2o_running() {
  # FIXME: Check the output instead of using sleep :/
  sleep 5
  # The node list is quoted because its bracket syntax (e.g. "cn[01-03]") is
  # also a glob pattern that a file in the current directory could match.
  clush -w "$SLURM_JOB_NODELIST" pgrep -fl -u "$USER" '^java'
}

# Python style string join, adapted from https://stackoverflow.com/a/17841619
#
# Globals: None
# Arguments:
#   $1 join string e.g. ", "
#   $@ (except $1) strings to be joined
# Returns: Joined string on stdout without a trailing newline; nothing when
#   there are no strings to join
join_by() {
  local delim=${1-}
  local item

  # With no strings to join there is nothing to print, and shifting twice
  # would run past the end of the argument list.
  (( $# > 1 )) || return 0
  shift

  # printf rather than `echo -n`: echo would treat a first string such as
  # "-n" as an option and print nothing for it.
  printf '%s' "$1"
  shift
  for item in "$@"; do
    printf '%s%s' "$delim" "$item"
  done
}

# Main program to launch h2o cluster.
#
# Globals: None
# Arguments:
#   $@ command-line arguments, forwarded unchanged to launch_r_h2o_cluster
#      (the usage line documents an optional path to h2o.jar)
# Returns: None
main() {
  # Make any errors fatal and propagate pipe error codes.
  set -eo pipefail

  # Previously the arguments were dropped here, so nothing given on the
  # command line could ever reach the launcher.
  launch_r_h2o_cluster "$@"
}

# Entry point: call `main()` with the script's arguments when this file is
# executed directly; do nothing when it is sourced (e.g. by a test suite).
if [[ "$0" == "${BASH_SOURCE[0]}" ]]; then
  main "$@"
fi
@@ -0,0 +1,17 @@
#!/bin/bash
#SBATCH --partition phi
#SBATCH --ntasks 30
#SBATCH --output h2o.out

# Example submission script: start an h2o cluster on the allocated nodes and
# then connect to it from R.

# Clear output file.
echo > h2o.out

# Ensure the module environment is clean and only load R.
module purge
module load r/3.3.3

# Launch h2o cluster. The launcher in this repository is named `dietslurm`
# (no .sh suffix). Stop here if it fails, so that R does not go on without
# the cluster this job was meant to start.
./dietslurm || exit 1

# Run the R script.
Rscript -e 'library(h2o); h2o.init(); system(paste0("cat /proc/", Sys.getpid(), "/status"))'
@@ -0,0 +1,23 @@
#!/usr/bin/env bats

# Load the functions under test. The path is quoted so that a checkout
# directory containing spaces or glob characters still resolves correctly.
source "${BATS_TEST_DIRNAME}/../dietslurm"

# Test fixture: pretend to be inside a SLURM job by setting a job id.
setup() {
  SLURM_JOB_ID="123"
}

# Test fixture cleanup: drop the fake job id again.
teardown() {
  unset -v SLURM_JOB_ID
}

@test "SLURM job is detected" {
  # Text that only appears in the error message of error_if_not_slurm_job.
  local needle="SLURM job"

  # With SLURM_JOB_ID set by setup(): succeeds without the error message.
  run error_if_not_slurm_job
  [[ "$status" -eq 0 ]]
  [[ "$output" != *"$needle"* ]]

  # Without a job id: exits with status 1 and prints the error message.
  unset SLURM_JOB_ID
  run error_if_not_slurm_job
  [[ "$status" -eq 1 ]]
  [[ "$output" == *"$needle"* ]]
}

0 comments on commit ea7ff69

Please sign in to comment.
You can’t perform that action at this time.