Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
wait_notify/wait_notify.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
193 lines (159 sloc)
6.25 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
#----------------------------------------------------------------------- | |
# Todo | |
#----------------------------------------------------------------------- | |
# (_) user_notify.user_notify should return whether email was sent | |
# (_) Have wait_notify log whether email was sent | |
#----------------------------------------------------------------------- | |
# About | |
#----------------------------------------------------------------------- | |
# | |
# Find pending Slurm jobs (as listed by squeue) that are stuck in a state
# that will not run, and email each job's owner about them
# | |
from __future__ import print_function | |
#----------------------------------------------------------------------- | |
# Imports | |
#----------------------------------------------------------------------- | |
# Standard modules | |
import pwd,sys,time,getopt,subprocess,sqlite3 | |
# Custom module | |
import user_notify | |
from wait_notify_config import * | |
#----------------------------------------------------------------------- | |
# Constants | |
#----------------------------------------------------------------------- | |
# squeue invocation as an argv list: -h means no header, -o selects a custom
# output format.  main() unpacks each "%r %i %u" line as reason, job id, user.
SQUEUE_CMD_STR = [
    'squeue',
    '-h',
    '-o', '%r %i %u',
]
# scontrol command prefix; the job id is appended directly to this string,
# which is why it ends with a space.
SCONTROL_CMD_STR = "scontrol -o show job "
#----------------------------------------------------------------------- | |
# Functions | |
#----------------------------------------------------------------------- | |
# Minimal stand-in for subprocess.check_output, which Python < 2.7 lacks
def _check_output(command, shell=False):
    """Run *command* and return everything it wrote to stdout, as text.

    command -- an argv list, or a single string that is split on
               whitespace (no shell-style quoting is honoured).
    shell   -- accepted so the signature resembles subprocess.check_output;
               it is ignored and the command is never run through a shell.

    The child's exit status is not checked and its stderr is not captured.
    """
    if isinstance(command, str):
        command = command.split()
    # universal_newlines=True makes communicate() return text instead of
    # bytes on Python 3, so callers can keep using str methods such as
    # split("\n") and split("=") on the result.
    proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                            universal_newlines=True)
    return proc.communicate()[0]
# Print error message to stderr and quit
def print_error(msg):
    """Write "ERROR: " followed by *msg* to stderr and terminate the program.

    Exits with status 1 so that shells, cron and other callers can tell the
    run failed; a bare sys.exit() would report success.
    """
    print("ERROR: ", msg, file=sys.stderr)
    sys.exit(1)
def get_scontrol_info(jobid):
    """Return the scontrol fields for *jobid* as a dict with lower-cased keys.

    The command output is split on whitespace and every token of the form
    KEY=VALUE becomes one entry, split at the first '='.  Tokens that
    contain no '=' are ignored.
    """
    raw = _check_output(SCONTROL_CMD_STR + jobid).strip()
    fields = {}
    for token in raw.split():
        key, sep, value = token.partition("=")
        if not sep:
            continue  # not a KEY=VALUE token
        fields[key.lower()] = value
    return fields
def print_reason(jobid, user, reason, would_send_mail):
    """Print a one-line report for a job that cannot run.

    jobid, user, reason -- values taken from the squeue listing.
    would_send_mail     -- truthy if a notice would be mailed; shown as
                           "SendMail", otherwise "NoMail".

    For reason 'PartitionTimeLimit' the job's partition and time limit
    (looked up with get_scontrol_info) are appended to the line.
    """
    mailstr = "SendMail" if would_send_mail else "NoMail"
    # Must be an equality test: `reason in ('PartitionTimeLimit')` has no
    # tuple comma, so it was a substring test that also matched reasons
    # such as "Partition" or "TimeLimit".
    if reason == 'PartitionTimeLimit':
        # Only this branch needs scontrol, so only here is it run.
        scontrol_info = get_scontrol_info(jobid)
        # .get() keeps the listing going if scontrol returned nothing
        # for this job (the fields are then simply reported as unknown).
        print(jobid, user, mailstr, reason,
              "partition=" + scontrol_info.get('partition', 'unknown'),
              "timelimit=" + scontrol_info.get('timelimit', 'unknown'))
    else:
        print(jobid, user, mailstr, reason)
def get_reason_msg(reason, scontrol_info):
    """Return the REASON_TEMPLATES entry for *reason*, %-formatted with *scontrol_info*."""
    return REASON_TEMPLATES[reason] % scontrol_info
def Usage(msg=None):
    """Print the command-line help, then *msg* if one is given, and exit.

    msg -- optional extra line (main() passes one for usage errors).

    The exit status is 1 when *msg* is supplied and 0 for a plain help
    request, so a usage error is not reported as success.
    """
    print("""
Usage: wait_notify [-hIcxf] [-n N] [-t ADMIN_EMAIL]

   Read Slurm's squeue output to determine which pending jobs are in a stuck
   state that will not run, and email the job's owner so they can
   cancel them and re-run if desired.

   Each email is logged in an SQLITE3 database, and emails will not be sent
   if an email has already gone out in the previous week.

OPTIONS
   -h   Show this help.
   -I   Run the first time to initialize Sqlite3 database
        that records emails.
   -x   Email users with stuck jobs.

OPTIONS USEFUL FOR TESTING
   -c              Check only. List jobs that are in a cancelled state
   -t ADMIN_EMAIL  Useful for testing. Use only with -x. Emails will be
                   sent not to users, but to the ADMIN_EMAIL instead.
   -n N            Only email for first N users.
   -f              Force email, even if one was sent in past week
""")
    if msg:
        print(msg)
    sys.exit(1 if msg else 0)
def get_eventid(useracct, jobid, reason):
    """Build the event id string "<useracct>-<jobid>-<reason>"."""
    pieces = (useracct, jobid, reason)
    return "-".join(map(str, pieces))
#----------------------------------------------------------------------- | |
# Main | |
#----------------------------------------------------------------------- | |
def main():
    """Parse the command line and run exactly one of the -I, -c or -x modes.

    -I  create the notification table and exit.
    -c  list every stuck job and whether a notice would be mailed.
    -x  mail one message per user covering all of that user's stuck jobs.

    A job counts as stuck when its squeue reason is a key of
    REASON_TEMPLATES.  Invalid option combinations end in Usage().
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 't:hxvfcIn:')
    except getopt.GetoptError as err:
        # Unknown option or missing argument: show help, not a traceback.
        Usage(" ERROR: %s" % err)
    opts = dict(opts)
    if '-h' in opts:
        Usage()

    admin_email = opts.get('-t')
    try:
        count_limit = int(opts.get('-n', 0))
    except ValueError:
        count_limit = 1  # Play it safe
    force = '-f' in opts
    # NOTE: -v is accepted by getopt above but currently has no effect.

    check_only, execute, initialize = '-c' in opts, '-x' in opts, '-I' in opts
    mode_count = check_only + execute + initialize
    if mode_count == 0:
        Usage()
    if mode_count > 1:
        Usage(" ERROR: You cannot use -c, -x, or -I options together")
    if admin_email and not execute:
        Usage(" ERROR: You can only use an admin user with the -x option")

    # Initialize Sqlite3 database - only needs to be done the first time
    # the program is run
    if initialize:
        print(user_notify.create_table())
        sys.exit()

    # Get list of jobs that cannot run, grouped by user
    data = {}
    for line in _check_output(SQUEUE_CMD_STR).split("\n"):
        try:
            reason, jobid, user = line.split()
        except ValueError:
            continue  # blank line, or not exactly three fields
        # These reasons always indicate an OK job
        if reason in ("None", "Priority"):
            continue
        # Find bad jobs
        if reason in REASON_TEMPLATES:
            data.setdefault(user, []).append((jobid, reason))

    # List found jobs to console
    if check_only:
        for useracct in sorted(data):
            for jobid, reason in data[useracct]:
                eventid = get_eventid(useracct, jobid, reason)
                would_send_email = _notice_needed([eventid], force)
                print_reason(jobid, useracct, reason, would_send_email)
    elif execute:
        log = user_notify.LOG(LOG_FILE)  # Initialize log file
        for count, useracct in enumerate(sorted(data)):
            # Limit number of users for testing purposes
            if count_limit and count >= count_limit:
                break
            username, email = user_notify.get_username_email(useracct)
            msg = MAIL_TEMPLATE % {'username': username}  # Start mail message
            eventids = []
            for jobid, reason in data[useracct]:
                scontrol_info = get_scontrol_info(jobid)  # Info from scontrol util.
                msg += get_reason_msg(reason, scontrol_info)
                eventids.append(get_eventid(useracct, jobid, reason))
            # Send email and log
            if _notice_needed(eventids, force):
                user_notify.send_mail(username, admin_email or email, MAIL_SUBJECT, msg)
                # Record the notice only if it really went to the user, so a
                # test run with -t does not suppress the real email later.
                # NOTE(review): the original indentation of log.write was
                # lost; it is assumed to belong under this condition too.
                if not admin_email:
                    user_notify.update_notify_log(eventids)
                    log.write(" ".join(eventids))  # Write to user log


def _notice_needed(eventids, force):
    """Return a true value when a notice for *eventids* should be sent.

    *force* skips the database lookup.  If the lookup raises
    sqlite3.OperationalError the program stops through print_error()
    with a hint to create the database with -I.
    """
    if force:
        return True
    try:
        return user_notify.need_new_notice(eventids, NOTICE_AGE_DAYS)
    except sqlite3.OperationalError:
        print_error("SQLite DB file does not exist. Rerun with the -I option to create it.")
# Run only when executed as a script, so importing this module (for example
# to reuse or test its helpers) does not parse sys.argv or send email.
if __name__ == "__main__":
    main()