Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 193 lines (159 sloc) 6.25 KB
#!/usr/bin/python
#-----------------------------------------------------------------------
# Todo
#-----------------------------------------------------------------------
# (_) user_notify.user_notify should return whether email was sent
# (_) Have wait_notify log whether email was sent
#-----------------------------------------------------------------------
# About
#-----------------------------------------------------------------------
#
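# Find pending Slurm jobs (listed by squeue) that are stuck for a reason
# that will never let them run, and email each job's owner about them.
# Per-job details used in the messages come from scontrol.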
#
from __future__ import print_function
#-----------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------
# Standard modules
import pwd,sys,time,getopt,subprocess,sqlite3
# Custom module
import user_notify
from wait_notify_config import *
#-----------------------------------------------------------------------
# Constants
#-----------------------------------------------------------------------
# squeue command: -h no header -o custom output
# The format '%r %i %u' yields three whitespace-separated fields per job,
# which main() unpacks as reason, job id and user.
SQUEUE_CMD_STR = [ 'squeue', '-h', '-o', '%r %i %u' ]
# scontrol command prefix.  get_scontrol_info() appends the job id directly,
# hence the trailing space; _check_output() then splits the resulting string
# into an argument list.
SCONTROL_CMD_STR = 'scontrol -o show job '
#-----------------------------------------------------------------------
# Functions
#-----------------------------------------------------------------------
# Minimal stand-in for subprocess.check_output(), which Python < 2.7 lacks
def _check_output(command,shell=False):
    """Run an external command and return everything it wrote to stdout.

    command -- an argument list, or a single string that is split on
               whitespace to form the argument list.
    shell   -- accepted only for call compatibility with
               subprocess.check_output(); it is not used, the command is
               always executed directly and never through a shell.

    Returns the captured stdout as text (str).  Unlike
    subprocess.check_output() the exit status is not checked, so a failing
    command simply yields whatever it printed.
    """
    if isinstance(command, str):
        command = command.split()
    # universal_newlines=True makes Popen return text instead of bytes on
    # Python 3; the callers split the result with str separators, which
    # raises TypeError on bytes.
    p = subprocess.Popen(command, stdout=subprocess.PIPE, universal_newlines=True)
    return p.communicate()[0]
# Print error message to stderr and quit with a failure status
def print_error(msg):
    """Report a fatal error on stderr and terminate the program.

    msg -- text printed after the "ERROR: " prefix.

    Raises SystemExit with status 1, so that the calling shell, cron job or
    wrapper script can tell that the run failed.
    """
    print("ERROR: ",msg,file=sys.stderr)
    # A bare sys.exit() would report success (status 0) for a fatal error
    sys.exit(1)
def get_scontrol_info(jobid):
    """Return the fields scontrol reports for one job as a dict.

    jobid -- job id as a string; it is appended to SCONTROL_CMD_STR to form
             the command that is run.

    The command output is split on whitespace, and every token of the form
    KEY=VALUE becomes an entry keyed by the lower-cased KEY (split at the
    first "=").  Tokens without an "=" are ignored.
    """
    raw = _check_output(SCONTROL_CMD_STR + jobid)
    fields = {}
    for token in raw.strip().split():
        key, sep, value = token.partition("=")
        if not sep:
            # Not a KEY=VALUE token
            continue
        fields[key.lower()] = value
    return fields
def print_reason(jobid,user,reason,would_send_mail):
    """Print a one-line summary of a stuck job to stdout.

    jobid           -- job id string
    user            -- account name of the job's owner
    reason          -- pending reason of the job
    would_send_mail -- true if a notification would be emailed for this job

    The line reads "JOBID USER SendMail|NoMail REASON".  For jobs whose
    reason is PartitionTimeLimit, the partition and time limit reported by
    scontrol are appended.
    """
    mailstr = "SendMail" if would_send_mail else "NoMail"
    # Compare for equality: ('PartitionTimeLimit') without a trailing comma
    # is a plain string, so "in" would be a substring test that also matches
    # reasons such as "TimeLimit" or "Partition".
    if reason == 'PartitionTimeLimit':
        # Only this reason needs scontrol details, so scontrol is run only here
        scontrol_info = get_scontrol_info(jobid)
        print(jobid,user,mailstr,reason,"partition="+scontrol_info['partition'],"timelimit="+scontrol_info['timelimit'])
    else:
        print(jobid,user,mailstr,reason)
def get_reason_msg(reason,scontrol_info):
    """Build the message text explaining why a job is stuck.

    reason        -- pending reason; used to look up the template in
                     REASON_TEMPLATES
    scontrol_info -- values substituted into that template with the %
                     operator

    Returns the formatted template.
    """
    return REASON_TEMPLATES[reason] % scontrol_info
def Usage(msg=None):
    """Print the command-line help, then exit.

    msg -- optional error text, printed after the help and set off by blank
           lines.

    Raises SystemExit: status 1 when msg is given (the command line was
    wrong), status 0 when only the help was requested.
    """
    # NOTE(review): the column layout of this text was reconstructed; the
    # copy it was taken from had its whitespace collapsed.
    print("""
Usage: wait_notify [-Icxfh] [-n N] [-t ADMIN_EMAIL]

    Read Slurm's squeue output to determine which pending jobs are in a stuck
    state that will not run, and email the job's owner so they can
    cancel them and re-run if desired.

    Each email is logged in an SQLITE3 database, and emails will not be sent
    if an email has already gone out in the previous week.

OPTIONS

    -h              Show this help.
    -I              Run the first time to initialize Sqlite3 database
                    that records emails.
    -x              Email users with stuck jobs.

OPTIONS USEFUL FOR TESTING

    -c              Check only. List jobs that are in a cancelled state
    -t ADMIN_EMAIL  Useful for testing. Use only with -x. Emails will be
                    sent not to users, but to the ADMIN_EMAIL instead.
    -n N            Only email for first N users.
    -f              Force email, even if one was sent in past week
""")
    if msg:
        # print must be called: with print_function in effect a bare
        # "print" is an expression that does nothing.
        print()
        print(msg)
        print()
    # A message means the command line was rejected, so signal failure
    sys.exit(1 if msg else 0)
def get_eventid(useracct,jobid,reason):
    """Return the identifier "USERACCT-JOBID-REASON" for a notification event.

    useracct -- account name of the job's owner
    jobid    -- job id
    reason   -- pending reason of the job
    """
    fields = (useracct, jobid, reason)
    # One %s per field, joined by dashes
    template = "-".join(["%s"] * len(fields))
    return template % fields
#-----------------------------------------------------------------------
# Main
#-----------------------------------------------------------------------
def main():
    """Parse the command line and run the selected mode.

    Exactly one of -I (create the notification database), -c (list stuck
    jobs on the console) or -x (email the owners of stuck jobs) must be
    given; Usage() describes the remaining options.  Invalid command lines
    end in Usage(), which exits the program.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:],'t:hxvfcIn:')
    except getopt.GetoptError as err:
        # Unknown option or missing argument: show the help, not a traceback
        Usage(" ERROR: %s" % err)
    opts = dict(opts)
    if '-h' in opts:
        Usage()
    admin_email = opts.get('-t')
    try:
        count_limit = int(opts.get('-n', 0))
    except ValueError:
        count_limit = 1                     # Play it safe
    # -v is accepted for compatibility but currently has no effect
    force = '-f' in opts
    check, execute, initial = '-c' in opts, '-x' in opts, '-I' in opts
    modes_chosen = check + execute + initial
    if modes_chosen == 0:
        Usage()
    if modes_chosen > 1:
        Usage(" ERROR: You cannot use -c, -x, or -I options together")
    if admin_email and not execute:
        Usage(" ERROR: You can only use an admin user with the -x option")

    # Initialize Sqlite3 database - only needs to be done the first time program is run
    if initial:
        result = user_notify.create_table()
        print(result)
        sys.exit()

    # Get list of jobs that cannot run, grouped by user
    data = {}
    for line in _check_output(SQUEUE_CMD_STR).split("\n"):
        try:
            reason, jobid, user = line.split()
        except ValueError:
            # Blank line, or a line that does not have exactly three fields
            continue
        # These reasons always indicate an OK job
        if reason in ("None","Priority"):
            continue
        # Find bad jobs
        if reason in REASON_TEMPLATES:
            data.setdefault(user, []).append((jobid,reason))

    # List found jobs to console
    if check:
        for useracct in sorted(data):
            for jobid, reason in data[useracct]:
                eventid = get_eventid(useracct,jobid,reason)
                would_send_email = _notice_due([ eventid ], force)
                print_reason(jobid,useracct,reason,would_send_email)
    elif execute:
        log = user_notify.LOG(LOG_FILE)     # Initialize log file
        for count,useracct in enumerate(sorted(data)):
            # Limit number of users for testing purposes
            if count_limit and count>=count_limit:
                break
            eventids = [ get_eventid(useracct,jobid,reason) for jobid, reason in data[useracct] ]
            # Decide first, so scontrol is only queried for mails that go out
            if not _notice_due(eventids, force):
                continue
            username, email = user_notify.get_username_email(useracct)
            msg = MAIL_TEMPLATE % {'username':username}     # Start mail message
            for jobid, reason in data[useracct]:
                scontrol_info = get_scontrol_info(jobid)    # Info from scontrol util.
                msg += get_reason_msg(reason,scontrol_info)
            # Send email; with -t it goes to the admin address instead of the user
            user_notify.send_mail( username, admin_email or email, MAIL_SUBJECT, msg )
            # Log email if sent to user
            # NOTE(review): indentation was lost in the copy this was taken
            # from; both statements below are assumed to belong to the
            # "not admin_email" branch -- confirm against the original.
            if not admin_email:
                user_notify.update_notify_log(eventids)
                log.write(" ".join(eventids))               # Write to user log

# True if a notice for these events should go out; quits if the DB is unusable
def _notice_due(eventids, force):
    """Return a true value if a notice for eventids is due, or if force is set.

    eventids -- list of event id strings passed to user_notify.need_new_notice
    force    -- when true, the notification database is not consulted

    An sqlite3.OperationalError from the lookup ends the program through
    print_error().
    """
    if force:
        return True
    try:
        return user_notify.need_new_notice(eventids, NOTICE_AGE_DAYS)
    except sqlite3.OperationalError:
        print_error("SQLite DB file does not exist. Rerun with the -I option to create it.")
# Run only when executed as a script, so the file can be imported
# (e.g. for testing) without contacting Slurm or sending mail.
if __name__ == "__main__":
    main()