Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 648 lines (569 sloc) 23 KB
#!/usr/bin/env python3
#from __future__ import print_function
#TODO
# (_) Verify data is correct - compare to ssgunix
# (_) Add / to -F argument
# (_) Check -F for :
# (_) Add cmd-line options for Other and Stacking
# (_) Add stacking to gnuplot output
# (_) Ability to make synthetic attributes (eg: general node group)
# (_) When using file, default time range to entire file
# (_) Verify that time matches sacct time (local or gmt)
#-----------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------
# Standard modules
import sys, time, getopt, calendar, subprocess
#-----------------------------------------------------------------------
# Configuration
#-----------------------------------------------------------------------
# Site-specific location of the sacct binary.
# NOTE(review): the command builders visible in this file invoke plain
# "sacct" and do not reference this constant.
SACCT = '/gpfs/gpfs1/slurm/bin/sacct'

# Defaults used when the matching command-line option is not supplied.
DEF_INTERVAL = 1800          # width of one averaging interval, in seconds
DEF_START = '7d'             # relative start time
DEF_PLOTSIZE = (1280, 360)   # plot width and height, in pixels
DEF_NTOP = 5                 # number of individual curves to keep

# Columns requested from sacct and required in any input data file.
REQUIRED_HEADERS_STR = ','.join([
    'jobid', 'partition', 'qos', 'account', 'user',
    'submit', 'start', 'end', 'ncpus', 'state', 'nodelist',
])

# (option flag, default value) pairs loaded into the option table before
# the real command-line options are applied.
OPTION_DEFAULTS = (
    ('-i', DEF_INTERVAL),   # interval
    ('-t', 'NO TITLE'),     # plot title
    ('-g', None),           # graph output file
    ('-n', DEF_NTOP),       # number of top curves
    ('-a', 'partition'),    # attribute
    ('-I', ''),             # include
    ('-F', ''),             # filter
    ('-C', ''),             # csv
    ('-G', ''),             # gnuplot
)
#-----------------------------------------------------------------------
# Functions
#-----------------------------------------------------------------------
def Usage(msg=None):
    """Print the help text (and optionally an error message), then exit.

    msg - Optional error message printed after the help text. When it is
          given the process exits with status 1, so that calling shells
          and schedulers can detect the failure; plain help exits with
          the default (success) status.
    """
    print("""
Usage: hpc-stats.py [-S START] [-E END] -Q
Write out sacct command line that is used by the program
hpc-stats.py [-S START] [-E END] -W SACCT.DAT
Write sacct data into file SACCT.DAT for subsequent use
hpc-stats.py [-SE] [-g GRAPH.PNG ] [SACCT.DAT [ SACCT.DAT ...] ]
Read sacct data (from file(s) if given, otherwise from sacct live),
and write graph on X terminal, or optionally into PNG file -g.
Options:
DATA OPTIONS
-Q Write shell command line that calls 'sacct' with
correct options to generate the output needed for this
program to parse
-S START Start time (see TIME FORMAT below). Default is '7d'
-E END End time (see TIME FORMAT below). Default is the time when
command is run
-i SPAN Time interval to divide data. Default is 30 minutes.
(this is given in "Relative Time", see TIME FORMAT below)
-W FILE Read Slurm's sacct and write to FILE, with metadata
stored in comments.
PLOTTING AND CSV OPTIONS
-g GRAPH Name of file to write graph. If unspecified, graph
displays on X terminal
-t TITLE Plot title
-C FNAME Write plot data into file FNAME.csv
-G FNAME Write Gnuplot plotting file FNAME.gnuplot, and data into
FNAME.csv
-a CATEGORY Draw individual curves for each item in this category.
CATEGORY can be partition,qos,account, or user.
-F FILTER Only consider data which passes the filter. For example:
user:jar01204,gms02014,alf02013
Here, data only used for user equal to jar01204,
gms02014 or alf02013. User above can be replaced with any
category. Also, -F can be used multiple times, to apply
multiple filters.
-n NTOP Only print NTOP of the CATEGORY curves, and one additional
curve called "OTHER" with the sum of the values for
the non-top categories. If there are no non-top categories,
then "OTHER" is omitted.
TIME FORMAT
Relative time: Value is a number appended with either 's','m','h','d'
(seconds,minutes,hours,days).
For -i, this time is taken to be the time interval.
For -S and -E, this time is subtracted from the current
time, and rounded to value of time interval (-i).
Absolute time: A single string with between 3 and 6 integers, delimited
by non-numeric characters, like '2017-4-17-12:51'.
If you specify just a date, the time is 00:00:00.
Any unspecified integers (i.e. hour, minute, second) default
to zero.
EXAMPLES
# Default 'sacct' command-line - for previous 24 hours period
> hpc-stats.py -Q -S 30m
sacct -aXP -S2018-04-22-00:00:00 -E2018-04-23-16:18:50 -o jobid,partition,qos,account,user,submit,start,end,ncpus,state
# Dump 'sacct data'
> hpc-stats.py -S 7d -E 30m -W sacct.dat
""")
    if msg:
        print("\n"+msg+"\n")
        # A usage error must not look like a successful run
        sys.exit(1)
    sys.exit()
def get_datapoints(series,first_marker,last_marker):
    """Yield consecutive (previous, current) point pairs of a step series.

    series       - list of (t, v) points
    first_marker - position used as the 'previous' position of the first pair
    last_marker  - position of the synthetic final point

    The first pair starts at first_marker, carrying the value of the last
    point at or before it (0 when there is none). After the real points, one
    closing pair ending in [last_marker, last_value] is always yielded, so
    the generator produces at least one pair even for an empty series.
    """
    # Index of the first point strictly after first_marker. It defaults to
    # len(series) so that an empty series, or one lying entirely at or
    # before first_marker, contributes no interior points.
    ix = len(series)
    for pos, point in enumerate(series):
        if point[0] > first_marker:
            ix = pos
            break
    prev_t = first_marker
    prev_v = 0 if ix == 0 else series[ix-1][1]
    # NOTE(review): points beyond last_marker are not clipped here - confirm
    # that callers never pass a series extending past last_marker.
    for point in series[ix:]:
        yield (prev_t, prev_v), point
        prev_t, prev_v = point
    yield (prev_t, prev_v), [last_marker, prev_v]
def next_point(series,marker,last_marker,span):
    """Generator that interleaves data points with regularly spaced markers.

    Each item is (prev_t, prev_v, t, is_marker):
      prev_t, prev_v - position and value of the previous point
      t              - position of the current point
      is_marker      - True when the current position is a marker,
                       False when it is a data position
    Markers advance in steps of 'span' from the initial 'marker'. The
    generator ends once the data pairs from get_datapoints() run out.
    """
    pairs = get_datapoints(series, marker, last_marker)
    (prev_t, prev_v), (t, v) = next(pairs)
    while True:
        data_comes_first = t < marker + span
        if not data_comes_first:
            # The next marker lies at or before the pending data point
            marker += span
            yield prev_t, prev_v, marker, True
            prev_t = marker
            continue
        # The pending data point lies before the next marker
        yield prev_t, prev_v, t, False
        try:
            (prev_t, prev_v), (t, v) = next(pairs)
        except StopIteration:
            return
def stepify(series,first_marker,last_marker,span):
    """Resample an irregular step function onto regularly spaced intervals.

    series       - list of (t, v) points describing a step function
    first_marker - start position of the first interval
    last_marker  - end position of the requested range
    span         - interval width

    Yields (interval_start, value) pairs, where value is the average of the
    step function over that interval (area under the steps divided by span).
    When the series is empty or does not overlap the marker range, one
    (interval_start, 0) pair is yielded per interval and nothing else.
    """
    # No overlap between markers and data: emit zeros only. The explicit
    # return keeps the generator from falling through into the averaging
    # loop below and emitting extra points for the same range.
    if not series or last_marker<=series[0][0] or first_marker>=series[-1][0]:
        for i in range(int((last_marker-first_marker)/span)):
            yield (first_marker+i*span, 0)
        return
    total = 0
    for prev_t,prev_v,t,is_marker in next_point(series,first_marker,last_marker,span):
        if is_marker:
            # Close the interval ending at this marker
            yield t-span, (total + (t-prev_t)*prev_v)/float(span)
            total = 0
        else:
            # Accumulate the area contributed since the previous point
            total += (t-prev_t)*prev_v
def print_error(msg):
    """Write an error message to stderr and terminate the program.

    Exits with status 1 so that the failure is visible to the caller.
    """
    print(" ERROR: ", msg, file=sys.stderr)
    sys.exit(1)
def print_error_plot(title,msg,sizes,plotname):
    """Draw a figure that contains only a title and a centred message.

    title    - Text shown as the figure title
    msg      - Message placed in the middle of the figure
    sizes    - (width, height) of the figure in pixels
    plotname - Output file; when None or empty the figure is shown on
               screen instead of being saved
    """
    import matplotlib
    if plotname is not None: matplotlib.use('Agg') # Do not use the default X backend when writing file
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print_error("Interactive graphics not available. You need tcl loaded first")
    fig = plt.figure()
    fig.suptitle(title,fontsize=32, fontweight='bold')
    fig.text(0.5,0.5,msg,horizontalalignment='center',fontsize='16')
    set_fig_size_pixels(fig, sizes[0], sizes[1])
    if plotname:
        plt.savefig(plotname)
    # Without an output file there is nothing to save into: show on terminal
    else:
        plt.show()
# Used for Python < 2.7
def _check_output(command,shell=False):
if type(command)==type(''):
command = command.split()
p = subprocess.Popen(command, stdout=subprocess.PIPE)
return p.communicate()[0]
def set_defaults(args, defaults):
    """Fill in missing keys of 'args' from (key, value) pairs in 'defaults'.

    Keys already present keep their value. 'args' is modified in place and
    also returned.
    """
    for key, fallback in defaults:
        if key in args:
            continue
        args[key] = fallback
    return args
def get_pasttime(delstr,now=None,interval=3600*24):
    """Return the epoch time lying 'delstr' before 'now', rounded down.

    delstr   - Relative time accepted by parse_reltime()
    now      - Reference time in epoch seconds; the current time is used
               when omitted
    interval - The result is rounded down to a multiple of this many seconds

    Returns None when delstr is not a valid relative time.
    """
    # Resolve 'now' at call time; a time.time() default in the signature
    # would be evaluated only once, when the function is defined.
    if now is None:
        now = time.time()
    delta = parse_reltime(delstr)
    # Only a failed parse means "not a relative time"; an offset of 0 is valid
    if delta is None:
        return None
    past = now - delta
    past -= (past % interval)
    return past
# Convert time string to epoch seconds. The time string is a series of
# integers representing year, month, day, hour, min, sec in that order, each
# separated by one or more non-digit characters. At least three integers are
# required; any that are left out are taken as 0.
# Examples: "2018-01-20" "1997-7-17T21:17:05" "1999/12/31 23:59:59"
def timekey2sec(timekey):
    # Turn every non-digit into a blank so that split() isolates the integers
    blanked = ''.join(ch if '0' <= ch <= '9' else ' ' for ch in timekey.strip())
    fields = [int(token) for token in blanked.split()]
    if len(fields) < 3:
        return None
    # Pad with 0s to make a 9 element time array
    fields += [0] * (9 - len(fields))
    # Like time.mktime(), but treats the fields as GMT
    return calendar.timegm(fields)
# Format epoch seconds as a date-time string
#   sacctfmt - True puts 'T' between date and time, otherwise '-'
#   gm       - True formats in GMT, otherwise in local time
def sec2timekey(secs,sacctfmt=False,gm=False):
    to_struct = time.gmtime if gm else time.localtime
    joiner = "T" if sacctfmt else "-"
    return time.strftime("%Y-%m-%d" + joiner + "%H:%M:%S", to_struct(secs))
def parse_reltime(timestr):
    """Convert a relative time such as '1s', '5d' or '3m' to seconds.

    timestr - An integer (returned unchanged), or a string holding an
              integer optionally followed by one unit letter:
              s, m, h or d (upper or lower case). A value without a unit
              is taken as seconds.

    Returns the number of seconds, or None when timestr is empty or its
    numeric part is not an integer.
    """
    if isinstance(timestr, int):
        return timestr
    if not timestr:
        return None
    multipliers = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}
    unitstr = timestr[-1].lower()
    if unitstr in multipliers:
        number = timestr[:-1]
    else:
        # No unit letter: keep every character of the number
        number, unitstr = timestr, 's'
    try:
        return int(number) * multipliers[unitstr]
    except ValueError:
        return None
def parse_filter_string(filter, filterstr):
    """Add one 'key:value,value,...' specification to a filter dictionary.

    filter    - Existing filter dictionary; a new one is started when this
                is None or empty
    filterstr - Specification such as 'user:alice,bob'. It is lower-cased,
                and blanks around the key and the values are dropped. Only
                the first ':' separates key from values. A specification
                without ':' stores None as the value for that key.

    Returns the filter dictionary. A blank filterstr adds nothing.
    """
    if not filter: filter = {}
    spec = filterstr.strip().lower()
    if spec:
        key, colon, values = spec.partition(':')
        if colon:
            filter[key.strip()] = [value.strip() for value in values.split(',')]
        else:
            filter[key.strip()] = None
    return filter
def entry_passes_filter(filter,entry):
    """Tell whether an entry satisfies every key of the filter.

    filter - Dictionary mapping a column name to a list of accepted
             (lower case) values, or to None when only the presence of the
             column is required. An empty or None filter accepts everything.
    entry  - Dictionary of column name -> value

    Returns True when, for each filter key, the entry has that column and
    its value (compared in lower case) is one of the accepted values.
    """
    if not filter: return True
    for key, accepted in filter.items():
        if key not in entry:
            return False
        # A key given without values only requires the column to exist
        if accepted is None:
            continue
        # str() so that columns already converted to numbers can be compared
        if str(entry[key]).lower() not in accepted:
            return False
    return True
def get_opts(argsin):
    """Parse the command line into an option dictionary.

    argsin - Argument list, without the program name

    Returns (opts, args). opts holds the raw option flags ('-S', '-g', ...)
    plus these derived keys:
      'filter'   - filter dictionary built from all -F options (or None)
      'interval' - interval width in seconds
      'start'    - start time in epoch seconds
      'end'      - end time in epoch seconds
      'include'  - list of names from -I, or None
      'ntop'     - integer value of -n
    opts['-S'] and opts['-E'] are rewritten as formatted time strings.
    args is the list of remaining (file name) arguments.

    Invalid options end the program through Usage() or print_error().
    """
    try:
        live_opts, args = getopt.getopt(argsin, 'QS:R:W:i:hg:t:n:a:I:E:F:C:G:') # a-attributes, I include, F filter
    except getopt.GetoptError as err:
        Usage(" ERROR: %s" % err)
    def_start_str = sec2timekey(get_pasttime(DEF_START) or timekey2sec(DEF_START), gm=True)
    opts = {}
    opts.update( OPTION_DEFAULTS )
    # Use live_opts to update filter (convert all -F options)
    job_filter = None
    for flag, value in live_opts:
        if flag == '-F':
            job_filter = parse_filter_string(job_filter, value.lower())
    opts['filter'] = job_filter
    opts.update( ( ('-S',def_start_str),('-E',sec2timekey(time.time(),sacctfmt=True)) ) )
    opts.update( live_opts )
    if '-h' in opts: Usage()
    # Convert options
    interval = parse_reltime(opts['-i'])
    if not interval or interval < 0:
        Usage(" ERROR: Time interval (-i) must be a positive relative time")
    opts['interval'] = interval
    opts['start'] = get_pasttime(opts['-S'],interval=interval) or timekey2sec(opts['-S']) or None
    opts['end'] = get_pasttime(opts['-E'],interval=interval) or timekey2sec(opts['-E']) or None
    # Neither a relative nor an absolute time: stop here rather than letting
    # the None value be formatted and compared further down
    if opts['start'] is None:
        Usage(" ERROR: Could not understand start time (-S): %s" % opts['-S'])
    if opts['end'] is None:
        Usage(" ERROR: Could not understand end time (-E): %s" % opts['-E'])
    opts['-S'] = sec2timekey(opts['start'])
    opts['-E'] = sec2timekey(opts['end'])
    opts['-a'] = opts['-a'].lower()
    opts['include'] = opts['-I'].split(',') if opts['-I'] else None
    try:
        opts['ntop'] = int(opts['-n'])
    except ValueError:
        print_error("Value for NTOP (-n) must be an integer")
    if opts['start']>opts['end']: Usage(" ERROR: Start time must precede end time ")
    return opts, args
def read_next_noncomment(fnames):
    """Yield the non-comment, non-blank lines of each input file in turn.

    fnames - File names to read; '-' stands for standard input

    Yields ((name, linecnt, datacnt), line), where name is the file name
    ('stdin' for '-'), linecnt is the 0-based line index within the file,
    datacnt is the 1-based count of lines yielded from that file so far,
    and line is the unmodified text. Lines whose first non-blank character
    is '#' are skipped, as are blank lines.
    """
    for fname in fnames:
        from_stdin = fname == '-'
        fname_pretty = 'stdin' if from_stdin else fname
        fin = sys.stdin if from_stdin else open(fname)
        try:
            datacnt = 0
            for linecnt, line in enumerate(fin):
                sline = line.lstrip()
                # reject comments and blank lines
                if not sline or sline[0]=='#': continue
                datacnt += 1
                yield (fname_pretty,linecnt,datacnt), line
        finally:
            # Close only what was opened here; stdin belongs to the caller
            if not from_stdin: fin.close()
def read_job_run_entries_from_file(startsec, endsec, fnames, filter=None, limit=None, delimiter="|"):
    """Yield job entries, read from sacct dump files, that overlap a time range.

    startsec, endsec - Time range in epoch seconds
    fnames           - Input file names ('-' for stdin)
    filter           - Filter dictionary for entry_passes_filter()
    limit            - Stop reading once the data-line count reaches this value
    delimiter        - Column separator of header and data lines

    The first non-comment line of each file is the column header; all columns
    of REQUIRED_HEADERS_STR must be present. Each yielded entry is a
    dictionary keyed by lower-case column name, with 'start' and 'end'
    converted to epoch seconds and the original strings kept in 'startstr'
    and 'endstr'. An 'Unknown' end is replaced by endsec. Entries that never
    started, or whose times cannot be parsed, are skipped.
    """
    # Convert field labels to lower case
    requested_headers = [f.lower() for f in REQUIRED_HEADERS_STR.split(',')]
    found_headers = {}
    # Read non-comments from file(s)
    for (fname,linecnt,datacnt),line in read_next_noncomment(fnames):
        sline = line.rstrip()
        # Get column headers from first non-comment
        if datacnt==1:
            columns = sline.lower().split(delimiter)
            # Map column index -> header name, for the requested columns only
            found_headers = dict( (i,f) for i,f in enumerate(columns) if f in requested_headers )
            missing_fields = [ f for f in requested_headers if f not in found_headers.values() ]
            if missing_fields:
                print_error("Input file is missing following columns:" + " ".join(missing_fields))
            continue
        if limit and datacnt==limit: break
        if not sline: continue
        # Select values by column index, so that extra columns in the file do
        # not shift or break the mapping
        entry = dict( (found_headers[i], value) for i,value in enumerate(sline.split(delimiter)) if i in found_headers )
        # Skip truncated lines that lack the time columns
        if 'start' not in entry or 'end' not in entry: continue
        if not entry_passes_filter(filter,entry): continue
        entry['startstr'], entry['endstr'] = entry['start'],entry['end']
        # A job that has not started yet occupies no cores
        if entry['start']=='Unknown': continue
        entry['start'] = timekey2sec(entry['start'])
        entry['end'] = endsec if entry['end']=='Unknown' else timekey2sec(entry['end'])
        if entry['start'] is None or entry['end'] is None: continue
        # Does job entry overlap the time range
        if entry['start']<endsec and entry['end']>=startsec:
            # NOTE(review): this stores the length of the requested range, not
            # of the job - confirm that this is what 'elapsed' should hold.
            entry['elapsed'] = endsec - startsec
            yield entry
def get_sacct_cmd(start,end,in_secs=True):
    """Build the sacct command line covering the given time range.

    start, end - Range limits; epoch seconds when in_secs is True, otherwise
                 values that are inserted into the command line as they are
    """
    if in_secs:
        start, end = sec2timekey(start), sec2timekey(end)
    return "sacct -aXP -S %s -E %s -o %s" % (start,end,REQUIRED_HEADERS_STR)
def get_sacct_output(startstr,endstr):
    """Run sacct for the given range and return its output as text.

    startstr, endstr - Range limits, passed to the command line unchanged

    Ends the program through print_error() when the sacct executable cannot
    be found.
    """
    argv = get_sacct_cmd(startstr,endstr,False).split()
    try:
        raw = subprocess.check_output(argv)
    except FileNotFoundError:
        print_error("Sacct does not appear to be installed on this machine")
    else:
        return raw.decode('UTF-8')
def read_job_run_entries_from_sacct(startsec,endsec,filter=None):
    """Yield job entries obtained by running sacct for a time range.

    startsec, endsec - Time range in epoch seconds
    filter           - Filter dictionary for entry_passes_filter()

    The first output line is taken as the column header. Each yielded entry
    is a dictionary keyed by lower-case column name, with 'start' and 'end'
    converted to epoch seconds. An 'Unknown' end is replaced by endsec.
    Entries that never started, or whose times cannot be parsed, are skipped.
    Ends the program through print_error() when sacct cannot be found.
    """
    cmd = get_sacct_cmd(startsec,endsec).split()
    try:
        output = subprocess.check_output(cmd).decode('UTF-8')
    except FileNotFoundError:
        print_error("Sacct does not appear to be installed on this machine")
    headers = None
    for line in output.split("\n"):
        if headers is None:
            headers = [ name.lower() for name in line.rstrip().split('|') ]
            continue
        sline = line.rstrip()
        if not sline: continue
        # zip() ignores values beyond the last header column
        entry = dict( zip(headers, sline.split('|')) )
        # Filter while every column is still the string that sacct printed
        if not entry_passes_filter(filter,entry): continue
        if 'start' not in entry or 'end' not in entry: continue
        # A job that has not started yet occupies no cores
        if entry['start']=='Unknown': continue
        entry['start'] = timekey2sec(entry['start'])
        entry['end'] = endsec if entry['end']=='Unknown' else timekey2sec(entry['end'])
        if entry['start'] is None or entry['end'] is None: continue
        yield entry
def stack_values(oldvalues):
    """Turn per-curve values into running totals and regroup them by curve.

    oldvalues - Sequence of rows, one per time point, each holding one value
                per curve

    Returns a zip object with one tuple per curve. Within a row, each value
    is replaced by the sum of itself and all values before it.
    """
    stacked_rows = []
    for row in oldvalues:
        # Work on a list copy: rows may arrive as tuples
        running = list(row)
        for idx in range(1, len(running)):
            running[idx] += running[idx-1]
        stacked_rows.append(tuple(running))
    return zip(*stacked_rows)
def fill_plot(ax, xs, ys, ylevel=0, color='k', label='', filled=True):
    """Draw one curve on 'ax', either as a filled area or as a line.

    ax     - Object providing fill() and plot()
    xs, ys - Curve coordinates
    ylevel - Base level the filled area is closed against; with None the
             points are passed to fill() as given
    color  - Curve colour
    label  - Curve label
    filled - True draws with fill(), False with plot()
    """
    if not filled:
        ax.plot( xs, ys, color=color, label=label )
        return
    if ylevel==None:
        ax.fill( xs, ys, color=color, label=label )
        return
    # Add a point at the base level before the first and after the last point
    closed_xs = [xs[0]] + list(xs) + [xs[-1]]
    closed_ys = [ylevel] + list(ys) + [ylevel]
    ax.fill( closed_xs, closed_ys, color=color, label=label )
def set_fig_size_pixels(fig, x, y):
    """Resize 'fig' to x by y pixels, using the figure's own dpi."""
    dots_per_inch = float(fig.get_dpi())
    width_in = x/dots_per_inch
    height_in = y/dots_per_inch
    fig.set_size_inches(width_in, height_in)
def plot_matplotlib(data, curve_names, sizes=DEF_PLOTSIZE,plotname=None,filled=False,stacked=False,opts=None):
    """Plot a set of curves over time, on screen or into a file.

    data        - Plot lines: iterable of points
                  ( (t0,(y0,y1,y2)), (t1,(y0,y1,y2)), ... ), where each t is
                  a string in "%Y-%m-%d-%H:%M:%S" format
    curve_names - Names of curves 0,1,2, etc, used for labels
    sizes       - (width, height) of the figure in pixels
    plotname    - Output file - if None, write to screen
    filled      - If True, draw filled curves
    stacked     - If True, each curve is drawn on top of the previous ones
    opts        - Optional dictionary with 'title', 'xlabel' and 'ylabel'
    """
    import datetime, matplotlib
    if plotname is not None: matplotlib.use('Agg') # Do not use the default X backend when writing file
    import matplotlib.dates
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print_error("Interactive graphics not available. You need tcl loaded first")
    # A fresh dictionary per call instead of a shared mutable default
    if opts is None: opts = {}
    FILL_COLORS = ['b','r','g','y','k']
    # Reorganize data: from data = ( t1,(a1,b1,c1) ), (t2,(a2,b2,c2)), (t3,(a3,b3,c3)) to
    # time = (t1,t2,t3), values = ((a1,a2,a3),(b1,b2,b3),(c1,c2,c3))
    times, values = zip(*data)
    # Stack values if requested
    values = list(stack_values(values) if stacked else zip(*values))
    # Convert times to datetime type
    times = [datetime.datetime.strptime(t,"%Y-%m-%d-%H:%M:%S") for t in times]
    # Draw graph, last curve first, so that with stacked values the smaller
    # running totals are drawn over the larger ones
    fig, ax = plt.subplots()
    for cnt in reversed(range(len(values))):
        color = FILL_COLORS[cnt % len(FILL_COLORS)]
        label = curve_names[cnt][:11]
        fill_plot( ax, times, values[cnt], color=color, label=label, filled=filled)
    # Place the legend outside the right edge of the axes
    ax.legend(bbox_to_anchor=(1.00, 0.8), loc=2, borderaxespad=0.0, prop={'size':9})
    # Format labels and plot
    if 'title' in opts:
        title = plt.title(opts['title'])
        plt.setp(title, size='large', color='b')
    if 'xlabel' in opts:
        xlabel = plt.xlabel(opts['xlabel'])
        plt.setp(xlabel, size='medium', weight='bold', color='g')
    if 'ylabel' in opts:
        ylabel = plt.ylabel(opts['ylabel'])
        plt.setp(ylabel, size='medium', weight='bold', color='g')
    # Stamp the figure with the time the plot was made
    plt.figtext(1.0, 1.0, time.strftime("%Y-%m-%d %H:%M:%S"),color='gray',horizontalalignment='right',verticalalignment='top',size=10)
    plt.grid(linewidth=0.2)
    # Format x axis
    yearsFmt = matplotlib.dates.DateFormatter('%a\n%b %d')
    ax.xaxis.set_major_formatter(yearsFmt)
    set_fig_size_pixels(fig, sizes[0], sizes[1])
    if plotname:
        plt.savefig(plotname)
    # Show plot on terminal
    else:
        plt.show()
def combine_top0(data,ntop=None,other=None):
    """Order the keys of 'data' by total value and keep the top ones.

    data  - Dictionary { dt: [ (key, value), ... ], ... }
    ntop  - Number of keys to keep; when unset, or not smaller than the
            number of keys, all keys are kept
    other - Name of an extra column holding the sum of the dropped keys;
            only used when keys are actually dropped

    Returns (names, rows): names in ascending order of total (followed by
    'other' when used), and rows as a sorted list of (dt, [values]) with
    one value per name.
    """
    # Total value per key
    totals = {}
    for _, pairs in data.items():
        for name, amount in pairs:
            if name in totals:
                totals[name] = totals[name] + amount
            else:
                totals[name] = amount
    # Sort by total, get top
    ranked = sorted(totals.items(), key=lambda pair: pair[1])
    toplist = [name for name, _ in ranked]
    if ntop and ntop < len(toplist):
        toplist = toplist[-ntop:]
    else:
        other = None
    # Build one row per dt, in the order of toplist
    rows = []
    for stamp, pairs in data.items():
        lookup = dict(pairs)
        row = [lookup.get(name, 0) for name in toplist]
        if other:
            row.append(sum(lookup.values()) - sum(row))
        rows.append((stamp, row))
    if other:
        toplist.append(other)
    rows.sort()
    return toplist, rows
def combine_top(category_series,ntop=None,other=None,include=None):
    """Choose the curves to show and regroup their values by time point.

    category_series - { category:[v0,v1,v2,..], category:[v0,v1,v2,...], ... }
    ntop            - Number of curves to keep; when unset, or not smaller
                      than the number of names, all are kept
    other           - Name of an extra curve holding the sum of the dropped
                      curves; only added when curves are actually dropped
    include         - Explicit list of categories to use, in this order,
                      instead of all categories ordered by total. Names
                      missing from category_series are drawn as zeros.

    Returns [ top_names, data_series ], where data_series holds one sequence
    per time point, [ (v1,v2,v3,...), (v1,v2,v3,...), ... ], with one value
    per name in top_names.
    """
    if include:
        # Copy, so that the caller's list is never modified
        top_names = list(include)
    else:
        # Ascending order of total value
        sums = [ (category,sum(series)) for category,series in category_series.items() ]
        top_names = [ k for k,v in sorted(sums, key=lambda l: l[1]) ]
    # Stand-in for requested categories that have no data
    npoints = max((len(series) for series in category_series.values()), default=0)
    zeros = (0,) * npoints
    # Re-order items
    time_points = list(zip(*[category_series.get(category, zeros) for category in top_names])) # => [ (v0,v0,v0..), (v1,v1,v1..) .. ]
    # Truncate names and values together, so that both always line up
    if ntop and ntop<len(top_names):
        top_names = top_names[-ntop:]
        if other:
            # Add value of 'other' to names, points
            top_names.append(other)
            time_points = [ list(values[-ntop:]) + [sum(values)-sum(values[-ntop:])] for values in time_points ]
        else:
            time_points = [ list(values[-ntop:]) for values in time_points ]
    return top_names, time_points
# Pick the value of the column that the data is keyed on (option -a)
def filter_entry(entry,opts):
    """Return the entry's value for the attribute named by opts['-a'].

    Ends the program through print_error() when the entry has no such column.
    """
    column = opts['-a']
    if column not in entry:
        print_error("Attribute %s does not exist" % column)
    return entry[column]
def csv_quoted_vals(vals):
    """Format values as one CSV line with every field in double quotes.

    Each value is converted with str(). A double quote inside a value is
    doubled, so that the field stays one field when the line is read back.
    """
    return '"' + '","'.join([str(v).replace('"', '""') for v in vals]) + '"'
#-----------------------------------------------------------------------
# Main
#-----------------------------------------------------------------------
def _write_csv(basename, curve_names, times, timeseries):
    """Write the plot table to basename + '.csv': a header line, then one line per time point."""
    with open(basename + ".csv", "w") as fout:
        print(csv_quoted_vals(['Date'] + list(curve_names)), file=fout)
        for t,vals in zip(times,timeseries):
            print(csv_quoted_vals([t]+list(vals)), file=fout)


def main():
    """Program entry point.

    Depending on the options, this prints the sacct command line (-Q), dumps
    sacct output into a file (-W), or reads job entries (from the files given
    as arguments, otherwise from sacct), averages the cores in use per
    interval for each value of the chosen attribute, and writes the result as
    CSV (-C), as gnuplot script plus CSV (-G), or as a matplotlib plot.
    """
    opts, args = get_opts(sys.argv[1:])
    # Print out sacct command line
    if '-Q' in opts:
        print(get_sacct_cmd(opts['-S'],opts['-E'],False))
        sys.exit()
    # Run sacct and write to file
    if '-W' in opts:
        with open(opts['-W'],"w") as fout:
            print( "# SACCT READ ON: ", time.strftime("%Y-%m-%d %H:%M:%S"), file=fout )
            print( "# SACCT COMMAND: ", get_sacct_cmd(opts['-S'],opts['-E'],False), file=fout )
            print( get_sacct_output(opts['-S'],opts['-E']), file=fout )
        sys.exit()
    if args:
        entry_generator = read_job_run_entries_from_file(opts['start'],opts['end'],args,filter=opts['filter'])
    else:
        entry_generator = read_job_run_entries_from_sacct(opts['start'],opts['end'],filter=opts['filter'])
    # Per attribute value: a +ncpus event at each job start, -ncpus at each job end
    job_changes = {}
    for entry in entry_generator:
        attribute = filter_entry(entry,opts)
        if not attribute: continue
        changes = job_changes.setdefault(attribute, [])
        changes.append( ( entry['start'], int(entry['ncpus']) ) )
        changes.append( ( entry['end' ], -int(entry['ncpus']) ) )
    if not job_changes:
        msg = "No data was eligible for plotting"
        title = opts['-t']
        plotname = opts['-g']
        if not opts['-C'] and not opts['-G']:
            print_error_plot(title,msg,DEF_PLOTSIZE,plotname)
        else:
            print_error(msg)
        sys.exit() # End program - no graph to make
    # For attribute, combine job start,end info above to get total cores used over time
    timeseries = {}
    startsec = opts['start']
    for attribute, series in list(job_changes.items()):
        timeseries[attribute] = [(startsec,0)]
        value = 0
        prevt = None
        for t,v in sorted(series):
            value += v
            # This prevent time value from repeating
            if t==prevt:
                timeseries[attribute][-1] = (t,value)
            else:
                timeseries[attribute].append((t,value))
            prevt = t
    # Divide irregularly spaced time series data into regulary spaced intervals
    for attribute in timeseries:
        generator = stepify(timeseries[attribute],opts['start'],opts['end'],opts['interval'])
        times, timeseries[attribute] = zip( *generator ) # All lists times[] are identical
    times = [sec2timekey(t) for t in times]
    # Determine subgroup order and choose top curves
    curve_names, timeseries = combine_top(timeseries,opts['ntop'],'OTHER',opts['include'])
    # CSV output: -C and -G each get their own FNAME.csv, so that the file
    # named in the gnuplot script exists even when both options are given
    csv_basenames = []
    for basename in (opts['-C'], opts['-G']):
        if basename and basename not in csv_basenames:
            csv_basenames.append(basename)
    for basename in csv_basenames:
        _write_csv(basename, curve_names, times, timeseries)
    # GNUPLOT OUTPUT
    if opts['-G']:
        with open(opts['-G'] + ".gnuplot", "w") as fout:
            # timefmt has to match the timestamps written by sec2timekey(),
            # which join date and time with '-'
            print("""set datafile separator ","
set timefmt "%%Y-%%m-%%d-%%H:%%M:%%S"
set xdata time
set title "%s"
plot for [col=2:%d] "%s.csv" using 1:col with lp ls col title columnhead(col) at end
""" % (opts['-t'], len(curve_names)+1, opts['-G']), file=fout)
    if not opts['-C'] and not opts['-G']:
        # Plot data
        title = opts['-t']
        plot_matplotlib( zip(times,timeseries), curve_names, plotname=opts['-g'],filled=True,stacked=True,opts={'title':title,'ylabel':'Core Hours'})
# Run main() only when this file is executed as a script, not when imported
if __name__ == '__main__':
    main()