gpfs-scan-files.py

#!/usr/bin/python

from __future__ import print_function


#-----------------------------------------------------------------------
#  Todo
#-----------------------------------------------------------------------
#
#  Learn how to redirect stdout and stderr into files
#

#-----------------------------------------------------------------------
#  Imports
#-----------------------------------------------------------------------
#  Standard modules
import os,sys,time,getopt,tempfile,subprocess

#  Custom modules
import DB, User


#-----------------------------------------------------------------------
#  Configuration
#-----------------------------------------------------------------------
MMAPPLYPOLICY = '/usr/lpp/mmfs/bin/mmapplypolicy'
TOTAL =         '/usr/local/bin/total'
HTMLTEMPLATE  = '/gpfs/scratchfs1/admin/gpfs-scan-files/html.template'
MMAPPLY_HOSTS = ['ddnnsd1.storrs.hpc.uconn.edu', 'ddnnsd2.storrs.hpc.uconn.edu']

DEF_OUTDIR    = "/gpfs/scratchfs1/admin/gpfs-scan-files/tmp"

THRESHOLD     = 1000000000000  #  Highlight sizes over 1TB


#  This directory is always exclude from mmapplypolicy when using an
#  automatically generated list of directories to include
ALWAYS_EXCLUDE_DIRS = ['.snapshots']


#-----------------------------------------------------------------------
#  Functions
#-----------------------------------------------------------------------

def print_error(msg):
	print()
	print("  ERROR:",msg)
	print()
	sys.exit()

def print_warn(msg):
	print()
	print("  WARNING:",msg)
	print()

#  Accepts either 
#    a single command represented by a single string
#    a single command represented by an array of string components
#    a series of commands represented by an array of an array of string components
def _ncheck_output(cmds):
	if type(cmds   )!=type([]): cmds = [cmds.split()]
	if type(cmds[0])!=type([]): cmds = [cmds]
	ncmds = len(cmds)
	ps = ncmds*[None]
	for i,cmd in enumerate(cmds):
		if i==0:
			ps[i] = subprocess.Popen(cmds[i],                       stdout=subprocess.PIPE, shell=False)
		else:
			ps[i] = subprocess.Popen(cmds[i], stdin=ps[i-1].stdout, stdout=subprocess.PIPE, shell=False)
	for i in range(ncmds-1):
		ps[i].stdout.close()
	return ps[-1].communicate()[0]

def get_hostname():
	return os.uname()[1]

#  Write list of files in include_file_list to file in out_dir and return file name
def write_include_file_list(out_dir, include_file_list):
	fpath = out_dir + "/include_file_list.txt"
	with open(fpath,"w") as fout:
		for fname in include_file_list:
			print(fname, file=fout)
	return fpath

#  Return list of strings from file; each line a string, omit blanks and comment lines
def read_file_list(fname):
	file_list = []
	with open(fname) as fin:
		for line in fin:
			sline = line.strip()
			if not sline or sline[0]=='#': continue
			file_list.append(sline.split("#")[0])  # Remove trailing comments
	return file_list

def get_ok_dirs(top_dir, exclude_dirs):
	top_dir = top_dir.rstrip("/")
	ok_dirs = []
	for item in os.listdir(top_dir):
		path = top_dir + "/" + item
		if os.path.isdir(path) and not os.path.islink(path) and not item in exclude_dirs and not item in ALWAYS_EXCLUDE_DIRS:
			ok_dirs.append(path)
	return ok_dirs

def run_mmapplypolicy(exefile, target_dir, policy_file, temp_dir, out_dir, include_file_list, cmdonly=False):
	#  Ensure we are runnning mmapplypolicy on the correct host
	hostname = get_hostname()
	if not hostname in MMAPPLY_HOSTS and not cmdonly:
		print_error("  When using the -s option, you must run on one of the DDN NSD nodes (%s)" % ",".join(MMAPPLY_HOSTS))
	#  Create output directories
	if not os.path.isdir(out_dir ): os.makedirs(out_dir )
	if not os.path.isdir(temp_dir): os.makedirs(temp_dir)
	#  Create policy file
	with open(policy_file,"w") as fout:
		print("RULE 'listall' list 'all-files'\nSHOW( varchar(access_time) || ' ' || varchar(kb_allocated) || '  ' || varchar(file_size) || ' ' || varchar(user_id) || ' ' || fileset_name  )", file=fout)
	#  Run GPFS utility 'mmapplypolicy'
	#    Notes about mmapplypolicy
	#     - mmapplypolicy is the fastest way to get list of all files on scratch partition
	#     - mmapplypolicy reads the RULES file that we wrote to temp file above
	#     - Name of output data file is hard-coded in mmapplypolicy as "list.all-files"
	#     - Status message are written to stdout and stderr
	#     - It takes about 1.25 hours for mmapplypolicy to list all files on scratch disk
	#     - Column output in list.all-files is
	#           Inode_number, gen_number, snapshot_id, access_time, kb_allocated, file_size, "--", filename
	#       The first three and last two columns are always present, the other columns are determined
	#       from the SHOW policy command
	#   MMAPPLYPOLICY OPTIONS
	#     -s  Location of temporary directories used by mmapplypolicy.  Must be on same device as that scanned.
	#     -I  'defer'.  Always used when doing a list.  Just one of those things ...
	#     -P  Location of policy file
	#     -f  Location of output files
	#     -i  File list list of directories to include.  Must exclude .snapshot from this list.  We also
	#         exclude /gpfs/scratchfs/BROKEN, which contains borked tmp/ and tmp_ard/ - these are broken
	#         and cause mmapplypolicy to hang indefinitely.
	#     -x  File list of top level directories to include.  
	#     -a4 Number of threads to use.  Default is 2. 
	if not include_file_list:
		cmd = "%(exefile)s %(target_dir)s -s %(temp_dir)s -I defer -P %(policy_file)s -f %(out_dir)s -a4" % locals()
	else:
		include_dir_file = write_include_file_list(out_dir, include_file_list)
		cmd = "%(exefile)s %(target_dir)s -s %(temp_dir)s -I defer -P %(policy_file)s -f %(out_dir)s -i %(include_dir_file)s -a4" % locals()
	if cmdonly:
		print ("mmapplypolicy command is (%s)" % cmd)
		sys.exit()
	p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	out,err = p.communicate()
	with open(out_dir + "/mm.out", "w") as fout: print(out,file=fout)
	with open(out_dir + "/mm.err", "w") as ferr: print(err,file=ferr)
	return p.returncode

#  Dummy version of run_mmapplypolicy().  Instead of generating new output file, it copies one
#  from TEST_OUTPUT.  So, you need to have that in place beforehand.
def run_test(temp_dir, out_dir):
	TEST_OUTPUT = '/gpfs/scratchfs1/admin/gpfs-scan-files/testfiles/list.all-files'
	#  Create output directories
	if not os.path.isdir(out_dir ): os.makedirs(out_dir )
	if not os.path.isdir(temp_dir): os.makedirs(temp_dir)
	cmd = [ 'cp', TEST_OUTPUT, out_dir+"/list.all-files" ]
	p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	out,err = p.communicate()
	with open(out_dir + "/mm.out", "w") as fout: print(out,file=fout)
	with open(out_dir + "/mm.err", "w") as ferr: print(err,file=ferr)
	return p.returncode
	
def csvrow(row):
	return ",".join(['"%s"' % str(col) for col in row])

def htmlheader(headerstr):
	row = [ str(col) for col in headerstr.split() ]
	return "<tr><th>" + "</th><th>".join(row) + "</th></tr>\n"

def htmlrow(*row,**args):
	tclass = args.get('tclass','t')
	#newrow = [ str(col) for col in row ]
	#return "<tr><td>" + "</td><td>".join(newrow) + "</td></tr>"
	return "<tr>" + ''.join( ["<td class='%s-%d'>%s</td>" % (tclass,i+1,str(col)) for i,col in enumerate(row)] ) + "</tr>\n"

def ReadDB(dbfile,datetime=None):
	db = DB.DB(dbfile)
	#  Get most recent time
	if datetime==None:
		[[datetime]] = db.read('select max(datetime) from user_footprint;')
	file_list  = db.read("select footprint,count,userid,user from user_footprint      where datetime='%s' order by footprint desc;" % datetime)
	homedirs   = db.read("select footprint,count,dir         from homedir_footprint   where datetime='%s' order by footprint desc;" % datetime)
	shareddirs = db.read("select footprint,count,dir         from shareddir_footprint where datetime='%s' order by footprint desc;" % datetime)
	return datetime,file_list, homedirs, shareddirs


def WriteDB(dbfile,datetime,file_list,homedirs,shareddirs):
	db = DB.DB(dbfile)
	#  Table user_footprint
	for rank,(size,count,uid,username) in enumerate(file_list):
		db.write("insert into user_footprint values('%s','%s','%s','%s','%s','%s');" % (datetime,uid,username,rank+1,count,size))
	#  Table dir_footprint
	for rank,(size,count,dirname) in enumerate(homedirs):
		db.write("insert into homedir_footprint values('%s','%s','%s','%s','%s');" % (datetime,dirname,rank+1,count,size))
	for rank,(size,count,dirname) in enumerate(shareddirs):
		db.write("insert into shareddir_footprint values('%s','%s','%s','%s','%s');" % (datetime,dirname,rank+1,count,size))
	db.close()


def WriteCSV(csvfile,datetime,file_list,growths,homedirs,shareddirs):
	with open(csvfile,'w') as fcsv:
		#  Table user_footprint
		print('"User Space Used"', file=fcsv)
		print(csvrow('DateTime Rank UID User Count Growth Size'.split()), file=fcsv)
		for rank,(size,count,uid,username) in enumerate(file_list):
			growth = growths.get(username,'-')
			print(csvrow([datetime,rank+1,uid,username,count,growth,size]), file=fcsv)
		print(file=fcsv)

		#  Table homedir_footprint
		print('"Home Directories"', file=fcsv)
		print(csvrow('DateTime Rank Home_Directory Count Size'.split()), file=fcsv)
		for rank,(size,count,dirname) in enumerate(homedirs):
			print(csvrow([datetime,rank+1,dirname,count,size]), file=fcsv)
		print(file=fcsv)

		#  Table shareddir_footprint
		print('"Shared Directories"', file=fcsv)
		print(csvrow('DateTime Rank Shared_Directory Count Size'.split()), file=fcsv)
		for rank,(size,count,dirname) in enumerate(shareddirs):
			print(csvrow([datetime,rank+1,dirname,count,size]), file=fcsv)
		print(file=fcsv)


def WriteHTML(htmlfile,datetime,file_list,growths,homedirs,shareddirs,topn=100):
	with open(HTMLTEMPLATE) as fin:
		htmltemplate = fin.read()
	#  Set datetime
	htmltemplate = htmltemplate.replace('{{TIMESTAMP}}',datetime)
	#  Table user_footprint
	msg  = "<table>\n"
	msg += htmlheader('Rank UID User Count Growth Size Cumulative')
	cumsize = cumcount = 0
	for rank,(size,count,uid,username) in enumerate(file_list):
		cumsize  += size
		cumcount += count
		growth    = growths.get(username,'-')
		if rank+1<=topn:
			msg += htmlrow(rank+1,uid,username,comma(count),hilite_size(comma(growth),growth,THRESHOLD),comma(size),comma(cumsize),tclass='t1')
	cumgrowth = sum(growths.values())
	msg += htmlrow(len(file_list),'ALL','',comma(cumcount),comma(cumgrowth),0,comma(cumsize),tclass='t1')
	msg += "</table>\n"
	htmltemplate = htmltemplate.replace('{{USER_TABLE}}',msg)
	#  Table homedir_footprint
	msg  = "<table>\n"
	msg += htmlheader('Rank Home_Directory Count Size Cumulative')
	cumsize = 0
	for rank,(size,count,dirname) in enumerate(homedirs):
		cumsize  += size
		msg += htmlrow(rank+1,dirname,comma(count),comma(size),comma(cumsize),tclass='t2')
	msg += "</table>\n"
	htmltemplate = htmltemplate.replace('{{HOMEDIR_TABLE}}',msg)
	#  Table shareddir_footprint
	msg  = "<table>\n"
	msg += htmlheader('Rank Shared_Directory Count Size Cumulative')
	cumsize = 0
	for rank,(size,count,dirname) in enumerate(shareddirs):
		cumsize  += size
		msg += htmlrow(rank+1,dirname,comma(count),comma(size),comma(cumsize),tclass='t3')
	msg += "</table>\n"
	htmltemplate = htmltemplate.replace('{{SHAREDDIR_TABLE}}',msg)
	#  Configure date navigation links
	htmltemplate = htmltemplate.replace('{{BEFORE}}',add_date(datetime,-1))
	htmltemplate = htmltemplate.replace('{{AFTER}}', add_date(datetime, 1))
	#  Write final version of page
	with open(htmlfile,"w") as fout:
		fout.write(htmltemplate)
	
def comma(s):
    new = ''
    #sign, s = '' if s>-0 else '-', abs(int(s))
    s = str(s)
    for i,c in enumerate(s[-1:0:-1]):
            new = c + new
            if (i % 3) == 2:
                    new = "," + new
    return (s[0] + new).replace("-,","-")

#  Make string bold if size reaches or exceeds threshold
def hilite_size(strval,size,threshold):
	if size>=threshold:
		return '<span class="threshold">' + strval + '</span>'
	elif size<=-threshold:
		return '<span class="thresholdneg">' + strval + '</span>'
	else:
		return strval


#  Reads in datetime string "%Y-%m-%d %H:%M:%S", but returns date string
def add_date(datetime, days):
	secs = time.mktime(time.strptime(datetime, "%Y-%m-%d %H:%M:%S")) + days*24*3600
	return time.strftime("%Y-%m-%d",time.localtime(secs))


#  Read arguments from command line
def parse_args(args):
	try:
		opts,args = getopt.getopt(sys.argv[1:],'s:p:D:C:H:hId:o:i:qx:')
	except getopt.GetoptError as e:
		Usage("   ERROR: " + str(e))
	opts = dict(opts)
	if not opts or '-h' in opts: Usage()   #   Print usage
	##if '-s' in opts and not '-p' in opts: 
	##	Usage("You must use -s and -p together")
	if '-p' in opts and not ( ('-D' in opts) + ('-C' in opts) + ('-H' in opts) ):
		Usage(" You must use -p with at least one of -D, -C and -H")
	if '-o' in opts and not '-s' in opts:
		Usage(" You can only use -o with the -s option")
	if '-I' in opts and not '-D' in opts:
		Usage("You must specify a database using -D when using -I")
	if '-D' in opts and not ('-C' in opts or '-H' in opts or '-I' in opts or '-p' in opts):
		Usage("You must use either -C, -H, -p, or -I, when using -D")
	if '-i' in opts and '-x' in opts:
		Usage("You cannot use both -i and -x")
	opts['writedb'] = (('-s' in opts) or ('-p' in opts)) and ('-D' in opts)
	opts['readdb' ] = ('-D' in opts) and ('-C' in opts or '-H' in opts)
	return opts, args

def Usage(msg=None):
	print("""
   Usage:  gpfs-scan-files.py [-s directory] [-p filelist] [-D dbname] [-C csvfile] [-H htmlfile] [-h] [-I]

   Options: 
    -d DATETIME   Set datetime (format YYYY-mm-dd HH:MM:SS) to write to DB, CSV or HTML
                  when using -s and/or -D.
    -s DIRECTORY  Scan directory recursively using mmapplypolicy.  Mmapplypolicy output
                  will be written to tmp/list.all-files, where tmp is a directory
                  created under the current directory (i.e., not the /tmp directory)
    -p MMOUTPUT   Parse mmapplyoutput and write to DB, CSV and/or HTML.  MMOUTPUT
                  is the file created from the -s option above.
    -D DBNAME     Name of database file 
    -I            Create new empty datbase file in DBNAME given by -D
    -o            Output directory for mmapplypolicy, must be on scratch partition. 
                  Only valid with -s option.  Default value is %s
    -q            With -s, print mmapplypolicy command only
    -x DIR[,DIR,...]  List of directories under DIRECTORY (-s option) that will be excluded.
    -i FILE       File with list of top-level directories under DIRECTORY (-s option) that
                    will be searched.  Others top-level directories are ignored.
""" % DEF_OUTDIR )
	if msg:
		print();print(msg);print()
	sys.exit()


def read_compare(dbfile,datetime):
	db = DB.DB(dbfile)
	#  Get all datetimes to find datetime previous to requested datetime
	datetimes = db.read("select distinct(datetime) from user_footprint order by datetime;")
	prev_datetime = None
	for [curr_datetime] in datetimes:
		if prev_datetime and curr_datetime==datetime:
			break
		prev_datetime = curr_datetime
	if not prev_datetime:
		print_error("Could not find previous datetime to %s" % datetime)
	#  Get files for two datetimes
	files0  = dict(db.read("select user,footprint from user_footprint where datetime='%s' order by footprint desc;" % prev_datetime))
	files1  = dict(db.read("select user,footprint from user_footprint where datetime='%s' order by footprint desc;" % datetime)     )
	#  Combine the two lists
	return dict([ (user,files1[user]-files0[user]) for user in set(files0.keys()).intersection(files1.keys()) ])

#-----------------------------------------------------------------------
#  Main
#-----------------------------------------------------------------------
def main():

	opts, args = parse_args(sys.argv[1:])
	#  Read DB records corresponding to given date/time
	if '-d' in opts:
		datetime = opts['-d']
		db_read_datetime = datetime  #  used to retreive DB data
	#  Read most recent DB records
	else:
		datetime    = time.strftime("%Y-%m-%d %H:%M:%S")
		db_read_datetime = None


	#  Initialize a new, empty database
	if '-I' in opts:
		import DB
		db = DB.DB(opts['-D'])
		db.create_tables()

	#  Scan directories with mmapplypolicy, and write to list.all-files in out_dir
	if '-s' in opts:
		#  Write outpufile into opts['-o']
		if '-o' in opts:
			out_dir  = opts['-o']
		#  Write output into default location
		else:
			if not os.path.isdir(DEF_OUTDIR):
				os.makedirs(DEF_OUTDIR)
			out_dir = DEF_OUTDIR
		temp_dir         = out_dir + "/tmp"
		policy_file      = out_dir + "/policy_file"
		target_dir       = opts['-s']
		#  Read user-supplied file with list of directories to include in search
		if '-i' in opts:
			include_file_list = read_file_list(opts['-i'])
		#  Create file with list of directories to include in search, by
		#  by excluding directories in opts['-x'] from top level of 'target_dir'
		elif '-x' in opts:
			include_file_list = get_ok_dirs(target_dir, opts['-x'].split(','))
		#  Call mmapplypolicy - this is where the 99% of execution time is spent
		#   Results are written to out_dir
		rc = run_mmapplypolicy(MMAPPLYPOLICY,target_dir,policy_file,temp_dir,out_dir,include_file_list, '-q' in opts)
		#rc = run_test(temp_dir,out_dir)  # For testing
		if rc>0: print_warn(" mmapplypolicy return code is (%d)" % rc)

	#  Process file from previous section to produce usage per user summary
	if '-p' in opts:
		#  Get list of total files sizes per owner
		inputfile = opts['-p']
		file_list = []
		user = User.User()
		for line in _ncheck_output([[TOTAL, "-s1r", "8", "6,n", inputfile]]).split("\n"):
			sline = line.strip()
			if not sline or sline[0]=='#': continue
			uid, kb_allocated, count = sline.split()
			#  Size is number of kb block allocated, plus 4096 per inode (assume #files==#inodes)
			size = int(kb_allocated) * 1024 + int(count) * 4096
			file_list.append((size,count,uid,user.get_user(uid)))
		file_list.sort(key=lambda x: x[0], reverse=True)

		#  Get list of total files kb_allocateds per home and shared directories
		homedirs, shareddirs = [], []
		for line in _ncheck_output([['sed','s#/# #g',inputfile],['/usr/local/bin/total','-s1r','13,14','6,n','-']]).split("\n"):
			sline = line.strip()
			if not sline or sline[0]=='#': continue
			home_or_shared, name, kb_allocated, count = sline.split()
			#  Size is number of kb block allocated, plus 4096 per inode (assume #files==#inodes)
			size = int(kb_allocated) * 1024 + int(count) * 4096
			if home_or_shared=='home':
				homedirs.append((size,count,name))
			elif home_or_shared=='shared':
				shareddirs.append((size,count,name))
		homedirs.sort(key=lambda x: x[0], reverse=True)
		shareddirs.sort(key=lambda x: x[0], reverse=True)

	#  Write data to database
	if opts['writedb']:
		WriteDB(opts['-D'],datetime,file_list,homedirs,shareddirs)

	#  Read data from database
	if opts['readdb']:
		datetime, file_list, homedirs, shareddirs = ReadDB(opts['-D'],datetime=db_read_datetime)
		growth = read_compare(opts['-D'], datetime) # Use datetime from previous search

	#  Write three tables in one csv file/sheet
	if '-C' in opts:
		WriteCSV(opts['-C'],datetime,file_list,growth,homedirs,shareddirs)

	#  Write three tables in HTML file
	if '-H' in opts:
		WriteHTML(opts['-H'],datetime,file_list,growth,homedirs,shareddirs)
		
				
main()