#!/usr/bin/python
#
#  dtree.py

from __future__ import print_function

import os, gzip, pwd, sys, copy, time, getopt, collections

#  Defaults applied when the matching command-line option is not given
DEF_AGE = 120              #  -a: age threshold, in days
DEF_MIN = 1024 ** 4        #  -m: minimum directory size, in bytes (1 TB)
DEF_BLOCKSIZE = 1          #  -b: size of one unit in the size column, in bytes
MAX_ROWS = 1000000         #  Maximum number of table rows for rowformat
DEF_DATE_FORMAT = None     #  -F: None means dates are already YYYY-mm-dd

#  Mutable module state
CNT_DATE_ERRORS = 0        #  Count of input lines whose date could not be parsed
VERBOSE = False            #  Progress messages on/off; set by main() from -v

#  Record object of directory information
class DirRecord:
    """Accumulated statistics for one directory.

    Attributes:
        size:    sum of the file sizes added so far (in input units).
        cnt:     number of files added so far.
        mindate: smallest date string seen.
        maxdate: largest date string seen.

    Dates are compared as plain strings, so they are expected to be in
    YYYY-mm-dd form, where string order equals chronological order.
    """

    def __init__(self):
        self.size = 0
        self.cnt = 0
        #  Sentinels: any real YYYY-mm-dd date replaces them on first update
        self.mindate = '9999-99-99'
        self.maxdate = '0000-00-00'

    def update(self, cnt, size, mindate, maxdate):
        """Add `cnt` files totalling `size` and widen the date range."""
        self.cnt += cnt
        self.size += size
        if mindate < self.mindate:
            self.mindate = mindate
        if maxdate > self.maxdate:
            self.maxdate = maxdate

    def combine(self, dirrecord):
        """Fold the totals of another DirRecord into this one."""
        self.update(dirrecord.cnt, dirrecord.size,
                    dirrecord.mindate, dirrecord.maxdate)

    def is_as_old(self, olddate):
        """Return True when the newest file is dated `olddate` or earlier."""
        return olddate >= self.maxdate

    def is_as_large(self, size, blocksize=1):
        """Return True when size * blocksize of this record reaches `size`."""
        return self.size * blocksize >= size

    def get_tuple(self):
        """Return (size, cnt, mindate, maxdate)."""
        return (self.size, self.cnt, self.mindate, self.maxdate)

    def __repr__(self):
        return "%s %s %s %s" % self.get_tuple()


#  Print single error message and halt the program
def print_error(msg):
    """Print a single error message and halt the program.

    The message is written to standard error so it cannot end up inside a
    report printed on standard output, and the process exits with status 1
    so that calling scripts can detect the failure.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print("  ERROR: ", msg, file=sys.stderr)
    sys.exit(1)

#  Print verbose text
def print_verbose(verbose, msg):
    """Print `msg` prefixed with a timestamp, but only when `verbose` is true.

    The prefix has the form YYYY-mm-dd HH:MM:SS.  Nothing is printed, and
    the clock is not read, when `verbose` is false.
    """
    if not verbose:
        return
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(stamp, msg)

#  Pad text
def pad(s, width, left=False):
    """Return `s` forced to `width` characters with blanks, cutting any excess.

    With left=True the text is left-justified: blanks are added on the
    right and overlong text keeps its first `width` characters.  Otherwise
    the text is right-justified and overlong text keeps its last `width`
    characters.  Note that a width of 0 without `left` returns `s`
    unchanged, because the slice s[-0:] is the whole string.
    """
    if left:
        return s.ljust(width)[:width]
    return s.rjust(width)[-width:]


#  Pad and justify rows, like Unix command 'column -t'
def rowformat(rows):
    """Yield one text line per row, with columns padded to a common width.

    Every item is converted with str().  Each column is as wide as its
    widest item, columns are separated by two blanks, items are
    left-justified, and trailing blanks are stripped from each line.
    Rows may have different lengths.

    This is a generator, so the row-count check below only runs when the
    first line is requested.

    Args:
        rows: a sized sequence (len() is taken) of sequences of items.

    Raises:
        SystemExit: with exit code 1 when there are more than MAX_ROWS rows.
    """
    if len(rows) > MAX_ROWS:
        #  Both values must be passed as one tuple to the % operator
        print("ERROR:  Number of rows (%d) exceeds MAX_ROWS (%d)" % (len(rows), MAX_ROWS),
              file=sys.stderr)
        sys.exit(1)
    #  Get maximum width for each column
    widths = []
    for row in rows:
        #  Extend length of widths if necessary
        if len(row) > len(widths):
            widths.extend([0] * (len(row) - len(widths)))
        #  Update in place so that a short row never discards the widths
        #  already recorded for the columns it does not have
        for col, item in enumerate(row):
            widths[col] = max(widths[col], len(str(item)))
    for row in rows:
        cells = [pad(str(item), width, True) for width, item in zip(widths, row)]
        yield '  '.join(cells).rstrip()

	
#  Read config file that describes reports
def read_config(fname):
    """Read the config file that describes the reports to produce.

    Blank lines and lines starting with '#' are ignored.  Every other line
    must hold exactly three whitespace-separated fields: age in days,
    minimum directory size (K/M/G/T suffixes allowed) and output file name.

    Args:
        fname: path of the config file.

    Returns:
        A list of (cutoff date string, minimum size as int, output file
        name) tuples, in file order.

    Any malformed line is reported through print_error, which halts the
    program.
    """
    config = []
    with open(fname) as fin:
        #  Count from 1 so that reported line numbers match a text editor
        for lineno, line in enumerate(fin, 1):
            sline = line.strip()
            if not sline or sline.startswith("#"):
                continue
            fields = sline.split()
            if len(fields) != 3:
                print_error("Wrong number of arguments on line %d of config file %s" % (lineno, fname))
            #  Use a separate name for the output file: reusing `fname`
            #  would make later error messages name the wrong file
            age, mindir, out_name = fields
            try:
                entry = (age2date(age), read_human(mindir), out_name)
            except ValueError:
                print_error("Invalid age or size on line %d of config file %s" % (lineno, fname))
            config.append(entry)
    return config


#  Convert integers to number of KB, MB, GB or TB
def write_human(n):
    """Format a number using the largest fitting unit: K, M, G or T.

    Units are powers of 1024.  Scaled values below 10 are shown with one
    decimal (e.g. 1.5K), larger ones with none (e.g. 20M).  Numbers below
    1024 are printed as a plain integer.
    """
    units = (('T', 1024**4), ('G', 1024**3), ('M', 1024**2), ('K', 1024))
    for symbol, size in units:
        if n >= size:
            scaled = n / float(size)
            template = "%.1f%s" if scaled < 10 else "%.f%s"
            return template % (scaled, symbol)
    return "%d" % n

#  Given string like 2.1T or 17G, etc, return integer
def read_human(nstr):
    """Convert a size string such as 2.1T, 17G or 512 to an integer.

    A final letter t, g, m or k (either case) multiplies the number by
    1024 to the power 4, 3, 2 or 1.  Without a suffix the number is taken
    as is.  The result is truncated to an int.

    Raises:
        IndexError: when `nstr` is empty.
        ValueError: when the numeric part is not a valid number.
    """
    exponents = {'t': 4, 'g': 3, 'm': 2, 'k': 1}
    suffix = nstr[-1].lower()
    if suffix in exponents:
        return int(float(nstr[:-1]) * 1024**exponents[suffix])
    return int(float(nstr))
	

def age2date(daystr):
    """Return the local date `daystr` days before now, as YYYY-mm-dd.

    `daystr` is converted with int(), so both "30" and 30 are accepted.
    A day is counted as exactly 24 hours.

    Raises:
        ValueError: when `daystr` cannot be converted to an integer.
    """
    seconds_per_day = 3600 * 24
    cutoff = time.time() - seconds_per_day * int(daystr)
    return time.strftime("%Y-%m-%d", time.localtime(cutoff))

#  Process command line arguments and return to main 
def process_args():
    """Parse sys.argv and return the settings needed by main.

    Returns:
        (opts, file_in, top_dir, col_name, col_size, col_date), where
        `opts` maps each option given on the command line (e.g. '-v') to
        its value and also holds the derived keys 'age' (cutoff date
        string), 'blocksize', 'min' and 'dateformat'.  The three column
        numbers are converted from the 1-based numbers given by the user
        to 0-based indexes.

    Any problem with the command line is reported through Usage, which
    prints the help text and exits.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'm:a:c:b:vF:')
    except getopt.GetoptError as err:
        #  Unknown option or missing option argument
        Usage(str(err))
    opts = dict(opts)
    #  Sanity check
    if '-c' in opts and ('-a' in opts or '-m' in opts):
        Usage("You cannot use options -a or -m with -c")
    try:
        opts['age'] = age2date(opts['-a']) if '-a' in opts else age2date(DEF_AGE)
        opts['blocksize'] = read_human(opts['-b']) if '-b' in opts else DEF_BLOCKSIZE
        opts['min'] = read_human(opts['-m']) if '-m' in opts else DEF_MIN
    except (ValueError, IndexError):
        #  read_human raises IndexError for an empty string
        Usage("Invalid value for option -a, -b or -m")
    opts['dateformat'] = opts['-F'] if '-F' in opts else DEF_DATE_FORMAT
    #  Arguments beyond the fifth are ignored
    if len(args) < 5:
        Usage("Expected arguments: FILELIST TOPDIR COL_NAME COL_SIZE COL_DATE")
    file_in, top_dir = args[:2]
    try:
        col_name, col_size, col_date = [int(x) - 1 for x in args[2:5]]
    except ValueError:
        Usage("COL_NAME, COL_SIZE and COL_DATE must be integers")
    #  A negative index would silently count columns from the end of the line
    if min(col_name, col_size, col_date) < 0:
        Usage("Column numbers start at 1")
    return opts, file_in, top_dir, col_name, col_size, col_date

#  Read lines from file list
def read_line(fin, col_name, col_size, col_date, opts):
    """Yield (file name, size, date) for every usable line of the file list.

    Args:
        fin: iterable of lines; each line may be bytes (e.g. from
            gzip.open) or text (e.g. from open or sys.stdin).
        col_name, col_size, col_date: 0-based indexes of the
            whitespace-separated columns holding the file name, the size
            and the (first) date field.
        opts: dict whose 'dateformat' entry is either a false value, meaning
            the date column is already YYYY-mm-dd, or a time.strptime
            format.  A format made of several space-separated parts
            consumes that many adjacent columns starting at col_date.

    Yields:
        (name, size as int, date as YYYY-mm-dd string).

    Lines with too few columns are skipped silently.  Lines whose date
    cannot be parsed are skipped and counted in CNT_DATE_ERRORS.  A size
    field that is not an integer raises ValueError.

    NOTE: because columns are split on whitespace, a file name containing
    blanks is cut at its first blank.
    """
    global CNT_DATE_ERRORS
    date_format = opts['dateformat']
    if date_format:
        num_date_cols = len(date_format.split())
        max_col = max(col_name, col_size, col_date + num_date_cols - 1)
    else:
        max_col = max(col_name, col_size, col_date)
    for lineno, line in enumerate(fin, 1):
        #  Only byte strings need decoding; text lines are used as they are
        if isinstance(line, bytes):
            line = line.decode()
        parts = line.split(None, max_col + 1)
        #  Index max_col must exist, so at least max_col + 1 parts are needed
        if len(parts) <= max_col:
            continue
        if date_format:
            #  Join adjacent date fields into one space-delimited string
            date_field = " ".join(parts[col_date:col_date + num_date_cols])
            try:
                parsed = time.strptime(date_field, date_format)
            except ValueError:
                CNT_DATE_ERRORS += 1
                print_verbose(VERBOSE, "Cannot parse date field from line %d: (%s)" % (lineno, date_field))
                continue
            #  Convert to standard date format YYYY-mm-dd
            date_field = time.strftime("%Y-%m-%d", parsed)
        else:
            #  Assume date field already in standard date format
            date_field = parts[col_date]
        yield parts[col_name], int(parts[col_size]), date_field


#  Remove trailing file name from file path, leaving just directory
def truncate_dir(fname):
    """Return `fname` without its last path component.

    Everything from the last "/" onwards is removed.  A name without any
    "/" is returned unchanged.
    """
    #  Note:  rfind + slice is 20% faster than using os.path.dirname(fname)
    #         and this is called *a lot* (about 400 million times)
    idx = fname.rfind("/")
    if idx < 0:
        return fname
    return fname[:idx]

#  Adapted from https://stackoverflow.com/questions/1830618/how-to-find-the-owner-of-a-file-or-directory-in-python/1830635
def find_owner(filename):
    """Return the login name of the owner of `filename`, or "N/A".

    "N/A" is returned when the file cannot be stat'ed (OSError) or when
    its numeric user id has no entry in the password database (KeyError).
    """
    try:
        uid = os.stat(filename).st_uid
        entry = pwd.getpwuid(uid)
    except (OSError, KeyError):
        return "N/A"
    return entry.pw_name

#  Return directories that are either 'home', 'shared', or 'neither'
def get_dir_group(group_name, fnames):
    """Select the names in `fnames` that belong to the group `group_name`.

    'home' keeps names under /gpfs/scratchfs1/home/, 'shared' keeps names
    under /gpfs/scratchfs1/shared/, and 'neither' keeps all other names.
    For any other group name, `fnames` itself is returned unfiltered.
    """
    home_prefix = "/gpfs/scratchfs1/home/"
    shared_prefix = "/gpfs/scratchfs1/shared/"
    for group, prefix in (('home', home_prefix), ('shared', shared_prefix)):
        if group_name == group:
            return [path for path in fnames if path.startswith(prefix)]
    if group_name == 'neither':
        both = (shared_prefix, home_prefix)
        return [path for path in fnames if not path.startswith(both)]
    return fnames
	

def get_upper_dirs(top_dir, name):
    """Yield `name` and each of its parent directories, stopping below top_dir.

    Neither `top_dir` nor anything above it is yielded.  A single trailing
    slash on `top_dir` is ignored.

    Example:
        top_dir = /gpfs/scratchfs1
        name    = /gpfs/scratchfs1/CREST/Blue/stat/snapshot
        Yields:
            /gpfs/scratchfs1/CREST/Blue/stat/snapshot
            /gpfs/scratchfs1/CREST/Blue/stat
            /gpfs/scratchfs1/CREST/Blue
            /gpfs/scratchfs1/CREST

    When `name` equals `top_dir` or is empty, nothing is yielded.  A name
    that is not below `top_dir` is followed upwards until no path
    component is left.
    """
    #  Remove trailing slash from top_dir
    if top_dir.endswith("/"):
        top_dir = top_dir[:-1]
    #  Test before yielding, so that top_dir itself (files stored directly
    #  in it) and the empty root name never become report directories
    while name and name != top_dir:
        yield name
        #  Drop the last path component (same slicing as truncate_dir)
        pos = name.rfind("/")
        if pos < 0:
            #  No slash left: the name cannot get shorter, so stop here
            #  instead of yielding the same name forever
            break
        name = name[:pos]


def Usage(msg=None):
    """Print the command-line help text, then `msg` if given, and exit.

    Args:
        msg: optional extra message explaining what was wrong with the
            command line; printed after the help text.

    Raises:
        SystemExit: always.  The exit code is 2 because every call to this
            function visible in this file reports a command-line error.
    """
    print("""
   Usage:  dtree [-a AGE_DAYS]  [-m MIN_DIRSIZE] FILELIST TOPDIR COL_NAME COL_SIZE COL_DATE
   Usage:  dtree [-c CONFIG  ]  FILELIST TOPDIR COL_NAME COL_SIZE COL_DATE

   Parameters:
      FILELIST - File with list of file system info.  Each line contains the name
                 of a file and its directory path, size of the file, and the age of the file.
                 FILELIST can be a plain text file, a gzip'ed plain text file if FILELIST
                 ends in .gz, or standard input if FILELIST is a hyphen (-)
      TOPDIR   - Directory or directories to strip off from the top of each file path, when
                 displaying information in the output
      COL_NAME - Which column contains the file names.  Columns numbered starting with 1.
      COL_SIZE - Which column contains the file size.  The units are determined by -b option.
                 Default is bytes.
      COL_DATE - Which column contains the date.  Default format is YYYY-mm-dd (e.g. 2010-12-05).
                 You can use the -F option to specify a different date format.

   Options:
     -a  AGE_DAYS    Only print directories older than AGE_DAYS: default is %d days
     -m  MIN_DIRSIZE Minimum directory size: default is %s.  You can use K,M,G and T suffixes.
     -c  CONFIG      Read AGE_DAYS, MIN_DIRSIZE, and output file name from CONFIG file
                     Each line produces a separate output.
     -b  BLOCKSIZE   Size of blocks used in COL_SIZE.  Default is 1 byte.
                     You can use K,M,G and T suffixes.
     -F  DATE_FORMAT Date format for date in COL_DATE.  Default is %%Y-%%m-%%d.  See 'date'
                     command for allowed formats.
     -v              Print timestamped progress messages.

   Config Example:
     #  Comments begin with '#'
        7  1T  dtree-007.out
       30  1T  dtree-030.out
       60  1T  dtree-060.out
       90  1T  dtree-090.out
      120  1T  dtree-120.out

""" % (DEF_AGE, write_human(DEF_MIN)))
    #  Print error msg
    if msg:
        print()
        print("   ", msg)
        print()
    sys.exit(2)


#  For group_str of directories, return formatted table string ready to print
#    Table columns are left-justified by rowformat
def format_table(group_str, top_dir_names, all_tree, blocksize):
    """Build the report table for one group of directories as a string.

    Args:
        group_str: group selector passed to get_dir_group.
        top_dir_names: candidate directory names, in output order.
        all_tree: mapping of directory name to its DirRecord.
        blocksize: factor that converts recorded sizes to bytes.

    Returns:
        The table as one string with rows separated by newlines.  It
        starts with a header row and an underline row, followed by one
        row per selected directory: directory size, file count, average
        file size, newest file date, directory owner, directory name.
    """
    header = ("DIR-SIZE", "FILE-CNT", "AVG-FILE-SIZE", "DATE-NEWEST", "OWNER", "DIRECTORY")
    underline = ("========", "========", "=============", "===========", "=========", "================")
    table = [header, underline]
    for dir_name in get_dir_group(group_str, top_dir_names):
        record = all_tree[dir_name]
        total = blocksize * record.size
        table.append((
            write_human(total),
            record.cnt,
            write_human(total / record.cnt),
            record.maxdate,
            find_owner(dir_name),
            dir_name,
        ))
    return "\n".join(rowformat(table))

def main():
    """Read the file list, total it per directory and write the reports.

    Steps:
      1. Parse the command line and build the list of reports, either from
         the -c config file or from the -a / -m options (a single report
         written to standard output).
      2. Read the file list and total count, size and dates for the
         directory that directly contains each file.
      3. Add each of those totals to every directory above it, up to but
         excluding the top directory.
      4. For each report, keep the directories that are old enough and
         large enough, drop those already covered by a kept parent
         directory, sort by size (largest first) and print three tables.
    """
    global VERBOSE
    #  Get arguments
    #  col_name,col_size,col_date are the column position of file name, size and date.
    opts, file_in, top_dir, col_name, col_size, col_date = process_args()

    #  Set config: one (cutoff date, minimum size, output file) per report
    if '-c' in opts:
        config = read_config(opts['-c'])
    else:
        config = [(opts['age'], opts['min'], '-')]

    VERBOSE = '-v' in opts

    print_verbose(VERBOSE, "Starting to read file listing")

    #  Store data for each folder in a dict of DirRecord objects
    bottom_tree = collections.defaultdict(DirRecord)
    #  Read from stdin, or gzipped file, or plain text file.  All three are
    #  read as bytes, like gzip.open does by default, because read_line
    #  decodes each line itself.  sys.stdin has no 'buffer' on Python 2.
    if file_in == '-':
        fin = getattr(sys.stdin, 'buffer', sys.stdin)
    elif file_in.endswith(".gz"):
        fin = gzip.open(file_in)
    else:
        fin = open(file_in, 'rb')
    try:
        records = read_line(fin, col_name, col_size, col_date, opts)
        for lineno, (fname, size, datestr) in enumerate(records):
            if VERBOSE and (lineno % 1000000) == 0:
                print_verbose(VERBOSE, "Reading line %d" % lineno)
            bottom_dir = truncate_dir(fname)
            bottom_tree[bottom_dir].update(1, size, datestr, datestr)
    finally:
        #  Close the input even when a line fails to parse
        if file_in != '-':
            fin.close()
    if CNT_DATE_ERRORS > 0:
        print("WARNING: Number of lines with unparseable date fields: %d" % CNT_DATE_ERRORS)

    #  Combine data about folders
    #  Process every bottom directory
    print_verbose(VERBOSE, "Collecting bottom directory info into higher directories")
    all_tree = collections.defaultdict(DirRecord)
    for bottom_dir, record in bottom_tree.items():
        #  For every full dir path, get paths for each upper directory
        for upper_dir in get_upper_dirs(top_dir, bottom_dir):
            #  Combine bottom directory with upper directory info
            all_tree[upper_dir].combine(record)

    #  Do each report from config
    blocksize = opts['blocksize']
    sections = (
        ("#SCRATCH DIRECTORIES", "neither"),
        ("\n#HOME DIRECTORIES", "home"),
        ("\n#SHARED DIRECTORIES", "shared"),
    )
    for age, mindir, fname in config:

        print_verbose(VERBOSE, "Collecting directories old enough and large enough")

        #  Remove young and small directories
        eligible_names = [
            name for name, props in all_tree.items()
            if props.is_as_old(age) and props.is_as_large(mindir, blocksize)
        ]

        #  Remove sub-directories which are included in higher directories.
        #  In sorted order a sub-directory follows the directory that
        #  contains it, so comparing with the last kept name is enough.
        print_verbose(VERBOSE, "Keeping only top directories")
        top_dir_names = []
        prev_name = None
        for name in sorted(eligible_names):
            #  Keep this directory unless it lies below the last kept one
            if not (prev_name and name.startswith(prev_name + "/")):
                prev_name = name
                top_dir_names.append(name)
        #  Sort remaining directories by size and print
        top_dir_names = sorted(top_dir_names, reverse=True, key=lambda k: all_tree[k].size)

        #  Print this report
        print_verbose(VERBOSE, "Printing list of quailifying directories to file %s" % fname)
        fout = sys.stdout if fname == '-' else open(fname, "w")
        try:
            for title, group in sections:
                print(title, file=fout)
                print(format_table(group, top_dir_names, all_tree, blocksize), file=fout)
        finally:
            #  Close file, also when writing the report fails
            if fname != '-':
                fout.close()

		
#  Run the program only when executed as a script, not when imported
if __name__ == "__main__":
    main()