Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 354 lines (302 sloc) 12.7 KB
#!/usr/bin/python
from __future__ import print_function
import os, gzip, pwd, sys, copy, time, getopt, collections
DEF_AGE = 120 # Default age threshold in days, used when -a is not given
DEF_MIN = 1024**4 # Default minimum directory size (1 TB in bytes), used when -m is not given
DEF_BLOCKSIZE = 1 # Default unit of the size column (1 byte), used when -b is not given
MAX_ROWS = 1000000 # Maximum number of table rows for rowformat
DEF_DATE_FORMAT = None # Default for -F; None means the date column is already YYYY-mm-dd
CNT_DATE_ERRORS = 0 # Count number of date errors on input
VERBOSE = False # Set by main() from the -v option; enables timestamped progress messages
class DirRecord:
    """Accumulated statistics for one directory.

    Tracks the total size, the number of files, and the oldest and newest
    date strings seen. Dates are compared as plain strings, so they are
    expected to be in YYYY-mm-dd form.
    """

    def __init__(self):
        self.size, self.cnt = 0, 0
        # Sentinels chosen so that any real date replaces them on first update
        self.mindate, self.maxdate = '9999-99-99', '0000-00-00'

    def update(self, cnt, size, mindate, maxdate):
        """Add `cnt` files totalling `size` and widen the date range."""
        self.cnt = self.cnt + cnt
        self.size = self.size + size
        if mindate < self.mindate:
            self.mindate = mindate
        if maxdate > self.maxdate:
            self.maxdate = maxdate

    def combine(self, dirrecord):
        """Fold the totals of another DirRecord into this one."""
        self.update(dirrecord.cnt, dirrecord.size,
                    dirrecord.mindate, dirrecord.maxdate)

    def is_as_old(self, olddate):
        """Return True when the newest file is dated on or before `olddate`."""
        return olddate >= self.maxdate

    def is_as_large(self, size, blocksize=1):
        """Return True when the stored size times `blocksize` reaches `size`."""
        return size <= blocksize*self.size

    def get_tuple(self):
        """Return (size, cnt, mindate, maxdate)."""
        return (self.size, self.cnt, self.mindate, self.maxdate)

    def __repr__(self):
        return "%s %s %s %s" % self.get_tuple()
def print_error(msg):
    """Print a single error message and halt the program.

    The process exits with status 1 so that shells, cron jobs and wrapper
    scripts can detect the failure (a bare sys.exit() would report success).
    """
    print(" ERROR: ", msg)
    sys.exit(1)
def print_verbose(verbose, msg):
    """Print `msg` prefixed with the current local date and time.

    Nothing is printed when `verbose` is false.
    """
    if not verbose:
        return
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(stamp, msg)
def pad(s, width, left=False):
    """Pad or cut `s` to `width` characters.

    With left=True the text is left-justified and cut from the right;
    otherwise it is right-justified and cut from the left.
    """
    if left:
        return s.ljust(width)[:width]
    return s.rjust(width)[-width:]
def rowformat(rows):
    """Pad and justify rows into aligned columns, like Unix 'column -t'.

    `rows` is a sequence of row sequences; it must support len() and be
    iterable twice. Each cell is converted with str() and left-justified to
    the widest cell in its column. Rows may have different lengths.

    Yields one formatted line per row, with trailing blanks removed.
    Prints an error and exits with status 1 if there are more than MAX_ROWS
    rows (the check runs when the first line is requested, because this is a
    generator).
    """
    if len(rows) > MAX_ROWS:
        # Both values must be passed as one tuple to the % operator
        print("ERROR: Number of rows (%d) exceeds MAX_ROWS (%d)" % (len(rows), MAX_ROWS))
        sys.exit(1)

    # Widest cell seen so far in each column
    widths = []
    for row in rows:
        cells = [str(item) for item in row]
        # Extend widths if this row has more columns than any before it
        if len(cells) > len(widths):
            widths.extend([0] * (len(cells) - len(widths)))
        # Update by index so that a short row never shrinks the widths list
        for idx, cell in enumerate(cells):
            if len(cell) > widths[idx]:
                widths[idx] = len(cell)

    for row in rows:
        yield ' '.join(pad(str(item), width, True) for width, item in zip(widths, row)).rstrip()
def read_config(fname):
    """Read the config file that describes the reports to produce.

    Blank lines and lines starting with '#' are skipped. Every other line
    must hold three whitespace-separated fields: age in days, minimum
    directory size (K/M/G/T suffixes allowed), and output file name.

    Returns a list of (cutoff_date, min_size, output_name) tuples, where
    cutoff_date comes from age2date and min_size from read_human.
    Malformed lines are reported through print_error, which halts the program.
    """
    config = []
    with open(fname) as fin:
        # Number lines from 1 so error messages match what an editor shows
        for lineno, line in enumerate(fin, 1):
            sline = line.strip()
            if not sline or sline.startswith("#"):
                continue
            fields = sline.split()
            if len(fields) != 3:
                print_error("Wrong number of arguments on line %d of config file %s" % (lineno, fname))
            # Use a separate name for the output file so `fname` keeps
            # referring to the config file in later error messages
            age, mindir, outname = fields
            try:
                entry = (age2date(age), read_human(mindir), outname)
            except ValueError:
                print_error("Invalid age or size on line %d of config file %s" % (lineno, fname))
            config.append(entry)
    return config
def write_human(n):
    """Format a number using the largest fitting K, M, G or T unit (powers of 1024).

    Scaled values below 10 get one decimal place, larger ones none.
    Values below 1024 are printed as a plain integer.
    """
    for exponent, symbol in ((4, 'T'), (3, 'G'), (2, 'M'), (1, 'K')):
        unit = 1024**exponent
        if n >= unit:
            scaled = n/float(unit)
            template = "%.1f%s" if scaled < 10 else "%.f%s"
            return template % (scaled, symbol)
    return "%d" % n
def read_human(nstr):
    """Convert a string such as '2.1T', '17G' or '512' to an integer.

    A trailing K, M, G or T (either case) multiplies the number by the
    matching power of 1024; without a suffix the number is used as is.
    The result is truncated to an integer.
    """
    exponents = {'t': 4, 'g': 3, 'm': 2, 'k': 1}
    suffix = nstr[-1].lower()
    if suffix in exponents:
        digits, factor = nstr[:-1], 1024**exponents[suffix]
    else:
        digits, factor = nstr, 1
    return int(float(digits)*factor)
def age2date(daystr):
    """Return the local date, as YYYY-mm-dd, that lies `daystr` days before now.

    `daystr` is anything int() accepts (a string or an integer).
    """
    seconds_back = 3600*24*int(daystr)
    cutoff = time.localtime(time.time() - seconds_back)
    return time.strftime("%Y-%m-%d", cutoff)
def process_args():
    """Parse the command line and return the settings for main().

    Returns (opts, file_in, top_dir, col_name, col_size, col_date). `opts` is
    a dict holding the raw getopt options plus the derived keys 'age'
    (cutoff date string), 'blocksize', 'min' and 'dateformat'. The three
    column values are converted from 1-based arguments to 0-based indexes.

    Any problem with the command line (unknown option, invalid option value,
    missing or non-numeric positional arguments) ends in Usage(), which
    prints the help text and exits.
    """
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'm:a:c:b:vF:')
    except getopt.GetoptError as err:
        # Unknown option or missing option argument
        Usage(str(err))
    opts = dict(optlist)

    try:
        opts['age'] = age2date(opts['-a']) if '-a' in opts else age2date(DEF_AGE)
        opts['blocksize'] = read_human(opts['-b']) if '-b' in opts else DEF_BLOCKSIZE
        opts['min'] = read_human(opts['-m']) if '-m' in opts else DEF_MIN
    except (ValueError, IndexError):
        # Non-numeric value, or an empty string handed to read_human
        Usage("Invalid value for option -a, -b or -m")
    opts['dateformat'] = opts['-F'] if '-F' in opts else DEF_DATE_FORMAT

    # Sanity check
    if '-c' in opts and ('-a' in opts or '-m' in opts):
        Usage("You cannot use options -a or -m with -c")

    try:
        file_in, top_dir = args[:2]
        col_name, col_size, col_date = [int(x)-1 for x in args[2:5]]
    except ValueError:
        # Too few positional arguments or a non-integer column number
        Usage()

    # A column of 0 or less would silently index from the end of each line
    if min(col_name, col_size, col_date) < 0:
        Usage("Column numbers start at 1")

    return opts, file_in, top_dir, col_name, col_size, col_date
def read_line(fin, col_name, col_size, col_date, opts):
    """Yield (name, size, date) for each usable line of the file listing.

    fin      - iterable of lines, either bytes or text
    col_name - 0-based column index of the file name
    col_size - 0-based column index of the file size (converted with int())
    col_date - 0-based column index of the (first) date field
    opts     - dict with key 'dateformat': a strptime format, or a false
               value when the date column is already YYYY-mm-dd. A format
               containing blanks spans that many adjacent columns.

    Lines with too few columns are skipped. Lines whose date cannot be
    parsed are skipped and counted in the global CNT_DATE_ERRORS.
    """
    global CNT_DATE_ERRORS
    dateformat = opts['dateformat']
    num_date_cols = len(dateformat.split()) if dateformat else 1
    # Highest column index that will be read from each line
    max_col = max(col_name, col_size, col_date + num_date_cols - 1)

    for lineno, line in enumerate(fin, 1):
        # gzip and binary files give bytes; text files and stdin may give str
        if isinstance(line, bytes):
            line = line.decode("utf-8", "replace")
        parts = line.split(None, max_col + 1)
        # Index max_col must exist, so at least max_col+1 fields are needed
        if len(parts) <= max_col:
            continue

        if dateformat:
            # Join adjacent date fields into one space-delimited string
            date_text = " ".join(parts[col_date:col_date + num_date_cols])
            try:
                parsed = time.strptime(date_text, dateformat)
            except ValueError:
                CNT_DATE_ERRORS += 1
                print_verbose(VERBOSE, "Cannot parse date field from line %d: (%s)" % (lineno, date_text))
                continue
            # Convert to the standard date format YYYY-mm-dd
            date_field = time.strftime("%Y-%m-%d", parsed)
        else:
            # Assume date field already in standard date format
            date_field = parts[col_date]

        yield parts[col_name], int(parts[col_size]), date_field
def truncate_dir(fname):
    """Remove the trailing file name from a path, leaving just the directory.

    A path without any "/" is returned unchanged.
    """
    # Note: rfind is kept on purpose; the original author measured it as
    # about 20% faster than os.path.dirname, and this runs once per input line.
    slash = fname.rfind("/")
    if slash < 0:
        return fname
    return fname[:slash]
# Adapted from https://stackoverflow.com/questions/1830618/how-to-find-the-owner-of-a-file-or-directory-in-python/1830635
def find_owner(filename):
    """Return the login name of the owner of `filename`.

    Returns "N/A" when the path cannot be stat'ed or its uid has no
    entry in the password database.
    """
    try:
        uid = os.stat(filename).st_uid
        entry = pwd.getpwuid(uid)
    except (OSError, KeyError):
        return "N/A"
    return entry.pw_name
def get_dir_group(group_name, fnames):
    """Select the paths of `fnames` that belong to one group.

    group_name 'home' keeps paths under the home prefix, 'shared' keeps
    paths under the shared prefix, and 'neither' keeps paths under
    neither of them. Any other group name returns `fnames` itself.
    """
    home_prefix = "/gpfs/scratchfs1/home/"
    shared_prefix = "/gpfs/scratchfs1/shared/"
    selectors = {
        'home': lambda path: path.startswith(home_prefix),
        'shared': lambda path: path.startswith(shared_prefix),
        'neither': lambda path: not path.startswith((shared_prefix, home_prefix)),
    }
    keep = selectors.get(group_name)
    if keep is None:
        return fnames
    return [path for path in fnames if keep(path)]
# Get component directories in name, except for top_dir and above
# Example:
#   top_dir = /gpfs/scratchfs1
#   name    = /gpfs/scratchfs1/CREST/Blue/stat/snapshot
# Yields:
#   /gpfs/scratchfs1/CREST/Blue/stat/snapshot
#   /gpfs/scratchfs1/CREST/Blue/stat
#   /gpfs/scratchfs1/CREST/Blue
#   /gpfs/scratchfs1/CREST
def get_upper_dirs(top_dir, name):
    """Yield `name` and each of its parent directories, stopping below top_dir.

    top_dir - directory at which the walk stops; a trailing "/" is ignored.
              top_dir itself and anything above it are never yielded.
    name    - directory path to start from.

    A path that is not under top_dir is walked up to its first component.
    Nothing is yielded when `name` is empty or equal to top_dir.
    """
    # Remove trailing slash from top_dir
    if top_dir.endswith("/"):
        top_dir = top_dir[:-1]
    # Test before yielding so that top_dir and its ancestors are never
    # reported, even for files that sit directly in top_dir
    while name and name != top_dir:
        yield name
        pos = name.rfind("/")
        if pos < 0:
            # Relative path with no parent left; stopping here avoids
            # yielding the same name forever
            break
        name = name[:pos]
def Usage(msg=None):
    """Print the usage text, then the optional error `msg`, and exit.

    This function is only called for command-line errors, so the process
    exits with status 2 to let callers detect the failure.
    """
    print("""
Usage: dtree [-a AGE_DAYS] [-m MIN_DIRSIZE] FILELIST TOPDIR COL_NAME COL_SIZE COL_DATE
Usage: dtree [-c CONFIG ] FILELIST TOPDIR COL_NAME COL_SIZE COL_DATE
Parmaters:
FILELIST - File with list of file system info. Each line contains the name
of a file and its directory path, size of the file, and the age of the file.
FILELIST can be a plain text file, a gzip'ed plain text file if FILELIST
ends in .gz, or standard input if FILELIST a hyphen (-)
TOPDIR - Directory or directories to strip off from the top of each file path, when
displaying information in the output
COL_NAME - Which column contains the file names. Columns numbered starting with 1.
COL_SIZE - Which column contains the file size. The units are determined by -b option.
Default is bytes.
COL_DATE - Which column contains the date. Default format is YYYY-mm-dd (e.g. 2010-12-05).
You can use the -F option to specify a different data format.
Options:
-a AGE_DAYS Only print directories older than AGE_DAYS: default is %d days
-m MIN_DIRSIZE Minimum directory size: default is %s. You can use K,M,G and T suffixes.
-c CONFIG Read AGE_DAYS, MIN_DIRSIZE, and output file name from CONFIG file
Each line produced separate output.
-b BLOCKSIZE Size of blocks used in COL_SIZE. Default is 1 byte.
You can use K,M,G and T suffixes.
-F DATE_FORMAT Date format for date in COL_DATE. Default is %%Y-%%m-%%d. See 'date'
command for allowed formats.
Config Example:
# Comments begin with '#'
7 1T dtree-007.out
30 1T dtree-030.out
60 1T dtree-060.out
90 1T dtree-090.out
120 1T dtree-120.out
""" % (DEF_AGE, write_human(DEF_MIN)) )
    # Print error msg
    if msg:
        print()
        print(" ", msg)
        print()
    sys.exit(2)
def format_table(group_str, top_dir_names, all_tree, blocksize):
    """Return the report table for one group of directories as a single string.

    group_str     - group passed to get_dir_group to select from top_dir_names
    top_dir_names - candidate directory names, already in output order
    all_tree      - mapping from directory name to its DirRecord
    blocksize     - multiplier applied to the stored sizes

    The table starts with two header rows; columns are aligned by rowformat,
    which left-justifies each cell.
    """
    header = ( "DIR-SIZE", "FILE-CNT", "AVG-FILE-SIZE", "DATE-NEWEST", "OWNER", "DIRECTORY" )
    ruler = ( "========", "========", "=============", "===========", "=========", "================" )
    table = [header, ruler]
    for dir_name in get_dir_group(group_str, top_dir_names):
        record = all_tree[dir_name]
        total = blocksize*record.size
        # Columns: directory size, file count, avg file size, newest file date, owner, name
        table.append((
            write_human(total),
            record.cnt,
            write_human(total/record.cnt),
            record.maxdate,
            find_owner(dir_name),
            dir_name,
        ))
    return "\n".join(rowformat(table))
def _keep_top_level(names):
    """Return `names` without any path that lies beneath another listed path."""
    kept = []
    prev_name = None
    # Sort by path components, not by raw string: in a plain string sort a
    # sibling such as "/a.old" lands between "/a" and "/a/b" (because "." sorts
    # before "/"), which would hide the fact that "/a/b" is beneath "/a".
    for name in sorted(names, key=lambda path: path.split("/")):
        if prev_name and name.startswith(prev_name + "/"):
            # This directory is a subdirectory of the last one kept
            continue
        prev_name = name
        kept.append(name)
    return kept


def main():
    """Read the file listing, aggregate per directory, and write each report."""
    global VERBOSE

    # col_name, col_size, col_date are the column positions of file name, size and date
    opts, file_in, top_dir, col_name, col_size, col_date = process_args()

    # Each config entry is (cutoff date, minimum size, output file name)
    if '-c' in opts:
        config = read_config(opts['-c'])
    else:
        config = [(opts['age'], opts['min'], '-')]

    VERBOSE = '-v' in opts
    print_verbose(VERBOSE, "Starting to read file listing")

    # Store data for each folder in a dict of DirRecord objects
    bottom_tree = collections.defaultdict(DirRecord)

    # Read from stdin, or gzipped file, or plain file. Every source is opened
    # in binary mode so that read_line always receives undecoded lines.
    if file_in == '-':
        fin = getattr(sys.stdin, "buffer", sys.stdin)
    elif file_in.endswith(".gz"):
        fin = gzip.open(file_in)
    else:
        fin = open(file_in, "rb")
    try:
        for lineno, (fname, size, datestr) in enumerate(read_line(fin, col_name, col_size, col_date, opts)):
            if VERBOSE and (lineno % 1000000) == 0:
                print_verbose(VERBOSE, "Reading line %d" % lineno)
            bottom_tree[truncate_dir(fname)].update(1, size, datestr, datestr)
    finally:
        # Close the input even if a line fails to parse
        if file_in != '-':
            fin.close()

    if CNT_DATE_ERRORS > 0:
        print("WARNING: Number of lines with unparseable date fields: %d" % CNT_DATE_ERRORS)

    # Add every bottom directory's totals to each directory above it
    print_verbose(VERBOSE, "Collecting bottom directory info into higher directories")
    all_tree = collections.defaultdict(DirRecord)
    for bottom_dir, record in bottom_tree.items():
        for upper_dir in get_upper_dirs(top_dir, bottom_dir):
            all_tree[upper_dir].combine(record)

    # Do each report from config
    blocksize = opts['blocksize']
    for age, mindir, fname in config:
        # Remove young and small directories
        print_verbose(VERBOSE, "Collecting directories old enough and large enough")
        eligible_names = [
            name for name, props in all_tree.items()
            if props.is_as_old(age) and props.is_as_large(mindir, blocksize)
        ]

        # Remove sub-directories which are included in higher directories
        print_verbose(VERBOSE, "Keeping only top directories")
        top_dir_names = _keep_top_level(eligible_names)

        # Sort remaining directories by size, largest first
        top_dir_names = sorted(top_dir_names, reverse=True, key=lambda k: all_tree[k].size)

        # Print this report
        print_verbose(VERBOSE, "Printing list of quailifying directories to file %s" % fname)
        fout = sys.stdout if fname == '-' else open(fname, "w")
        try:
            print("#SCRATCH DIRECTORIES", file=fout)
            print(format_table("neither", top_dir_names, all_tree, blocksize), file=fout)
            print("\n#HOME DIRECTORIES", file=fout)
            print(format_table("home", top_dir_names, all_tree, blocksize), file=fout)
            print("\n#SHARED DIRECTORIES", file=fout)
            print(format_table("shared", top_dir_names, all_tree, blocksize), file=fout)
        finally:
            # Close the report file even if formatting fails
            if fname != '-':
                fout.close()
# Run the report only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()