Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
dtree/dtree.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
354 lines (302 sloc)
12.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from __future__ import print_function | |
import os, gzip, pwd, sys, copy, time, getopt, collections | |
# Defaults applied when the matching command line option is absent
DEF_AGE = 120
# 120 days
DEF_MIN = 1024 ** 4
# 1 TB
DEF_BLOCKSIZE = 1
# 1 byte
DEF_DATE_FORMAT = None
# None: the date column is taken as-is, without strptime conversion

# Maximum number of table rows for rowformat
MAX_ROWS = 10 ** 6

# Count number of date errors on input
CNT_DATE_ERRORS = 0

# Progress messages are printed only when this is true
VERBOSE = False
class DirRecord:
    """Accumulated information about one directory.

    Attributes:
        size:    sum of the sizes added so far
        cnt:     number of files added so far
        mindate: smallest date string seen
        maxdate: largest date string seen

    Dates are compared as plain strings, so the initial sentinels
    '9999-99-99' and '0000-00-00' lose against any YYYY-mm-dd value.
    """

    def __init__(self):
        self.size = self.cnt = 0
        self.mindate, self.maxdate = '9999-99-99', '0000-00-00'

    def update(self, cnt, size, mindate, maxdate):
        """Add cnt files totalling size and widen the date range."""
        self.cnt += cnt
        self.size += size
        if mindate < self.mindate:
            self.mindate = mindate
        if maxdate > self.maxdate:
            self.maxdate = maxdate

    def combine(self, dirrecord):
        """Fold the totals of another DirRecord into this one."""
        self.update(dirrecord.cnt, dirrecord.size,
                    dirrecord.mindate, dirrecord.maxdate)

    def is_as_old(self, olddate):
        """Return True when the newest date is not later than olddate."""
        return olddate >= self.maxdate

    def is_as_large(self, size, blocksize=1):
        """Return True when blocksize * self.size reaches at least size."""
        return size <= blocksize * self.size

    def get_tuple(self):
        """Return (size, cnt, mindate, maxdate)."""
        return (self.size, self.cnt, self.mindate, self.maxdate)

    def __repr__(self):
        return "%s %s %s %s" % self.get_tuple()
def print_error(msg):
    """Print a single error message and halt the program.

    The message goes to standard error so it cannot end up inside a report
    written to standard output, and the process exits with status 1 so that
    calling scripts can detect the failure.

    Raises:
        SystemExit: always.
    """
    print(" ERROR: ", msg, file=sys.stderr)
    sys.exit(1)
def print_verbose(verbose, msg):
    """Print msg prefixed with the current local date and time.

    Nothing is printed when verbose is false.
    """
    if not verbose:
        return
    print(time.strftime("%Y-%m-%d %H:%M:%S"), msg)
def pad(s, width, left=False):
    """Pad or truncate s to exactly width characters.

    Args:
        s:     text to pad
        width: resulting length; zero or less gives an empty string
        left:  when true, s is left-justified and truncated on the right;
               otherwise s is right-justified and truncated on the left

    Returns:
        A string of length width (empty when width <= 0).
    """
    # Guard needed for the right-justified case: s[-0:] is the whole string,
    # not the empty string.
    if width <= 0:
        return ''
    if left:
        return s.ljust(width)[:width]
    return s.rjust(width)[-width:]
def rowformat(rows, max_rows=None):
    """Pad and justify rows, like the Unix command 'column -t'.

    Every cell is converted with str() and left-justified to the width of
    the widest cell in its column; columns are separated by one space and
    trailing blanks are stripped. Rows may have different lengths.

    Args:
        rows:     sequence of rows, each a sequence of cells. It must
                  support len() and is iterated more than once.
        max_rows: largest accepted number of rows; defaults to MAX_ROWS.

    Yields:
        One formatted line per row.

    Raises:
        SystemExit: when len(rows) exceeds max_rows.
    """
    if max_rows is None:
        max_rows = MAX_ROWS
    if len(rows) > max_rows:
        # Both values must be passed as one tuple to the % operator
        print("ERROR: Number of rows (%d) exceeds MAX_ROWS (%d)" % (len(rows), max_rows))
        sys.exit(1)

    # Convert once so widths and output are computed from the same text
    text_rows = [[str(item) for item in row] for row in rows]

    # Get maximum width for each column. Widths are updated by index so a
    # short row never discards the widths of columns it does not have.
    widths = []
    for cells in text_rows:
        if len(cells) > len(widths):
            widths.extend([0] * (len(cells) - len(widths)))
        for col, cell in enumerate(cells):
            if len(cell) > widths[col]:
                widths[col] = len(cell)

    for cells in text_rows:
        yield ' '.join(cell.ljust(widths[col]) for col, cell in enumerate(cells)).rstrip()
def read_config(fname):
    """Read the config file that describes the reports to produce.

    Blank lines and lines starting with '#' are ignored. Every other line
    must hold three whitespace-separated fields:
    AGE_DAYS MIN_DIRSIZE OUTPUT_FILE.

    Args:
        fname: path of the config file.

    Returns:
        A list of (cutoff_date, min_size, output_name) tuples, where
        cutoff_date comes from age2date and min_size from read_human.

    Raises:
        SystemExit: via print_error when a line is malformed.
    """
    config = []
    with open(fname) as fin:
        # Number lines from 1 so the message matches what an editor shows
        for lineno, line in enumerate(fin, 1):
            sline = line.strip()
            if not sline or sline.startswith("#"):
                continue
            fields = sline.split()
            if len(fields) != 3:
                print_error("Wrong number of arguments on line %d of config file %s" % (lineno, fname))
            # The output name gets its own variable: reusing 'fname' would
            # make later error messages name the wrong file.
            age, mindir, out_name = fields
            try:
                datestr = age2date(age)
                min_size = read_human(mindir)
            except ValueError:
                print_error("Invalid AGE_DAYS or MIN_DIRSIZE on line %d of config file %s" % (lineno, fname))
            config.append((datestr, min_size, out_name))
    return config
def write_human(n):
    """Format n with a K, M, G or T suffix (powers of 1024).

    Scaled values below 10 keep one decimal; larger ones are printed with
    none. Values below 1024 are printed as a plain integer.
    """
    for suffix, power in (('T', 4), ('G', 3), ('M', 2), ('K', 1)):
        unit = 1024 ** power
        if n < unit:
            continue
        scaled = n / float(unit)
        template = "%.1f%s" if scaled < 10 else "%.f%s"
        return template % (scaled, suffix)
    return "%d" % n
def read_human(nstr):
    """Convert a string such as '2.1T', '17G' or '4096' to an integer.

    A trailing K, M, G or T (either case) multiplies the number by the
    matching power of 1024; the result is truncated to an int.
    """
    powers = {'k': 1, 'm': 2, 'g': 3, 't': 4}
    suffix = nstr[-1].lower()
    if suffix not in powers:
        return int(float(nstr))
    return int(float(nstr[:-1]) * 1024 ** powers[suffix])
def age2date(daystr):
    """Return the local date daystr days ago as a YYYY-mm-dd string.

    daystr may be anything int() accepts.
    """
    seconds_back = 3600 * 24 * int(daystr)
    then = time.localtime(time.time() - seconds_back)
    return time.strftime("%Y-%m-%d", then)
def process_args():
    """Process the command line arguments and return them to main.

    Returns:
        (opts, file_in, top_dir, col_name, col_size, col_date) where opts is
        a dict holding the raw getopt options plus the derived keys 'age',
        'blocksize', 'min' and 'dateformat', and the three column values are
        zero-based indexes.

    Raises:
        SystemExit: via Usage when the command line is invalid.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'm:a:c:b:vF:')
    except getopt.GetoptError as err:
        # Unknown option or missing option argument
        Usage(str(err))
    opts = dict(opts)

    try:
        opts['age'] = age2date(opts['-a']) if '-a' in opts else age2date(DEF_AGE)
        opts['blocksize'] = read_human(opts['-b']) if '-b' in opts else DEF_BLOCKSIZE
        opts['min'] = read_human(opts['-m']) if '-m' in opts else DEF_MIN
    except (ValueError, IndexError):
        # IndexError covers an empty value handed to read_human
        Usage("Options -a, -b and -m require numeric values")
    opts['dateformat'] = opts['-F'] if '-F' in opts else DEF_DATE_FORMAT

    # Sanity check
    if '-c' in opts and ('-a' in opts or '-m' in opts):
        Usage("You cannot use options -a or -m with -c")

    try:
        file_in, top_dir = args[:2]
        # Columns are numbered from 1 on the command line
        col_name, col_size, col_date = [int(x) - 1 for x in args[2:5]]
    except ValueError:
        Usage("Expected arguments: FILELIST TOPDIR COL_NAME COL_SIZE COL_DATE")
    if min(col_name, col_size, col_date) < 0:
        Usage("Column numbers start with 1")
    return opts, file_in, top_dir, col_name, col_size, col_date
def read_line(fin, col_name, col_size, col_date, opts):
    """Read lines from the file list and yield the fields of interest.

    Args:
        fin:      iterable of lines, either bytes or text.
        col_name: zero-based column of the file name.
        col_size: zero-based column of the file size.
        col_date: zero-based column of the (first) date field.
        opts:     dict; opts['dateformat'] is a strptime format, or a false
                  value when the date column is already YYYY-mm-dd. A format
                  made of several space-separated parts consumes that many
                  adjacent columns.

    Yields:
        (name, size, date) with size as int and date as YYYY-mm-dd string.
        Lines with too few columns are skipped silently; lines whose date
        cannot be parsed are skipped and counted in CNT_DATE_ERRORS.
    """
    global CNT_DATE_ERRORS
    dateformat = opts['dateformat']
    num_date_cols = len(dateformat.split()) if dateformat else 1
    max_col = max(col_name, col_size, col_date + num_date_cols - 1)

    for lineno, line in enumerate(fin):
        # gzip and binary files deliver bytes, text files and stdin may
        # deliver str; only bytes need decoding.
        if isinstance(line, bytes):
            line = line.decode()
        parts = line.split(None, max_col + 1)
        # Index max_col must exist, so at least max_col+1 fields are needed
        if len(parts) <= max_col:
            continue

        if dateformat:
            # Join adjacent date fields into one space-delimited string
            raw_date = " ".join(parts[col_date:col_date + num_date_cols])
            try:
                parsed = time.strptime(raw_date, dateformat)
            except ValueError:
                CNT_DATE_ERRORS += 1
                print_verbose(VERBOSE, "Cannot parse date field from line %d: (%s)" % (lineno+1, raw_date))
                continue
            # Convert to the standard date format YYYY-mm-dd
            date_field = time.strftime("%Y-%m-%d", parsed)
        else:
            # Assume date field already in standard date format
            date_field = parts[col_date]

        yield parts[col_name], int(parts[col_size]), date_field
def truncate_dir(fname):
    """Remove the trailing file name from a path, leaving just the directory.

    A path without any "/" is returned unchanged.

    Note: This is 20% faster than using os.path.dirname(fname)
    and it is called *a lot* (about 400 million times)
    """
    slash = fname.rfind("/")
    if slash < 0:
        return fname
    return fname[:slash]
# Adapted from https://stackoverflow.com/questions/1830618/how-to-find-the-owner-of-a-file-or-directory-in-python/1830635
def find_owner(filename):
    """Return the user name owning filename, or "N/A" when it is unknown.

    "N/A" is returned when the path cannot be stat'ed or when its uid has
    no entry in the password database.
    """
    try:
        uid = os.stat(filename).st_uid
        entry = pwd.getpwuid(uid)
    except (OSError, KeyError):
        return "N/A"
    return entry.pw_name
def get_dir_group(group_name, fnames):
    """Return the directories in fnames that belong to group_name.

    'home' and 'shared' select paths below the matching scratch file system
    prefix, 'neither' selects paths below neither of them. Any other
    group_name returns fnames itself, unfiltered.
    """
    home_prefix = "/gpfs/scratchfs1/home/"
    shared_prefix = "/gpfs/scratchfs1/shared/"

    if group_name == 'neither':
        known = (shared_prefix, home_prefix)
        return [path for path in fnames if not path.startswith(known)]

    for label, prefix in (('home', home_prefix), ('shared', shared_prefix)):
        if group_name == label:
            return [path for path in fnames if path.startswith(prefix)]

    return fnames
def get_upper_dirs(top_dir, name):
    """Yield name and each of its parent directories below top_dir.

    Example:
        top_dir = /gpfs/scratchfs1
        name    = /gpfs/scratchfs1/CREST/Blue/stat/snapshot
    Yields:
        /gpfs/scratchfs1/CREST/Blue/stat/snapshot
        /gpfs/scratchfs1/CREST/Blue/stat
        /gpfs/scratchfs1/CREST/Blue
        /gpfs/scratchfs1/CREST

    name itself is always yielded, even when it equals top_dir. The walk
    stops when a parent equals top_dir, when the path becomes empty, or
    when there is no "/" left to strip (e.g. a relative single-component
    path). A name that is not below top_dir is walked up to its root.
    """
    # Remove trailing slash from top_dir
    if top_dir.endswith("/"):
        top_dir = top_dir[:-1]
    while True:
        yield name
        slash = name.rfind("/")
        # Without a "/" the path cannot get any shorter; stopping here
        # prevents yielding the same name forever.
        if slash < 0:
            break
        name = name[:slash]
        if name == top_dir or not name:
            break
def Usage(msg=None):
    """Print the usage text, optionally followed by msg, and exit.

    Args:
        msg: optional error message printed after the usage text.

    Raises:
        SystemExit: always. The exit status is 2 when msg is given, so a
        command line error is visible to the calling shell, and 0 otherwise.
    """
    print("""
Usage: dtree [-a AGE_DAYS] [-m MIN_DIRSIZE] FILELIST TOPDIR COL_NAME COL_SIZE COL_DATE
Usage: dtree [-c CONFIG ] FILELIST TOPDIR COL_NAME COL_SIZE COL_DATE
Parmaters:
FILELIST - File with list of file system info. Each line contains the name
of a file and its directory path, size of the file, and the age of the file.
FILELIST can be a plain text file, a gzip'ed plain text file if FILELIST
ends in .gz, or standard input if FILELIST a hyphen (-)
TOPDIR - Directory or directories to strip off from the top of each file path, when
displaying information in the output
COL_NAME - Which column contains the file names. Columns numbered starting with 1.
COL_SIZE - Which column contains the file size. The units are determined by -b option.
Default is bytes.
COL_DATE - Which column contains the date. Default format is YYYY-mm-dd (e.g. 2010-12-05).
You can use the -F option to specify a different data format.
Options:
-a AGE_DAYS Only print directories older than AGE_DAYS: default is %d days
-m MIN_DIRSIZE Minimum directory size: default is %s. You can use K,M,G and T suffixes.
-c CONFIG Read AGE_DAYS, MIN_DIRSIZE, and output file name from CONFIG file
Each line produced separate output.
-b BLOCKSIZE Size of blocks used in COL_SIZE. Default is 1 byte.
You can use K,M,G and T suffixes.
-F DATE_FORMAT Date format for date in COL_DATE. Default is %%Y-%%m-%%d. See 'date'
command for allowed formats.
Config Example:
# Comments begin with '#'
7 1T dtree-007.out
30 1T dtree-030.out
60 1T dtree-060.out
90 1T dtree-090.out
120 1T dtree-120.out
""" % (DEF_AGE, write_human(DEF_MIN)))
    # Print error msg
    if msg:
        print()
        print(" ", msg)
        print()
        sys.exit(2)
    sys.exit()
def format_table(group_str, top_dir_names, all_tree, blocksize):
    """Return the report table for one group of directories as a string.

    Args:
        group_str:     group selector handed to get_dir_group.
        top_dir_names: candidate directory names.
        all_tree:      mapping from directory name to its DirRecord.
        blocksize:     factor that converts recorded sizes to bytes.

    Returns:
        Header, ruler and one line per selected directory, joined with
        newlines. Columns are left-justified by rowformat.
    """
    header = ("DIR-SIZE", "FILE-CNT", "AVG-FILE-SIZE", "DATE-NEWEST", "OWNER", "DIRECTORY")
    ruler = ("========", "========", "=============", "===========", "=========", "================")
    table = [header, ruler]

    for path in get_dir_group(group_str, top_dir_names):
        record = all_tree[path]
        total = blocksize * record.size
        # directory size, file count, avg file size, newest file date,
        # directory owner, directory name
        table.append((
            write_human(total),
            record.cnt,
            write_human(total / record.cnt),
            record.maxdate,
            find_owner(path),
            path,
        ))

    return "\n".join(rowformat(table))
def _open_listing(file_in):
    """Open the file listing as a stream of byte lines.

    Returns (stream, needs_close). Standard input is used for '-' and must
    not be closed; names ending in .gz are opened with gzip. Everything is
    opened in binary mode because read_line decodes each line itself.
    """
    if file_in == '-':
        # Python 3 exposes the byte stream as sys.stdin.buffer
        return getattr(sys.stdin, "buffer", sys.stdin), False
    if file_in.endswith(".gz"):
        return gzip.open(file_in), True
    return open(file_in, "rb"), True


def _select_top_dirs(eligible_names):
    """Drop every directory that lies below another eligible directory."""
    top_dir_names = []
    prev_name = None
    # Sort by path components, not by raw string: characters such as '-'
    # and '.' sort before '/', so a plain sort can place "proj-old" between
    # "proj" and "proj/sub" and hide that "proj/sub" is a subdirectory.
    for name in sorted(eligible_names, key=lambda path: path.split("/")):
        if prev_name and name.startswith(prev_name + "/"):
            # Subdirectory of a directory that is already kept
            continue
        prev_name = name
        top_dir_names.append(name)
    return top_dir_names


def _write_report(fout, top_dir_names, all_tree, blocksize):
    """Write the three report sections to the open file fout."""
    print("#SCRATCH DIRECTORIES", file=fout)
    print(format_table("neither", top_dir_names, all_tree, blocksize), file=fout)
    print("\n#HOME DIRECTORIES", file=fout)
    print(format_table("home", top_dir_names, all_tree, blocksize), file=fout)
    print("\n#SHARED DIRECTORIES", file=fout)
    print(format_table("shared", top_dir_names, all_tree, blocksize), file=fout)


def main():
    """Read the file listing, aggregate it per directory and write reports.

    Steps: parse the command line, accumulate one DirRecord per directory
    that directly contains files, roll those up into every parent directory
    below TOPDIR, then for each configured report keep the directories that
    are old and large enough, drop nested ones, and print the rest sorted by
    size.
    """
    global VERBOSE
    # col_name,col_size,col_date are the column position of file name, size and date.
    opts, file_in, top_dir, col_name, col_size, col_date = process_args()

    # Set config: either the reports from the config file or a single
    # report written to standard output
    if '-c' in opts:
        config = read_config(opts['-c'])
    else:
        config = [(opts['age'], opts['min'], '-')]
    VERBOSE = '-v' in opts

    print_verbose(VERBOSE, "Starting to read file listing")
    # Store data for each folder in a dict of DirRecord objects
    bottom_tree = collections.defaultdict(DirRecord)
    fin, close_fin = _open_listing(file_in)
    try:
        for lineno, (fname, size, datestr) in enumerate(read_line(fin, col_name, col_size, col_date, opts)):
            if VERBOSE and (lineno % 1000000) == 0:
                print_verbose(VERBOSE, "Reading line %d" % lineno)
            bottom_tree[truncate_dir(fname)].update(1, size, datestr, datestr)
    finally:
        if close_fin:
            fin.close()
    if CNT_DATE_ERRORS > 0:
        print("WARNING: Number of lines with unparseable date fields: %d" % CNT_DATE_ERRORS)

    # Combine data about folders: add every bottom directory to itself and
    # to each of its upper directories
    print_verbose(VERBOSE, "Collecting bottom directory info into higher directories")
    all_tree = collections.defaultdict(DirRecord)
    for bottom_dir, record in bottom_tree.items():
        for upper_dir in get_upper_dirs(top_dir, bottom_dir):
            all_tree[upper_dir].combine(record)

    # Do each report from config
    blocksize = opts['blocksize']
    for age, mindir, fname in config:
        print_verbose(VERBOSE, "Collecting directories old enough and large enough")
        eligible_names = [
            name for name, props in all_tree.items()
            if props.is_as_old(age) and props.is_as_large(mindir, blocksize)
        ]

        # Remove sub-directories which are included in higher directories
        print_verbose(VERBOSE, "Keeping only top directories")
        top_dir_names = _select_top_dirs(eligible_names)

        # Sort remaining directories by size, largest first
        top_dir_names = sorted(top_dir_names, reverse=True, key=lambda k: all_tree[k].size)

        # Print this report
        print_verbose(VERBOSE, "Printing list of quailifying directories to file %s" % fname)
        if fname == '-':
            _write_report(sys.stdout, top_dir_names, all_tree, blocksize)
        else:
            fout = open(fname, "w")
            try:
                _write_report(fout, top_dir_names, all_tree, blocksize)
            finally:
                fout.close()
# Run only when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()