diff --git a/DB.py b/DB.py
new file mode 100644
index 0000000..e542f29
--- /dev/null
+++ b/DB.py
@@ -0,0 +1,51 @@
+# Provide stateful DB access accessible across functions
+import sqlite3
class DB:
    """Thin stateful wrapper around one sqlite3 connection.

    Holds a single connection and cursor for the object's lifetime so DB
    access can be shared across functions. Callers pass raw SQL strings.
    """

    def __init__(self, dbpath):
        # dbpath: filesystem path of the sqlite database (':memory:' also works).
        self.db = sqlite3.connect(dbpath)
        self.dbc = self.db.cursor()

    def write(self, sqls):
        """Execute one statement (string) or several (list/tuple), then commit once."""
        # Accept a bare string for convenience; normalize to a list.
        # (isinstance replaces the old type(...) in (type(()), type([])) check.)
        if not isinstance(sqls, (list, tuple)):
            sqls = [sqls]
        for sql in sqls:
            self.dbc.execute(sql)
        self.db.commit()

    def read(self, sql):
        """Execute a query and return all result rows as a list of tuples."""
        self.dbc.execute(sql)
        return self.dbc.fetchall()

    def close(self):
        """Close the underlying connection (any uncommitted work is discarded)."""
        self.db.close()

    def create_tables(self):
        """Create the three footprint tables. Only needed when creating the database."""
        CREATE_TABLE_SQLS = [
            """CREATE TABLE IF NOT EXISTS user_footprint (
                datetime text,
                userid integer,
                user text,
                rank integer,
                count integer,
                footprint integer,
                primary key (datetime,user)
            );
            """,
            """CREATE TABLE IF NOT EXISTS homedir_footprint (
                datetime text,
                dir text,
                rank integer,
                count integer,
                footprint integer,
                primary key (datetime,dir)
            );
            """,
            """CREATE TABLE IF NOT EXISTS shareddir_footprint (
                datetime text,
                dir text,
                rank integer,
                count integer,
                footprint integer,
                primary key (datetime,dir)
            );
            """
        ]
        self.write(CREATE_TABLE_SQLS)
+
diff --git a/User.py b/User.py
new file mode 100644
index 0000000..f235278
--- /dev/null
+++ b/User.py
@@ -0,0 +1,34 @@
+#!/usr/bin/python
+
+import subprocess
+def _ncheck_output(cmds):
+ if type(cmds )!=type([]): cmds = [cmds.split()]
+ if type(cmds[0])!=type([]): cmds = [cmds]
+ ncmds = len(cmds)
+ ps = ncmds*[None]
+ for i,cmd in enumerate(cmds):
+ if i==0:
+ ps[i] = subprocess.Popen(cmds[i], stdout=subprocess.PIPE, shell=False)
+ else:
+ ps[i] = subprocess.Popen(cmds[i], stdin=ps[i-1].stdout, stdout=subprocess.PIPE, shell=False)
+ for i in range(ncmds-1):
+ ps[i].stdout.close()
+ return ps[-1].communicate()[0]
+
class User:
    """Lazy uid -> username lookup backed by 'getent passwd' run over ssh.

    The passwd map is fetched once, from host 'head1', on first lookup.
    """

    def __init__(self):
        # data: {uid (int): username (str)}; populated by load_data().
        self.data = {}
        self.is_loaded = False

    def get_user(self, uid):
        """Return the username for uid (int or str), or "N/A-<uid>" if unknown."""
        # Fix: consult the is_loaded flag (previously set but never read)
        # instead of testing the dict, so an empty passwd result does not
        # trigger a fresh ssh call on every lookup.
        if not self.is_loaded:
            self.load_data()
        return self.data.get(int(uid), "N/A-" + str(uid))

    def load_data(self):
        """Populate self.data from 'getent passwd' on head1, skipping blanks/comments."""
        for line in _ncheck_output('ssh head1 getent passwd').split("\n"):
            sline = line.strip()
            if not sline or sline[0] == '#':
                continue
            fields = sline.split(":")
            self.data[int(fields[2])] = fields[0]
        self.is_loaded = True
+
+
diff --git a/gpfs-scan-files.py b/gpfs-scan-files.py
new file mode 100755
index 0000000..cea3ac3
--- /dev/null
+++ b/gpfs-scan-files.py
@@ -0,0 +1,387 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+
+
+#-----------------------------------------------------------------------
+# Todo
+#-----------------------------------------------------------------------
+#
+# Learn how to redirect stdout and stderr into files
+#
+
+#-----------------------------------------------------------------------
+# Imports
+#-----------------------------------------------------------------------
+# Standard modules
+import os,sys,time,getopt,tempfile,subprocess
+
+# Custom modules
+import DB, User
+
+
+#-----------------------------------------------------------------------
+# Configuration
+#-----------------------------------------------------------------------
# Paths to external tools and assets used by this script.
MMAPPLYPOLICY = '/usr/lpp/mmfs/bin/mmapplypolicy'  # GPFS policy-scan utility
TOTAL = '/usr/local/bin/total'  # column-summing helper used to aggregate the scan output
HTMLTEMPLATE = '/gpfs/scratchfs1/admin/gpfs-scan-files/html.template'  # report page template
# Hosts allowed to run mmapplypolicy (enforced in run_mmapplypolicy)
MMAPPLY_HOSTS = ['ddnnsd1.storrs.hpc.uconn.edu', 'ddnnsd2.storrs.hpc.uconn.edu']

# Default output directory for mmapplypolicy results (-o overrides)
DEF_OUTDIR = "/gpfs/scratchfs1/admin/gpfs-scan-files/tmp"

THRESHOLD = 1000000000000 # Highlight sizes over 1TB
+
+
+#-----------------------------------------------------------------------
+# Functions
+#-----------------------------------------------------------------------
+
def print_error(msg):
    """Print a blank-line-padded error message, then terminate the program."""
    print("\n ERROR: " + str(msg) + "\n")
    sys.exit()
+
def print_warn(msg):
    """Print a blank-line-padded warning; unlike print_error, execution continues."""
    for line in ("", " WARNING: " + str(msg), ""):
        print(line)
+
+# Accepts either
+# a single command represented by a single string
+# a single command represented by an array of string components
+# a series of commands represented by an array of an array of string components
+def _ncheck_output(cmds):
+ if type(cmds )!=type([]): cmds = [cmds.split()]
+ if type(cmds[0])!=type([]): cmds = [cmds]
+ ncmds = len(cmds)
+ ps = ncmds*[None]
+ for i,cmd in enumerate(cmds):
+ if i==0:
+ ps[i] = subprocess.Popen(cmds[i], stdout=subprocess.PIPE, shell=False)
+ else:
+ ps[i] = subprocess.Popen(cmds[i], stdin=ps[i-1].stdout, stdout=subprocess.PIPE, shell=False)
+ for i in range(ncmds-1):
+ ps[i].stdout.close()
+ return ps[-1].communicate()[0]
+
def get_hostname():
    """Return this machine's hostname (the uname nodename field)."""
    uname_fields = os.uname()
    return uname_fields[1]
+
def run_mmapplypolicy(exefile, target_dir, policy_file, temp_dir, out_dir):
    """Run the GPFS 'mmapplypolicy' scan over target_dir and capture output.

    exefile     -- path to the mmapplypolicy binary (MMAPPLYPOLICY)
    policy_file -- path where the LIST policy rules file is (re)written
    temp_dir    -- scratch directory handed to mmapplypolicy via -s
    out_dir     -- receives list.all-files plus captured mm.out / mm.err

    Returns the mmapplypolicy process return code; exits via print_error()
    if not running on one of MMAPPLY_HOSTS.
    NOTE(review): the command string below is interpolated with '% locals()',
    so it depends on these exact parameter names -- do not rename them.
    """
    # Ensure we are runnning mmapplypolicy on the correct host
    hostname = get_hostname()
    if not hostname in MMAPPLY_HOSTS:
        print_error(" When using the -s option, you must run on one of the DDN NSD nodes (%s)" % ",".join(MMAPPLY_HOSTS))
    # Create output directories
    if not os.path.isdir(out_dir ): os.makedirs(out_dir )
    if not os.path.isdir(temp_dir): os.makedirs(temp_dir)
    # Create policy file
    with open(policy_file,"w") as fout:
        print("RULE 'listall' list 'all-files'\nSHOW( varchar(access_time) || ' ' || varchar(kb_allocated) || ' ' || varchar(file_size) || ' ' || varchar(user_id) || ' ' || fileset_name )", file=fout)
    # Run GPFS utility 'mmapplypolicy'
    # Notes about mmapplypolicy
    #  - mmapplypolicy is the fastest way to get list of all files on scratch partition
    #  - mmapplypolicy reads the RULES file that we wrote to temp file above
    #  - Name of output data file is hard-coded in mmapplypolicy as "list.all-files"
    #  - Status message are written to stdout and stderr
    #  - It takes about 1.25 hours for mmapplypolicy to list all files on scratch disk
    #  - Column output in list.all-files is
    #      Inode_number, gen_number, snapshot_id, access_time, kb_allocated, file_size, "--", filename
    #    The first three and last two columns are always present, the other columns are determined
    #    from the SHOW policy command
    cmd = "%(exefile)s %(target_dir)s -s %(temp_dir)s -I defer -P %(policy_file)s -f %(out_dir)s" % locals()
    p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out,err = p.communicate()
    with open(out_dir + "/mm.out", "w") as fout: print(out,file=fout)
    with open(out_dir + "/mm.err", "w") as ferr: print(err,file=ferr)
    return p.returncode
+
+# Dummy version of run_mmapplypolicy(). Instead of generating new output file, it copies one
+# from TEST_OUTPUT. So, you need to have that in place beforehand.
# Dummy version of run_mmapplypolicy(). Instead of generating new output file, it copies one
# from TEST_OUTPUT. So, you need to have that in place beforehand.
def run_test(temp_dir, out_dir):
    """Copy a pre-generated list.all-files into out_dir; return cp's return code."""
    TEST_OUTPUT = '/gpfs/scratchfs1/admin/gpfs-scan-files/testfiles/list.all-files'
    # Create output directories
    for d in (out_dir, temp_dir):
        if not os.path.isdir(d):
            os.makedirs(d)
    copier = subprocess.Popen(['cp', TEST_OUTPUT, out_dir + "/list.all-files"],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = copier.communicate()
    with open(out_dir + "/mm.out", "w") as fout:
        print(out, file=fout)
    with open(out_dir + "/mm.err", "w") as ferr:
        print(err, file=ferr)
    return copier.returncode
+
def csvrow(row):
    """Format a sequence of values as one CSV line with every field quoted.

    Fix: embedded double quotes are doubled (RFC 4180), so a value containing
    '"' no longer produces a malformed row. Values without quotes are
    formatted exactly as before.
    """
    return ",".join('"%s"' % str(col).replace('"', '""') for col in row)
+
def htmlheader(headerstr):
    """Build an HTML table-header row from a whitespace-separated title string.

    NOTE(review): the original HTML literals were corrupted (tags stripped)
    in this copy of the file, leaving a broken string constant; reconstructed
    as a plain <tr><th>...</th></tr> row -- confirm against html.template.
    """
    row = [str(col) for col in headerstr.split()]
    return "<tr><th>" + "</th><th>".join(row) + "</th></tr>\n"
+
def htmlrow(*row, **args):
    """Build an HTML table row; each cell is classed "<tclass><column-number>".

    Keyword arg tclass (default 't') selects the per-table class prefix
    ('t1'/'t2'/'t3' at the call sites).
    NOTE(review): the original cell markup was corrupted (tags stripped);
    the format string is reconstructed from its surviving %-arguments
    (tclass, i+1, str(col)) -- confirm against html.template's CSS classes.
    """
    tclass = args.get('tclass', 't')
    cells = ['<td class="%s%d">%s</td>' % (tclass, i + 1, str(col))
             for i, col in enumerate(row)]
    return "<tr>" + ''.join(cells) + "</tr>\n"
+
def ReadDB(dbfile, datetime=None):
    """Read the three footprint tables for one snapshot datetime.

    When datetime is None the most recent snapshot is used. Returns
    (datetime, file_list, homedirs, shareddirs), each list ordered by
    footprint descending.
    Fix: close the DB connection before returning (WriteDB already does).
    """
    db = DB.DB(dbfile)
    # Get most recent time
    if datetime is None:
        [[datetime]] = db.read('select max(datetime) from user_footprint;')
    # NOTE(review): datetime is interpolated into SQL; it comes from this
    # tool's own DB or the -d option, but parameterized queries would be safer.
    file_list = db.read("select footprint,count,userid,user from user_footprint where datetime='%s' order by footprint desc;" % datetime)
    homedirs = db.read("select footprint,count,dir from homedir_footprint where datetime='%s' order by footprint desc;" % datetime)
    shareddirs = db.read("select footprint,count,dir from shareddir_footprint where datetime='%s' order by footprint desc;" % datetime)
    db.close()
    return datetime, file_list, homedirs, shareddirs
+
+
def WriteDB(dbfile, datetime, file_list, homedirs, shareddirs):
    """Insert one snapshot (user, homedir and shareddir tables) into the DB."""
    db = DB.DB(dbfile)
    # Table user_footprint
    for rank, (size, count, uid, username) in enumerate(file_list, 1):
        db.write("insert into user_footprint values('%s','%s','%s','%s','%s','%s');"
                 % (datetime, uid, username, rank, count, size))
    # Table dir_footprint
    for rank, (size, count, dirname) in enumerate(homedirs, 1):
        db.write("insert into homedir_footprint values('%s','%s','%s','%s','%s');"
                 % (datetime, dirname, rank, count, size))
    for rank, (size, count, dirname) in enumerate(shareddirs, 1):
        db.write("insert into shareddir_footprint values('%s','%s','%s','%s','%s');"
                 % (datetime, dirname, rank, count, size))
    db.close()
+
+
def WriteCSV(csvfile, datetime, file_list, growths, homedirs, shareddirs):
    """Write the three footprint tables into csvfile, separated by blank lines."""
    with open(csvfile, 'w') as fcsv:
        # Table user_footprint
        fcsv.write('"User Space Used"\n')
        fcsv.write(csvrow('DateTime Rank UID User Count Growth Size'.split()) + "\n")
        for rank, (size, count, uid, username) in enumerate(file_list, 1):
            growth = growths.get(username, '-')
            fcsv.write(csvrow([datetime, rank, uid, username, count, growth, size]) + "\n")
        fcsv.write("\n")

        # Table homedir_footprint
        fcsv.write('"Home Directories"\n')
        fcsv.write(csvrow('DateTime Rank Home_Directory Count Size'.split()) + "\n")
        for rank, (size, count, dirname) in enumerate(homedirs, 1):
            fcsv.write(csvrow([datetime, rank, dirname, count, size]) + "\n")
        fcsv.write("\n")

        # Table shareddir_footprint
        fcsv.write('"Shared Directories"\n')
        fcsv.write(csvrow('DateTime Rank Shared_Directory Count Size'.split()) + "\n")
        for rank, (size, count, dirname) in enumerate(shareddirs, 1):
            fcsv.write(csvrow([datetime, rank, dirname, count, size]) + "\n")
        fcsv.write("\n")
+
+
def WriteHTML(htmlfile, datetime, file_list, growths, homedirs, shareddirs, topn=100):
    """Fill html.template's placeholders with the three footprint tables and
    write the finished page to htmlfile.

    Only the first 'topn' users are rendered individually; a summary ALL row
    follows. {{BEFORE}}/{{AFTER}} become date-navigation targets.
    NOTE(review): the <table> open/close literals were corrupted (tags
    stripped) in this copy, leaving unterminated strings; reconstructed as
    plain <table> elements -- confirm against html.template.
    """
    with open(HTMLTEMPLATE) as fin:
        htmltemplate = fin.read()
    # Set datetime
    htmltemplate = htmltemplate.replace('{{TIMESTAMP}}', datetime)
    # Table user_footprint
    msg = "<table>\n"
    msg += htmlheader('Rank UID User Count Growth Size Cumulative')
    cumsize = cumcount = 0
    for rank, (size, count, uid, username) in enumerate(file_list):
        cumsize += size
        cumcount += count
        # NOTE(review): growth defaults to the string '-', which hilite_size
        # compares against an int threshold -- fine on Python 2, raises
        # TypeError on Python 3; confirm target interpreter.
        growth = growths.get(username, '-')
        if rank + 1 <= topn:
            msg += htmlrow(rank + 1, uid, username, comma(count),
                           hilite_size(comma(growth), growth, THRESHOLD),
                           comma(size), comma(cumsize), tclass='t1')
    cumgrowth = sum(growths.values())
    msg += htmlrow(len(file_list), 'ALL', '', comma(cumcount), comma(cumgrowth), 0,
                   comma(cumsize), tclass='t1')
    msg += "</table>\n"
    htmltemplate = htmltemplate.replace('{{USER_TABLE}}', msg)
    # Table homedir_footprint
    msg = "<table>\n"
    msg += htmlheader('Rank Home_Directory Count Size Cumulative')
    cumsize = 0
    for rank, (size, count, dirname) in enumerate(homedirs):
        cumsize += size
        msg += htmlrow(rank + 1, dirname, comma(count), comma(size), comma(cumsize), tclass='t2')
    msg += "</table>\n"
    htmltemplate = htmltemplate.replace('{{HOMEDIR_TABLE}}', msg)
    # Table shareddir_footprint
    msg = "<table>\n"
    msg += htmlheader('Rank Shared_Directory Count Size Cumulative')
    cumsize = 0
    for rank, (size, count, dirname) in enumerate(shareddirs):
        cumsize += size
        msg += htmlrow(rank + 1, dirname, comma(count), comma(size), comma(cumsize), tclass='t3')
    msg += "</table>\n"
    htmltemplate = htmltemplate.replace('{{SHAREDDIR_TABLE}}', msg)
    # Configure date navigation links
    htmltemplate = htmltemplate.replace('{{BEFORE}}', add_date(datetime, -1))
    htmltemplate = htmltemplate.replace('{{AFTER}}', add_date(datetime, 1))
    # Write final version of page
    with open(htmlfile, "w") as fout:
        fout.write(htmltemplate)
+
def comma(s):
    """Insert thousands separators into a number (or numeric string).

    The first character is held out of the grouping pass so a leading '-'
    never ends up followed by a comma (the replace() below also guards this).
    Non-numeric strings such as '-' pass through unchanged.
    """
    s = str(s)
    grouped = ''
    # Walk the tail right-to-left, prepending a comma after every 3rd digit.
    for pos, ch in enumerate(reversed(s[1:])):
        grouped = ch + grouped
        if pos % 3 == 2:
            grouped = ',' + grouped
    return (s[0] + grouped).replace("-,", "-")
+
+# Make string bold if size reaches or exceeds threshold
def hilite_size(strval, size, threshold):
    """Return strval wrapped in <b> tags when |size| reaches threshold.

    NOTE(review): the original markup was corrupted (tags stripped), leaving
    both branches returning strval unchanged (a no-op). Reconstructed as bold
    highlighting per the surviving comment ("Make string bold if size reaches
    or exceeds threshold"). The separate negative branch suggests shrinkage
    may have been styled differently -- confirm against html.template's CSS.
    """
    if size >= threshold:
        return '<b>' + strval + '</b>'
    elif size <= -threshold:
        return '<b>' + strval + '</b>'
    else:
        return strval
+
+
+# Reads in datetime string "%Y-%m-%d %H:%M:%S", but returns date string
def add_date(datetime, days):
    """Shift a "%Y-%m-%d %H:%M:%S" datetime string by 'days' days and return
    only the "%Y-%m-%d" date part (computed in local time)."""
    base = time.strptime(datetime, "%Y-%m-%d %H:%M:%S")
    shifted = time.mktime(base) + days * 24 * 3600
    return time.strftime("%Y-%m-%d", time.localtime(shifted))
+
+
+# Read arguments from command line
def parse_args(args):
    """Parse command-line options (from the 'args' list) into a dict.

    Fix: the original ignored its 'args' parameter and re-read sys.argv[1:];
    the passed-in list is now parsed. The only caller passes sys.argv[1:],
    so behavior is unchanged for existing use.
    Returns (opts, leftover_args); opts gains derived boolean keys 'writedb'
    and 'readdb'. Exits via Usage() on invalid option combinations.
    """
    try:
        opts, args = getopt.getopt(args, 's:p:D:C:H:hId:o:')
    except getopt.GetoptError as e:
        Usage(" ERROR: " + str(e))
    opts = dict(opts)
    if not opts or '-h' in opts:
        Usage()  # Print usage
    if '-p' in opts and not (('-D' in opts) + ('-C' in opts) + ('-H' in opts)):
        Usage(" You must use -p with at least one of -D, -C and -H")
    if '-o' in opts and '-s' not in opts:
        Usage(" You can only use -o with the -s option")
    if '-I' in opts and '-D' not in opts:
        Usage("You must specify a database using -D when using -I")
    if '-D' in opts and not ('-C' in opts or '-H' in opts or '-I' in opts or '-p' in opts):
        Usage("You must use either -C, -H, -p, or -I, when using -D")
    # Derived flags used by main(): write when scanning/parsing with a DB,
    # read when reporting (-C/-H) from a DB.
    opts['writedb'] = (('-s' in opts) or ('-p' in opts)) and ('-D' in opts)
    opts['readdb'] = ('-D' in opts) and ('-C' in opts or '-H' in opts)
    return opts, args
+
def Usage(msg=None):
    """Print the usage text (optionally followed by msg) and exit the program.

    Fix: corrected typos in the help text ("datbase" -> "database",
    "mmapplyoutput" -> "mmapplypolicy output").
    """
    print("""
  Usage: gpfs-scan-files.py [-s directory] [-p filelist] [-D dbname] [-C csvfile] [-H htmlfile] [-h] [-I]

  Options:
    -d DATETIME   Set datetime (format YYYY-mm-dd HH:MM:SS) to write to DB, CSV or HTML
                  when using -s and/or -D.
    -s DIRECTORY  Scan directory recursively using mmapplypolicy. Mmapplypolicy output
                  will be written to tmp/list.all-files, where tmp is a directory
                  created under the current directory (i.e., not the /tmp directory)
    -p MMOUTPUT   Parse mmapplypolicy output and write to DB, CSV and/or HTML. MMOUTPUT
                  is the file created from the -s option above.
    -D DBNAME     Name of database file
    -I            Create new empty database file in DBNAME given by -D
    -o            Output directory for mmapplypolicy, must be on scratch partition.
                  Only valid with -s option. Default value is %s
""" % DEF_OUTDIR)
    if msg:
        print()
        print(msg)
        print()
    sys.exit()
+
+
def read_compare(dbfile, datetime):
    """Return {user: growth} between 'datetime' and the preceding snapshot.

    Growth is footprint(datetime) - footprint(previous snapshot); only users
    present in both snapshots are included. Exits via print_error() when no
    earlier snapshot exists.
    Fix: close the DB connection before returning.
    """
    db = DB.DB(dbfile)
    # Get all datetimes to find datetime previous to requested datetime
    datetimes = db.read("select distinct(datetime) from user_footprint order by datetime;")
    prev_datetime = None
    # NOTE(review): if 'datetime' is absent from the table, prev_datetime
    # silently falls through to the newest snapshot -- confirm intended.
    for [curr_datetime] in datetimes:
        if prev_datetime and curr_datetime == datetime:
            break
        prev_datetime = curr_datetime
    if not prev_datetime:
        print_error("Could not find previous datetime to %s" % datetime)
    # Get files for two datetimes
    files0 = dict(db.read("select user,footprint from user_footprint where datetime='%s' order by footprint desc;" % prev_datetime))
    files1 = dict(db.read("select user,footprint from user_footprint where datetime='%s' order by footprint desc;" % datetime))
    db.close()
    # Combine the two lists
    return dict([(user, files1[user] - files0[user])
                 for user in set(files0.keys()).intersection(files1.keys())])
+
+#-----------------------------------------------------------------------
+# Main
+#-----------------------------------------------------------------------
def main():
    """Drive the pipeline: optionally scan (-s), parse (-p), store (-D),
    and report (-C growth CSV / -H HTML page) GPFS file-footprint data."""

    opts, args = parse_args(sys.argv[1:])
    if '-d' in opts:
        datetime = opts['-d']
        db_read_datetime = datetime  # used to retrieve DB data
    else:
        datetime = time.strftime("%Y-%m-%d %H:%M:%S")
        db_read_datetime = None

    # Initialize database (DB is already imported at module level)
    if '-I' in opts:
        db = DB.DB(opts['-D'])
        db.create_tables()

    # Scan directories with mmapplypolicy, and write to list.all-files in out_dir
    if '-s' in opts:
        # Fix: tempfile.mkdtemp's first positional argument is a *suffix*;
        # passing DEF_OUTDIR there built a bogus path. Use dir= to create
        # the temp directory inside DEF_OUTDIR as intended.
        out_dir = opts['-o'] if '-o' in opts else tempfile.mkdtemp(dir=DEF_OUTDIR)
        temp_dir = out_dir + "/tmp"
        policy_file = out_dir + "/policy_file"
        target_dir = opts['-s']
        rc = run_mmapplypolicy(MMAPPLYPOLICY, target_dir, policy_file, temp_dir, out_dir)
        if rc > 0:
            print_warn(" mmapplypolicy return code is (%d)" % rc)

    # Fix: growth was previously unbound when -C/-H were used without -D
    # (readdb False), making WriteCSV/WriteHTML raise NameError. Default to
    # "no growth data".
    growth = {}

    if '-p' in opts:
        # Get list of total files sizes per owner
        inputfile = opts['-p']
        file_list = []
        user = User.User()
        for line in _ncheck_output([[TOTAL, "-s1r", "8", "6,n", inputfile]]).split("\n"):
            sline = line.strip()
            if not sline or sline[0] == '#':
                continue
            uid, kb_allocated, count = sline.split()
            # Size is number of kb block allocated, plus 4096 per inode (assume #files==#inodes)
            size = int(kb_allocated) * 1024 + int(count) * 4096
            file_list.append((size, count, uid, user.get_user(uid)))
        file_list.sort(key=lambda x: x[0], reverse=True)

        # Get list of total files kb_allocateds per home and shared directories
        # (consistency fix: use the TOTAL constant instead of a duplicated literal path)
        homedirs, shareddirs = [], []
        for line in _ncheck_output([['sed', 's#/# #g', inputfile], [TOTAL, '-s1r', '13,14', '6,n', '-']]).split("\n"):
            sline = line.strip()
            if not sline or sline[0] == '#':
                continue
            home_or_shared, name, kb_allocated, count = sline.split()
            # Size is number of kb block allocated, plus 4096 per inode (assume #files==#inodes)
            size = int(kb_allocated) * 1024 + int(count) * 4096
            if home_or_shared == 'home':
                homedirs.append((size, count, name))
            elif home_or_shared == 'shared':
                shareddirs.append((size, count, name))
        homedirs.sort(key=lambda x: x[0], reverse=True)
        shareddirs.sort(key=lambda x: x[0], reverse=True)

    # Write data to database
    # NOTE(review): writedb can be true with -s and -D but no -p, in which
    # case file_list is unbound here -- confirm intended usage requires -p.
    if opts['writedb']:
        WriteDB(opts['-D'], datetime, file_list, homedirs, shareddirs)

    # Read data from database
    if opts['readdb']:
        datetime, file_list, homedirs, shareddirs = ReadDB(opts['-D'], datetime=db_read_datetime)
        growth = read_compare(opts['-D'], datetime)  # Use datetime from previous search

    # Write three tables in one csv file/sheet
    if '-C' in opts:
        WriteCSV(opts['-C'], datetime, file_list, growth, homedirs, shareddirs)

    # Write three tables in HTML file
    if '-H' in opts:
        WriteHTML(opts['-H'], datetime, file_list, growth, homedirs, shareddirs)
+
+
+
+
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()