Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 278 lines (232 sloc) 8.7 KB
#!/usr/bin/python
from __future__ import print_function
#-----------------------------------------------------------------------
# About
#-----------------------------------------------------------------------
#
# Convert from original Perl version 2011-02-13, after it was
# found to be dropping records.
# Updated 2011-09-15: USE_LAST_VALUE option -l
#-----------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------
import sys
import getopt
#-----------------------------------------------------------------------
# Default option values
#-----------------------------------------------------------------------
# Command-line switch for every supported option.  The switch strings are
# also used as the keys of the option dictionaries.
PRINT_CSV = '-c'
PRINT_HEADER = '-h'
IGNORE_CASE = '-i'
USE_LAST_VALUE = '-l'
FIELD_START = '-s'
FIELD_FINISH = '-f'
SEP_INPUT = '-F'
SEP_OUTPUT = '-S'
SEP_FIELD = '-V'

# Value of each option when its switch is not given on the command line.
DEF_OPTS = dict([
    # None is handed to str.split(), which then splits on runs of whitespace
    (SEP_INPUT, None),
    (SEP_OUTPUT, " "),
    (SEP_FIELD, "/"),
    # Empty string means "no start/finish key configured"
    (FIELD_START, ""),
    (FIELD_FINISH, ""),
    # Boolean switches are all off by default
    (USE_LAST_VALUE, False),
    (IGNORE_CASE, False),
    (PRINT_HEADER, False),
    (PRINT_CSV, False),
])
#-----------------------------------------------------------------------
# Functions
#-----------------------------------------------------------------------
def Usage(msg=None):
    """Print the command-line help text and terminate the program.

    Args:
        msg: Optional error message. When non-empty it is printed after the
            help text and the process exits with status 1, so that shell
            callers can detect the failure. Without a message the exit
            status is 0.

    Raises:
        SystemExit: always.
    """
    print("""
Usage: flatten [-chilfsFSV] <-|file> key1 [key2 [key3 .. ] ]

    Read input file <file> (or standard input if <file> is -) with records
    in the following "clause" form

        ...
        key1: value1
        key2: value2
        key3: value3

        key1: value5
        key2: value6
        ...

    and convert records to rows like

        value1 value2 value3
        value5 value6 ...

    By default, clauses are separated by blank lines, and contain no blank
    lines themselves. Alternatively, you can identify clauses by their
    starting key (-s) or their final key (-f).

    By default the key and value are separated by whitespace, and a trailing
    ":" is removed from the key. The input separator can be changed with the
    -F option. The output separator is " " by default, and is controlled by
    the -S option. Blanks around both the key and value are removed.

OPTIONS
    -c          Print output in CSV form
    -h          Include column header (taken from key values)
    -i          Ignore case of key
    -f <key>    The presence of <key> means this is last field of current record.
    -s <key>    The presence of <key> means this is first field of current record.
    -F <sep>    Input field separator.
    -S <sep>    Output field separator.
    -V <sep>    Separator between values in a multi-valued field (default is '/').
    -l          If field not present, use last known value.
""")
    if msg:
        print()
        print(" ", msg)
        print()
        # A message means the command line was rejected: report failure.
        raise SystemExit(1)
    raise SystemExit
# Return formatted output line, either text output or CSV output
def format_vals(vals, opts):
    """Format one output row from the values collected for a record.

    Args:
        vals: Sequence with one entry per output column; each entry is an
            iterable of strings holding every value seen for that column.
        opts: Option dictionary; reads the SEP_FIELD, SEP_OUTPUT and
            PRINT_CSV entries.

    Returns:
        The row as a single string. The values of a column are sorted and
        joined with opts[SEP_FIELD]. In CSV mode every column is wrapped in
        double quotes and columns are joined with ","; otherwise columns are
        joined with opts[SEP_OUTPUT].
    """
    fields = [opts[SEP_FIELD].join(sorted(val)) for val in vals]
    if opts[PRINT_CSV]:
        # A double quote inside a quoted CSV field has to be doubled,
        # otherwise it terminates the field early and corrupts the row.
        quoted = [field.replace('"', '""') for field in fields]
        return '"' + '","'.join(quoted) + '"'
    return opts[SEP_OUTPUT].join(fields)
# Read input file, returning each clause as it is found.
# Module-level tallies, updated by get_next_clause(), of the non-blank input
# lines that could not (cnt_ValueError) and could (cnt_Success) be split
# into a key and a value.
cnt_ValueError = 0
cnt_Success = 0


def get_next_clause(fin, opts):
    """Yield the collected values of each clause (record) read from *fin*.

    The module-level mapping ``outkey_pos`` (requested key -> column index)
    decides which keys are kept and where their values are stored.

    A clause ends
      * at a blank line, when neither FIELD_START nor FIELD_FINISH is set;
      * just before a line whose key equals opts[FIELD_START];
      * just after a line whose key equals opts[FIELD_FINISH].
    A clause is only yielded if at least one requested key was seen in it.

    Args:
        fin: Object with a ``readline()`` method returning text lines and an
            empty value at end of file.
        opts: Option dictionary; reads SEP_INPUT, FIELD_START, FIELD_FINISH,
            USE_LAST_VALUE and (if present) IGNORE_CASE.

    Yields:
        A list with one entry per output column; each entry is the list of
        stripped values seen for that column in the clause. With
        USE_LAST_VALUE, a column not seen in the clause keeps the values of
        the previous clause; otherwise it is an empty list.

    Side effects:
        Increments the globals ``cnt_ValueError`` / ``cnt_Success`` for each
        non-blank line that fails / succeeds to split into key and value.
    """
    global cnt_ValueError, cnt_Success

    nfields = len(outkey_pos)
    use_last = opts[USE_LAST_VALUE]
    ignore_case = opts.get(IGNORE_CASE, False)
    start_key = opts[FIELD_START]
    finish_key = opts[FIELD_FINISH]
    # Blank lines only delimit clauses when no start/finish key is in use
    blank_ends_clause = start_key == '' and finish_key == ''

    def next_values(previous):
        # With USE_LAST_VALUE the previous values are carried forward. A
        # shallow copy is enough (inner lists are replaced, never appended
        # to, when a carried column is seen again) and keeps the list that
        # was already yielded from being modified afterwards.
        if use_last:
            return list(previous)
        return [[] for _ in range(nfields)]

    vals = [[] for _ in range(nfields)]   # Values of the current clause
    fields_seen = [False] * nfields       # Columns seen in the current clause
    nseen = 0                             # Requested keys seen in the clause

    while True:
        raw = fin.readline()
        # Empty read (not even a linefeed) means end of file
        if not raw:
            break
        line = raw.strip()

        # Blank line: possibly a clause separator, never a key/value pair.
        # Handling it here avoids re-using the key of the previous line.
        if not line:
            if blank_ends_clause and nseen:
                yield vals
                vals = next_values(vals)
                fields_seen = [False] * nfields
                nseen = 0
            continue

        try:
            key, value = line.split(opts[SEP_INPUT], 1)
        except ValueError:
            # Not a key,value pair, so skip this line
            cnt_ValueError += 1
            continue
        cnt_Success += 1

        # Remove from key spaces on either side, and trailing colon ':'
        key = key.strip().rstrip(':')
        # The requested keys are lower-cased by the caller for IGNORE_CASE,
        # so the input key must be lower-cased too for the lookup to match.
        if ignore_case:
            key = key.lower()

        # This key opens a new clause, so hand out the previous one first
        if start_key != '' and key == start_key and nseen:
            yield vals
            vals = next_values(vals)
            fields_seen = [False] * nfields
            nseen = 0

        # Add this key to current clause
        pos = outkey_pos.get(key)
        if pos is not None:
            nseen += 1
            # A carried-over value must be dropped the first time the field
            # really appears in this clause
            if use_last and not fields_seen[pos]:
                vals[pos] = []
            fields_seen[pos] = True
            vals[pos].append(value.strip())

        # This key closes the current clause (only when a finish key is set)
        if finish_key != '' and key == finish_key and nseen:
            yield vals
            vals = next_values(vals)
            fields_seen = [False] * nfields
            nseen = 0

    # Return any remaining clause
    if nseen:
        yield vals
#-----------------------------------------------------------------------
# Main
#-----------------------------------------------------------------------
# Read options and arguments; an unknown switch or a missing option value
# is reported through Usage() instead of a traceback.
try:
    newopts, args = getopt.getopt(sys.argv[1:], "chls:f:iF:S:V:")
except getopt.GetoptError as err:
    Usage("ERROR: %s" % err)

# For options c,h,l,i, change '' to True
newopts = dict(newopts)
for flag in (PRINT_CSV, PRINT_HEADER, USE_LAST_VALUE, IGNORE_CASE):
    if flag in newopts:
        newopts[flag] = True

# Work on a copy so that DEF_OPTS keeps the real default values
opts = dict(DEF_OPTS)
opts.update(newopts)

# Cannot have both CSV output and use output field separator
if opts[PRINT_CSV] and SEP_OUTPUT in newopts:
    print("ERROR: Cannot specify both PRINT_CSV (-c) and SEP_OUTPUT (-S)")
    raise SystemExit(1)

# Remove trailing ':' from FIELD start and finish (str.rstrip returns a new
# string, so the result has to be stored back)
opts[FIELD_START] = opts[FIELD_START].rstrip(":")
opts[FIELD_FINISH] = opts[FIELD_FINISH].rstrip(":")

# Sanity check
if opts[FIELD_START] and opts[FIELD_FINISH]:
    Usage("ERROR: You cannot specify both -f and -s")

# No arguments, show Usage.
if len(args) < 1:
    Usage("")

# Not enough arguments, show error and Usage.
if len(args) < 2:
    Usage("ERROR: You must list at least one output field.")

# Get program arguments
filename = args[0]

# Strip trailing : from the requested keys
keys = [key.rstrip(":") for key in args[1:]]

# Lower case keys when case is to be ignored
if opts[IGNORE_CASE]:
    keys = [key.lower() for key in keys]
    opts[FIELD_START] = opts[FIELD_START].lower()
    opts[FIELD_FINISH] = opts[FIELD_FINISH].lower()

# Each key owns exactly one output column; a repeated key would leave a
# column index beyond the number of columns allocated per record.
if len(set(keys)) != len(keys):
    Usage("ERROR: Each output field may be listed only once.")

# Convert keys to dict of key positions (read by get_next_clause)
outkey_pos = dict((key, index) for index, key in enumerate(keys))

# Open file
if filename == "-":
    fin = sys.stdin
else:
    fin = open(filename, "r")

cnt = 0
try:
    # Print header: the keys, capitalized, in column order
    if opts[PRINT_HEADER]:
        print(format_vals([[key.capitalize()] for key in keys], opts))

    # Read file and print one row per clause; empty fields are shown as '-'
    for vals in get_next_clause(fin, opts):
        cnt += 1
        print(format_vals([val if val else ['-'] for val in vals], opts))

    # Warn when unparseable lines outnumber parsed lines by more than two
    # to one. Sent to stderr so the warning does not mix with the data.
    if cnt_ValueError > 2 * cnt_Success:
        default_sep = DEF_OPTS[SEP_INPUT]
        if default_sep is None:
            default_sep = "whitespace"
        print("WARNING: More than half of the input lines could not be parsed.",
              file=sys.stderr)
        print(" Do you need to specify the correct field separator (-F)?",
              file=sys.stderr)
        print(" The default is (%s)" % default_sep, file=sys.stderr)
finally:
    # Close file (never close standard input)
    if not filename == "-":
        fin.close()