Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ctools/flatten.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
278 lines (232 sloc)
8.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from __future__ import print_function | |
#----------------------------------------------------------------------- | |
# About | |
#----------------------------------------------------------------------- | |
# | |
# Converted from the original Perl version on 2011-02-13, after the Perl
# version was found to be dropping records.
# Updated 2011-09-15: USE_LAST_VALUE option -l | |
#----------------------------------------------------------------------- | |
# Imports | |
#----------------------------------------------------------------------- | |
import sys | |
import getopt | |
#----------------------------------------------------------------------- | |
# Default option values | |
#----------------------------------------------------------------------- | |
# Command-line flags.  Each flag string doubles as that option's key in the
# options dictionary.

# Boolean switches
PRINT_CSV = '-c'
PRINT_HEADER = '-h'
IGNORE_CASE = '-i'
USE_LAST_VALUE = '-l'

# Keys that mark the first / last field of a record
FIELD_START = '-s'
FIELD_FINISH = '-f'

# Separators
SEP_INPUT = '-F'
SEP_OUTPUT = '-S'
SEP_FIELD = '-V'

# Value of every option that is not given on the command line.
DEF_OPTS = {
    PRINT_CSV: False,
    PRINT_HEADER: False,
    IGNORE_CASE: False,
    USE_LAST_VALUE: False,
    FIELD_START: "",
    FIELD_FINISH: "",
    # None is handed to str.split() as the separator, which then splits
    # on runs of whitespace.
    SEP_INPUT: None,
    SEP_OUTPUT: " ",
    SEP_FIELD: "/",
}
#----------------------------------------------------------------------- | |
# Functions | |
#----------------------------------------------------------------------- | |
def Usage(msg=None):
    """Print the help text, optionally followed by a message, then exit.

    Parameters:
        msg: Optional error message.  When it is non-empty the help text and
            the message are written to standard error and the process exits
            with status 1; otherwise the help text goes to standard output
            and the exit status is 0.

    Raises:
        SystemExit: always.
    """
    # Errors belong on stderr so they never get mixed into piped output.
    out = sys.stderr if msg else sys.stdout
    print("""
Usage: flatten [-chilfsFSV] <-|file> key1 [key2 [key3 .. ] ]

    Read input file <file> (or standard input if <file> is -) with records
    in the following "clause" form

        ...
        key1: value1
        key2: value2
        key3: value3

        key1: value5
        key2: value6
        ...

    and convert records to rows like

        value1 value2 value3
        value5 value6 -

    By default, clauses are separated by blank lines, and contain no blank
    lines themselves. Alternatively, you can identify clauses by their
    starting key (-s) or their final key (-f).

    By default the key and value are separated by whitespace, and a trailing
    ":" on the key is removed. The input separator can be changed with the
    -F option. The output separator is " " by default, and is controlled by
    the -S option. Blanks around both the key and value are removed.

OPTIONS
    -c        Print output in CSV form
    -h        Include column header (taken from key values)
    -i        Ignore case of key
    -f <key>  The presence of <key> means this is last field of current record.
    -s <key>  The presence of <key> means this is first field of current record.
    -F <sep>  Input field separator.
    -S <sep>  Output field separator.
    -V <sep>  Separator between values in a multi-valued field (default is '/').
    -l        If field not present, use last known value.
""", file=out)
    if msg:
        print(file=out)
        print("  ", msg, file=out)
        print(file=out)
    # A usage error must be visible to calling scripts as a failure.
    raise SystemExit(1 if msg else 0)
def format_vals(vals, opts):
    """Return one formatted output line, as plain text or as CSV.

    Parameters:
        vals: Sequence with one entry per output column.  Each entry is an
            iterable of strings; its items are sorted and joined with
            opts[SEP_FIELD] to form the column text.
        opts: Options dictionary; PRINT_CSV, SEP_FIELD and SEP_OUTPUT are
            read.

    Returns:
        The columns joined with opts[SEP_OUTPUT], or, when opts[PRINT_CSV]
        is set, every column wrapped in double quotes and joined with ",".
    """
    # Multiple values of one field are sorted and joined into one column.
    columns = [opts[SEP_FIELD].join(sorted(val)) for val in vals]

    if opts[PRINT_CSV]:
        # A double quote inside a quoted CSV field has to be doubled,
        # otherwise it would terminate the field early.
        quoted = [column.replace('"', '""') for column in columns]
        return '"' + '","'.join(quoted) + '"'

    return opts[SEP_OUTPUT].join(columns)
# Running totals over all input read so far: non-blank lines that could not
# be split into key and value, and non-blank lines that could.
cnt_ValueError = 0
cnt_Success = 0


def _reset_clause(vals, nfields, keep_last):
    """Return (vals, fields_seen) to use at the start of a new clause."""
    if not keep_last:
        # Start from empty columns; with keep_last the old values stay in
        # place until the new clause supplies the field itself.
        vals = [[] for _ in range(nfields)]
    return vals, [False] * nfields


def get_next_clause(fin, opts):
    """Read fin and yield each clause (record) as it is completed.

    A clause ends at a blank line when neither opts[FIELD_START] nor
    opts[FIELD_FINISH] is set, just before a line whose key equals
    opts[FIELD_START], or just after a line whose key equals
    opts[FIELD_FINISH].  Clauses in which none of the wanted keys occurred
    are not yielded.

    Parameters:
        fin: Object with a readline() method returning '' at end of input.
        opts: Options dictionary (SEP_INPUT, FIELD_START, FIELD_FINISH,
            USE_LAST_VALUE and, optionally, IGNORE_CASE are read).

    Yields:
        A list with one entry per key of the module-level outkey_pos
        mapping, placed at the position recorded there.  Each entry is the
        list of stripped values found for that key in the clause.  With
        opts[USE_LAST_VALUE] a key missing from a clause keeps the values
        it had in the previous clause.

    Side effects:
        Adds to the module-level counters cnt_ValueError (non-blank lines
        that could not be split into key and value; these lines are
        skipped) and cnt_Success (lines that could).
    """
    global cnt_ValueError, cnt_Success

    nfields = len(outkey_pos)
    start_key = opts[FIELD_START]
    finish_key = opts[FIELD_FINISH]
    keep_last = opts[USE_LAST_VALUE]
    ignore_case = opts.get(IGNORE_CASE, False)
    # Blank lines delimit clauses only when no start/finish key is in use
    blank_ends_clause = start_key == '' and finish_key == ''

    nseen = 0  # Number of wanted fields found in the current clause
    vals, fields_seen = _reset_clause(None, nfields, False)

    while True:
        line = fin.readline()
        # Empty string (not even a linefeed) means end of file
        if not line:
            break
        line = line.strip()

        if line:
            try:
                key, value = line.split(opts[SEP_INPUT], 1)
            except ValueError:
                # Not a key,value pair, so skip this line
                cnt_ValueError += 1
                continue
            cnt_Success += 1
            # Remove spaces on either side of the key, and trailing ':'
            key = key.strip().rstrip(':')
            # The wanted keys are lower-cased by the caller under -i, so
            # the input key needs the same treatment to match them.
            if ignore_case:
                key = key.lower()
            ends_previous = start_key != '' and key == start_key
        else:
            # A blank line has no key of its own; never reuse the key of
            # the line before it.
            key = None
            ends_previous = blank_ends_clause

        if ends_previous:
            # At least one field was seen, so return clause.  A copy is
            # handed out because vals is updated in place afterwards.
            if nseen:
                yield list(vals)
            nseen = 0
            vals, fields_seen = _reset_clause(vals, nfields, keep_last)

        # If empty line (not end of file), stop processing it
        if not line:
            continue

        # Add this key to current clause
        if key in outkey_pos:
            nseen += 1
            pos = outkey_pos[key]
            # A value carried over from the previous clause is replaced,
            # not extended, the first time the field shows up again.
            if keep_last and not fields_seen[pos]:
                vals[pos] = []
            fields_seen[pos] = True
            vals[pos].append(value.strip())

        # This key signals end of this record (only when -f was given)
        if finish_key != '' and key == finish_key:
            if nseen:
                yield list(vals)
            nseen = 0
            vals, fields_seen = _reset_clause(vals, nfields, keep_last)

    # Return any remaining clause
    if nseen:
        yield list(vals)
#----------------------------------------------------------------------- | |
# Main | |
#----------------------------------------------------------------------- | |
# Read options and arguments; an unknown option or a missing option value
# is reported through Usage() instead of a traceback.
try:
    newopts, args = getopt.getopt(sys.argv[1:], "chls:f:iF:S:V:")
except getopt.GetoptError as err:
    Usage("ERROR: %s" % err)

# getopt reports the value-less flags c,h,l,i as ''; change that to True
newopts = dict(newopts)
for key in (PRINT_CSV, PRINT_HEADER, USE_LAST_VALUE, IGNORE_CASE):
    if key in newopts and newopts[key] == '':
        newopts[key] = True

# Work on a copy so that DEF_OPTS keeps holding the defaults
opts = dict(DEF_OPTS)
opts.update(newopts)

# Cannot have both CSV output and use output field separator
if opts[PRINT_CSV] and opts[SEP_OUTPUT] != DEF_OPTS[SEP_OUTPUT]:
    print("ERROR: Cannot specify both PRINT_CSV (-c) and SEP_OUTPUT (-S)",
          file=sys.stderr)
    raise SystemExit(1)

# Remove trailing ':' from FIELD start and finish
opts[FIELD_START] = opts[FIELD_START].rstrip(":")
opts[FIELD_FINISH] = opts[FIELD_FINISH].rstrip(":")

# Sanity check
if opts[FIELD_START] and opts[FIELD_FINISH]:
    Usage("ERROR: You cannot specify both -f and -s")

# No arguments, show Usage.
if len(args) < 1:
    Usage("")

# Not enough arguments, show error and Usage.
if len(args) < 2:
    Usage("ERROR: You must list at least one output field.")

# Get program arguments
filename = args[0]
keys = [key.rstrip(":") for key in args[1:]]

# Keys are compared in lower case when case is ignored
if opts[IGNORE_CASE]:
    keys = [key.lower() for key in keys]
    opts[FIELD_START] = opts[FIELD_START].lower()
    opts[FIELD_FINISH] = opts[FIELD_FINISH].lower()

# Convert keys to dict of key positions.  A key listed twice keeps its first
# position, so positions stay contiguous and always index a valid column.
outkey_pos = {}
for key in keys:
    if key not in outkey_pos:
        outkey_pos[key] = len(outkey_pos)

# Open file
if filename == "-":
    fin = sys.stdin
else:
    fin = open(filename, "r")

try:
    # Print header: the keys in column order
    if opts[PRINT_HEADER]:
        headers = [[key.capitalize()]
                   for key in sorted(outkey_pos, key=outkey_pos.get)]
        print(format_vals(headers, opts))

    # Read file and print; a field without any value is shown as '-'
    for vals in get_next_clause(fin, opts):
        print(format_vals([val if val else ['-'] for val in vals], opts))
finally:
    # Close file (but leave standard input alone)
    if fin is not sys.stdin:
        fin.close()

# Print warning if most splits failed.  It goes to stderr so that it cannot
# be mistaken for an output row.
if cnt_ValueError > 2 * cnt_Success:
    print("WARNING: More than two thirds of the input lines could not be parsed.",
          file=sys.stderr)
    print("         Do you need to specify the correct field separator (%s)?"
          % SEP_INPUT, file=sys.stderr)
    print("         The default is whitespace.", file=sys.stderr)