# flatten.py

#!/usr/bin/python
from __future__ import print_function

#-----------------------------------------------------------------------
#   About
#-----------------------------------------------------------------------
#
#  Converted from the original Perl version on 2011-02-13, after the
#  Perl version was found to be dropping records.
#  Updated 2011-09-15:  added the USE_LAST_VALUE option (-l).

#-----------------------------------------------------------------------
#   Imports
#-----------------------------------------------------------------------
import sys
import getopt

#-----------------------------------------------------------------------
#   Default option values
#-----------------------------------------------------------------------
#  Each option is stored under its command line switch
SEP_OUTPUT = '-S'
SEP_INPUT = '-F'
SEP_FIELD = '-V'
FIELD_START = '-s'
FIELD_FINISH = '-f'
IGNORE_CASE = '-i'
USE_LAST_VALUE = '-l'
PRINT_HEADER = '-h'
PRINT_CSV = '-c'

#  Options that carry a value
DEF_OPTS = {
    #  None is handed to str.split(), which then splits on whitespace
    SEP_INPUT: None,
    SEP_OUTPUT: " ",
    SEP_FIELD: "/",
    #  Empty string means "no start key" / "no finish key"
    FIELD_START: "",
    FIELD_FINISH: "",
}
#  On/off switches, all off by default
DEF_OPTS.update(dict.fromkeys(
    (USE_LAST_VALUE, IGNORE_CASE, PRINT_HEADER, PRINT_CSV), False))

#-----------------------------------------------------------------------
#   Functions
#-----------------------------------------------------------------------
def Usage(msg=None):
    """Print the usage text and an optional message, then exit.

    msg -- text printed (indented) after the usage text; nothing extra is
           printed when it is empty or None.

    Always raises SystemExit.  The exit status is 1 when a message was
    given, because a message reports a misuse of the command, and the
    default status otherwise.
    """

    print("""
   Usage: flatten [-chilfsFSV] <-|file> key1 [key2 [key3 .. ] ]

   Read input file <file> (or standard input if <file> is -) with records
   in the following "clause" form

     ...

     key1:  value1
     key2:  value2
     key3:  value3

     key1:  value5
     key2:  value6
     ...

  and convert records to rows like

      value1 value2 value3
      value5 value6 ...

   By default, clauses are separated by blank lines, and contain no blank lines
   themselves.  Alternatively, you can identify clauses by their starting key
   (-s) or their final key (-f).

   By default the key and value are separated by whitespace, and a trailing
   ":" on the key is ignored.  The input separator can be changed with the -F
   option.  The output separator is " " by default, and is controlled by the
   -S option.  Blanks around both the key and value are removed.

   OPTIONS
   -c        Print output in CSV form
   -h        Include column header (taken from key values)
   -i        Ignore case of key
   -f <key>  The presence of <key> means this is last  field of current record.
   -s <key>  The presence of <key> means this is first field of current record.
   -F <sep>  Input  field separator.
   -S <sep>  Output field separator.
   -V <sep>  Separator between values in a multi-valued field (default is '/').
   -l        If field not present, use last known value.

    """)
    if msg:
        print()
        print("   ", msg)
        print()
        #  A message means something was wrong:  report failure to the shell
        raise SystemExit(1)
    raise SystemExit


#  Return one formatted output line, either text output or CSV output
def format_vals(vals,opts):
    """Join the fields of one record into a single output line.

    vals -- the fields of the record, in output order.  Each field is passed
            to sorted(), so it is normally a list of strings; the string '-'
            also works and comes out as '-'.
    opts -- option dict; PRINT_CSV, SEP_FIELD and SEP_OUTPUT are read.

    The sorted values of a field are joined with opts[SEP_FIELD].  In CSV
    mode every field is wrapped in double quotes, embedded double quotes are
    doubled, and the fields are joined with ",".  Otherwise the fields are
    joined with opts[SEP_OUTPUT].
    """
    fields = [opts[SEP_FIELD].join(sorted(val)) for val in vals]
    if opts[PRINT_CSV]:
        #  A bare '"' inside a quoted field would end the field early
        quoted = [field.replace('"', '""') for field in fields]
        return '"' + '","'.join(quoted) + '"'
    return opts[SEP_OUTPUT].join(fields)


#  Read input file, returning each clause as it is found
cnt_ValueError = 0      #  Non-blank lines that could not be split into key and value
cnt_Success    = 0      #  Non-blank lines that were split into key and value


def _next_clause_vals(vals, keep_last):
    """Return the value lists that the next clause starts from.

    With keep_last the previous values are carried over in a new outer list,
    so the clause already handed to the caller is left untouched.  Otherwise
    every field starts out empty again.
    """
    if keep_last:
        return list(vals)
    return [[] for _ in vals]


def get_next_clause(fin,opts):
    """Yield the field values of each clause read from fin.

    fin  -- object with a readline() method, such as an open text file.
    opts -- option dict; SEP_INPUT, FIELD_START, FIELD_FINISH, IGNORE_CASE
            and USE_LAST_VALUE are read.

    Each yielded item is a list with one entry per key of the module-level
    dict outkey_pos.  The entry at position outkey_pos[key] is the list of
    stripped values found for that key in the clause:  several values when
    the key is repeated, none when it is absent.  With USE_LAST_VALUE an
    absent key keeps the values it had in the previous clause.

    A clause ends at a blank line when neither a start nor a finish key is
    set, just before a line whose key is the start key, or right after a
    line whose key is the finish key.  Clauses that contain none of the
    requested keys are not yielded.

    Non-blank lines that cannot be split into key and value are skipped and
    counted in cnt_ValueError; the others are counted in cnt_Success.
    """
    global cnt_ValueError, cnt_Success
    nfields     = len(outkey_pos)
    keep_last   = opts[USE_LAST_VALUE]
    start_key   = opts[FIELD_START]
    finish_key  = opts[FIELD_FINISH]
    #  Blank lines only separate clauses when no start/finish key is in use
    blank_ends  = start_key=='' and finish_key==''

    vals        = [ [] for _ in range(nfields) ]   #  Values of current clause
    fields_seen = nfields * [ False ]              #  Only consulted with keep_last
    nseen       = 0                                #  Requested keys seen in clause

    #  readline() returns '' (not even a linefeed) only at end of file
    for raw in iter(fin.readline, ''):
        line = raw.strip()

        #  Blank line:  there is no key to look at, so never compare one
        if not line:
            if blank_ends and nseen:
                yield vals
                vals        = _next_clause_vals(vals, keep_last)
                fields_seen = nfields * [ False ]
                nseen       = 0
            continue

        try:
            key, value = line.split(opts[SEP_INPUT],1)
        #  Not a key,value pair, so skip this line
        except ValueError:
            cnt_ValueError += 1
            continue
        cnt_Success += 1

        #  Remove from key spaces on either side, and trailing colon ':'
        key = key.strip().rstrip(':')
        #  The requested keys were lower-cased by the caller, so match them
        if opts[IGNORE_CASE]:
            key = key.lower()

        #  The start key closes the previous clause before it is stored itself
        if start_key!='' and key==start_key and nseen:
            yield vals
            vals        = _next_clause_vals(vals, keep_last)
            fields_seen = nfields * [ False ]
            nseen       = 0

        #  Add this key to current clause
        if key in outkey_pos:
            nseen += 1
            pos = outkey_pos[key]
            #  A carried-over value is replaced, not extended, the first time
            #  the field shows up in this clause
            if keep_last and not fields_seen[pos]:
                vals[pos] = []
                fields_seen[pos] = True
            vals[pos].append(value.strip())

        #  The finish key closes the clause after it has been stored
        if finish_key!='' and key==finish_key and nseen:
            yield vals
            vals        = _next_clause_vals(vals, keep_last)
            fields_seen = nfields * [ False ]
            nseen       = 0

    #  Return any remaining clause
    if nseen:
        yield vals


#-----------------------------------------------------------------------
#   Main
#-----------------------------------------------------------------------

#  Read options and arguments
try:
    newopts, args = getopt.getopt(sys.argv[1:],"chls:f:iF:S:V:")
except getopt.GetoptError as err:
    #  Unknown option, or an option without its argument
    Usage("ERROR:  %s" % err)

#  For options c,h,l,i, change '' to True
newopts = dict(newopts)
for key in [PRINT_CSV, PRINT_HEADER, USE_LAST_VALUE, IGNORE_CASE]:
    if key in newopts and newopts[key]=='':
        newopts[key] = True

#  Work on a copy:  updating DEF_OPTS itself would make the comparison
#  against the default output separator below always succeed
opts = dict(DEF_OPTS)
opts.update(newopts)

#  Cannot have both CSV output and use output field separator
if opts[PRINT_CSV] and not opts[SEP_OUTPUT]==DEF_OPTS[SEP_OUTPUT]:
    print("ERROR:  Cannot specify both PRINT_CSV (-c) and SEP_OUTPUT (-S)")
    raise SystemExit(1)


#  Remove trailing ':' from FIELD start and finish
#  (rstrip returns a new string, so the result has to be stored)
opts[FIELD_START]  = opts[FIELD_START].rstrip(":")
opts[FIELD_FINISH] = opts[FIELD_FINISH].rstrip(":")

#  Sanity check
if (opts[FIELD_START] and opts[FIELD_FINISH]):
    Usage("ERROR:  You cannot specify both -f and -s")

#  No arguments, show Usage.
if len(args)<1:
    Usage("")

#  Not enough arguments, show error and Usage.
if len(args)<2:
    Usage("ERROR:  You must list at least one output field.")


#  Get program arguments
filename = args[0]
keys     = args[1:]

#  Strip trailing : and lower case keys
if opts[IGNORE_CASE]:
    keys = [ key.rstrip(":").lower() for key in keys]
    opts[FIELD_START]  = opts[FIELD_START].lower()
    opts[FIELD_FINISH] = opts[FIELD_FINISH].lower()
#  Strip trailing :
else:
    keys = [ key.rstrip(":")         for key in keys]

#  Convert keys to dict of key positions.  A key listed more than once keeps
#  its first position, so positions always stay below len(outkey_pos), which
#  is the number of fields get_next_clause() allocates.
outkey_pos = {}
for key in keys:
    if key not in outkey_pos:
        outkey_pos[key] = len(outkey_pos)

#  Open file
if filename=="-":
    fin = sys.stdin
else:
    fin = open(filename,"r")

try:
    #  Print header
    if opts[PRINT_HEADER]:
        headers = [(index,key) for key,index in outkey_pos.items()]
        headers.sort()
        headers = [[key.capitalize()] for index,key in headers]
        print(format_vals(headers,opts))

    #
    #   Read file and print
    #
    cnt = 0
    for vals in get_next_clause(fin,opts):
        cnt += 1
        #  Fields without a value are shown as '-'
        print(format_vals([val if val else '-' for val in vals],opts))

    #  Print warning if most splits failed
    #  NOTE(review): this fires when failures exceed twice the successes,
    #  i.e. above two thirds of the lines, although the text says "half".
    if cnt_ValueError > 2*cnt_Success:
        default_sep = DEF_OPTS[SEP_INPUT]
        if default_sep is None:
            #  None makes str.split() split on whitespace
            default_sep = "whitespace"
        print("WARNING:  More than half of the input lines could not be parsed.")
        print("          Do you need to specify the correct field separator (-F)?")
        print("          The default is (%s)" % default_sep)

#  Close file, also when reading or printing failed
finally:
    if not filename=="-":
        fin.close()
	#!/usr/bin/python
	from __future__ import print_function

	#-----------------------------------------------------------------------
	# About
	#-----------------------------------------------------------------------
	#
	# Convert from original Perl version 2011-02-13, after it was
	# found to be dropping records.
	# Updated 2011-09-15: USE_LAST_VALUE option -l

	#-----------------------------------------------------------------------
	# Imports
	#-----------------------------------------------------------------------
	import sys
	import getopt

	#-----------------------------------------------------------------------
	# Default option values
	#-----------------------------------------------------------------------
	# Name the option keys:  each option is stored under its command line switch
SEP_OUTPUT = '-S'
SEP_INPUT = '-F'
SEP_FIELD = '-V'
FIELD_START = '-s'
FIELD_FINISH = '-f'
IGNORE_CASE = '-i'
USE_LAST_VALUE = '-l'
PRINT_HEADER = '-h'
PRINT_CSV = '-c'

DEF_OPTS = {
    #  None is handed to str.split(), which then splits on whitespace
    SEP_INPUT: None,
    SEP_OUTPUT: " ",
    SEP_FIELD: "/",
    #  Empty string means "no start key" / "no finish key"
    FIELD_START: "",
    FIELD_FINISH: "",
    #  On/off switches, all off by default
    USE_LAST_VALUE: False,
    IGNORE_CASE: False,
    PRINT_HEADER: False,
    PRINT_CSV: False,
}

	#-----------------------------------------------------------------------
#   Functions
#-----------------------------------------------------------------------
def Usage(msg=None):
    """Print the usage text and an optional message, then exit.

    msg -- text printed (indented) after the usage text; nothing extra is
           printed when it is empty or None.

    Always raises SystemExit.  The exit status is 1 when a message was
    given, because a message reports a misuse of the command, and the
    default status otherwise.
    """

    print("""
   Usage: flatten [-chilfsFSV] <-|file> key1 [key2 [key3 .. ] ]

   Read input file <file> (or standard input if <file> is -) with records
   in the following "clause" form

     ...

     key1:  value1
     key2:  value2
     key3:  value3

     key1:  value5
     key2:  value6
     ...

  and convert records to rows like

      value1 value2 value3
      value5 value6 ...

   By default, clauses are separated by blank lines, and contain no blank lines
   themselves.  Alternatively, you can identify clauses by their starting key
   (-s) or their final key (-f).

   By default the key and value are separated by whitespace, and a trailing
   ":" on the key is ignored.  The input separator can be changed with the -F
   option.  The output separator is " " by default, and is controlled by the
   -S option.  Blanks around both the key and value are removed.

   OPTIONS
   -c        Print output in CSV form
   -h        Include column header (taken from key values)
   -i        Ignore case of key
   -f <key>  The presence of <key> means this is last  field of current record.
   -s <key>  The presence of <key> means this is first field of current record.
   -F <sep>  Input  field separator.
   -S <sep>  Output field separator.
   -V <sep>  Separator between values in a multi-valued field (default is '/').
   -l        If field not present, use last known value.

    """)
    if msg:
        print()
        print("   ", msg)
        print()
        #  A message means something was wrong:  report failure to the shell
        raise SystemExit(1)
    raise SystemExit


	# Return one formatted output line, either text output or CSV output
def format_vals(vals,opts):
    """Join the fields of one record into a single output line.

    vals -- the fields of the record, in output order.  Each field is passed
            to sorted(), so it is normally a list of strings; the string '-'
            also works and comes out as '-'.
    opts -- option dict; PRINT_CSV, SEP_FIELD and SEP_OUTPUT are read.

    The sorted values of a field are joined with opts[SEP_FIELD].  In CSV
    mode every field is wrapped in double quotes, embedded double quotes are
    doubled, and the fields are joined with ",".  Otherwise the fields are
    joined with opts[SEP_OUTPUT].
    """
    fields = [opts[SEP_FIELD].join(sorted(val)) for val in vals]
    if opts[PRINT_CSV]:
        #  A bare '"' inside a quoted field would end the field early
        quoted = [field.replace('"', '""') for field in fields]
        return '"' + '","'.join(quoted) + '"'
    return opts[SEP_OUTPUT].join(fields)


	# Read input file, returning each clause as it is found
cnt_ValueError = 0      #  Non-blank lines that could not be split into key and value
cnt_Success    = 0      #  Non-blank lines that were split into key and value


def _next_clause_vals(vals, keep_last):
    """Return the value lists that the next clause starts from.

    With keep_last the previous values are carried over in a new outer list,
    so the clause already handed to the caller is left untouched.  Otherwise
    every field starts out empty again.
    """
    if keep_last:
        return list(vals)
    return [[] for _ in vals]


def get_next_clause(fin,opts):
    """Yield the field values of each clause read from fin.

    fin  -- object with a readline() method, such as an open text file.
    opts -- option dict; SEP_INPUT, FIELD_START, FIELD_FINISH, IGNORE_CASE
            and USE_LAST_VALUE are read.

    Each yielded item is a list with one entry per key of the module-level
    dict outkey_pos.  The entry at position outkey_pos[key] is the list of
    stripped values found for that key in the clause:  several values when
    the key is repeated, none when it is absent.  With USE_LAST_VALUE an
    absent key keeps the values it had in the previous clause.

    A clause ends at a blank line when neither a start nor a finish key is
    set, just before a line whose key is the start key, or right after a
    line whose key is the finish key.  Clauses that contain none of the
    requested keys are not yielded.

    Non-blank lines that cannot be split into key and value are skipped and
    counted in cnt_ValueError; the others are counted in cnt_Success.
    """
    global cnt_ValueError, cnt_Success
    nfields     = len(outkey_pos)
    keep_last   = opts[USE_LAST_VALUE]
    start_key   = opts[FIELD_START]
    finish_key  = opts[FIELD_FINISH]
    #  Blank lines only separate clauses when no start/finish key is in use
    blank_ends  = start_key=='' and finish_key==''

    vals        = [ [] for _ in range(nfields) ]   #  Values of current clause
    fields_seen = nfields * [ False ]              #  Only consulted with keep_last
    nseen       = 0                                #  Requested keys seen in clause

    #  readline() returns '' (not even a linefeed) only at end of file
    for raw in iter(fin.readline, ''):
        line = raw.strip()

        #  Blank line:  there is no key to look at, so never compare one
        if not line:
            if blank_ends and nseen:
                yield vals
                vals        = _next_clause_vals(vals, keep_last)
                fields_seen = nfields * [ False ]
                nseen       = 0
            continue

        try:
            key, value = line.split(opts[SEP_INPUT],1)
        #  Not a key,value pair, so skip this line
        except ValueError:
            cnt_ValueError += 1
            continue
        cnt_Success += 1

        #  Remove from key spaces on either side, and trailing colon ':'
        key = key.strip().rstrip(':')
        #  The requested keys were lower-cased by the caller, so match them
        if opts[IGNORE_CASE]:
            key = key.lower()

        #  The start key closes the previous clause before it is stored itself
        if start_key!='' and key==start_key and nseen:
            yield vals
            vals        = _next_clause_vals(vals, keep_last)
            fields_seen = nfields * [ False ]
            nseen       = 0

        #  Add this key to current clause
        if key in outkey_pos:
            nseen += 1
            pos = outkey_pos[key]
            #  A carried-over value is replaced, not extended, the first time
            #  the field shows up in this clause
            if keep_last and not fields_seen[pos]:
                vals[pos] = []
                fields_seen[pos] = True
            vals[pos].append(value.strip())

        #  The finish key closes the clause after it has been stored
        if finish_key!='' and key==finish_key and nseen:
            yield vals
            vals        = _next_clause_vals(vals, keep_last)
            fields_seen = nfields * [ False ]
            nseen       = 0

    #  Return any remaining clause
    if nseen:
        yield vals



	#-----------------------------------------------------------------------
	# Main
	#-----------------------------------------------------------------------

	# Read options and arguments
try:
    newopts, args = getopt.getopt(sys.argv[1:],"chls:f:iF:S:V:")
except getopt.GetoptError as err:
    #  Unknown option, or an option without its argument
    Usage("ERROR:  %s" % err)

#  For options c,h,l,i, change '' to True
newopts = dict(newopts)
for key in [PRINT_CSV, PRINT_HEADER, USE_LAST_VALUE, IGNORE_CASE]:
    if key in newopts and newopts[key]=='':
        newopts[key] = True

#  Work on a copy:  updating DEF_OPTS itself would make the comparison
#  against the default output separator below always succeed
opts = dict(DEF_OPTS)
opts.update(newopts)

#  Cannot have both CSV output and use output field separator
if opts[PRINT_CSV] and not opts[SEP_OUTPUT]==DEF_OPTS[SEP_OUTPUT]:
    print("ERROR:  Cannot specify both PRINT_CSV (-c) and SEP_OUTPUT (-S)")
    raise SystemExit(1)


#  Remove trailing ':' from FIELD start and finish
#  (rstrip returns a new string, so the result has to be stored)
opts[FIELD_START]  = opts[FIELD_START].rstrip(":")
opts[FIELD_FINISH] = opts[FIELD_FINISH].rstrip(":")

#  Sanity check
if (opts[FIELD_START] and opts[FIELD_FINISH]):
    Usage("ERROR:  You cannot specify both -f and -s")

#  No arguments, show Usage.
if len(args)<1:
    Usage("")

#  Not enough arguments, show error and Usage.
if len(args)<2:
    Usage("ERROR:  You must list at least one output field.")


#  Get program arguments
filename = args[0]
keys     = args[1:]

#  Strip trailing : and lower case keys
if opts[IGNORE_CASE]:
    keys = [ key.rstrip(":").lower() for key in keys]
    opts[FIELD_START]  = opts[FIELD_START].lower()
    opts[FIELD_FINISH] = opts[FIELD_FINISH].lower()
#  Strip trailing :
else:
    keys = [ key.rstrip(":")         for key in keys]

#  Convert keys to dict of key positions.  A key listed more than once keeps
#  its first position, so positions always stay below len(outkey_pos), which
#  is the number of fields get_next_clause() allocates.
outkey_pos = {}
for key in keys:
    if key not in outkey_pos:
        outkey_pos[key] = len(outkey_pos)

#  Open file
if filename=="-":
    fin = sys.stdin
else:
    fin = open(filename,"r")

try:
    #  Print header
    if opts[PRINT_HEADER]:
        headers = [(index,key) for key,index in outkey_pos.items()]
        headers.sort()
        headers = [[key.capitalize()] for index,key in headers]
        print(format_vals(headers,opts))

    #
    #   Read file and print
    #
    cnt = 0
    for vals in get_next_clause(fin,opts):
        cnt += 1
        #  Fields without a value are shown as '-'
        print(format_vals([val if val else '-' for val in vals],opts))

    #  Print warning if most splits failed
    #  NOTE(review): this fires when failures exceed twice the successes,
    #  i.e. above two thirds of the lines, although the text says "half".
    if cnt_ValueError > 2*cnt_Success:
        default_sep = DEF_OPTS[SEP_INPUT]
        if default_sep is None:
            #  None makes str.split() split on whitespace
            default_sep = "whitespace"
        print("WARNING:  More than half of the input lines could not be parsed.")
        print("          Do you need to specify the correct field separator (-F)?")
        print("          The default is (%s)" % default_sep)

#  Close file, also when reading or printing failed
finally:
    if not filename=="-":
        fin.close()