# extract-concepts.py

"""Extract concepts from XML file and output one concept per line"""

import os.path
import unicodedata
from sys import argv

from lxml import etree

# Characters deleted from every text fragment before further processing:
# tab, newline, no-break space, soft hyphen, en dash, left/right single
# quotation marks and the Unicode hyphen.
JUNK_CHARS = u'\t\n\xa0\xad\u2013\u2018\u2019\u2010'
# Substrings identifying the fragments that delimit the concept list.
START_MARKER = "Key terms and concepts"
END_MARKER = "Practice problems"


def _text_lines(root):
    """Return the cleaned first text fragment of every <text> element.

    We can't use XPath here since we need both the <text> elements that
    wrap their content in <b> and those that don't; ``itertext()`` yields
    the text in either case.
    """
    lines = []
    for elem in root.findall('.//text'):
        # An element without any text yields no fragment at all; treat it
        # as an empty line instead of failing on a missing first item.
        line = next(elem.itertext(), u'')
        for char in JUNK_CHARS:
            line = line.replace(char, '')
        lines.append(line)
    return lines


def _section(lines):
    """Return the lines between the start and the end marker.

    When a marker occurs several times its last occurrence is used.  A
    missing end marker selects nothing; a missing start marker selects
    from the first line onwards.

    NOTE(review): the fragment immediately before the end marker is
    excluded as well (the selection stops at ``end - 1``).  This is
    long-standing behaviour of the script -- confirm it is intended.
    """
    start = -1
    end = -1
    for index, line in enumerate(lines):
        if START_MARKER in line:
            start = index
        elif END_MARKER in line:
            end = index
    # Clamp the stop index: a negative value would make the slice count
    # from the end of the list instead of selecting nothing.
    return lines[start + 1:max(end - 1, 0)]


def _split_leading_paren(lines):
    """Split lines with a leading closing parenthesis.

    The first two characters (the parenthesis and whatever follows it)
    become a line of their own, the remainder -- if any -- the next one.
    """
    result = []
    for line in lines:
        # startswith() instead of line[0] so empty lines are tolerated.
        if line.startswith(')'):
            result.append(line[:2])
            if len(line) > 2:
                result.append(line[2:])
        else:
            result.append(line)
    return result


def _join_continuations(lines):
    """Merge continuation lines into the line preceding them.

    A line is appended (without separator) to the previous output line
    when it begins with a lowercase character, or while a parenthesis is
    open.  A line containing '(' but no ')' opens a parenthesis and is
    itself appended to the previous line; the first line containing ')'
    closes it.  The very first line is never inspected for parentheses.
    """
    if not lines:
        return []
    joined = [lines[0]]
    in_parens = False
    for line in lines[1:]:
        if '(' in line and ')' not in line:
            in_parens = True
        # line[:1] instead of line[0] so empty lines are tolerated.
        if line[:1].islower() or in_parens:
            joined[-1] += line
        else:
            joined.append(line)
        if ')' in line:
            in_parens = False
    return joined


def extract_concepts(root):
    """Return the concepts found below *root*, one string per concept.

    *root* is a parsed XML element providing ``findall()`` and, on its
    <text> descendants, ``itertext()``.  The result contains the lines
    between the "Key terms and concepts" and "Practice problems" markers,
    with continuation lines merged, surrounding whitespace removed, blank
    lines dropped and every remaining line reduced to ASCII.  It is empty
    when the section cannot be located.
    """
    lines = _join_continuations(
        _split_leading_paren(_section(_text_lines(root))))
    concepts = []
    for line in lines:
        line = line.strip()         # Remove padded spaces
        if line:                    # Ignore blank lines
            # NFKD separates accents from their base letters, so dropping
            # the non-ASCII code points keeps the plain letters.
            ascii_bytes = unicodedata.normalize('NFKD', line).encode(
                'ascii', 'ignore')
            concepts.append(ascii_bytes.decode('ascii'))
    return concepts


def output_name_for(path):
    """Return the output file name for the input file *path*.

    The extension of *path*, whatever its length, is replaced by
    '-extracted.txt'.
    """
    return os.path.splitext(path)[0] + '-extracted.txt'


def main():
    """Extract the concepts of the XML file named by the first argument.

    The concepts are written, one per CRLF-terminated line, to a file
    next to the input whose name is given by ``output_name_for()``.
    """
    if len(argv) < 2:
        raise SystemExit("usage: %s FILE.xml" % argv[0])
    # Read raw bytes: the XML parser then applies the encoding declared in
    # the document itself, and rejects already-decoded text that still
    # carries an encoding declaration.
    with open(argv[1], "rb") as file_handle:
        xml = file_handle.read()
    concepts = extract_concepts(etree.XML(xml))
    # Binary mode keeps the explicit '\r\n' intact on every platform.
    with open(output_name_for(argv[1]), "wb") as file_handle:
        file_handle.writelines(
            (concept + u'\r\n').encode('ascii') for concept in concepts)


if __name__ == "__main__":
    main()