Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
67 lines (57 sloc) 2.13 KB
import string
import nltk
import xml.sax
class ReviewHandler(xml.sax.ContentHandler):
    """SAX content handler that collects review ids and tokenized reviews.

    After parsing, ``ids`` holds the cleaned text of every ``unique_id``
    element and ``reviews`` holds one token list per ``review_text``
    element. Each token list is built from the most recently seen title
    followed by the review text.
    """

    def __init__(self):
        # Initialise the base handler explicitly so that the state it owns
        # (such as its document-locator slot) exists even before a parser
        # is attached. The explicit form is used instead of super() so it
        # also works where ContentHandler is an old-style class.
        xml.sax.ContentHandler.__init__(self)
        self.ids = []       # cleaned text of each <unique_id> element
        self.title = ''     # cleaned text of the last <title> element seen
        self.reviews = []   # one list of tokens per <review_text> element
        self.data = ""      # character buffer for the element being read

    def startElement(self, tag, attributes):
        """Reset the character buffer whenever any element opens."""
        # Text collected before a nested opening tag is discarded here.
        self.data = ''

    def endElement(self, tag):
        """Store the buffered text according to the element that closed.

        ``unique_id`` text is appended to ``ids``, ``title`` text replaces
        ``title``, and ``review_text`` text is joined to the current title
        with a space, tokenized with ``nltk.word_tokenize`` and appended to
        ``reviews``. All other tags are ignored.
        """
        if tag == 'unique_id':
            self.ids.append(clean_data(self.data.strip()))
        elif tag == 'title':
            # The title is kept until the next <title> closes, so a review
            # without its own title reuses the previous one.
            self.title = clean_data(self.data.strip())
        elif tag == 'review_text':
            # NOTE(review): clean_data runs on already-parsed text here, so
            # a literal '&' in a review becomes '&amp;' in the tokens.
            # Confirm this is intended.
            review = self.title + ' ' + clean_data(self.data.strip())
            self.reviews.append(nltk.word_tokenize(review))

    def characters(self, data):
        """Append a chunk of character data to the current buffer."""
        # The parser may deliver one text node in several chunks.
        self.data += data
def get_reviews(filename):
    """Parse one review file and return its ids and tokenized reviews.

    The file content is passed through ``clean_data``, wrapped in a
    synthetic ``<reviews>`` root element and parsed with ``ReviewHandler``.

    Args:
        filename: Path of the review file to read.

    Returns:
        A ``(ids, reviews)`` tuple taken from the handler: the list of
        collected ids and the list of token lists (title followed by
        review text).

    Raises:
        OSError: If the file cannot be opened or read (IOError on
            Python 2).
        xml.sax.SAXParseException: If the cleaned content is still not
            well-formed XML.
    """
    # Plain 'r' is used because the 'U' flag is rejected by Python 3.11+
    # and text mode already translates newlines on Python 3. The XML parser
    # normalises line endings as well. The with-block closes the file even
    # if reading fails.
    with open(filename, 'r') as f:
        data = f.read()
    cleaned_data = clean_data(data)
    # hack because review format doesn't have an enclosing tag
    xmldata = '<reviews>' + cleaned_data + '</reviews>'
    handler = ReviewHandler()
    xml.sax.parseString(xmldata, handler)
    # Each entry of handler.reviews is already title + review text.
    return (handler.ids, handler.reviews)
def clean_data(data):
    """Return *data* with ampersands escaped and unwanted characters removed.

    Every ``&`` becomes ``&amp;``, the SUB control character (0x1A) is
    removed, and only characters whose code point is below 255 are kept.
    """
    # hack: the input contains a malformed ampersand, so escape all of them,
    # then drop the stray SUB control characters
    escaped = data.replace('&', '&amp;').replace('\x1a', '')
    # hack: the input also contains u'\ufffd'; keep only code points < 255
    return ''.join(ch for ch in escaped if ord(ch) < 255)
def get_all_reviews():
    """Load every review file and return the combined ids, reviews and labels.

    Each positive file is labelled 1 and each negative file -1. A label is
    emitted once per id returned for that file.

    Returns:
        A ``(all_ids, all_reviews, labels)`` tuple of lists, concatenated
        in the order the files are listed below.
    """
    labelled_files = [
        ('sorted_data_acl/books/positive.review', 1),
        ('sorted_data_acl/books/negative.review', -1),
        ('sorted_data_acl/dvd/positive.review', 1),
        ('sorted_data_acl/dvd/negative.review', -1),
        ('sorted_data_acl/electronics/positive.review', 1),
        ('sorted_data_acl/electronics/negative.review', -1),
        ('sorted_data_acl/kitchen_&_housewares/positive.review', 1),
        ('sorted_data_acl/kitchen_&_housewares/negative.review', -1),
    ]
    all_ids = []
    all_reviews = []
    labels = []
    for path, label in labelled_files:
        ids, reviews = get_reviews(path)
        all_ids.extend(ids)
        all_reviews.extend(reviews)
        # The label count follows the number of ids, not of reviews.
        labels.extend([label] * len(ids))
    return (all_ids, all_reviews, labels)
You can’t perform that action at this time.