Skip to content
Permalink
4e6b38c6e6
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
67 lines (57 sloc) 2.13 KB
import string
import nltk
import xml.sax
class ReviewHandler(xml.sax.ContentHandler):
    """SAX content handler that collects review ids and tokenized reviews.

    Text is buffered in ``self.data`` between an opening tag and the matching
    closing tag. When an element closes, its tag name decides what happens to
    the buffered text:

    * ``unique_id``   -- the cleaned text is appended to ``self.ids``.
    * ``title``       -- the cleaned text is remembered in ``self.title``.
    * ``review_text`` -- the current title and the cleaned text are joined
      with a space, tokenized with ``nltk.word_tokenize`` and appended to
      ``self.reviews``.

    Any other element is ignored.
    """

    def __init__(self):
        # Initialise the base handler explicitly so its own state is set up.
        # The direct call form works whether the base class is old- or
        # new-style.
        xml.sax.ContentHandler.__init__(self)
        self.ids = []       # cleaned contents of every <unique_id> seen
        self.title = ''     # most recent <title> text, reused by the next review
        self.reviews = []   # one token list per <review_text> element
        self.data = ""      # text buffer for the element currently being read

    def startElement(self, tag, attributes):
        """Reset the text buffer whenever any element opens."""
        # Text seen before a nested opening tag is discarded by this reset.
        self.data = ''

    def endElement(self, tag):
        """Store the buffered text according to the tag that just closed."""
        if tag == 'unique_id':
            self.ids.append(clean_data(self.data.strip()))
        elif tag == 'title':
            self.title = clean_data(self.data.strip())
        elif tag == 'review_text':
            # The title is prepended so it is tokenized along with the body.
            self.reviews.append(
                nltk.word_tokenize(
                    self.title + ' ' + clean_data(self.data.strip())
                )
            )

    def characters(self, data):
        """Accumulate character data; the parser may deliver it in pieces."""
        self.data += data
def get_reviews(filename):
    """Parse one review file and return its ids and tokenized reviews.

    The file content is passed through ``clean_data``, wrapped in a synthetic
    ``<reviews>`` root element and parsed with a ``ReviewHandler``.

    Args:
        filename: Path of the review file to read.

    Returns:
        A ``(ids, reviews)`` tuple holding the handler's ``ids`` and
        ``reviews`` lists after parsing.
    """
    # Plain 'r' replaces the old 'rU' flag: the 'U' mode was removed in
    # Python 3.11, and text mode already applies universal newlines on
    # Python 3. The ``with`` block closes the file even if reading fails.
    with open(filename, 'r') as f:
        data = f.read()
    cleaned_data = clean_data(data)
    # hack because review format doesn't have an enclosing tag
    xmldata = '<reviews>' + cleaned_data + '</reviews>'
    handler = ReviewHandler()
    xml.sax.parseString(xmldata, handler)
    # The handler has already joined each title with its review text.
    return (handler.ids, handler.reviews)
def clean_data(data):
    """Return *data* made safe for the XML parser used on the review files.

    Three clean-ups are applied, in order:

    1. Every ``&`` is rewritten as ``&amp;`` (the input contains malformed
       ampersands). This is unconditional, so an ``&`` that already starts
       an entity is escaped again.
    2. The 0x1A control character is removed.
    3. Every character whose code point is 255 or higher is dropped, which
       also removes U+FFFD replacement characters.

    Args:
        data: The text to clean.

    Returns:
        The cleaned text.
    """
    # hack because there's a malformed ampersand...
    escaped = data.replace('&', '&amp;').replace('\x1a', '')
    # Build the result in one join instead of growing a string one character
    # at a time. The threshold is deliberately kept at "< 255" to preserve
    # the existing output exactly.
    return ''.join(char for char in escaped if ord(char) < 255)
def get_all_reviews():
    """Load every review file of the four product categories.

    Each ``positive.review`` file is labelled ``1`` and each
    ``negative.review`` file is labelled ``-1``. The files are read with
    ``get_reviews`` in the order listed below.

    Returns:
        A ``(all_ids, all_reviews, labels)`` tuple of lists. ``labels``
        receives one entry per id returned for a file.
    """
    labelled_files = (
        ('sorted_data_acl/books/positive.review', 1),
        ('sorted_data_acl/books/negative.review', -1),
        ('sorted_data_acl/dvd/positive.review', 1),
        ('sorted_data_acl/dvd/negative.review', -1),
        ('sorted_data_acl/electronics/positive.review', 1),
        ('sorted_data_acl/electronics/negative.review', -1),
        ('sorted_data_acl/kitchen_&_housewares/positive.review', 1),
        ('sorted_data_acl/kitchen_&_housewares/negative.review', -1),
    )
    all_ids = []
    all_reviews = []
    labels = []
    for path, polarity in labelled_files:
        file_ids, file_reviews = get_reviews(path)
        all_ids.extend(file_ids)
        all_reviews.extend(file_reviews)
        # One label per id found in this file.
        labels.extend([polarity] * len(file_ids))
    return (all_ids, all_reviews, labels)