Skip to content
Permalink
c15369d692
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
58 lines (49 sloc) 1.83 KB
import string
import nltk
import xml.sax
class ReviewHandler(xml.sax.ContentHandler):
    """SAX content handler that collects review ids and tokenized reviews.

    Character data is buffered per element: the buffer is cleared whenever
    an element opens and is consumed when one of the recognised elements
    (``unique_id``, ``title``, ``review_text``) closes.

    Attributes:
        ids: stripped text of every ``unique_id`` element, in document order.
        title: stripped text of the most recently closed ``title`` element.
        reviews: one token list per ``review_text`` element, produced by
            ``nltk.word_tokenize`` over the current title, a space, and the
            stripped review text.
        data: character data accumulated since the last element opened.
    """

    def __init__(self):
        # Explicit base-class call so the handler's own state is set up;
        # this spelling works whether or not the base is a new-style class.
        xml.sax.ContentHandler.__init__(self)
        self.ids = []
        self.title = ''
        self.reviews = []
        self.data = ""

    def startElement(self, tag, attributes):
        """Start a fresh text buffer for the element that just opened."""
        self.data = ''

    def endElement(self, tag):
        """Record the buffered text according to which element closed."""
        if tag == 'unique_id':
            self.ids.append(self.data.strip())
        elif tag == 'title':
            # Remembered so it can be prepended to the next review text.
            self.title = self.data.strip()
        elif tag == 'review_text':
            # Title and body are tokenized together as a single document.
            self.reviews.append(
                nltk.word_tokenize(self.title + ' ' + self.data.strip()))

    def characters(self, data):
        """Append a chunk of text; the parser may deliver text in pieces."""
        self.data += data
def get_reviews(filename):
    """Parse one review file and return its ids and tokenized reviews.

    The file is treated as raw text rather than well-formed XML: every
    ampersand is escaped before parsing, so any entity already present in
    the file (e.g. ``&amp;``) reaches the handler as literal text.

    Args:
        filename: path of the review file to read.

    Returns:
        A tuple ``(ids, reviews)`` holding the ``ids`` and ``reviews`` lists
        collected by ``ReviewHandler``.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
        xml.sax.SAXParseException: if the wrapped content still is not
            well-formed XML.
    """
    # Plain 'r' replaces the old 'rU' mode, which Python 3.11+ rejects; the
    # XML parser normalises line endings itself. The `with` block closes
    # the file even if reading fails.
    with open(filename, 'r') as f:
        data = f.read()
    # hack because there's a malformed ampersand: escape every '&' so the
    # parser sees literal text instead of a broken entity reference.
    data = data.replace('&', '&amp;')
    # Drop 0x1a control characters, which make the XML parse fail.
    data = data.replace('\x1a', '')
    # hack because review format doesn't have an enclosing tag
    xmldata = '<reviews>' + data + '</reviews>'
    handler = ReviewHandler()
    xml.sax.parseString(xmldata, handler)
    return (handler.ids, handler.reviews)
def get_all_reviews():
    """Load every review file of the corpus and label each review.

    Reads the positive and negative review files of the four product
    categories through ``get_reviews`` and concatenates the results in a
    fixed order.

    Returns:
        A tuple ``(all_ids, all_reviews, labels)`` of lists. ``labels`` gets
        one entry per id returned for a file: ``1`` for a positive file and
        ``-1`` for a negative one.
    """
    # Each source file is paired with the label given to all of its reviews.
    labelled_files = (
        ('sorted_data_acl/books/positive.review', 1),
        ('sorted_data_acl/books/negative.review', -1),
        ('sorted_data_acl/dvd/positive.review', 1),
        ('sorted_data_acl/dvd/negative.review', -1),
        ('sorted_data_acl/electronics/positive.review', 1),
        ('sorted_data_acl/electronics/negative.review', -1),
        ('sorted_data_acl/kitchen_&_housewares/positive.review', 1),
        ('sorted_data_acl/kitchen_&_housewares/negative.review', -1),
    )
    all_ids = []
    all_reviews = []
    labels = []
    for path, label in labelled_files:
        ids, reviews = get_reviews(path)
        all_ids.extend(ids)
        all_reviews.extend(reviews)
        # The label count follows the number of ids, not of reviews.
        labels.extend([label] * len(ids))
    return (all_ids, all_reviews, labels)