Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Added parser for XML Amazon reviews; standardized SVM code to use Sklearn and NLTK
  • Loading branch information
job13011 committed Apr 9, 2016
1 parent e9e9004 commit c15369d
Showing 1 changed file with 58 additions and 0 deletions.
58 changes: 58 additions & 0 deletions XMLParser.py
@@ -0,0 +1,58 @@
import string
import nltk
import xml.sax

class ReviewHandler(xml.sax.ContentHandler):
    """SAX content handler that collects review ids and tokenized reviews.

    Attributes:
        ids: Stripped text of every ``unique_id`` element, in document order.
        title: Stripped text of the most recently closed ``title`` element.
        reviews: One ``nltk.word_tokenize`` result per ``review_text``
            element; each is built from the current title, a space, and the
            stripped review text.
        data: Character data accumulated since the most recent start tag.
    """

    def __init__(self):
        # Initialise the base handler as well so that the state it owns
        # (its document-locator slot) exists even before a parser sets it.
        xml.sax.ContentHandler.__init__(self)
        self.ids = []
        self.title = ''
        self.reviews = []
        self.data = ""

    def startElement(self, tag, attributes):
        """Discard buffered text whenever any element opens."""
        self.data = ''

    def endElement(self, tag):
        """Record the buffered text according to the element that closed.

        Elements other than ``unique_id``, ``title`` and ``review_text``
        are ignored.
        """
        if tag == 'unique_id':
            self.ids.append(self.data.strip())
        elif tag == 'title':
            self.title = self.data.strip()
        elif tag == 'review_text':
            # The title is only overwritten by the next ``title`` element,
            # never cleared, so a review without its own title reuses the
            # previous one.
            self.reviews.append(nltk.word_tokenize(self.title + ' ' + self.data.strip()))

    def characters(self, data):
        """Append a chunk of character data to the buffer.

        The parser may deliver one element's text in several chunks, hence
        the accumulation instead of assignment.
        """
        self.data += data

def get_reviews(filename):
    """Parse one review file and return its ids and tokenized reviews.

    The file content is patched up into parseable XML (see the HACK notes
    below) and fed through a ``ReviewHandler``.

    Args:
        filename: Path of the review file to read.

    Returns:
        A tuple ``(ids, reviews)`` taken from the handler: the list of
        ``unique_id`` texts and the list of tokenized title-plus-review
        texts.

    Raises:
        xml.sax.SAXParseException: If the patched content is still not
            well-formed XML.
    """
    # Local import so this function does not depend on the module's
    # top-level import list.
    import re

    # Plain 'r' instead of 'rU': the 'U' flag is rejected by newer Python 3
    # releases. The context manager closes the file even if reading fails.
    with open(filename, 'r') as f:
        data = f.read()

    # HACK: the files contain malformed (bare) ampersands. Escape every '&'
    # that does not already start one of the five predefined XML entities,
    # so valid entities are not double-escaped.
    data = re.sub(r'&(?!(?:amp|lt|gt|quot|apos);)', '&amp;', data)
    # Drop the \x1a control character, which is not legal in XML text.
    data = data.replace('\x1a', '')
    # HACK: the review format has no enclosing tag, so supply a root element.
    xmldata = '<reviews>' + data + '</reviews>'

    handler = ReviewHandler()
    xml.sax.parseString(xmldata, handler)
    return (handler.ids, handler.reviews)

def get_all_reviews():
    """Load every review file and return the combined ids, reviews, labels.

    Each file is read with ``get_reviews``; positive files contribute the
    label ``1`` and negative files the label ``-1``.

    Returns:
        A tuple ``(all_ids, all_reviews, labels)`` of three lists, each
        concatenated in the order the files are listed below.
    """
    # Each entry pairs a review file with the label applied to its contents.
    labelled_files = [
        ('sorted_data_acl/books/positive.review', 1),
        ('sorted_data_acl/books/negative.review', -1),
        ('sorted_data_acl/dvd/positive.review', 1),
        ('sorted_data_acl/dvd/negative.review', -1),
        ('sorted_data_acl/electronics/positive.review', 1),
        ('sorted_data_acl/electronics/negative.review', -1),
        ('sorted_data_acl/kitchen_&_housewares/positive.review', 1),
        ('sorted_data_acl/kitchen_&_housewares/negative.review', -1),
    ]

    all_ids, all_reviews, labels = [], [], []
    for path, polarity in labelled_files:
        ids, reviews = get_reviews(path)
        all_ids.extend(ids)
        all_reviews.extend(reviews)
        # One label per id.
        # NOTE(review): labels line up with reviews only if each file yields
        # exactly one id per review -- confirm against the data.
        labels.extend([polarity] * len(ids))
    return (all_ids, all_reviews, labels)

0 comments on commit c15369d

Please sign in to comment.