"""Load and tokenize product reviews stored in pseudo-XML ``.review`` files.

The review files are not well-formed XML on their own: they have no single
enclosing root element and may contain unescaped ampersands and stray
``\\x1a`` characters.  ``get_reviews`` repairs those problems in memory
before handing the text to a SAX parser.
"""

import os
import re
import string
import xml.sax

import nltk

# Matches an ampersand that does NOT begin one of the entity forms an XML
# parser accepts without a DTD (the five predefined entities or a numeric
# character reference).  Only those bare ampersands need escaping; blindly
# replacing every '&' would double-escape entities that are already valid.
_BARE_AMPERSAND = re.compile(
    r'&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)'
)

# Name of the synthetic root element wrapped around a file's contents.  The
# handler ignores tags it does not know, so the exact name is irrelevant.
_ROOT_TAG = 'reviews'

# Sub-directories of the dataset root, in the order they are loaded.
_DOMAINS = ('books', 'dvd', 'electronics', 'kitchen_&_housewares')

# (file name, label) pairs loaded for every domain, in this order.
_LABELED_FILES = (('positive.review', 1), ('negative.review', -1))


class ReviewHandler(xml.sax.ContentHandler):
    """SAX handler that collects review ids and tokenized review texts.

    Attributes:
        ids: stripped text of every ``unique_id`` element, in document order.
        title: stripped text of the most recent ``title`` element that has
            not yet been combined with a ``review_text``; ``''`` otherwise.
        reviews: one token list per ``review_text`` element, produced by
            ``nltk.word_tokenize`` on ``title + ' ' + review text``.
        data: character data accumulated since the last opening tag.
    """

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        self.ids = []
        self.title = ''
        self.reviews = []
        self.data = ''

    def startElement(self, tag, attributes):
        """Start collecting character data afresh for the new element."""
        self.data = ''

    def endElement(self, tag):
        """Record the collected text for the tags this handler cares about."""
        text = self.data.strip()
        if tag == 'unique_id':
            self.ids.append(text)
        elif tag == 'title':
            self.title = text
        elif tag == 'review_text':
            # The title is prepended so it is tokenized along with the body.
            self.reviews.append(nltk.word_tokenize(self.title + ' ' + text))
            # Clear the title once used so that a later review without a
            # <title> element does not inherit this one.
            self.title = ''

    def characters(self, data):
        """Append a chunk of character data (SAX may deliver several)."""
        self.data += data


def get_reviews(filename, encoding='utf-8'):
    """Parse one ``.review`` file.

    Args:
        filename: path of the review file to read.
        encoding: text encoding used to decode the file.  Bytes that are
            invalid in this encoding are replaced rather than raising, so a
            single bad byte does not abort the whole load.
            NOTE(review): the real encoding of the data files is not visible
            from this module -- confirm and pass it explicitly if needed.

    Returns:
        A tuple ``(ids, reviews)`` where ``ids`` is the list of ``unique_id``
        strings and ``reviews`` is the list of token lists (title followed
        by review text) collected by ``ReviewHandler``.

    Raises:
        OSError: if the file cannot be opened or read.
        xml.sax.SAXParseException: if the content is still not well-formed
            after the repairs below.
    """
    # Plain text mode already gives universal-newline handling on Python 3;
    # the old 'U' flag was removed in Python 3.11.
    with open(filename, 'r', encoding=encoding, errors='replace') as f:
        data = f.read()

    # Hack: the data contains malformed (unescaped) ampersands.
    data = _BARE_AMPERSAND.sub('&amp;', data)
    # Hack: drop stray \x1a characters, which are not legal in XML.
    data = data.replace('\x1a', '')
    # Hack: the review format has no enclosing tag, so supply a root element.
    xmldata = '<%s>%s</%s>' % (_ROOT_TAG, data, _ROOT_TAG)

    handler = ReviewHandler()
    xml.sax.parseString(xmldata, handler)
    return (handler.ids, handler.reviews)


def get_all_reviews(data_dir='sorted_data_acl'):
    """Load the positive and negative reviews of every domain.

    Files are read as ``<data_dir>/<domain>/positive.review`` followed by
    ``<data_dir>/<domain>/negative.review`` for each domain in ``_DOMAINS``.

    Args:
        data_dir: root directory of the dataset.

    Returns:
        A tuple ``(all_ids, all_reviews, labels)``.  ``labels`` holds ``1``
        for entries from a positive file and ``-1`` for entries from a
        negative file, one label per id.
        NOTE(review): labels are generated per id, not per review; this
        lines up with ``all_reviews`` only if every review has exactly one
        ``unique_id`` element -- confirm against the data.
    """
    all_ids, all_reviews, labels = [], [], []
    for domain in _DOMAINS:
        for basename, label in _LABELED_FILES:
            path = os.path.join(data_dir, domain, basename)
            ids, reviews = get_reviews(path)
            all_ids.extend(ids)
            all_reviews.extend(reviews)
            labels.extend([label] * len(ids))
    return (all_ids, all_reviews, labels)