diff --git a/GlossLexicon.py b/GlossLexicon.py
index 0fd0c4e..9d0f7d7 100644
--- a/GlossLexicon.py
+++ b/GlossLexicon.py
@@ -93,14 +93,6 @@ def do_stem(text):
global stemmer
return [stemmer.stem(word) for word in text]
-def create_lexicon(words, labels):
- lexicon = {}
- for i in range(len(words)):
- word = words[i]
- label = labels[i]
- lexicon[word] = label
- return lexicon
-
def create(test_words, test_labels):
# Set up initial Sets S_p and S_n
neutral = []
@@ -184,7 +176,7 @@ def create(test_words, test_labels):
for word in neg_words:
#lexicon[word] = -1
- lexicon[word] = -1.5
+ lexicon[word] = -1
return lexicon
diff --git a/LexiconEval.py b/LexiconEval.py
index 0c84cad..52e0db8 100644
--- a/LexiconEval.py
+++ b/LexiconEval.py
@@ -14,6 +14,7 @@ LEX_ALG = "gloss"
LEX_SOURCE = "mpqa"
# new and improved finite state machine
+# kinda-sorta based on Taboada 2011.
# states are as follows:
# 0 - base
# 1 - negator found
@@ -35,9 +36,13 @@ def calculate_score(text, lexicon):
num_neg = 0
num_halfneg = 0
for word in text:
+ if lexicon.has_key(word):
+ word_score = lexicon[word]
+ # EXPERIMENTAL
+ if word_score < 0: word_score *= 1.5
if state == 0:
if lexicon.has_key(word):
- score += lexicon[word]
+ score += word_score
num_single += 1
elif word in negators:
state = 1
@@ -45,7 +50,7 @@ def calculate_score(text, lexicon):
state = 2
elif state == 1:
if lexicon.has_key(word):
- score += -1 * lexicon[word]
+ score += -1 * word_score
num_neg += 1
state = 0
elif word in intensifiers:
@@ -54,7 +59,7 @@ def calculate_score(text, lexicon):
state = 0
elif state == 2:
if lexicon.has_key(word):
- score += 2 * lexicon[word]
+ score += 2 * word_score
num_double += 1
state = 0
else:
@@ -63,7 +68,7 @@ def calculate_score(text, lexicon):
pass #TODO
elif state == 4:
if lexicon.has_key(word):
- score += -0.5 * lexicon[word]
+ score += -0.5 * word_score
num_halfneg += 1
state = 0
else:
@@ -77,6 +82,15 @@ def do_stem(text):
def get_label(id):
return movie_reviews.categories(fileids=[id])[0]
+
+# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
+def create_lexicon(words, labels):
+ lexicon = {}
+ for i in range(len(words)):
+ word = words[i]
+ label = labels[i]
+ lexicon[word] = label
+ return lexicon
i = 0
try:
@@ -87,6 +101,8 @@ try:
LEX_ALG = "gloss"
elif args[i+1] == "conjunction":
LEX_ALG = "conjunction"
+ elif args[i+1] == "none":
+ LEX_ALG = "none"
else:
print "Invalid algorithm"
i += 2
@@ -137,9 +153,10 @@ elif LEX_ALG == "conjunction":
elif LEX_ALG == "none":
lexicon = create_lexicon(test_words, test_labels)
-correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
-lex_acc = correct/len(lexicon.items())
-print "Lexicon accuracy:", lex_acc
+if LEX_ALG != "none":
+ correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
+ lex_acc = correct/len(lexicon.items())
+ print "Lexicon accuracy:", lex_acc
# Iterate through all of the reviews and compute scores by taking the sum of their
# component lexicon words. Includes rudimentary negation testing.
@@ -159,7 +176,6 @@ for id in ids:
for word in words:
if lexicon.has_key(word):
score += lexicon[word]
- x += 1
scores.append(score)
#print id, score
diff --git a/TFIDF.py b/TFIDF.py
index f863212..202180c 100644
--- a/TFIDF.py
+++ b/TFIDF.py
@@ -20,7 +20,7 @@ def compute_idfs(documents):
def tfidf(term, document, documents, idfs={}):
if idfs == {}:
- all_doc_appearances = sum([doc for doc in documents if term in doc])
+ all_doc_appearances = len([doc for doc in documents if term in doc])
idf = math.log(len(documents)/all_doc_appearances, 10)
else:
if idfs.has_key(term):
diff --git a/XMLParser.py b/XMLParser.py
index 77eb31e..2bad81c 100644
--- a/XMLParser.py
+++ b/XMLParser.py
@@ -14,11 +14,11 @@ class ReviewHandler(xml.sax.ContentHandler):
def endElement(self, tag):
if tag == 'unique_id':
- self.ids.append(self.data.strip())
+ self.ids.append(clean_data(self.data.strip()))
elif tag == 'title':
- self.title = self.data.strip()
+ self.title = clean_data(self.data.strip())
elif tag == 'review_text':
- self.reviews.append(nltk.word_tokenize(self.title + ' ' + self.data.strip()))
+ self.reviews.append(nltk.word_tokenize(self.title + ' ' + clean_data(self.data.strip())))
def characters(self, data):
self.data += data
@@ -28,11 +28,9 @@ def get_reviews(filename):
f = open(filename, 'rU')
data = f.read()
+ cleaned_data = clean_data(data)
# hack because review format doesn't have an enclosing tag
- # hack because there's a malformed ampersand...
- data = data.replace('&', '&amp;')
- data = data.replace('\x1a', '')
- xmldata = '<reviews>' + data + '</reviews>'
+ xmldata = '<reviews>' + cleaned_data + '</reviews>'
f.close()
handler = ReviewHandler()
@@ -40,6 +38,17 @@ def get_reviews(filename):
# Concatenate the review and title.
return (handler.ids, handler.reviews)
+def clean_data(data):
+ # hack because there's a malformed ampersand...
+ cleaned_data = data.replace('&', '&amp;')
+ cleaned_data = cleaned_data.replace('\x1a', '')
+ # hack because there's a u'\ufffd'... wtf is that?
+ cleaned_data2 = ''
+ for char in cleaned_data:
+ if ord(char) < 255:
+ cleaned_data2 += char
+ return cleaned_data2
+
def get_all_reviews():
filenames = ['sorted_data_acl/books/positive.review', 'sorted_data_acl/books/negative.review',
'sorted_data_acl/dvd/positive.review', 'sorted_data_acl/dvd/negative.review',
diff --git a/cblexicon.py b/cblexicon.py
index 6054d4a..ac41cca 100644
--- a/cblexicon.py
+++ b/cblexicon.py
@@ -228,14 +228,13 @@ class cblexicon:
# Generate dissimilarity matrix
dis = vectorize(conjSet,defSet)
- """
# Its Cluster time
set1 = defSet[len(defSet)//2:]
set2 = defSet[:len(defSet)//2]
"""
set1 = random.sample(defSet, len(defSet)//4)
set2 = [x for x in defSet if x not in set1]
-
+ """
# Optimize objective function
sets = optimize2(set1,set2,conjSet,defSet,dis)
set1 = sets[0]
@@ -243,15 +242,14 @@ class cblexicon:
print(set1)
print(set2)
- """
- f = open('set1.txt', 'w+')
- f2 = open('set1.txt', 'w+')
+ f1 = open('set1.txt', 'w+')
+ f2 = open('set2.txt', 'w+')
for word in set1:
- f.write(word + "/n")
+ f1.write(word + "\n")
for word in set2:
- f2.write(word + "/n")
- """
-
+ f2.write(word + "\n")
+ f1.close()
+ f2.close()
# Can we classify and then run bag of words?
#negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
diff --git a/review_svm.py b/review_svm.py
index 5fa1573..649f4bc 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -27,14 +27,14 @@ NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
PUNCTUATION = [".", "!", "?", ",", ";"]
# These are now command line parameters! See below...
-USE_DELTATFIDF = True # Martineau and Finn. Excludes some other parameters (e.g. frequency)
+USE_DELTATFIDF = False # Martineau and Finn. Excludes some other parameters (e.g. frequency)
USE_PRESENCE = False # If true, use presence rather than frequency.
USE_POS_TAGS = False
USE_ADJ_ONLY = False
USE_NEGATION = True
USE_POSITION = False
GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
-NUM_FOLDS = 10 # For cross-validation (Pang & Lee used 3)
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finn used 10)
MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
@@ -97,6 +97,12 @@ try:
elif args[i] == "--epsilon":
EPSILON = float(args[i+1])
i += 2
+ elif args[i] == "--use-amazon":
+ USE_AMAZON = True
+ i += 1
+ elif args[i] == "--use-delta":
+ USE_DELTATFIDF = True
+ i += 1
elif args[i] == "--help":
print "Usage:"
print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
@@ -110,6 +116,8 @@ try:
print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
+ print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)"
+ print "--use-delta\t\tUse Delta TFIDF. (Default: Off)"
exit()
else:
print "Error: Invalid argument", args[i]
@@ -119,15 +127,21 @@ except Exception:
t0 = time.time()
+positive_ids = []
+negative_ids = []
+
if USE_AMAZON:
# Load the mixed Amazon review dataset.
(ids, reviews, labels) = XMLParser.get_all_reviews()
+ for i in range(len(ids)):
+ if labels[i] == 1:
+ positive_ids.append(ids[i])
+ elif labels[i] == -1:
+ negative_ids.append(ids[i])
else:
# Load the Pang and Lee sentiment dataset.
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
- positive_ids = []
- negative_ids = []
labels = []
for id in ids:
label = movie_reviews.categories(id)[0]
@@ -148,8 +162,10 @@ for i in range(len(reviews)):
negative_reviews.append(reviews[i])
#TEST
-#positive_reviews = random.sample(positive_reviews, 25)
-#negative_reviews = random.sample(negative_reviews, 25)
+#positive_reviews = positive_reviews[:200]
+#negative_reviews = negative_reviews[:600]
+#positive_reviews = random.sample(positive_reviews, 1000)
+#negative_reviews = random.sample(negative_reviews, 1000)
# Partition reviews into folds.
(pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
@@ -202,7 +218,7 @@ avg_acc = 0
wordlist = total_word_counts.keys()
-f = open("results.txt", "w")
+#f = open("results.txt", "w")
for i in range(NUM_FOLDS):
pos_train_reviews = []
neg_train_reviews = []
@@ -241,11 +257,12 @@ for i in range(NUM_FOLDS):
predicted_labels = classifier.predict(test_vecs)
acc = classifier.score(test_vecs, test_labels)
for i in range(len(test_reviews)):
- print "%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])
+ #f.write("%s\t%d\t%d\n" % (test_ids[i], test_labels[i], predicted_labels[i]))
+ print("%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i]))
avg_acc += acc
-f.close()
+#f.close()
t2 = time.time()
avg_acc /= NUM_FOLDS
diff --git a/svm.bat b/svm.bat
index 66bc4a6..b9dc1a5 100644
--- a/svm.bat
+++ b/svm.bat
@@ -11,4 +11,7 @@ echo (5) Unigrams + POS tags
python review_svm.py --gram-length 1 --presence --use-pos-tags
echo (6) Adjectives
-python review_svm.py --gram-length 1 --presence --use-pos-tags --use-adj-only
\ No newline at end of file
+python review_svm.py --gram-length 1 --presence --use-pos-tags --use-adj-only
+
+echo (8) Unigrams + Position
+python review_svm.py --gram-length 1 --presence --use-position
\ No newline at end of file