From c458117d6eddcb40629d1ea093a61cc6a9715dcd Mon Sep 17 00:00:00 2001
From: Jack
Date: Thu, 31 Mar 2016 19:29:05 -0400
Subject: [PATCH] Expand seed sets with antonyms, switch GlossCount to a linear SVM, and pre-tag reviews

---
 GlossCountJWB.py | 97 ++++++++++++++++++++++++++++++------------------
 MPQALexicon.py   |  4 +-
 asdf.py          | 11 ++++++
 pos.py           | 36 ++++++++++++++++++
 review_svm.py    | 20 +++++++---
 5 files changed, 125 insertions(+), 43 deletions(-)
 create mode 100644 asdf.py
 create mode 100644 pos.py

diff --git a/GlossCountJWB.py b/GlossCountJWB.py
index 2c9d543..8fc5609 100644
--- a/GlossCountJWB.py
+++ b/GlossCountJWB.py
@@ -8,15 +8,17 @@ import string
 import random
 import BagOfWords
-from sklearn.naive_bayes import MultinomialNB
 from sklearn import svm
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import LogisticRegression
 import MPQALexicon
+import numpy
 
 def get_defs(word):
 	return string.join([synset.definition() for synset in wn.synsets(word, pos=wn.ADJ)])
 
 def make_bag(text):
-	return BagOfWords.make(text, normalize=True, use_negation=False, use_hash=False, use_presence=False)
+	return BagOfWords.make(text, normalize=True, use_negation=True, use_hash=False, use_presence=True)
 
 def expand_sets(positive,negative,neutral):
 	newPositive = set(positive)
@@ -25,42 +27,54 @@ def expand_sets(positive,negative,neutral):
 	for word in positive:
 		for syn in wn.synsets(word, pos=wn.ADJ):
 			for lemma in syn.lemmas():
-				curr = lemma.name().split('.')[0]
+				curr = lemma.name()
 				if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
 					newPositive.add(curr)
 				elif curr in newNegative:
 					newNegative.discard(curr)
 					newNeutral.add(curr)
+				for antonym in lemma.antonyms():
+					ant = antonym.name()
+					if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
+						newNegative.add(ant)
+					elif ant in newPositive:
+						newPositive.discard(ant)
+						newNeutral.add(ant)
 	for word in negative:
 		for syn in wn.synsets(word, pos=wn.ADJ):
 			for lemma in syn.lemmas():
-				curr = lemma.name().split('.')[0]
+				curr = lemma.name()
 				if curr not in newPositive and curr not in newNegative and curr not in newNeutral:
 					newNegative.add(curr)
 				elif curr in newPositive:
 					newPositive.discard(curr)
 					newNeutral.add(curr)
+				for antonym in lemma.antonyms():
+					ant = antonym.name()
+					if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
+						newPositive.add(ant)
+					elif ant in newNegative:
+						newNegative.discard(ant)
+						newNeutral.add(ant)
 	return (newPositive, newNegative, newNeutral)
 
 def bag_to_vec(bag, wordlist):
-    vec = []
-    for word in wordlist:
-        if bag.has_key(word):
-            vec.append(bag[word])
-        else:
-            vec.append(0)
-    return vec
+	vec = []
+	for word in wordlist:
+		if bag.has_key(word):
+			vec.append(bag[word])
+		else:
+			vec.append(0)
+	return vec
 
 # Set up initial Sets S_p and S_n
-#positive = Set(['Good'])
-#negative = Set(['Bad'])
-neutral = Set([''])
+neutral = Set([])
 positive = Set(['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'])
 negative = Set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'])
 
 # Expand on Sets to get S_p' and S_n'
-for num in range(2):
+for num in range(3):
 	newsets = expand_sets(positive,negative,neutral);
 	positive = set(newsets[0])
 	negative = set(newsets[1])
@@ -85,18 +99,14 @@ def bag_to_vec(bag, wordlist):
 		train_wordlist.append(word)
 train_wordlist = sorted(train_wordlist)
 train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags]
-classifier = MultinomialNB()
-#classifier = svm.SVC(kernel="linear")
+#classifier = MultinomialNB()
+classifier = svm.SVC(kernel="linear")
 classifier.fit(train_vecs, train_labels)
 
-# Iterate through all of the reviews and find sentiment
-count = 0
-correct = 0
-ids = sorted(movie_reviews.fileids())
 # Load the test set
 (test_words, test_labels) = MPQALexicon.load()
 #test_words = string.join(list(movie_reviews.words(fileids=ids)))
-test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False)
+test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=True)
 test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:NUM_TEST_WORDS]
 test_bags = []
 test_wordlist2 = []
@@ -105,7 +115,7 @@ def bag_to_vec(bag, wordlist):
 	if defs != '':
 		test_wordlist2.append(word)
 		test_bags.append(make_bag(defs))
-	
+
 test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags]
 predicted_labels = classifier.predict(test_vecs)
 word_labels = {}
@@ -116,29 +126,44 @@ def bag_to_vec(bag, wordlist):
 pos_words = [w for w in test_wordlist2 if word_labels[w] > 0]
 neg_words = [w for w in test_wordlist2 if word_labels[w] < 0]
 
+# Use the same number of positive and negative words.
 length = min(len(pos_words), len(neg_words))
 pos_words = pos_words[:length]
 neg_words = neg_words[:length]
+word_labels2 = {}
+for word in pos_words:
+	word_labels2[word] = 1
+for word in neg_words:
+	word_labels2[word] = -1
 
-scores = {}
+# Iterate through all of the reviews and find sentiment
+correct = 0
+positive = 0
+ids = sorted(movie_reviews.fileids())
+scores = []
+
 for review_id in ids:
 	words = movie_reviews.words(fileids=[review_id])
 	score = 0
 	for word in words:
-		if word in pos_words:
-			score += 1
-		elif word in neg_words:
-			score -= 1
-	if (score >= 0):
+		if word_labels2.has_key(word):
+			score += word_labels2[word]
+	scores.append(score)
+
+avg_score = float(sum(scores))/len(scores)
+for i in range(len(ids)):
+	id = ids[i]
+	score = scores[i]
+	if score >= avg_score:
 		sent_value = "pos"
-		print "Positive (%s)" % review_id
-	else:
+		positive += 1
+	elif score < avg_score:
 		sent_value = "neg"
-		print "Negative (%s)" % review_id
-	if (sent_value == movie_reviews.categories(fileids=[review_id])[0]):
+	label = movie_reviews.categories(fileids=[id])[0]
+	if sent_value == label:
 		correct += 1
-	count += 1
-	scores[review_id] = score
-print "correct:", float(correct)/count
\ No newline at end of file
+print "correct:", float(correct)/len(ids)
+print "positive:", float(positive)/len(ids)
+print "avg:", avg_score
\ No newline at end of file
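The GlossCountJWB.py change above centers on the antonym handling added to expand_sets(): lemmas of the same synset inherit a seed word's polarity, antonyms of those lemmas get the opposite polarity, and a word claimed by both sets is demoted to neutral. Roughly, one expansion step behaves like the following minimal sketch (it assumes NLTK with the WordNet corpus downloaded, and omits the neutral-set bookkeeping):

    # One iteration of seed expansion in the style of expand_sets() above.
    # Synonyms (lemmas of the same adjective synset) keep the seed's polarity;
    # antonyms are pushed into the opposite set.
    from nltk.corpus import wordnet as wn

    positive = set(['good'])
    negative = set(['bad'])

    for word in list(positive):
        for syn in wn.synsets(word, pos=wn.ADJ):
            for lemma in syn.lemmas():
                positive.add(lemma.name())      # synonym: same polarity
                for ant in lemma.antonyms():
                    negative.add(ant.name())    # antonym: opposite polarity

    print sorted(positive)
    print sorted(negative)

Running several iterations (the main script now uses three instead of two) grows the seed sets before their WordNet glosses are used as SVM training data.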
diff --git a/MPQALexicon.py b/MPQALexicon.py
index d86c2e1..6eec7a6 100644
--- a/MPQALexicon.py
+++ b/MPQALexicon.py
@@ -9,8 +9,8 @@ def load():
 		fields = line.split(" ")
 		fields = [field for field in fields if "=" in field] #ugh, two lines have a random extra char in them
 		d = dict([field.rstrip().split("=") for field in fields])
-		(word, label, pos) = d["word1"], d["priorpolarity"], d["pos1"]
-		if pos == "adj":
+		(word, label, pos, type) = d["word1"], d["priorpolarity"], d["pos1"], d["type"]
+		if pos == "adj":# and type == "strongsubj":
 			if label == "positive":
 				words.append(word)
 				labels.append("pos")
diff --git a/asdf.py b/asdf.py
new file mode 100644
index 0000000..9c8868d
--- /dev/null
+++ b/asdf.py
@@ -0,0 +1,11 @@
+import nltk
+from nltk.corpus import wordnet
+from nltk.corpus import movie_reviews
+from nltk.classify import NaiveBayesClassifier
+
+word = "good"
+syns = wordnet.synsets(word)
+for syn in syns:
+	lemmas = syn.lemmas()
+	for lemma in lemmas:
+		if lemma.antonyms(): print lemma.antonyms()
\ No newline at end of file
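For context on the MPQALexicon.py change: each line of the MPQA subjectivity lexicon is a run of key=value fields, and load() now also reads the type field so the lexicon can be narrowed to strongly subjective clues (the filter itself is left commented out for now). A sketch of how one entry parses, mirroring the code above; the sample line follows the standard subjclueslen1 format, and the specific entry is illustrative:

    # Parse one MPQA lexicon line the same way MPQALexicon.load() does.
    line = "type=strongsubj len=1 word1=abhor pos1=verb stemmed1=y priorpolarity=negative"
    fields = [field for field in line.split(" ") if "=" in field]
    d = dict([field.rstrip().split("=") for field in fields])
    print d["word1"], d["pos1"], d["type"], d["priorpolarity"]
    # -> abhor verb strongsubj negative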
diff --git a/pos.py b/pos.py
new file mode 100644
index 0000000..f6311c5
--- /dev/null
+++ b/pos.py
@@ -0,0 +1,36 @@
+import nltk
+import os
+import string
+
+"""
+POS tagging is really slow compared to SVM training and prediction.
+This script runs the NLTK POS tagger over the reviews ahead of time
+and saves the tagged text in a new folder.
+"""
+
+POS_FOLDER = os.path.join("review_polarity","txt_sentoken","pos")
+NEG_FOLDER = os.path.join("review_polarity","txt_sentoken","neg")
+POS_TAGGED_FOLDER = os.path.join("review_polarity","txt_sentoken","pos_tagged")
+NEG_TAGGED_FOLDER = os.path.join("review_polarity","txt_sentoken","neg_tagged")
+
+for (folder_name, tagged_folder_name) in [(POS_FOLDER, POS_TAGGED_FOLDER), (NEG_FOLDER, NEG_TAGGED_FOLDER)]:
+	filenames = []
+	for (folder, x, folder_filenames) in os.walk(folder_name):
+		for filename in folder_filenames:
+			if filename.endswith(".txt"):
+				filenames.append(os.path.join(folder, filename))
+	for filename in filenames:
+		f = open(filename)
+		lines = f.readlines()
+		f.close()
+		text = string.join(lines, " ")
+
+		tokens = nltk.word_tokenize(text)
+		tagged = nltk.pos_tag(tokens)
+		tagged = [string.join(t, "_") for t in tagged]
+		tagged = string.join(tagged, " ")
+		tagged_filename = os.path.join(tagged_folder_name, os.path.split(filename)[-1])
+		f = open(tagged_filename, "w")
+		f.write(tagged)
+		f.close()
+		print "Tagged & saved file", tagged_filename
\ No newline at end of file
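pos.py stores each review as space-separated word_TAG tokens (string.join(t, "_") above turns ('great', 'JJ') into great_JJ). A consumer can recover words and tags by splitting on the last underscore; a minimal sketch, with an illustrative path into the pos_tagged folder:

    # Read a pre-tagged review written by pos.py and keep only adjectives.
    import os
    path = os.path.join("review_polarity", "txt_sentoken", "pos_tagged", "cv000_29590.txt")
    f = open(path)
    tagged_text = f.read()
    f.close()
    pairs = [tok.rsplit("_", 1) for tok in tagged_text.split(" ") if "_" in tok]
    adjectives = [word for (word, tag) in pairs if tag.startswith("JJ")]
    print adjectives[:10]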
diff --git a/review_svm.py b/review_svm.py
index 6b5dd09..0d3a6bd 100644
--- a/review_svm.py
+++ b/review_svm.py
@@ -29,6 +29,7 @@
 USE_POS_TAGS = False
 USE_ADJ_ONLY = False
 USE_NEGATION = True
+USE_POSITION = False
 GRAM_LENGTH = 1		# Unigrams, bigrams, ... TODO use a range
 NUM_FOLDS = 3		# For cross-validation (Pang & Lee used 3)
@@ -60,6 +61,13 @@ def partition_filenames(filenames, num_partitions):
 	for i in range(len(filenames)):
 		partitions[i % num_partitions].append(filenames[i])
 	return partitions
+
+def make_bag(text, total_word_counts):
+	return BagOfWords.make(text, ref_bag=total_word_counts,
+		gram_length=GRAM_LENGTH, use_presence=USE_PRESENCE,
+		use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY,
+		normalize=NORMALIZE_BAGS, use_negation=USE_NEGATION,
+		use_position=USE_POSITION)
 
 # Set parameters from command-line arguments.
@@ -91,6 +99,9 @@ def partition_filenames(filenames, num_partitions):
 	elif args[i] == "--no-negation":
 		USE_NEGATION = False
 		i += 1
+	elif args[i] == "--use-position":
+		USE_POSITION = True
+		i += 1
 	elif args[i] == "--threshold":
 		MIN_OCCURRENCES = int(args[i+1])
 		i += 2
@@ -104,7 +115,9 @@ def partition_filenames(filenames, num_partitions):
 		print "--presence\t\tUse word presence rather than word frequency (Default: Off)"
 		print "--frequency\t\tUse word frequency rather than word presence (Default: On)"
 		print "--use-pos-tags\t\tUse part-of-speech tags (Default: Off)"
+		print "--use-negation\t\tTag words appearing after a negation word (Default: On)"
 		print "--use-adj-only\t\tUse adjectives only (requires --use-pos-tags and --gram-length 1) (Default: Off)"
+		print "--use-position\t\tTag words according to their position in the text (Default: Off)"
 		print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
 		print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
 		print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
@@ -138,13 +151,10 @@ def partition_filenames(filenames, num_partitions):
 
 for i in range(NUM_FOLDS):
 	for filename in pos_folds[i]:
-		pos_fold_bags[i].append(BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
-			use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))
+		pos_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
 	for filename in neg_folds[i]:
-		neg_fold_bags[i].append(
-			BagOfWords.make(file_to_text(filename), ref_bag=total_word_counts, gram_length=GRAM_LENGTH,
-			use_presence=USE_PRESENCE, use_pos_tags=USE_POS_TAGS, use_adj_only=USE_ADJ_ONLY, normalize_bags=NORMALIZE_BAGS))
+		neg_fold_bags[i].append(make_bag(file_to_text(filename), total_word_counts))
 
 # Remove words with less than the minimum occurrences threshold.
 for k in total_word_counts.keys():
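The new --use-position flag only threads USE_POSITION through to BagOfWords.make; the tagging itself lives in BagOfWords, which this patch does not touch. As a rough illustration of the idea (position marking in the spirit of Pang & Lee; the helper below is hypothetical, not the repo's BagOfWords code), each token can be suffixed with the quarter of the document it occurs in, so that the same word in different regions becomes a distinct feature:

    # Hypothetical sketch of position tagging: suffix each token with its quartile.
    def tag_positions(words):
        if not words:
            return []
        n = len(words)
        tagged = []
        for (i, word) in enumerate(words):
            quarter = min(4 * i / n, 3)  # integer division in Python 2
            tagged.append("%s_Q%d" % (word, quarter + 1))
        return tagged

    print tag_positions(["an", "excellent", "film", "with", "a", "weak", "ending"])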