Commit 6d16225
missed some changes
job13011 committed Apr 13, 2016
1 parent efbde4b commit 6d16225
Showing 7 changed files with 79 additions and 44 deletions.
10 changes: 1 addition & 9 deletions GlossLexicon.py
@@ -93,14 +93,6 @@ def do_stem(text):
     global stemmer
     return [stemmer.stem(word) for word in text]
 
-def create_lexicon(words, labels):
-    lexicon = {}
-    for i in range(len(words)):
-        word = words[i]
-        label = labels[i]
-        lexicon[word] = label
-    return lexicon
-
 def create(test_words, test_labels):
     # Set up initial Sets S_p and S_n
     neutral = []
@@ -184,7 +176,7 @@ def create(test_words, test_labels):
 
     for word in neg_words:
         #lexicon[word] = -1
-        lexicon[word] = -1.5
+        lexicon[word] = -1
 
     return lexicon
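The net effect here is that stored polarities return to a symmetric +1/-1, and the extra weighting of negative words moves into scoring (the EXPERIMENTAL line in LexiconEval.py below). A minimal sketch of that division of labor, with NEGATIVE_BOOST as a hypothetical name for the 1.5 factor:

NEGATIVE_BOOST = 1.5  # hypothetical name for the factor LexiconEval applies at scoring time

def score_word(lexicon, word):
    # Polarities stay symmetric (+1/-1) in the lexicon; negatives are boosted on lookup.
    word_score = lexicon.get(word, 0)
    if word_score < 0:
        word_score *= NEGATIVE_BOOST
    return word_score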
32 changes: 24 additions & 8 deletions LexiconEval.py
@@ -14,6 +14,7 @@ LEX_ALG = "gloss"
 LEX_SOURCE = "mpqa"
 
 # new and improved finite state machine
+# kinda-sorta based on Taboada 2011.
 # states are as follows:
 # 0 - base
 # 1 - negator found
@@ -35,17 +36,21 @@ def calculate_score(text, lexicon):
     num_neg = 0
     num_halfneg = 0
     for word in text:
+        if lexicon.has_key(word):
+            word_score = lexicon[word]
+            # EXPERIMENTAL
+            if word_score < 0: word_score *= 1.5
         if state == 0:
             if lexicon.has_key(word):
-                score += lexicon[word]
+                score += word_score
                 num_single += 1
             elif word in negators:
                 state = 1
             elif word in intensifiers:
                 state = 2
         elif state == 1:
             if lexicon.has_key(word):
-                score += -1 * lexicon[word]
+                score += -1 * word_score
                 num_neg += 1
                 state = 0
             elif word in intensifiers:
@@ -54,7 +59,7 @@
                 state = 0
         elif state == 2:
             if lexicon.has_key(word):
-                score += 2 * lexicon[word]
+                score += 2 * word_score
                 num_double += 1
                 state = 0
             else:
@@ -63,7 +68,7 @@
             pass #TODO
         elif state == 4:
             if lexicon.has_key(word):
-                score += -0.5 * lexicon[word]
+                score += -0.5 * word_score
                 num_halfneg += 1
                 state = 0
             else:
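For readers following the state machine, here is a tiny self-contained sketch of the two common paths (negation and intensification) with the EXPERIMENTAL negative boost included. The word lists and lexicon are made-up stand-ins, and states 3 and 4 are omitted:

negators = ["not", "never"]        # assumed stand-in contents
intensifiers = ["very", "really"]  # assumed stand-in contents
lexicon = {"good": 1, "bad": -1}   # toy lexicon

def toy_score(text):
    score = 0.0
    state = 0
    for word in text:
        if word in lexicon:
            word_score = lexicon[word]
            if word_score < 0:
                word_score *= 1.5        # the EXPERIMENTAL negative boost
        if state == 0:
            if word in lexicon:
                score += word_score
            elif word in negators:
                state = 1
            elif word in intensifiers:
                state = 2
        elif state == 1:                 # negator seen: flip the next lexicon hit
            if word in lexicon:
                score += -1 * word_score
            state = 0
        elif state == 2:                 # intensifier seen: double the next lexicon hit
            if word in lexicon:
                score += 2 * word_score
            state = 0
    return score

print(toy_score("not good".split()))   # -1.0 (flipped)
print(toy_score("very bad".split()))   # -3.0 (doubled, after the 1.5 boost)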
@@ -77,6 +82,15 @@ def do_stem(text):
 
 def get_label(id):
     return movie_reviews.categories(fileids=[id])[0]
 
+# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
+def create_lexicon(words, labels):
+    lexicon = {}
+    for i in range(len(words)):
+        word = words[i]
+        label = labels[i]
+        lexicon[word] = label
+    return lexicon
+
 i = 0
 try:
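Functionally, the new helper is just dict(zip(words, labels)); a quick sanity check with hypothetical inputs:

words = ["good", "awful"]   # hypothetical test_words
labels = [1, -1]            # hypothetical test_labels
lexicon = dict(zip(words, labels))
assert lexicon == {"good": 1, "awful": -1}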
@@ -87,6 +101,8 @@ try:
             LEX_ALG = "gloss"
         elif args[i+1] == "conjunction":
             LEX_ALG = "conjunction"
+        elif args[i+1] == "none":
+            LEX_ALG = "none"
         else:
             print "Invalid algorithm"
         i += 2
@@ -137,9 +153,10 @@ elif LEX_ALG == "conjunction":
 elif LEX_ALG == "none":
     lexicon = create_lexicon(test_words, test_labels)
 
-correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
-lex_acc = correct/len(lexicon.items())
-print "Lexicon accuracy:", lex_acc
+if LEX_ALG != "none":
+    correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
+    lex_acc = correct/len(lexicon.items())
+    print "Lexicon accuracy:", lex_acc
 
 # Iterate through all of the reviews and compute scores by taking the sum of their
 # component lexicon words. Includes rudimentary negation testing.
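A side note on the lex_acc line above: under Python 2, which the print statements imply, correct/len(lexicon.items()) is integer division, so any accuracy below 100% truncates to 0. A float cast would sidestep that:

lex_acc = float(correct) / len(lexicon)   # len(lexicon) equals len(lexicon.items())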
@@ -159,7 +176,6 @@ for id in ids:
     for word in words:
         if lexicon.has_key(word):
             score += lexicon[word]
-            x += 1
     scores.append(score)
     #print id, score
 
2 changes: 1 addition & 1 deletion TFIDF.py
@@ -20,7 +20,7 @@ def compute_idfs(documents):
 
 def tfidf(term, document, documents, idfs={}):
     if idfs == {}:
-        all_doc_appearances = sum([doc for doc in documents if term in doc])
+        all_doc_appearances = len([doc for doc in documents if term in doc])
         idf = math.log(len(documents)/all_doc_appearances, 10)
     else:
         if idfs.has_key(term):
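The fix matters because document frequency is a count of documents containing the term; summing the document lists themselves was never meaningful. A toy check of the corrected line, with hypothetical documents (note the float cast, since len(documents)/all_doc_appearances would itself truncate under Python 2):

import math

documents = [["good", "movie"], ["bad", "movie"], ["good", "plot"]]
term = "good"
all_doc_appearances = len([doc for doc in documents if term in doc])  # 2
idf = math.log(len(documents) / float(all_doc_appearances), 10)       # log10(3/2)
print(round(idf, 3))  # 0.176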
23 changes: 16 additions & 7 deletions XMLParser.py
@@ -14,11 +14,11 @@ class ReviewHandler(xml.sax.ContentHandler):
 
     def endElement(self, tag):
         if tag == 'unique_id':
-            self.ids.append(self.data.strip())
+            self.ids.append(clean_data(self.data.strip()))
         elif tag == 'title':
-            self.title = self.data.strip()
+            self.title = clean_data(self.data.strip())
         elif tag == 'review_text':
-            self.reviews.append(nltk.word_tokenize(self.title + ' ' + self.data.strip()))
+            self.reviews.append(nltk.word_tokenize(self.title + ' ' + clean_data(self.data.strip())))
 
     def characters(self, data):
         self.data += data
@@ -28,18 +28,27 @@ def get_reviews(filename):
     f = open(filename, 'rU')
 
     data = f.read()
+    cleaned_data = clean_data(data)
     # hack because review format doesn't have an enclosing tag
-    # hack because there's a malformed ampersand...
-    data = data.replace('&', '&amp;')
-    data = data.replace('\x1a', '')
-    xmldata = '<reviews>' + data + '</reviews>'
+    xmldata = '<reviews>' + cleaned_data + '</reviews>'
     f.close()
 
     handler = ReviewHandler()
     xml.sax.parseString(xmldata, handler)
     # Concatenate the review and title.
     return (handler.ids, handler.reviews)
 
+def clean_data(data):
+    # hack because there's a malformed ampersand...
+    cleaned_data = data.replace('&', '&amp;')
+    cleaned_data = cleaned_data.replace('\x1a', '')
+    # hack because there's a u'\ufffd'... wtf is that?
+    cleaned_data2 = ''
+    for char in cleaned_data:
+        if ord(char) < 255:
+            cleaned_data2 += char
+    return cleaned_data2
+
 def get_all_reviews():
     filenames = ['sorted_data_acl/books/positive.review', 'sorted_data_acl/books/negative.review',
                  'sorted_data_acl/dvd/positive.review', 'sorted_data_acl/dvd/negative.review',
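A quick illustration of what clean_data now strips, on a made-up string: the bare ampersand is escaped, the \x1a control character is dropped, and anything outside Latin-1 (such as U+FFFD, the replacement character) is filtered out. Note that ord(char) < 255 also drops U+00FF itself:

# -*- coding: utf-8 -*-
raw = u'Tom & Jerry\x1a \ufffd4 stars'   # hypothetical review text
cleaned = raw.replace('&', '&amp;').replace('\x1a', '')
cleaned = ''.join(char for char in cleaned if ord(char) < 255)
print(cleaned)   # Tom &amp; Jerry 4 stars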
16 changes: 7 additions & 9 deletions cblexicon.py
@@ -228,30 +228,28 @@ class cblexicon:
         # Generate dissimilarity matrix
         dis = vectorize(conjSet,defSet)
 
-        """
         # Its Cluster time
         set1 = defSet[len(defSet)//2:]
         set2 = defSet[:len(defSet)//2]
-        """
         set1 = random.sample(defSet, len(defSet)//4)
         set2 = [x for x in defSet if x not in set1]
-
         """
         # Optimize objective function
         sets = optimize2(set1,set2,conjSet,defSet,dis)
         set1 = sets[0]
         set2 = sets[1]
 
         print(set1)
         print(set2)
-        f = open('set1.txt', 'w+')
-        f2 = open('set1.txt', 'w+')
+        """
+        f1 = open('set1.txt', 'w+')
+        f2 = open('set2.txt', 'w+')
         for word in set1:
-            f.write(word + "/n")
+            f1.write(word + "\n")
         for word in set2:
-            f2.write(word + "/n")
-        """
-
+            f2.write(word + "\n")
+        f1.close()
+        f2.close()
 
 # Can we classify and then run bag of words?
 #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
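The corrected block now writes each cluster to its own file with real newlines (the old version opened set1.txt twice and wrote the literal characters /n). An equivalent sketch using context managers, which close the files even if a write fails, would be:

set1 = ["good", "great"]   # hypothetical cluster contents
set2 = ["bad", "poor"]

for path, cluster in [('set1.txt', set1), ('set2.txt', set2)]:
    with open(path, 'w') as f:
        for word in cluster:
            f.write(word + "\n")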
35 changes: 26 additions & 9 deletions review_svm.py
@@ -27,14 +27,14 @@ NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
 PUNCTUATION = [".", "!", "?", ",", ";"]
 
 # These are now command line parameters! See below...
-USE_DELTATFIDF = True # Martineau and Finn. Excludes some other parameters (e.g. frequency)
+USE_DELTATFIDF = False # Martineau and Finn. Excludes some other parameters (e.g. frequency)
 USE_PRESENCE = False # If true, use presence rather than frequency.
 USE_POS_TAGS = False
 USE_ADJ_ONLY = False
 USE_NEGATION = True
 USE_POSITION = False
 GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
-NUM_FOLDS = 10 # For cross-validation (Pang & Lee used 3)
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finn used 10)
 
 MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
 EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
@@ -97,6 +97,12 @@ try:
         elif args[i] == "--epsilon":
             EPSILON = float(args[i+1])
             i += 2
+        elif args[i] == "--use-amazon":
+            USE_AMAZON = True
+            i += 1
+        elif args[i] == "--use-delta":
+            USE_DELTATFIDF = True
+            i += 1
         elif args[i] == "--help":
             print "Usage:"
             print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
@@ -110,6 +116,8 @@ try:
             print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
             print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
             print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
+            print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)"
+            print "--use-delta\t\tUse Delta TFIDF. (Default: Off)"
             exit()
         else:
             print "Error: Invalid argument", args[i]
@@ -119,15 +127,21 @@ except Exception:
 
 t0 = time.time()
 
+positive_ids = []
+negative_ids = []
+
 if USE_AMAZON:
     # Load the mixed Amazon review dataset.
     (ids, reviews, labels) = XMLParser.get_all_reviews()
+    for i in range(len(ids)):
+        if labels[i] == 1:
+            positive_ids.append(ids[i])
+        elif labels[i] == -1:
+            negative_ids.append(ids[i])
 else:
     # Load the Pang and Lee sentiment dataset.
     ids = movie_reviews.fileids()
     reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
-    positive_ids = []
-    negative_ids = []
     labels = []
     for id in ids:
         label = movie_reviews.categories(id)[0]
@@ -148,8 +162,10 @@ for i in range(len(reviews)):
         negative_reviews.append(reviews[i])
 
 #TEST
-#positive_reviews = random.sample(positive_reviews, 25)
-#negative_reviews = random.sample(negative_reviews, 25)
+#positive_reviews = positive_reviews[:200]
+#negative_reviews = negative_reviews[:600]
+#positive_reviews = random.sample(positive_reviews, 1000)
+#negative_reviews = random.sample(negative_reviews, 1000)
 
 # Partition reviews into folds.
 (pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
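make_folds itself is not part of this diff; judging from the call signature, something like this round-robin split is presumably what it does (the implementation is an assumption):

def make_folds(reviews, ids, num_folds):
    # Deal reviews (and their ids) into num_folds buckets in round-robin order.
    folds = [[] for _ in range(num_folds)]
    fold_ids = [[] for _ in range(num_folds)]
    for i in range(len(reviews)):
        folds[i % num_folds].append(reviews[i])
        fold_ids[i % num_folds].append(ids[i])
    return (folds, fold_ids)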
@@ -202,7 +218,7 @@ avg_acc = 0
 
 wordlist = total_word_counts.keys()
 
-f = open("results.txt", "w")
+#f = open("results.txt", "w")
 for i in range(NUM_FOLDS):
     pos_train_reviews = []
     neg_train_reviews = []
@@ -241,11 +257,12 @@ for i in range(NUM_FOLDS):
     predicted_labels = classifier.predict(test_vecs)
     acc = classifier.score(test_vecs, test_labels)
     for i in range(len(test_reviews)):
-        print "%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])
+        #f.write("%s\t%d\t%d\n" % (test_ids[i], test_labels[i], predicted_labels[i]))
+        print("%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i]))
 
     avg_acc += acc
 
-f.close()
+#f.close()
 
 t2 = time.time()
 avg_acc /= NUM_FOLDS
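With the new options in place, a run against the Amazon data with Delta TFIDF would look something like this (flags as documented in the help text above):

python review_svm.py --use-amazon --use-delta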
5 changes: 4 additions & 1 deletion svm.bat
@@ -11,4 +11,7 @@ echo (5) Unigrams + POS tags
 python review_svm.py --gram-length 1 --presence --use-pos-tags
 
 echo (6) Adjectives
-python review_svm.py --gram-length 1 --presence --use-pos-tags --use-adj-only
\ No newline at end of file
+python review_svm.py --gram-length 1 --presence --use-pos-tags --use-adj-only
+
+echo (8) Unigrams + Position
+python review_svm.py --gram-length 1 --presence --use-position
