Commit

refactored lexicons
job13011 committed Apr 12, 2016
1 parent 6c944e0 commit 1a5c75f
Showing 1 changed file with 10 additions and 121 deletions.
131 changes: 10 additions & 121 deletions GlossCountJWB.py → GlossLexicon.py
@@ -9,21 +9,19 @@ from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.corpus import wordnet as wn
import nltk.classify.util
from nltk.corpus import movie_reviews

import BagOfWords
import MPQALexicon
import AniaLexicon

EXPAND_ITERATIONS = 2
CLASSIFIER = "me" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
REMOVE_STOPWORDS = False
USE_STEMMING = False
USE_EXAMPLES = True

USE_EQUAL_TRAINING = True
USE_EQUAL_TEST = True
USE_PARSING = True

POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']
@@ -91,71 +89,10 @@ def expand_sets(positive, negative, neutral):
newNeutral.add(ant)
return (newPositive, newNegative, newNeutral)
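
# The body of expand_sets is collapsed in this view. As a rough, hypothetical
# sketch of the WordNet synonym/antonym expansion it appears to perform
# (NLTK 3.x API; the name expand_sets_sketch and all details below are
# illustrative, not from this commit -- the real function also routes some
# antonyms to the neutral set, per the fragment above):
def expand_sets_sketch(positive, negative, neutral):
    newPositive, newNegative = set(positive), set(negative)
    newNeutral = set(neutral)
    # Expand each seed list; synonyms inherit the seed's polarity, antonyms flip it.
    for (seeds, same, opposite) in [(positive, newPositive, newNegative),
                                    (negative, newNegative, newPositive)]:
        for word in seeds:
            for syn in wn.synsets(word, pos=wn.ADJ):
                for lemma in syn.lemmas():
                    same.add(lemma.name())
                    for ant in lemma.antonyms():
                        opposite.add(ant.name())
    return (newPositive, newNegative, newNeutral)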

def get_label(id):
return movie_reviews.categories(fileids=[id])[0]

def do_stem(text):
global stemmer
return [stemmer.stem(word) for word in text]
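# e.g. do_stem(["running", "runs"]) -> ["run", "run"] under the Porter stemmer
# instantiated near the bottom of the file (illustrative note, not from this commit).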

# new and improved finite state machine
# states are as follows:
# 0 - base
# 1 - negator found
# 2 - intensifier found
# 3 - un-intensifier found (unused)
# 4 - negator + intensifier found
def calculate_score(text, lexicon):
negators = ["not", "n't", "hardly", "barely"]
intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
if USE_STEMMING:
negators = do_stem(negators)
intensifiers = do_stem(intensifiers)

punctuation = [".", "!", "?", ",", ";", '(', ')']
state = 0
score = 0
num_double = 0
num_single = 0
num_neg = 0
num_halfneg = 0
for word in text:
if state == 0:
if lexicon.has_key(word):
score += lexicon[word]
num_single += 1
elif word in negators:
state = 1
elif word in intensifiers:
state = 2
elif state == 1:
if lexicon.has_key(word):
score += -1 * lexicon[word]
num_neg += 1
state = 0
elif word in intensifiers:
state = 4
else:
state = 0
elif state == 2:
if lexicon.has_key(word):
score += 2 * lexicon[word]
num_double += 1
state = 0
else:
state = 0
elif state == 3:
pass #TODO
elif state == 4:
if lexicon.has_key(word):
score += -0.5 * lexicon[word]
num_halfneg += 1
state = 0
else:
state = 0
#print num_single, num_neg, num_double, num_halfneg
return score
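
# Worked examples for the FSM above, with the toy lexicon {"good": 1.0}
# (illustrative only, not part of this commit):
#   "a good movie"   -> +1.0  (state 0: plain lexicon hit)
#   "not good"       -> -1.0  (state 1: negator flips the sign)
#   "very good"      -> +2.0  (state 2: intensifier doubles the weight)
#   "not very good"  -> -0.5  (state 4: negator + intensifier halves and flips)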

def create_lexicon(words, labels):
lexicon = {}
for i in range(len(words)):
@@ -164,14 +101,14 @@ def create_lexicon(words, labels):
lexicon[word] = label
return lexicon
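
# Presumably pairs words[i] with labels[i] (the middle of the loop is collapsed
# in this view), e.g. create_lexicon(["good", "bad"], [1.0, -1.0]) would yield
# {"good": 1.0, "bad": -1.0}.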

-def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
+def create(test_words, test_labels):
# Set up initial Sets S_p and S_n
neutral = []
#positive = ['good']
#negative = ['bad']

-    positive = [word for word in pos_seed]
-    negative = [word for word in neg_seed]
+    positive = [word for word in POS_SEED]
+    negative = [word for word in NEG_SEED]
# Expand on Sets to get S_p' and S_n'
for num in range(EXPAND_ITERATIONS):
(positive, negative, neutral) = expand_sets(positive,negative,neutral)
@@ -217,12 +154,14 @@ def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]

predicted_labels = classifier.predict(test_vecs)
"""
correct = 0
for i in range(len(test_labels)):
if test_labels[i] == predicted_labels[i]:
correct += 1
print "Lexicon accuracy:", correct/len(test_labels)
"""

word_labels = {}
for i in range(len(test_words)):
@@ -244,59 +183,9 @@ def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
lexicon[word] = 1

for word in neg_words:
-        lexicon[word] = -1
+        #lexicon[word] = -1
+        lexicon[word] = -1.5

return lexicon
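
# Usage sketch for the refactored entry point (illustrative; the load(True) call
# and return shape are taken from the script code further down):
#   (test_words, test_labels) = MPQALexicon.load(True)
#   lexicon = create(test_words, test_labels)  # dict: word -> 1 (pos) or -1.5 (neg)
#   score = sum(lexicon.get(w, 0) for w in "a very good movie".split())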

if USE_STEMMING:
stemmer = nltk.stem.porter.PorterStemmer()

# Load the test set. A few options here.
(test_words, test_labels) = MPQALexicon.load(True)
#(test_words, test_labels) = AniaLexicon.load()
if USE_STEMMING:
test_words = do_stem(test_words)

lexicon = create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels)
#lexicon = create_seed_lexicon(POS_SEED, NEG_SEED)
#lexicon = create_lexicon(test_words, test_labels)

# Iterate through all of the reviews and compute scores by taking the sum of their
# component lexicon words. Includes rudimentary negation testing.
correct = 0
positive = 0
ids = sorted(movie_reviews.fileids())
scores = []

for id in ids:
words = list(movie_reviews.words(fileids=[id]))
if USE_STEMMING:
words = do_stem(words)
if USE_PARSING:
scores.append(calculate_score(words, lexicon))
else:
score = 0
x = 0
for word in words:
if lexicon.has_key(word):
score += lexicon[word]
x += 1
scores.append(score)
print score, x

for i in range(len(ids)):
id = ids[i]
score = scores[i]
if score >= 0:
sent_value = "pos"
positive += 1
#print id, sent_value
elif score < 0:
sent_value = "neg"
#print id, sent_value
label = get_label(id)
if sent_value == label:
correct += 1

print "correct:", correct/len(ids)
print "positive:", positive/len(ids)
#create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels)
