Commit
Twitter corpus; random changes from last week
job13011 committed Apr 26, 2016
1 parent 971ba40 commit 7b5edc3
Showing 9 changed files with 300 additions and 193 deletions.
22 changes: 10 additions & 12 deletions BagOfWords.py
@@ -14,17 +14,17 @@ ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
POSITION_THRESHOLDS = [0.25, 0.75, 1]

# ref_bag is used to calculate the total word count across all documents.
def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=False):
def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=False):
bag_of_words = {}
if use_negation:
do_negation = False

if use_pos_tags:
tagged = nltk.pos_tag(words)
#tagged = nltk.pos_tag(words)
tagged = tagger.tag(words) # this is much much faster !!!
words = [string.join(t, "_") for t in tagged]
for i in range(len(words) - gram_length + 1):
n_gram = string.join(words[i:i+gram_length], "_")

if use_negation:
if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
@@ -39,13 +39,8 @@ def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fa
if i/len(words) < POSITION_THRESHOLDS[j]:
n_gram += POSITION_TAGS[j]
break

# LIBSVM won't use strings as keys, so hash to convert to a number.
if use_hash:
index = hash(n_gram)
else:
index = n_gram

index = n_gram
if not (use_pos_tags and use_adj_only and (tagged[i][1] not in ADJECTIVE_TAGS)):
if (not use_presence) and bag_of_words.has_key(index):
bag_of_words[index] += 1
@@ -58,7 +53,8 @@ def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fa
ref_bag[index] += 1
else:
ref_bag[index] = 1


#length-normalize
if normalize:
length = 0
for k in bag_of_words.keys():
@@ -84,7 +80,7 @@ def make_tfidf(document, documents):

# As per Martineau and Finin (2009), create a bag of words using delta TFIDF as the feature value.
# Todo: Bigrams?
def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag):
def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag, use_pos_tags=False):
bag = {}
factor = 0
for term in set(document):
@@ -112,4 +108,6 @@ def to_vector(bag, wordlist):
else:
vec.append(0)
return vec
#return numpy.array(vec).reshape(1,-1)
#return numpy.array(vec).reshape(1,-1)

tagger = nltk.tag.perceptron.PerceptronTagger()
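
Not part of the commit, but for context: a minimal usage sketch of the updated make() signature, assuming BagOfWords.py and NLTK are importable and a tokenized document is on hand.

import nltk
import BagOfWords

words = nltk.word_tokenize("The acting was not good , but the soundtrack was great .")
ref_bag = {}
bag = BagOfWords.make(words, ref_bag=ref_bag, use_negation=True, use_presence=True)
# bag maps each (possibly negation-tagged) unigram to 1 because use_presence is set;
# ref_bag accumulates counts across every document passed in, per the comment above.
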
9 changes: 3 additions & 6 deletions GlossLexicon.py
@@ -12,10 +12,10 @@ from nltk.corpus import wordnet as wn

import BagOfWords

EXPAND_ITERATIONS = 3
EXPAND_ITERATIONS = 2
CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
REMOVE_STOPWORDS = True
USE_STEMMING = True # sync this up with eval!
REMOVE_STOPWORDS = False
USE_STEMMING = False # sync this up with eval!
USE_EXAMPLES = True

USE_EQUAL_TRAINING = True
@@ -144,14 +144,12 @@ def create(test_words, test_labels):
test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]

predicted_labels = classifier.predict(test_vecs)
"""
correct = 0
for i in range(len(test_labels)):
if test_labels[i] == predicted_labels[i]:
correct += 1

print "Lexicon accuracy:", correct/len(test_labels)
"""

word_labels = {}
for i in range(len(test_words)):
@@ -173,7 +171,6 @@ def create(test_words, test_labels):
lexicon[word] = 1

for word in neg_words:
#lexicon[word] = -1
lexicon[word] = -1

return lexicon
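
Not part of the commit: a hypothetical sketch of how the create() function above might be driven, assuming MPQALexicon.load() supplies the (words, labels) pair the same way LexiconEval.py below uses it.

import MPQALexicon
import GlossLexicon

(test_words, test_labels) = MPQALexicon.load(False)
lexicon = GlossLexicon.create(test_words, test_labels)
# lexicon maps each word to +1 (positive) or -1 (negative), as assigned at the end of create().
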
24 changes: 17 additions & 7 deletions LexiconEval.py
@@ -7,13 +7,15 @@ from nltk.corpus import movie_reviews
import MPQALexicon
import AniaLexicon
import GlossLexicon
import LexFromFile
import XMLParser
import TwitterCorpus

USE_STEMMING = True # sync this up with lexicon!
USE_STEMMING = False # sync this up with lexicon!
USE_PARSING = True
LEX_ALG = "gloss" # "gloss", "conjunction", "none"
LEX_SOURCE = "mpqa" # "mpqa", "ania"
CORPUS = "movies" # "amazon", "movies"
CORPUS = "movies" # "amazon", "movies", "twitter"
NEG_MOD = 1.5 # Taboada suggested 1.5.

# new and improved finite state machine
@@ -115,6 +117,8 @@ try:
CORPUS = "movies"
elif args[i+1] == "amazon":
CORPUS = "amazon"
elif args[i+1] == "twitter":
CORPUS = "twitter"
i += 2
elif args[i] == "--help":
print "Usage:"
@@ -127,6 +131,7 @@ try:
print " - ania: Use the hand-labeled lexicon from the Brown corpus"
print "--corpus X: Choose the data set to test on"
print " - amazon: Use the Amazon data set"
print " - twitter: Use the Twitter data set"
print " - movies: Use the Pang&Lee movie data set (default)"
exit()
else:
@@ -142,7 +147,7 @@ print "Corpus =", CORPUS

# Load the test set. A few options here.
if LEX_SOURCE == "mpqa":
(test_words, test_labels) = MPQALexicon.load(True)
(test_words, test_labels) = MPQALexicon.load(False)
elif LEX_SOURCE == "ania":
(test_words, test_labels) = AniaLexicon.load()
else:
@@ -164,10 +169,12 @@ if LEX_ALG != "none":
correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
lex_acc = correct/len(lexicon.items())
print "Lexicon accuracy:", lex_acc


# TODO refactor me again.
#lexicon = LexFromFile.lexfromfile("cblex.txt")
for key in lexicon.keys():
if lexicon[key] < 0: lexicon[key] *= NEG_MOD

if CORPUS == "movies":
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(fileids=[id])) for id in ids]
@@ -180,10 +187,12 @@ if CORPUS == "movies":
labels.append(-1)
elif CORPUS == "amazon":
(ids, reviews, labels) = XMLParser.get_all_reviews()
elif CORPUS == "twitter":
(ids, reviews, labels) = TwitterCorpus.load() #they're not reviews but we'll let it slide.
else:
print "Invalid corpus!"
exit()

"""
# It feels like there should be a more efficient way to do this.
shuffled = zip(ids,reviews,labels)
@@ -192,6 +201,8 @@ ids = [x[0] for x in shuffled]
reviews = [x[1] for x in shuffled]
labels = [x[2] for x in shuffled]
"""
#for k in lexicon.keys():
# lexicon[k] *= -1

# Iterate through all of the reviews and compute scores by taking the sum of their
# component lexicon words. Includes rudimentary negation testing.
@@ -203,7 +214,6 @@ for i in range(len(reviews)):
words = reviews[i]
if USE_STEMMING:
words = do_stem(words)

if USE_PARSING:
score = calculate_score(words, lexicon)
else:
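
Not part of the commit: the scoring loop above boils down to summing lexicon values over a document's tokens (negative entries having already been scaled by NEG_MOD); a minimal sketch of that idea, leaving out the negation handling done by calculate_score.

def score_document(words, lexicon):
    # Sum the lexicon values of the words we know; the sign of the total gives the predicted label.
    return sum(lexicon.get(word, 0) for word in words)
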
2 changes: 1 addition & 1 deletion TFIDF.py
@@ -44,6 +44,6 @@ def tfidf(term, document, documents, idfs={}):
tfidf = (1 + math.log(doc_appearances,10)) * idf
return tfidf

# Martineau and Finn 2009
# Martineau and Finin 2009
def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
return tfidf(term, document, positive_set, pos_idfs) - tfidf(term, document, negative_set, neg_idfs)
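
Not part of the commit, but for reference: delta TF-IDF (Martineau and Finin, 2009) scores a term by the difference between its TF-IDF weight computed against the positive training set and its weight against the negative training set, so the sign of the feature indicates which class the term is evidence for. A rough sketch of that idea, reusing the sublinear term weighting visible above; the idf computation in TFIDF.py is not shown in this diff and is assumed to be precomputed here.

import math

def delta_tfidf_sketch(term_count, pos_idf, neg_idf):
    # term_count: occurrences of the term in the document being scored
    # pos_idf / neg_idf: the term's idf within the positive / negative training set
    if term_count == 0:
        return 0.0
    tf = 1 + math.log(term_count, 10)
    return tf * pos_idf - tf * neg_idf
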
30 changes: 30 additions & 0 deletions TwitterCorpus.py
@@ -0,0 +1,30 @@
import nltk
import string
import random

def load(sample=True):
CONJUNCTIONS = ["and", "but", "or"]

f = open("Sentiment Analysis Dataset.csv")
lines = f.readlines()
#lines = lines[:1000]
f.close()
if sample:
lines = random.sample(lines, 10000)
ids = []
tweets = []
labels = []
for line in lines[1:]:
line = line.replace("\"", "").strip()
line2 = ""
for c in line:
if ord(c) < 128: line2 += c
terms = line2.split(",")
id = terms[0]
label = int(terms[1])
if label == 0: label = -1
tweet = terms[3]
ids.append(id)
tweets.append(nltk.word_tokenize(tweet))
labels.append(label)
return (ids, tweets, labels)
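
Not part of the commit: a hypothetical usage sketch for the new loader, assuming "Sentiment Analysis Dataset.csv" sits in the working directory with the header row and id,label,source,text column layout the parser expects.

import TwitterCorpus

(ids, tweets, labels) = TwitterCorpus.load()  # sample=True draws 10,000 random lines
# Each entry of tweets is a list of NLTK tokens; labels are -1 where the CSV says 0,
# otherwise the CSV value (presumably 1) is kept.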
