Commit
Twitter corpus; random changes from last week
job13011 committed Apr 26, 2016
1 parent 971ba40 commit 7b5edc3
Showing 9 changed files with 300 additions and 193 deletions.
22 changes: 10 additions & 12 deletions BagOfWords.py
@@ -14,17 +14,17 @@ ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
POSITION_THRESHOLDS = [0.25, 0.75, 1]

# ref_bag is used to calculate the total word count across all documents.
def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=True, use_hash=False):
def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=False):
bag_of_words = {}
if use_negation:
do_negation = False

if use_pos_tags:
tagged = nltk.pos_tag(words)
#tagged = nltk.pos_tag(words)
tagged = tagger.tag(words) # this is much much faster !!!
words = [string.join(t, "_") for t in tagged]
for i in range(len(words) - gram_length + 1):
n_gram = string.join(words[i:i+gram_length], "_")

if use_negation:
if (gram_length == 1): # Pang and Lee didn't do negation tagging for bigrams.
if n_gram in NEGATION_WORDS:
@@ -39,13 +39,8 @@ def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fa
if i/len(words) < POSITION_THRESHOLDS[j]:
n_gram += POSITION_TAGS[j]
break

# LIBSVM won't use strings as keys, so hash to convert to a number.
if use_hash:
index = hash(n_gram)
else:
index = n_gram

index = n_gram
if not (use_pos_tags and use_adj_only and (tagged[i][1] not in ADJECTIVE_TAGS)):
if (not use_presence) and bag_of_words.has_key(index):
bag_of_words[index] += 1
@@ -58,7 +53,8 @@ def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=Fa
ref_bag[index] += 1
else:
ref_bag[index] = 1


#length-normalize
if normalize:
length = 0
for k in bag_of_words.keys():
@@ -84,7 +80,7 @@ def make_tfidf(document, documents):

# As per Martineau and Finin (2009), create a bag of words using delta TFIDF as the feature value.
# Todo: Bigrams?
def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag):
def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag, use_pos_tags=False):
bag = {}
factor = 0
for term in set(document):
@@ -112,4 +108,6 @@ def to_vector(bag, wordlist):
else:
vec.append(0)
return vec
#return numpy.array(vec).reshape(1,-1)
#return numpy.array(vec).reshape(1,-1)

tagger = nltk.tag.perceptron.PerceptronTagger()
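
Not part of the commit, but for context: a minimal usage sketch of the updated make() signature, assuming BagOfWords.py and NLTK are importable and a tokenized document is on hand.

import nltk
import BagOfWords

words = nltk.word_tokenize("The acting was not good , but the soundtrack was great .")
ref_bag = {}
bag = BagOfWords.make(words, ref_bag=ref_bag, use_negation=True, use_presence=True)
# bag maps each (possibly negation-tagged) unigram to 1 because use_presence is set;
# ref_bag accumulates counts across every document passed in, per the comment above.
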
9 changes: 3 additions & 6 deletions GlossLexicon.py
@@ -12,10 +12,10 @@ from nltk.corpus import wordnet as wn

import BagOfWords

EXPAND_ITERATIONS = 3
EXPAND_ITERATIONS = 2
CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
REMOVE_STOPWORDS = True
USE_STEMMING = True # sync this up with eval!
REMOVE_STOPWORDS = False
USE_STEMMING = False # sync this up with eval!
USE_EXAMPLES = True

USE_EQUAL_TRAINING = True
@@ -144,14 +144,12 @@ def create(test_words, test_labels):
test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]

predicted_labels = classifier.predict(test_vecs)
"""
correct = 0
for i in range(len(test_labels)):
if test_labels[i] == predicted_labels[i]:
correct += 1

print "Lexicon accuracy:", correct/len(test_labels)
"""

word_labels = {}
for i in range(len(test_words)):
@@ -173,7 +171,6 @@ def create(test_words, test_labels):
lexicon[word] = 1

for word in neg_words:
#lexicon[word] = -1
lexicon[word] = -1

return lexicon
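
Not part of the commit: a hypothetical sketch of how the create() function above might be driven, assuming MPQALexicon.load() supplies the (words, labels) pair the same way LexiconEval.py below uses it.

import MPQALexicon
import GlossLexicon

(test_words, test_labels) = MPQALexicon.load(False)
lexicon = GlossLexicon.create(test_words, test_labels)
# lexicon maps each word to +1 (positive) or -1 (negative), as assigned at the end of create().
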
24 changes: 17 additions & 7 deletions LexiconEval.py
@@ -7,13 +7,15 @@ from nltk.corpus import movie_reviews
import MPQALexicon
import AniaLexicon
import GlossLexicon
import LexFromFile
import XMLParser
import TwitterCorpus

USE_STEMMING = True # sync this up with lexicon!
USE_STEMMING = False # sync this up with lexicon!
USE_PARSING = True
LEX_ALG = "gloss" # "gloss", "conjunction", "none"
LEX_SOURCE = "mpqa" # "mpqa", "ania"
CORPUS = "movies" # "amazon", "movies"
CORPUS = "movies" # "amazon", "movies", "twitter"
NEG_MOD = 1.5 # Taboada suggested 1.5.

# new and improved finite state machine
@@ -115,6 +117,8 @@ try:
CORPUS = "movies"
elif args[i+1] == "amazon":
CORPUS = "amazon"
elif args[i+1] == "twitter":
CORPUS = "twitter"
i += 2
elif args[i] == "--help":
print "Usage:"
@@ -127,6 +131,7 @@ try:
print " - ania: Use the hand-labeled lexicon from the Brown corpus"
print "--corpus X: Choose the data set to test on"
print " - amazon: Use the Amazon data set"
print " - twitter: Use the Twitter data set"
print " - movies: Use the Pang&Lee movie data set (default)"
exit()
else:
@@ -142,7 +147,7 @@ print "Corpus =", CORPUS

# Load the test set. A few options here.
if LEX_SOURCE == "mpqa":
(test_words, test_labels) = MPQALexicon.load(True)
(test_words, test_labels) = MPQALexicon.load(False)
elif LEX_SOURCE == "ania":
(test_words, test_labels) = AniaLexicon.load()
else:
@@ -164,10 +169,12 @@ if LEX_ALG != "none":
correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
lex_acc = correct/len(lexicon.items())
print "Lexicon accuracy:", lex_acc


# TODO refactor me again.
#lexicon = LexFromFile.lexfromfile("cblex.txt")
for key in lexicon.keys():
if lexicon[key] < 0: lexicon[key] *= NEG_MOD

if CORPUS == "movies":
ids = movie_reviews.fileids()
reviews = [list(movie_reviews.words(fileids=[id])) for id in ids]
@@ -180,10 +187,12 @@ if CORPUS == "movies":
labels.append(-1)
elif CORPUS == "amazon":
(ids, reviews, labels) = XMLParser.get_all_reviews()
elif CORPUS == "twitter":
(ids, reviews, labels) = TwitterCorpus.load() #they're not reviews but we'll let it slide.
else:
print "Invalid corpus!"
exit()

"""
# It feels like there should be a more efficient way to do this.
shuffled = zip(ids,reviews,labels)
@@ -192,6 +201,8 @@ ids = [x[0] for x in shuffled]
reviews = [x[1] for x in shuffled]
labels = [x[2] for x in shuffled]
"""
#for k in lexicon.keys():
# lexicon[k] *= -1

# Iterate through all of the reviews and compute scores by taking the sum of their
# component lexicon words. Includes rudimentary negation testing.
@@ -203,7 +214,6 @@ for i in range(len(reviews)):
words = reviews[i]
if USE_STEMMING:
words = do_stem(words)

if USE_PARSING:
score = calculate_score(words, lexicon)
else:
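
Not part of the commit: the scoring loop above boils down to summing lexicon values over a document's tokens (negative entries having already been scaled by NEG_MOD); a minimal sketch of that idea, leaving out the negation handling done by calculate_score.

def score_document(words, lexicon):
    # Sum the lexicon values of the words we know; the sign of the total gives the predicted label.
    return sum(lexicon.get(word, 0) for word in words)
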
2 changes: 1 addition & 1 deletion TFIDF.py
@@ -44,6 +44,6 @@ def tfidf(term, document, documents, idfs={}):
tfidf = (1 + math.log(doc_appearances,10)) * idf
return tfidf

# Martineau and Finn 2009
# Martineau and Finin 2009
def delta_tfidf(term, document, positive_set, negative_set, pos_idfs={}, neg_idfs={}):
return tfidf(term, document, positive_set, pos_idfs) - tfidf(term, document, negative_set, neg_idfs)
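
Not part of the commit, but for reference: delta TF-IDF (Martineau and Finin, 2009) scores a term by the difference between its TF-IDF weight computed against the positive training set and its weight against the negative training set, so the sign of the feature indicates which class the term is evidence for. A rough sketch of that idea, reusing the sublinear term weighting visible above; the idf computation in TFIDF.py is not shown in this diff and is assumed to be precomputed here.

import math

def delta_tfidf_sketch(term_count, pos_idf, neg_idf):
    # term_count: occurrences of the term in the document being scored
    # pos_idf / neg_idf: the term's idf within the positive / negative training set
    if term_count == 0:
        return 0.0
    tf = 1 + math.log(term_count, 10)
    return tf * pos_idf - tf * neg_idf
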
30 changes: 30 additions & 0 deletions TwitterCorpus.py
@@ -0,0 +1,30 @@
import nltk
import string
import random

def load(sample=True):
CONJUNCTIONS = ["and", "but", "or"]

f = open("Sentiment Analysis Dataset.csv")
lines = f.readlines()
#lines = lines[:1000]
f.close()
if sample:
lines = random.sample(lines, 10000)
ids = []
tweets = []
labels = []
for line in lines[1:]:
line = line.replace("\"", "").strip()
line2 = ""
for c in line:
if ord(c) < 128: line2 += c
terms = line2.split(",")
id = terms[0]
label = int(terms[1])
if label == 0: label = -1
tweet = terms[3]
ids.append(id)
tweets.append(nltk.word_tokenize(tweet))
labels.append(label)
return (ids, tweets, labels)
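
Not part of the commit: a hypothetical usage sketch for the new loader, assuming "Sentiment Analysis Dataset.csv" sits in the working directory with the header row and id,label,source,text column layout the parser expects.

import TwitterCorpus

(ids, tweets, labels) = TwitterCorpus.load()  # sample=True draws 10,000 random lines
# Each entry of tweets is a list of NLTK tokens; labels are -1 where the CSV says 0,
# otherwise the CSV value (presumably 1) is kept.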
