Commit 6d16225
missed some changes
job13011 committed Apr 13, 2016
1 parent efbde4b commit 6d16225
Showing 7 changed files with 79 additions and 44 deletions.
10 changes: 1 addition & 9 deletions GlossLexicon.py
@@ -93,14 +93,6 @@ def do_stem(text):
     global stemmer
     return [stemmer.stem(word) for word in text]
 
-def create_lexicon(words, labels):
-    lexicon = {}
-    for i in range(len(words)):
-        word = words[i]
-        label = labels[i]
-        lexicon[word] = label
-    return lexicon
-
 def create(test_words, test_labels):
     # Set up initial Sets S_p and S_n
     neutral = []
@@ -184,7 +176,7 @@ def create(test_words, test_labels):
 
     for word in neg_words:
         #lexicon[word] = -1
-        lexicon[word] = -1.5
+        lexicon[word] = -1
 
     return lexicon
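The net effect here is that stored polarities return to a symmetric +1/-1, and the extra weighting of negative words moves into scoring (the EXPERIMENTAL line in LexiconEval.py below). A minimal sketch of that division of labor, with NEGATIVE_BOOST as a hypothetical name for the 1.5 factor:

NEGATIVE_BOOST = 1.5  # hypothetical name for the factor LexiconEval applies at scoring time

def score_word(lexicon, word):
    # Polarities stay symmetric (+1/-1) in the lexicon; negatives are boosted on lookup.
    word_score = lexicon.get(word, 0)
    if word_score < 0:
        word_score *= NEGATIVE_BOOST
    return word_score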
32 changes: 24 additions & 8 deletions LexiconEval.py
@@ -14,6 +14,7 @@ LEX_ALG = "gloss"
 LEX_SOURCE = "mpqa"
 
 # new and improved finite state machine
+# kinda-sorta based on Taboada 2011.
 # states are as follows:
 # 0 - base
 # 1 - negator found
@@ -35,17 +36,21 @@ def calculate_score(text, lexicon):
     num_neg = 0
     num_halfneg = 0
     for word in text:
+        if lexicon.has_key(word):
+            word_score = lexicon[word]
+            # EXPERIMENTAL
+            if word_score < 0: word_score *= 1.5
         if state == 0:
             if lexicon.has_key(word):
-                score += lexicon[word]
+                score += word_score
                 num_single += 1
             elif word in negators:
                 state = 1
             elif word in intensifiers:
                 state = 2
         elif state == 1:
             if lexicon.has_key(word):
-                score += -1 * lexicon[word]
+                score += -1 * word_score
                 num_neg += 1
                 state = 0
             elif word in intensifiers:
@@ -54,7 +59,7 @@
                 state = 0
         elif state == 2:
             if lexicon.has_key(word):
-                score += 2 * lexicon[word]
+                score += 2 * word_score
                 num_double += 1
                 state = 0
             else:
@@ -63,7 +68,7 @@
             pass #TODO
         elif state == 4:
             if lexicon.has_key(word):
-                score += -0.5 * lexicon[word]
+                score += -0.5 * word_score
                 num_halfneg += 1
                 state = 0
             else:
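For readers following the state machine, here is a tiny self-contained sketch of the two common paths (negation and intensification) with the EXPERIMENTAL negative boost included. The word lists and lexicon are made-up stand-ins, and states 3 and 4 are omitted:

negators = ["not", "never"]        # assumed stand-in contents
intensifiers = ["very", "really"]  # assumed stand-in contents
lexicon = {"good": 1, "bad": -1}   # toy lexicon

def toy_score(text):
    score = 0.0
    state = 0
    for word in text:
        if word in lexicon:
            word_score = lexicon[word]
            if word_score < 0:
                word_score *= 1.5        # the EXPERIMENTAL negative boost
        if state == 0:
            if word in lexicon:
                score += word_score
            elif word in negators:
                state = 1
            elif word in intensifiers:
                state = 2
        elif state == 1:                 # negator seen: flip the next lexicon hit
            if word in lexicon:
                score += -1 * word_score
            state = 0
        elif state == 2:                 # intensifier seen: double the next lexicon hit
            if word in lexicon:
                score += 2 * word_score
            state = 0
    return score

print(toy_score("not good".split()))   # -1.0 (flipped)
print(toy_score("very bad".split()))   # -3.0 (doubled, after the 1.5 boost)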
@@ -77,6 +82,15 @@ def do_stem(text):
 
 def get_label(id):
     return movie_reviews.categories(fileids=[id])[0]
 
+# Used to create a lexicon instance from the words + labels directly (i.e. without using an algorithm)
+def create_lexicon(words, labels):
+    lexicon = {}
+    for i in range(len(words)):
+        word = words[i]
+        label = labels[i]
+        lexicon[word] = label
+    return lexicon
+
 i = 0
 try:
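Functionally, the new helper is just dict(zip(words, labels)); a quick sanity check with hypothetical inputs:

words = ["good", "awful"]   # hypothetical test_words
labels = [1, -1]            # hypothetical test_labels
lexicon = dict(zip(words, labels))
assert lexicon == {"good": 1, "awful": -1}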
@@ -87,6 +101,8 @@ try:
             LEX_ALG = "gloss"
         elif args[i+1] == "conjunction":
             LEX_ALG = "conjunction"
+        elif args[i+1] == "none":
+            LEX_ALG = "none"
         else:
             print "Invalid algorithm"
         i += 2
@@ -137,9 +153,10 @@ elif LEX_ALG == "conjunction":
 elif LEX_ALG == "none":
     lexicon = create_lexicon(test_words, test_labels)
 
-correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
-lex_acc = correct/len(lexicon.items())
-print "Lexicon accuracy:", lex_acc
+if LEX_ALG != "none":
+    correct = len([(word, label) for (word, label) in zip(test_words, test_labels) if lexicon.has_key(word) and label == lexicon[word]])
+    lex_acc = correct/len(lexicon.items())
+    print "Lexicon accuracy:", lex_acc
 
 # Iterate through all of the reviews and compute scores by taking the sum of their
 # component lexicon words. Includes rudimentary negation testing.
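A side note on the lex_acc line above: under Python 2, which the print statements imply, correct/len(lexicon.items()) is integer division, so any accuracy below 100% truncates to 0. A float cast would sidestep that:

lex_acc = float(correct) / len(lexicon)   # len(lexicon) equals len(lexicon.items())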
@@ -159,7 +176,6 @@ for id in ids:
     for word in words:
         if lexicon.has_key(word):
             score += lexicon[word]
-            x += 1
     scores.append(score)
     #print id, score
 
2 changes: 1 addition & 1 deletion TFIDF.py
@@ -20,7 +20,7 @@ def compute_idfs(documents):
 
 def tfidf(term, document, documents, idfs={}):
     if idfs == {}:
-        all_doc_appearances = sum([doc for doc in documents if term in doc])
+        all_doc_appearances = len([doc for doc in documents if term in doc])
         idf = math.log(len(documents)/all_doc_appearances, 10)
     else:
         if idfs.has_key(term):
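The fix matters because document frequency is a count of documents containing the term; summing the document lists themselves was never meaningful. A toy check of the corrected line, with hypothetical documents (note the float cast, since len(documents)/all_doc_appearances would itself truncate under Python 2):

import math

documents = [["good", "movie"], ["bad", "movie"], ["good", "plot"]]
term = "good"
all_doc_appearances = len([doc for doc in documents if term in doc])  # 2
idf = math.log(len(documents) / float(all_doc_appearances), 10)       # log10(3/2)
print(round(idf, 3))  # 0.176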
23 changes: 16 additions & 7 deletions XMLParser.py
@@ -14,11 +14,11 @@ class ReviewHandler(xml.sax.ContentHandler):
 
     def endElement(self, tag):
         if tag == 'unique_id':
-            self.ids.append(self.data.strip())
+            self.ids.append(clean_data(self.data.strip()))
         elif tag == 'title':
-            self.title = self.data.strip()
+            self.title = clean_data(self.data.strip())
         elif tag == 'review_text':
-            self.reviews.append(nltk.word_tokenize(self.title + ' ' + self.data.strip()))
+            self.reviews.append(nltk.word_tokenize(self.title + ' ' + clean_data(self.data.strip())))
 
     def characters(self, data):
         self.data += data
@@ -28,18 +28,27 @@ def get_reviews(filename):
     f = open(filename, 'rU')
 
     data = f.read()
+    cleaned_data = clean_data(data)
     # hack because review format doesn't have an enclosing tag
-    # hack because there's a malformed ampersand...
-    data = data.replace('&', '&amp;')
-    data = data.replace('\x1a', '')
-    xmldata = '<reviews>' + data + '</reviews>'
+    xmldata = '<reviews>' + cleaned_data + '</reviews>'
     f.close()
 
     handler = ReviewHandler()
     xml.sax.parseString(xmldata, handler)
     # Concatenate the review and title.
     return (handler.ids, handler.reviews)
 
+def clean_data(data):
+    # hack because there's a malformed ampersand...
+    cleaned_data = data.replace('&', '&amp;')
+    cleaned_data = cleaned_data.replace('\x1a', '')
+    # hack because there's a u'\ufffd'... wtf is that?
+    cleaned_data2 = ''
+    for char in cleaned_data:
+        if ord(char) < 255:
+            cleaned_data2 += char
+    return cleaned_data2
+
 def get_all_reviews():
     filenames = ['sorted_data_acl/books/positive.review', 'sorted_data_acl/books/negative.review',
                  'sorted_data_acl/dvd/positive.review', 'sorted_data_acl/dvd/negative.review',
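A quick illustration of what clean_data now strips, on a made-up string: the bare ampersand is escaped, the \x1a control character is dropped, and anything outside Latin-1 (such as U+FFFD, the replacement character) is filtered out. Note that ord(char) < 255 also drops U+00FF itself:

# -*- coding: utf-8 -*-
raw = u'Tom & Jerry\x1a \ufffd4 stars'   # hypothetical review text
cleaned = raw.replace('&', '&amp;').replace('\x1a', '')
cleaned = ''.join(char for char in cleaned if ord(char) < 255)
print(cleaned)   # Tom &amp; Jerry 4 stars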
16 changes: 7 additions & 9 deletions cblexicon.py
@@ -228,30 +228,28 @@ class cblexicon:
         # Generate dissimilarity matrix
         dis = vectorize(conjSet,defSet)
 
-        """
         # Its Cluster time
         set1 = defSet[len(defSet)//2:]
         set2 = defSet[:len(defSet)//2]
-        """
         set1 = random.sample(defSet, len(defSet)//4)
         set2 = [x for x in defSet if x not in set1]
-
         """
         # Optimize objective function
         sets = optimize2(set1,set2,conjSet,defSet,dis)
         set1 = sets[0]
         set2 = sets[1]
 
         print(set1)
         print(set2)
-        f = open('set1.txt', 'w+')
-        f2 = open('set1.txt', 'w+')
+        """
+        f1 = open('set1.txt', 'w+')
+        f2 = open('set2.txt', 'w+')
         for word in set1:
-            f.write(word + "/n")
+            f1.write(word + "\n")
         for word in set2:
-            f2.write(word + "/n")
-        """
-
+            f2.write(word + "\n")
+        f1.close()
+        f2.close()
 
 # Can we classify and then run bag of words?
 #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
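The corrected block now writes each cluster to its own file with real newlines (the old version opened set1.txt twice and wrote the literal characters /n). An equivalent sketch using context managers, which close the files even if a write fails, would be:

set1 = ["good", "great"]   # hypothetical cluster contents
set2 = ["bad", "poor"]

for path, cluster in [('set1.txt', set1), ('set2.txt', set2)]:
    with open(path, 'w') as f:
        for word in cluster:
            f.write(word + "\n")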
35 changes: 26 additions & 9 deletions review_svm.py
@@ -27,14 +27,14 @@ NEGATION_WORDS = ["not", "isn't", "didn't", "doesn't"]
 PUNCTUATION = [".", "!", "?", ",", ";"]
 
 # These are now command line parameters! See below...
-USE_DELTATFIDF = True # Martineau and Finn. Excludes some other parameters (e.g. frequency)
+USE_DELTATFIDF = False # Martineau and Finn. Excludes some other parameters (e.g. frequency)
 USE_PRESENCE = False # If true, use presence rather than frequency.
 USE_POS_TAGS = False
 USE_ADJ_ONLY = False
 USE_NEGATION = True
 USE_POSITION = False
 GRAM_LENGTH = 1 # Unigrams, bigrams, ... TODO use a range
-NUM_FOLDS = 10 # For cross-validation (Pang & Lee used 3)
+NUM_FOLDS = 3 # For cross-validation (Pang & Lee used 3, Martineau & Finn used 10)
 
 MIN_OCCURRENCES = 0#4 # To be included, the word must show up this many times across all documents (Pang and Lee used 4)
 EPSILON = .001 # determines how long the algorithm runs (default is 0.001)
@@ -97,6 +97,12 @@ try:
         elif args[i] == "--epsilon":
             EPSILON = float(args[i+1])
             i += 2
+        elif args[i] == "--use-amazon":
+            USE_AMAZON = True
+            i += 1
+        elif args[i] == "--use-delta":
+            USE_DELTATFIDF = True
+            i += 1
         elif args[i] == "--help":
             print "Usage:"
             print "--gram-length N\t\tUse groups of N consecutive words (Default: 1)"
@@ -110,6 +116,8 @@ try:
             print "--threshold N\t\tOnly include words that appear at least N times across all documents (Default: 4)"
             print "--epsilon X\t\tSVM parameter to control the number of iterations (Default: 0.001)"
             print "\t\t\t(0 < epsilon < 1; lower = more iterations)"
+            print "--use-amazon\t\tUse the Amazon data set rather than the movie review set. (Default: Off)"
+            print "--use-delta\t\tUse Delta TFIDF. (Default: Off)"
             exit()
         else:
             print "Error: Invalid argument", args[i]
@@ -119,15 +127,21 @@ except Exception:
 
 t0 = time.time()
 
+positive_ids = []
+negative_ids = []
+
 if USE_AMAZON:
     # Load the mixed Amazon review dataset.
     (ids, reviews, labels) = XMLParser.get_all_reviews()
+    for i in range(len(ids)):
+        if labels[i] == 1:
+            positive_ids.append(ids[i])
+        elif labels[i] == -1:
+            negative_ids.append(ids[i])
 else:
     # Load the Pang and Lee sentiment dataset.
     ids = movie_reviews.fileids()
     reviews = [list(movie_reviews.words(fileids = [id])) for id in ids]
-    positive_ids = []
-    negative_ids = []
     labels = []
     for id in ids:
         label = movie_reviews.categories(id)[0]
@@ -148,8 +162,10 @@ for i in range(len(reviews)):
         negative_reviews.append(reviews[i])
 
 #TEST
-#positive_reviews = random.sample(positive_reviews, 25)
-#negative_reviews = random.sample(negative_reviews, 25)
+#positive_reviews = positive_reviews[:200]
+#negative_reviews = negative_reviews[:600]
+#positive_reviews = random.sample(positive_reviews, 1000)
+#negative_reviews = random.sample(negative_reviews, 1000)
 
 # Partition reviews into folds.
 (pos_folds, pos_fold_ids) = make_folds(positive_reviews, positive_ids, NUM_FOLDS)
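make_folds itself is not part of this diff; judging from the call signature, something like this round-robin split is presumably what it does (the implementation is an assumption):

def make_folds(reviews, ids, num_folds):
    # Deal reviews (and their ids) into num_folds buckets in round-robin order.
    folds = [[] for _ in range(num_folds)]
    fold_ids = [[] for _ in range(num_folds)]
    for i in range(len(reviews)):
        folds[i % num_folds].append(reviews[i])
        fold_ids[i % num_folds].append(ids[i])
    return (folds, fold_ids)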
@@ -202,7 +218,7 @@ avg_acc = 0
 
 wordlist = total_word_counts.keys()
 
-f = open("results.txt", "w")
+#f = open("results.txt", "w")
 for i in range(NUM_FOLDS):
     pos_train_reviews = []
     neg_train_reviews = []
@@ -241,11 +257,12 @@ for i in range(NUM_FOLDS):
     predicted_labels = classifier.predict(test_vecs)
     acc = classifier.score(test_vecs, test_labels)
     for i in range(len(test_reviews)):
-        print "%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i])
+        #f.write("%s\t%d\t%d\n" % (test_ids[i], test_labels[i], predicted_labels[i]))
+        print("%s\t%d\t%d" % (test_ids[i], test_labels[i], predicted_labels[i]))
 
     avg_acc += acc
 
-f.close()
+#f.close()
 
 t2 = time.time()
 avg_acc /= NUM_FOLDS
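With the new options in place, a run against the Amazon data with Delta TFIDF would look something like this (flags as documented in the help text above):

python review_svm.py --use-amazon --use-delta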
5 changes: 4 additions & 1 deletion svm.bat
@@ -11,4 +11,7 @@ echo (5) Unigrams + POS tags
 python review_svm.py --gram-length 1 --presence --use-pos-tags
 
 echo (6) Adjectives
-python review_svm.py --gram-length 1 --presence --use-pos-tags --use-adj-only
\ No newline at end of file
+python review_svm.py --gram-length 1 --presence --use-pos-tags --use-adj-only
+
+echo (8) Unigrams + Position
+python review_svm.py --gram-length 1 --presence --use-position
