diff --git a/GlossCountJWB.py b/GlossCountJWB.py
index 8fc5609..2ca0964 100644
--- a/GlossCountJWB.py
+++ b/GlossCountJWB.py
@@ -14,11 +14,18 @@ import MPQALexicon
 import numpy
 
-def get_defs(word):
-    return string.join([synset.definition() for synset in wn.synsets(word, pos=wn.ADJ)])
+# returns tokenized
+def get_defs(word, use_examples=True):
+    defs = [synset.definition() for synset in wn.synsets(word, pos=wn.ADJ)]
+    if use_examples:
+        examples = [synset.examples() for synset in wn.synsets(word, pos=wn.ADJ)]
+        for example in examples: defs += example
+    return nltk.word_tokenize(string.join(defs))
 
-def make_bag(text):
-    return BagOfWords.make(text, normalize=True, use_negation=True, use_hash=False, use_presence=True)
+# text and documents are pre-tokenized
+def make_bag(text, documents):
+    #return BagOfWords.make(text, normalize=True, use_negation=False, use_hash=False, use_presence=False)
+    return BagOfWords.make_tfidf(text, documents)
 
 def expand_sets(positive,negative,neutral):
     newPositive = set(positive)
@@ -54,19 +61,19 @@ def expand_sets(positive,negative,neutral):
                     ant = antonym.name()
                     if ant not in newPositive and ant not in newNegative and ant not in newNeutral:
                         newPositive.add(ant)
-                    elif ant in newPositive:
+                    elif ant in newNegative:
                         newNegative.discard(ant)
                         newNeutral.add(ant)
     return (newPositive, newNegative, newNeutral)
-    
+
 def bag_to_vec(bag, wordlist):
-    vec = []
-    for word in wordlist:
-        if bag.has_key(word):
-            vec.append(bag[word])
-        else:
-            vec.append(0)
-    return vec
+    vec = []
+    for word in wordlist:
+        if bag.has_key(word):
+            vec.append(bag[word])
+        else:
+            vec.append(0)
+    return vec
 
 # Set up initial Sets S_p and S_n
 neutral = Set([])
@@ -74,58 +81,51 @@ def bag_to_vec(bag, wordlist):
 negative = Set(['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'])
 
 # Expand on Sets to get S_p' and S_n'
-for num in range(3):
-    newsets = expand_sets(positive,negative,neutral);
-    positive = set(newsets[0])
-    negative = set(newsets[1])
-    neutral = set(newsets[2])
-
-# Use the same number of positive and negative words.
+for num in range(1):
+    (positive, negative, neutral) = expand_sets(positive,negative,neutral);
+
+# Use the same number of positive and negative training words.
 positive = random.sample(positive, min(len(positive), len(negative)))
 negative = random.sample(negative, min(len(positive), len(negative)))
 
-# Learn Classifier
-train_bags = [make_bag(get_defs(word)) for word in positive] + [make_bag(get_defs(word)) for word in negative]
-
+# Train the classifier using the expanded wordlist.
+train_wordlist = set(positive + negative)
+
+train_defs = [get_defs(word) for word in (positive + negative)]
+
+train_bags = [make_bag(get_defs(word), train_defs) for word in positive] + [make_bag(get_defs(word), train_defs) for word in negative]
+
 train_labels = [1 for word in positive] + [-1 for word in negative]
-train_wordlist = []
 # The classifier needs vectors, not dicts. So we need to convert them to vectors.
 # Make a list of all the words contained in them, then make an array with entries
 # corresponding to each word.
-for bag in train_bags:
-    for word in bag.keys():
-        if not (word in train_wordlist):
-            train_wordlist.append(word)
-train_wordlist = sorted(train_wordlist)
+
 train_vecs = [bag_to_vec(bag, train_wordlist) for bag in train_bags]
 
-#classifier = MultinomialNB()
-classifier = svm.SVC(kernel="linear")
+classifier = svm.LinearSVC()
 classifier.fit(train_vecs, train_labels)
 
-# Load the test set
-(test_words, test_labels) = MPQALexicon.load()
-#test_words = string.join(list(movie_reviews.words(fileids=ids)))
-test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=True)
-test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:NUM_TEST_WORDS]
+# Load the test set. I'm only using the bag of words structure here to select the words
+# with a certain word count threshold.
+(test_words, test_labels) = MPQALexicon.load(True)
+
+test_wordlist_bag = BagOfWords.make(string.join(test_words), normalize=False, use_negation=False, use_hash=False, use_presence=False)
+test_wordlist = sorted(test_wordlist_bag.keys(), key=lambda k: -test_wordlist_bag[k])#[:500]
 test_bags = []
-test_wordlist2 = []
-for word in test_wordlist:
-    defs = get_defs(word)
-    if defs != '':
-        test_wordlist2.append(word)
-        test_bags.append(make_bag(defs))
+
+test_wordlist = filter(lambda x: x != '', test_wordlist)
+test_bags = [make_bag(get_defs(word), train_defs) for word in test_wordlist]
 test_vecs = [bag_to_vec(bag, train_wordlist) for bag in test_bags]
 
 predicted_labels = classifier.predict(test_vecs)
 word_labels = {}
-for i in range(len(test_wordlist2)):
-    key = test_wordlist2[i]
-    word_labels[key] = predicted_labels[i]
+for i in range(len(test_wordlist)):
+    key = test_wordlist[i]
+    word_labels[key] = predicted_labels[i]
 
-pos_words = [w for w in test_wordlist2 if word_labels[w] > 0]
-neg_words = [w for w in test_wordlist2 if word_labels[w] < 0]
+pos_words = [w for w in test_wordlist if word_labels[w] > 0]
+neg_words = [w for w in test_wordlist if word_labels[w] < 0]
 
 # Use the same number of positive and negative words.
 length = min(len(pos_words), len(neg_words))
@@ -134,9 +134,18 @@ def bag_to_vec(bag, wordlist):
 word_labels2 = {}
 for word in pos_words:
     word_labels2[word] = 1
+
 for word in neg_words:
     word_labels2[word] = -1
-    
+
+f = open('fuck.txt', 'w')
+f.write("[POS]\n\n")
+f.write(string.join(pos_words,"\n"))
+f.write("\n\n[NEG]\n\n")
+f.write(string.join(neg_words,"\n"))
+f.close()
+#exit()
+
 # Iterate through all of the reviews and find sentiment
 correct = 0
 positive = 0
@@ -144,21 +153,21 @@ def bag_to_vec(bag, wordlist):
 scores = []
 for review_id in ids:
-    words = movie_reviews.words(fileids=[review_id])
-    score = 0
-    for word in words:
-        if word_labels2.has_key(word):
-            score += word_labels2[word]
-    scores.append(score)
+    words = movie_reviews.words(fileids=[review_id])
+    score = 0
+    for word in words:
+        if word_labels2.has_key(word):
+            score += word_labels2[word]
+    scores.append(score)
 
 avg_score = float(sum(scores))/len(scores)
 for i in range(len(ids)):
     id = ids[i]
     score = scores[i]
-    if score >= avg_score:
+    if score >= 0:#avg_score:
         sent_value = "pos"
         positive += 1
-    elif score < avg_score:
+    elif score < 0:#avg_score:
         sent_value = "neg"
     label = movie_reviews.categories(fileids=[id])[0]
     if sent_value == label:
@@ -166,4 +175,4 @@ def bag_to_vec(bag, wordlist):
 print "correct:", float(correct)/len(ids)
 print "positive:", float(positive)/len(ids)
-print "avg:", avg_score
\ No newline at end of file
+#print "avg:", avg_score
\ No newline at end of file
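
For readers without the rest of the repo, the gloss lookup that the patched get_defs performs can be sketched directly against NLTK: gather the WordNet adjective definitions for a word, optionally append the usage examples, and tokenize the result. This is only an illustration of the idea, written in Python 3 style (str.join instead of the Python 2 string.join used in the file); gloss_tokens is a made-up name, not the project's function.

# Sketch of the gloss lookup idea behind get_defs. Requires the NLTK
# wordnet and punkt data to be downloaded.
import nltk
from nltk.corpus import wordnet as wn

def gloss_tokens(word, use_examples=True):
    # Collect every adjective-sense definition, optionally with its examples.
    parts = []
    for synset in wn.synsets(word, pos=wn.ADJ):
        parts.append(synset.definition())
        if use_examples:
            parts.extend(synset.examples())
    # One flat token list over all senses, like the patched get_defs returns.
    return nltk.word_tokenize(" ".join(parts))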
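The training and labelling flow in the large middle hunk — TF-IDF bags over the seed words' glosses, a linear SVM, then predicted labels for the candidate word list — can be approximated with scikit-learn alone. In this sketch TfidfVectorizer stands in for BagOfWords.make_tfidf and bag_to_vec, the positive seed list is only the conventional counterpart of the negative list shown in the diff, and test_words are illustrative stand-ins for the MPQA words, so treat it as a sketch of the technique rather than the project's code.

# Sketch: train a linear SVM on TF-IDF vectors of seed-word glosses, then
# label unseen words from their glosses. gloss_tokens is the helper above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# The negative seeds appear in the diff; the positive ones here are assumed.
positive_seeds = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
negative_seeds = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']

train_words = positive_seeds + negative_seeds
train_labels = [1] * len(positive_seeds) + [-1] * len(negative_seeds)

# Fit the vocabulary on the training glosses only; test vectors are built
# against that same vocabulary, much as bag_to_vec does with train_wordlist.
vectorizer = TfidfVectorizer()
train_vecs = vectorizer.fit_transform(" ".join(gloss_tokens(w)) for w in train_words)

classifier = LinearSVC()
classifier.fit(train_vecs, train_labels)

test_words = ['happy', 'dreadful', 'dull']   # illustrative; the patch uses MPQA words
test_vecs = vectorizer.transform(" ".join(gloss_tokens(w)) for w in test_words)
word_labels = dict(zip(test_words, classifier.predict(test_vecs)))

pos_words = [w for w in test_words if word_labels[w] > 0]
neg_words = [w for w in test_words if word_labels[w] < 0]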
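The last two hunks change only the document-level decision rule: each movie review is scored by summing the +1/-1 labels of its words that appear in the induced lexicon, and a review now counts as positive when that sum is at least zero rather than when it beats the corpus-average score. Below is a toy version of that rule, with an invented lexicon and token lists standing in for word_labels2 and the NLTK movie_reviews corpus.

# Toy version of the review-scoring rule; `lexicon` and `reviews` are
# illustrative stand-ins for word_labels2 and the movie_reviews corpus.
lexicon = {'good': 1, 'great': 1, 'bad': -1, 'boring': -1}
reviews = {
    'cv000': ['a', 'good', 'and', 'great', 'film'],
    'cv001': ['boring', 'plot', 'and', 'bad', 'acting'],
}

def classify_review(tokens, lexicon):
    # Sum the lexicon labels of the tokens; >= 0 means "pos" under the
    # patched threshold (the old code compared against avg_score instead).
    score = sum(lexicon.get(tok, 0) for tok in tokens)
    return 'pos' if score >= 0 else 'neg'

predictions = dict((rid, classify_review(tokens, lexicon)) for rid, tokens in reviews.items())
# predictions == {'cv000': 'pos', 'cv001': 'neg'}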