diff --git a/GlossCountJWB.py b/GlossLexicon.py
similarity index 64%
rename from GlossCountJWB.py
rename to GlossLexicon.py
index cf9278c..0fd0c4e 100644
--- a/GlossCountJWB.py
+++ b/GlossLexicon.py
@@ -9,21 +9,19 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.naive_bayes import MultinomialNB
 import nltk
 from nltk.corpus import wordnet as wn
-import nltk.classify.util
-from nltk.corpus import movie_reviews
+
 import BagOfWords
 import MPQALexicon
 import AniaLexicon
 
 
 EXPAND_ITERATIONS = 2
-CLASSIFIER = "me" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
+CLASSIFIER = "svm" # "nb" = Naive Bayes, "svm" = Linear SVM, "me" = maximum entropy
 REMOVE_STOPWORDS = False
 USE_STEMMING = False
 USE_EXAMPLES = True
 USE_EQUAL_TRAINING = True
 USE_EQUAL_TEST = True
-USE_PARSING = True
 
 POS_SEED = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
 NEG_SEED = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']
@@ -91,71 +89,10 @@ def expand_sets(positive, negative, neutral):
                 newNeutral.add(ant)
     return (newPositive, newNegative, newNeutral)
 
-def get_label(id):
-    return movie_reviews.categories(fileids=[id])[0]
-
 def do_stem(text):
     global stemmer
     return [stemmer.stem(word) for word in text]
 
-# new and improved finite state machine
-# states are as follows:
-# 0 - base
-# 1 - negator found
-# 2 - intensifier found
-# 3 - un-intensifier found (unused)
-# 4 - negator + intensifier found
-def calculate_score(text, lexicon):
-    negators = ["not", "n't", "hardly", "barely"]
-    intensifiers = ["very", "really", "incredibly", "amazingly", "extremely"]
-    if USE_STEMMING:
-        negators = do_stem(negators)
-        intensifiers = do_stem(intensifiers)
-
-    punctuation = [".", "!", "?", ",", ";", '(', ')']
-    state = 0
-    score = 0
-    num_double = 0
-    num_single = 0
-    num_neg = 0
-    num_halfneg = 0
-    for word in text:
-        if state == 0:
-            if lexicon.has_key(word):
-                score += lexicon[word]
-                num_single += 1
-            elif word in negators:
-                state = 1
-            elif word in intensifiers:
-                state = 2
-        elif state == 1:
-            if lexicon.has_key(word):
-                score += -1 * lexicon[word]
-                num_neg += 1
-                state = 0
-            elif word in intensifiers:
-                state = 4
-            else:
-                state = 0
-        elif state == 2:
-            if lexicon.has_key(word):
-                score += 2 * lexicon[word]
-                num_double += 1
-                state = 0
-            else:
-                state = 0
-        elif state == 3:
-            pass #TODO
-        elif state == 4:
-            if lexicon.has_key(word):
-                score += -0.5 * lexicon[word]
-                num_halfneg += 1
-                state = 0
-            else:
-                state = 0
-    #print num_single, num_neg, num_double, num_halfneg
-    return score
-
 def create_lexicon(words, labels):
     lexicon = {}
     for i in range(len(words)):
@@ -164,14 +101,14 @@ def create_lexicon(words, labels):
             lexicon[word] = label
     return lexicon
 
-def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
+def create(test_words, test_labels):
     # Set up initial Sets S_p and S_n
     neutral = []
     #positive = ['good']
     #negative = ['bad']
-    positive = [word for word in pos_seed]
-    negative = [word for word in neg_seed]
+    positive = [word for word in POS_SEED]
+    negative = [word for word in NEG_SEED]
 
     # Expand on Sets to get S_p' and S_n'
     for num in range(EXPAND_ITERATIONS):
         (positive, negative, neutral) = expand_sets(positive,negative,neutral)
@@ -217,12 +154,14 @@ def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
     test_vecs = [BagOfWords.to_vector(bag, train_wordlist) for bag in test_bags]
     predicted_labels = classifier.predict(test_vecs)
 
+    """
     correct = 0
     for i in range(len(test_labels)):
         if test_labels[i] == predicted_labels[i]:
             correct += 1
     print "Lexicon accuracy:", correct/len(test_labels)
 
+    """
 
     word_labels = {}
     for i in range(len(test_words)):
@@ -244,59 +183,9 @@ def create_trained_lexicon(pos_seed, neg_seed, test_words, test_labels):
         lexicon[word] = 1
 
     for word in neg_words:
-        lexicon[word] = -1
+        #lexicon[word] = -1
+        lexicon[word] = -1.5
     return lexicon
 
 
-if USE_STEMMING:
-    stemmer = nltk.stem.porter.PorterStemmer()
-
-# Load the test set. A few options here.
-(test_words, test_labels) = MPQALexicon.load(True)
-#(test_words, test_labels) = AniaLexicon.load()
-if USE_STEMMING:
-    test_words = do_stem(test_words)
-
-lexicon = create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels)
-#lexicon = create_seed_lexicon(POS_SEED, NEG_SEED)
-#lexicon = create_lexicon(test_words, test_labels)
-
-# Iterate through all of the reviews and compute scores by taking the sum of their
-# component lexicon words. Includes rudimentary negation testing.
-correct = 0
-positive = 0
-ids = sorted(movie_reviews.fileids())
-scores = []
-
-for id in ids:
-    words = list(movie_reviews.words(fileids=[id]))
-    if USE_STEMMING:
-        words = do_stem(words)
-    if USE_PARSING:
-        scores.append(calculate_score(words, lexicon))
-    else:
-        score = 0
-        x = 0
-        for word in words:
-            if lexicon.has_key(word):
-                score += lexicon[word]
-                x += 1
-        scores.append(score)
-        print score, x
-
-for i in range(len(ids)):
-    id = ids[i]
-    score = scores[i]
-    if score >= 0:
-        sent_value = "pos"
-        positive += 1
-        #print id, sent_value
-    elif score < 0:
-        sent_value = "neg"
-        #print id, sent_value
-    label = get_label(id)
-    if sent_value == label:
-        correct += 1
-
-print "correct:", correct/len(ids)
-print "positive:", positive/len(ids)
\ No newline at end of file
+#create_trained_lexicon(POS_SEED, NEG_SEED, test_words, test_labels)
\ No newline at end of file
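
For context, a minimal sketch (not part of the patch) of how the refactored module would presumably be driven from a separate script: the diff strips the module-level movie_reviews evaluation harness and renames create_trained_lexicon to create(), which now reads its seeds from the POS_SEED/NEG_SEED constants. The driver file itself is an assumption, not something this patch adds.

    # Hypothetical driver script (Python 2, matching the codebase).
    # Assumes GlossLexicon.py and MPQALexicon.py are importable.
    import GlossLexicon
    import MPQALexicon

    # Load the MPQA words and labels, as the removed module-level code did.
    (test_words, test_labels) = MPQALexicon.load(True)

    # create() pulls its seed words from GlossLexicon.POS_SEED and
    # GlossLexicon.NEG_SEED rather than taking them as arguments.
    lexicon = GlossLexicon.create(test_words, test_labels)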