from __future__ import division
import numpy
import nltk
from TFIDF import tfidf, delta_tfidf

# Load the perceptron POS tagger once at import time; it is reused by make()
# below, where it is much faster than calling nltk.pos_tag() directly.
tagger = nltk.tag.perceptron.PerceptronTagger()
# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
NEGATION_WORDS = ["not", "n't"]
PUNCTUATION = [".", "!", "?", ",", ";", "(", ")"]  # TODO: make this work with POS tags (._.)
# Position features: tag each word by where it falls in the document
# (first quarter, middle half, last quarter), per the thresholds below.
POSITION_TAGS = ["_1Q", "_2H", "_3Q"]
ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
POSITION_THRESHOLDS = [0.25, 0.75, 1]
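
# e.g. with use_position=True, make() appends one of these tags to each
# feature: "good" becomes "good_1Q", "good_2H", or "good_3Q" depending on
# the word's relative position i/len(words) in the document.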

# Build a bag-of-words dict mapping each feature (n-gram) to its count.
# ref_bag, if given, accumulates feature counts across all documents.
def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False,
         use_pos_tags=False, use_adj_only=False, use_position=False, normalize=False):
    bag_of_words = {}
    if use_negation:
        do_negation = False
    if use_pos_tags:
        # tagged = nltk.pos_tag(words)
        tagged = tagger.tag(words)  # this is much, much faster!
        words = ["_".join(t) for t in tagged]
    for i in range(len(words) - gram_length + 1):
        n_gram = "_".join(words[i:i+gram_length])
        if use_negation:
            if gram_length == 1:  # Pang and Lee didn't do negation tagging for bigrams.
                if n_gram in NEGATION_WORDS:
                    do_negation = True
                elif n_gram in PUNCTUATION:
                    do_negation = False
                if do_negation:
                    n_gram = "NOT_" + n_gram
        if use_position:
            for j in range(len(POSITION_TAGS)):
                if i / len(words) < POSITION_THRESHOLDS[j]:
                    n_gram += POSITION_TAGS[j]
                    break
        index = n_gram
        # With use_adj_only, keep only features whose (first) word is an adjective.
        if not (use_pos_tags and use_adj_only and (tagged[i][1] not in ADJECTIVE_TAGS)):
            if (not use_presence) and index in bag_of_words:
                bag_of_words[index] += 1
            else:
                bag_of_words[index] = 1
            # Add it to the reference bag
            if ref_bag is not None:
                if index in ref_bag:
                    ref_bag[index] += 1
                else:
                    ref_bag[index] = 1
    # L2-normalize the counts so longer documents don't get larger feature values.
    if normalize:
        length = sum(count**2 for count in bag_of_words.values()) ** 0.5
        for k in bag_of_words:
            bag_of_words[k] /= length
    return bag_of_words
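
# Illustrative example (hypothetical input):
#   make("this movie was not good .".split(), use_negation=True)
#   -> {'this': 1, 'movie': 1, 'was': 1, 'NOT_not': 1, 'NOT_good': 1, '.': 1}
# Note that the negation word itself gets the NOT_ prefix too, since
# do_negation is set before the prefix is applied.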

# document is a list of words (pre-tokenized with nltk.word_tokenize());
# documents is the list of all tokenized documents in the corpus.
def make_tfidf(document, documents):
    bag = {}
    factor = 0
    for term in set(document):
        weight = tfidf(term, document, documents)
        if weight != 0:
            bag[term] = weight
            factor += weight**2
    factor **= 0.5  # L2 norm of the weight vector
    for key in bag:
        bag[key] /= factor
    return bag
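
# Illustrative example (hypothetical corpus; exact values depend on TFIDF.tfidf):
#   docs = [["good", "movie"], ["bad", "movie"], ["good", "plot"]]
#   make_tfidf(["good", "movie"], docs)  # -> L2-normalized {term: tf-idf weight}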

# As per Martineau and Finin (2009), create a bag of words using delta TF-IDF
# as the feature value.
# TODO: Bigrams?
def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag, use_pos_tags=False):
    bag = {}
    factor = 0
    for term in set(document):
        weight = delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs)
        if weight != 0:
            bag[term] = weight
            factor += weight**2
    factor **= 0.5  # L2 norm of the weight vector
    for key in bag:
        bag[key] /= factor
    # Add word counts to the reference bag
    if ref_bag is not None:
        for term in document:
            if term in ref_bag:
                ref_bag[term] += 1
            else:
                ref_bag[term] = 1
    return bag
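
# For reference, the delta TF-IDF weight of term t in document d is, per the
# paper, tf(t, d) * (log2(|P| / df_P(t)) - log2(|N| / df_N(t))), where P and N
# are the positive and negative training sets. Terms equally common in both
# sets get a weight near zero; the actual computation lives in TFIDF.delta_tfidf.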

# Project a bag onto a fixed vocabulary, producing a dense feature vector.
def to_vector(bag, wordlist):
    vec = []
    for word in wordlist:
        if word in bag:
            vec.append(bag[word])
        else:
            vec.append(0)
    return vec
    # return numpy.array(vec).reshape(1,-1)
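
# Example: to_vector({"good": 2, "bad": 1}, ["bad", "good", "ugly"]) -> [1, 2, 0]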