Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Delta TFIDF, part 1
  • Loading branch information
job13011 committed Apr 11, 2016
1 parent c1f8c1e commit 4447099
Show file tree
Hide file tree
Showing 6 changed files with 352 additions and 178 deletions.
26 changes: 24 additions & 2 deletions BagOfWords.py
Expand Up @@ -2,7 +2,7 @@ from __future__ import division
import string
import numpy
import nltk
from TFIDF import tfidf
from TFIDF import tfidf, delta_tfidf

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
Expand Down Expand Up @@ -81,7 +81,29 @@ def make_tfidf(document, documents):
for key in bag.keys():
bag[key] /= factor
return bag


# As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value.
# Todo: Bigrams?
def make_delta_tfidf(document, positive_set, negative_set, ref_bag):
    """Build a bag of words whose feature values are delta TFIDF weights.

    Each distinct term of ``document`` is scored with ``delta_tfidf`` (imported
    from the TFIDF module); terms whose weight is exactly zero are left out.
    The remaining weights are scaled so that the bag has unit Euclidean length.

    Args:
        document: Iterable of hashable terms (iterated more than once).
        positive_set: Passed through unchanged to ``delta_tfidf``
            (presumably the positively labelled documents -- confirm in TFIDF).
        negative_set: Passed through unchanged to ``delta_tfidf``
            (presumably the negatively labelled documents -- confirm in TFIDF).
        ref_bag: Optional dict of running term counts. When not ``None`` it is
            updated in place: every occurrence of a term in ``document``
            (duplicates included) adds one to that term's count.

    Returns:
        dict mapping each term with a nonzero weight to its normalised weight.
    """
    bag = {}
    squared_sum = 0
    for term in set(document):
        weight = delta_tfidf(term, document, positive_set, negative_set)
        if weight != 0:
            bag[term] = weight
            squared_sum += weight ** 2

    # Scale to unit length. The guard covers the case where every squared
    # weight underflowed to zero, which would otherwise divide by zero.
    norm = squared_sum ** 0.5
    if norm > 0:
        for term in bag:
            bag[term] /= norm

    # Add word counts to the reference bag. dict.get is used instead of
    # has_key so the code runs on Python 3 as well as Python 2.
    if ref_bag is not None:
        for term in document:
            ref_bag[term] = ref_bag.get(term, 0) + 1
    return bag

def to_vector(bag, wordlist):
vec = []
for word in wordlist:
Expand Down

0 comments on commit 4447099

Please sign in to comment.