# BagOfWords.py

from __future__ import division
import string
import numpy
import nltk
from TFIDF import tfidf, delta_tfidf

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
# Tokens that switch "NOT_" tagging on in make() (unigrams only).
NEGATION_WORDS      = ["not", "n't"]
# Tokens that switch "NOT_" tagging back off in make().
PUNCTUATION         = [".", "!", "?", ",", ";", '(', ')'] #TODO make this work with POS tags (._.)
# Suffixes appended to an n-gram by make(use_position=True); POSITION_TAGS[j]
# is used for the first j whose POSITION_THRESHOLDS[j] exceeds i/len(words).
POSITION_TAGS       = ["_1Q", "_2H", "_3Q"]
# POS tags that make(use_pos_tags=True, use_adj_only=True) keeps.
ADJECTIVE_TAGS      = ["JJ", "JJR", "JJS", "JJT"]
# Exclusive upper bounds on the relative start position i/len(words):
# first 25% of the tokens, the following 50%, and the remaining 25%.
POSITION_THRESHOLDS = [0.25, 0.75, 1]

# ref_bag is used to calculate the total word count across all documents.
def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=False):
  """Build a bag-of-n-grams feature dict from a tokenized document.

  Arguments:
    words        -- list of token strings.
    ref_bag      -- optional dict, updated in place: every kept n-gram
                    occurrence adds one to ref_bag[n_gram] (true counts,
                    even when use_presence is set).
    gram_length  -- n-gram size; consecutive tokens are joined with "_".
    use_negation -- prefix "NOT_" to every unigram from a token listed in
                    NEGATION_WORDS (inclusive) up to the next token listed in
                    PUNCTUATION (exclusive).  Ignored unless gram_length == 1.
    use_presence -- store 1 for every n-gram seen instead of its count.
    use_pos_tags -- tag the tokens with the module-level tagger and use the
                    resulting "word_TAG" strings as tokens from then on.
    use_adj_only -- together with use_pos_tags, keep only n-grams whose first
                    token has a tag listed in ADJECTIVE_TAGS.
    use_position -- append the POSITION_TAGS suffix chosen by the n-gram's
                    relative start position (see POSITION_THRESHOLDS).
    normalize    -- scale the values to unit Euclidean length.

  Returns a dict mapping each n-gram to its count, presence flag (1) or
  normalized weight.
  """
  if use_pos_tags:
    # The shared perceptron tagger is much faster than nltk.pos_tag(words).
    tagged = tagger.tag(words)
    words = ["_".join(pair) for pair in tagged]

  bag_of_words = {}
  negating = False
  for i in range(len(words) - gram_length + 1):
    n_gram = "_".join(words[i:i + gram_length])

    # Pang and Lee didn't do negation tagging for bigrams.
    if use_negation and gram_length == 1:
      if n_gram in NEGATION_WORDS:
        negating = True
      elif n_gram in PUNCTUATION:
        negating = False
      if negating:
        n_gram = "NOT_" + n_gram

    if use_position:
      # float() keeps this a true division even without
      # "from __future__ import division" on Python 2.
      relative = float(i) / len(words)
      for threshold, tag in zip(POSITION_THRESHOLDS, POSITION_TAGS):
        if relative < threshold:
          n_gram += tag
          break

    # Adjective filter: decided by the tag of the n-gram's first token.
    if use_pos_tags and use_adj_only and tagged[i][1] not in ADJECTIVE_TAGS:
      continue

    if use_presence:
      bag_of_words[n_gram] = 1
    else:
      bag_of_words[n_gram] = bag_of_words.get(n_gram, 0) + 1

    # Add it to the reference bag
    if ref_bag is not None:
      ref_bag[n_gram] = ref_bag.get(n_gram, 0) + 1

  # length-normalize (an empty bag is returned unchanged)
  if normalize:
    length = sum(value ** 2 for value in bag_of_words.values()) ** 0.5
    bag_of_words = dict((key, value / length) for key, value in bag_of_words.items())
  return bag_of_words

# document and documents are lists of words (pre-tokenized with nltk.word_tokenize())
def make_tfidf(document, documents):
  """Return a length-normalized TF-IDF bag for document.

  Every distinct term of document is weighted with
  tfidf(term, document, documents); terms whose weight is 0 are left out.
  The remaining weights are divided by their Euclidean norm.
  """
  weights = {}
  squared_sum = 0
  for word in set(document):
    score = tfidf(word, document, documents)
    if score == 0:
      continue
    weights[word] = score
    squared_sum += score**2
  norm = squared_sum ** 0.5
  # With no non-zero weights the dict is empty and no division happens.
  return dict((word, score / norm) for word, score in weights.items())

# As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value.
# Todo: Bigrams?
def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag, use_pos_tags=False):
  """Return a length-normalized bag weighted with delta TF-IDF.

  Every distinct term of document is weighted with
  delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs);
  terms whose weight is 0 are left out and the remaining weights are divided
  by their Euclidean norm.

  If ref_bag is not None it is updated in place: each occurrence of a term in
  document (duplicates included) adds one to ref_bag[term].

  use_pos_tags is accepted but not used by this function.
  """
  bag = {}
  factor = 0
  for term in set(document):
    weight = delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs)
    if weight != 0:
      bag[term] = weight
      factor += weight ** 2
  factor **= 0.5
  # Only values change here, so iterating the dict while assigning is safe.
  for key in bag:
    bag[key] /= factor

  # Add word counts to the reference bag
  if ref_bag is not None:
    for term in document:
      ref_bag[term] = ref_bag.get(term, 0) + 1
  return bag

def to_vector(bag, wordlist):
  """Return the values of bag as a list ordered like wordlist.

  Words in wordlist that are not keys of bag contribute 0.
  """
  # A caller that needs a 2-D row can wrap the result:
  # numpy.array(vec).reshape(1, -1)
  return [bag.get(word, 0) for word in wordlist]

# Shared POS tagger used by make() when use_pos_tags is set; it is constructed
# once, when this module is imported.
tagger = nltk.tag.perceptron.PerceptronTagger()
	from __future__ import division
	import string
	import numpy
	import nltk
	from TFIDF import tfidf, delta_tfidf

	# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
	# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
	# They didn't provide a full list.
	# Tokens that switch "NOT_" tagging on in make() (unigrams only).
	NEGATION_WORDS = ["not", "n't"]
	# Tokens that switch "NOT_" tagging back off in make().
	PUNCTUATION = [".", "!", "?", ",", ";", '(', ')'] #TODO make this work with POS tags (._.)
	# Suffixes appended to an n-gram by make(use_position=True); POSITION_TAGS[j]
	# is used for the first j whose POSITION_THRESHOLDS[j] exceeds i/len(words).
	POSITION_TAGS = ["_1Q", "_2H", "_3Q"]
	# POS tags that make(use_pos_tags=True, use_adj_only=True) keeps.
	ADJECTIVE_TAGS = ["JJ", "JJR", "JJS", "JJT"]
	# Exclusive upper bounds on the relative start position i/len(words):
	# first 25% of the tokens, the following 50%, and the remaining 25%.
	POSITION_THRESHOLDS = [0.25, 0.75, 1]

	# ref_bag is used to calculate the total word count across all documents.
	def make(words, ref_bag=None, gram_length=1, use_negation=False, use_presence=False, use_pos_tags=False, use_adj_only=False, use_position=False, normalize=False):
		"""Build a bag-of-n-grams feature dict from a tokenized document.

		Arguments:
			words        -- list of token strings.
			ref_bag      -- optional dict, updated in place: every kept n-gram
			                occurrence adds one to ref_bag[n_gram] (true counts,
			                even when use_presence is set).
			gram_length  -- n-gram size; consecutive tokens are joined with "_".
			use_negation -- prefix "NOT_" to every unigram from a token listed in
			                NEGATION_WORDS (inclusive) up to the next token listed
			                in PUNCTUATION (exclusive).  Ignored unless
			                gram_length == 1.
			use_presence -- store 1 for every n-gram seen instead of its count.
			use_pos_tags -- tag the tokens with the module-level tagger and use the
			                resulting "word_TAG" strings as tokens from then on.
			use_adj_only -- together with use_pos_tags, keep only n-grams whose
			                first token has a tag listed in ADJECTIVE_TAGS.
			use_position -- append the POSITION_TAGS suffix chosen by the n-gram's
			                relative start position (see POSITION_THRESHOLDS).
			normalize    -- scale the values to unit Euclidean length.

		Returns a dict mapping each n-gram to its count, presence flag (1) or
		normalized weight.
		"""
		if use_pos_tags:
			# The shared perceptron tagger is much faster than nltk.pos_tag(words).
			tagged = tagger.tag(words)
			words = ["_".join(pair) for pair in tagged]

		bag_of_words = {}
		negating = False
		for i in range(len(words) - gram_length + 1):
			n_gram = "_".join(words[i:i + gram_length])

			# Pang and Lee didn't do negation tagging for bigrams.
			if use_negation and gram_length == 1:
				if n_gram in NEGATION_WORDS:
					negating = True
				elif n_gram in PUNCTUATION:
					negating = False
				if negating:
					n_gram = "NOT_" + n_gram

			if use_position:
				# float() keeps this a true division even without
				# "from __future__ import division" on Python 2.
				relative = float(i) / len(words)
				for threshold, tag in zip(POSITION_THRESHOLDS, POSITION_TAGS):
					if relative < threshold:
						n_gram += tag
						break

			# Adjective filter: decided by the tag of the n-gram's first token.
			if use_pos_tags and use_adj_only and tagged[i][1] not in ADJECTIVE_TAGS:
				continue

			if use_presence:
				bag_of_words[n_gram] = 1
			else:
				bag_of_words[n_gram] = bag_of_words.get(n_gram, 0) + 1

			# Add it to the reference bag
			if ref_bag is not None:
				ref_bag[n_gram] = ref_bag.get(n_gram, 0) + 1

		# length-normalize (an empty bag is returned unchanged)
		if normalize:
			length = sum(value ** 2 for value in bag_of_words.values()) ** 0.5
			bag_of_words = dict((key, value / length) for key, value in bag_of_words.items())
		return bag_of_words

	# document and documents are lists of words (pre-tokenized with nltk.word_tokenize())
	def make_tfidf(document, documents):
		"""Return a length-normalized TF-IDF bag for document.

		Every distinct term of document is weighted with
		tfidf(term, document, documents); terms whose weight is 0 are left out.
		The remaining weights are divided by their Euclidean norm.
		"""
		bag = {}
		factor = 0
		for term in set(document):
			weight = tfidf(term, document, documents)
			if weight != 0:
				bag[term] = weight
				factor += weight ** 2
		factor **= 0.5
		# Only values change here, so iterating the dict while assigning is safe.
		for key in bag:
			bag[key] /= factor
		return bag

	# As per Martineau and Finn (2009), create a bag of words using delta TFIDF as the feature value.
	# Todo: Bigrams?
	def make_delta_tfidf(document, positive_set, negative_set, pos_idfs, neg_idfs, ref_bag, use_pos_tags=False):
		"""Return a length-normalized bag weighted with delta TF-IDF.

		Every distinct term of document is weighted with
		delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs);
		terms whose weight is 0 are left out and the remaining weights are divided
		by their Euclidean norm.

		If ref_bag is not None it is updated in place: each occurrence of a term in
		document (duplicates included) adds one to ref_bag[term].

		use_pos_tags is accepted but not used by this function.
		"""
		bag = {}
		factor = 0
		for term in set(document):
			weight = delta_tfidf(term, document, positive_set, negative_set, pos_idfs, neg_idfs)
			if weight != 0:
				bag[term] = weight
				factor += weight ** 2
		factor **= 0.5
		# Only values change here, so iterating the dict while assigning is safe.
		for key in bag:
			bag[key] /= factor

		# Add word counts to the reference bag
		if ref_bag is not None:
			for term in document:
				ref_bag[term] = ref_bag.get(term, 0) + 1
		return bag

	def to_vector(bag, wordlist):
		"""Return the values of bag as a list ordered like wordlist.

		Words in wordlist that are not keys of bag contribute 0.
		"""
		# A caller that needs a 2-D row can wrap the result:
		# numpy.array(vec).reshape(1, -1)
		return [bag.get(word, 0) for word in wordlist]

	# Shared POS tagger used by make() when use_pos_tags is set; it is
	# constructed once, when this statement runs.
	tagger = nltk.tag.perceptron.PerceptronTagger()