# BagOfWords.py

import nltk
import string

# "Adapting a technique of Das and Chen (2001), we added the tag NOT to every word between a negation word ('not',
# 'isn't', 'didn't', etc.) and the first punctuation mark following the negation word."
# They didn't provide a full list.
# Tokens that open a negation scope. nltk.word_tokenize splits contractions
# Treebank-style ("isn't" -> "is", "n't"), so the bare "n't" token is what
# actually appears in the token stream; the full contractions are kept for
# callers that supply differently tokenized text.
NEGATION_WORDS = ["not", "n't", "isn't", "didn't", "doesn't"]
# Tokens that close a negation scope. These are raw tokens; a POS-tagged token
# has the form "word_TAG" (e.g. "._.") and does not compare equal to them.
PUNCTUATION    = [".", "!", "?", ",", ";"]


def make(text, ref_bag=None, use_presence=False, use_pos_tags=False, use_adj_only=False, gram_length=1, normalize_bags=True):
  """Build a bag-of-n-grams feature dict for ``text``.

  The text is tokenized with ``nltk.word_tokenize``; every window of
  ``gram_length`` consecutive tokens is joined with "_" and its ``hash()`` is
  used as the feature key (LIBSVM won't use strings as keys).

  Args:
    text: The raw document string.
    ref_bag: Optional dict that is updated in place with the number of times
      each feature key was seen (always a count, even with ``use_presence``).
    use_presence: If true, every feature value is 1 instead of a count.
    use_pos_tags: If true, each token is replaced by "word_TAG" using
      ``nltk.pos_tag`` before n-grams are built.
    use_adj_only: Only honoured together with ``use_pos_tags``; keeps an
      n-gram only when the tag of its first token is exactly "JJ".
    gram_length: Number of tokens per n-gram; must be at least 1.
    normalize_bags: Currently ignored (normalization is not implemented).

  Returns:
    A dict mapping ``hash(n_gram)`` to its count (or to 1 for presence).

  Raises:
    ValueError: If ``gram_length`` is smaller than 1.
  """
  if gram_length < 1:
    raise ValueError("gram_length must be at least 1")

  tokens = nltk.word_tokenize(text)
  if use_pos_tags:
    tagged = nltk.pos_tag(tokens)
    # Keep the tags separately: the adjective filter needs the real tag, not
    # a character of the joined "word_TAG" string.
    pos_tags = [tag for (_, tag) in tagged]
    words = ["_".join(pair) for pair in tagged]
  else:
    pos_tags = None
    words = tokens

  bag_of_words = {}
  do_negation = False

  for i in range(len(words) - gram_length + 1):
    n_gram = "_".join(words[i:i + gram_length])

    if gram_length == 1:               # Pang and Lee didn't do negation tagging for bigrams.
      # Compare the raw token so negation scoping also works when the
      # feature string carries a POS suffix.
      raw_token = tokens[i]
      if raw_token in NEGATION_WORDS:
        do_negation = True
      elif raw_token in PUNCTUATION:
        do_negation = False
      if do_negation:
        n_gram = "NOT_" + n_gram

    # The negation state above is updated even for tokens dropped here.
    if use_pos_tags and use_adj_only and pos_tags[i] != "JJ":
      continue

    # LIBSVM won't use strings as keys, so hash to convert to a number.
    # NOTE(review): on Python 3, str hashes vary between processes unless
    # PYTHONHASHSEED is fixed -- confirm keys are not persisted across runs.
    index = hash(n_gram)

    if use_presence:
      bag_of_words[index] = 1
    else:
      bag_of_words[index] = bag_of_words.get(index, 0) + 1

    # Add it to the reference bag
    if ref_bag is not None:
      ref_bag[index] = ref_bag.get(index, 0) + 1

  # TODO: implement normalization when normalize_bags is set.
  return bag_of_words