# MPQALexicon.py

from nltk.corpus import wordnet as wn

def load(strong_only=False, filename="subjclueslen1-HLTEMNLP05.tff"):
  """Load polar adjectives and their labels from a subjectivity clue file.

  Each line of the file is read as space-separated ``key=value`` fields.
  An entry is kept when its ``priorpolarity`` is "positive" or "negative",
  is_adjective() accepts its ``word1``, and (when strong_only is set) its
  ``type`` is "strongsubj". A word is added at most once: the first entry
  that passes all of these checks wins.

  Args:
    strong_only: if true, keep only entries whose type is "strongsubj".
    filename: path of the clue file to read.

  Returns:
    A tuple (words, labels) of parallel lists, where each label is 1 for a
    positive word and -1 for a negative word.

  Raises:
    IOError / OSError: if the file cannot be opened.
    KeyError: if a line with key=value fields lacks ``word1``,
      ``priorpolarity`` or ``type``.
  """
  polarity_to_label = {"positive": 1, "negative": -1}
  words = []
  labels = []
  seen = set()  # mirrors `words` so the duplicate check is O(1)
  with open(filename) as f:
    for line in f:
      # Keep only key=value tokens: per the original author's note, two
      # lines in the file carry a stray extra character between fields.
      pairs = [field.rstrip().split("=", 1)
               for field in line.split(" ") if "=" in field]
      if not pairs:
        continue  # blank line or line without any fields
      d = dict(pairs)
      word = d["word1"]
      polarity = d["priorpolarity"]
      subj_type = d["type"]
      # Cheap checks first; the WordNet lookup in is_adjective() runs last.
      if word in seen:
        continue
      if polarity not in polarity_to_label:
        continue
      if strong_only and subj_type != "strongsubj":
        continue
      if not is_adjective(word):
        continue
      seen.add(word)
      words.append(word)
      labels.append(polarity_to_label[polarity])
  return (words, labels)
  
def is_adjective(word):
  """Return True when WordNet has at least one synset for word under wn.ADJ."""
  adjective_senses = wn.synsets(word, wn.ADJ)
  return bool(adjective_senses)