Skip to content
Permalink
235376b91e
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
154 lines (130 sloc) 5.29 KB
from __future__ import division
import math
import nltk
from nltk.corpus import wordnet as wn
from collections import Counter
import numpy
from nltk.corpus import movie_reviews
import nltk.stem
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
from nltk.classify import NaiveBayesClassifier
import random
from nltk.stem import *
from sets import Set
class cblexicon:
    """Two-way clustering of words linked by conjunctions.

    ``process`` reads word pairs from ``conj.txt``, builds a pairwise
    dissimilarity matrix from them (0 for pairs recorded as similar, 1 for
    pairs joined by "but", 0.5 otherwise) and then repeatedly moves words
    between two groups so that each word ends up in the group it is, on
    average, less dissimilar to.
    """

    @staticmethod
    def _calc_score(curr, group, conjSet, dis):
        """Return the mean dissimilarity between ``curr`` and ``group``.

        Args:
            curr: the word being scored; must be a key of ``conjSet``.
            group: iterable-with-length of words (keys of ``conjSet``).
            conjSet: mapping word -> (index, similar, dissimilar).
            dis: 2-D table indexed as ``dis[i][j]`` by the word indices.

        Returns:
            The summed dissimilarity to every *other* word of ``group``
            multiplied by ``1.0 / len(group)`` (the divisor counts ``curr``
            itself when it is a member). An empty group scores ``0.0``
            instead of raising ``ZeroDivisionError``.
        """
        if not group:
            return 0.0
        row = conjSet[curr][0]
        score = 0
        for word in group:
            if word != curr:
                score = score + dis[row][conjSet[word][0]]
        return score * (1.0 / len(group))

    @staticmethod
    def _cluster(set1, set2, conjSet, defSet, dis):
        """Run one reassignment pass over both groups.

        Every word of ``set1`` moves to ``set2`` when its score against
        ``set2`` is strictly lower, then every word of ``set2`` (including
        the ones that just arrived) gets the symmetric treatment.

        Args:
            set1, set2: lists of words; both are modified in place.
            conjSet: mapping word -> (index, similar, dissimilar).
            defSet: unused; kept so the call shape stays the same.
            dis: dissimilarity table consumed by ``_calc_score``.

        Returns:
            The tuple ``(set1, set2)`` (the same list objects).
        """
        score = cblexicon._calc_score
        # Walk a snapshot of each list: removing from the list that is being
        # iterated makes the iterator skip the element after every removal.
        for word in list(set1):
            score1 = score(word, set1, conjSet, dis)
            score2 = score(word, set2, conjSet, dis)
            if score2 < score1:
                print("swap")
                set1.remove(word)
                set2.append(word)
        for word in list(set2):
            score1 = score(word, set1, conjSet, dis)
            score2 = score(word, set2, conjSet, dis)
            if score1 < score2:
                set2.remove(word)
                set1.append(word)
        return set1, set2

    @staticmethod
    def _normalize_word(word):
        """Return the English Snowball stem of ``word``."""
        return SnowballStemmer("english").stem(word)

    @staticmethod
    def _vectorize(conjSet, defSet):
        """Build the square dissimilarity matrix for the words in ``defSet``.

        Args:
            conjSet: mapping word -> (index, similar, dissimilar).
            defSet: sized iterable of the words to fill rows for.

        Returns:
            A ``len(defSet)`` x ``len(defSet)`` numpy array pre-filled with
            0.5, holding 0 at (word, similar word) and 1 at
            (word, dissimilar word). A pair listed as both ends up as 1
            because dissimilar entries are written last.
        """
        size = len(defSet)
        dis = numpy.zeros((size, size))
        dis.fill(.5)
        for word in defSet:
            row, similar, dissimilar = conjSet[word]
            for other in similar:
                dis[row, conjSet[other][0]] = 0
            for other in dissimilar:
                dis[row, conjSet[other][0]] = 1
        return dis

    @staticmethod
    def _word_feats(words):
        """Return a dict mapping every word in ``words`` to ``True``."""
        return dict.fromkeys(words, True)

    @staticmethod
    def _gen_sets(path='words.txt'):
        """Read the labelled seed words from ``path``.

        Each non-blank line is split on whitespace; the first field is the
        word and the first character of the second field is its label
        ('p' -> positive, 'n' -> negative, anything else is ignored).

        Returns:
            A ``(positive, negative)`` tuple of sets.
        """
        positive = set()
        negative = set()
        # Read-only access is enough, and ``with`` closes the handle.
        with open(path, 'r') as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                label = fields[1][0]
                if label == 'p':
                    positive.add(fields[0])
                elif label == 'n':
                    negative.add(fields[0])
        return positive, negative

    @staticmethod
    def _get_conj(path='conj.txt'):
        """Read the conjunction pairs from ``path``.

        Each non-blank line is split on whitespace into at least three
        fields: two words followed by the conjunction that joined them.
        "but" marks the pair as dissimilar; any other conjunction marks it
        as similar. Both directions of every pair are recorded.

        Returns:
            A dict mapping word -> (index, similar, dissimilar), where
            ``index`` numbers the words in order of first appearance and
            the other two entries are sets of words.
        """
        table = {}

        def entry(word):
            # First sighting of a word allocates the next free index.
            if word not in table:
                table[word] = (len(table), set(), set())
            return table[word]

        with open(path, 'r') as handle:
            for line in handle:
                # Whitespace split also strips the trailing newline, which
                # splitting on ' ' left glued to the last field ("but\n").
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                first, second = fields[0], fields[1]
                slot = 2 if fields[2] == "but" else 1
                entry(first)[slot].add(second)
                entry(second)[slot].add(first)
        return table

    def process(self, iterations=10):
        """Load the data, cluster the conjunction words and print progress.

        Args:
            iterations: number of reassignment passes to run (default 10).
        """
        # Get the Data
        # NOTE(review): everything up to and including ``stopwords`` is only
        # consumed by the disabled classifier experiment at the end of this
        # method; it is kept so that experiment can be re-enabled.
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])
        # Generate positive and negative initial sets of equal size
        seeds = self._gen_sets()
        size = min(len(seeds[0]), len(seeds[1]))
        # random.sample needs a sequence; sets are rejected by newer Pythons.
        positive = random.sample(list(seeds[0]), size)
        negative = random.sample(list(seeds[1]), size)
        # Clustering Setup
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # Create dictionary (adj, (index, similar, dissimilar))
        conjSet = self._get_conj()
        # Create list out of all keys of conjSet (a real list, so it can be
        # sliced regardless of what dict.keys() returns)
        defSet = list(conjSet)
        # Generate dissimilarity matrix
        dis = self._vectorize(conjSet, defSet)
        # Its Cluster time: start from an arbitrary half/half split
        half = len(defSet) // 2
        set1 = defSet[half:]
        set2 = defSet[:half]
        for _ in range(iterations):
            set1, set2 = self._cluster(set1, set2, conjSet, defSet, dis)
            # NOTE(review): the original indentation was lost; this print is
            # assumed to report the size of the second group after each pass.
            print(len(set2))
        # Can we classify and then run bag of words?
        #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        #posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
        #testfeats = negfeats[500:] + posfeats[500:]
        #classifier1 = NaiveBayesClassifier.train(trainfeats)
        #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))
# Module-level entry point: instantiate cblexicon and run its process()
# method. This executes whenever the file is run or imported.
cblexicon().process()