import random

import nltk
import numpy
from nltk.classify import NaiveBayesClassifier  # used only by the commented-out experiment below
from nltk.corpus import movie_reviews  # used only by the commented-out data block below
from nltk.stem.snowball import SnowballStemmer


class cblexicon:

    def process(self):
        # Partition adjectives into two orientation sets (positive/negative)
        # using conjunction evidence: "and" joins words of similar
        # orientation, "but" joins words of opposite orientation.
        def optimize(set1, set2, conjSet, defSet, dis):
            # Earlier variant that fully rescores both sets for every candidate
            # swap; superseded by the incremental optimize2 below and unused.
            i = 0
            currentMin = 999999
            consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
            bestSwapWord = None
            # Move the single best word until no move lessens the objective
            while currentMin > consideredMin:
                print(i)
                i = i + 1
                currentMin = consideredMin
                for word in list(set1):
                    # Tentatively move the word, score the result, then undo
                    set1.remove(word)
                    set2.append(word)
                    test = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
                    set2.remove(word)
                    set1.append(word)
                    if test < consideredMin:
                        consideredMin = test
                        bestSwapWord = word
                for word in list(set2):
                    set2.remove(word)
                    set1.append(word)
                    test = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
                    set1.remove(word)
                    set2.append(word)
                    if test < consideredMin:
                        consideredMin = test
                        bestSwapWord = word
                if bestSwapWord in set1:
                    set1.remove(bestSwapWord)
                    set2.append(bestSwapWord)
                elif bestSwapWord in set2:
                    set2.remove(bestSwapWord)
                    set1.append(bestSwapWord)
            # Return the optimized sets
            return set1, set2
        def optimize2(set1, set2, conjSet, defSet, dis):
            i = 0
            currentMin = 999999
            consideredMin = calcScore(set1, conjSet, dis) + calcScore(set2, conjSet, dis)
            bestSwapWord = None
            print(consideredMin)
            # Move the single best word until no move lessens the objective
            while currentMin > consideredMin:
                print("Iteration #%d: (%d, %d)" % (i, len(set1), len(set2)))
                currentMin = consideredMin
                currentS1 = calcScore(set1, conjSet, dis)
                currentS2 = calcScore(set2, conjSet, dis)
                consideredMin = currentS1 + currentS2
                for word in set1:
                    test = calcSwap(word, set1, set2, currentS1, currentS2, conjSet, dis)
                    if test < consideredMin:
                        print("found1")
                        consideredMin = test
                        bestSwapWord = word
                for word in set2:
                    test = calcSwap(word, set2, set1, currentS2, currentS1, conjSet, dis)
                    if test < consideredMin:
                        print("found2")
                        consideredMin = test
                        bestSwapWord = word
                print("New min: %f" % consideredMin)
                if bestSwapWord in set1:
                    set1.remove(bestSwapWord)
                    set2.append(bestSwapWord)
                elif bestSwapWord in set2:
                    set2.remove(bestSwapWord)
                    set1.append(bestSwapWord)
                i = i + 1
            # Return the optimized sets
            return set1, set2
        def cluster(set1, set2, conjSet, defSet, dis):
            # Single-pass reassignment (unused): move each word to whichever
            # set it lies closer to on average. The original relied on an
            # older per-word calcScore signature; meanDist reconstructs that
            # intent as the mean distance from the word to a set's members.
            def meanDist(word, words):
                return sum(dis[conjSet[word][0]][conjSet[w][0]] for w in words) / len(words)
            for word in list(set1):
                score1 = meanDist(word, set1)
                score2 = meanDist(word, set2)
                if score2 < score1:
                    print("swap")
                    set1.remove(word)
                    set2.append(word)
            for word in list(set2):
                score1 = meanDist(word, set1)
                score2 = meanDist(word, set2)
                if score1 < score2:
                    set2.remove(word)
                    set1.append(word)
            return set1, set2
        def calcScore(words, conjSet, dis):
            # Sum of pairwise distances within the set (j starts at i, so
            # each word's self-distance is included), normalized by set size
            score = 0
            for i in range(len(words)):
                w1 = words[i]
                for j in range(i, len(words)):
                    w2 = words[j]
                    cats = dis[conjSet[w1][0]][conjSet[w2][0]]
                    score = score + cats
            return score / len(words)
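        # Worked example (toy values, not from the data files): suppose
        # conjSet maps "calm" -> (0, ...) and "angry" -> (1, ...), with
        # dis[0][1] == 1.0 and the diagonal left at the 0.5 fill used by
        # vectorize below. calcScore(["calm", "angry"], conjSet, dis) then
        # sums the pairs (0,0)=0.5, (0,1)=1.0, (1,1)=0.5 to 2.0 and
        # returns 2.0 / 2 = 1.0.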
        def calcSwap(word, currSet, opSet, currentCount, otherCount, conjSet, dis):
            # Incrementally estimate the objective if `word` were moved from
            # currSet to opSet, instead of rescoring both sets from scratch.
            # (Unlike calcScore, the self-pair dis[word][word] term is not
            # carried across, so this is a close approximation.)
            score1 = 0
            score2 = 0
            for w in currSet:
                if word != w:
                    cats = dis[conjSet[word][0]][conjSet[w][0]]
                    score1 = score1 + cats
            # Remove word's contribution from its current set's mean
            currentCount = ((currentCount * len(currSet)) - score1) / (len(currSet) - 1)
            for w in opSet:
                if word != w:
                    cats = dis[conjSet[word][0]][conjSet[w][0]]
                    score2 = score2 + cats
            # Add word's contribution to the other set's mean
            otherCount = ((otherCount * len(opSet)) + score2) / (len(opSet) + 1)
            return currentCount + otherCount
        def normalize_word(word):
            # Snowball-stem a word (not called in the current pipeline)
            return SnowballStemmer("english").stem(word)
        def vectorize(conjSet, defSet):
            # Pairwise dissimilarity matrix: 0 for words conjoined by "and",
            # 1 for words conjoined by "but", 0.5 where there is no evidence
            dis = numpy.zeros((len(defSet), len(defSet)))
            dis.fill(.5)
            for word in defSet:
                similar = conjSet[word][1]
                dissimilar = conjSet[word][2]
                for sim in similar:
                    dis[conjSet[word][0]][conjSet[sim][0]] = 0
                for d in dissimilar:
                    dis[conjSet[word][0]][conjSet[d][0]] = 1
            return dis
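        # For instance (hypothetical entries): if conjSet were
        # {"calm": (0, {"quiet"}, {"angry"}), "quiet": (1, {"calm"}, set()),
        #  "angry": (2, set(), {"calm"})}, vectorize would return
        #   [[0.5, 0.0, 1.0],
        #    [0.0, 0.5, 0.5],
        #    [1.0, 0.5, 0.5]]
        # with rows and columns ordered by each word's index.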
        def word_feats(words):
            # Bag-of-words feature dict for the NLTK classifier
            return {word: True for word in words}
        def genSets():
            # Read the seed lexicon into positive and negative word sets
            with open('words.txt', 'r') as f:
                content = f.readlines()
            positive = set()
            negative = set()
            for pair in content:
                current = pair.split(' ')
                if current[1][0] == 'p':
                    positive.add(current[0])
                elif current[1][0] == 'n':
                    negative.add(current[0])
            return positive, negative
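        # The parsing above implies words.txt holds one space-separated
        # "word label" pair per line, where the label starts with 'p' or
        # 'n', e.g. (hypothetical lines):
        #   good positive
        #   bad negative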
        def getConj():
            # Map each adjective to a tuple (index, similar, dissimilar)
            with open('conj.txt', 'r') as f:
                content = f.readlines()
            d = dict()
            i = 0
            for line in content:
                current = line.split(' ')
                if current[2] == "but":
                    # "but" signals opposite orientation
                    if current[0] in d:
                        d[current[0]][2].add(current[1])
                    else:
                        d[current[0]] = (i, set(), set([current[1]]))
                        i = i + 1
                    if current[1] in d:
                        d[current[1]][2].add(current[0])
                    else:
                        d[current[1]] = (i, set(), set([current[0]]))
                        i = i + 1
                else:
                    # Any other conjunction (e.g. "and") signals similar orientation
                    if current[0] in d:
                        d[current[0]][1].add(current[1])
                    else:
                        d[current[0]] = (i, set([current[1]]), set())
                        i = i + 1
                    if current[1] in d:
                        d[current[1]][1].add(current[0])
                    else:
                        d[current[1]] = (i, set([current[0]]), set())
                        i = i + 1
            return d
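        # The indexing above implies conj.txt lines carry at least three
        # space-separated fields, "adj1 adj2 conjunction ...", with the third
        # field compared literally against "but" (hypothetical line:
        # "quiet angry but ...").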
        # Get the data
        """
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])
        """
        # Generate positive and negative initial sets of equal size
        sets = genSets()
        positive = random.sample(list(sets[0]), min(len(sets[0]), len(sets[1])))
        negative = random.sample(list(sets[1]), min(len(sets[0]), len(sets[1])))
        # Clustering setup (stopwords computed but not used below)
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # Create dictionary (adj -> (index, similar, dissimilar))
        conjSet = getConj()
        # Create a list out of all keys of conjSet
        defSet = list(conjSet.keys())
        # Generate the dissimilarity matrix
        dis = vectorize(conjSet, defSet)
        # It's cluster time: seed with the two halves of the vocabulary
        set1 = defSet[len(defSet)//2:]
        set2 = defSet[:len(defSet)//2]
        """
        set1 = random.sample(defSet, len(defSet)//4)
        set2 = [x for x in defSet if x not in set1]
        """
        # Optimize the objective function
        sets = optimize2(set1, set2, conjSet, defSet, dis)
        set1 = sets[0]
        set2 = sets[1]
        print(set1)
        print(set2)
        # Write the two orientation sets out, one word per line
        with open('set1.txt', 'w') as f1:
            for word in set1:
                f1.write(word + "\n")
        with open('set2.txt', 'w') as f2:
            for word in set2:
                f2.write(word + "\n")
        # Can we classify and then run bag of words?
        # negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        # posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        # trainfeats = [({word: True}, "pos") for word in positive] + [({word: True}, "neg") for word in negative]
        # testfeats = negfeats[500:] + posfeats[500:]
        # classifier1 = NaiveBayesClassifier.train(trainfeats)
        # print('accuracy:', nltk.classify.util.accuracy(classifier1, ({"Bad": True}, "neg")))
cblexicon().process()
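
# Environment assumptions (not stated in the original): the NLTK stopwords
# corpus must be installed, e.g. via
#   python -c "import nltk; nltk.download('stopwords')"
# and words.txt / conj.txt must exist in the working directory; set1.txt
# and set2.txt are (over)written there.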