Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/cblexicon.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
264 lines (233 sloc)
9.54 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import math | |
import nltk | |
from nltk.corpus import wordnet as wn | |
from collections import Counter | |
import numpy | |
from nltk.corpus import movie_reviews | |
import nltk.stem | |
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | |
from nltk.classify import NaiveBayesClassifier | |
import random | |
from nltk.stem import * | |
from sets import Set | |
class cblexicon: | |
def process(self): | |
def optimize(set1,set2,conjSet,defSet,dis): | |
i = 0 | |
currentMin = 999999 | |
consideredMin = calcScore(set1,set2,conjSet,dis) | |
bestSwapWord = "" | |
# Calculate the best word to remove until no moves lessen the function | |
while( currentMin > consideredMin): | |
print i | |
i = i + 1 | |
currentMin = consideredMin | |
for word in set1: | |
set1.remove(word) | |
set2.append(word) | |
test = calcScore(set1,set2,conjSet,dis) | |
set2.remove(word) | |
set1.append(word) | |
if (test < consideredMin): | |
consideredMin = test | |
bestSwapWord = word | |
for word in set2: | |
set2.remove(word) | |
set1.append(word) | |
test = calcScore(set1,set2,conjSet,dis) | |
set1.remove(word) | |
set2.append(word) | |
if (test < consideredMin): | |
consideredMin = test | |
bestSwapWord = word | |
if(bestSwapWord in set1): | |
set1.remove(bestSwapWord) | |
set2.append(bestSwapWord) | |
else: | |
set2.remove(bestSwapWord) | |
set1.append(bestSwapWord) | |
# Return the optimized sets | |
return set1,set2 | |
def optimize2(set1,set2,conjSet,defSet,dis): | |
i = 0 | |
currentMin = 999999 | |
consideredMin = calcScore(set1,conjSet,dis) + calcScore(set2,conjSet,dis) | |
bestSwapWord = None | |
print consideredMin | |
# Calculate the best word to remove until no moves lessen the function | |
while( currentMin > consideredMin): | |
print "Iteration #%d: (%d, %d)" % (i, len(set1), len(set2)) | |
currentMin = consideredMin | |
currentS1 = calcScore(set1,conjSet,dis) | |
currentS2 = calcScore(set2,conjSet,dis) | |
consideredMin = currentS1 + currentS2 # | |
for word in set1: | |
test = calcSwap(word,set1,set2,currentS1,currentS2,conjSet,dis) | |
if (test < consideredMin): | |
print "found1" | |
consideredMin = test | |
bestSwapWord = word | |
for word in set2: | |
test = calcSwap(word,set2,set1,currentS2,currentS1,conjSet,dis) | |
if (test < consideredMin): | |
print "found2" | |
consideredMin = test | |
bestSwapWord = word | |
print "New min: %f" % consideredMin | |
if(bestSwapWord in set1): | |
set1.remove(bestSwapWord) | |
set2.append(bestSwapWord) | |
elif(bestSwapWord in set2): | |
set2.remove(bestSwapWord) | |
set1.append(bestSwapWord) | |
i = i + 1 | |
# Return the optimized sets | |
return set1,set2 | |
def cluster(set1,set2,conjSet,defSet,dis): | |
for word in set1: | |
score1 = calcScore(word,set1,conjSet,dis) | |
#print "Score 1: %f" % score1 | |
score2 = calcScore(word,set2,conjSet,dis) | |
#print "Score 2: %f" % score2 | |
if score2 < score1: | |
print "swap" | |
set1.remove(word) | |
set2.append(word) | |
for word in set2: | |
score1 = calcScore(word,set1,conjSet,dis) | |
score2 = calcScore(word,set2,conjSet,dis) | |
if score1 < score2: | |
set2.remove(word) | |
set1.append(word) | |
return set1,set2 | |
def calcScore(set,conjSet,dis): | |
score = 0 | |
for i in range(len(set)): | |
w1 = set[i] | |
for j in range(i, len(set)): | |
w2 = set[j] | |
cats = dis[conjSet[w1][0]][conjSet[w2][0]] | |
score = score + cats | |
return score / len(set) | |
def calcSwap(word,currSet,opSet,currentCount,otherCount,conjSet,dis): | |
score1 = 0 | |
score2 = 0 | |
for w in currSet: | |
if word != w: | |
cats = dis[conjSet[word][0]][conjSet[w][0]] | |
score1 = score1 + cats | |
currentCount = ((currentCount* len(currSet)) - score1 )/(len(currSet)-1) | |
#for word in set2: | |
for w in opSet: | |
if word != w: | |
cats = dis[conjSet[word][0]][conjSet[w][0]] | |
score2 = score2 + cats | |
otherCount = ((otherCount* len(opSet)) + score2 )/(len(opSet)+1) | |
return currentCount + otherCount | |
def normalize_word(word): | |
return SnowballStemmer("english").stem(word) | |
def vectorize(conjSet,defSet): | |
dis = numpy.zeros((len(defSet),len(defSet))) | |
dis.fill(.5) | |
for word in defSet: | |
similar = conjSet[word][1] | |
dissimilar = conjSet[word][2] | |
for sim in similar: | |
dis[conjSet[word][0]][conjSet[sim][0]] = 0 | |
for d in dissimilar: | |
dis[conjSet[word][0]][conjSet[d][0]] = 1 | |
return dis | |
def word_feats(words): | |
return dict([(word, True) for word in words]) | |
def genSets(): | |
f = open('words.txt', 'r+') | |
content = f.readlines() | |
positive = Set([]) | |
negative = Set([]) | |
for pair in content: | |
current = pair.split(' ') | |
if (current[1][0] == 'p'): | |
positive.add(current[0]) | |
elif (current[1][0] == 'n'): | |
negative.add(current[0]) | |
return positive,negative | |
def getConj(): | |
# Set up the tuple (index, similar, dissimilar) | |
f = open('conj.txt', 'r+') | |
content = f.readlines() | |
d = dict() | |
i = 0 | |
for line in content: | |
current = line.split(' ') | |
if current[2] == "but": | |
if current[0] in d: | |
d[current[0]][2].add(current[1]) | |
else: | |
d[current[0]] = (i,Set(),Set([current[1]])) | |
i = i+1 | |
if current[1] in d: | |
d[current[1]][2].add(current[0]) | |
else: | |
d[current[1]] = (i,Set(),Set([current[0]])) | |
i = i+1 | |
else: | |
if current[0] in d: | |
d[current[0]][1].add(current[1]) | |
else: | |
d[current[0]] = (i,Set([current[1]]),Set()) | |
i = i+1 | |
if current[1] in d: | |
d[current[1]][1].add(current[0]) | |
else: | |
d[current[1]] = (i,Set([current[0]]),Set()) | |
i = i+1 | |
return d | |
#Get the Data# | |
""" | |
negids = movie_reviews.fileids('neg') | |
posids = movie_reviews.fileids('pos') | |
training = set(negids[:500] + posids[:500]) | |
testing = set(negids[500:] + posids[500:]) | |
""" | |
# Generate positive and negative initial sets | |
sets = genSets() | |
positive = random.sample(sets[0], min(len(sets[0]), len(sets[1]))) | |
negative = random.sample(sets[1], min(len(sets[0]), len(sets[1]))) | |
# Clustering Setup | |
stopwords = set(nltk.corpus.stopwords.words('english')) | |
# Create dictionary (adj, (index, similar, dissimilar)) | |
conjSet = getConj() | |
# Create list out of all keys of conjSet | |
defSet = conjSet.keys() | |
# Generate dissimilarity matrix | |
dis = vectorize(conjSet,defSet) | |
# Its Cluster time | |
set1 = defSet[len(defSet)//2:] | |
set2 = defSet[:len(defSet)//2] | |
""" | |
set1 = random.sample(defSet, len(defSet)//4) | |
set2 = [x for x in defSet if x not in set1] | |
""" | |
# Optimize objective function | |
sets = optimize2(set1,set2,conjSet,defSet,dis) | |
set1 = sets[0] | |
set2 = sets[1] | |
print(set1) | |
print(set2) | |
f1 = open('set1.txt', 'w+') | |
f2 = open('set2.txt', 'w+') | |
for word in set1: | |
f1.write(word + "\n") | |
for word in set2: | |
f2.write(word + "\n") | |
f1.close() | |
f2.close() | |
# Can we classify and then run bag of words? | |
#negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids] | |
#posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids] | |
#trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative] | |
#testfeats = negfeats[500:] + posfeats[500:] | |
#classifier1 = NaiveBayesClassifier.train(trainfeats) | |
#print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg")) | |
cblexicon().process() |