Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/cblexicon.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
212 lines (182 sloc)
7.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import math | |
import nltk | |
from nltk.corpus import wordnet as wn | |
from collections import Counter | |
import numpy | |
from nltk.corpus import movie_reviews | |
import nltk.stem | |
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | |
from nltk.classify import NaiveBayesClassifier | |
import random | |
from nltk.stem import * | |
from sets import Set | |
class cblexicon:
    """Build a sentiment lexicon by splitting adjectives into two polarity
    clusters using conjunction evidence (words joined by "but" are treated
    as dissimilar; words joined by other conjunctions as similar)."""

    def process(self):
        """Read seed/conjunction data, split the vocabulary in half,
        greedily minimise the intra-cluster dissimilarity objective, and
        write the two resulting word sets to set1.txt and set2.txt.

        Reads:  words.txt (seed polarity words), conj.txt (conjoined pairs).
        Writes: set1.txt, set2.txt (one word per line).
        """

        def calcScore(set1, set2, conjSet, dis):
            # Objective function: mean pairwise dissimilarity within each
            # cluster (lower is better — similar words grouped together).
            score1 = 0
            for curr in set1:
                for word in set1:
                    if word != curr:
                        score1 += dis[conjSet[curr][0]][conjSet[word][0]]
            score1 *= 1.0 / len(set1)
            score2 = 0
            for curr in set2:
                for word in set2:
                    if word != curr:
                        score2 += dis[conjSet[curr][0]][conjSet[word][0]]
            score2 *= 1.0 / len(set2)
            return score1 + score2

        def optimize(set1, set2, conjSet, defSet, dis):
            """Greedy hill-climbing: on each pass, find the single word whose
            move to the other set most lowers calcScore and move it; stop
            when no move improves the objective. Returns (set1, set2)."""
            iteration = 0
            currentMin = float('inf')
            consideredMin = calcScore(set1, set2, conjSet, dis)
            while currentMin > consideredMin:
                print(iteration)
                iteration += 1
                currentMin = consideredMin
                # Reset per pass so a stale best from a previous pass is
                # never re-applied.
                bestSwapWord = None
                # Score each candidate move; iterate over copies because the
                # sets are temporarily mutated while scoring.
                for word in list(set1):
                    set1.remove(word)
                    set2.append(word)
                    test = calcScore(set1, set2, conjSet, dis)
                    set2.remove(word)
                    set1.append(word)
                    if test < consideredMin:
                        consideredMin = test
                        bestSwapWord = word
                for word in list(set2):
                    set2.remove(word)
                    set1.append(word)
                    test = calcScore(set1, set2, conjSet, dis)
                    set1.remove(word)
                    set2.append(word)
                    if test < consideredMin:
                        consideredMin = test
                        bestSwapWord = word
                # BUG FIX: the original moved the stale loop variable `word`
                # instead of bestSwapWord, and performed a swap even when no
                # improving move had been found.
                if bestSwapWord is not None:
                    if bestSwapWord in set1:
                        set1.remove(bestSwapWord)
                        set2.append(bestSwapWord)
                    else:
                        set2.remove(bestSwapWord)
                        set1.append(bestSwapWord)
            # Return the optimized sets
            return set1, set2

        def cluster(set1, set2, conjSet, defSet, dis):
            # NOTE(review): unused dead code, preserved as in the original.
            # Its calcScore(word, set1, ...) calls do NOT match calcScore's
            # (set1, set2, conjSet, dis) signature — confirm the intended
            # per-word scoring function before ever enabling this path.
            for word in set1:
                score1 = calcScore(word, set1, conjSet, dis)
                score2 = calcScore(word, set2, conjSet, dis)
                if score2 < score1:
                    print("swap")
                    set1.remove(word)
                    set2.append(word)
            for word in set2:
                score1 = calcScore(word, set1, conjSet, dis)
                score2 = calcScore(word, set2, conjSet, dis)
                if score1 < score2:
                    set2.remove(word)
                    set1.append(word)
            return set1, set2

        def normalize_word(word):
            """Return the English Snowball stem of *word*."""
            return SnowballStemmer("english").stem(word)

        def vectorize(conjSet, defSet):
            """Build the dissimilarity matrix over the vocabulary:
            0 for conjunction-linked similar pairs, 1 for dissimilar pairs,
            0.5 (no evidence) everywhere else."""
            dis = numpy.zeros((len(defSet), len(defSet)))
            dis.fill(.5)
            for word in defSet:
                idx = conjSet[word][0]
                for sim in conjSet[word][1]:
                    dis[idx][conjSet[sim][0]] = 0
                for d in conjSet[word][2]:
                    dis[idx][conjSet[d][0]] = 1
            return dis

        def word_feats(words):
            """Bag-of-words feature dict for an NLTK classifier."""
            return dict([(word, True) for word in words])

        def genSets():
            """Parse words.txt ("word p.../n..." per line) into seed sets.
            Returns (positive, negative) as builtin sets of words."""
            # Builtin set replaces the removed Python 2 `sets.Set`.
            positive = set()
            negative = set()
            with open('words.txt', 'r') as f:
                for pair in f.readlines():
                    current = pair.split(' ')
                    if current[1][0] == 'p':
                        positive.add(current[0])
                    elif current[1][0] == 'n':
                        negative.add(current[0])
            return positive, negative

        def getConj():
            """Parse conj.txt ("w1 w2 conj ..." per line) into a dict:
            word -> (index, similar-set, dissimilar-set). "but" marks the
            pair dissimilar; any other conjunction marks it similar."""
            d = dict()
            i = 0
            with open('conj.txt', 'r') as f:
                for line in f.readlines():
                    current = line.split(' ')
                    dissimilar = current[2] == "but"
                    # Record the evidence symmetrically for both words.
                    for a, b in ((current[0], current[1]),
                                 (current[1], current[0])):
                        if a in d:
                            d[a][2 if dissimilar else 1].add(b)
                        else:
                            if dissimilar:
                                d[a] = (i, set(), set([b]))
                            else:
                                d[a] = (i, set([b]), set())
                            i += 1
            return d

        # --- Get the data ---
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])
        # Generate positive and negative initial sets, balanced by
        # downsampling the larger one. random.sample requires a sequence
        # (set support was removed in Python 3.11), so sort first.
        sets = genSets()
        size = min(len(sets[0]), len(sets[1]))
        positive = random.sample(sorted(sets[0]), size)
        negative = random.sample(sorted(sets[1]), size)
        # Clustering setup
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # Dictionary (adj -> (index, similar, dissimilar))
        conjSet = getConj()
        # list() so the halves below can be sliced on Python 3 too.
        defSet = list(conjSet.keys())
        # Dissimilarity matrix
        dis = vectorize(conjSet, defSet)
        # Initial split: second half vs first half of the vocabulary.
        set1 = defSet[len(defSet) // 2:]
        set2 = defSet[:len(defSet) // 2]
        # Optimize the objective function
        set1, set2 = optimize(set1, set2, conjSet, defSet, dis)
        print(set1)
        print(set2)
        # BUG FIX: the second file was opened as 'set1.txt' (clobbering the
        # first) and the literal "/n" was written instead of a newline.
        with open('set1.txt', 'w+') as f:
            for word in set1:
                f.write(word + "\n")
        with open('set2.txt', 'w+') as f2:
            for word in set2:
                f2.write(word + "\n")
        # Can we classify and then run bag of words?
        #negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        #posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
        #testfeats = negfeats[500:] + posfeats[500:]
        #classifier1 = NaiveBayesClassifier.train(trainfeats)
        #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))
cblexicon().process() |