Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/cblexicon.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
154 lines (130 sloc)
5.29 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import math | |
import nltk | |
from nltk.corpus import wordnet as wn | |
from collections import Counter | |
import numpy | |
from nltk.corpus import movie_reviews | |
import nltk.stem | |
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | |
from nltk.classify import NaiveBayesClassifier | |
import random | |
from nltk.stem import * | |
from sets import Set | |
class cblexicon:
    """Build a two-cluster sentiment lexicon from conjunction evidence.

    Adjectives joined by "and" tend to share polarity; adjectives joined by
    "but" tend to have opposite polarity.  `process` reads that evidence from
    ``conj.txt``, builds a pairwise dissimilarity matrix, and iteratively
    repartitions the vocabulary into two polarity clusters.
    """

    def process(self):
        """Run the full pipeline: load data, build the dissimilarity matrix,
        and refine an initial half/half split for 10 rounds, printing cluster
        sizes as it goes.

        Side effects: reads ``words.txt`` and ``conj.txt`` from the working
        directory, downloads/reads nltk corpora, prints progress to stdout.
        Returns None.
        """

        def cluster(set1, set2, conjSet, defSet, dis):
            # Move each word to whichever cluster it is, on average, LESS
            # dissimilar to.  Iterate over snapshots (list(...)) because the
            # loop mutates the live lists; the original iterated the list it
            # was removing from, which silently skips elements.
            for word in list(set1):
                score1 = calcScore(word, set1, conjSet, dis)
                score2 = calcScore(word, set2, conjSet, dis)
                if score2 < score1:
                    print("swap")
                    set1.remove(word)
                    set2.append(word)
            for word in list(set2):
                score1 = calcScore(word, set1, conjSet, dis)
                score2 = calcScore(word, set2, conjSet, dis)
                if score1 < score2:
                    set2.remove(word)
                    set1.append(word)
            return set1, set2

        def calcScore(curr, members, conjSet, dis):
            # Mean dissimilarity between `curr` and the words in `members`.
            # (Divides by len(members) including `curr` itself, matching the
            # original scoring.)  `members` renamed from `set` to avoid
            # shadowing the builtin.
            score = 0
            for word in members:
                if word != curr:
                    score = score + dis[conjSet[curr][0]][conjSet[word][0]]
            return score * (1.0 / len(members))

        def normalize_word(word):
            # Stem so morphological variants collapse to one lexicon entry.
            return SnowballStemmer("english").stem(word)

        def vectorize(conjSet, defSet):
            # dis[i][j] encodes conjunction evidence between words i and j:
            #   0   -> linked by a non-"but" conjunction (same polarity)
            #   1   -> linked by "but" (opposite polarity)
            #   0.5 -> no evidence either way (the default fill)
            dis = numpy.zeros((len(defSet), len(defSet)))
            dis.fill(.5)
            for word in defSet:
                idx, similar, dissimilar = conjSet[word]
                for sim in similar:
                    dis[idx][conjSet[sim][0]] = 0
                for d in dissimilar:
                    dis[idx][conjSet[d][0]] = 1
            return dis

        def word_feats(words):
            # Bag-of-words feature dict for an nltk classifier.
            return dict([(word, True) for word in words])

        def genSets():
            # words.txt lines look like "<word> <tag> ..." where the tag
            # starts with 'p' (positive) or 'n' (negative).  Uses the builtin
            # set type instead of the removed Python 2 `sets.Set`.
            positive = set()
            negative = set()
            with open('words.txt', 'r') as f:
                for pair in f.readlines():
                    current = pair.split(' ')
                    if current[1][0] == 'p':
                        positive.add(current[0])
                    elif current[1][0] == 'n':
                        negative.add(current[0])
            return positive, negative

        def getConj():
            """Build {adjective: (index, similar, dissimilar)} from conj.txt.

            Each line is "<word1> <word2> <conjunction> ...": "but" marks the
            pair as dissimilar, any other conjunction as similar.  `index` is
            the word's row/column in the dissimilarity matrix, assigned in
            first-seen order.  Evidence is recorded symmetrically for both
            words of the pair.
            """
            d = dict()
            i = 0
            with open('conj.txt', 'r') as f:
                content = f.readlines()
            for line in content:
                current = line.split(' ')
                # Tuple slot 2 holds the dissimilar set, slot 1 the similar set.
                bucket = 2 if current[2] == "but" else 1
                for a, b in ((current[0], current[1]),
                             (current[1], current[0])):
                    if a in d:
                        d[a][bucket].add(b)
                    else:
                        entry = (i, set(), set())
                        entry[bucket].add(b)
                        d[a] = entry
                        i = i + 1
            return d

        # --- Corpus setup (training/testing are currently unused downstream) ---
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])

        # Seed equal-sized positive/negative samples from words.txt.
        # Sample from a sorted list: random.sample on a set is deprecated
        # (and removed in Python 3.11+).
        sets = genSets()
        sample_size = min(len(sets[0]), len(sets[1]))
        positive = random.sample(sorted(sets[0]), sample_size)
        negative = random.sample(sorted(sets[1]), sample_size)

        # Clustering setup
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # Dictionary (adj -> (index, similar, dissimilar))
        conjSet = getConj()
        # list() is required on Python 3, where dict.keys() is a view that
        # cannot be sliced.
        defSet = list(conjSet.keys())
        # Pairwise dissimilarity matrix
        dis = vectorize(conjSet, defSet)

        # Start from an arbitrary half/half split and refine for 10 rounds.
        set1 = defSet[len(defSet) // 2:]
        set2 = defSet[:len(defSet) // 2]
        for i in range(0, 10):
            set1, set2 = cluster(set1, set2, conjSet, defSet, dis)
            print(len(set2))
cblexicon().process() |