Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Working on clustering in cblexicon
  • Loading branch information
adl13006 committed Apr 2, 2016
1 parent 8e9a44c commit 66918f8
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 30 deletions.
Binary file added BagOfWords.pyc
Binary file not shown.
23 changes: 6 additions & 17 deletions GlossCount.py
Expand Up @@ -61,19 +61,13 @@ class GlossCount:
classifier = NaiveBayesClassifier.train(trainfeats)
print "cat"
#print classifier.classify(dict([(word, True) for word in words]))
<<<<<<< HEAD
#print classifier.classify(dict([("bad",True),("bad",True)]))


=======
print classifier.classify(dict([("bad",True),("bad",True)]))
>>>>>>> parent of 47c6a2a... Bugfix

# Iterate through all of the reviews and find sentiment
count = 0.00
correct = 0.00
for reviews in movie_reviews.fileids():
for reviews in movie_reviews.fileids(): #For every review
score = 0;
<<<<<<< HEAD
tokens = nltk.pos_tag(nltk.word_tokenize(movie_reviews.raw(fileids=[reviews]))) #Tokenize all words with POS
for token in tokens:
if (token[1]== "JJ" or token[1] == "JJR" or token[1] == "JJS"): # If adjective, check value
Expand All @@ -82,24 +76,19 @@ class GlossCount:
score = score - 1
elif(sent_value is 'pos'):
score = score + 1
=======
for words in movie_reviews.words(fileids=[reviews]):
if()
sent_value = classifier.classify(dict([(word, True)]))
if(sent_value is 'neg'):
score = score - 1
elif(sent_value is 'pos'):
score = score + 1
>>>>>>> parent of 47c6a2a... Bugfix
if (score < 0):
print "Negative at %d" % (score)
sentiment = 'neg'
else:
sentiment = 'pos'
print "Positive at %d" % (score)
if (sentiment == movie_reviews.categories(fileids=[reviews])[0]):
print "Correct"
correct = correct + 1.00
count = count + 1.00
print correct/count




GlossCount().demo()
Binary file added MPQALexicon.pyc
Binary file not shown.
119 changes: 106 additions & 13 deletions cblexicon.py
@@ -1,26 +1,119 @@
import math
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown as sc
from collections import Counter
import numpy
from nltk.corpus import movie_reviews
import nltk.stem
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
from nltk.classify import NaiveBayesClassifier
import random
from nltk.stem import *

from sets import Set

class cblexicon:
    """Work-in-progress sentiment lexicon builder (Python 2).

    Seeds positive/negative adjective sets from words.txt, then clusters
    adjectives into two groups using conjunction co-occurrence data
    (conj.txt, produced by getAdjectives.genConj).
    """

    def genSets(self):
        """Read words.txt and return (positive, negative) seed word sets.

        Each line is expected to look like "<word> <polarity...>" where the
        polarity field starts with 'p' or 'n'.  Lines with any other
        polarity marker are ignored.
        """
        # 'r', not 'r+': the file is only read here.  Close it when done.
        f = open('words.txt', 'r')
        try:
            content = f.readlines()
        finally:
            f.close()
        positive = Set([])
        negative = Set([])
        # Single pass over the file (the original ran this loop twice,
        # which was redundant: adding to a set is idempotent).
        for pair in content:
            current = pair.split(' ')
            if current[1][0] == 'p':
                positive.add(current[0])
            elif current[1][0] == 'n':
                negative.add(current[0])
        return positive, negative

    def process(self):
        """Run the clustering experiment end to end (prints progress)."""

        def normalize_word(word):
            # Stem with Snowball so inflected forms collapse to one key.
            # NOTE(review): currently unused -- kept as WIP scaffolding.
            return SnowballStemmer("english").stem(word)

        def vectorspaced(title, CS, DF):
            # Binary feature vector over the full adjective list DF:
            # 1 where the adjective was conjoined with `title`, else 0.
            # CS maps adjective -> (index, set-of-conjoined-adjectives).
            title_components = CS[title][1]
            return numpy.array([word in title_components
                                for word in DF], numpy.short)

        def word_feats(words):
            # Bag-of-words feature dict for an NLTK classifier.
            # NOTE(review): currently unused -- kept as WIP scaffolding.
            return dict([(word, True) for word in words])

        def getConj():
            """Parse conj.txt into adjective -> (index, conjoined-set).

            Each line of conj.txt is "<adj1> <adj2> ...".  Both adjectives
            get an entry; the index is assigned in order of first sighting.
            """
            # 'r', not 'r+': read-only access; close the handle when done.
            f = open('conj.txt', 'r')
            try:
                content = f.readlines()
            finally:
                f.close()
            d = dict()
            i = 0
            for line in content:
                current = line.split(' ')
                # Add the first adjective
                if current[0] in d:
                    d[current[0]][1].add(current[1])
                else:
                    d[current[0]] = (i, Set([current[1]]))
                    i = i + 1
                # Add the second adjective
                if current[1] in d:
                    d[current[1]][1].add(current[0])
                else:
                    d[current[1]] = (i, Set([current[0]]))
                    i = i + 1
            return d

        # --- Get the data ---
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        # NOTE(review): training/testing are not used yet (WIP).
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])

        # Generate positive and negative initial sets, truncated to equal
        # size so neither polarity dominates the seed.
        sets = self.genSets()
        positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
        negative = random.sample(sets[1], min(len(sets[0]), len(sets[1])))
        print(len(positive))
        print(len(negative))

        # --- Clustering setup ---
        # NOTE(review): stopwords not used yet (WIP).
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # Dictionary of (adj, (index, [associated words]))
        conjSet = getConj()
        print(conjSet)

        # Every adjective seen in a conjunction; defines the vector space.
        defSet = conjSet.keys()

        # Two clusters: ideally one positive, one negative.
        cluster = KMeansClusterer(2, euclidean_distance)
        print(conjSet["young"])
        z = vectorspaced("young", conjSet, defSet)
        for num in z:
            if num == 1:
                print("one")

        # Clustering the whole vocabulary at once is slow; seed the
        # clusterer with two example vectors, then classify everything.
        # cluster.cluster([vectorspaced(t, conjSet, defSet) for t in defSet if t])
        cluster.cluster(vectorspaced("young", conjSet, defSet))
        cluster.cluster(vectorspaced("stiff", conjSet, defSet))
        classified_examples = [
            cluster.classify(vectorspaced(title, conjSet, defSet))
            for title in defSet
        ]
        print(classified_examples)

        # TODO: classify with the clusters, then run bag-of-words
        # (NaiveBayesClassifier over word_feats) to score accuracy.



# Run the full pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    cblexicon().process()
31 changes: 31 additions & 0 deletions getAdjectives.py
Expand Up @@ -4,6 +4,37 @@ from nltk.corpus import brown as sc
from collections import Counter



def genConj(training):
    """Extract "ADJ <conjunction> ADJ" trigrams from every review.

    For each review id in `training`, POS-tags the raw text and scans for
    two adjectives (JJ/JJR/JJS) separated by a conjunction word.  Each
    pair is written to conj.txt (as "adj1 adj2 conj") and to the
    per-conjunction file (ands/ors/buts/eor/nor .txt, as "adj1 adj2").

    NOTE(review): the files are opened 'r+', so they must already exist
    and output overwrites from the start of each file -- confirm this is
    intended rather than append mode ('a').
    """
    ADJ_TAGS = ("JJ", "JJR", "JJS")
    conj = open('conj.txt', 'r+')
    # One output file per conjunction type; doubles as the match test.
    outputs = {
        "and": open('ands.txt', 'r+'),
        "or": open('ors.txt', 'r+'),
        # BUGFIX: the original compared against "but" + "\n", which can
        # never equal a token, so "but" pairs were silently dropped.
        "but": open('buts.txt', 'r+'),
        "either-or": open('eor.txt', 'r+'),
        "neither-nor": open('nor.txt', 'r+'),
    }
    try:
        j = 0
        for review in training:  # For every review
            tokens = nltk.pos_tag(nltk.word_tokenize(
                movie_reviews.raw(fileids=[review])))
            print(j)  # progress indicator
            j = j + 1
            # BUGFIX: range stops at len-2 so the trigram ending on the
            # last token is included (the original stopped one short).
            for i in range(0, max(0, len(tokens) - 2)):
                first, middle, second = tokens[i], tokens[i + 1], tokens[i + 2]
                if first[1] in ADJ_TAGS and second[1] in ADJ_TAGS:
                    out = outputs.get(middle[0])
                    if out is not None:
                        conj.write(first[0] + " " + second[0] + " "
                                   + middle[0] + "\n")
                        out.write(first[0] + " " + second[0] + "\n")
    finally:
        # The original never closed these handles; buffered output could
        # be lost if the interpreter exited abnormally.
        conj.close()
        for handle in outputs.values():
            handle.close()

f = open('words.txt', 'r+')
list1 = []
for word in sc.tagged_sents():
Expand Down

0 comments on commit 66918f8

Please sign in to comment.