Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Updates to Conjunctions
  • Loading branch information
adl13006 committed Apr 10, 2016
1 parent cbc5649 commit 235376b
Show file tree
Hide file tree
Showing 2 changed files with 1,922 additions and 36 deletions.
107 changes: 71 additions & 36 deletions cblexicon.py
@@ -1,3 +1,4 @@
from __future__ import division
import math
import nltk
from nltk.corpus import wordnet as wn
Expand All @@ -16,14 +17,46 @@ class cblexicon:

def process(self):

def cluster(set1,set2,conjSet,defSet,dis):
    """Run one reassignment pass that moves words between two lists.

    Each word is scored against both lists with ``calcScore`` and is moved
    to the other list when that list gives it a strictly lower score.

    Args:
        set1: First list of words; modified in place.
        set2: Second list of words; modified in place.
        conjSet: Mapping passed through to ``calcScore``.
        defSet: Not used by this function; kept so existing callers work.
        dis: Dissimilarity table passed through to ``calcScore``.

    Returns:
        The tuple ``(set1, set2)`` holding the same (mutated) list objects.
    """
    # Loop over a snapshot: removing from a list while iterating over that
    # same list makes the iterator skip the element after each removal, so
    # some words would never be examined.
    for word in list(set1):
        score1 = calcScore(word,set1,conjSet,dis)
        score2 = calcScore(word,set2,conjSet,dis)
        if score2 < score1:
            print("swap")
            set1.remove(word)
            set2.append(word)
    # Words moved into set2 above are part of this snapshot and are therefore
    # re-evaluated against the updated lists.
    for word in list(set2):
        score1 = calcScore(word,set1,conjSet,dis)
        score2 = calcScore(word,set2,conjSet,dis)
        if score1 < score2:
            set2.remove(word)
            set1.append(word)
    return set1,set2

def calcScore(curr,set,conjSet,dis):
    """Return the scaled dissimilarity between ``curr`` and the members of ``set``.

    Every member of ``set`` other than ``curr`` contributes the entry
    ``dis[conjSet[curr][0]][conjSet[member][0]]``; the sum is then multiplied
    by ``1.0 / len(set)``. ``curr`` itself is left out of the sum but still
    counts towards ``len(set)``.

    Raises:
        ZeroDivisionError: if ``set`` is empty.
    """
    total = sum(
        dis[conjSet[curr][0]][conjSet[other][0]]
        for other in set
        if other != curr
    )
    scale = 1.0 / len(set)
    return total * scale

def normalize_word(word):
    """Return the stem of ``word`` produced by an English ``SnowballStemmer``."""
    stemmer = SnowballStemmer("english")
    return stemmer.stem(word)

def vectorspaced(title,CS,DF):
    """Return a membership vector for ``title`` over the vocabulary ``DF``.

    The result is a ``numpy.short`` array with one entry per item of ``DF``:
    1 where the item is contained in ``CS[title][1]`` and 0 otherwise.
    """
    related = CS[title][1]
    flags = []
    for candidate in DF:
        flags.append(candidate in related)
    return numpy.array(flags, numpy.short)
def vectorize(conjSet,defSet):
    """Build the square dissimilarity matrix for the words in ``defSet``.

    Every cell starts at 0.5. For each word, the cells addressed by the
    indices of the words in ``conjSet[word][1]`` are set to 0, and then the
    cells addressed by the words in ``conjSet[word][2]`` are set to 1. Rows
    and columns use the index stored in ``conjSet[...][0]``.
    """
    size = len(defSet)
    matrix = numpy.full((size, size), .5)
    for word in defSet:
        entry = conjSet[word]
        similar, dissimilar = entry[1], entry[2]
        row = entry[0]
        for other in similar:
            matrix[row, conjSet[other][0]] = 0
        # Written second, so a word listed in both collections ends up as 1.
        for other in dissimilar:
            matrix[row, conjSet[other][0]] = 1
    return matrix

def word_feats(words):
    """Return a dict that maps every item of ``words`` to ``True``."""
    return dict.fromkeys(words, True)
Expand All @@ -44,24 +77,35 @@ class cblexicon:
return positive,negative

def getConj():
# Builds and returns a dict keyed by the words read from conj.txt. Each value
# is a tuple whose first element is taken from a running counter (bumped
# every time a new key is inserted) and whose remaining elements are sets of
# the words it was paired with.
# NOTE(review): the statements below carry no indentation and appear to
# interleave the pre-change and post-change versions of this function as
# rendered by a diff view -- confirm the real control flow against the file.
# Set up the tuple (index, similar, dissimilar)
# NOTE(review): the handle opened here is never closed in the visible code.
f = open('conj.txt', 'r+')
content = f.readlines()
d = dict()
i = 0
for line in content:
# Fields are separated by single spaces: fields 0 and 1 are used as the two
# words and field 2 is compared against "but" further down.
# NOTE(review): the line is not stripped, so the last field keeps any
# trailing newline -- verify that field 2 is never the last field.
current = line.split(' ')
#Add the first adjective
if current[0] in d:
d[current[0]][1].add(current[1])
else:
# NOTE(review): two-element tuple with no third (dissimilar) slot, unlike the
# three-element tuples built below -- presumably from the older version.
d[current[0]] = (i,Set([current[1]]))
i = i+1
#Add the second adjective
if current[1] in d:
d[current[1]][1].add(current[0])
# When field 2 is "but", each word is added to the other's set at tuple
# position 2 (the "dissimilar" slot named in the comment at the top).
if current[2] == "but":
if current[0] in d:
d[current[0]][2].add(current[1])
else:
d[current[0]] = (i,Set(),Set([current[1]]))
i = i+1
if current[1] in d:
d[current[1]][2].add(current[0])
else:
d[current[1]] = (i,Set(),Set([current[0]]))
i = i+1
else:
d[current[1]] = (i,Set([current[0]]))
i = i+1
# Here each word is added to the other's set at tuple position 1 (the
# "similar" slot named in the comment at the top).
if current[0] in d:
d[current[0]][1].add(current[1])
else:
d[current[0]] = (i,Set([current[1]]),Set())
i = i+1
if current[1] in d:
d[current[1]][1].add(current[0])
else:
d[current[1]] = (i,Set([current[0]]),Set())
i = i+1
return d

#Get the Data#
Expand All @@ -73,38 +117,29 @@ class cblexicon:
# NOTE(review): this run of statements has lost its indentation and appears
# to mix removed and added diff lines (the KMeansClusterer calls alongside
# the hand-written cluster() loop) -- confirm against the real file.
sets = genSets()
# Both samples use the smaller of the two list lengths, so positive and
# negative end up the same size.
positive = random.sample(sets[0], min(len(sets[0]), len(sets[1])))
negative = random.sample(sets[1], min(len(sets[0]), len(sets[1])))
print len(positive)
print len(negative)

# Clustering Setup
stopwords = set(nltk.corpus.stopwords.words('english'))
# Create dictionary (adj, (index,[associated words]))
# Create dictionary (adj, (index, similar, dissimilar))
conjSet = getConj()
print conjSet

# Create list out of all keys of conjSet
# defSet is sliced further down, so keys() has to return a list here
# (Python 2 behaviour).
defSet = conjSet.keys()

# It's cluster time
# NOTE(review): this rebinds the name `cluster` to a KMeansClusterer
# instance, yet `cluster` is called with five arguments in the loop at the
# bottom -- presumably this line belongs to the removed side of the diff.
cluster = KMeansClusterer(2, euclidean_distance)
print conjSet["young"]
z = vectorspaced("young",conjSet,defSet)

for num in z:
if num == 1:
print "one"

# Generate dissimilarity matrix
dis = vectorize(conjSet,defSet)

#cluster.cluster([vectorspaced(title,conjSet,defSet) for title in defSet if title])
cluster.cluster(vectorspaced("young",conjSet,defSet))
cluster.cluster(vectorspaced("stiff",conjSet,defSet))
classified_examples = [
cluster.classify(vectorspaced(title,conjSet,defSet)) for title in defSet
]
print classified_examples

# It's cluster time
# Start from an arbitrary split: the second half of the key list goes to
# set1 and the first half to set2.
set1 = defSet[len(defSet)//2:]
set2 = defSet[:len(defSet)//2]

# Run ten reassignment passes, feeding each pass the lists returned by the
# previous one.
for i in range(0,10):
sets = cluster(set1,set2,conjSet,defSet,dis)
set1 = sets[0]
set2 = sets[1]

print len(set2)
# Can we classify and then run bag of words?
#negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
Expand Down

0 comments on commit 235376b

Please sign in to comment.