Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Working on clustering in cblexicon
- Loading branch information
Showing 5 changed files with 143 additions and 30 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,119 @@ | ||
import math | ||
import nltk | ||
from nltk.corpus import wordnet as wn | ||
from nltk.corpus import brown as sc | ||
from collections import Counter | ||
import numpy | ||
from nltk.corpus import movie_reviews | ||
import nltk.stem | ||
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | ||
from nltk.classify import NaiveBayesClassifier | ||
import random | ||
from nltk.stem import * | ||
|
||
from sets import Set | ||
|
||
class cblexicon:
    """Build a sentiment lexicon by clustering conjunction-linked adjectives.

    Seed polarity words are read from ``words.txt`` and conjunction-joined
    adjective pairs from ``conj.txt``; adjectives are then K-means clustered
    into two groups (intended: positive vs. negative).

    NOTE(review): input-file formats are inferred from the parsing code —
    ``words.txt`` lines look like "<word> <p...|n...>" and ``conj.txt`` lines
    look like "<adj1> <adj2> ..."; confirm against the actual data files.
    """

    def genSets(self):
        """Read seed polarity words from ``words.txt``.

        Returns:
            tuple[set, set]: ``(positive, negative)`` word sets. A word goes
            into ``positive`` when its second field starts with ``'p'`` and
            into ``negative`` when it starts with ``'n'``.
        """
        positive = set()
        negative = set()
        # 'r' (not the original 'r+'): this method only reads; `with`
        # guarantees the handle is closed (the original leaked it).
        with open('words.txt', 'r') as f:
            for pair in f:
                current = pair.split(' ')
                # Only the first letter of the polarity token is inspected.
                if current[1][0] == 'p':
                    positive.add(current[0])
                elif current[1][0] == 'n':
                    negative.add(current[0])
        # The original built these sets and silently dropped them; returning
        # them is backward-compatible (callers that ignored None still work).
        return positive, negative

    def process(self):
        """Run the full clustering pipeline (prints diagnostics to stdout)."""

        def normalize_word(word):
            # Stem with the English Snowball stemmer.
            return SnowballStemmer("english").stem(word)

        def vectorspaced(title, CS, DF):
            # Binary membership vector: for each vocabulary word in DF, 1 if
            # it co-occurs with `title` in a conjunction, else 0.
            title_components = CS[title][1]
            return numpy.array(
                [word in title_components for word in DF], numpy.short)

        def word_feats(words):
            # Bag-of-words feature dict for an NLTK classifier.
            return dict((word, True) for word in words)

        def genSets():
            # Seed polarity sets from 'words.txt'; same contract as the
            # method-level genSets above.
            positive = set()
            negative = set()
            with open('words.txt', 'r') as f:
                for pair in f:
                    current = pair.split(' ')
                    if current[1][0] == 'p':
                        positive.add(current[0])
                    elif current[1][0] == 'n':
                        negative.add(current[0])
            return positive, negative

        def getConj():
            # Map: adjective -> (index, set of adjectives it is joined with
            # by a conjunction in 'conj.txt').
            d = dict()
            i = 0
            with open('conj.txt', 'r') as f:
                for line in f:
                    current = line.split(' ')
                    # Add the first adjective of the pair.
                    if current[0] in d:
                        d[current[0]][1].add(current[1])
                    else:
                        d[current[0]] = (i, set([current[1]]))
                        i = i + 1
                    # Add the second adjective of the pair.
                    if current[1] in d:
                        d[current[1]][1].add(current[0])
                    else:
                        d[current[1]] = (i, set([current[0]]))
                        i = i + 1
            return d

        # Get the data: movie-review file ids for later experiments.
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])

        # Generate positive and negative initial sets, balanced to equal size.
        # random.sample requires a sequence (sampling from a set raises
        # TypeError on Python 3.11+), so sort the sets first; sorting also
        # keeps the sampling population order deterministic.
        sets = genSets()
        sample_size = min(len(sets[0]), len(sets[1]))
        positive = random.sample(sorted(sets[0]), sample_size)
        negative = random.sample(sorted(sets[1]), sample_size)
        print(len(positive))
        print(len(negative))

        # Clustering setup.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # adjective -> (index, set of conjunction-linked adjectives)
        conjSet = getConj()
        print(conjSet)

        # Vocabulary: every adjective seen in a conjunction pair.
        defSet = list(conjSet.keys())

        # It's cluster time: K-means with two clusters.
        cluster = KMeansClusterer(2, euclidean_distance)
        print(conjSet["young"])
        z = vectorspaced("young", conjSet, defSet)
        for num in z:
            if num == 1:
                print("one")

        # KMeansClusterer.cluster expects a LIST of vectors; the original
        # passed single vectors in separate calls, which mis-trains the
        # clusterer. Cluster every adjective's vector in one call.
        cluster.cluster(
            [vectorspaced(title, conjSet, defSet) for title in defSet if title])
        classified_examples = [
            cluster.classify(vectorspaced(title, conjSet, defSet))
            for title in defSet
        ]
        print(classified_examples)

        # TODO(review): the original carried commented-out NaiveBayes
        # bag-of-words experiments here (word_feats over movie_reviews);
        # resurrect from history if that direction is pursued.
cblexicon().process() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters