# NOTE: the lines below are GitHub web-page text captured along with the file;
# they are not part of the program and are kept only as comments.
# Skip to content
# Permalink
# 66918f812e
# Switch branches/tags
#
# Name already in use
#
# A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
# Go to file
# 1 contributor
#
# Users who have contributed to this file
#
# 119 lines (95 sloc) 3.85 KB
import math
import nltk
from nltk.corpus import wordnet as wn
from collections import Counter
import numpy
from nltk.corpus import movie_reviews
import nltk.stem
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
from nltk.classify import NaiveBayesClassifier
import random
from nltk.stem import *
from sets import Set
class cblexicon(object):
    """Prototype of a conjunction-based sentiment lexicon builder.

    The pipeline reads a seed list of polarity-tagged words and a list of
    adjective pairs, turns every adjective into a binary co-occurrence
    vector over the adjective vocabulary, and splits the adjectives into
    two groups with NLTK's k-means clusterer.  Results are printed; nothing
    is returned.
    """

    @staticmethod
    def _normalize_word(word):
        """Return the English Snowball stem of ``word``."""
        return SnowballStemmer("english").stem(word)

    @staticmethod
    def _vectorspaced(word, links, vocabulary):
        """Encode ``word`` as a 0/1 vector over ``vocabulary``.

        ``links`` maps each adjective to ``(index, partners)``; position
        ``k`` of the result is 1 when ``vocabulary[k]`` is one of the
        partners of ``word`` and 0 otherwise.

        Raises:
            KeyError: if ``word`` is not a key of ``links``.
        """
        partners = links[word][1]
        return numpy.array(
            [candidate in partners for candidate in vocabulary], numpy.short)

    @staticmethod
    def _word_feats(words):
        """Return a bag-of-words feature dict mapping each word to True."""
        return {word: True for word in words}

    @staticmethod
    def _gen_sets(path='words.txt'):
        """Read the seed lexicon and return ``(positive, negative)`` sets.

        Each usable line holds a word followed by a tag; a tag starting
        with ``p`` marks the word positive and one starting with ``n``
        marks it negative.  Any other tag is ignored.
        """
        positive, negative = set(), set()
        # A read-only handle is enough, and ``with`` closes it on exit.
        with open(path) as handle:
            for line in handle:
                # split() without arguments drops the newline and tolerates
                # repeated blanks; lines with fewer than two fields are
                # skipped rather than crashing.
                fields = line.split()
                if len(fields) < 2:
                    continue
                word, tag = fields[0], fields[1]
                if tag.startswith('p'):
                    positive.add(word)
                elif tag.startswith('n'):
                    negative.add(word)
        return positive, negative

    @staticmethod
    def _get_conj(path='conj.txt'):
        """Read adjective pairs and return ``{adj: (index, partners)}``.

        The first two fields of every line are treated as a pair of
        adjectives.  Each adjective receives a running index in
        order of first appearance, and each member of a pair is recorded
        in the other's partner set.
        """
        links = {}
        with open(path) as handle:
            for line in handle:
                # split() strips the trailing newline, so the second
                # adjective is no longer stored as e.g. "old\n".
                fields = line.split()
                if len(fields) < 2:
                    continue
                first, second = fields[0], fields[1]
                for adjective, partner in ((first, second), (second, first)):
                    if adjective in links:
                        links[adjective][1].add(partner)
                    else:
                        # len(links) equals the count of adjectives seen so
                        # far, i.e. the next free index.
                        links[adjective] = (len(links), set([partner]))
        return links

    def process(self, words_path='words.txt', conj_path='conj.txt'):
        """Run the clustering experiment and print its intermediate results.

        Args:
            words_path: seed lexicon file (word and polarity tag per line).
            conj_path: adjective-pair file (two adjectives per line).

        Raises:
            ValueError: if fewer than two adjectives are available, since
                two clusters cannot be formed.
        """
        # Balanced positive/negative seed samples.  The sets are sorted
        # first because random.sample needs a sequence on current Python.
        seeds_pos, seeds_neg = self._gen_sets(words_path)
        sample_size = min(len(seeds_pos), len(seeds_neg))
        positive = random.sample(sorted(seeds_pos), sample_size)
        negative = random.sample(sorted(seeds_neg), sample_size)
        print(len(positive))
        print(len(negative))

        # Dictionary of (adj, (index, {associated words})).
        conj_set = self._get_conj(conj_path)
        print(conj_set)

        # Vocabulary ordered by the stored index so that vector dimensions
        # and the printed classification are reproducible between runs.
        vocabulary = sorted(conj_set, key=lambda adj: conj_set[adj][0])

        # Debug probe kept from the original experiment; skipped when the
        # probe word does not occur in the pair file.
        probe = "young"
        if probe in conj_set:
            print(conj_set[probe])
            for flag in self._vectorspaced(probe, conj_set, vocabulary):
                if flag == 1:
                    print("one")

        vectors = [
            self._vectorspaced(adj, conj_set, vocabulary)
            for adj in vocabulary
        ]
        if len(vectors) < 2:
            raise ValueError(
                "need at least two adjectives to form two clusters")

        # The clusterer must be trained on the whole list of vectors;
        # handing it one vector at a time makes it treat the individual
        # components as the data points.
        cluster = KMeansClusterer(2, euclidean_distance)
        cluster.cluster(vectors)
        classified_examples = [cluster.classify(vector) for vector in vectors]
        print(classified_examples)

        # TODO: planned follow-up - train a NaiveBayesClassifier on the
        # seed words ({word: True} features labelled pos/neg) and evaluate
        # it on bag-of-words features of the movie_reviews corpus.
if __name__ == "__main__":
    # Run the experiment only when this file is executed as a script, so
    # that importing the module does not trigger file and corpus access.
    cblexicon().process()