Permalink
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BigData/cblexicon.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
119 lines (95 sloc)
3.85 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import nltk | |
from nltk.corpus import wordnet as wn | |
from collections import Counter | |
import numpy | |
from nltk.corpus import movie_reviews | |
import nltk.stem | |
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance | |
from nltk.classify import NaiveBayesClassifier | |
import random | |
from nltk.stem import * | |
from sets import Set | |
class cblexicon:
    """Conjunction-based lexicon experiment.

    Reads a seed list of labelled words (``words.txt``) and a list of
    adjective pairs (``conj.txt``), encodes every adjective as a binary
    vector of the adjectives it is paired with, and splits the adjectives
    into two groups with k-means clustering.
    """

    @staticmethod
    def _normalize_word(word):
        """Return the English Snowball stem of *word*."""
        return SnowballStemmer("english").stem(word)

    @staticmethod
    def _vectorspaced(title, CS, DF):
        """Encode *title* as a 0/1 vector over the vocabulary *DF*.

        Args:
            title: the word to encode; must be a key of *CS*.
            CS: mapping ``word -> (index, set_of_paired_words)``.
            DF: ordered sequence of vocabulary words; it fixes the layout
                of the returned vector.

        Returns:
            A ``numpy`` array of dtype ``short`` whose k-th entry is 1 when
            ``DF[k]`` is paired with *title* and 0 otherwise.

        Raises:
            KeyError: if *title* is not a key of *CS*.
        """
        title_components = CS[title][1]
        return numpy.array(
            [word in title_components for word in DF], numpy.short)

    @staticmethod
    def _word_feats(words):
        """Return a bag-of-words feature dict mapping each word to True."""
        return {word: True for word in words}

    @staticmethod
    def _gen_sets(lines):
        """Split labelled seed words into a positive and a negative set.

        Args:
            lines: iterable of text lines of the form ``"<word> <label>"``.
                A label starting with ``p`` marks a positive word, a label
                starting with ``n`` a negative one; any other label is
                ignored.

        Returns:
            Tuple ``(positive, negative)`` of sets of words.
        """
        positive = set()
        negative = set()
        for pair in lines:
            # split() (not split(' ')) tolerates tabs, repeated spaces and
            # the trailing newline.
            current = pair.split()
            if len(current) < 2:
                # Blank or malformed line: nothing to label.
                continue
            label = current[1][0]
            if label == 'p':
                positive.add(current[0])
            elif label == 'n':
                negative.add(current[0])
        return positive, negative

    @staticmethod
    def _get_conj(lines):
        """Build the symmetric pairing table from adjective-pair lines.

        Args:
            lines: iterable of text lines holding two whitespace-separated
                adjectives each; extra tokens on a line are ignored.

        Returns:
            Dict ``adjective -> (index, partners)`` where ``index`` is the
            order in which the adjective was first seen (starting at 0) and
            ``partners`` is the set of adjectives it was paired with.
        """
        d = {}
        for line in lines:
            # Whitespace split strips the newline, so "old\n" and "old"
            # are no longer treated as two different adjectives.
            current = line.split()
            if len(current) < 2:
                continue
            first, second = current[0], current[1]
            # Register the pair in both directions.
            for word, partner in ((first, second), (second, first)):
                if word in d:
                    d[word][1].add(partner)
                else:
                    # len(d) is the next free index: it grows by exactly
                    # one for every newly seen adjective.
                    d[word] = (len(d), {partner})
        return d

    def process(self):
        """Run the experiment end to end, printing intermediate results.

        Reads ``words.txt`` and ``conj.txt`` from the current working
        directory and accesses the NLTK ``movie_reviews`` and ``stopwords``
        corpora.  Prints the sizes of the balanced seed samples, the pairing
        table and the cluster id assigned to every adjective.
        """
        # Get the data.
        # NOTE: the train/test split and the stop-word list are not consumed
        # yet; they are kept for the planned bag-of-words step below.
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        training = set(negids[:500] + posids[:500])
        testing = set(negids[500:] + posids[500:])

        # Generate positive and negative initial sets of equal size.
        with open('words.txt', 'r') as seed_file:
            sets = self._gen_sets(seed_file)
        sample_size = min(len(sets[0]), len(sets[1]))
        # random.sample() needs a sequence; sampling straight from a set is
        # rejected by newer Python versions.
        positive = random.sample(sorted(sets[0]), sample_size)
        negative = random.sample(sorted(sets[1]), sample_size)
        print(len(positive))
        print(len(negative))

        # Clustering setup.
        stopwords = set(nltk.corpus.stopwords.words('english'))

        # Create dictionary (adj, (index, {associated words})).
        with open('conj.txt', 'r') as conj_file:
            conjSet = self._get_conj(conj_file)
        print(conjSet)

        # Fixed-order list of all adjectives; defines the vector layout.
        defSet = list(conjSet)

        cluster = KMeansClusterer(2, euclidean_distance)

        # Debug output for one sample adjective, when it is present.
        if "young" in conjSet:
            print(conjSet["young"])
            for num in self._vectorspaced("young", conjSet, defSet):
                if num == 1:
                    print("one")

        # The clusterer must be trained on the list of ALL vectors; feeding
        # it a single vector makes it cluster that vector's scalar entries.
        vectors = [
            self._vectorspaced(title, conjSet, defSet) for title in defSet
        ]
        cluster.cluster(vectors)
        classified_examples = [cluster.classify(vector) for vector in vectors]
        print(classified_examples)

        # Can we classify and then run bag of words?
        #negfeats = [(self._word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        #posfeats = [(self._word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        #trainfeats = [({word : True},"pos") for word in positive] + [({word : True},"neg") for word in negative]
        #testfeats = negfeats[500:] + posfeats[500:]
        #classifier1 = NaiveBayesClassifier.train(trainfeats)
        #print 'accuracy:', nltk.classify.util.accuracy(classifier1,({"Bad": True},"neg"))
# Script entry point: create the lexicon object and run the whole pipeline.
# This executes unconditionally whenever the module is run or imported.
_lexicon = cblexicon()
_lexicon.process()