Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Created a new version which supposedly works but takes 5EVER so we do…
…n't know for sure yet
  • Loading branch information
sib12004 committed Apr 20, 2016
1 parent 2df6bab commit bcc02ca
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 33 deletions.
58 changes: 25 additions & 33 deletions pagerank.py
Expand Up @@ -18,7 +18,7 @@ with open("hollins.dat", "r") as data:
#create a dictionary of the index : url
urls = {}

#create a dictionary of the source nodes (j) : all destination nodes (i)
#create a dictionary of the source nodes (i) : all destination nodes (j)
destinations = {}

#also create the reverse dict, with destinations : source pages which link to it
Expand All @@ -44,28 +44,14 @@ with open("hollins.dat", "r") as data:
for n in range(0,V) :
initialVector.append(1/V) #initialize the vector

#initialize a dict of tuples as our matrix P
#P = {}
#for i in range(0,V) :
# for j in range(0,V) :
# P[(i,j)] = 0

#iterate through and populate the matrix P
#for j in range(0,V) :
# for i in range(0,V) :
# if i in destinations[j] : #if i is in the list of destinations
# P[(i,j)] = 1/N[j] #add 1/n_j to the matrix at (i,j)

#Initialize an array/matrix P
P = np.zeros(V,V)

#populate the matrix
for j in range(0,V) :
for i in range(0,V) :
for i in range(0,V) :
for j in range(0,V) :
if i in destinations[j] :
P[j][i] = 1/len(destinations[j])

P = P*damp + (1-damp) #modify P w/ dampening factor
P[i][j] = 1/len(destinations[i])

#Make vector (list) N to store all n_j values
#N = []
Expand All @@ -78,23 +64,29 @@ with open("hollins.dat", "r") as data:
#Time to rank the pages!
#PR(V, initialVector, N, damp)
def PageRank(verts, initVec, outgoing, damp) :

#P = P*damp + (1-damp) #modify P w/ dampening factor

nextVector = initVec
for a in range(0,verts) :
for i in range(0, verts) :
total = 0
if a in sources : #first check if a exists
links = sources[a] #links contains list of pages linking to page a
else :
continue #if not in sources, move on
for j in links :
try:
if ((outgoing[j] !=0) & (initVec[j] !=0)) :
total += initVec[j]/outgoing[j] #add page rank/outbound links
except IndexError:
print("error on j = ", j)
break

nextVector[a] = ((1-damp) + damp*total)
for j in range(0, verts) :
initVec[j]*P[i][j]

#for a in range(0,verts) :
# total = 0
# if a in sources : #first check if a exists
# links = sources[a] #links contains list of pages linking to page a
# else :
# continue #if not in sources, move on
# for j in links :
# try:
# if ((outgoing[j] !=0) & (initVec[j] !=0)) :
# total += initVec[j]/outgoing[j] #add page rank/outbound links
# except IndexError:
# print("error on j = ", j)
# break
#
# nextVector[a] = ((1-damp) + damp*total)
print(nextVector)

if (initVec != nextVector) :
Expand Down
71 changes: 71 additions & 0 deletions pagerank2.py
@@ -0,0 +1,71 @@
#CSE 3504 Project 2
#Siena Biales, Vincent Chov
#Google PageRank Algorithm Implementation

import csv
import operator
import numpy as np

#Load the link graph from hollins.dat and build the transition matrix P.
#Only the file reads live inside the `with`, so the file is closed before
#the (potentially large) matrix is built.
with open("hollins.dat", "r") as data:
    reader = csv.reader(data, delimiter = ' ', skipinitialspace=True)

    #Extract the number of nodes (V) and edges (E) from the first line of the file
    cols = next(reader)
    V = int(cols[0])
    E = int(cols[1])

    #dictionary of node index (as written in the file) : url
    urls = {}

    #adjacency list: outgoing[j] holds the 0-based destinations linked from node j
    outgoing = [[] for i in range(V)]

    #the next V lines each hold "<index> <url>"
    for n in range(0,V) :
        line = next(reader) #read the next line from the file
        index = int(line[0]) #cast the index to an integer
        urls[index] = line[1] #add the data to the dictionary

    #the following E lines each hold "<source> <destination>"
    for n in range(0, E) :
        line = next(reader) #read the next line from the file
        src = int(line[0])
        dst = int(line[1])
        #ids in the file are 1-based; shift to 0-based list/matrix positions
        outgoing[src-1].append(dst-1)

#create initial state vector p(0): every node starts with the same rank
#(comprehension rather than [1/V]*V so that V == 0 does not divide by zero)
initialVector = [1/V for n in range(V)]

#Initialize an array/matrix P
P = np.zeros((V,V))

#populate the matrix: P[i][j] = 1/(number of links leaving j) when j links to i.
#Walking each adjacency list touches only the E real edges instead of testing
#all V*V (i, j) pairs with a list-membership scan per pair.
for j, dsts in enumerate(outgoing) :
    if not dsts :
        continue #node j has no outgoing links; its column stays all zero
    weight = 1/len(dsts)
    for i in dsts :
        #destinations outside 0..V-1 never matched the old range(V) scan,
        #so keep ignoring them rather than indexing out of bounds
        if 0 <= i < V :
            P[i][j] = weight

#PR(P, initialVector, damp)
def PageRank(trans, initVec, damp, n=0, tol=1e-10) :
    """Repeat the damped PageRank update until the rank vector settles.

    Each pass computes, for every position i,
    ``(1 - damp) + damp * sum(trans[i][j] * current[j] for j)``.
    The ``(1 - damp)`` term is added as-is (not divided by the vector
    length), so the result is not rescaled to sum to 1.

    Parameters:
        trans   -- square matrix (NumPy array or nested sequence) indexed
                   as trans[i][j].
        initVec -- starting rank vector; a list or a NumPy array.
        damp    -- damping factor applied to the matrix-vector product.
        n       -- number of passes already counted; iteration stops once
                   this counter reaches 100 even without convergence.
        tol     -- largest per-entry change between two passes that still
                   counts as converged. Pass 0 for exact equality.

    Returns the final rank vector as a plain list of floats.
    """
    current = np.asarray(initVec, dtype=float)

    #nothing to rank: mirror the empty-list result without touching trans
    if current.size == 0 :
        print("success!")
        return []

    matrix = np.asarray(trans, dtype=float)

    #a loop instead of one recursive call per pass keeps the stack flat
    while True :
        #one matrix-vector product replaces the nested Python loops
        nextVector = (1-damp) + damp*matrix.dot(current)

        #exact float equality almost never holds between passes, so compare
        #against a tolerance instead of always running into the pass cap
        settled = bool(np.all(np.abs(nextVector - current) <= tol))
        if settled or n >= 100 :
            break

        current = nextVector
        n += 1

    #printed whenever iteration stops, whether it settled or hit the cap
    print("success!")
    return nextVector.tolist()

#rank every page starting from the uniform vector, damping factor 0.85,
#then dump the resulting rank vector
finalRank = PageRank(trans=P, initVec=initialVector, damp=0.85)
print(finalRank)
11 changes: 11 additions & 0 deletions test.dat
@@ -0,0 +1,11 @@
4 6
1 blah
2 blahhh
3 blerhghgh
4 help
1 2
1 3
2 1
2 3
2 4
4 3

0 comments on commit bcc02ca

Please sign in to comment.