Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#CSE 3504 Project 2
#Siena Biales, Vincent Chov
#Google PageRank Algorithm Implementation
import csv
import operator
import numpy as np
with open("hollins.dat", "r") as data:
reader = csv.reader(data, delimiter = ' ', skipinitialspace=True)
cols = next(reader)
#Extract the number of nodes (V) and edges (E) from the first line of the file
V = int(cols[0])
E = int(cols[1])
#create a dictionary of the index : url
urls = {}
#create a dictionary of the source nodes (i) : all destination nodes (j)
destinations = {}
#also create the reverse dict, with destinations : source pages which link to it
sources = {}
#add every node to the dictionary
for n in range(0,V) :
line = next(reader) #read the next line from the file
index = int(line[0]) #cast the index to an integer
urls[index] = line[1] #add the data to the dictionary
for n in range(0, V) :
line = next(reader) #read the next line from the file
src = int(line[0])
dst = int(line[1])
#if the key has no value, set the value to an empty list
#then append the destination node to the list
destinations.setdefault(src, []).append(dst)
sources.setdefault(dst, []).append(src)
#create initial state vector p(0)
initialVector = []
for n in range(0,V) :
initialVector.append(1/V) #initialize the vector
#Initialize an array/matrix P
P = np.zeros(V,V)
#populate the matrix
for i in range(0,V) :
for j in range(0,V) :
if i in destinations[j] :
P[i][j] = 1/len(destinations[i])
#Make vector (list) N to store all n_j values
#N = []
#for j in range(0,V) :
# if j in destinations : #need to check if it's in the dict
# N.append(len(destinations[j]))
# else :
# N.append(0)
#Time to rank the pages!
#PR(V, initialVector, N, damp)
def PageRank(verts, initVec, outgoing, damp) :
#P = P*damp + (1-damp) #modify P w/ dampening factor
for i in range(0, verts) :
total = 0
for j in range(0, verts) :
initVec[j]*P[i][j]
#for a in range(0,verts) :
# total = 0
# if a in sources : #first check if a exists
# links = sources[a] #links contains list of pages linking to page a
# else :
# continue #if not in sources, move on
# for j in links :
# try:
# if ((outgoing[j] !=0) & (initVec[j] !=0)) :
# total += initVec[j]/outgoing[j] #add page rank/outbound links
# except IndexError:
# print("error on j = ", j)
# break
#
# nextVector[a] = ((1-damp) + damp*total)
print(nextVector)
if (initVec != nextVector) :
PageRank(verts, nextVector, outgoing, damp)
else :
print("success!")
return nextVector
finalRank = PageRank(V, initialVector, N, .85)
finalRankDict = {}
#Now we need to create a printable list
#First we make finalRank a dictionary of index : rank
for i in range(1,len(finalRank)) :
finalRankDict[i] = finalRank[i]
#now sort these by rank, in descending order; format is a list of (index, rank) tuples
sortedRanks = sorted(finalRankDict.items(), key=operator.itemgetter(1), reverse=True)
#Need to print these in order by searching through the urls dictionary