Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
CSE3504-Project-2/pagerank.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
109 lines (87 sloc)
3.62 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#CSE 3504 Project 2
#Siena Biales, Vincent Chov
#Google PageRank Algorithm Implementation
import csv
import operator

import numpy as np
# Parse hollins.dat. As read below, the file is laid out as:
#   line 1            -> "V E"   (number of nodes, number of edges)
#   next V lines      -> "index url"
#   following E lines -> "source destination"
with open("hollins.dat", "r") as data:
    reader = csv.reader(data, delimiter=' ', skipinitialspace=True)
    cols = next(reader)
    # Extract the number of nodes (V) and edges (E) from the first line of the file
    V = int(cols[0])
    E = int(cols[1])

    # index -> url
    urls = {}
    # source node (i) -> list of all destination nodes (j) it links to
    destinations = {}
    # reverse mapping: destination node -> list of source pages which link to it
    sources = {}

    # The next V lines each describe one node.
    for _ in range(V):
        line = next(reader)
        urls[int(line[0])] = line[1]

    # The edge section has E lines, not V. Looping V times (as the code did
    # before) reads the wrong number of edges whenever E != V.
    for _ in range(E):
        line = next(reader)
        src = int(line[0])
        dst = int(line[1])
        # setdefault creates the empty list the first time a key is seen
        destinations.setdefault(src, []).append(dst)
        sources.setdefault(dst, []).append(src)
# Initial state vector p(0): one entry per node, every entry equal to 1/V.
initialVector = [1/V for _ in range(V)]
# Build the V x V link matrix P. P[i][j] is the weight of the link from
# page j to page i: 1/(number of links leaving j) when j links to i, else 0.
# This is the form the rank update needs, since it sums P[i][j] * rank[j]
# over j.
#
# The shape must be passed to np.zeros as ONE tuple; np.zeros(V, V) hands the
# second V over as the dtype argument and is rejected.
P = np.zeros((V, V))

# Walk the adjacency lists rather than testing all V*V (i, j) pairs. Pages
# with no outgoing links never appear as keys of `destinations`, so their
# columns simply stay zero (indexing destinations[j] directly would raise
# KeyError for them).
# NOTE(review): indices outside 0..V-1 are skipped, matching the range the
# original loops covered. If node ids in hollins.dat start at 1, id V never
# gets a row/column here -- confirm the indexing convention.
for j, targets in destinations.items():
    if not targets or not 0 <= j < V:
        continue
    # Weight depends on the SOURCE page j (its out-degree), not on i.
    weight = 1/len(targets)
    for i in targets:
        if 0 <= i < V:
            P[i][j] = weight
#Make vector (list) N to store all n_j values
#N = []
#for j in range(0,V) :
#    if j in destinations : #need to check if it's in the dict
#        N.append(len(destinations[j]))
#    else :
#        N.append(0)
#Time to rank the pages!
#PR(V, initialVector, N, damp)
def PageRank(verts, initVec, outgoing, damp, matrix=None, tol=1e-10,
             max_iter=10000):
    """Repeat the damped PageRank update until the rank vector settles.

    Every step computes, for each page i,

        next[i] = (1 - damp) + damp * sum_j(matrix[i][j] * current[j])

    which is the update written in this file's earlier (commented-out)
    per-page loop, expressed as a matrix-vector product.

    Parameters:
        verts: number of pages. Only the first `verts` entries of `initVec`
            and the leading verts x verts part of the matrix are used.
        initVec: sequence of starting ranks, p(0). It is copied, never
            modified.
        outgoing: per-page outgoing-link counts. Accepted so existing calls
            keep working; it is not read because the matrix entries already
            carry the 1/n_j weights.
        damp: damping factor applied to the link contribution.
        matrix: link matrix to use; when None the module-level `P` is used.
        tol: iteration stops once no entry changes by more than this amount
            between two consecutive steps.
        max_iter: upper bound on the number of update steps.

    Returns:
        The rank vector as a list of floats. If `max_iter` steps pass without
        meeting `tol`, the most recent vector is returned (and "success!" is
        not printed). An empty list is returned when `verts` is not positive.
    """
    if verts <= 0:
        return []

    link_matrix = np.asarray(P if matrix is None else matrix, dtype=float)
    link_matrix = link_matrix[:verts, :verts]
    # np.array copies, so the caller's initVec is left untouched.
    current = np.array(initVec[:verts], dtype=float)

    # Iterate in a loop instead of recursing: the number of steps needed can
    # exceed Python's recursion limit, and the recursive result was dropped.
    for _ in range(max_iter):
        nextVector = (1 - damp) + damp * link_matrix.dot(current)
        print(nextVector)
        # Floating-point vectors practically never become bit-for-bit equal,
        # so compare against a tolerance rather than with !=.
        if np.max(np.abs(nextVector - current)) <= tol:
            print("success!")
            return nextVector.tolist()
        current = nextVector

    return current.tolist()
# N[j] is the number of outgoing links of page j (0 when j links nowhere).
# PageRank takes it as its third argument, but N was never assigned by any
# live code, so the call below failed with NameError.
N = [len(destinations.get(j, ())) for j in range(V)]
finalRank = PageRank(V, initialVector, N, .85)

# Now we need to create a printable list.
# First make a dictionary of index : rank from finalRank. Index 0 is left
# out, exactly as the original loop did.
# NOTE(review): presumably because node ids start at 1 -- confirm.
finalRankDict = {i: finalRank[i] for i in range(1, len(finalRank))}

# Sort by rank, in descending order; the result is a list of (index, rank) tuples
sortedRanks = sorted(finalRankDict.items(), key=operator.itemgetter(1), reverse=True)
#Need to print these in order by searching through the urls dictionary