pagerank.py

#CSE 3504 Project 2
#Siena Biales, Vincent Chov
#Google PageRank Algorithm Implementation

import csv
import operator
import numpy as np

with open("hollins.dat", "r") as data:
    reader = csv.reader(data, delimiter = ' ', skipinitialspace=True)

    cols = next(reader)

    #Extract the number of nodes (V) and edges (E) from the first line of the file
    V = int(cols[0])
    E = int(cols[1])

    #create a dictionary of the index : url
    urls = {}

    #create a dictionary of the source nodes (i) : all destination nodes (j)
    destinations = {}

    #also create the reverse dict, with destinations : source pages which link to it
    sources = {}

    #add every node to the dictionary
    for n in range(0,V) :
        line = next(reader)             #read the next line from the file
        index = int(line[0])            #cast the index to an integer
        urls[index] = line[1]           #add the data to the dictionary

    for n in range(0, V) :
        line = next(reader)             #read the next line from the file
        src = int(line[0])
        dst = int(line[1])
        #if the key has no value, set the value to an empty list
        #then append the destination node to the list
        destinations.setdefault(src, []).append(dst)
        sources.setdefault(dst, []).append(src)

    #create initial state vector p(0)
    initialVector = []
    for n in range(0,V) :
        initialVector.append(1/V)          #initialize the vector

    #Initialize an array/matrix P
    P = np.zeros(V,V)

    #populate the matrix
    for i in range(0,V) :
        for j in range(0,V) :
            if i in destinations[j] :
                P[i][j] = 1/len(destinations[i])

    #Make vector (list) N to store all n_j values
    #N = []
    #for j in range(0,V) :
    #    if j in destinations :                   #need to check if it's in the dict
    #        N.append(len(destinations[j]))
    #    else :
    #        N.append(0)

#Time to rank the pages!
#PR(V, initialVector, N, damp)
def PageRank(verts, initVec, outgoing, damp) :

    #P = P*damp + (1-damp)                       #modify P w/ dampening factor

    for i in range(0, verts) :
        total = 0
        for j in range(0, verts) :
            initVec[j]*P[i][j]

    #for a in range(0,verts) :
    #    total = 0
    #    if a in sources :                       #first check if a exists
    #        links = sources[a]                  #links contains list of pages linking to page a
    #    else :
    #        continue                            #if not in sources, move on
    #    for j in links :
    #        try:
    #            if ((outgoing[j] !=0) & (initVec[j] !=0)) :
    #                total += initVec[j]/outgoing[j] #add page rank/outbound links
    #        except IndexError:
    #            print("error on j = ", j)
    #            break
    #
    #        nextVector[a] = ((1-damp) + damp*total)
    print(nextVector)

    if (initVec != nextVector) :
        PageRank(verts, nextVector, outgoing, damp)
    else :
        print("success!")
        return nextVector

finalRank = PageRank(V, initialVector, N, .85)
finalRankDict = {}

#Now we need to create a printable list
#First we make finalRank a dictionary of index : rank
for i in range(1,len(finalRank)) :
    finalRankDict[i] = finalRank[i]

#now sort these by rank, in descending order; format is a list of (index, rank) tuples
sortedRanks = sorted(finalRankDict.items(), key=operator.itemgetter(1), reverse=True)

#Need to print these in order by searching through the urls dictionary
	#CSE 3504 Project 2
	#Siena Biales, Vincent Chov
	#Google PageRank Algorithm Implementation

	import csv
	import operator
	import numpy as np

	with open("hollins.dat", "r") as data:
	reader = csv.reader(data, delimiter = ' ', skipinitialspace=True)

	cols = next(reader)

	#Extract the number of nodes (V) and edges (E) from the first line of the file
	V = int(cols[0])
	E = int(cols[1])

	#create a dictionary of the index : url
	urls = {}

	#create a dictionary of the source nodes (i) : all destination nodes (j)
	destinations = {}

	#also create the reverse dict, with destinations : source pages which link to it
	sources = {}

	#add every node to the dictionary
	for n in range(0,V) :
	line = next(reader) #read the next line from the file
	index = int(line[0]) #cast the index to an integer
	urls[index] = line[1] #add the data to the dictionary

	for n in range(0, V) :
	line = next(reader) #read the next line from the file
	src = int(line[0])
	dst = int(line[1])
	#if the key has no value, set the value to an empty list
	#then append the destination node to the list
	destinations.setdefault(src, []).append(dst)
	sources.setdefault(dst, []).append(src)

	#create initial state vector p(0)
	initialVector = []
	for n in range(0,V) :
	initialVector.append(1/V) #initialize the vector

	#Initialize an array/matrix P
	P = np.zeros(V,V)

	#populate the matrix
	for i in range(0,V) :
	for j in range(0,V) :
	if i in destinations[j] :
	P[i][j] = 1/len(destinations[i])

	#Make vector (list) N to store all n_j values
	#N = []
	#for j in range(0,V) :
	# if j in destinations : #need to check if it's in the dict
	# N.append(len(destinations[j]))
	# else :
	# N.append(0)

	#Time to rank the pages!
	#PR(V, initialVector, N, damp)
	def PageRank(verts, initVec, outgoing, damp) :

	#P = P*damp + (1-damp) #modify P w/ dampening factor

	for i in range(0, verts) :
	total = 0
	for j in range(0, verts) :
	initVec[j]*P[i][j]

	#for a in range(0,verts) :
	# total = 0
	# if a in sources : #first check if a exists
	# links = sources[a] #links contains list of pages linking to page a
	# else :
	# continue #if not in sources, move on
	# for j in links :
	# try:
	# if ((outgoing[j] !=0) & (initVec[j] !=0)) :
	# total += initVec[j]/outgoing[j] #add page rank/outbound links
	# except IndexError:
	# print("error on j = ", j)
	# break
	#
	# nextVector[a] = ((1-damp) + damp*total)
	print(nextVector)

	if (initVec != nextVector) :
	PageRank(verts, nextVector, outgoing, damp)
	else :
	print("success!")
	return nextVector

	finalRank = PageRank(V, initialVector, N, .85)
	finalRankDict = {}

	#Now we need to create a printable list
	#First we make finalRank a dictionary of index : rank
	for i in range(1,len(finalRank)) :
	finalRankDict[i] = finalRank[i]

	#now sort these by rank, in descending order; format is a list of (index, rank) tuples
	sortedRanks = sorted(finalRankDict.items(), key=operator.itemgetter(1), reverse=True)

	#Need to print these in order by searching through the urls dictionary