From bcc02ca2ef0733f3bd5b6492013472e247c04a17 Mon Sep 17 00:00:00 2001 From: sib12004 Date: Wed, 20 Apr 2016 19:51:23 -0400 Subject: [PATCH] Created a new version which supposedly works but takes 5EVER so we don't know for sure yet --- pagerank.py | 58 ++++++++++++++++++------------------------ pagerank2.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++ test.dat | 11 ++++++++ 3 files changed, 107 insertions(+), 33 deletions(-) create mode 100644 pagerank2.py create mode 100644 test.dat diff --git a/pagerank.py b/pagerank.py index 9b7636f..68e960a 100644 --- a/pagerank.py +++ b/pagerank.py @@ -18,7 +18,7 @@ with open("hollins.dat", "r") as data: #create a dictionary of the index : url urls = {} - #create a dictionary of the source nodes (j) : all destination nodes (i) + #create a dictionary of the source nodes (i) : all destination nodes (j) destinations = {} #also create the reverse dict, with destinations : source pages which link to it @@ -44,28 +44,14 @@ with open("hollins.dat", "r") as data: for n in range(0,V) : initialVector.append(1/V) #initialize the vector - #initialize a dict of tuples as our matrix P - #P = {} - #for i in range(0,V) : - # for j in range(0,V) : - # P[(i,j)] = 0 - - #iterate through and populate the matrix P - #for j in range(0,V) : - # for i in range(0,V) : - # if i in destinations[j] : #if i is in the list of destinations - # P[(i,j)] = 1/N[j] #add 1/n_j to the matrix at (i,j) - #Initialize an array/matrix P P = np.zeros(V,V) #populate the matrix - for j in range(0,V) : - for i in range(0,V) : + for i in range(0,V) : + for j in range(0,V) : if i in destinations[j] : - P[j][i] = 1/len(destinations[j]) - - P = P*damp + (1-damp) #modify P w/ dampening factor + P[i][j] = 1/len(destinations[i]) #Make vector (list) N to store all n_j values #N = [] @@ -78,23 +64,29 @@ with open("hollins.dat", "r") as data: #Time to rank the pages! #PR(V, initialVector, N, damp) def PageRank(verts, initVec, outgoing, damp) : + + #P = P*damp + (1-damp) #modify P w/ dampening factor - nextVector = initVec - for a in range(0,verts) : + for i in range(0, verts) : total = 0 - if a in sources : #first check if a exists - links = sources[a] #links contains list of pages linking to page a - else : - continue #if not in sources, move on - for j in links : - try: - if ((outgoing[j] !=0) & (initVec[j] !=0)) : - total += initVec[j]/outgoing[j] #add page rank/outbound links - except IndexError: - print("error on j = ", j) - break - - nextVector[a] = ((1-damp) + damp*total) + for j in range(0, verts) : + initVec[j]*P[i][j] + + #for a in range(0,verts) : + # total = 0 + # if a in sources : #first check if a exists + # links = sources[a] #links contains list of pages linking to page a + # else : + # continue #if not in sources, move on + # for j in links : + # try: + # if ((outgoing[j] !=0) & (initVec[j] !=0)) : + # total += initVec[j]/outgoing[j] #add page rank/outbound links + # except IndexError: + # print("error on j = ", j) + # break + # + # nextVector[a] = ((1-damp) + damp*total) print(nextVector) if (initVec != nextVector) : diff --git a/pagerank2.py b/pagerank2.py new file mode 100644 index 0000000..4341df3 --- /dev/null +++ b/pagerank2.py @@ -0,0 +1,71 @@ +#CSE 3504 Project 2 +#Siena Biales, Vincent Chov +#Google PageRank Algorithm Implementation + +import csv +import operator +import numpy as np + +with open("hollins.dat", "r") as data: + reader = csv.reader(data, delimiter = ' ', skipinitialspace=True) + + cols = next(reader) + + #Extract the number of nodes (V) and edges (E) from the first line of the file + V = int(cols[0]) + E = int(cols[1]) + + #create a dictionary of the index : url + urls = {} + + #create a dictionary of the source nodes (i) : all destination nodes (j) + outgoing = [[] for i in range(V)] + + #add every node to the dictionary + for n in range(0,V) : + line = next(reader) #read the next line from the file + index = int(line[0]) #cast the index to an integer + urls[index] = line[1] #add the data to the dictionary + + for n in range(0, E) : + line = next(reader) #read the next line from the file + src = int(line[0]) + dst = int(line[1]) + #if the key has no value, set the value to an empty list + #then append the destination node to the list + outgoing[src-1].append(dst-1) + + #create initial state vector p(0) + initialVector = [] + for n in range(0,V) : + initialVector.append(1/V) #initialize the vector + + + #Initialize an array/matrix P + P = np.zeros((V,V)) + + #populate the matrix + for i in range(0,V) : + for j in range(0,V) : + if i in outgoing[j] : + P[i][j] = 1/len(outgoing[j]) + +#PR(P, initialVector, damp) +def PageRank(trans, initVec, damp, n=0) : + + nextVector = [] + + for i in range(len(initVec)) : + total = 0 + for j in range(len(initVec)) : + total += initVec[j]*trans[i][j] + nextVector.append((1-damp) + damp*total) + + if (initVec != nextVector and n<100) : + return PageRank(trans, nextVector, damp, n+1) + else : + print("success!") + return nextVector + +finalRank = PageRank(P, initialVector, .85) +print(finalRank) diff --git a/test.dat b/test.dat new file mode 100644 index 0000000..dcd9c0f --- /dev/null +++ b/test.dat @@ -0,0 +1,11 @@ +4 6 +1 blah +2 blahhh +3 blerhghgh +4 help +1 2 +1 3 +2 1 +2 3 +2 4 +4 3 \ No newline at end of file