Created a new version which supposedly works but takes 5EVER so we don't know for sure yet
sib12004 · Apr 20, 2016 · bcc02ca · bcc02ca
1 parent 2df6bab
commit bcc02ca
Show file tree

Hide file tree

Showing 3 changed files with 107 additions and 33 deletions.
diff --git a/pagerank.py b/pagerank.py
@@ -18,7 +18,7 @@
     #create a dictionary of the index : url
     urls = {}
 
-    #create a dictionary of the source nodes (j) : all destination nodes (i)
+    #create a dictionary of the source nodes (i) : all destination nodes (j)
     destinations = {}
 
     #also create the reverse dict, with destinations : source pages which link to it    
@@ -44,28 +44,14 @@
     for n in range(0,V) :
         initialVector.append(1/V)          #initialize the vector
 
-    #initialize a dict of tuples as our matrix P
-    #P = {}
-    #for i in range(0,V) :
-    #    for j in range(0,V) :
-    #        P[(i,j)] = 0
-
-    #iterate through and populate the matrix P
-    #for j in range(0,V) :
-    #    for i in range(0,V) :
-    #        if i in destinations[j] :           #if i is in the list of destinations
-    #            P[(i,j)] = 1/N[j]               #add 1/n_j to the matrix at (i,j)
-
     #Initialize an array/matrix P
     P = np.zeros(V,V)
 
     #populate the matrix
-    for j in range(0,V) :
-        for i in range(0,V) :
+    for i in range(0,V) :
+        for j in range(0,V) :
             if i in destinations[j] :
-                P[j][i] = 1/len(destinations[j])
-
-    P = P*damp + (1-damp)                       #modify P w/ dampening factor
+                P[i][j] = 1/len(destinations[i])
 
     #Make vector (list) N to store all n_j values
     #N = []
@@ -78,23 +64,29 @@
 #Time to rank the pages!
 #PR(V, initialVector, N, damp)
 def PageRank(verts, initVec, outgoing, damp) :
+
+    #P = P*damp + (1-damp)                       #modify P w/ dampening factor
 
-    nextVector = initVec
-    for a in range(0,verts) :
+    for i in range(0, verts) :
         total = 0
-        if a in sources :                       #first check if a exists
-            links = sources[a]                  #links contains list of pages linking to page a
-        else :
-            continue                            #if not in sources, move on
-        for j in links :
-            try:
-                if ((outgoing[j] !=0) & (initVec[j] !=0)) :
-                    total += initVec[j]/outgoing[j] #add page rank/outbound links    
-            except IndexError:
-                print("error on j = ", j)
-                break
-
-            nextVector[a] = ((1-damp) + damp*total)
+        for j in range(0, verts) :
+            initVec[j]*P[i][j]
+
+    #for a in range(0,verts) :
+    #    total = 0
+    #    if a in sources :                       #first check if a exists
+    #        links = sources[a]                  #links contains list of pages linking to page a
+    #    else :
+    #        continue                            #if not in sources, move on
+    #    for j in links :
+    #        try:
+    #            if ((outgoing[j] !=0) & (initVec[j] !=0)) :
+    #                total += initVec[j]/outgoing[j] #add page rank/outbound links    
+    #        except IndexError:
+    #            print("error on j = ", j)
+    #            break
+    #
+    #        nextVector[a] = ((1-damp) + damp*total)
     print(nextVector)
 
     if (initVec != nextVector) :

diff --git a/pagerank2.py b/pagerank2.py
@@ -0,0 +1,71 @@
+#CSE 3504 Project 2
+#Siena Biales, Vincent Chov
+#Google PageRank Algorithm Implementation
+
+import csv
+import operator
+import numpy as np
+
+#Read the graph description and build the column-stochastic link matrix P.
+#File layout: one "V E" header line, then V "index url" lines,
+#then E "source destination" lines (node numbers in the file are 1-based).
+with open("hollins.dat", "r") as data:
+    reader = csv.reader(data, delimiter = ' ', skipinitialspace=True)
+
+    #Extract the number of nodes (V) and edges (E) from the first line of the file
+    cols = next(reader)
+    V = int(cols[0])
+    E = int(cols[1])
+
+    #create a dictionary of the index : url
+    urls = {}
+    for n in range(0,V) :
+        line = next(reader)             #read the next line from the file
+        urls[int(line[0])] = line[1]    #key is the index cast to an integer
+
+    #create a list of lists: outgoing[j] holds the 0-based destination
+    #of every link that leaves node j
+    outgoing = [[] for i in range(V)]
+    for n in range(0, E) :
+        line = next(reader)             #read the next line from the file
+        src = int(line[0])
+        dst = int(line[1])
+        #a node number outside 1..V would index the wrong row (0 wraps around
+        #to the last node) or a missing one, so reject it explicitly
+        if not (1 <= src <= V and 1 <= dst <= V) :
+            raise ValueError("edge %d -> %d refers to a node outside 1..%d" % (src, dst, V))
+        outgoing[src-1].append(dst-1)
+
+#everything has been read, so the rest runs with the file already closed
+
+#create initial state vector p(0): every page starts with rank 1/V
+initialVector = [1/V for n in range(V)]
+
+#Initialize an array/matrix P
+P = np.zeros((V,V))
+
+#populate the matrix: node j spreads its rank evenly over its links, so
+#P[i][j] = 1/len(outgoing[j]) for every destination i of j.
+#Walking the link lists visits each edge once; testing every (i, j) pair
+#against outgoing[j] needed V*V list searches and gave the same matrix.
+for j, targets in enumerate(outgoing) :
+    for i in targets :
+        P[i][j] = 1/len(targets)
+
+#PR(P, initialVector, damp)
+def PageRank(trans, initVec, damp, n=0, tol=1e-6) :
+    """Run the PageRank power iteration and return the ranks as a list of floats.
+
+    Every step computes p_next = (1-damp) + damp * (trans . p) as a single
+    numpy matrix-vector product.
+
+    trans   -- V x V link matrix, V = len(initVec); trans[i][j] is the weight
+               of the link from page j to page i
+    initVec -- starting vector p(0), one entry per page
+    damp    -- damping factor (e.g. .85)
+    n       -- number of steps already taken; iteration stops once this
+               counter reaches 100
+    tol     -- largest per-entry change between two successive vectors that
+               still counts as converged
+    """
+    current = np.asarray(initVec, dtype=float)
+    if current.size == 0 :                  #no pages: nothing to iterate
+        print("success!")
+        return []
+    matrix = np.asarray(trans, dtype=float)
+
+    #loop instead of recursing: same step limit, no growing call stack
+    while True :
+        nextVector = (1-damp) + damp*matrix.dot(current)
+        #floats almost never become exactly equal, so compare with a tolerance
+        converged = np.allclose(current, nextVector, rtol=0, atol=tol)
+        if converged or n >= 100 :
+            break
+        current = nextVector
+        n += 1
+
+    #only report success when the vector actually settled
+    if converged :
+        print("success!")
+    else :
+        print("stopped after 100 iterations without converging")
+    return nextVector.tolist()
+
+#rank the pages with a damping factor of .85, starting from initialVector
+finalRank = PageRank(P, initialVector, .85)
+#print the resulting rank vector (entry k belongs to node k+1 of the data file)
+print(finalRank)
diff --git a/test.dat b/test.dat
@@ -0,0 +1,11 @@
+4 6
+1 blah
+2 blahhh
+3 blerhghgh
+4 help
+1 2
+1 3
+2 1
+2 3
+2 4
+4 3