Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
github-graph-analysis/code/graph_analysis.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
380 lines (338 sloc)
13.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import matplotlib.mlab as mlab | |
import numpy as np | |
import cairo | |
import igraph | |
from igraph import * | |
from scipy import stats as sci | |
# ---------------------------------------------------------------------------
# params
# ---------------------------------------------------------------------------
# Pickled igraph graph that the script loads.
filepath = "C:/Python27_64/Scripts/archive_pickle_projects"

# How the graph is interpreted.
is_directed = False  # forwarded as `directed=` to the diameter computation
is_projects = True   # True: vertices carry language/watchers/forks/size;
                     # False: vertices carry followers/following

# Switches selecting which analysis sections run.
# NOTE: "corrolation" is misspelled, but the name is read by the correlation
# section, so it is kept as is.
do_degree_analysis = False
do_strength_analysis = False
do_diameter_analysis = False
do_analytical_analysis = False
do_corrolation_analysis = False
do_box_plot_analysis = True
do_graph_cluster_analysis = True
do_page_rank_analysis = False

# Passed as `reverse=` when ranking values for the "top 5 or bottom" listings.
use_reverse = True
# Only referenced by the disabled per-cluster analysis code.
look_at_languages = True

# ---------------------------------------------------------------------------
# utility
# ---------------------------------------------------------------------------
# Frequent English words, kept as a list in their original order.
common_words = (
    "the of and a to in is you that it he was for on are as with his they I "
    "at be this have from or one had by word but not what all were we when "
    "your can said there use an each which she do how their if will up other "
    "about out many then them these so some her would make like him into time "
    "has look two more write go see number no way could people my than first "
    "water been call who oil its now find long down day did get come made may "
    "part"
).split()
# read and initialize graph
print("0...initialize and read:")
# NOTE(security): Read_Pickle unpickles `filepath`. Unpickling can execute
# arbitrary code, so only load graph files that come from a trusted source.
# Read_Pickle is called on the class; no throwaway Graph() instance is needed.
g1 = Graph.Read_Pickle(filepath)

# pre-process graph
if is_projects:
    # Rescale every edge weight by the "size" of its two endpoints:
    #   new_weight = (w / size(source) + w / size(target)) / 2
    # NOTE(review): an endpoint whose size is 0 raises ZeroDivisionError here,
    # as it did before; confirm that sizes are always positive.
    vertex_sizes = g1.vs["size"]  # fetched once instead of once per edge
    for edge in g1.es:
        old_weight = edge["weight"]
        edge["weight"] = (
            old_weight / float(vertex_sizes[edge.source])
            + old_weight / float(vertex_sizes[edge.target])
        ) / 2

# Cache the per-vertex metrics as vertex attributes; the analytical and
# boxplot sections read them back as vertex["degree"], vertex["strength"]
# and vertex["pagerank"].
edge_weights = g1.es["weight"]
g1.vs["degree"] = g1.degree()
g1.vs["strength"] = g1.strength(weights=edge_weights)
g1.vs["pagerank"] = g1.pagerank(vertices=None, directed=False, damping=0.85, weights=edge_weights)
def _print_top_vertices(graph, metric, values, selector):
    """Print details of the vertices that hold the first five ranked values.

    `values` is sorted with ``reverse=use_reverse`` and the first five entries
    are reported. For each entry the matching vertices are looked up with
    ``graph.vs.select(**{selector: value})`` and the following is printed:
    a numbered separator, the value itself, the vertex names, the descriptive
    attributes (language/watchers/forks/size when `is_projects` is true,
    followers/following otherwise) and finally the remaining metrics in the
    fixed order degree, strength, pagerank.

    Because `select` returns every vertex with that value, each printed
    attribute is a list, and tied values are reported once per occurrence.

    graph    -- igraph graph whose vertices carry the attributes listed above
    metric   -- "degree", "strength" or "pagerank"; the metric being ranked
    values   -- one value of `metric` per vertex
    selector -- keyword handed to `select` ("_degree" for the structural
                degree, or a vertex attribute name)
    """
    other_metrics = [m for m in ("degree", "strength", "pagerank") if m != metric]
    if is_projects:
        detail_attrs = ["language", "watchers", "forks", "size"]
    else:
        detail_attrs = ["followers", "following"]
    ranked = sorted(values, reverse=use_reverse)
    for rank, value in enumerate(ranked[:5]):
        matches = graph.vs.select(**{selector: value})
        print("-------"+str(rank)+"--------")
        print(value)
        print(matches["name"])
        # Every attribute is read from the vertices matched in THIS iteration.
        # Previously section 1C printed a `degree` left over from section 1B,
        # and the user branch of 1B never printed the pagerank it had read.
        for attr in detail_attrs + other_metrics:
            print(matches[attr])


if do_analytical_analysis:
    print(" ")
    print(" ")
    print("1A...degree analytical analysis (top 5 or bottom):")
    _print_top_vertices(g1, "degree", g1.degree(), "_degree")
    print(" ")
    print(" ")
    print("1B... strength analytical analysis (top 5 or bottom):")
    # Rank the stored attribute values so that the equality lookup in
    # select() is guaranteed to find the vertices it was derived from.
    _print_top_vertices(g1, "strength", g1.vs["strength"], "strength")
    print(" ")
    print(" ")
    print("1C... pagerank analytical analysis (top 5 or bottom):")
    _print_top_vertices(g1, "pagerank", g1.vs["pagerank"], "pagerank")
# Sections 2, 3 and 7: one 50-bin histogram per enabled per-vertex metric.
# Each entry is (switch, heading, callable producing the values); the values
# are only computed when the corresponding switch is on.
_histogram_sections = (
    (
        do_degree_analysis,
        "2...degree distribution analysis:",
        lambda: g1.degree(),
    ),
    (
        do_strength_analysis,
        "3...strength distribution analysis:",
        lambda: g1.strength(weights=g1.es["weight"]),
    ),
    (
        do_page_rank_analysis,
        "7...page rank analysis:",
        lambda: g1.pagerank(vertices=None, directed=False, damping=0.85, weights=g1.es["weight"]),
    ),
)
for _enabled, _heading, _compute_values in _histogram_sections:
    if not _enabled:
        continue
    print(" ")
    print(" ")
    print(_heading)
    plt.figure()
    plt.hist(_compute_values(), bins=50)
    plt.show()
if do_corrolation_analysis:
    # Section 5: Pearson correlation matrices (np.corrcoef) between the
    # weighted vertex strength and other per-vertex quantities.
    #
    # Two fixes compared with the earlier version:
    #  * strength is computed WITH the edge weights, as in every other
    #    section; without weights igraph's strength() is just the degree, so
    #    "degree vs strength" compared the degree with itself;
    #  * 5A uses np.corrcoef like 5B-5D; np.correlate is a sliding dot
    #    product, not a correlation coefficient.
    strength = g1.strength(weights=g1.es["weight"])
    print(" ")
    print(" ")
    print("5A...correlation analysis (degree vs strength):")
    result = np.corrcoef(g1.degree(), strength)
    print(result)
    if is_projects:
        comparisons = (("5B", "size"), ("5C", "watchers"), ("5D", "forks"))
    else:
        comparisons = (("5B", "followers"), ("5C", "following"))
    for label, attribute in comparisons:
        print(label + "...correlation analysis (strength vs " + attribute + "):")
        result = np.corrcoef(strength, g1.vs[attribute])
        print(result)
if do_box_plot_analysis and is_projects:
    # Section 6: compare the distribution of vertex strength across languages
    # (Shapiro-Wilk per language, boxplot, one-way ANOVA, rank-sum test).
    print(" ")
    print(" ")
    print("6...boxplot analysis (categorical correlation):")

    # language -> list of strengths of the vertices with that language
    count = {}
    for vertex in g1.vs:
        count.setdefault(vertex["language"], []).append(float(vertex["strength"]))

    # Materialise keys and values once: both lists share the dict's iteration
    # order, and a list can be indexed on Python 2 and Python 3 alike.
    languages = list(count.keys())
    samples = list(count.values())

    # Normality check for every language with more than three data points.
    # The printed index is the position of the group in `samples`.
    for c, (key, val) in enumerate(zip(languages, samples)):
        if len(val) > 3:
            w, p_val = sci.shapiro(val)
            print(c)
            print("pvalue:"+str(p_val))
            print("mean:"+str(np.mean(val)))
            # str() because the language attribute may be None
            print("langage:"+str(key))
            print("# of points:"+str(len(val)))
            print("-------")

    plt.figure()
    plt.boxplot(samples)
    # One tick per box; boxes are drawn at positions 1..len(samples).
    # (Previously hard-coded to exactly 14 groups.)
    plt.xticks(range(1, len(languages) + 1), languages)
    plt.show()

    # NOTE(review): the group indices below are hard-coded and depend on the
    # dict's iteration order for one particular data set (the messages label
    # 5 and 11 as javascript and C++). They raise IndexError when fewer than
    # 13 languages are present. Confirm against the indices printed above
    # before trusting the results on other data.
    anova_groups = [samples[i] for i in (0, 2, 3, 4, 5, 6, 7, 9, 11, 12)]
    f_val, p_val = sci.f_oneway(*anova_groups)
    print("For ANOVA All: "+ str(p_val))
    f_val, p_val = sci.f_oneway(samples[5], samples[11])
    print("For ANOVA javascript,C++: "+ str(p_val))
    z_val, p_val = sci.ranksums(samples[5], samples[11])
    print("For Ranksum javascript,C++: "+ str(p_val))
if do_graph_cluster_analysis:
    # Section 7: detect communities, report the modularity of the partition
    # and draw the contracted (one vertex per community) graph to a PDF.
    print(" ")
    print(" ")
    print("7...graph cluster analysis:")
    # Community detection method switches.
    use_cim = True   # infomap
    use_cle = False  # leading eigenvector
    use_clp = False  # label propagation
    use_cml = False  # multilevel
    edge_weights = g1.es["weight"]
    # The chain keeps the precedence of the earlier independent `if`s (a later
    # switch overrode an earlier one) but runs only the selected algorithm,
    # and fails with a clear message instead of a NameError when every switch
    # is off.
    if use_cml:
        clust = g1.community_multilevel(weights=edge_weights, return_levels=False)
    elif use_clp:
        clust = g1.community_label_propagation(weights=edge_weights)
    elif use_cle:
        clust = g1.community_leading_eigenvector(clusters=90, weights=edge_weights)
    elif use_cim:
        clust = g1.community_infomap(edge_weights=edge_weights)
    else:
        raise ValueError("no community detection method is enabled")
    modularity_score = g1.modularity(membership=clust, weights=edge_weights)
    cont_graph = clust.cluster_graph(combine_vertices=True, combine_edges=True)
    # Only consumed by the disabled per-cluster analysis below.
    sub_graphs = clust.subgraphs()
    # Disabled per-cluster analysis, kept as an inert string literal: it is
    # never executed.
    '''
    count = 0
    for graph in sub_graphs:
        visual_style = {}
        visual_style["vertex_size"] = 2
        visual_style["vertex_label"] = graph.vs["name"]
        visual_style["vertex_color"] = "red"
        visual_style["edge_label"] = graph.es["weight"]
        visual_style["edge_width"] = 1
        visual_style["layout"] = g1.layout("kk")
        visual_style["bbox"] = (1000, 1000)
        visual_style["margin"] = 100
        filepath = "visual_group_"+str(count)+".pdf"
        igraph.plot( graph, filepath, **visual_style )
        if(is_projects and look_at_languages):
            dist_count = {}
            for vertex in graph.vs.select():
                if(vertex["language"] in dist_count and dist_count[vertex["language"]] is not None):
                    dist_count[vertex["language"]] += 1
                elif(vertex["language"] not in dist_count):
                    dist_count[vertex["language"]] = 1
        elif(is_projects):
            dist_count = {}
            for vertex in graph.vs.select():
                words = vertex["description"].split()
                for word in words:
                    word = word.lower()
                    if(word in dist_count and word not in common_words and dist_count[word] is not None):
                        dist_count[word] += 1
                    elif(vertex["language"] not in dist_count and word not in common_words):
                        dist_count[word] = 1
            for k,v in dist_count.items():
                if(v < np.median(dist_count.values())):
                    dist_count.pop(k, None)
        else:
            dist_count = {}
            for vertex in graph.vs.select():
                if(vertex["location"] in dist_count and dist_count[vertex["location"]] is not None):
                    dist_count[vertex["location"]] += 1
                elif(vertex["location"] not in dist_count):
                    dist_count[vertex["location"]] = 1
            for k,v in dist_count.items():
                if(v <= np.median(dist_count.values())):
                    dist_count.pop(k, None)
        pos = np.arange(len(dist_count.keys()))
        freq = dist_count.values()
        width = 1.0
        ax = plt.axes()
        ax.set_xticks(pos + (width / 2))
        ax.set_xticklabels(dist_count.keys())
        plt.bar(pos,freq,width,color='r')
        filepath = "dist_users_"+str(count)+".pdf"
        plt.show()
        count+=1
    '''
    # Draw the contracted graph with a Kamada-Kawai ("kk") layout.
    visual_style = {
        "vertex_size": 10,
        "vertex_color": "red",
        "edge_width": 1,
        "layout": cont_graph.layout("kk"),
        "bbox": (1000, 1000),
        "margin": 100,
    }
    igraph.plot(cont_graph, "visual_contracted.pdf", **visual_style)
    print(modularity_score)
if do_diameter_analysis:
    # Section 8: farthest vertex pair and a diameter path, first with the
    # edge weights and then unweighted (weights=None).
    # Both calls now take `directed=is_directed`; farthest_points used to
    # hard-code directed=False, which would disagree with get_diameter as soon
    # as `is_directed` were switched on.
    print(" ")
    print(" ")
    diameter_runs = (
        ("8A...diameter analysis (weighted):", g1.es["weight"]),
        ("8B...diameter analysis (un-weighted):", None),
    )
    for heading, run_weights in diameter_runs:
        print(heading)
        farthest_points = g1.farthest_points(directed=is_directed, unconn=True, weights=run_weights)
        diameter = g1.get_diameter(directed=is_directed, unconn=True, weights=run_weights)
        print(farthest_points)
        print(diameter)