
@@ 3,12 +3,55 @@ 



import itertools 



import numpy as np 







def backtrack(v, w, dir_graph): 



''' 



Backtracks through dir_graph to find v_p, w_p 



''' 







# Init prime strings, and the indel char 



v_p = "" 



w_p = "" 



indel = "" 







# get our starting index 



i = len(v) 



j = len(w) 







# until we reach 0,0 go through the direction graph and work backwards 



while i != 0 and j != 0: 







direction = dir_graph[i,j] 







if direction == 0: 



# We go up, add letter to v_p, add indel to w_p 



v_p += v[i1] 



w_p += indel 



i = 1 



elif direction == 1: 



# We go left, add indel to v_p, add letter to w_p 



v_p += indel 



w_p += w[j1] 



j = 1 



else: 



# We go up/left (diag), add letter to v_p, w_p 



v_p += v[i1] 



w_p += w[j1] 



i = 1 



j = 1 







# Reverse the strings since we added from the end first 



v_p = v_p[::1] 



w_p = w_p[::1] 







return v_p, w_p 















def find_edit_distance(v,w): 



''' 



Finds the edit distance between w and v 



Returns the distance between the two strings 



Edit graph is set up as w along y axis (left), v along x axis (top) 



Edit graph is set up as v along y axis (left), w along x axis (top) 



Score 0 for match, 1 for mistmatch or indel 



''' 



# n, m are number of rows, cols 

@@ 20,9 +63,10 @@ def find_edit_distance(v,w): 







# Init graph 



edit_graph = np.zeros((n, m)) 







dir_graph = np.zeros((n,m)) 







# Add distances along x, y axis 



# +1 at every step because indels > +1 



# +1 at every step along 0th row, col because indels > +1 



for i in range(n): 



edit_graph[i,0] = i 



for j in range(m): 

@@ 39,33 +83,61 @@ def find_edit_distance(v,w): 



diag = edit_graph[i1,j1] + 1 







# Update 



edit_graph[i,j] = min(edit_graph[i1, j]+1, edit_graph[i,j1]+1, diag) 



min_index, min_score = min(enumerate([edit_graph[i1, j]+1, edit_graph[i,j1]+1, diag]), key=lambda p: p[1]) 



edit_graph[i,j] = min_score 







# Store direction, backtrack later 



# 0 = Up, 1 = left, 2 = diag up left 



dir_graph[i,j] = min_index 







# Backtrack to get aligned strings (primes) 



v_p, w_p = backtrack(v, w, dir_graph) 







# Return n,m 



distance = edit_graph[n1,m1] 



return distance 



# Get distance and return it all 



distance = edit_graph[n1,m1] # Remember we index at 0 so 1 







return distance, v_p, w_p 











def align_strings(S, min_index): 



''' 



Aligns strings S around center string min_index 



''' 







# Pop out our center string from the string list 



Sc = S.pop(min_index) 







# Loop through and align, going back and changing as needed 



for i in range(len(S)): 



while i >= 0: 



distance, v_p, w_p = find_edit_distance(Sc, S[i]) 



Sc = v_p 



S[i] = w_p 



i = 1 







return Sc, S 







def main(): 



''' 



Main Function 



''' 







print "Center Star Algorithm\nStephen Lincoln\nProgramming Assignment 2\n\n\n" 







# Try to get file as input 



try: 



input_file = sys.argv[1] 



except: 



print "Please supply an input file" 



sys.exit() 











# Assuming that the input file is correct, for sake of brevity 



# Read in n, strings to S 



with open(input_file, 'r') as f: 



content = f.readlines() 







n = int(content[0].strip()) 



S = [s.strip() for s in content[1:]] 



S = [s.strip() for s in content[1:]] # Dont forget to strip newlines and such 







# Init scoring table to find center string candidate 



scores = [0 for i in range(n)] 

@@ 75,16 +147,41 @@ def main(): 







# Iterate and score 



for pair in string_pairs: 



w = S[pair[0]] 



v = S[pair[1]] 



distance = find_edit_distance(w,v) 



v = S[pair[0]] 



w = S[pair[1]] 



distance, v_p, w_p = find_edit_distance(v, w) 







# Update scores 



for i in range(n): 



if i in pair: 



scores[i] += distance 







# get the string number and score of the best candidate 



min_index, min_score = min(enumerate(scores), key=lambda p: p[1]) 







# Now align all the strings with the center string 



Sc, S = align_strings(S, min_index) 







# Recombine into a list 



new_S = [Sc] + S 







# Init total cost 



total_cost = 0 







# Find the total cost 



string_pairs = itertools.combinations(range(n), 2) 



for pair in string_pairs: 



v = new_S[pair[0]] 



w = new_S[pair[1]] 



for i in range(len(v)): 



if v[i] != w[i]: 



total_cost += 1 







print "Sequences aligned\nmin star cost: %d\ntotal cost: %d" % (min_score, total_cost) 



for s in new_S: 



print s 







# End main 







if __name__ == '__main__': 



main() 


0 comments on commit
1fbe751