Permalink
Cannot retrieve contributors at this time

import sys | |

import os | |

import itertools | |

import numpy as np | |

def backtrack(v, w, dir_graph): | |

''' | |

Backtracks through dir_graph to find v_p, w_p | |

''' | |

# Init prime strings, and the indel char | |

v_p = "" | |

w_p = "" | |

indel = "-" | |

# get our starting index | |

i = len(v) | |

j = len(w) | |

# until we reach 0,0 go through the direction graph and work backwards | |

while i != 0 and j != 0: | |

direction = dir_graph[i,j] | |

if direction == 0: | |

# We go up, add letter to v_p, add indel to w_p | |

v_p += v[i-1] | |

w_p += indel | |

i -= 1 | |

elif direction == 1: | |

# We go left, add indel to v_p, add letter to w_p | |

v_p += indel | |

w_p += w[j-1] | |

j -= 1 | |

else: | |

# We go up/left (diag), add letter to v_p, w_p | |

v_p += v[i-1] | |

w_p += w[j-1] | |

i -= 1 | |

j -= 1 | |

# Reverse the strings since we added from the end first | |

v_p = v_p[::-1] | |

w_p = w_p[::-1] | |

return v_p, w_p | |

def find_edit_distance(v,w): | |

''' | |

Finds the edit distance between w and v | |

Returns the distance between the two strings | |

Edit graph is set up as v along y axis (left), w along x axis (top) | |

Score 0 for match, 1 for mistmatch or indel | |

''' | |

# n, m are number of rows, cols | |

# Note we need the +1 because we need to add the 0th row and column as our starting point | |

# And python indexes at 0 | |

n = len(v)+1 | |

m = len(w)+1 | |

# Init graph | |

edit_graph = np.zeros((n, m)) | |

dir_graph = np.zeros((n,m)) | |

# Add distances along x, y axis | |

# +1 at every step along 0th row, col because indels -> +1 | |

for i in range(n): | |

edit_graph[i,0] = i | |

for j in range(m): | |

edit_graph[0,j] = j | |

# Go through the graph | |

for i in range(1, n): | |

for j in range(1,m): | |

# Get the diag distance | |

if v[i-1] == w[j-1]: | |

diag = edit_graph[i-1,j-1] | |

else: | |

diag = edit_graph[i-1,j-1] + 1 | |

# Update | |

min_index, min_score = min(enumerate([edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag]), key=lambda p: p[1]) | |

edit_graph[i,j] = min_score | |

# Store direction, backtrack later | |

# 0 = Up, 1 = left, 2 = diag up left | |

dir_graph[i,j] = min_index | |

# Backtrack to get aligned strings (primes) | |

v_p, w_p = backtrack(v, w, dir_graph) | |

# Get distance and return it all | |

distance = edit_graph[n-1,m-1] # Remember we index at 0 so -1 | |

return distance, v_p, w_p | |

def align_strings(S, min_index): | |

''' | |

Aligns strings S around center string min_index | |

''' | |

# Pop out our center string from the string list | |

Sc = S.pop(min_index) | |

# Loop through and align, going back and changing as needed | |

for i in range(len(S)): | |

while i >= 0: | |

distance, v_p, w_p = find_edit_distance(Sc, S[i]) | |

Sc = v_p | |

S[i] = w_p | |

i -= 1 | |

return Sc, S | |

def main(): | |

''' | |

Main Function | |

''' | |

print "Center Star Algorithm\nStephen Lincoln\nProgramming Assignment 2\n\n\n" | |

# Try to get file as input | |

try: | |

input_file = sys.argv[1] | |

except: | |

print "Please supply an input file" | |

sys.exit() | |

# Assuming that the input file is correct, for sake of brevity | |

# Read in n, strings to S | |

with open(input_file, 'r') as f: | |

content = f.readlines() | |

n = int(content[0].strip()) | |

S = [s.strip() for s in content[1:]] # Dont forget to strip newlines and such | |

if len(S[0]) > 1000: | |

sys.exit() | |

# Init scoring table to find center string candidate | |

scores = [0 for i in range(n)] | |

# Create all index pairs of strings to score | |

string_pairs = itertools.combinations(range(n), 2) | |

# Iterate and score | |

for pair in string_pairs: | |

v = S[pair[0]] | |

w = S[pair[1]] | |

distance, v_p, w_p = find_edit_distance(v, w) | |

# Update scores | |

for i in range(n): | |

if i in pair: | |

scores[i] += distance | |

# get the string number and score of the best candidate | |

min_index, min_score = min(enumerate(scores), key=lambda p: p[1]) | |

# Now align all the strings with the center string | |

Sc, S = align_strings(S, min_index) | |

# Recombine into a list | |

new_S = [Sc] + S | |

# Init total cost | |

total_cost = 0 | |

# Find the total cost | |

string_pairs = itertools.combinations(range(n), 2) | |

for pair in string_pairs: | |

v = new_S[pair[0]] | |

w = new_S[pair[1]] | |

for i in range(len(v)): | |

if v[i] != w[i]: | |

total_cost += 1 | |

print "Sequences aligned\nmin star cost: %d\ntotal cost: %d" % (min_score, total_cost) | |

for s in new_S: | |

print s | |

# End main | |

if __name__ == '__main__': | |

main() |