Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Done
  • Loading branch information
Moria committed Oct 14, 2015
1 parent 6003609 commit 1fbe751
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 13 deletions.
121 changes: 109 additions & 12 deletions centerstar.py
Expand Up @@ -3,12 +3,55 @@ import os
import itertools import itertools
import numpy as np import numpy as np


def backtrack(v, w, dir_graph):
'''
Backtracks through dir_graph to find v_p, w_p
'''

# Init prime strings, and the indel char
v_p = ""
w_p = ""
indel = "-"

# get our starting index
i = len(v)
j = len(w)

# until we reach 0,0 go through the direction graph and work backwards
while i != 0 and j != 0:

direction = dir_graph[i,j]

if direction == 0:
# We go up, add letter to v_p, add indel to w_p
v_p += v[i-1]
w_p += indel
i -= 1
elif direction == 1:
# We go left, add indel to v_p, add letter to w_p
v_p += indel
w_p += w[j-1]
j -= 1
else:
# We go up/left (diag), add letter to v_p, w_p
v_p += v[i-1]
w_p += w[j-1]
i -= 1
j -= 1

# Reverse the strings since we added from the end first
v_p = v_p[::-1]
w_p = w_p[::-1]

return v_p, w_p




def find_edit_distance(v,w): def find_edit_distance(v,w):
''' '''
Finds the edit distance between w and v Finds the edit distance between w and v
Returns the distance between the two strings Returns the distance between the two strings
Edit graph is set up as w along y axis (left), v along x axis (top) Edit graph is set up as v along y axis (left), w along x axis (top)
Score 0 for match, 1 for mistmatch or indel Score 0 for match, 1 for mistmatch or indel
''' '''
# n, m are number of rows, cols # n, m are number of rows, cols
Expand All @@ -20,9 +63,10 @@ def find_edit_distance(v,w):


# Init graph # Init graph
edit_graph = np.zeros((n, m)) edit_graph = np.zeros((n, m))

dir_graph = np.zeros((n,m))

# Add distances along x, y axis # Add distances along x, y axis
# +1 at every step because indels -> +1 # +1 at every step along 0th row, col because indels -> +1
for i in range(n): for i in range(n):
edit_graph[i,0] = i edit_graph[i,0] = i
for j in range(m): for j in range(m):
Expand All @@ -39,33 +83,61 @@ def find_edit_distance(v,w):
diag = edit_graph[i-1,j-1] + 1 diag = edit_graph[i-1,j-1] + 1


# Update # Update
edit_graph[i,j] = min(edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag) min_index, min_score = min(enumerate([edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag]), key=lambda p: p[1])
edit_graph[i,j] = min_score

# Store direction, backtrack later
# 0 = Up, 1 = left, 2 = diag up left
dir_graph[i,j] = min_index

# Backtrack to get aligned strings (primes)
v_p, w_p = backtrack(v, w, dir_graph)


# Return n,m # Get distance and return it all
distance = edit_graph[n-1,m-1] distance = edit_graph[n-1,m-1] # Remember we index at 0 so -1
return distance
return distance, v_p, w_p



def align_strings(S, min_index):
'''
Aligns strings S around center string min_index
'''


# Pop out our center string from the string list
Sc = S.pop(min_index)

# Loop through and align, going back and changing as needed
for i in range(len(S)):
while i >= 0:
distance, v_p, w_p = find_edit_distance(Sc, S[i])
Sc = v_p
S[i] = w_p
i -= 1

return Sc, S


def main(): def main():
''' '''
Main Function Main Function
''' '''


print "Center Star Algorithm\nStephen Lincoln\nProgramming Assignment 2\n\n\n"

# Try to get file as input # Try to get file as input
try: try:
input_file = sys.argv[1] input_file = sys.argv[1]
except: except:
print "Please supply an input file" print "Please supply an input file"
sys.exit() sys.exit()



# Assuming that the input file is correct, for sake of brevity
# Read in n, strings to S # Read in n, strings to S
with open(input_file, 'r') as f: with open(input_file, 'r') as f:
content = f.readlines() content = f.readlines()


n = int(content[0].strip()) n = int(content[0].strip())
S = [s.strip() for s in content[1:]] S = [s.strip() for s in content[1:]] # Dont forget to strip newlines and such


# Init scoring table to find center string candidate # Init scoring table to find center string candidate
scores = [0 for i in range(n)] scores = [0 for i in range(n)]
Expand All @@ -75,16 +147,41 @@ def main():


# Iterate and score # Iterate and score
for pair in string_pairs: for pair in string_pairs:
w = S[pair[0]] v = S[pair[0]]
v = S[pair[1]] w = S[pair[1]]
distance = find_edit_distance(w,v) distance, v_p, w_p = find_edit_distance(v, w)


# Update scores # Update scores
for i in range(n): for i in range(n):
if i in pair: if i in pair:
scores[i] += distance scores[i] += distance


# get the string number and score of the best candidate
min_index, min_score = min(enumerate(scores), key=lambda p: p[1])

# Now align all the strings with the center string
Sc, S = align_strings(S, min_index)

# Recombine into a list
new_S = [Sc] + S

# Init total cost
total_cost = 0

# Find the total cost
string_pairs = itertools.combinations(range(n), 2)
for pair in string_pairs:
v = new_S[pair[0]]
w = new_S[pair[1]]
for i in range(len(v)):
if v[i] != w[i]:
total_cost += 1


print "Sequences aligned\nmin star cost: %d\ntotal cost: %d" % (min_score, total_cost)
for s in new_S:
print s

# End main


if __name__ == '__main__': if __name__ == '__main__':
main() main()
Expand Down
1 change: 0 additions & 1 deletion input.txt
Expand Up @@ -3,4 +3,3 @@ AXZ
AXXZ AXXZ
AYXYZ AYXYZ
AYZ AYZ

0 comments on commit 1fbe751

Please sign in to comment.