Skip to content
Permalink
Browse files

Done

  • Loading branch information...
Moria
Moria committed Oct 14, 2015
1 parent 6003609 commit 1fbe751b7b82bdc948e8339beb60cab84d4f0a66
Showing with 109 additions and 13 deletions.
  1. +109 −12 centerstar.py
  2. +0 −1 input.txt
@@ -3,12 +3,55 @@
import itertools
import numpy as np

def backtrack(v, w, dir_graph):
'''
Backtracks through dir_graph to find v_p, w_p
'''

# Init prime strings, and the indel char
v_p = ""
w_p = ""
indel = "-"

# get our starting index
i = len(v)
j = len(w)

# until we reach 0,0 go through the direction graph and work backwards
while i != 0 and j != 0:

direction = dir_graph[i,j]

if direction == 0:
# We go up, add letter to v_p, add indel to w_p
v_p += v[i-1]
w_p += indel
i -= 1
elif direction == 1:
# We go left, add indel to v_p, add letter to w_p
v_p += indel
w_p += w[j-1]
j -= 1
else:
# We go up/left (diag), add letter to v_p, w_p
v_p += v[i-1]
w_p += w[j-1]
i -= 1
j -= 1

# Reverse the strings since we added from the end first
v_p = v_p[::-1]
w_p = w_p[::-1]

return v_p, w_p



def find_edit_distance(v,w):
'''
Finds the edit distance between w and v
Returns the distance between the two strings
Edit graph is set up as w along y axis (left), v along x axis (top)
Edit graph is set up as v along y axis (left), w along x axis (top)
Score 0 for match, 1 for mistmatch or indel
'''
# n, m are number of rows, cols
@@ -20,9 +63,10 @@ def find_edit_distance(v,w):

# Init graph
edit_graph = np.zeros((n, m))

dir_graph = np.zeros((n,m))

# Add distances along x, y axis
# +1 at every step because indels -> +1
# +1 at every step along 0th row, col because indels -> +1
for i in range(n):
edit_graph[i,0] = i
for j in range(m):
@@ -39,33 +83,61 @@ def find_edit_distance(v,w):
diag = edit_graph[i-1,j-1] + 1

# Update
edit_graph[i,j] = min(edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag)
min_index, min_score = min(enumerate([edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag]), key=lambda p: p[1])
edit_graph[i,j] = min_score

# Store direction, backtrack later
# 0 = Up, 1 = left, 2 = diag up left
dir_graph[i,j] = min_index

# Backtrack to get aligned strings (primes)
v_p, w_p = backtrack(v, w, dir_graph)

# Return n,m
distance = edit_graph[n-1,m-1]
return distance
# Get distance and return it all
distance = edit_graph[n-1,m-1] # Remember we index at 0 so -1

return distance, v_p, w_p


def align_strings(S, min_index):
'''
Aligns strings S around center string min_index
'''

# Pop out our center string from the string list
Sc = S.pop(min_index)

# Loop through and align, going back and changing as needed
for i in range(len(S)):
while i >= 0:
distance, v_p, w_p = find_edit_distance(Sc, S[i])
Sc = v_p
S[i] = w_p
i -= 1

return Sc, S

def main():
'''
Main Function
'''

print "Center Star Algorithm\nStephen Lincoln\nProgramming Assignment 2\n\n\n"

# Try to get file as input
try:
input_file = sys.argv[1]
except:
print "Please supply an input file"
sys.exit()


# Assuming that the input file is correct, for sake of brevity
# Read in n, strings to S
with open(input_file, 'r') as f:
content = f.readlines()

n = int(content[0].strip())
S = [s.strip() for s in content[1:]]
S = [s.strip() for s in content[1:]] # Dont forget to strip newlines and such

# Init scoring table to find center string candidate
scores = [0 for i in range(n)]
@@ -75,16 +147,41 @@ def main():

# Iterate and score
for pair in string_pairs:
w = S[pair[0]]
v = S[pair[1]]
distance = find_edit_distance(w,v)
v = S[pair[0]]
w = S[pair[1]]
distance, v_p, w_p = find_edit_distance(v, w)

# Update scores
for i in range(n):
if i in pair:
scores[i] += distance

# get the string number and score of the best candidate
min_index, min_score = min(enumerate(scores), key=lambda p: p[1])

# Now align all the strings with the center string
Sc, S = align_strings(S, min_index)

# Recombine into a list
new_S = [Sc] + S

# Init total cost
total_cost = 0

# Find the total cost
string_pairs = itertools.combinations(range(n), 2)
for pair in string_pairs:
v = new_S[pair[0]]
w = new_S[pair[1]]
for i in range(len(v)):
if v[i] != w[i]:
total_cost += 1

print "Sequences aligned\nmin star cost: %d\ntotal cost: %d" % (min_score, total_cost)
for s in new_S:
print s

# End main

if __name__ == '__main__':
main()
@@ -3,4 +3,3 @@ AXZ
AXXZ
AYXYZ
AYZ

0 comments on commit 1fbe751

Please sign in to comment.
You can’t perform that action at this time.