Skip to content
Permalink
0e708c284a
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
189 lines (146 sloc) 4.86 KB
import sys
import os
import itertools
import numpy as np
def backtrack(v, w, dir_graph):
'''
Backtracks through dir_graph to find v_p, w_p
'''
# Init prime strings, and the indel char
v_p = ""
w_p = ""
indel = "-"
# get our starting index
i = len(v)
j = len(w)
# until we reach 0,0 go through the direction graph and work backwards
while i != 0 and j != 0:
direction = dir_graph[i,j]
if direction == 0:
# We go up, add letter to v_p, add indel to w_p
v_p += v[i-1]
w_p += indel
i -= 1
elif direction == 1:
# We go left, add indel to v_p, add letter to w_p
v_p += indel
w_p += w[j-1]
j -= 1
else:
# We go up/left (diag), add letter to v_p, w_p
v_p += v[i-1]
w_p += w[j-1]
i -= 1
j -= 1
# Reverse the strings since we added from the end first
v_p = v_p[::-1]
w_p = w_p[::-1]
return v_p, w_p
def find_edit_distance(v,w):
'''
Finds the edit distance between w and v
Returns the distance between the two strings
Edit graph is set up as v along y axis (left), w along x axis (top)
Score 0 for match, 1 for mistmatch or indel
'''
# n, m are number of rows, cols
# Note we need the +1 because we need to add the 0th row and column as our starting point
# And python indexes at 0
n = len(v)+1
m = len(w)+1
# Init graph
edit_graph = np.zeros((n, m))
dir_graph = np.zeros((n,m))
# Add distances along x, y axis
# +1 at every step along 0th row, col because indels -> +1
for i in range(n):
edit_graph[i,0] = i
for j in range(m):
edit_graph[0,j] = j
# Go through the graph
for i in range(1, n):
for j in range(1,m):
# Get the diag distance
if v[i-1] == w[j-1]:
diag = edit_graph[i-1,j-1]
else:
diag = edit_graph[i-1,j-1] + 1
# Update
min_index, min_score = min(enumerate([edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag]), key=lambda p: p[1])
edit_graph[i,j] = min_score
# Store direction, backtrack later
# 0 = Up, 1 = left, 2 = diag up left
dir_graph[i,j] = min_index
# Backtrack to get aligned strings (primes)
v_p, w_p = backtrack(v, w, dir_graph)
# Get distance and return it all
distance = edit_graph[n-1,m-1] # Remember we index at 0 so -1
return distance, v_p, w_p
def align_strings(S, min_index):
'''
Aligns strings S around center string min_index
'''
# Pop out our center string from the string list
Sc = S.pop(min_index)
# Loop through and align, going back and changing as needed
for i in range(len(S)):
while i >= 0:
distance, v_p, w_p = find_edit_distance(Sc, S[i])
Sc = v_p
S[i] = w_p
i -= 1
return Sc, S
def main():
'''
Main Function
'''
print "Center Star Algorithm\nStephen Lincoln\nProgramming Assignment 2\n\n\n"
# Try to get file as input
try:
input_file = sys.argv[1]
except:
print "Please supply an input file"
sys.exit()
# Assuming that the input file is correct, for sake of brevity
# Read in n, strings to S
with open(input_file, 'r') as f:
content = f.readlines()
n = int(content[0].strip())
S = [s.strip() for s in content[1:]] # Dont forget to strip newlines and such
if len(S[0]) > 1000:
sys.exit()
# Init scoring table to find center string candidate
scores = [0 for i in range(n)]
# Create all index pairs of strings to score
string_pairs = itertools.combinations(range(n), 2)
# Iterate and score
for pair in string_pairs:
v = S[pair[0]]
w = S[pair[1]]
distance, v_p, w_p = find_edit_distance(v, w)
# Update scores
for i in range(n):
if i in pair:
scores[i] += distance
# get the string number and score of the best candidate
min_index, min_score = min(enumerate(scores), key=lambda p: p[1])
# Now align all the strings with the center string
Sc, S = align_strings(S, min_index)
# Recombine into a list
new_S = [Sc] + S
# Init total cost
total_cost = 0
# Find the total cost
string_pairs = itertools.combinations(range(n), 2)
for pair in string_pairs:
v = new_S[pair[0]]
w = new_S[pair[1]]
for i in range(len(v)):
if v[i] != w[i]:
total_cost += 1
print "Sequences aligned\nmin star cost: %d\ntotal cost: %d" % (min_score, total_cost)
for s in new_S:
print s
# End main
if __name__ == '__main__':
main()