Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
CenterStar/centerstar.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
189 lines (146 sloc)
4.86 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import itertools | |
import numpy as np | |
def backtrack(v, w, dir_graph): | |
''' | |
Backtracks through dir_graph to find v_p, w_p | |
''' | |
# Init prime strings, and the indel char | |
v_p = "" | |
w_p = "" | |
indel = "-" | |
# get our starting index | |
i = len(v) | |
j = len(w) | |
# until we reach 0,0 go through the direction graph and work backwards | |
while i != 0 and j != 0: | |
direction = dir_graph[i,j] | |
if direction == 0: | |
# We go up, add letter to v_p, add indel to w_p | |
v_p += v[i-1] | |
w_p += indel | |
i -= 1 | |
elif direction == 1: | |
# We go left, add indel to v_p, add letter to w_p | |
v_p += indel | |
w_p += w[j-1] | |
j -= 1 | |
else: | |
# We go up/left (diag), add letter to v_p, w_p | |
v_p += v[i-1] | |
w_p += w[j-1] | |
i -= 1 | |
j -= 1 | |
# Reverse the strings since we added from the end first | |
v_p = v_p[::-1] | |
w_p = w_p[::-1] | |
return v_p, w_p | |
def find_edit_distance(v,w): | |
''' | |
Finds the edit distance between w and v | |
Returns the distance between the two strings | |
Edit graph is set up as v along y axis (left), w along x axis (top) | |
Score 0 for match, 1 for mistmatch or indel | |
''' | |
# n, m are number of rows, cols | |
# Note we need the +1 because we need to add the 0th row and column as our starting point | |
# And python indexes at 0 | |
n = len(v)+1 | |
m = len(w)+1 | |
# Init graph | |
edit_graph = np.zeros((n, m)) | |
dir_graph = np.zeros((n,m)) | |
# Add distances along x, y axis | |
# +1 at every step along 0th row, col because indels -> +1 | |
for i in range(n): | |
edit_graph[i,0] = i | |
for j in range(m): | |
edit_graph[0,j] = j | |
# Go through the graph | |
for i in range(1, n): | |
for j in range(1,m): | |
# Get the diag distance | |
if v[i-1] == w[j-1]: | |
diag = edit_graph[i-1,j-1] | |
else: | |
diag = edit_graph[i-1,j-1] + 1 | |
# Update | |
min_index, min_score = min(enumerate([edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag]), key=lambda p: p[1]) | |
edit_graph[i,j] = min_score | |
# Store direction, backtrack later | |
# 0 = Up, 1 = left, 2 = diag up left | |
dir_graph[i,j] = min_index | |
# Backtrack to get aligned strings (primes) | |
v_p, w_p = backtrack(v, w, dir_graph) | |
# Get distance and return it all | |
distance = edit_graph[n-1,m-1] # Remember we index at 0 so -1 | |
return distance, v_p, w_p | |
def align_strings(S, min_index): | |
''' | |
Aligns strings S around center string min_index | |
''' | |
# Pop out our center string from the string list | |
Sc = S.pop(min_index) | |
# Loop through and align, going back and changing as needed | |
for i in range(len(S)): | |
while i >= 0: | |
distance, v_p, w_p = find_edit_distance(Sc, S[i]) | |
Sc = v_p | |
S[i] = w_p | |
i -= 1 | |
return Sc, S | |
def main(): | |
''' | |
Main Function | |
''' | |
print "Center Star Algorithm\nStephen Lincoln\nProgramming Assignment 2\n\n\n" | |
# Try to get file as input | |
try: | |
input_file = sys.argv[1] | |
except: | |
print "Please supply an input file" | |
sys.exit() | |
# Assuming that the input file is correct, for sake of brevity | |
# Read in n, strings to S | |
with open(input_file, 'r') as f: | |
content = f.readlines() | |
n = int(content[0].strip()) | |
S = [s.strip() for s in content[1:]] # Dont forget to strip newlines and such | |
if len(S[0]) > 1000: | |
sys.exit() | |
# Init scoring table to find center string candidate | |
scores = [0 for i in range(n)] | |
# Create all index pairs of strings to score | |
string_pairs = itertools.combinations(range(n), 2) | |
# Iterate and score | |
for pair in string_pairs: | |
v = S[pair[0]] | |
w = S[pair[1]] | |
distance, v_p, w_p = find_edit_distance(v, w) | |
# Update scores | |
for i in range(n): | |
if i in pair: | |
scores[i] += distance | |
# get the string number and score of the best candidate | |
min_index, min_score = min(enumerate(scores), key=lambda p: p[1]) | |
# Now align all the strings with the center string | |
Sc, S = align_strings(S, min_index) | |
# Recombine into a list | |
new_S = [Sc] + S | |
# Init total cost | |
total_cost = 0 | |
# Find the total cost | |
string_pairs = itertools.combinations(range(n), 2) | |
for pair in string_pairs: | |
v = new_S[pair[0]] | |
w = new_S[pair[1]] | |
for i in range(len(v)): | |
if v[i] != w[i]: | |
total_cost += 1 | |
print "Sequences aligned\nmin star cost: %d\ntotal cost: %d" % (min_score, total_cost) | |
for s in new_S: | |
print s | |
# End main | |
if __name__ == '__main__': | |
main() |