From 1fbe751b7b82bdc948e8339beb60cab84d4f0a66 Mon Sep 17 00:00:00 2001 From: Moria Date: Wed, 14 Oct 2015 14:00:53 -0400 Subject: [PATCH] Done --- centerstar.py | 121 +++++++++++++++++++++++++++++++++++++++++++++----- input.txt | 1 - 2 files changed, 109 insertions(+), 13 deletions(-) diff --git a/centerstar.py b/centerstar.py index 4134938..f130978 100644 --- a/centerstar.py +++ b/centerstar.py @@ -3,12 +3,55 @@ import os import itertools import numpy as np +def backtrack(v, w, dir_graph): + ''' + Backtracks through dir_graph to find v_p, w_p + ''' + + # Init prime strings, and the indel char + v_p = "" + w_p = "" + indel = "-" + + # get our starting index + i = len(v) + j = len(w) + + # until we reach 0,0 go through the direction graph and work backwards + while i != 0 and j != 0: + + direction = dir_graph[i,j] + + if direction == 0: + # We go up, add letter to v_p, add indel to w_p + v_p += v[i-1] + w_p += indel + i -= 1 + elif direction == 1: + # We go left, add indel to v_p, add letter to w_p + v_p += indel + w_p += w[j-1] + j -= 1 + else: + # We go up/left (diag), add letter to v_p, w_p + v_p += v[i-1] + w_p += w[j-1] + i -= 1 + j -= 1 + + # Reverse the strings since we added from the end first + v_p = v_p[::-1] + w_p = w_p[::-1] + + return v_p, w_p + + def find_edit_distance(v,w): ''' Finds the edit distance between w and v Returns the distance between the two strings - Edit graph is set up as w along y axis (left), v along x axis (top) + Edit graph is set up as v along y axis (left), w along x axis (top) Score 0 for match, 1 for mistmatch or indel ''' # n, m are number of rows, cols @@ -20,9 +63,10 @@ def find_edit_distance(v,w): # Init graph edit_graph = np.zeros((n, m)) - + dir_graph = np.zeros((n,m)) + # Add distances along x, y axis - # +1 at every step because indels -> +1 + # +1 at every step along 0th row, col because indels -> +1 for i in range(n): edit_graph[i,0] = i for j in range(m): @@ -39,19 +83,47 @@ def find_edit_distance(v,w): diag = edit_graph[i-1,j-1] + 1 # Update - edit_graph[i,j] = min(edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag) + min_index, min_score = min(enumerate([edit_graph[i-1, j]+1, edit_graph[i,j-1]+1, diag]), key=lambda p: p[1]) + edit_graph[i,j] = min_score + + # Store direction, backtrack later + # 0 = Up, 1 = left, 2 = diag up left + dir_graph[i,j] = min_index + + # Backtrack to get aligned strings (primes) + v_p, w_p = backtrack(v, w, dir_graph) - # Return n,m - distance = edit_graph[n-1,m-1] - return distance + # Get distance and return it all + distance = edit_graph[n-1,m-1] # Remember we index at 0 so -1 + + return distance, v_p, w_p + +def align_strings(S, min_index): + ''' + Aligns strings S around center string min_index + ''' + # Pop out our center string from the string list + Sc = S.pop(min_index) + + # Loop through and align, going back and changing as needed + for i in range(len(S)): + while i >= 0: + distance, v_p, w_p = find_edit_distance(Sc, S[i]) + Sc = v_p + S[i] = w_p + i -= 1 + + return Sc, S def main(): ''' Main Function ''' + print "Center Star Algorithm\nStephen Lincoln\nProgramming Assignment 2\n\n\n" + # Try to get file as input try: input_file = sys.argv[1] @@ -59,13 +131,13 @@ def main(): print "Please supply an input file" sys.exit() - + # Assuming that the input file is correct, for sake of brevity # Read in n, strings to S with open(input_file, 'r') as f: content = f.readlines() n = int(content[0].strip()) - S = [s.strip() for s in content[1:]] + S = [s.strip() for s in content[1:]] # Dont forget to strip newlines and such # Init scoring table to find center string candidate scores = [0 for i in range(n)] @@ -75,16 +147,41 @@ def main(): # Iterate and score for pair in string_pairs: - w = S[pair[0]] - v = S[pair[1]] - distance = find_edit_distance(w,v) + v = S[pair[0]] + w = S[pair[1]] + distance, v_p, w_p = find_edit_distance(v, w) # Update scores for i in range(n): if i in pair: scores[i] += distance + # get the string number and score of the best candidate + min_index, min_score = min(enumerate(scores), key=lambda p: p[1]) + + # Now align all the strings with the center string + Sc, S = align_strings(S, min_index) + + # Recombine into a list + new_S = [Sc] + S + + # Init total cost + total_cost = 0 + + # Find the total cost + string_pairs = itertools.combinations(range(n), 2) + for pair in string_pairs: + v = new_S[pair[0]] + w = new_S[pair[1]] + for i in range(len(v)): + if v[i] != w[i]: + total_cost += 1 + print "Sequences aligned\nmin star cost: %d\ntotal cost: %d" % (min_score, total_cost) + for s in new_S: + print s + + # End main if __name__ == '__main__': main() diff --git a/input.txt b/input.txt index 44af585..7ee1f94 100644 --- a/input.txt +++ b/input.txt @@ -3,4 +3,3 @@ AXZ AXXZ AYXYZ AYZ -