# stl08007/CenterStar

Got edit distance working Moria committed Oct 14, 2015
1 parent 014ef2b commit 600360961249509937446227a4967eaa721921ab
Showing with 103 additions and 0 deletions.
1. +91 −0 centerstar.py
2. +6 −0 input.txt
3. +6 −0 test_inputs.txt
 # @@ -0,0 +1,91 @@
import sys
import os
import itertools

import numpy as np


def find_edit_distance(v, w):
    '''
    Find the edit (Levenshtein) distance between the strings v and w.

    The edit graph has v along the rows (y axis, left) and w along the
    columns (x axis, top).  A match scores 0; a mismatch or an indel
    scores 1.

    Returns the distance as the value stored in the bottom-right cell of
    the graph (a numpy float, since the graph is built with np.zeros).
    '''
    # n, m are the number of rows and columns.  The +1 accounts for the
    # 0th row/column, which represent the empty prefix of each string.
    n = len(v) + 1
    m = len(w) + 1

    # Init graph
    edit_graph = np.zeros((n, m))

    # Distances along the axes: reaching cell (i, 0) or (0, j) takes
    # i or j indels respectively.
    edit_graph[:, 0] = np.arange(n)
    edit_graph[0, :] = np.arange(m)

    # Fill the rest of the graph row by row.
    for i in range(1, n):
        for j in range(1, m):
            # Diagonal move: free on a match, costs 1 on a mismatch.
            mismatch = 0 if v[i - 1] == w[j - 1] else 1
            diag = edit_graph[i - 1, j - 1] + mismatch

            edit_graph[i, j] = min(edit_graph[i - 1, j] + 1,
                                   edit_graph[i, j - 1] + 1,
                                   diag)

    # The distance between the full strings sits at (n-1, m-1).
    return edit_graph[n - 1, m - 1]


def main():
    '''
    Read the input file named by the first command-line argument and score
    every string by the sum of its edit distances to all other strings.

    The first line of the file holds the number of strings n; the strings
    follow, one per line.

    Returns the list of per-string scores (index i belongs to string i).
    Exits with status 1 when no input file is given; raises ValueError
    when the file holds fewer than n strings.
    '''
    # Try to get file as input
    try:
        input_file = sys.argv[1]
    except IndexError:
        print("Please supply an input file")
        sys.exit(1)

    # Read in n, strings to S
    with open(input_file, 'r') as f:
        content = f.readlines()

    n = int(content[0].strip())
    S = [s.strip() for s in content[1:n + 1]]
    if len(S) < n:
        raise ValueError(
            "Expected %d strings but found %d in %s" % (n, len(S), input_file))

    # Init scoring table to find center string candidate
    scores = [0] * n

    # Score every unordered pair of strings once; the distance counts
    # towards the totals of both members of the pair.
    for a, b in itertools.combinations(range(n), 2):
        distance = find_edit_distance(S[a], S[b])
        scores[a] += distance
        scores[b] += distance

    return scores


if __name__ == '__main__':
    main()
 @@ -0,0 +1,6 @@ 4 AXZ AXXZ AYXYZ AYZ
 @@ -0,0 +1,6 @@ 5 CCTGCTGCAG GATGTGCCG GATGTGCAG CCGCTAGCAG CCTGTAGG