"""Pairwise edit-distance scoring used to pick a center-string candidate.

Input file format (as in the sample ``input.txt``): the first line holds the
number of strings ``n``; each of the following ``n`` lines holds one string::

    4
    AXZ
    AXXZ
    AYXYZ
    AYZ
"""
import sys
import os
import itertools
import numpy as np


def find_edit_distance(v, w):
    '''
    Finds the edit distance between v and w.

    The edit graph has v along the rows (left) and w along the columns (top).
    Scoring: 0 for a match, 1 for a mismatch or an indel.

    v, w: the two sequences to compare (either may be empty).
    Returns the distance between the two strings as a plain int.
    '''
    # n, m are the number of rows, cols. The +1 accounts for the 0th row and
    # column, which represent the empty prefix and act as the starting point.
    n = len(v) + 1
    m = len(w) + 1

    # Integer table: every score is a count of edits, so floats are not needed.
    edit_graph = np.zeros((n, m), dtype=int)

    # Distances along the axes: reaching prefix length k from the empty
    # prefix takes exactly k indels.
    edit_graph[:, 0] = np.arange(n)
    edit_graph[0, :] = np.arange(m)

    # Fill the rest of the graph row by row.
    for i in range(1, n):
        for j in range(1, m):
            # Diagonal move is free on a match, costs 1 on a mismatch.
            mismatch = 0 if v[i - 1] == w[j - 1] else 1
            edit_graph[i, j] = min(
                edit_graph[i - 1, j] + 1,              # indel (step down)
                edit_graph[i, j - 1] + 1,              # indel (step right)
                edit_graph[i - 1, j - 1] + mismatch,   # match / mismatch
            )

    # The bottom-right cell holds the distance between the full strings.
    return int(edit_graph[n - 1, m - 1])


def _sum_pairwise_distances(strings):
    '''
    Returns a list where entry i is the sum of the edit distances between
    strings[i] and every other string in the list.
    '''
    scores = [0] * len(strings)

    # Each unordered pair is scored once; the distance counts for both members.
    for a, b in itertools.combinations(range(len(strings)), 2):
        distance = find_edit_distance(strings[a], strings[b])
        scores[a] += distance
        scores[b] += distance

    return scores


def main():
    '''
    Main Function

    Reads the input file named by the first command-line argument and scores
    every string by its summed edit distance to all the other strings.
    Returns the list of scores (index i belongs to the i-th string).
    Exits with status 1 if no input file was supplied.
    '''
    # Try to get file as input
    try:
        input_file = sys.argv[1]
    except IndexError:
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("Please supply an input file")
        sys.exit(1)

    # Read in n, strings to S
    with open(input_file, 'r') as f:
        content = f.readlines()

    n = int(content[0].strip())
    # Only the n declared strings are used; trailing lines are ignored.
    S = [s.strip() for s in content[1:n + 1]]
    if len(S) < n:
        raise ValueError(
            "Expected %d strings but found only %d" % (n, len(S)))

    # Scoring table used to find the center string candidate.
    # NOTE: nothing is printed yet; the scores are only returned.
    return _sum_pairwise_distances(S)


if __name__ == '__main__':
    main()