Skip to content
Permalink
6003609612
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
91 lines (67 sloc) 2.06 KB
import sys
import os
import itertools
import numpy as np
def find_edit_distance(v, w):
    '''
    Finds the edit (Levenshtein) distance between v and w.

    Edit graph is set up with v along the y axis (rows, left) and w along
    the x axis (columns, top). Score 0 for a match, 1 for a mismatch or
    an indel.

    v, w: the two sequences to compare; they only need to support len()
          and integer indexing (e.g. strings).
    Returns the distance between the two sequences as a plain int.
    '''
    # n, m are number of rows, cols.
    # The +1 adds the 0th row and column, which stand for the empty prefix
    # and act as the starting point of the graph.
    n = len(v) + 1
    m = len(w) + 1

    # Integer table: every score is a whole number, so the result should
    # come back as 3 rather than 3.0.
    edit_graph = np.zeros((n, m), dtype=int)

    # Borders: reaching a prefix of length k from the empty string takes
    # k indels, each costing +1.
    edit_graph[:, 0] = np.arange(n)
    edit_graph[0, :] = np.arange(m)

    # Fill the rest of the graph row by row.
    for i in range(1, n):
        for j in range(1, m):
            # Diagonal move is free on a match and costs 1 on a mismatch.
            mismatch = 0 if v[i - 1] == w[j - 1] else 1
            diag = edit_graph[i - 1, j - 1] + mismatch
            # Vertical / horizontal moves are indels and always cost 1.
            edit_graph[i, j] = min(edit_graph[i - 1, j] + 1,
                                   edit_graph[i, j - 1] + 1,
                                   diag)

    # The bottom-right cell holds the distance between the full sequences.
    return int(edit_graph[n - 1, m - 1])
def main():
    '''
    Main Function

    Reads the file named by the first command-line argument. The first
    line of the file is parsed as an integer n and the remaining lines are
    taken as strings. For every unordered pair among the first n strings
    the edit distance is computed and added to the running score of both
    strings in the pair.

    Exits with status 1 after printing a message when no input file is
    supplied.
    '''
    # Try to get file as input. Only a missing argument is expected here,
    # so catch IndexError rather than everything.
    try:
        input_file = sys.argv[1]
    except IndexError:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("Please supply an input file")
        # Non-zero status so callers can tell the run did not happen.
        sys.exit(1)

    # Read in n, strings to S
    with open(input_file, 'r') as f:
        content = f.readlines()
    n = int(content[0].strip())
    S = [s.strip() for s in content[1:]]

    # Init scoring table to find center string candidate:
    # scores[i] accumulates the distance from string i to every other one.
    scores = [0] * n

    # Iterate over all index pairs and score them
    for first, second in itertools.combinations(range(n), 2):
        distance = find_edit_distance(S[first], S[second])
        # Each pairwise distance counts towards both members of the pair.
        scores[first] += distance
        scores[second] += distance

    # NOTE(review): scores is computed but never printed or returned in
    # this function -- confirm whether selecting/reporting the center
    # string is still to be added.
# Run main() only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()