Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
Got edit distance working
- Loading branch information
Moria
committed
Oct 14, 2015
1 parent
014ef2b
commit 6003609
Showing
3 changed files
with
103 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
import sys | ||
import os | ||
import itertools | ||
import numpy as np | ||
|
||
|
||
def find_edit_distance(v, w):
    '''
    Finds the edit distance between v and w.

    Scoring: 0 for a match, 1 for a mismatch or an indel.
    The edit graph is set up with v along the rows (left side) and
    w along the columns (top).

    Returns the distance between the two strings as a plain int.
    '''
    # n, m are number of rows, cols.
    # The +1 adds the 0th row and column (the empty prefix) as the
    # starting point of the graph.
    n = len(v) + 1
    m = len(w) + 1

    # Integer table: distances are counts, so avoid the float64 default
    # of np.zeros which would make the result come back as e.g. 3.0.
    edit_graph = np.zeros((n, m), dtype=int)

    # First column / first row: reaching a prefix of length k from the
    # empty string costs k indels.
    for i in range(n):
        edit_graph[i, 0] = i
    for j in range(m):
        edit_graph[0, j] = j

    # Fill the rest of the graph row by row.
    for i in range(1, n):
        for j in range(1, m):
            # Diagonal move is free on a match, costs 1 on a mismatch.
            mismatch = 0 if v[i - 1] == w[j - 1] else 1

            edit_graph[i, j] = min(
                edit_graph[i - 1, j] + 1,          # indel (move down)
                edit_graph[i, j - 1] + 1,          # indel (move right)
                edit_graph[i - 1, j - 1] + mismatch
            )

    # The bottom-right cell holds the distance between the full strings.
    return int(edit_graph[n - 1, m - 1])
|
||
|
||
|
||
def main():
    '''
    Main Function

    Reads the input file named by the first command-line argument.
    The first line of the file holds the number of strings n; the
    following lines hold the strings themselves. Every pair among the
    first n strings is scored with find_edit_distance, and each string's
    summed distance to the others is accumulated in a scoring table used
    to find the center string candidate.

    Exits with status 1 after printing a message if no input file
    argument was supplied.
    '''
    # Only a missing argument should trigger the usage message; a bare
    # except would also swallow KeyboardInterrupt and SystemExit.
    try:
        input_file = sys.argv[1]
    except IndexError:
        # print(...) with a single string emits the same text on
        # Python 2 and Python 3.
        print("Please supply an input file")
        sys.exit(1)

    # Read in n, strings to S
    with open(input_file, 'r') as f:
        content = f.readlines()

    n = int(content[0].strip())
    S = [s.strip() for s in content[1:]]

    # Init scoring table to find center string candidate
    scores = [0] * n

    # Score every unordered index pair once. The two indices of a
    # combination are always distinct, so crediting both entries
    # directly is equivalent to scanning all n indices for membership.
    for first, second in itertools.combinations(range(n), 2):
        distance = find_edit_distance(S[first], S[second])
        scores[first] += distance
        scores[second] += distance

    # NOTE(review): scores is computed but not yet reported or returned.
|
||
|
||
|
||
# Run the script only when executed directly, not when imported.
if __name__ == "__main__":
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
4 | ||
AXZ | ||
AXXZ | ||
AYXYZ | ||
AYZ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
5 | ||
CCTGCTGCAG | ||
GATGTGCCG | ||
GATGTGCAG | ||
CCGCTAGCAG | ||
CCTGTAGG |