Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
1 contributor

Users who have contributed to this file

118 lines (95 sloc) 2.98 KB
#This software is a free software. Thus, it is licensed under GNU General Public License version 3 or later.
#Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class.
#Forrest Bao, Sept. 26, 2007 <http://fsbao.net> <forrest.bao aT gmail.com>
import sys, string
from random import *
from numpy import *
from matplotlib import *
alpha='cagt'
#read the first sequence
seq1=''
seq2=''
for i in range(6):
seq1=seq1+choice(alpha);
for i in range(20):
seq2=seq2+choice(alpha);
print seq1, "\n", seq2
m,n = len(seq1),len(seq2) #length of two sequences
#generate DP table and traceback path pointer matrix
score=zeros((m+1,n+1)) #the DP table
pointer=zeros((m+1,n+1)) #to store the traceback path
P=0;
penalty=-1
def match_score(alpha,beta):
if alpha==beta:
return 2
else:
return penalty
max_score=P; #initial maximum score in DP table
#calculate DP table and mark pointers
for i in range(1,m+1):
for j in range(1,n+1):
score_up=score[i-1][j]+penalty ;
score_down=score[i][j-1]+penalty ;
score_diagonal=score[i-1][j-1]+match_score(seq1[i-1],seq2[j-1]);
score[i][j]=max(0,score_up,score_down,score_diagonal);
if score[i][j]==0:
pointer[i][j]=0; #0 means end of the path
if score[i][j]==score_up:
pointer[i][j]=1; #1 means trace up
if score[i][j]==score_down:
pointer[i][j]=2; #2 means trace left
if score[i][j]==score_diagonal:
pointer[i][j]=3; #3 means trace diagonal
if score[i][j]>=max_score:
max_i=i;
max_j=j;
max_score=score[i][j];
#END of DP table
for i in range(m+1):
for j in range(n+1):
print '{0:4d}'.format(int(score[i,j])), '{0:4d}'.format(int(pointer[i][j])), '|',
print
align1,align2='',''; #initial sequences
i,j=max_i,max_j; #indices of path starting point
#missing beginning and ending for this routine
#traceback, follow pointers
while pointer[i][j]!=0:
if pointer[i][j]==3:
align1=align1+seq1[i-1];
align2=align2+seq2[j-1];
i=i-1;
j=j-1;
elif pointer[i][j]==2:
align1=align1+'-';
align2=align2+seq2[j-1];
j=j-1;
elif pointer[i][j]==1:
align1=align1+seq1[i-1];
align2=align2+'-';
i=i-1;
#END of traceback
align1=align1[::-1]; #reverse sequence 1
align2=align2[::-1]; #reverse sequence 2
i,j=0,0;
#calcuate identity, similarity, score and aligned sequeces
def result(align1,align2):
symbol='';
found=0;
score=0;
identity,similarity=0,0;
for i in range(0,len(align1)):
if align1[i]==align2[i]: #if two AAs are the same, then output the letter
symbol=symbol+align1[i];
identity=identity+1;
score=score+match_score(align1[i],align2[i]);
elif align1[i]=='-' or align2[i]=='-': #if one of them is a gap, output a space
symbol=symbol+' ';
score=score+match_score(align1[i],align2[i]);
print 'length of common subsequence = ', identity
print 'Score =', score;
print align1
print align2
print symbol
#END of function result
result(align1,align2)
You can’t perform that action at this time.