Permalink

#This software is a free software. Thus, it is licensed under GNU General Public License version 3 or later. | |

#Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class. | |

#Forrest Bao, Sept. 26, 2007 <http://fsbao.net> <forrest.bao aT gmail.com> | |

import sys, string | |

from random import * | |

from numpy import * | |

from matplotlib import * | |

alpha='cagt' | |

#read the first sequence | |

seq1='' | |

seq2='' | |

for i in range(6): | |

seq1=seq1+choice(alpha); | |

for i in range(20): | |

seq2=seq2+choice(alpha); | |

print seq1, "\n", seq2 | |

m,n = len(seq1),len(seq2) #length of two sequences | |

#generate DP table and traceback path pointer matrix | |

score=zeros((m+1,n+1)) #the DP table | |

pointer=zeros((m+1,n+1)) #to store the traceback path | |

P=0; | |

penalty=-1 | |

def match_score(alpha,beta): | |

if alpha==beta: | |

return 2 | |

else: | |

return penalty | |

max_score=P; #initial maximum score in DP table | |

#calculate DP table and mark pointers | |

for i in range(1,m+1): | |

for j in range(1,n+1): | |

score_up=score[i-1][j]+penalty ; | |

score_down=score[i][j-1]+penalty ; | |

score_diagonal=score[i-1][j-1]+match_score(seq1[i-1],seq2[j-1]); | |

score[i][j]=max(0,score_up,score_down,score_diagonal); | |

if score[i][j]==0: | |

pointer[i][j]=0; #0 means end of the path | |

if score[i][j]==score_up: | |

pointer[i][j]=1; #1 means trace up | |

if score[i][j]==score_down: | |

pointer[i][j]=2; #2 means trace left | |

if score[i][j]==score_diagonal: | |

pointer[i][j]=3; #3 means trace diagonal | |

if score[i][j]>=max_score: | |

max_i=i; | |

max_j=j; | |

max_score=score[i][j]; | |

#END of DP table | |

for i in range(m+1): | |

for j in range(n+1): | |

print '{0:4d}'.format(int(score[i,j])), '{0:4d}'.format(int(pointer[i][j])), '|', | |

align1,align2='',''; #initial sequences | |

i,j=max_i,max_j; #indices of path starting point | |

#missing beginning and ending for this routine | |

#traceback, follow pointers | |

while pointer[i][j]!=0: | |

if pointer[i][j]==3: | |

align1=align1+seq1[i-1]; | |

align2=align2+seq2[j-1]; | |

i=i-1; | |

j=j-1; | |

elif pointer[i][j]==2: | |

align1=align1+'-'; | |

align2=align2+seq2[j-1]; | |

j=j-1; | |

elif pointer[i][j]==1: | |

align1=align1+seq1[i-1]; | |

align2=align2+'-'; | |

i=i-1; | |

#END of traceback | |

align1=align1[::-1]; #reverse sequence 1 | |

align2=align2[::-1]; #reverse sequence 2 | |

i,j=0,0; | |

#calcuate identity, similarity, score and aligned sequeces | |

def result(align1,align2): | |

symbol=''; | |

found=0; | |

score=0; | |

identity,similarity=0,0; | |

for i in range(0,len(align1)): | |

if align1[i]==align2[i]: #if two AAs are the same, then output the letter | |

symbol=symbol+align1[i]; | |

identity=identity+1; | |

score=score+match_score(align1[i],align2[i]); | |

elif align1[i]=='-' or align2[i]=='-': #if one of them is a gap, output a space | |

symbol=symbol+' '; | |

score=score+match_score(align1[i],align2[i]); | |

print 'length of common subsequence = ', identity | |

print 'Score =', score; | |

print align1 | |

print align2 | |

print symbol | |

#END of function result | |

result(align1,align2) |