Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
141 lines (109 sloc) 4.14 KB
'''
PROBLEM STATEMENT
The closest sequence problem is defined as follows:
Given m DNA sequences s1, . . . , sm, each of length n, find a DNA sequence t of length
n such that maxi=1,...,m dH(t, si) is minimized, where dH(t, si) denotes the Hamming distance
between t and si.
For this project you must implement a method for finding optimum solutions to the closest sequence
problem. You can either implement a branch-and-bound algorithm or use integer programming in
conjunction with optimization engines such as the GNU Linear Programming Kit (GLPK).
INPUT
Your program should read from the standard input a line containing integers m and n, followed by
m lines each containing a DNA sequence of length n.
Sample input:
10 25
CTGGCGGTGGCTATCATCCGTCCCT
CATGCGAGTGGTCGGTGATAGCTCG
GAAGTGTGAGGAATCCGTAGAGAAT
GAACTAAGTAGTTCACCTTACCCTC
CCAACACTCATATCGTCTTGCTACT
TGACTCCTTTTTTATTCATATTTTC
AATACTCGACCTTCCACGAAGGCTG
GGATTCACCTCCCTTTCCGCTGAAT
CAGAGGTAAAAGAAAGGGGGACAAT
GATAATCGTAGAATTAAATAAGACA
OUTPUT
If you use integer programming, your program should print to the standard output an integer program
model of the input problem instance in lp format (see http://lpsolve.sourceforge.net/5.1/CPLEXformat.htm).
'''
# Imports
import os
import sys
# I need to symlink my Anaconda install with this :/
sys.path.append('/usr/local/lib/python2.7/dist-packages')
import cplex
# Absolute path to inputs, outputs
path_to_inputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Inputs/'
path_to_outputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Outputs/'
# Master bases
bases = ['A','C','T','G']
# For each input file...
for fn in os.listdir(path_to_inputs):
# Split filename into prefix, suffix
pre,suff = fn.split('.')
# Reset line counter, init array S to store sequences
line_num = 0
S = []
# Open the files
f_in = open(path_to_inputs+fn, 'r')
f_out = open(path_to_outputs+pre+'.lp', 'w')
# For each line in the input file...
for line in f_in:
# If line num is 0, grab n,m else put stripped sequence into S
if line_num == 0:
# get number of strings, length of strings
m,n = line.split(' ')
m = int(m)
n = int(n)
line_num += 1
else:
S.append(str(line.strip()))
line_num += 1
# Write LP
# We want to minimize d subject to constraints
f_out.write('Minimize\n')
f_out.write(' obj: d\n')
f_out.write('Subject To\n')
count = 0 # Constraint counter
# For each sequence in S
for s in S:
# Write first part - c#:
equation = ' '
equation += 'c' + str(count) + ': '
count += 1
# For each index, value in sequence s
for i,v in enumerate(s):
equation += ' - ' + str(v) + str(i)
equation += ' - d <= - ' + str(n) + '\n'
f_out.write(equation)
# All Xi's must add up to 1
for number in range(n):
constraint = ' c' + str(count) + ': '
count += 1
for letter in bases:
constraint += ' + ' + letter + str(number)
constraint += ' = 1\n'
f_out.write(constraint)
# No bounds, just binary numbers
f_out.write('Bounds\n')
f_out.write('Binaries\n')
for number in range(n):
for letter in bases:
f_out.write(' '+ letter + str(number))
f_out.write('\n')
f_out.write('End')
f_in.close()
f_out.close()
# Solve the problem, output solution
lp_problem = cplex.Cplex()
print 'Solving ' + pre + '.lp\n'
lp_problem.read(path_to_outputs+pre+'.lp')
lp_problem.solve()
soln = ''
# If Xi = 1, append it to the solution as this is the center string in element i
for number in range(n):
for letter in bases:
if lp_problem.solution.get_values(letter+str(number)) == 1:
soln += letter
print "\n\nCenter string is: " + soln + "\n"
print "d is " + str(lp_problem.solution.get_values('d')) + "\n\n\n\n###################"
You can’t perform that action at this time.