Permalink
Cannot retrieve contributors at this time
CenterStringLP/center_string.py
Go to file''' | |
PROBLEM STATEMENT | |
The closest sequence problem is defined as follows: | |
Given m DNA sequences s1, . . . , sm, each of length n, find a DNA sequence t of length | |
n such that maxi=1,...,m dH(t, si) is minimized, where dH(t, si) denotes the Hamming distance | |
between t and si. | |
For this project you must implement a method for finding optimum solutions to the closest sequence | |
problem. You can either implement a branch-and-bound algorithm or use integer programming in | |
conjunction with optimization engines such as the GNU Linear Programming Kit (GLPK). | |
INPUT | |
Your program should read from the standard input a line containing integers m and n, followed by | |
m lines each containing a DNA sequence of length n. | |
Sample input: | |
10 25 | |
CTGGCGGTGGCTATCATCCGTCCCT | |
CATGCGAGTGGTCGGTGATAGCTCG | |
GAAGTGTGAGGAATCCGTAGAGAAT | |
GAACTAAGTAGTTCACCTTACCCTC | |
CCAACACTCATATCGTCTTGCTACT | |
TGACTCCTTTTTTATTCATATTTTC | |
AATACTCGACCTTCCACGAAGGCTG | |
GGATTCACCTCCCTTTCCGCTGAAT | |
CAGAGGTAAAAGAAAGGGGGACAAT | |
GATAATCGTAGAATTAAATAAGACA | |
OUTPUT | |
If you use integer programming, your program should print to the standard output an integer program | |
model of the input problem instance in lp format (see http://lpsolve.sourceforge.net/5.1/CPLEXformat.htm). | |
''' | |
# Imports | |
import os | |
import sys | |
# I need to symlink my Anaconda install with this :/ | |
sys.path.append('/usr/local/lib/python2.7/dist-packages') | |
import cplex | |
# Absolute path to inputs, outputs | |
path_to_inputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Inputs/' | |
path_to_outputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Outputs/' | |
# Master bases | |
bases = ['A','C','T','G'] | |
# For each input file... | |
for fn in os.listdir(path_to_inputs): | |
# Split filename into prefix, suffix | |
pre,suff = fn.split('.') | |
# Reset line counter, init array S to store sequences | |
line_num = 0 | |
S = [] | |
# Open the files | |
f_in = open(path_to_inputs+fn, 'r') | |
f_out = open(path_to_outputs+pre+'.lp', 'w') | |
# For each line in the input file... | |
for line in f_in: | |
# If line num is 0, grab n,m else put stripped sequence into S | |
if line_num == 0: | |
# get number of strings, length of strings | |
m,n = line.split(' ') | |
m = int(m) | |
n = int(n) | |
line_num += 1 | |
else: | |
S.append(str(line.strip())) | |
line_num += 1 | |
# Write LP | |
# We want to minimize d subject to constraints | |
f_out.write('Minimize\n') | |
f_out.write(' obj: d\n') | |
f_out.write('Subject To\n') | |
count = 0 # Constraint counter | |
# For each sequence in S | |
for s in S: | |
# Write first part - c#: | |
equation = ' ' | |
equation += 'c' + str(count) + ': ' | |
count += 1 | |
# For each index, value in sequence s | |
for i,v in enumerate(s): | |
equation += ' - ' + str(v) + str(i) | |
equation += ' - d <= - ' + str(n) + '\n' | |
f_out.write(equation) | |
# All Xi's must add up to 1 | |
for number in range(n): | |
constraint = ' c' + str(count) + ': ' | |
count += 1 | |
for letter in bases: | |
constraint += ' + ' + letter + str(number) | |
constraint += ' = 1\n' | |
f_out.write(constraint) | |
# No bounds, just binary numbers | |
f_out.write('Bounds\n') | |
f_out.write('Binaries\n') | |
for number in range(n): | |
for letter in bases: | |
f_out.write(' '+ letter + str(number)) | |
f_out.write('\n') | |
f_out.write('End') | |
f_in.close() | |
f_out.close() | |
# Solve the problem, output solution | |
lp_problem = cplex.Cplex() | |
print 'Solving ' + pre + '.lp\n' | |
lp_problem.read(path_to_outputs+pre+'.lp') | |
lp_problem.solve() | |
soln = '' | |
# If Xi = 1, append it to the solution as this is the center string in element i | |
for number in range(n): | |
for letter in bases: | |
if lp_problem.solution.get_values(letter+str(number)) == 1: | |
soln += letter | |
print "\n\nCenter string is: " + soln + "\n" | |
print "d is " + str(lp_problem.solution.get_values('d')) + "\n\n\n\n###################" |