Permalink
##
CenterStringLP/

Go to file
Cannot retrieve contributors at this time

##
CenterStringLP/**center_string.py**

Go to file
''' | |

PROBLEM STATEMENT | |

The closest sequence problem is defined as follows: | |

Given m DNA sequences s1, . . . , sm, each of length n, find a DNA sequence t of length | |

n such that maxi=1,...,m dH(t, si) is minimized, where dH(t, si) denotes the Hamming distance | |

between t and si. | |

For this project you must implement a method for finding optimum solutions to the closest sequence | |

problem. You can either implement a branch-and-bound algorithm or use integer programming in | |

conjunction with optimization engines such as the GNU Linear Programming Kit (GLPK). | |

INPUT | |

Your program should read from the standard input a line containing integers m and n, followed by | |

m lines each containing a DNA sequence of length n. | |

Sample input: | |

10 25 | |

CTGGCGGTGGCTATCATCCGTCCCT | |

CATGCGAGTGGTCGGTGATAGCTCG | |

GAAGTGTGAGGAATCCGTAGAGAAT | |

GAACTAAGTAGTTCACCTTACCCTC | |

CCAACACTCATATCGTCTTGCTACT | |

TGACTCCTTTTTTATTCATATTTTC | |

AATACTCGACCTTCCACGAAGGCTG | |

GGATTCACCTCCCTTTCCGCTGAAT | |

CAGAGGTAAAAGAAAGGGGGACAAT | |

GATAATCGTAGAATTAAATAAGACA | |

OUTPUT | |

If you use integer programming, your program should print to the standard output an integer program | |

model of the input problem instance in lp format (see http://lpsolve.sourceforge.net/5.1/CPLEXformat.htm). | |

''' | |

# Imports | |

import os | |

import sys | |

# I need to symlink my Anaconda install with this :/ | |

sys.path.append('/usr/local/lib/python2.7/dist-packages') | |

import cplex | |

# Absolute path to inputs, outputs | |

path_to_inputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Inputs/' | |

path_to_outputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Outputs/' | |

# Master bases | |

bases = ['A','C','T','G'] | |

# For each input file... | |

for fn in os.listdir(path_to_inputs): | |

# Split filename into prefix, suffix | |

pre,suff = fn.split('.') | |

# Reset line counter, init array S to store sequences | |

line_num = 0 | |

S = [] | |

# Open the files | |

f_in = open(path_to_inputs+fn, 'r') | |

f_out = open(path_to_outputs+pre+'.lp', 'w') | |

# For each line in the input file... | |

for line in f_in: | |

# If line num is 0, grab n,m else put stripped sequence into S | |

if line_num == 0: | |

# get number of strings, length of strings | |

m,n = line.split(' ') | |

m = int(m) | |

n = int(n) | |

line_num += 1 | |

else: | |

S.append(str(line.strip())) | |

line_num += 1 | |

# Write LP | |

# We want to minimize d subject to constraints | |

f_out.write('Minimize\n') | |

f_out.write(' obj: d\n') | |

f_out.write('Subject To\n') | |

count = 0 # Constraint counter | |

# For each sequence in S | |

for s in S: | |

# Write first part - c#: | |

equation = ' ' | |

equation += 'c' + str(count) + ': ' | |

count += 1 | |

# For each index, value in sequence s | |

for i,v in enumerate(s): | |

equation += ' - ' + str(v) + str(i) | |

equation += ' - d <= - ' + str(n) + '\n' | |

f_out.write(equation) | |

# All Xi's must add up to 1 | |

for number in range(n): | |

constraint = ' c' + str(count) + ': ' | |

count += 1 | |

for letter in bases: | |

constraint += ' + ' + letter + str(number) | |

constraint += ' = 1\n' | |

f_out.write(constraint) | |

# No bounds, just binary numbers | |

f_out.write('Bounds\n') | |

f_out.write('Binaries\n') | |

for number in range(n): | |

for letter in bases: | |

f_out.write(' '+ letter + str(number)) | |

f_out.write('\n') | |

f_out.write('End') | |

f_in.close() | |

f_out.close() | |

# Solve the problem, output solution | |

lp_problem = cplex.Cplex() | |

print 'Solving ' + pre + '.lp\n' | |

lp_problem.read(path_to_outputs+pre+'.lp') | |

lp_problem.solve() | |

soln = '' | |

# If Xi = 1, append it to the solution as this is the center string in element i | |

for number in range(n): | |

for letter in bases: | |

if lp_problem.solution.get_values(letter+str(number)) == 1: | |

soln += letter | |

print "\n\nCenter string is: " + soln + "\n" | |

print "d is " + str(lp_problem.solution.get_values('d')) + "\n\n\n\n###################" |