Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
CenterStringLP/center_string.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
141 lines (109 sloc)
4.14 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
PROBLEM STATEMENT | |
The closest sequence problem is defined as follows: | |
Given m DNA sequences s1, . . . , sm, each of length n, find a DNA sequence t of length | |
n such that maxi=1,...,m dH(t, si) is minimized, where dH(t, si) denotes the Hamming distance | |
between t and si. | |
For this project you must implement a method for finding optimum solutions to the closest sequence | |
problem. You can either implement a branch-and-bound algorithm or use integer programming in | |
conjunction with optimization engines such as the GNU Linear Programming Kit (GLPK). | |
INPUT | |
Your program should read from the standard input a line containing integers m and n, followed by | |
m lines each containing a DNA sequence of length n. | |
Sample input: | |
10 25 | |
CTGGCGGTGGCTATCATCCGTCCCT | |
CATGCGAGTGGTCGGTGATAGCTCG | |
GAAGTGTGAGGAATCCGTAGAGAAT | |
GAACTAAGTAGTTCACCTTACCCTC | |
CCAACACTCATATCGTCTTGCTACT | |
TGACTCCTTTTTTATTCATATTTTC | |
AATACTCGACCTTCCACGAAGGCTG | |
GGATTCACCTCCCTTTCCGCTGAAT | |
CAGAGGTAAAAGAAAGGGGGACAAT | |
GATAATCGTAGAATTAAATAAGACA | |
OUTPUT | |
If you use integer programming, your program should print to the standard output an integer program | |
model of the input problem instance in lp format (see http://lpsolve.sourceforge.net/5.1/CPLEXformat.htm). | |
''' | |
# Imports | |
import os | |
import sys | |
# I need to symlink my Anaconda install with this :/ | |
sys.path.append('/usr/local/lib/python2.7/dist-packages') | |
import cplex | |
# Absolute path to inputs, outputs | |
path_to_inputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Inputs/' | |
path_to_outputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Outputs/' | |
# Master bases | |
bases = ['A','C','T','G'] | |
# For each input file... | |
for fn in os.listdir(path_to_inputs): | |
# Split filename into prefix, suffix | |
pre,suff = fn.split('.') | |
# Reset line counter, init array S to store sequences | |
line_num = 0 | |
S = [] | |
# Open the files | |
f_in = open(path_to_inputs+fn, 'r') | |
f_out = open(path_to_outputs+pre+'.lp', 'w') | |
# For each line in the input file... | |
for line in f_in: | |
# If line num is 0, grab n,m else put stripped sequence into S | |
if line_num == 0: | |
# get number of strings, length of strings | |
m,n = line.split(' ') | |
m = int(m) | |
n = int(n) | |
line_num += 1 | |
else: | |
S.append(str(line.strip())) | |
line_num += 1 | |
# Write LP | |
# We want to minimize d subject to constraints | |
f_out.write('Minimize\n') | |
f_out.write(' obj: d\n') | |
f_out.write('Subject To\n') | |
count = 0 # Constraint counter | |
# For each sequence in S | |
for s in S: | |
# Write first part - c#: | |
equation = ' ' | |
equation += 'c' + str(count) + ': ' | |
count += 1 | |
# For each index, value in sequence s | |
for i,v in enumerate(s): | |
equation += ' - ' + str(v) + str(i) | |
equation += ' - d <= - ' + str(n) + '\n' | |
f_out.write(equation) | |
# All Xi's must add up to 1 | |
for number in range(n): | |
constraint = ' c' + str(count) + ': ' | |
count += 1 | |
for letter in bases: | |
constraint += ' + ' + letter + str(number) | |
constraint += ' = 1\n' | |
f_out.write(constraint) | |
# No bounds, just binary numbers | |
f_out.write('Bounds\n') | |
f_out.write('Binaries\n') | |
for number in range(n): | |
for letter in bases: | |
f_out.write(' '+ letter + str(number)) | |
f_out.write('\n') | |
f_out.write('End') | |
f_in.close() | |
f_out.close() | |
# Solve the problem, output solution | |
lp_problem = cplex.Cplex() | |
print 'Solving ' + pre + '.lp\n' | |
lp_problem.read(path_to_outputs+pre+'.lp') | |
lp_problem.solve() | |
soln = '' | |
# If Xi = 1, append it to the solution as this is the center string in element i | |
for number in range(n): | |
for letter in bases: | |
if lp_problem.solution.get_values(letter+str(number)) == 1: | |
soln += letter | |
print "\n\nCenter string is: " + soln + "\n" | |
print "d is " + str(lp_problem.solution.get_values('d')) + "\n\n\n\n###################" |