# center_string.py

'''
PROBLEM STATEMENT

    The closest sequence problem is defined as follows:
    Given m DNA sequences s_1, ..., s_m, each of length n, find a DNA sequence t of length
    n such that max_{i=1,...,m} d_H(t, s_i) is minimized, where d_H(t, s_i) denotes the Hamming
    distance between t and s_i.

    For this project you must implement a method for finding optimum solutions to the closest sequence
    problem. You can either implement a branch-and-bound algorithm or use integer programming in
    conjunction with optimization engines such as the GNU Linear Programming Kit (GLPK).

INPUT

    Your program should read from the standard input a line containing integers m and n, followed by
    m lines each containing a DNA sequence of length n.
    Sample input:
        10 25
        CTGGCGGTGGCTATCATCCGTCCCT
        CATGCGAGTGGTCGGTGATAGCTCG
        GAAGTGTGAGGAATCCGTAGAGAAT
        GAACTAAGTAGTTCACCTTACCCTC
        CCAACACTCATATCGTCTTGCTACT
        TGACTCCTTTTTTATTCATATTTTC
        AATACTCGACCTTCCACGAAGGCTG
        GGATTCACCTCCCTTTCCGCTGAAT
        CAGAGGTAAAAGAAAGGGGGACAAT
        GATAATCGTAGAATTAAATAAGACA


OUTPUT

    If you use integer programming, your program should print to the standard output an integer program
    model of the input problem instance in lp format (see http://lpsolve.sourceforge.net/5.1/CPLEXformat.htm).

'''

# Imports
import os
import sys

sys.path.append('/usr/local/lib/python2.7/dist-packages')
import cplex


# Absolute directories (each with a trailing slash): where the problem
# instances are read from, and where the generated .lp models are written.
path_to_inputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Inputs/'
path_to_outputs = '/home/moria/Projects/Bioinformatics/CenterStringLP/Outputs/'

# DNA alphabet; the LP model uses one binary variable per (base, position).
bases = list('ACTG')

def _read_instance(path):
    '''Parse one closest-sequence instance file.

    The first non-blank line must hold two integers (m, n); every further
    non-blank line is taken as one DNA sequence.  Blank lines are skipped:
    an empty sequence would otherwise become the constraint "- d <= - n",
    which forces d = n and destroys the optimum.

    Returns a tuple (m, n, sequences).
    Raises ValueError if the file is empty, the header is malformed, or a
    sequence has the wrong length or a character outside `bases`.
    '''
    with open(path, 'r') as f_in:
        rows = [row.strip() for row in f_in]
    rows = [row for row in rows if row]
    if not rows:
        raise ValueError('Empty input file: ' + path)

    header = rows[0].split()    # tolerates tabs / repeated spaces
    if len(header) != 2:
        raise ValueError('Expected "m n" on the first line of ' + path)
    m, n = int(header[0]), int(header[1])

    sequences = rows[1:]
    for seq in sequences:
        if len(seq) != n:
            raise ValueError('Sequence "' + seq + '" in ' + path +
                             ' does not have length ' + str(n))
        if any(ch not in bases for ch in seq):
            # An unknown symbol would silently introduce a new, unconstrained
            # LP variable instead of failing.
            raise ValueError('Sequence "' + seq + '" in ' + path +
                             ' contains a character outside ' + ''.join(bases))
    return m, n, sequences


def _wrap_terms(label, terms, per_line=10):
    '''Lay out `terms` after `label`, at most `per_line` terms per text line.'''
    rows = []
    for start in range(0, len(terms), per_line):
        prefix = label if start == 0 else '  '
        rows.append(prefix + ' ' + ' '.join(terms[start:start + per_line]))
    return rows


def _build_lp(sequences, n):
    '''Return the LP-format text of the model for `sequences` of length `n`.

    Variables: binary <base><pos> is 1 when the center string carries <base>
    at position <pos>; d bounds the largest Hamming distance.
      * per sequence s:  sum_i s[i]<i> + d >= n   (n - matches <= d),
        written as  - ... - d <= - n
      * per position:    exactly one base is selected.
    '''
    lines = ['Minimize', ' obj: d', 'Subject To']
    count = 0   # Constraint counter

    for seq in sequences:
        terms = ['- ' + base + str(pos) for pos, base in enumerate(seq)]
        terms.append('- d <= - ' + str(n))
        # Long constraints are spread over several short lines rather than
        # emitted as one line whose length grows with n.
        lines.extend(_wrap_terms(' c' + str(count) + ':', terms))
        count += 1

    for pos in range(n):
        picks = ' '.join('+ ' + base + str(pos) for base in bases)
        lines.append(' c' + str(count) + ': ' + picks + ' = 1')
        count += 1

    lines.append('Bounds')
    lines.append('Binaries')
    for pos in range(n):
        lines.append(' ' + ' '.join(base + str(pos) for base in bases))
    lines.append('End')
    return '\n'.join(lines) + '\n'


def _center_base(solution, pos):
    '''Return the base whose variable has the largest value at `pos`.

    Solver values are floats (a binary may come back as 0.9999999), so the
    maximum is taken instead of testing for equality with 1.
    '''
    return max(bases, key=lambda base: solution.get_values(base + str(pos)))


# Build, write and solve one LP model per file found in the input directory.
for fn in sorted(os.listdir(path_to_inputs)):
    in_path = os.path.join(path_to_inputs, fn)
    if not os.path.isfile(in_path):
        continue    # ignore sub-directories

    # splitext copes with names containing no dot or several dots
    pre = os.path.splitext(fn)[0]
    lp_path = os.path.join(path_to_outputs, pre + '.lp')

    m, n, S = _read_instance(in_path)
    if len(S) != m:
        # The model is built from the sequences actually present; only warn.
        sys.stderr.write('Warning: ' + fn + ' declares ' + str(m) +
                         ' sequences but contains ' + str(len(S)) + '\n')

    # Write LP
    with open(lp_path, 'w') as f_out:
        f_out.write(_build_lp(S, n))

    # Solve the problem, output solution
    lp_problem = cplex.Cplex()
    print('Solving ' + pre + '.lp\n')
    lp_problem.read(lp_path)
    lp_problem.solve()

    soln = ''.join(_center_base(lp_problem.solution, pos) for pos in range(n))
    print("Center string is:    " + soln + "\n")
    print("d is " + str(lp_problem.solution.get_values('d')) + "\n")