diff --git a/Inputs/input1.txt b/Inputs/input1.txt new file mode 100644 index 0000000..046ec89 --- /dev/null +++ b/Inputs/input1.txt @@ -0,0 +1,11 @@ +10 5 +ACCAC +ACAAC +CCACC +ACAAC +AACCA +CAACC +CCAAC +CAACA +AACAA +CAACA diff --git a/Inputs/input2.txt b/Inputs/input2.txt new file mode 100644 index 0000000..36b1e24 --- /dev/null +++ b/Inputs/input2.txt @@ -0,0 +1,21 @@ +20 10 +ACCCACCCCC +ACAACAACCA +ACAACCACCA +AACAACCACC +CCACCCCCCA +ACCACCACCC +AACAACAACC +CCAACAACAA +CCACCACCCA +CCCCCCACAC +ACCACCCACC +CACCCACCCC +CACCCCCCAC +AACCACCACC +CCCACCCCCC +CAACCACCAC +CACCACCCAC +ACCCCCCACA +CCACCCACCC +CAACAACCAC diff --git a/Inputs/input3.txt b/Inputs/input3.txt new file mode 100644 index 0000000..15d35dc --- /dev/null +++ b/Inputs/input3.txt @@ -0,0 +1,11 @@ +10 25 +CATACGTAAAAGAAAGGCGGACAAT +AAGAGGAAAAAGAAACGGGGACAAT +CAGAGGTAAAAGTAAGGCGGACTAT +GAGAGGTATAAGAAAGCGGGACAAT +GAGAGGTAAAAGTAAGGGGGACAAA +CAGAGGTAACAGAAAGGCGGACGAT +TAGAGGTAACAGAAAGCGGGACAAT +CAGACGTTAAAGAAAGGCGGACAAT +CCGAGGTAAAACAAAGGGGAACAAT +AAGAGTTAAAAGAAAGGGGGAAAAT diff --git a/Inputs/input4.txt b/Inputs/input4.txt new file mode 100644 index 0000000..70e1820 --- /dev/null +++ b/Inputs/input4.txt @@ -0,0 +1,11 @@ +10 100 +CTGGCGGTGGCTATCATCCGTCCCTCATGCGAGTGGTCGGTGATAGCTCGGAAGTGTGAGGAATCCGTAGAGAATGAACTAAGTAGTTCACCTTACCCTC +CCAACACTCATATCGTCTTGCTACTTGACTCCTTTTTTATTCATATTTTCAATACTCGACCTTCCACGAAGGCTGGGATTCACCTCCCTTTCCGCTGAAT +CAGAGGTAAAAGAAAGGGGGACAATGATAATCGTAGAATTAAATAAGACAGGTGTCGATTACGACCCATTTCCCTTCGCTTACGGATGTATAGGCGTCTC +AGTATAGGGTGAAATAGCGGCTATAACCCATTTCCATGCGGACTCGGAACTGCTAGTAGTCTCAGTCATCGGGATCAACGTTGATATGCTAGGTCCGAGA +GGAGGTGTCTACAGACAGCCGCCCAAAGTAAGGCGGAATGGTCGTAAGAGCTTTCCTCGTCGTCGACTAAGATTATCTTCTTATGAAACAACGAGACCTT +CAAAATTGAAAGTCTGTAAGGATATAGAGGACTCCCGCATTTCAGCAACCTATGATGGCTAGGCTTCATCTACCCCGCTGGGTCTCATCCCTGGTTTTCC +GGCCGTCGAGCCTGCTCCAATTCTCCACTATCGGGCTTGCCCCTAGTAAGAAGCGCTCAGTCCACGGTACGGCAACGCAGTAAAAACACTTAGACTAAAG 
+CATTGACTATAGCTTGAGTCGCGTGCATGTTGTTACAATCCTCAACCCTCGGGCGAGCGGAACTTGTCTTCTCAGCTTGCTTTCAAAGGCCTTACCCTTC +GTCCGCACTTGCCTACCTAAGGCTGGACGCAACACCAATATTAACGGGCTTAGCCTGGAGTTGACCGGATCCTGGCCGACCTCTACTCGGCGCCCGTTGC +GCACCCATCTCAGTAGTGTCACGAACGAGGATAGCTGACGATGTAAATGTTCTGGTCAAGCCGGCATTGCGGTGTGAATTAACTATCTGCCTACGCAGGG diff --git a/center_string.py b/center_string.py new file mode 100644 index 0000000..4b1a2ab --- /dev/null +++ b/center_string.py @@ -0,0 +1,98 @@ +''' +PROBLEM STATEMENT + + The closest sequence problem is defined as follows: + Given m DNA sequences s1, . . . , sm, each of length n, find a DNA sequence t of length + n such that maxi=1,...,m dH(t, si) is minimized, where dH(t, si) denotes the Hamming distance + between t and si. + + For this project you must implement a method for finding optimum solutions to the closest sequence + problem. You can either implement a branch-and-bound algorithm or use integer programming in + conjunction with optimization engines such as the GNU Linear Programming Kit (GLPK). + +INPUT + + Your program should read from the standard input a line containing integers m and n, followed by + m lines each containing a DNA sequence of length n. + Sample input: + 10 25 + CTGGCGGTGGCTATCATCCGTCCCT + CATGCGAGTGGTCGGTGATAGCTCG + GAAGTGTGAGGAATCCGTAGAGAAT + GAACTAAGTAGTTCACCTTACCCTC + CCAACACTCATATCGTCTTGCTACT + TGACTCCTTTTTTATTCATATTTTC + AATACTCGACCTTCCACGAAGGCTG + GGATTCACCTCCCTTTCCGCTGAAT + CAGAGGTAAAAGAAAGGGGGACAAT + GATAATCGTAGAATTAAATAAGACA + + +OUTPUT + + If you use integer programming, your program should print to the standard output an integer program + model of the input problem instance in lp format (see http://lpsolve.sourceforge.net/5.1/CPLEXformat.htm). 
# Imports
import os
import sys

# Default absolute paths to the input instances and to the generated models.
# They are only defaults: both can be overridden on the command line
# (python center_string.py INPUT_DIR OUTPUT_DIR).
path_to_inputs = "/home/moria/Projects/Bioinformatics/CenterStringLP/Inputs/"
path_to_outputs = "/home/moria/Projects/Bioinformatics/CenterStringLP/Outputs/"
bases = ['A','C','T','G']


def parse_instance(lines):
    """Parse one closest-sequence instance.

    ``lines`` is any iterable of text lines (an open file works).  The first
    non-blank line holds the integers ``m`` and ``n``; it is followed by ``m``
    sequences of length ``n`` over the alphabet in ``bases``.  Blank lines are
    ignored so a trailing newline at the end of the file does not turn into an
    empty sequence (which would force d >= n in the model).

    Returns the tuple ``(m, n, sequences)``.
    Raises ``ValueError`` when the header is missing/malformed, when the number
    of sequences differs from ``m``, or when a sequence has the wrong length or
    contains a character outside ``bases``.
    """
    header = None
    sequences = []
    for raw in lines:
        text = raw.strip()
        if not text:
            continue
        if header is None:
            fields = text.split()
            if len(fields) != 2:
                raise ValueError("expected 'm n' on the first line, got %r" % text)
            header = (int(fields[0]), int(fields[1]))
        else:
            sequences.append(text)

    if header is None:
        raise ValueError("empty instance: missing 'm n' header line")
    m, n = header

    if len(sequences) != m:
        raise ValueError("header announces %d sequences, found %d"
                         % (m, len(sequences)))
    for seq in sequences:
        if len(seq) != n:
            raise ValueError("sequence %r has length %d, expected %d"
                             % (seq, len(seq), n))
        for ch in seq:
            # An unknown letter would create a variable that is never declared
            # binary, silently corrupting the model.
            if ch not in bases:
                raise ValueError("sequence %r contains invalid base %r"
                                 % (seq, ch))
    return m, n, sequences


def build_lp(n, sequences, alphabet=None):
    """Return the integer program for the instance as CPLEX LP text.

    Variables: binary ``<base><position>`` (e.g. ``A0``) is 1 when the center
    string carries that base at that position; ``d`` is the maximum Hamming
    distance being minimized.

    Constraints:
      * one row per input sequence: (number of matching positions) + d >= n,
        i.e. d is at least the Hamming distance to that sequence;
      * one row per position: exactly one base is selected.

    Constants are kept on the right-hand side and every row uses a single
    sense, which is the plain constraint form of the LP format.
    """
    if alphabet is None:
        alphabet = bases

    out = ["Minimize", " obj: d", "Subject To"]
    count = 0

    for seq in sequences:
        terms = [ch + str(i) for i, ch in enumerate(seq)]
        terms.append("d")
        out.append(" c%d: %s >= %d" % (count, " + ".join(terms), n))
        count += 1

    for position in range(n):
        choice = " + ".join(letter + str(position) for letter in alphabet)
        out.append(" c%d: %s = 1" % (count, choice))
        count += 1

    out.append("Binaries")
    for position in range(n):
        for letter in alphabet:
            out.append(" " + letter + str(position))

    # The LP format requires an explicit terminator.
    out.append("End")
    return "\n".join(out) + "\n"


def main(input_dir=path_to_inputs, output_dir=path_to_outputs):
    """Write ``<name>.lp`` into ``output_dir`` for every file of ``input_dir``."""
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for fn in sorted(os.listdir(input_dir)):
        in_path = os.path.join(input_dir, fn)
        if not os.path.isfile(in_path):
            continue

        # splitext copes with names that contain no dot or several dots.
        pre = os.path.splitext(fn)[0]

        with open(in_path, 'r') as f_in:
            m, n, S = parse_instance(f_in)

        with open(os.path.join(output_dir, pre + '.lp'), 'w') as f_out:
            f_out.write(build_lp(n, S))


if __name__ == "__main__":
    # Optional overrides: center_string.py [INPUT_DIR [OUTPUT_DIR]]
    main(*sys.argv[1:3])