From 233ab28befa8ce2ff80e830c439458aaa6b36c7d Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Sat, 22 Apr 2017 19:58:19 -0400 Subject: [PATCH] creates random concatenations of tupules from a set of genes --- mergeSeq.py | 453 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 453 insertions(+) create mode 100644 mergeSeq.py diff --git a/mergeSeq.py b/mergeSeq.py new file mode 100644 index 0000000..c28404a --- /dev/null +++ b/mergeSeq.py @@ -0,0 +1,453 @@ +import re, os, glob, itertools, fnmatch, sys, shutil +from capture_marglik_pol import capture_script + +from itertools import combinations + +script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) +path = os.path.join(script_dir, 'nexus') +#path = '/Users/suman/Documents/Postdoctoral_projects/Algae_data/two_tupules_run/Mar4/combo_script/nexus' + +first_combo = 700 +last_combo = 770 + +# create output directory, recursively deleting it if it already exists after asking permission +output = 'g-700-770' +if os.path.exists(output): + print 'Directory "%s" exists' % output + answer = raw_input('delete it (y/n)?') + if answer in ['y','yes','Y','Yes','YES']: + shutil.rmtree(output) + else: + sys.exit('Aborting because you did not answer "y" or "yes"') +os.mkdir(output) + +number_to_merge = 2 + +#### counting number of genes to be analyzed +number_of_genes = len(fnmatch.filter(os.listdir(path), '*.nex')) +number_gen = range(1, number_of_genes+1) + +#### counting number of nexus files to be written after combinations +combos = list(itertools.combinations(number_gen, number_to_merge)) +number_of_files = len(combos) +# print number_of_files + +# chunks are pieces of the chloroplast that have not been rearranged +# The largest chunk (1) has 7 genes, the smallest (16-23) have only 1 gene each +chunk_map = { +'rpl23':1, +'rpl2' :1, +'rps19':1, +'rps14':1, +'atpA' :1, +'psbI' :1, +'cemA' :1, +'psbE' :2, +'rps9' :2, +'ycf3' :2, +'rpl36':2, +'petD' :2, +'rpoA' :2, +'psbJ' :3, +'atpI' :3, +'psaJ' :3, +'rps12':3, +'rpl16':4, +'rpl14':4, +'rpl5' :4, +'rps8' :4, +'rpoBa':5, +'rpoBb':5, +'rps3' :5, +'rpoC2':5, +'psaB' :6, +'ccsA' :6, +'psbZ' :6, +'psbM' :6, +'psbF' :7, +'psbL' :7, +'petG' :7, +'psbB' :8, +'psbT' :8, +'rps4' :9, +'clpP' :9, +'rps18':10, +'petB' :10, +'rps7' :11, +'atpE' :11, +'atpH' :12, +'atpF' :12, +'psbH' :13, +'psbK' :13, +'psaC' :14, +'petL' :14, +'rbcL' :15, +'rps11':15, +'rpl20':16, +'tufA' :17, +'psaA' :18, +'psbD' :19, +'atpB' :20, +'psbA' :21, +'psbC' :22, +'psbN' :23 +} + +##### nexus file for each merged pair + +new_way = True +genes = [] +data = {} +gene_javascript_data = [] +print 'Reading nexus files...' +for filename in glob.glob(os.path.join(path, '*.nex')): +# print 'Reading file:',filename + #pol m = re.match('(.+)[.]nex', os.path.basename(filename)) + m = re.match('(.+)[-]stripped[.]nex', os.path.basename(filename)) + gene_name = m.group(1) + + genes.append(gene_name) + f = open(filename, 'r').read() + m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S) + ntax = int(m.group(1)) + m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S) + nchar = int(m.group(1)) + + gene_javascript_data.append((chunk_map[gene_name], gene_name, nchar)) + + m = re.search('Matrix\s+(.+?);', f, re.M | re.S) + matrix = m.group(1).strip() + + matrix_lines = matrix.split('\n') + + taxon_names = [] + sequences = {} + for line in matrix_lines: + parts = line.strip().split() + assert len(parts) == 2 + taxon_name = parts[0] + sequence = parts[1] + taxon_names.append(taxon_name) + sequences[taxon_name] = sequence + + if not new_way: + if len(data) == 0: + for t in taxon_names: + data[t] = [] + + for t in taxon_names: + if new_way: + group = data.setdefault(t, []) + group.append(sequences[t]) + else: + data[t].append(sequences[t]) + +print 'Found %d genes' % len(genes) +for c,g,n in sorted(gene_javascript_data): + print ' {name:"%s", chunk:%d, seqlen:%d},' % (g,c,n) + +taxa_sorted = sorted(data.keys()) +# print "taxa_sorted=", taxa_sorted + +number_of_genes = len(genes) +number_gen = range(1, number_of_genes+1) + +#### counting number of nexus files to be written after combinations +combos = list(itertools.combinations(number_gen, number_to_merge)) +number_of_combos = len(combos) +print 'Number of combinations:',number_of_combos + +combo_list=[] +current_combo = 0 +for a,b in combos: + current_combo += 1 + if current_combo <= first_combo or current_combo > last_combo: + continue + print 'current_combo=', current_combo-1 + + new_taxonlist = [] + se1 = [] + se2 = [] + new_sequences = [] + seq1 = [] + seq2 = [] + genes_merged = (genes[a-1]+'_'+genes[b-1]) +# print 'genes[a-1]==', genes[a-1] +# print 'genes[b-1]==', genes[b-1] +# + for t in taxa_sorted: + new_taxonlist.append((t,data[t][a-1],data[t][b-1])[0]) + new_sequences.append((data[t][a-1]+data[t][b-1])) + seq1.append((data[t][a-1])) + seq2.append((data[t][b-1])) +# print 'new_taxonlist==', new_taxonlist + ntax = len(new_taxonlist) + newnchar = len(new_sequences[0]) + nchar_gene1 = len(seq1[0]) + nchar_gene2 = len(seq2[0]) +# print 'nchar_gene1====', nchar_gene1 +# print 'nchar_gene2====', nchar_gene2 + + new_nexus_file = '%s.nex' % genes_merged + gene_dir = os.path.join(output, '%s' % (genes_merged)) + if not os.path.exists(gene_dir): + os.mkdir(gene_dir) + + combo_list.append(genes_merged) + + mrbayes_single_run1 = '''begin mrbayes; +delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; +charset first = 1-%d\\3; +charset second = 2-%d\\3; +charset third = 3-%d\\3; +partition mine = 3: first, second, third; +set partition=mine; +lset applyto=(all) nst=6 ngammacat=4 rates=gamma; +prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); +unlink shape=(all) statefreq=(all) revmat=(all); +set seed=9223 swapseed=9223; +mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; +[sumt filename=mcmc;] +[mcmcp filename=ss;] +ss alpha=0.3 nsteps=30 burninss=-2; +end; +'''% (nchar_gene1, nchar_gene1, nchar_gene1) + + mrbayes_single_run2 = '''begin mrbayes; +delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; +charset first = 1-%d\\3; +charset second = 2-%d\\3; +charset third = 3-%d\\3; +partition mine = 3: first, second, third; +set partition=mine; +lset applyto=(all) nst=6 ngammacat=4 rates=gamma; +prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); +unlink shape=(all) statefreq=(all) revmat=(all); +set seed=9223 swapseed=9223; +mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; +[sumt filename=mcmc;] +[mcmcp filename=ss;] +ss alpha=0.3 nsteps=30 burninss=-2; +end; +'''% (nchar_gene2, nchar_gene2, nchar_gene2) + + + mrbayes_concat_topo_brlen = '''begin mrbayes; +delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; +charset first = 1-%d\\3; +charset second = 2-%d\\3; +charset third = 3-%d\\3; +charset fourth = %d-%d\\3; +charset fifth = %d-%d\\3; +charset sixth = %d-%d\\3; +partition mine = 6: first, second, third, fourth, fifth, sixth; +set partition=mine; +lset applyto=(all) nst=6 ngammacat=4 rates=gamma; +prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); +unlink shape=(all) statefreq=(all) revmat=(all); +set seed=9223 swapseed=9223; +mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; +[sumt filename=mcmc;] +[mcmcp filename=ss;] +ss alpha=0.3 nsteps=30 burninss=-2; +end; +'''% (nchar_gene1, nchar_gene1, nchar_gene1, nchar_gene1+1, newnchar, nchar_gene1+2, newnchar, nchar_gene1+3,newnchar) + + mrbayes_concat_topo = '''begin mrbayes; +delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; +charset first = 1-%d\\3; +charset second = 2-%d\\3; +charset third = 3-%d\\3; +charset fourth = %d-%d\\3; +charset fifth = %d-%d\\3; +charset sixth = %d-%d\\3; +partition mine = 6: first, second, third, fourth, fifth, sixth; +set partition=mine; +lset applyto=(all) nst=6 ngammacat=4 rates=gamma; +prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); +unlink shape=(all) statefreq=(all) revmat=(all) brlens=(all); +set seed=9223 swapseed=9223; +mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; +[sumt filename=mcmc;] +[mcmcp filename=ss;] +ss alpha=0.3 nsteps=30 burninss=-2; +end; +'''% (nchar_gene1, nchar_gene1, nchar_gene1, nchar_gene1+1, newnchar, nchar_gene1+2, newnchar, nchar_gene1+3,newnchar) + + +#concat_topo +###########submitallscript########### + submitallscript ='''cd %s; qsub qsub.sh; cd .. +cd %s; qsub qsub.sh; cd .. +cd concat_topo_brlen; qsub qsub.sh; cd .. +cd concat_topo; qsub qsub.sh; cd .. +'''%(genes[a-1], genes[b-1]) + submitall = os.path.join(gene_dir, 'submitall.sh') + newf = open(submitall, 'w') + newf.write(submitallscript) +###########submitallscript########### + + +###########qsubscript for concatenated set########### +# qsubscript = '''#!/bin/bash +# #$ -S /bin/bash +# #$ -cwd +# #$ -N concatenated +# #$ -q highpri.q,highmem.q +# mb concatenated.nex +# rm -f mcmc.* ss.*''' +###########qsubscript for concatenated set########### + + +###########gene1########### + single_gene_dir = os.path.join(gene_dir, genes[a-1]) + if not os.path.exists(single_gene_dir): + os.mkdir(single_gene_dir) + output_path = os.path.join(single_gene_dir, ('%s.nex' %(genes[a-1]))) + newf = open(output_path, 'w') + newf.write('#NEXUS\n\n') + newf.write('Begin data;\n') + newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, nchar_gene1)) + newf.write(' Format datatype=dna missing=? gap=-;\n') + newf.write(' Matrix\n') + longest_taxon_name = max([len(t) for t in new_taxonlist]) + for t,s in zip(new_taxonlist, seq1): + formatstr = '%%%ds' % longest_taxon_name + namestr = formatstr % t + newf.write(' %s %s\n' % (namestr, s)) + newf.write(';\n') + newf.write('end;\n\n') + newf.write(mrbayes_single_run1) + newf.close() + ##qsub_script##### + qsub = os.path.join(single_gene_dir, 'qsub.sh') + newf = open(qsub, 'w') + newf.write('''#!/bin/bash +#$ -S /bin/bash +#$ -cwd +#$ -N %s +#$ -q highpri.q,highmem.q +mb %s.nex +rm -f mcmc.* ss.*'''%(genes[a-1], genes[a-1])) + ##qsub_script##### +###########gene1########### + + +###########gene2########### + single_gene_dir = os.path.join(gene_dir, genes[b-1]) + if not os.path.exists(single_gene_dir): + os.mkdir(single_gene_dir) + output_path = os.path.join(single_gene_dir, ('%s.nex' %(genes[b-1]))) + newf = open(output_path, 'w') + newf.write('#NEXUS\n\n') + newf.write('Begin data;\n') + newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, nchar_gene2)) + newf.write(' Format datatype=dna missing=? gap=-;\n') + newf.write(' Matrix\n') + longest_taxon_name = max([len(t) for t in new_taxonlist]) + for t,s in zip(new_taxonlist, seq2): + formatstr = '%%%ds' % longest_taxon_name + namestr = formatstr % t + newf.write(' %s %s\n' % (namestr, s)) + newf.write(';\n') + newf.write('end;\n\n') + newf.write(mrbayes_single_run2) + newf.close() + ####qsub_script##### + qsub = os.path.join(single_gene_dir, 'qsub.sh') + newf = open(qsub, 'w') + newf.write('''#!/bin/bash +#$ -S /bin/bash +#$ -cwd +#$ -N %s +#$ -q highpri.q,highmem.q +mb %s.nex +rm -f mcmc.* ss.*'''%(genes[b-1], genes[b-1])) + ####qsub_script##### +###########gene2########### + + +###########concat_topo_brlen########### + concat_dir = os.path.join(gene_dir, 'concat_topo_brlen') + if not os.path.exists(concat_dir): + os.mkdir(concat_dir) + output_path = os.path.join(concat_dir, 'concatenated.nex') + newf = open(output_path, 'w') + newf.write('#NEXUS\n\n') + newf.write('Begin data;\n') + newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, newnchar)) + newf.write(' Format datatype=dna missing=? gap=-;\n') + newf.write(' Matrix\n') + longest_taxon_name = max([len(t) for t in new_taxonlist]) + for t,s in zip(new_taxonlist, new_sequences): + formatstr = '%%%ds' % longest_taxon_name + namestr = formatstr % t + newf.write(' %s %s\n' % (namestr, s)) + newf.write(';\n') + newf.write('end;\n\n') + newf.write(mrbayes_concat_topo_brlen) + newf.close() + ####qsub_script concat_topo_brlen##### + qsub = os.path.join(concat_dir, 'qsub.sh') + newf = open(qsub, 'w') + newf.write('''#!/bin/bash +#$ -S /bin/bash +#$ -cwd +#$ -N %s_%s_concat_topo_brlen +#$ -q highpri.q,highmem.q +mb concatenated.nex +rm -f mcmc.* ss.*'''%(genes[a-1],genes[b-1])) + ####qsub_script concat_topo_brlen##### +###########concat_topo_brlen########### + + +###########concat_topo########### + concat_dir = os.path.join(gene_dir, 'concat_topo') + if not os.path.exists(concat_dir): + os.mkdir(concat_dir) + output_path = os.path.join(concat_dir, 'concatenated.nex') + newf = open(output_path, 'w') + newf.write('#NEXUS\n\n') + newf.write('Begin data;\n') + newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, newnchar)) + newf.write(' Format datatype=dna missing=? gap=-;\n') + newf.write(' Matrix\n') + longest_taxon_name = max([len(t) for t in new_taxonlist]) + for t,s in zip(new_taxonlist, new_sequences): + formatstr = '%%%ds' % longest_taxon_name + namestr = formatstr % t + newf.write(' %s %s\n' % (namestr, s)) + newf.write(';\n') + newf.write('end;\n\n') + newf.write(mrbayes_concat_topo) + newf.close() + ####qsub_script concat_topo##### + qsub = os.path.join(concat_dir, 'qsub.sh') + newf = open(qsub, 'w') + newf.write('''#!/bin/bash +#$ -S /bin/bash +#$ -cwd +#$ -N %s_%s_concat_topo +#$ -q highpri.q,highmem.q +mb concatenated.nex +rm -f mcmc.* ss.*'''%(genes[a-1],genes[b-1])) + ####qsub_script concat_topo##### +###########concat_topo########### + + +####### go.sh ######### +jobscript = os.path.join(output, 'go.sh') +newf = open(jobscript, 'w') +newf.write('#!/bin/bash\n\n') +for i in combo_list: + newf.write('cd %s\n. submitall.sh\ncd ..\n\n' %(i)) +newf.close() + +####### capture_marg.py ######### +jobscript2 = os.path.join(output, 'capture_marg.py') +newf = open(jobscript2, 'w') +newf.write(capture_script) +newf.close() + + +