From 654ed682b27c9401b0ab9d3f6d6a374ea41748b0 Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:54:41 -0400 Subject: [PATCH] Delete mergeSeq.py --- mergeSeq.py | 453 ---------------------------------------------------- 1 file changed, 453 deletions(-) delete mode 100644 mergeSeq.py diff --git a/mergeSeq.py b/mergeSeq.py deleted file mode 100644 index c28404a..0000000 --- a/mergeSeq.py +++ /dev/null @@ -1,453 +0,0 @@ -import re, os, glob, itertools, fnmatch, sys, shutil -from capture_marglik_pol import capture_script - -from itertools import combinations - -script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) -path = os.path.join(script_dir, 'nexus') -#path = '/Users/suman/Documents/Postdoctoral_projects/Algae_data/two_tupules_run/Mar4/combo_script/nexus' - -first_combo = 700 -last_combo = 770 - -# create output directory, recursively deleting it if it already exists after asking permission -output = 'g-700-770' -if os.path.exists(output): - print 'Directory "%s" exists' % output - answer = raw_input('delete it (y/n)?') - if answer in ['y','yes','Y','Yes','YES']: - shutil.rmtree(output) - else: - sys.exit('Aborting because you did not answer "y" or "yes"') -os.mkdir(output) - -number_to_merge = 2 - -#### counting number of genes to be analyzed -number_of_genes = len(fnmatch.filter(os.listdir(path), '*.nex')) -number_gen = range(1, number_of_genes+1) - -#### counting number of nexus files to be written after combinations -combos = list(itertools.combinations(number_gen, number_to_merge)) -number_of_files = len(combos) -# print number_of_files - -# chunks are pieces of the chloroplast that have not been rearranged -# The largest chunk (1) has 7 genes, the smallest (16-23) have only 1 gene each -chunk_map = { -'rpl23':1, -'rpl2' :1, -'rps19':1, -'rps14':1, -'atpA' :1, -'psbI' :1, -'cemA' :1, -'psbE' :2, -'rps9' :2, -'ycf3' :2, -'rpl36':2, -'petD' :2, -'rpoA' :2, -'psbJ' :3, -'atpI' :3, -'psaJ' :3, -'rps12':3, -'rpl16':4, -'rpl14':4, -'rpl5' :4, -'rps8' :4, -'rpoBa':5, -'rpoBb':5, -'rps3' :5, -'rpoC2':5, -'psaB' :6, -'ccsA' :6, -'psbZ' :6, -'psbM' :6, -'psbF' :7, -'psbL' :7, -'petG' :7, -'psbB' :8, -'psbT' :8, -'rps4' :9, -'clpP' :9, -'rps18':10, -'petB' :10, -'rps7' :11, -'atpE' :11, -'atpH' :12, -'atpF' :12, -'psbH' :13, -'psbK' :13, -'psaC' :14, -'petL' :14, -'rbcL' :15, -'rps11':15, -'rpl20':16, -'tufA' :17, -'psaA' :18, -'psbD' :19, -'atpB' :20, -'psbA' :21, -'psbC' :22, -'psbN' :23 -} - -##### nexus file for each merged pair - -new_way = True -genes = [] -data = {} -gene_javascript_data = [] -print 'Reading nexus files...' -for filename in glob.glob(os.path.join(path, '*.nex')): -# print 'Reading file:',filename - #pol m = re.match('(.+)[.]nex', os.path.basename(filename)) - m = re.match('(.+)[-]stripped[.]nex', os.path.basename(filename)) - gene_name = m.group(1) - - genes.append(gene_name) - f = open(filename, 'r').read() - m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S) - ntax = int(m.group(1)) - m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S) - nchar = int(m.group(1)) - - gene_javascript_data.append((chunk_map[gene_name], gene_name, nchar)) - - m = re.search('Matrix\s+(.+?);', f, re.M | re.S) - matrix = m.group(1).strip() - - matrix_lines = matrix.split('\n') - - taxon_names = [] - sequences = {} - for line in matrix_lines: - parts = line.strip().split() - assert len(parts) == 2 - taxon_name = parts[0] - sequence = parts[1] - taxon_names.append(taxon_name) - sequences[taxon_name] = sequence - - if not new_way: - if len(data) == 0: - for t in taxon_names: - data[t] = [] - - for t in taxon_names: - if new_way: - group = data.setdefault(t, []) - group.append(sequences[t]) - else: - data[t].append(sequences[t]) - -print 'Found %d genes' % len(genes) -for c,g,n in sorted(gene_javascript_data): - print ' {name:"%s", chunk:%d, seqlen:%d},' % (g,c,n) - -taxa_sorted = sorted(data.keys()) -# print "taxa_sorted=", taxa_sorted - -number_of_genes = len(genes) -number_gen = range(1, number_of_genes+1) - -#### counting number of nexus files to be written after combinations -combos = list(itertools.combinations(number_gen, number_to_merge)) -number_of_combos = len(combos) -print 'Number of combinations:',number_of_combos - -combo_list=[] -current_combo = 0 -for a,b in combos: - current_combo += 1 - if current_combo <= first_combo or current_combo > last_combo: - continue - print 'current_combo=', current_combo-1 - - new_taxonlist = [] - se1 = [] - se2 = [] - new_sequences = [] - seq1 = [] - seq2 = [] - genes_merged = (genes[a-1]+'_'+genes[b-1]) -# print 'genes[a-1]==', genes[a-1] -# print 'genes[b-1]==', genes[b-1] -# - for t in taxa_sorted: - new_taxonlist.append((t,data[t][a-1],data[t][b-1])[0]) - new_sequences.append((data[t][a-1]+data[t][b-1])) - seq1.append((data[t][a-1])) - seq2.append((data[t][b-1])) -# print 'new_taxonlist==', new_taxonlist - ntax = len(new_taxonlist) - newnchar = len(new_sequences[0]) - nchar_gene1 = len(seq1[0]) - nchar_gene2 = len(seq2[0]) -# print 'nchar_gene1====', nchar_gene1 -# print 'nchar_gene2====', nchar_gene2 - - new_nexus_file = '%s.nex' % genes_merged - gene_dir = os.path.join(output, '%s' % (genes_merged)) - if not os.path.exists(gene_dir): - os.mkdir(gene_dir) - - combo_list.append(genes_merged) - - mrbayes_single_run1 = '''begin mrbayes; -delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; -charset first = 1-%d\\3; -charset second = 2-%d\\3; -charset third = 3-%d\\3; -partition mine = 3: first, second, third; -set partition=mine; -lset applyto=(all) nst=6 ngammacat=4 rates=gamma; -prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); -unlink shape=(all) statefreq=(all) revmat=(all); -set seed=9223 swapseed=9223; -mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; -[sumt filename=mcmc;] -[mcmcp filename=ss;] -ss alpha=0.3 nsteps=30 burninss=-2; -end; -'''% (nchar_gene1, nchar_gene1, nchar_gene1) - - mrbayes_single_run2 = '''begin mrbayes; -delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; -charset first = 1-%d\\3; -charset second = 2-%d\\3; -charset third = 3-%d\\3; -partition mine = 3: first, second, third; -set partition=mine; -lset applyto=(all) nst=6 ngammacat=4 rates=gamma; -prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); -unlink shape=(all) statefreq=(all) revmat=(all); -set seed=9223 swapseed=9223; -mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; -[sumt filename=mcmc;] -[mcmcp filename=ss;] -ss alpha=0.3 nsteps=30 burninss=-2; -end; -'''% (nchar_gene2, nchar_gene2, nchar_gene2) - - - mrbayes_concat_topo_brlen = '''begin mrbayes; -delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; -charset first = 1-%d\\3; -charset second = 2-%d\\3; -charset third = 3-%d\\3; -charset fourth = %d-%d\\3; -charset fifth = %d-%d\\3; -charset sixth = %d-%d\\3; -partition mine = 6: first, second, third, fourth, fifth, sixth; -set partition=mine; -lset applyto=(all) nst=6 ngammacat=4 rates=gamma; -prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); -unlink shape=(all) statefreq=(all) revmat=(all); -set seed=9223 swapseed=9223; -mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; -[sumt filename=mcmc;] -[mcmcp filename=ss;] -ss alpha=0.3 nsteps=30 burninss=-2; -end; -'''% (nchar_gene1, nchar_gene1, nchar_gene1, nchar_gene1+1, newnchar, nchar_gene1+2, newnchar, nchar_gene1+3,newnchar) - - mrbayes_concat_topo = '''begin mrbayes; -delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; -charset first = 1-%d\\3; -charset second = 2-%d\\3; -charset third = 3-%d\\3; -charset fourth = %d-%d\\3; -charset fifth = %d-%d\\3; -charset sixth = %d-%d\\3; -partition mine = 6: first, second, third, fourth, fifth, sixth; -set partition=mine; -lset applyto=(all) nst=6 ngammacat=4 rates=gamma; -prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); -unlink shape=(all) statefreq=(all) revmat=(all) brlens=(all); -set seed=9223 swapseed=9223; -mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; -[sumt filename=mcmc;] -[mcmcp filename=ss;] -ss alpha=0.3 nsteps=30 burninss=-2; -end; -'''% (nchar_gene1, nchar_gene1, nchar_gene1, nchar_gene1+1, newnchar, nchar_gene1+2, newnchar, nchar_gene1+3,newnchar) - - -#concat_topo -###########submitallscript########### - submitallscript ='''cd %s; qsub qsub.sh; cd .. -cd %s; qsub qsub.sh; cd .. -cd concat_topo_brlen; qsub qsub.sh; cd .. -cd concat_topo; qsub qsub.sh; cd .. -'''%(genes[a-1], genes[b-1]) - submitall = os.path.join(gene_dir, 'submitall.sh') - newf = open(submitall, 'w') - newf.write(submitallscript) -###########submitallscript########### - - -###########qsubscript for concatenated set########### -# qsubscript = '''#!/bin/bash -# #$ -S /bin/bash -# #$ -cwd -# #$ -N concatenated -# #$ -q highpri.q,highmem.q -# mb concatenated.nex -# rm -f mcmc.* ss.*''' -###########qsubscript for concatenated set########### - - -###########gene1########### - single_gene_dir = os.path.join(gene_dir, genes[a-1]) - if not os.path.exists(single_gene_dir): - os.mkdir(single_gene_dir) - output_path = os.path.join(single_gene_dir, ('%s.nex' %(genes[a-1]))) - newf = open(output_path, 'w') - newf.write('#NEXUS\n\n') - newf.write('Begin data;\n') - newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, nchar_gene1)) - newf.write(' Format datatype=dna missing=? gap=-;\n') - newf.write(' Matrix\n') - longest_taxon_name = max([len(t) for t in new_taxonlist]) - for t,s in zip(new_taxonlist, seq1): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n\n') - newf.write(mrbayes_single_run1) - newf.close() - ##qsub_script##### - qsub = os.path.join(single_gene_dir, 'qsub.sh') - newf = open(qsub, 'w') - newf.write('''#!/bin/bash -#$ -S /bin/bash -#$ -cwd -#$ -N %s -#$ -q highpri.q,highmem.q -mb %s.nex -rm -f mcmc.* ss.*'''%(genes[a-1], genes[a-1])) - ##qsub_script##### -###########gene1########### - - -###########gene2########### - single_gene_dir = os.path.join(gene_dir, genes[b-1]) - if not os.path.exists(single_gene_dir): - os.mkdir(single_gene_dir) - output_path = os.path.join(single_gene_dir, ('%s.nex' %(genes[b-1]))) - newf = open(output_path, 'w') - newf.write('#NEXUS\n\n') - newf.write('Begin data;\n') - newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, nchar_gene2)) - newf.write(' Format datatype=dna missing=? gap=-;\n') - newf.write(' Matrix\n') - longest_taxon_name = max([len(t) for t in new_taxonlist]) - for t,s in zip(new_taxonlist, seq2): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n\n') - newf.write(mrbayes_single_run2) - newf.close() - ####qsub_script##### - qsub = os.path.join(single_gene_dir, 'qsub.sh') - newf = open(qsub, 'w') - newf.write('''#!/bin/bash -#$ -S /bin/bash -#$ -cwd -#$ -N %s -#$ -q highpri.q,highmem.q -mb %s.nex -rm -f mcmc.* ss.*'''%(genes[b-1], genes[b-1])) - ####qsub_script##### -###########gene2########### - - -###########concat_topo_brlen########### - concat_dir = os.path.join(gene_dir, 'concat_topo_brlen') - if not os.path.exists(concat_dir): - os.mkdir(concat_dir) - output_path = os.path.join(concat_dir, 'concatenated.nex') - newf = open(output_path, 'w') - newf.write('#NEXUS\n\n') - newf.write('Begin data;\n') - newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, newnchar)) - newf.write(' Format datatype=dna missing=? gap=-;\n') - newf.write(' Matrix\n') - longest_taxon_name = max([len(t) for t in new_taxonlist]) - for t,s in zip(new_taxonlist, new_sequences): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n\n') - newf.write(mrbayes_concat_topo_brlen) - newf.close() - ####qsub_script concat_topo_brlen##### - qsub = os.path.join(concat_dir, 'qsub.sh') - newf = open(qsub, 'w') - newf.write('''#!/bin/bash -#$ -S /bin/bash -#$ -cwd -#$ -N %s_%s_concat_topo_brlen -#$ -q highpri.q,highmem.q -mb concatenated.nex -rm -f mcmc.* ss.*'''%(genes[a-1],genes[b-1])) - ####qsub_script concat_topo_brlen##### -###########concat_topo_brlen########### - - -###########concat_topo########### - concat_dir = os.path.join(gene_dir, 'concat_topo') - if not os.path.exists(concat_dir): - os.mkdir(concat_dir) - output_path = os.path.join(concat_dir, 'concatenated.nex') - newf = open(output_path, 'w') - newf.write('#NEXUS\n\n') - newf.write('Begin data;\n') - newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, newnchar)) - newf.write(' Format datatype=dna missing=? gap=-;\n') - newf.write(' Matrix\n') - longest_taxon_name = max([len(t) for t in new_taxonlist]) - for t,s in zip(new_taxonlist, new_sequences): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n\n') - newf.write(mrbayes_concat_topo) - newf.close() - ####qsub_script concat_topo##### - qsub = os.path.join(concat_dir, 'qsub.sh') - newf = open(qsub, 'w') - newf.write('''#!/bin/bash -#$ -S /bin/bash -#$ -cwd -#$ -N %s_%s_concat_topo -#$ -q highpri.q,highmem.q -mb concatenated.nex -rm -f mcmc.* ss.*'''%(genes[a-1],genes[b-1])) - ####qsub_script concat_topo##### -###########concat_topo########### - - -####### go.sh ######### -jobscript = os.path.join(output, 'go.sh') -newf = open(jobscript, 'w') -newf.write('#!/bin/bash\n\n') -for i in combo_list: - newf.write('cd %s\n. submitall.sh\ncd ..\n\n' %(i)) -newf.close() - -####### capture_marg.py ######### -jobscript2 = os.path.join(output, 'capture_marg.py') -newf = open(jobscript2, 'w') -newf.write(capture_script) -newf.close() - - -