From 938c6fd892e3dc572ac91283f58c65b1a40e565b Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:53:23 -0400 Subject: [PATCH 1/9] Delete brnlenMCMC.py --- brnlenMCMC.py | 494 -------------------------------------------------- 1 file changed, 494 deletions(-) delete mode 100644 brnlenMCMC.py diff --git a/brnlenMCMC.py b/brnlenMCMC.py deleted file mode 100644 index 87c6160..0000000 --- a/brnlenMCMC.py +++ /dev/null @@ -1,494 +0,0 @@ -########################################################################################## -# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree -# topology, and computes log-likelihood of the topology under Jukes Cantor+GAMMA model, -# and performs MCMC on branch length parameter -########################################################################################## - - -import readSeq -import random -import re, os, itertools, sys, glob -from itertools import chain -from scipy.stats import gamma -from math import exp, log - - -########################################################################################## -tree_file_name = 'tree.tre' -sequence_file = 'example3.nex' -alpha = 0.5 #gamma shape parameter for rate categories -n_gen = 50000 -save_every = 50 -mean_expo = 10. #mean_expo = mean of exponential distribution for branch length prior -########################################################################################## - - - - -class node(object): - def __init__(self, ndnum): # initialization function - self.rsib = None # right sibling - self.lchild = None # left child - self.par = None # parent node - self.number = ndnum # node number (internals negative, tips 0 or positive) - self.edgelen = 0.0 # branch length - self.descendants = set([ndnum]) # set containing descendant leaf set - self.partial = None # will have length 4*npatterns - - - def __str__(self): - # __str__ is a built-in function that is used by print to show an object - descendants_as_string = ','.join(['%d' % d for d in self.descendants]) - - lchildstr = 'None' - if self.lchild is not None: - lchildstr = '%d' % self.lchild.number - - rsibstr = 'None' - if self.rsib is not None: - rsibstr = '%d' % self.rsib.number - - parstr = 'None' - if self.par is not None: - parstr = '%d' % self.par.number - - return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string) - - - -def allocatePartial(node, patterns, rates): - if node.number > 0: - npatterns = len(patterns) - - if node.partial is None: - node.partial = [0.0]*(4*4*npatterns) -# print len(node.partial) - for i,pattern in enumerate(patterns.keys()): - base = pattern[node.number-1] - for l in range(4): - if base == 'A': - - - node.partial[i*16+l*4 + 0] = 1.0 - elif base == 'C': - node.partial[i*16+l*4 + 1] = 1.0 - elif base == 'G': - node.partial[i*16+l*4 + 2] = 1.0 - elif base == 'T': - node.partial[i*16+l*4 + 3] = 1.0 - else: - assert(False), 'oops, something went horribly wrong!' - - else: - - npatterns = len(patterns) - if node.partial is None: - node.partial = [0.0]*(4*4*npatterns) - - like_list = [] - for i,pattern in enumerate(patterns.keys()): - m_list = [] - num_pattern = patterns[pattern] - - for l,m in enumerate(rates): - - psame = (0.25+0.75*exp(-4.0*m*(node.lchild.edgelen)/3.0)) - pdiff = (0.25-0.25*exp(-4.0*m*(node.lchild.edgelen)/3.0)) - - psame2 = (0.25+0.75*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0)) - pdiff2 = (0.25-0.25*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0)) - - num_pattern = patterns[pattern] - pAA = psame*(node.lchild.partial[i*16+l*4 + 0]) - pAC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) - pAG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) - pAT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) - - pAA2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 0]) - pAC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) - pAG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) - pAT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) - - pfromA_lchild = pAA+pAC+pAG+pAT - pfromA_rchild = pAA2+pAC2+pAG2+pAT2 - node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild - - - ###################################################### - - pCA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) - pCC = psame*(node.lchild.partial[i*16+l*4 + 1]) - pCG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) - pCT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) - - pCA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) - pCC2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 1]) - pCG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) - pCT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) - - pfromC_lchild = pCA+pCC+pCG+pCT - pfromC_rchild = pCA2+pCC2+pCG2+pCT2 - node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild - - ####################################################### -# - pGA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) - pGC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) - pGG = psame*(node.lchild.partial[i*16+l*4 + 2]) - pGT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) - - pGA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) - pGC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) - pGG2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 2]) - pGT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) - - pfromG_lchild = pGA+pGC+pGG+pGT - pfromG_rchild = pGA2+pGC2+pGG2+pGT2 - node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild - - ####################################################### - - pTA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) - pTC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) - pTG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) - pTT = psame*(node.lchild.partial[i*16+l*4 + 3]) - - pTA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) - pTC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) - pTG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) - pTT2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 3]) - - pfromT_lchild = pTA+pTC+pTG+pTT - pfromT_rchild = pTA2+pTC2+pTG2+pTT2 - node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild - - site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25 - site_log_like = (log(site_like))*num_pattern - like_list.append(site_log_like) - log_like = sum(like_list) - return log_like - - - -def mcmcbrn(postorder, patterns, rates): - nodes = readnewick(treenewick()) - mcmc = 0 - output = os.path.join('brnlenMCMC_results.txt') - newf = open(output, 'w') - newf.write('%s\t'%('n_gen')) - newf.write( '%s\t%s\t'%('LnL','LnPr')) - for nl in postorder: - newf.write( 'node%s\t'%(nl.number)) - newf.write('\n') - start_log_prior = 0.0 - for nd in nodes: - start_log_prior += (-nd.edgelen/mean_expo)-(log(mean_expo)) - start_log_like = prepareTree(nodes, patterns, rates) - - - newf.write('%s\t'%(mcmc)) - print 'mcmc gen=', mcmc - print start_log_like, start_log_prior, - - newf.write( '%.6f\t%.6f\t'%(start_log_like,start_log_prior)) - for nl in postorder: - newf.write( '%.6f\t'%(nl.edgelen)) - print nl.edgelen, - print - print '**************************' - - newf.write('\n') -# print - for r in range(n_gen): - for i in range(len(postorder)): - preedgelen = nodes[i].edgelen - currentlike = prepareTree(nodes, patterns, rates) -# currentlike = 0.0 - currentprior = 0.0 - for nd in nodes: - currentprior += (-nd.edgelen/mean_expo)-(log(mean_expo)) - current_ln_posterior = currentlike + currentprior - - u = random.random() - m = exp(0.2*(u-0.5)) - nodes[i].edgelen = preedgelen*m - proposedprior = 0.0 - for nd in nodes: - proposedprior += (-nd.edgelen/mean_expo)-(log(mean_expo)) - - proposedlike = prepareTree(nodes, patterns, rates) - proposed_ln_posterior = proposedlike + proposedprior - hastings_ratio = log(m) - logR = proposed_ln_posterior - current_ln_posterior + hastings_ratio - u2 = random.random() - if log(u2) < logR: - nodes[i].edgelen = nodes[i].edgelen - log_prior = proposedprior - log_likelihood = proposedlike -# print 'log(u2) < logR so new proposal accepted..' - else: - nodes[i].edgelen = preedgelen - log_prior = currentprior - log_likelihood = currentlike - -# print 'log(u2) > logR so failed to accept new proposal..' - - if (r+1) % save_every == 0: - newf.write('%s\t'%(mcmc+1)) - print 'mcmc gen=', mcmc+1 - print log_likelihood,log_prior, - newf.write( '%.6f\t%.6f\t'%(log_likelihood,log_prior)) - for j,k in enumerate(nodes): - newf.write( '%.6f\t'%(k.edgelen)) - print k.edgelen, - newf.write('\n') - print - print '**************************' - - newf.flush() - - mcmc+=1 - - -def treenewick(): - script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) - path = os.path.join(script_dir, tree_file_name) - with open(path, 'r') as content: - newick = content.read() - return newick -# - -def gammaRates(alpha): - bounds = [0.0, 0.25, 0.50, 0.75, 1.] - rates = [] - for i in range(4): -# print i - lower = gamma.ppf(bounds[i], alpha, 0, 1./alpha) - upper = gamma.ppf(bounds[i+1], alpha, 0, 1./alpha) - mean_rate = ((gamma.cdf(upper, alpha+1., 0, 1./alpha) - gamma.cdf(lower, alpha+1., 0, 1./alpha)))*4. - rates.append(mean_rate) - return rates - -def prepareTree(postorder, patterns, rates): - likelihood_lists = [] - for nd in postorder: - likelihood_lists.append(allocatePartial(nd, patterns, rates)) -# print 'log-likelihood of the topology =', likelihood_lists[-1] - return likelihood_lists[-1] - -def joinRandomPair(node_list, next_node_number, is_deep_coalescence): - # pick first of two lineages to join and delete from node_list - i = random.randint(1, len(node_list)) - ndi = node_list[i-1] - del node_list[i-1] - - # pick second of two lineages to join and delete from node_list - j = random.randint(1, len(node_list)) - ndj = node_list[j-1] - del node_list[j-1] - - # join selected nodes and add ancestor to node_list - ancnd = node(next_node_number) - ancnd.deep = is_deep_coalescence - ancnd.lchild = ndi - ancnd.descendants = set() - ancnd.descendants |= ndi.descendants - ancnd.descendants |= ndj.descendants - ndi.rsib = ndj - ndi.par = ancnd - ndj.par = ancnd - node_list.append(ancnd) - - return node_list - - -def makeNewick(nd, brlen_scaler = 1.0, start = True): # - global _newick - global _TL - - if start: - _newick = '' - _TL = 0.0 - - if nd.lchild: - _newick += '(' - makeNewick(nd.lchild, brlen_scaler, False) - - else: - blen = nd.edgelen*brlen_scaler - _TL += blen - _newick += '%d:%.5f' % (nd.number, blen) - - if nd.rsib: - _newick += ',' - makeNewick(nd.rsib, brlen_scaler, False) - elif nd.par is not None: - blen = nd.par.edgelen*brlen_scaler - _TL += blen - _newick += '):%.3f' % blen - - return _newick, _TL - -def calcActualHeight(root): - h = 0.0 - nd = root - while nd.lchild: - nd = nd.lchild - h += nd.edgelen - return h - - -def readnewick(tree): - total_length = len(tree) - internal_node_number = -1 - - root = node(internal_node_number) - nd = root - i = 0 - pre = [root] - while i < total_length: - m = tree[i] - - if m =='(': - internal_node_number -= 1 - - child = node(internal_node_number) - pre.append(child) - nd.lchild=child - - child.par=nd - nd=child - elif m == ',': - internal_node_number -= 1 - rsib = node(internal_node_number) - pre.append(rsib) - nd.rsib = rsib - rsib.par=nd.par - nd = rsib - elif m == ')': - nd = nd.par - - elif m == ':': - edge_len_str = '' - i+=1 - m = tree[i] - assert m in ['0','1','2','3','4','5','6','7','8', '9','.'] - while m in ['0','1','2','3','4','5','6','7','8', '9','.']: - edge_len_str += m - i+=1 - m = tree[i] - i -=1 - nd.edgelen = float(edge_len_str) - - - else: - internal_node_number += 1 - - if True: - assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m - mm = '' - while m in ['0','1','2','3','4','5','6','7','8', '9' ]: - - mm += m - - i += 1 - m = tree[i] - nd.number = int(mm) - i -= 1 - - i += 1 - - post = pre[:] - post.reverse() - return post - -def Makenewick(pre): - newickstring = '' - for i,nd in enumerate(pre): - if nd.lchild: - newickstring += '(' - - elif nd.rsib: - newickstring += '%d' %(nd.number) - newickstring += ':%.1f' % nd.edgelen - newickstring += ',' - - else: - newickstring += '%d' %(nd.number) - newickstring += ':%.1f' % nd.edgelen - tmpnd = nd - while (tmpnd.par is not None) and (tmpnd.rsib is None): - newickstring += ')' - newickstring += ':%.1f' % tmpnd.par.edgelen - tmpnd = tmpnd.par - - if tmpnd.par is not None: - newickstring += ',' - return newickstring - -###################yule tree################################################### -# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree -# - num_species is the number of leaves (tips) in the tree -def calcPhi(num_species): - phi = sum([1.0/(K+2.0) for K in range(num_species-1)]) - return phi - -# yuleTree creates a species tree in which edge lengths are measured in -# expected number of substitutions. -# - num_species is the number of leaves -# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio -def yuleTree(num_species, mu_over_s): - # create num_species nodes numbered 1, 2, ..., num_species - nodes = [node(i+1) for i in range(num_species)] - - next_node_number = num_species + 1 - while len(nodes) > 1: - # choose a speciation time in generations - K = float(len(nodes)) - mean_epoch_length = mu_over_s/K - t = random.gammavariate(1.0, mean_epoch_length) - - # update each node's edgelen - for n in nodes: - n.edgelen += t # same as: n.edgelen = n.edgelen + t - - nodes = joinRandomPair(nodes, next_node_number, False) - next_node_number += 1 - - return nodes[0] - -# calcExpectedHeight returns the expected height of the species tree in terms of -# expected number of substitutions from the root to one tip. -# - num_species is the number of leaves -# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio -def calcExpectedHeight(num_species, mu_over_s): - return mu_over_s*calcPhi(num_species) - - -if __name__ == '__main__': - random_seed = 348889 # 7632557, 12345 - number_of_species = 5 - mutation_speciation_rate_ratio = 0.689655172 # 0.689655172 # yields tree height 1 for 6 species - random.seed(random_seed) - species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio) -# print '#########' -# print species_tree_root - newick = makeNewick(species_tree_root) -# print 'Random number seed: %d' % random_seed -# print 'Simulating one tree:' -# print ' number of species = %d' % number_of_species -# print ' mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio -# print ' actual tree length =',newick[1] - expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio) -# print ' expected height =',expected_height - actual_height = calcActualHeight(species_tree_root) -# print ' actual height =',actual_height - print 'true tree: ',newick[0] - print '**************************' - -# yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)' - rates_list = gammaRates(alpha) - postorder = readnewick(treenewick()) - result = prepareTree(postorder, readSeq.patterns(sequence_file), rates_list) -# try1 = readSeq.patterns() - result2 = mcmcbrn(postorder, readSeq.patterns(sequence_file), rates_list) From 9368821e7aa6ac32b6968be3d3bbbf0b6a05675e Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:53:36 -0400 Subject: [PATCH 2/9] Delete treeLike_JCGAMMA.py --- treeLike_JCGAMMA.py | 413 -------------------------------------------- 1 file changed, 413 deletions(-) delete mode 100644 treeLike_JCGAMMA.py diff --git a/treeLike_JCGAMMA.py b/treeLike_JCGAMMA.py deleted file mode 100644 index 6a777cf..0000000 --- a/treeLike_JCGAMMA.py +++ /dev/null @@ -1,413 +0,0 @@ -################################## -# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree -# topology, and computes log-likelihood of the topology under Jukes Cantor+GAMMA model -################################### - - -import readSeq -import random -import re, os, itertools, sys, glob -from itertools import chain -from scipy.stats import gamma -from math import exp, log - -########################################################################################## -tree_file_name = 'tree.tre' -sequence_file = 'example3.nex' -alpha = 0.5 #gamma shape parameter for rate categories -########################################################################################## - - - - - -class node(object): - def __init__(self, ndnum): # initialization function - self.rsib = None # right sibling - self.lchild = None # left child - self.par = None # parent node - self.number = ndnum # node number (internals negative, tips 0 or positive) - self.edgelen = 0.0 # branch length - self.descendants = set([ndnum]) # set containing descendant leaf set - self.partial = None # will have length 4*npatterns - - - - - def __str__(self): - # __str__ is a built-in function that is used by print to show an object - descendants_as_string = ','.join(['%d' % d for d in self.descendants]) - - lchildstr = 'None' - if self.lchild is not None: - lchildstr = '%d' % self.lchild.number - - rsibstr = 'None' - if self.rsib is not None: - rsibstr = '%d' % self.rsib.number - - parstr = 'None' - if self.par is not None: - parstr = '%d' % self.par.number - - return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string) - - -def allocatePartial(node, patterns, rates): - if node.number > 0: - npatterns = len(patterns) -# print 'npat', npatterns - node.partial = [0.0]*(4*4*npatterns) -# print len(node.partial) - for i,pattern in enumerate(patterns.keys()): - base = pattern[node.number-1] - for l in range(4): - if base == 'A': - node.partial[i*16+l*4 + 0] = 1.0 - elif base == 'C': - node.partial[i*16+l*4 + 1] = 1.0 - elif base == 'G': - node.partial[i*16+l*4 + 2] = 1.0 - elif base == 'T': - node.partial[i*16+l*4 + 3] = 1.0 - else: - assert(False), 'oops, something went horribly wrong!' - - else: -# rt = [0.03338775, 0.25191592, 0.82026848, 2.89442785] -# rt = [2.89442785] - -# rt = [1.0, 1.0, 1.0, 1.0] - - npatterns = len(patterns) -# print 'patterns=', patterns - node.partial = [0.0]*(4*4*npatterns) - like_list = [] - for i,pattern in enumerate(patterns.keys()): -# print i, pattern, patterns.keys() - m_list = [] - num_pattern = patterns[pattern] -# print num_pattern - - for l,m in enumerate(rates): - - - psame = (0.25+0.75*exp(-4.0*m*(node.lchild.edgelen)/3.0)) - pdiff = (0.25-0.25*exp(-4.0*m*(node.lchild.edgelen)/3.0)) - - psame2 = (0.25+0.75*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0)) - pdiff2 = (0.25-0.25*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0)) - - num_pattern = patterns[pattern] - pAA = psame*(node.lchild.partial[i*16+l*4 + 0]) - pAC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) - pAG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) - pAT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) - - pAA2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 0]) - pAC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) - pAG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) - pAT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) - - pfromA_lchild = pAA+pAC+pAG+pAT - pfromA_rchild = pAA2+pAC2+pAG2+pAT2 - node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild - - - ###################################################### - - pCA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) - pCC = psame*(node.lchild.partial[i*16+l*4 + 1]) - pCG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) - pCT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) - - pCA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) - pCC2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 1]) - pCG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) - pCT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) - - pfromC_lchild = pCA+pCC+pCG+pCT - pfromC_rchild = pCA2+pCC2+pCG2+pCT2 - node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild - - ####################################################### -# - pGA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) - pGC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) - pGG = psame*(node.lchild.partial[i*16+l*4 + 2]) - pGT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) - - pGA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) - pGC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) - pGG2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 2]) - pGT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) - - pfromG_lchild = pGA+pGC+pGG+pGT - pfromG_rchild = pGA2+pGC2+pGG2+pGT2 - node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild - - ####################################################### - - pTA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) - pTC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) - pTG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) - pTT = psame*(node.lchild.partial[i*16+l*4 + 3]) - - pTA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) - pTC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) - pTG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) - pTT2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 3]) - - pfromT_lchild = pTA+pTC+pTG+pTT - pfromT_rchild = pTA2+pTC2+pTG2+pTT2 - node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild - - site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25 - site_log_like = (log(site_like))*num_pattern - like_list.append(site_log_like) - log_like = sum(like_list) - return log_like - - -def treenewick(): - script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) - path = os.path.join(script_dir, tree_file_name) - with open(path, 'r') as content: - newick = content.read() - return newick -# - -def gammaRates(alpha): - bounds = [0.0, 0.25, 0.50, 0.75, 1.] - rates = [] - for i in range(4): -# print i - lower = gamma.ppf(bounds[i], alpha, 0, 1./alpha) - upper = gamma.ppf(bounds[i+1], alpha, 0, 1./alpha) - mean_rate = ((gamma.cdf(upper, alpha+1., 0, 1./alpha) - gamma.cdf(lower, alpha+1., 0, 1./alpha)))*4. - rates.append(mean_rate) - return rates - -def prepareTree(postorder, patterns, rates): - likelihood_lists = [] - for nd in postorder: - likelihood_lists.append(allocatePartial(nd, patterns, rates)) - print 'log-likelihood of the topology =', likelihood_lists[-1] - - -def joinRandomPair(node_list, next_node_number, is_deep_coalescence): - # pick first of two lineages to join and delete from node_list - i = random.randint(1, len(node_list)) - ndi = node_list[i-1] - del node_list[i-1] - - # pick second of two lineages to join and delete from node_list - j = random.randint(1, len(node_list)) - ndj = node_list[j-1] - del node_list[j-1] - - # join selected nodes and add ancestor to node_list - ancnd = node(next_node_number) - ancnd.deep = is_deep_coalescence - ancnd.lchild = ndi - ancnd.descendants = set() - ancnd.descendants |= ndi.descendants - ancnd.descendants |= ndj.descendants - ndi.rsib = ndj - ndi.par = ancnd - ndj.par = ancnd - node_list.append(ancnd) - - return node_list - - -def makeNewick(nd, brlen_scaler = 1.0, start = True): # - global _newick - global _TL - - if start: - _newick = '' - _TL = 0.0 - - if nd.lchild: - _newick += '(' - makeNewick(nd.lchild, brlen_scaler, False) - - else: - blen = nd.edgelen*brlen_scaler - _TL += blen - _newick += '%d:%.5f' % (nd.number, blen) - - if nd.rsib: - _newick += ',' - makeNewick(nd.rsib, brlen_scaler, False) - elif nd.par is not None: - blen = nd.par.edgelen*brlen_scaler - _TL += blen - _newick += '):%.3f' % blen - - return _newick, _TL - -def calcActualHeight(root): - h = 0.0 - nd = root - while nd.lchild: - nd = nd.lchild - h += nd.edgelen - return h - - -def readnewick(tree): - total_length = len(tree) - internal_node_number = -1 - - root = node(internal_node_number) - nd = root - i = 0 - pre = [root] - while i < total_length: - m = tree[i] - - if m =='(': - internal_node_number -= 1 - - child = node(internal_node_number) - pre.append(child) - nd.lchild=child - - child.par=nd - nd=child - elif m == ',': - internal_node_number -= 1 - rsib = node(internal_node_number) - pre.append(rsib) - nd.rsib = rsib - rsib.par=nd.par - nd = rsib - elif m == ')': - nd = nd.par - - elif m == ':': - edge_len_str = '' - i+=1 - m = tree[i] - assert m in ['0','1','2','3','4','5','6','7','8', '9','.'] - while m in ['0','1','2','3','4','5','6','7','8', '9','.']: - edge_len_str += m - i+=1 - m = tree[i] - i -=1 - nd.edgelen = float(edge_len_str) - - - else: - internal_node_number += 1 - - if True: - assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m - mm = '' - while m in ['0','1','2','3','4','5','6','7','8', '9' ]: - - mm += m - - i += 1 - m = tree[i] - nd.number = int(mm) - i -= 1 - - i += 1 - - post = pre[:] - post.reverse() - for nd in post: - print nd.number, nd.edgelen - return post - -def Makenewick(pre): - newickstring = '' - for i,nd in enumerate(pre): - if nd.lchild: - newickstring += '(' - - elif nd.rsib: - newickstring += '%d' %(nd.number) - newickstring += ':%.1f' % nd.edgelen - newickstring += ',' - - else: - newickstring += '%d' %(nd.number) - newickstring += ':%.1f' % nd.edgelen - tmpnd = nd - while (tmpnd.par is not None) and (tmpnd.rsib is None): - newickstring += ')' - newickstring += ':%.1f' % tmpnd.par.edgelen - tmpnd = tmpnd.par - - if tmpnd.par is not None: - newickstring += ',' - return newickstring - -###################yule tree################################################### -# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree -# - num_species is the number of leaves (tips) in the tree -def calcPhi(num_species): - phi = sum([1.0/(K+2.0) for K in range(num_species-1)]) - return phi - -# yuleTree creates a species tree in which edge lengths are measured in -# expected number of substitutions. -# - num_species is the number of leaves -# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio -def yuleTree(num_species, mu_over_s): - # create num_species nodes numbered 1, 2, ..., num_species - nodes = [node(i+1) for i in range(num_species)] - - next_node_number = num_species + 1 - while len(nodes) > 1: - # choose a speciation time in generations - K = float(len(nodes)) - mean_epoch_length = mu_over_s/K - t = random.gammavariate(1.0, mean_epoch_length) - - # update each node's edgelen - for n in nodes: - n.edgelen += t # same as: n.edgelen = n.edgelen + t - - nodes = joinRandomPair(nodes, next_node_number, False) - next_node_number += 1 - - return nodes[0] - -# calcExpectedHeight returns the expected height of the species tree in terms of -# expected number of substitutions from the root to one tip. -# - num_species is the number of leaves -# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio -def calcExpectedHeight(num_species, mu_over_s): - return mu_over_s*calcPhi(num_species) - - -if __name__ == '__main__': - random_seed = 24553 # 7632557, 12345 - number_of_species = 5 - mutation_speciation_rate_ratio = 0.4 # 0.689655172 # yields tree height 1 for 6 species - random.seed(random_seed) - species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio) -# print '#########' -# print species_tree_root - newick = makeNewick(species_tree_root) -# print 'Random number seed: %d' % random_seed -# print 'Simulating one tree:' -# print ' number of species = %d' % number_of_species -# print ' mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio -# print ' actual tree length =',newick[1] - expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio) -# print ' expected height =',expected_height - actual_height = calcActualHeight(species_tree_root) -# print ' actual height =',actual_height -# print ' newick: ',newick[0] - - alpha = 0.5 ### gamma shape parameter rate categories -# yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)' - rates_list = gammaRates(alpha) - postorder = readnewick(treenewick()) - result = prepareTree(postorder, readSeq.patterns(sequence_file), rates_list) From c51f63b08358e5c2ce494388b4370fb84daac6af Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:53:43 -0400 Subject: [PATCH 3/9] Delete treeLike.py --- treeLike.py | 374 ---------------------------------------------------- 1 file changed, 374 deletions(-) delete mode 100644 treeLike.py diff --git a/treeLike.py b/treeLike.py deleted file mode 100644 index 1142e5e..0000000 --- a/treeLike.py +++ /dev/null @@ -1,374 +0,0 @@ -################################## -# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree -# topology, and computes log-likelihood of the topology under Jukes Cantor model -################################### - - -import readSeq -import random -import re, os, itertools, sys, glob -from itertools import chain -from math import exp, log - -########################################################################################## -tree_file_name = 'tree.tre' -sequence_file = 'example3.nex' -########################################################################################## - - -class node(object): - def __init__(self, ndnum): # initialization function - self.rsib = None # right sibling - self.lchild = None # left child - self.par = None # parent node - self.number = ndnum # node number (internals negative, tips 0 or positive) - self.edgelen = 0.0 # branch length - self.descendants = set([ndnum]) # set containing descendant leaf set - self.partial = None # will have length 4*npatterns - - - def __str__(self): - # __str__ is a built-in function that is used by print to show an object - descendants_as_string = ','.join(['%d' % d for d in self.descendants]) - - lchildstr = 'None' - if self.lchild is not None: - lchildstr = '%d' % self.lchild.number - - rsibstr = 'None' - if self.rsib is not None: - rsibstr = '%d' % self.rsib.number - - parstr = 'None' - if self.par is not None: - parstr = '%d' % self.par.number - - return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string) - -def allocatePartial(node, patterns): - if node.number > 0: - npatterns = len(patterns) - node.partial = [0.0]*(4*npatterns) - for i,pattern in enumerate(patterns.keys()): - base = pattern[node.number-1] - if base == 'A': - node.partial[i*4 + 0] = 1.0 - elif base == 'C': - node.partial[i*4 + 1] = 1.0 - elif base == 'G': - node.partial[i*4 + 2] = 1.0 - elif base == 'T': - node.partial[i*4 + 3] = 1.0 - else: - assert(False), 'oops, something went horribly wrong!' - - else: - npatterns = len(patterns) - node.partial = [0.0]*(4*npatterns) - like_list = [] - for i,pattern in enumerate(patterns.keys()): - psame = (0.25+0.75*exp(-4.0*(node.lchild.edgelen)/3.0)) - pdiff = (0.25-0.25*exp(-4.0*(node.lchild.edgelen)/3.0)) - - psame2 = (0.25+0.75*exp(-4.0*(node.lchild.rsib.edgelen)/3.0)) - pdiff2 = (0.25-0.25*exp(-4.0*(node.lchild.rsib.edgelen)/3.0)) - - num_pattern = patterns[pattern] - - pAA = psame*(node.lchild.partial[i*4 + 0]) - pAC = pdiff*(node.lchild.partial[i*4 + 1]) - pAG = pdiff*(node.lchild.partial[i*4 + 2]) - pAT = pdiff*(node.lchild.partial[i*4 + 3]) - - pAA2 = psame2*(node.lchild.rsib.partial[i*4 + 0]) - pAC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1]) - pAG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2]) - pAT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3]) - - pfromA_lchild = pAA+pAC+pAG+pAT - pfromA_rchild = pAA2+pAC2+pAG2+pAT2 - node.partial[i*4 + 0] = pfromA_lchild*pfromA_rchild - - ###################################################### - - pCA = pdiff*(node.lchild.partial[i*4 + 0]) - pCC = psame*(node.lchild.partial[i*4 + 1]) - pCG = pdiff*(node.lchild.partial[i*4 + 2]) - pCT = pdiff*(node.lchild.partial[i*4 + 3]) - - pCA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0]) - pCC2 = psame2*(node.lchild.rsib.partial[i*4 + 1]) - pCG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2]) - pCT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3]) - - pfromC_lchild = pCA+pCC+pCG+pCT - pfromC_rchild = pCA2+pCC2+pCG2+pCT2 - node.partial[i*4 + 1] = pfromC_lchild*pfromC_rchild - - ####################################################### -# - pGA = pdiff*(node.lchild.partial[i*4 + 0]) - pGC = pdiff*(node.lchild.partial[i*4 + 1]) - pGG = psame*(node.lchild.partial[i*4 + 2]) - pGT = pdiff*(node.lchild.partial[i*4 + 3]) -# - pGA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0]) - pGC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1]) - pGG2 = psame2*(node.lchild.rsib.partial[i*4 + 2]) - pGT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3]) -# - pfromG_lchild = pGA+pGC+pGG+pGT - pfromG_rchild = pGA2+pGC2+pGG2+pGT2 - node.partial[i*4 + 2] = pfromG_lchild*pfromG_rchild - - ####################################################### - - pTA = pdiff*(node.lchild.partial[i*4 + 0]) - pTC = pdiff*(node.lchild.partial[i*4 + 1]) - pTG = pdiff*(node.lchild.partial[i*4 + 2]) - pTT = psame*(node.lchild.partial[i*4 + 3]) -# - pTA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0]) - pTC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1]) - pTG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2]) - pTT2 = psame2*(node.lchild.rsib.partial[i*4 + 3]) -# - pfromT_lchild = pTA+pTC+pTG+pTT - pfromT_rchild = pTA2+pTC2+pTG2+pTT2 - node.partial[i*4 + 3] = pfromT_lchild*pfromT_rchild - - ######################################################### - - site_log_like = (log((sum(node.partial[i*4:i*4+4]))*0.25))*num_pattern - like_list.append(site_log_like) - - log_Like = sum(like_list) - return log_Like - -def treenewick(): - script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) - path = os.path.join(script_dir, tree_file_name) - with open(path, 'r') as content: - newick = content.read() - return newick - -def prepareTree(postorder, patterns): - likelihood_lists = [] - for nd in postorder: - likelihood_lists.append(allocatePartial(nd, patterns)) - print 'log-likelihood of the topology =', likelihood_lists[-1] - -def joinRandomPair(node_list, next_node_number, is_deep_coalescence): - # pick first of two lineages to join and delete from node_list - i = random.randint(1, len(node_list)) - ndi = node_list[i-1] - del node_list[i-1] - - # pick second of two lineages to join and delete from node_list - j = random.randint(1, len(node_list)) - ndj = node_list[j-1] - del node_list[j-1] - - # join selected nodes and add ancestor to node_list - ancnd = node(next_node_number) - ancnd.deep = is_deep_coalescence - ancnd.lchild = ndi - ancnd.descendants = set() - ancnd.descendants |= ndi.descendants - ancnd.descendants |= ndj.descendants - ndi.rsib = ndj - ndi.par = ancnd - ndj.par = ancnd - node_list.append(ancnd) - - return node_list - - -def makeNewick(nd, brlen_scaler = 1.0, start = True): # - global _newick - global _TL - - if start: - _newick = '' - _TL = 0.0 - - if nd.lchild: - _newick += '(' - makeNewick(nd.lchild, brlen_scaler, False) - - else: - blen = nd.edgelen*brlen_scaler - _TL += blen - _newick += '%d:%.5f' % (nd.number, blen) - - if nd.rsib: - _newick += ',' - makeNewick(nd.rsib, brlen_scaler, False) - elif nd.par is not None: - blen = nd.par.edgelen*brlen_scaler - _TL += blen - _newick += '):%.3f' % blen - - return _newick, _TL - -def calcActualHeight(root): - h = 0.0 - nd = root - while nd.lchild: - nd = nd.lchild - h += nd.edgelen - return h - - -def readnewick(tree): - total_length = len(tree) - internal_node_number = -1 - - root = node(internal_node_number) - nd = root - i = 0 - pre = [root] - while i < total_length: - m = tree[i] - - if m =='(': - internal_node_number -= 1 - - child = node(internal_node_number) - pre.append(child) - nd.lchild=child - - child.par=nd - nd=child - elif m == ',': - internal_node_number -= 1 - rsib = node(internal_node_number) - pre.append(rsib) - nd.rsib = rsib - rsib.par=nd.par - nd = rsib - elif m == ')': - nd = nd.par - - elif m == ':': - edge_len_str = '' - i+=1 - m = tree[i] - assert m in ['0','1','2','3','4','5','6','7','8', '9','.'] - while m in ['0','1','2','3','4','5','6','7','8', '9','.']: - edge_len_str += m - i+=1 - m = tree[i] - i -=1 - nd.edgelen = float(edge_len_str) - - - else: - internal_node_number += 1 - - if True: - assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m - mm = '' - while m in ['0','1','2','3','4','5','6','7','8', '9' ]: - - mm += m - - i += 1 - m = tree[i] - nd.number = int(mm) - i -= 1 - - i += 1 - - post = pre[:] - post.reverse() - return post - -def Makenewick(pre): - newickstring = '' - for i,nd in enumerate(pre): - if nd.lchild: - newickstring += '(' - - elif nd.rsib: - newickstring += '%d' %(nd.number) - newickstring += ':%.1f' % nd.edgelen - newickstring += ',' - - else: - newickstring += '%d' %(nd.number) - newickstring += ':%.1f' % nd.edgelen - tmpnd = nd - while (tmpnd.par is not None) and (tmpnd.rsib is None): - newickstring += ')' - newickstring += ':%.1f' % tmpnd.par.edgelen - tmpnd = tmpnd.par - - if tmpnd.par is not None: - newickstring += ',' - return newickstring - -###################yule tree################################################### -# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree -# - num_species is the number of leaves (tips) in the tree -def calcPhi(num_species): - phi = sum([1.0/(K+2.0) for K in range(num_species-1)]) - return phi - -# yuleTree creates a species tree in which edge lengths are measured in -# expected number of substitutions. -# - num_species is the number of leaves -# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio -def yuleTree(num_species, mu_over_s): - # create num_species nodes numbered 1, 2, ..., num_species - nodes = [node(i+1) for i in range(num_species)] - - next_node_number = num_species + 1 - while len(nodes) > 1: - # choose a speciation time in generations - K = float(len(nodes)) - mean_epoch_length = mu_over_s/K - t = random.gammavariate(1.0, mean_epoch_length) - - # update each node's edgelen - for n in nodes: - n.edgelen += t # same as: n.edgelen = n.edgelen + t - - nodes = joinRandomPair(nodes, next_node_number, False) - next_node_number += 1 - - return nodes[0] - -# calcExpectedHeight returns the expected height of the species tree in terms of -# expected number of substitutions from the root to one tip. -# - num_species is the number of leaves -# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio -def calcExpectedHeight(num_species, mu_over_s): - return mu_over_s*calcPhi(num_species) - - -if __name__ == '__main__': - random_seed = 24553 # 7632557, 12345 - number_of_species = 5 - mutation_speciation_rate_ratio = 0.4 # 0.689655172 # yields tree height 1 for 6 species - random.seed(random_seed) - species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio) -# print '#########' -# print species_tree_root - newick = makeNewick(species_tree_root) -# print 'Random number seed: %d' % random_seed -# print 'Simulating one tree:' -# print ' number of species = %d' % number_of_species -# print ' mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio -# print ' actual tree length =',newick[1] - expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio) -# print ' expected height =',expected_height - actual_height = calcActualHeight(species_tree_root) -# print ' actual height =',actual_height -# print ' newick: ',newick[0] - - -# yuletree = '(((1:0.03915,5:0.03915):0.387,(4:0.42253,2:0.42253):0.004):0.118,3:0.54433)' - - postorder = readnewick(treenewick()) - result = prepareTree(postorder, readSeq.patterns(sequence_file)) \ No newline at end of file From 306e319a76b19b9bcc8ce22c2caa209d2ac1c1cc Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:53:55 -0400 Subject: [PATCH 4/9] Delete tree.tre --- tree.tre | 1 - 1 file changed, 1 deletion(-) delete mode 100644 tree.tre diff --git a/tree.tre b/tree.tre deleted file mode 100644 index 2d84738..0000000 --- a/tree.tre +++ /dev/null @@ -1 +0,0 @@ -(5:1.8601,((3:0.47109,2:0.47109):0.492,(4:0.05805,1:0.05805):0.906):0.896) \ No newline at end of file From 8d860d53e6fed33382f1f6b27fc596df0fa696a9 Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:54:15 -0400 Subject: [PATCH 5/9] Delete simSequences.py --- simSequences.py | 174 ------------------------------------------------ 1 file changed, 174 deletions(-) delete mode 100644 simSequences.py diff --git a/simSequences.py b/simSequences.py deleted file mode 100644 index 4181da1..0000000 --- a/simSequences.py +++ /dev/null @@ -1,174 +0,0 @@ -import readSeq -import random -import re, os, itertools, sys, glob -from itertools import chain -from math import exp, log -class node(object): - def __init__(self, ndnum): # initialization function - self.rsib = None # right sibling - self.lchild = None # left child - self.par = None # parent node - self.number = ndnum # node number (internals negative, tips 0 or positive) - self.edgelen = 0.0 # branch length - self.descendants = set([ndnum]) # set containing descendant leaf set - self.partial = None # will have length 4*npatterns - self.state = None - self.states = None - - - def simulateSequences(self, num_sites): - self.states = [str]*(num_sites) - freq = [0.25, 0.25, 0.25, 0.25] - current_states = [ 'A', 'C', 'G', 'T'] - if self.par is None: - - for i in range(num_sites): - - ran_nm = random.random() - if ran_nm < freq[0]: -# self.state = 'A' - self.states[i] = 'A' - elif ran_nm <= freq[0]+freq[1]: -# self.state = 'C' - self.states[i] = 'C' - elif ran_nm <= freq[0]+freq[1]+freq[2]: -# self.state = 'G' - self.states[i] = 'G' - - else: -# self.state = 'T' - self.states[i] = 'T' - - else: - for m in range(num_sites): - prob = [] - ran_nm = random.random() - - for i in current_states: - if self.par.states[m] == i: - p = (0.25+0.75*exp(-4.0*(self.edgelen)/3.0)) - prob.append(p) - else: - p = (0.25-0.25*exp(-4.0*(self.edgelen)/3.0)) - prob.append(p) - for i in prob: - - if ran_nm <= prob[0]: -# self.state = 'A' - self.states[m] = 'A' - - elif ran_nm <= prob[0]+ prob[1]: -# self.state = 'C' - self.states[m] = 'C' - - elif ran_nm <= prob[0]+ prob[1]+ prob[2]: -# self.state = 'G' - self.states[m] = 'G' - - else: -# self.state = 'T' - self.states[m] = 'T' - - return self.states - - def __str__(self): - # __str__ is a built-in function that is used by print to show an object - descendants_as_string = ','.join(['%d' % d for d in self.descendants]) - - lchildstr = 'None' - if self.lchild is not None: - lchildstr = '%d' % self.lchild.number - - rsibstr = 'None' - if self.rsib is not None: - rsibstr = '%d' % self.rsib.number - - parstr = 'None' - if self.par is not None: - parstr = '%d' % self.par.number - - return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string) - -def simulate(preorder, ntax, num_sites, out): - newf = open(out, 'w') - newf.write('#nexus\n\n') - newf.write('begin data;\n') - newf.write('dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites)) - newf.write('format datatype=dna missing=? gap=-;\n') - newf.write('matrix\n') - master = {} - for nd in preorder: - master[nd.number]=nd.simulateSequences(num_sites) - if nd.number >0: - newf.write('%s %s\n' % (nd.number, ''.join(nd.simulateSequences(num_sites)))) - newf.write(';\n') - newf.write('end;') -def readnewick(tree): - total_length = len(tree) - internal_node_number = -1 - - root = node(internal_node_number) - nd = root - i = 0 - pre = [root] - while i < total_length: - m = tree[i] - - if m =='(': - internal_node_number -= 1 - - child = node(internal_node_number) - pre.append(child) - nd.lchild=child - - child.par=nd - nd=child - elif m == ',': - internal_node_number -= 1 - rsib = node(internal_node_number) - pre.append(rsib) - nd.rsib = rsib - rsib.par=nd.par - nd = rsib - elif m == ')': - nd = nd.par - - elif m == ':': - edge_len_str = '' - i+=1 - m = tree[i] - assert m in ['0','1','2','3','4','5','6','7','8', '9','.'] - while m in ['0','1','2','3','4','5','6','7','8', '9','.']: - edge_len_str += m - i+=1 - m = tree[i] - i -=1 - nd.edgelen = float(edge_len_str) - else: - internal_node_number += 1 - - if True: - assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m - mm = '' - while m in ['0','1','2','3','4','5','6','7','8', '9' ]: - - mm += m - i += 1 - m = tree[i] - nd.number = int(mm) - i -= 1 - i += 1 - - return pre - -if __name__ == '__main__': - - output_filename = os.path.join('simulated_output.nexus') - - yuletree = '(5:1.86010,((3:0.47109,2:0.47109):0.492,(4:0.05805,1:0.05805):0.906):0.896)' - - preorder = readnewick(yuletree) - ntax = 5 - num_sites = 100 - result = simulate(preorder, ntax, num_sites, output_filename) - From e205f4efccf756476b1caecdfe953536420d72a5 Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:54:22 -0400 Subject: [PATCH 6/9] Delete readtree.py --- readtree.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 readtree.py diff --git a/readtree.py b/readtree.py deleted file mode 100644 index a7b4943..0000000 --- a/readtree.py +++ /dev/null @@ -1,22 +0,0 @@ -# def treenewck(): -# script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) -# path = os.path.join(script_dir, 'nexus') -# for filename in glob.glob(os.path.join(path, '*.tre*')): -# f = open(filename, 'r').read() - -import re, os, glob, itertools, fnmatch, sys, shutil -# dirname, filename = os.path.split(os.path.abspath(__file__)) -def treenewick(): - script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) -# print script_dir - path = os.path.join(script_dir, 'tree.tre') - with open(path, 'r') as content: - newick = content.read() - return newick -a = treenewick() -print a - - -# dirname, filename = os.path.split(os.path.abspath(__file__)) -# print "running from", dirname -# print "file is", filename \ No newline at end of file From 6883d2d2d1c7fde7e554be10a6f2e4d2960aa812 Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:54:28 -0400 Subject: [PATCH 7/9] Delete readSeq.py --- readSeq.py | 69 ------------------------------------------------------ 1 file changed, 69 deletions(-) delete mode 100644 readSeq.py diff --git a/readSeq.py b/readSeq.py deleted file mode 100644 index 91c58a5..0000000 --- a/readSeq.py +++ /dev/null @@ -1,69 +0,0 @@ -def patterns(sequence_file): - # - import re, os, glob, itertools, fnmatch, sys, shutil - from itertools import combinations - from collections import Counter - - script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) -# path = os.path.join(script_dir, 'nexus') - - genes = [] - data = {} -# print 'Reading nexus files...' - for filename in glob.glob(os.path.join(script_dir, sequence_file)): - - m = re.match('(.+).nex', os.path.basename(filename)) - gene_name = m.group(1) -# print 'gene_name=', gene_name - genes.append(gene_name) - f = open(filename, 'r').read() - - m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S) - ntax = int(m.group(1)) -# print 'ntax=', ntax - - m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S) - nchar = int(m.group(1)) -# print 'nchar=', nchar - - m = re.search('Matrix\s+(.+?);', f, re.M | re.S) - matrix = m.group(1).strip() - matrix_lines = matrix.split('\n') - - taxon_names = [] - sequences = {} - sequences_list = [] - for line in matrix_lines: - parts = line.strip().split() - assert len(parts) == 2 - taxon_name = parts[0] - sequence = parts[1] - - taxon_names.append(taxon_name) - sequences_list.append(sequence) - sequences[taxon_name] = sequence - - pattern_list = [] - - k=0 - while k < nchar: - site_pattern = '' - for i,m in enumerate(sequences_list): - site_pattern += m[k] - pattern_list.append(site_pattern) - k+=1 - pattern_dict = dict() - for i in pattern_list: - pattern_dict[i] = pattern_dict.get(i, 0) + 1 - - tmp = [] - for key in pattern_dict.keys(): ###convert dict to key of tupules -# print 'key=', key - tmp.append((pattern_dict[key],key)) - - sorted_values = sorted(tmp) ###sorted according to key smaller to larger - sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order - - return pattern_dict - - From a2fe87c36ff1c5c21200cb3028189ea757d1b3c4 Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:54:34 -0400 Subject: [PATCH 8/9] Delete randomSeq.py --- randomSeq.py | 211 --------------------------------------------------- 1 file changed, 211 deletions(-) delete mode 100644 randomSeq.py diff --git a/randomSeq.py b/randomSeq.py deleted file mode 100644 index 4925a91..0000000 --- a/randomSeq.py +++ /dev/null @@ -1,211 +0,0 @@ -import sys, re, os, random - - -# number1 = raw_input('number_of_runs? > ') -# num_run = int(number1) -# #print num_run -# -# ###specifying number of taxa to be generated -# number2 = raw_input('number_of_taxa? > ') -# ntax = int(number2) -# -# ###specifying number of sites to be generated -# number3 = raw_input('number of sites? > ') -# num_sites = int(number3) -# -# ###specifying number of dna matrix to be generated -# number4 = raw_input('number of genes? > ') -# number_genes = int(number4) - - - -num_run = 1 -#print num_run - -###specifying number of taxa to be generated -ntax = 6 - -###specifying number of sites to be generated -num_sites = 10000 - -###specifying number of dna matrix to be generated -number_genes = 10 - -num_trials_pergene = 2 - - -for i in range(num_run): - run_name = i+1 - run_name2 = 'trial'+ str(run_name) - print run_name2 - master_dir = os.path.join(os.path.abspath(os.curdir), run_name2) - if not os.path.exists(master_dir): - os.mkdir(master_dir) - - - ###creating a list for the directories for each randomly generated dna matrix - all_seq = [] - sets = [] - for a, b in enumerate(range(number_genes)): - a += 1 - set = "gene"+ str(a) - sets.append(set) - - ### creating dna matrix with randomly generated sequences - for n, i in enumerate(sets): - ### specifying names of the directory and dna matrix - folder_name = i - n += 1 - new_nexus_file = 'randomseq'+ str(n)+'.nex' - new_nexus_file2 = 'randomseq'+ str(n) - ### creating a list of randomly generated sequences - sequences = [] - for i in range(ntax): - myrandom = [] - for i in range(num_sites): - i = random.randint(1,4) - myrandom.append(i) - dna_list = [] - for m in myrandom: - if m == 1: - m= "A" - if m ==2: - m= "T" - if m ==3: - m= "G" - if m == 4: - m= "C" - dna_list.append(m) - dna_list2 = ''.join(dna_list) - #print dna_list2 - sequences.append(dna_list2) - all_seq.append(sequences) - - - - ### creating a list of taxon_names - taxon_names = [] - k=0 - for i, m in enumerate(sequences): - k += 1 - taxon = "taxon"+ str(k) - taxon_names.append(taxon) - - - ### creating a directory - new_dir = os.path.join(master_dir,folder_name) - if not os.path.exists(new_dir): - os.mkdir(new_dir) - gene_sets = [] -# for i in range(3): - for i in range(num_trials_pergene): - m= i+1 - seed_num = 4648+m - m2 = 'run'+str(m) - gene_sets.append(m2) - new_subdir = os.path.join(new_dir,m2) - if not os.path.exists(new_subdir): - os.mkdir(new_subdir) - - bash_filename = 'qsub.sh' - bash_file_content = '''#$ -S /bin/bash -#$ -cwd -#$ -N %s -#$ -q highpri.q,highmem.q -python runphycas.py -/common/galax/rungalax.sh --treefile trees.t --skip 1 -'''% (new_nexus_file2) - -# # ### saving dna matrix to the directory -# - full_path1 = os.path.join(new_subdir, new_nexus_file) - newf = open(full_path1, 'w') - newf.write('#nexus\n\n') - newf.write('begin data;\n') - newf.write(' dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites)) - newf.write(' format datatype=dna missing=? gap=-;\n') - newf.write(' matrix\n') - longest_taxon_name = max([len(t) for t in taxon_names]) - for t,s in zip(taxon_names, sequences): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n') - newf.close() - - full_path2 = os.path.join(new_subdir, bash_filename) - newf = open(full_path2, 'w') - newf.write(bash_file_content) - newf.close() - - full_path8 = os.path.join(new_subdir, 'runphycas.py') - newf = open(full_path8, 'w') - x = open('runphycas.py', 'r').read() - newf.write(x %(int(seed_num), new_nexus_file)) - -# -# -# - combined_sequences = map(''.join, zip(*all_seq)) - num_sites_combined = len(combined_sequences[0]) - new_dir2 = os.path.join(master_dir,'combinedSeq') - if not os.path.exists(new_dir2): - os.mkdir(new_dir2) - - full_path3 = os.path.join(new_dir2, 'combinedSeq.nex') - newf = open(full_path3, 'w') - newf.write('#nexus\n\n') - newf.write('begin data;\n') - newf.write(' dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites_combined)) - newf.write(' format datatype=dna missing=? gap=-;\n') - newf.write(' matrix\n') - longest_taxon_name = max([len(t) for t in taxon_names]) - for t,s in zip(taxon_names, combined_sequences): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n') - newf.close() - - full_path4 = os.path.join(new_dir2, bash_filename) - newf = open(full_path4, 'w') - newf.write(bash_file_content) - newf.close() - - - - full_path6 = os.path.join(new_dir2, 'runphycas.py') - newf = open(full_path6, 'w') - x = open('runphycas.py', 'r').read() - newf.write(x %(4648, 'combinedSeq.nex')) - - - full_path7 = os.path.join(master_dir, 'submitall.sh') - newf = open(full_path7, 'w') - newf.write('#!/bin/bash\n') - for i in sets: - for m in gene_sets: - newf.write('cd %s/%s; qsub qsub.sh;cd .. ;cd .. ; \n' %(i, m)) - newf.write('cd %s; qsub qsub.sh; cd ..\n' %('combinedSeq')) - newf.close() - - full_path9 = os.path.join(master_dir, 'treelist.txt') - newf = open(full_path9, 'w') - for i in sets: - for m in gene_sets: - newf.write(i+'/'+m+'/trees.t\n') - newf.close() - - - - full_path10 = os.path.join(master_dir, 'galax.sh') - newf = open(full_path10, 'w') - newf.write('#!/bin/bash\n') - newf.write('/common/galax/rungalax.sh --listfile treelist.txt --skip 1') - newf.close() - - - - From 654ed682b27c9401b0ab9d3f6d6a374ea41748b0 Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:54:41 -0400 Subject: [PATCH 9/9] Delete mergeSeq.py --- mergeSeq.py | 453 ---------------------------------------------------- 1 file changed, 453 deletions(-) delete mode 100644 mergeSeq.py diff --git a/mergeSeq.py b/mergeSeq.py deleted file mode 100644 index c28404a..0000000 --- a/mergeSeq.py +++ /dev/null @@ -1,453 +0,0 @@ -import re, os, glob, itertools, fnmatch, sys, shutil -from capture_marglik_pol import capture_script - -from itertools import combinations - -script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) -path = os.path.join(script_dir, 'nexus') -#path = '/Users/suman/Documents/Postdoctoral_projects/Algae_data/two_tupules_run/Mar4/combo_script/nexus' - -first_combo = 700 -last_combo = 770 - -# create output directory, recursively deleting it if it already exists after asking permission -output = 'g-700-770' -if os.path.exists(output): - print 'Directory "%s" exists' % output - answer = raw_input('delete it (y/n)?') - if answer in ['y','yes','Y','Yes','YES']: - shutil.rmtree(output) - else: - sys.exit('Aborting because you did not answer "y" or "yes"') -os.mkdir(output) - -number_to_merge = 2 - -#### counting number of genes to be analyzed -number_of_genes = len(fnmatch.filter(os.listdir(path), '*.nex')) -number_gen = range(1, number_of_genes+1) - -#### counting number of nexus files to be written after combinations -combos = list(itertools.combinations(number_gen, number_to_merge)) -number_of_files = len(combos) -# print number_of_files - -# chunks are pieces of the chloroplast that have not been rearranged -# The largest chunk (1) has 7 genes, the smallest (16-23) have only 1 gene each -chunk_map = { -'rpl23':1, -'rpl2' :1, -'rps19':1, -'rps14':1, -'atpA' :1, -'psbI' :1, -'cemA' :1, -'psbE' :2, -'rps9' :2, -'ycf3' :2, -'rpl36':2, -'petD' :2, -'rpoA' :2, -'psbJ' :3, -'atpI' :3, -'psaJ' :3, -'rps12':3, -'rpl16':4, -'rpl14':4, -'rpl5' :4, -'rps8' :4, -'rpoBa':5, -'rpoBb':5, -'rps3' :5, -'rpoC2':5, -'psaB' :6, -'ccsA' :6, -'psbZ' :6, -'psbM' :6, -'psbF' :7, -'psbL' :7, -'petG' :7, -'psbB' :8, -'psbT' :8, -'rps4' :9, -'clpP' :9, -'rps18':10, -'petB' :10, -'rps7' :11, -'atpE' :11, -'atpH' :12, -'atpF' :12, -'psbH' :13, -'psbK' :13, -'psaC' :14, -'petL' :14, -'rbcL' :15, -'rps11':15, -'rpl20':16, -'tufA' :17, -'psaA' :18, -'psbD' :19, -'atpB' :20, -'psbA' :21, -'psbC' :22, -'psbN' :23 -} - -##### nexus file for each merged pair - -new_way = True -genes = [] -data = {} -gene_javascript_data = [] -print 'Reading nexus files...' -for filename in glob.glob(os.path.join(path, '*.nex')): -# print 'Reading file:',filename - #pol m = re.match('(.+)[.]nex', os.path.basename(filename)) - m = re.match('(.+)[-]stripped[.]nex', os.path.basename(filename)) - gene_name = m.group(1) - - genes.append(gene_name) - f = open(filename, 'r').read() - m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S) - ntax = int(m.group(1)) - m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S) - nchar = int(m.group(1)) - - gene_javascript_data.append((chunk_map[gene_name], gene_name, nchar)) - - m = re.search('Matrix\s+(.+?);', f, re.M | re.S) - matrix = m.group(1).strip() - - matrix_lines = matrix.split('\n') - - taxon_names = [] - sequences = {} - for line in matrix_lines: - parts = line.strip().split() - assert len(parts) == 2 - taxon_name = parts[0] - sequence = parts[1] - taxon_names.append(taxon_name) - sequences[taxon_name] = sequence - - if not new_way: - if len(data) == 0: - for t in taxon_names: - data[t] = [] - - for t in taxon_names: - if new_way: - group = data.setdefault(t, []) - group.append(sequences[t]) - else: - data[t].append(sequences[t]) - -print 'Found %d genes' % len(genes) -for c,g,n in sorted(gene_javascript_data): - print ' {name:"%s", chunk:%d, seqlen:%d},' % (g,c,n) - -taxa_sorted = sorted(data.keys()) -# print "taxa_sorted=", taxa_sorted - -number_of_genes = len(genes) -number_gen = range(1, number_of_genes+1) - -#### counting number of nexus files to be written after combinations -combos = list(itertools.combinations(number_gen, number_to_merge)) -number_of_combos = len(combos) -print 'Number of combinations:',number_of_combos - -combo_list=[] -current_combo = 0 -for a,b in combos: - current_combo += 1 - if current_combo <= first_combo or current_combo > last_combo: - continue - print 'current_combo=', current_combo-1 - - new_taxonlist = [] - se1 = [] - se2 = [] - new_sequences = [] - seq1 = [] - seq2 = [] - genes_merged = (genes[a-1]+'_'+genes[b-1]) -# print 'genes[a-1]==', genes[a-1] -# print 'genes[b-1]==', genes[b-1] -# - for t in taxa_sorted: - new_taxonlist.append((t,data[t][a-1],data[t][b-1])[0]) - new_sequences.append((data[t][a-1]+data[t][b-1])) - seq1.append((data[t][a-1])) - seq2.append((data[t][b-1])) -# print 'new_taxonlist==', new_taxonlist - ntax = len(new_taxonlist) - newnchar = len(new_sequences[0]) - nchar_gene1 = len(seq1[0]) - nchar_gene2 = len(seq2[0]) -# print 'nchar_gene1====', nchar_gene1 -# print 'nchar_gene2====', nchar_gene2 - - new_nexus_file = '%s.nex' % genes_merged - gene_dir = os.path.join(output, '%s' % (genes_merged)) - if not os.path.exists(gene_dir): - os.mkdir(gene_dir) - - combo_list.append(genes_merged) - - mrbayes_single_run1 = '''begin mrbayes; -delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; -charset first = 1-%d\\3; -charset second = 2-%d\\3; -charset third = 3-%d\\3; -partition mine = 3: first, second, third; -set partition=mine; -lset applyto=(all) nst=6 ngammacat=4 rates=gamma; -prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); -unlink shape=(all) statefreq=(all) revmat=(all); -set seed=9223 swapseed=9223; -mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; -[sumt filename=mcmc;] -[mcmcp filename=ss;] -ss alpha=0.3 nsteps=30 burninss=-2; -end; -'''% (nchar_gene1, nchar_gene1, nchar_gene1) - - mrbayes_single_run2 = '''begin mrbayes; -delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; -charset first = 1-%d\\3; -charset second = 2-%d\\3; -charset third = 3-%d\\3; -partition mine = 3: first, second, third; -set partition=mine; -lset applyto=(all) nst=6 ngammacat=4 rates=gamma; -prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); -unlink shape=(all) statefreq=(all) revmat=(all); -set seed=9223 swapseed=9223; -mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; -[sumt filename=mcmc;] -[mcmcp filename=ss;] -ss alpha=0.3 nsteps=30 burninss=-2; -end; -'''% (nchar_gene2, nchar_gene2, nchar_gene2) - - - mrbayes_concat_topo_brlen = '''begin mrbayes; -delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; -charset first = 1-%d\\3; -charset second = 2-%d\\3; -charset third = 3-%d\\3; -charset fourth = %d-%d\\3; -charset fifth = %d-%d\\3; -charset sixth = %d-%d\\3; -partition mine = 6: first, second, third, fourth, fifth, sixth; -set partition=mine; -lset applyto=(all) nst=6 ngammacat=4 rates=gamma; -prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); -unlink shape=(all) statefreq=(all) revmat=(all); -set seed=9223 swapseed=9223; -mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; -[sumt filename=mcmc;] -[mcmcp filename=ss;] -ss alpha=0.3 nsteps=30 burninss=-2; -end; -'''% (nchar_gene1, nchar_gene1, nchar_gene1, nchar_gene1+1, newnchar, nchar_gene1+2, newnchar, nchar_gene1+3,newnchar) - - mrbayes_concat_topo = '''begin mrbayes; -delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri; -charset first = 1-%d\\3; -charset second = 2-%d\\3; -charset third = 3-%d\\3; -charset fourth = %d-%d\\3; -charset fifth = %d-%d\\3; -charset sixth = %d-%d\\3; -partition mine = 6: first, second, third, fourth, fifth, sixth; -set partition=mine; -lset applyto=(all) nst=6 ngammacat=4 rates=gamma; -prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0); -unlink shape=(all) statefreq=(all) revmat=(all) brlens=(all); -set seed=9223 swapseed=9223; -mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss; -[sumt filename=mcmc;] -[mcmcp filename=ss;] -ss alpha=0.3 nsteps=30 burninss=-2; -end; -'''% (nchar_gene1, nchar_gene1, nchar_gene1, nchar_gene1+1, newnchar, nchar_gene1+2, newnchar, nchar_gene1+3,newnchar) - - -#concat_topo -###########submitallscript########### - submitallscript ='''cd %s; qsub qsub.sh; cd .. -cd %s; qsub qsub.sh; cd .. -cd concat_topo_brlen; qsub qsub.sh; cd .. -cd concat_topo; qsub qsub.sh; cd .. -'''%(genes[a-1], genes[b-1]) - submitall = os.path.join(gene_dir, 'submitall.sh') - newf = open(submitall, 'w') - newf.write(submitallscript) -###########submitallscript########### - - -###########qsubscript for concatenated set########### -# qsubscript = '''#!/bin/bash -# #$ -S /bin/bash -# #$ -cwd -# #$ -N concatenated -# #$ -q highpri.q,highmem.q -# mb concatenated.nex -# rm -f mcmc.* ss.*''' -###########qsubscript for concatenated set########### - - -###########gene1########### - single_gene_dir = os.path.join(gene_dir, genes[a-1]) - if not os.path.exists(single_gene_dir): - os.mkdir(single_gene_dir) - output_path = os.path.join(single_gene_dir, ('%s.nex' %(genes[a-1]))) - newf = open(output_path, 'w') - newf.write('#NEXUS\n\n') - newf.write('Begin data;\n') - newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, nchar_gene1)) - newf.write(' Format datatype=dna missing=? gap=-;\n') - newf.write(' Matrix\n') - longest_taxon_name = max([len(t) for t in new_taxonlist]) - for t,s in zip(new_taxonlist, seq1): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n\n') - newf.write(mrbayes_single_run1) - newf.close() - ##qsub_script##### - qsub = os.path.join(single_gene_dir, 'qsub.sh') - newf = open(qsub, 'w') - newf.write('''#!/bin/bash -#$ -S /bin/bash -#$ -cwd -#$ -N %s -#$ -q highpri.q,highmem.q -mb %s.nex -rm -f mcmc.* ss.*'''%(genes[a-1], genes[a-1])) - ##qsub_script##### -###########gene1########### - - -###########gene2########### - single_gene_dir = os.path.join(gene_dir, genes[b-1]) - if not os.path.exists(single_gene_dir): - os.mkdir(single_gene_dir) - output_path = os.path.join(single_gene_dir, ('%s.nex' %(genes[b-1]))) - newf = open(output_path, 'w') - newf.write('#NEXUS\n\n') - newf.write('Begin data;\n') - newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, nchar_gene2)) - newf.write(' Format datatype=dna missing=? gap=-;\n') - newf.write(' Matrix\n') - longest_taxon_name = max([len(t) for t in new_taxonlist]) - for t,s in zip(new_taxonlist, seq2): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n\n') - newf.write(mrbayes_single_run2) - newf.close() - ####qsub_script##### - qsub = os.path.join(single_gene_dir, 'qsub.sh') - newf = open(qsub, 'w') - newf.write('''#!/bin/bash -#$ -S /bin/bash -#$ -cwd -#$ -N %s -#$ -q highpri.q,highmem.q -mb %s.nex -rm -f mcmc.* ss.*'''%(genes[b-1], genes[b-1])) - ####qsub_script##### -###########gene2########### - - -###########concat_topo_brlen########### - concat_dir = os.path.join(gene_dir, 'concat_topo_brlen') - if not os.path.exists(concat_dir): - os.mkdir(concat_dir) - output_path = os.path.join(concat_dir, 'concatenated.nex') - newf = open(output_path, 'w') - newf.write('#NEXUS\n\n') - newf.write('Begin data;\n') - newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, newnchar)) - newf.write(' Format datatype=dna missing=? gap=-;\n') - newf.write(' Matrix\n') - longest_taxon_name = max([len(t) for t in new_taxonlist]) - for t,s in zip(new_taxonlist, new_sequences): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n\n') - newf.write(mrbayes_concat_topo_brlen) - newf.close() - ####qsub_script concat_topo_brlen##### - qsub = os.path.join(concat_dir, 'qsub.sh') - newf = open(qsub, 'w') - newf.write('''#!/bin/bash -#$ -S /bin/bash -#$ -cwd -#$ -N %s_%s_concat_topo_brlen -#$ -q highpri.q,highmem.q -mb concatenated.nex -rm -f mcmc.* ss.*'''%(genes[a-1],genes[b-1])) - ####qsub_script concat_topo_brlen##### -###########concat_topo_brlen########### - - -###########concat_topo########### - concat_dir = os.path.join(gene_dir, 'concat_topo') - if not os.path.exists(concat_dir): - os.mkdir(concat_dir) - output_path = os.path.join(concat_dir, 'concatenated.nex') - newf = open(output_path, 'w') - newf.write('#NEXUS\n\n') - newf.write('Begin data;\n') - newf.write(' Dimensions ntax=%d nchar=%d;\n' % (ntax, newnchar)) - newf.write(' Format datatype=dna missing=? gap=-;\n') - newf.write(' Matrix\n') - longest_taxon_name = max([len(t) for t in new_taxonlist]) - for t,s in zip(new_taxonlist, new_sequences): - formatstr = '%%%ds' % longest_taxon_name - namestr = formatstr % t - newf.write(' %s %s\n' % (namestr, s)) - newf.write(';\n') - newf.write('end;\n\n') - newf.write(mrbayes_concat_topo) - newf.close() - ####qsub_script concat_topo##### - qsub = os.path.join(concat_dir, 'qsub.sh') - newf = open(qsub, 'w') - newf.write('''#!/bin/bash -#$ -S /bin/bash -#$ -cwd -#$ -N %s_%s_concat_topo -#$ -q highpri.q,highmem.q -mb concatenated.nex -rm -f mcmc.* ss.*'''%(genes[a-1],genes[b-1])) - ####qsub_script concat_topo##### -###########concat_topo########### - - -####### go.sh ######### -jobscript = os.path.join(output, 'go.sh') -newf = open(jobscript, 'w') -newf.write('#!/bin/bash\n\n') -for i in combo_list: - newf.write('cd %s\n. submitall.sh\ncd ..\n\n' %(i)) -newf.close() - -####### capture_marg.py ######### -jobscript2 = os.path.join(output, 'capture_marg.py') -newf = open(jobscript2, 'w') -newf.write(capture_script) -newf.close() - - -