From 938c6fd892e3dc572ac91283f58c65b1a40e565b Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:53:23 -0400
Subject: [PATCH 1/9] Delete brnlenMCMC.py

---
 brnlenMCMC.py | 494 --------------------------------------------------
 1 file changed, 494 deletions(-)
 delete mode 100644 brnlenMCMC.py

diff --git a/brnlenMCMC.py b/brnlenMCMC.py
deleted file mode 100644
index 87c6160..0000000
--- a/brnlenMCMC.py
+++ /dev/null
@@ -1,494 +0,0 @@
-##########################################################################################
-# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree
-# topology, and computes log-likelihood of the topology under Jukes Cantor+GAMMA model, 
-# and performs MCMC on branch length parameter
-##########################################################################################
-
-
-import readSeq
-import random
-import re, os, itertools, sys, glob
-from itertools import chain
-from scipy.stats import  gamma
-from math import exp, log
-
-
-##########################################################################################
-tree_file_name = 'tree.tre'
-sequence_file = 'example3.nex'
-alpha = 0.5 #gamma shape parameter for rate categories
-n_gen = 50000
-save_every = 50
-mean_expo = 10. #mean_expo = mean of exponential distribution for branch length prior
-##########################################################################################
-    
-
-
-
-class node(object):
-    def __init__(self, ndnum):              # initialization function
-        self.rsib = None                    # right sibling
-        self.lchild = None                  # left child
-        self.par = None                     # parent node
-        self.number = ndnum                 # node number (internals negative, tips 0 or positive)
-        self.edgelen = 0.0                  # branch length
-        self.descendants = set([ndnum])     # set containing descendant leaf set
-        self.partial = None                 # will have length 4*npatterns
-        
-
-    def __str__(self):
-        # __str__ is a built-in function that is used by print to show an object
-        descendants_as_string = ','.join(['%d' % d for d in self.descendants])
-
-        lchildstr = 'None'
-        if self.lchild is not None:
-            lchildstr = '%d' % self.lchild.number
-            
-        rsibstr = 'None'
-        if self.rsib is not None:
-            rsibstr = '%d' % self.rsib.number
-            
-        parstr = 'None'
-        if self.par is not None:
-            parstr = '%d' % self.par.number
-            
-        return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)
- 
-    
- 
-def allocatePartial(node, patterns, rates):
-    if node.number > 0:
-        npatterns = len(patterns)
-
-        if node.partial is None:
-            node.partial = [0.0]*(4*4*npatterns)
-#             print len(node.partial)
-        for i,pattern in enumerate(patterns.keys()):
-            base = pattern[node.number-1]
-            for l in range(4):
-                if base == 'A':
-                
-                
-                    node.partial[i*16+l*4 + 0] = 1.0
-                elif base == 'C':
-                    node.partial[i*16+l*4 + 1] = 1.0
-                elif base == 'G':
-                    node.partial[i*16+l*4 + 2] = 1.0
-                elif base == 'T':
-                    node.partial[i*16+l*4 + 3] = 1.0
-                else:
-                    assert(False), 'oops, something went horribly wrong!'
-                        
-    else:
-        
-        npatterns = len(patterns)
-        if node.partial is None:
-            node.partial = [0.0]*(4*4*npatterns)
-
-        like_list = []
-        for i,pattern in enumerate(patterns.keys()):
-            m_list = []
-            num_pattern = patterns[pattern]
-
-            for l,m in enumerate(rates):
-            
-                psame = (0.25+0.75*exp(-4.0*m*(node.lchild.edgelen)/3.0))
-                pdiff = (0.25-0.25*exp(-4.0*m*(node.lchild.edgelen)/3.0))
-
-                psame2 = (0.25+0.75*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0))
-                pdiff2 = (0.25-0.25*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0))
-
-                num_pattern = patterns[pattern]                    
-                pAA = psame*(node.lchild.partial[i*16+l*4 + 0])
-                pAC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
-                pAG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
-                pAT = pdiff*(node.lchild.partial[i*16+l*4 + 3])
-
-                pAA2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 0])
-                pAC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
-                pAG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
-                pAT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])
-           
-                pfromA_lchild = pAA+pAC+pAG+pAT
-                pfromA_rchild = pAA2+pAC2+pAG2+pAT2
-                node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild
-
-
-                ######################################################
-
-                pCA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
-                pCC = psame*(node.lchild.partial[i*16+l*4 + 1])
-                pCG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
-                pCT = pdiff*(node.lchild.partial[i*16+l*4 + 3])
-
-                pCA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
-                pCC2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 1])
-                pCG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
-                pCT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])
-            
-                pfromC_lchild = pCA+pCC+pCG+pCT
-                pfromC_rchild = pCA2+pCC2+pCG2+pCT2
-                node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild
-            
-                #######################################################
-# 
-                pGA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
-                pGC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
-                pGG = psame*(node.lchild.partial[i*16+l*4 + 2])
-                pGT = pdiff*(node.lchild.partial[i*16+l*4 + 3])
-
-                pGA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
-                pGC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
-                pGG2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 2])
-                pGT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])
-                
-                pfromG_lchild = pGA+pGC+pGG+pGT
-                pfromG_rchild = pGA2+pGC2+pGG2+pGT2
-                node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild
-           
-                #######################################################
-            
-                pTA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
-                pTC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
-                pTG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
-                pTT = psame*(node.lchild.partial[i*16+l*4 + 3])
-
-                pTA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
-                pTC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
-                pTG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
-                pTT2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 3])
-
-                pfromT_lchild = pTA+pTC+pTG+pTT
-                pfromT_rchild = pTA2+pTC2+pTG2+pTT2
-                node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild
-            
-            site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25
-            site_log_like = (log(site_like))*num_pattern
-            like_list.append(site_log_like)
-        log_like = sum(like_list)           
-        return log_like
-
-    
-
-def mcmcbrn(postorder, patterns, rates):
-    nodes = readnewick(treenewick())
-    mcmc = 0
-    output = os.path.join('brnlenMCMC_results.txt')
-    newf = open(output, 'w')
-    newf.write('%s\t'%('n_gen'))  
-    newf.write( '%s\t%s\t'%('LnL','LnPr'))
-    for nl in postorder:
-        newf.write( 'node%s\t'%(nl.number))
-    newf.write('\n')
-    start_log_prior = 0.0
-    for nd in nodes:
-        start_log_prior += (-nd.edgelen/mean_expo)-(log(mean_expo))
-    start_log_like = prepareTree(nodes, patterns, rates)
-   
-
-    newf.write('%s\t'%(mcmc)) 
-    print 'mcmc gen=', mcmc
-    print start_log_like, start_log_prior, 
-
-    newf.write( '%.6f\t%.6f\t'%(start_log_like,start_log_prior))
-    for nl in postorder:
-        newf.write( '%.6f\t'%(nl.edgelen))
-        print nl.edgelen,
-    print 
-    print '**************************'
-
-    newf.write('\n')
-#     print
-    for r in range(n_gen):
-        for i in range(len(postorder)):
-            preedgelen = nodes[i].edgelen
-            currentlike = prepareTree(nodes, patterns, rates)
-#             currentlike = 0.0
-            currentprior = 0.0
-            for nd in nodes:
-                currentprior += (-nd.edgelen/mean_expo)-(log(mean_expo))   
-            current_ln_posterior = currentlike + currentprior
-
-            u = random.random()
-            m = exp(0.2*(u-0.5))
-            nodes[i].edgelen = preedgelen*m
-            proposedprior = 0.0
-            for nd in nodes:
-                proposedprior += (-nd.edgelen/mean_expo)-(log(mean_expo))   
-
-            proposedlike = prepareTree(nodes, patterns, rates)
-            proposed_ln_posterior = proposedlike + proposedprior
-            hastings_ratio = log(m)
-            logR = proposed_ln_posterior - current_ln_posterior + hastings_ratio
-            u2 = random.random()
-            if log(u2) < logR:
-                nodes[i].edgelen = nodes[i].edgelen
-                log_prior = proposedprior
-                log_likelihood = proposedlike
-#                 print 'log(u2) < logR so new proposal accepted..'
-            else:
-                nodes[i].edgelen = preedgelen
-                log_prior = currentprior
-                log_likelihood = currentlike
-
-#                 print 'log(u2) > logR so failed to accept new proposal..'
-
-        if (r+1) % save_every == 0:        
-            newf.write('%s\t'%(mcmc+1))
-            print 'mcmc gen=', mcmc+1
-            print log_likelihood,log_prior,
-            newf.write( '%.6f\t%.6f\t'%(log_likelihood,log_prior))
-            for j,k in enumerate(nodes):
-                newf.write( '%.6f\t'%(k.edgelen))
-                print k.edgelen,
-            newf.write('\n')
-            print 
-            print '**************************'
-
-            newf.flush()
-
-        mcmc+=1
-       
-        
-def treenewick():
-    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
-    path = os.path.join(script_dir, tree_file_name)
-    with open(path, 'r') as content:
-        newick = content.read()
-    return newick
-# 
-
-def gammaRates(alpha):
-    bounds = [0.0, 0.25, 0.50, 0.75, 1.]
-    rates = []
-    for i in range(4):
-#         print i
-        lower = gamma.ppf(bounds[i], alpha, 0, 1./alpha)
-        upper = gamma.ppf(bounds[i+1], alpha, 0, 1./alpha)
-        mean_rate = ((gamma.cdf(upper, alpha+1., 0, 1./alpha) - gamma.cdf(lower, alpha+1., 0, 1./alpha)))*4.
-        rates.append(mean_rate)
-    return rates
-
-def prepareTree(postorder, patterns, rates):
-    likelihood_lists = []
-    for nd in postorder:
-        likelihood_lists.append(allocatePartial(nd, patterns, rates))
-#     print 'log-likelihood of the topology =', likelihood_lists[-1]
-    return likelihood_lists[-1]
-
-def joinRandomPair(node_list, next_node_number, is_deep_coalescence):
-    # pick first of two lineages to join and delete from node_list
-    i = random.randint(1, len(node_list))
-    ndi = node_list[i-1]
-    del node_list[i-1]
-
-    # pick second of two lineages to join and delete from node_list
-    j = random.randint(1, len(node_list))
-    ndj = node_list[j-1]
-    del node_list[j-1]
-
-    # join selected nodes and add ancestor to node_list
-    ancnd = node(next_node_number)
-    ancnd.deep = is_deep_coalescence
-    ancnd.lchild = ndi
-    ancnd.descendants = set()
-    ancnd.descendants |= ndi.descendants
-    ancnd.descendants |= ndj.descendants
-    ndi.rsib = ndj
-    ndi.par = ancnd
-    ndj.par = ancnd
-    node_list.append(ancnd)
-
-    return node_list
-
-
-def makeNewick(nd, brlen_scaler = 1.0, start = True): #
-    global _newick
-    global _TL
-
-    if start:
-        _newick = ''
-        _TL = 0.0
-
-    if nd.lchild:
-        _newick += '('
-        makeNewick(nd.lchild, brlen_scaler, False)
-
-    else:
-        blen = nd.edgelen*brlen_scaler
-        _TL += blen
-        _newick += '%d:%.5f' % (nd.number, blen)
-
-    if nd.rsib:
-        _newick += ','
-        makeNewick(nd.rsib, brlen_scaler, False)
-    elif nd.par is not None:
-        blen = nd.par.edgelen*brlen_scaler
-        _TL += blen
-        _newick += '):%.3f' % blen
-
-    return _newick, _TL
-
-def calcActualHeight(root): 
-    h = 0.0
-    nd = root
-    while nd.lchild:
-        nd = nd.lchild
-        h += nd.edgelen
-    return h
-
-
-def readnewick(tree):
-    total_length = len(tree)
-    internal_node_number = -1
-
-    root = node(internal_node_number)
-    nd = root
-    i = 0
-    pre = [root]
-    while i < total_length:
-        m = tree[i]
-
-        if m =='(':
-            internal_node_number -= 1
-
-            child = node(internal_node_number)
-            pre.append(child)
-            nd.lchild=child
-
-            child.par=nd
-            nd=child
-        elif m == ',':
-            internal_node_number -= 1
-            rsib = node(internal_node_number)
-            pre.append(rsib)
-            nd.rsib = rsib
-            rsib.par=nd.par
-            nd = rsib
-        elif m == ')':
-            nd = nd.par
-    
-        elif m == ':':
-            edge_len_str = ''
-            i+=1
-            m = tree[i]
-            assert m in ['0','1','2','3','4','5','6','7','8', '9','.']
-            while m in ['0','1','2','3','4','5','6','7','8', '9','.']:
-                edge_len_str += m
-                i+=1
-                m = tree[i]
-            i -=1
-            nd.edgelen = float(edge_len_str)
-
-
-        else:
-            internal_node_number += 1
-
-            if True:
-                assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m
-                mm = ''
-                while m in ['0','1','2','3','4','5','6','7','8', '9' ]:
-
-                    mm += m
-                
-                    i += 1
-                    m = tree[i]
-                nd.number = int(mm)
-                i -= 1
-
-        i += 1
-
-    post = pre[:]
-    post.reverse()
-    return post
-
-def Makenewick(pre):
-    newickstring = ''
-    for i,nd in enumerate(pre):
-        if nd.lchild:
-            newickstring += '('
-       
-        elif nd.rsib:
-            newickstring += '%d' %(nd.number)
-            newickstring += ':%.1f' % nd.edgelen
-            newickstring += ','
-
-        else:
-            newickstring += '%d' %(nd.number)
-            newickstring += ':%.1f' % nd.edgelen
-            tmpnd = nd
-            while (tmpnd.par is not None) and (tmpnd.rsib is None):
-                newickstring += ')'
-                newickstring += ':%.1f' % tmpnd.par.edgelen
-                tmpnd = tmpnd.par
-
-            if tmpnd.par is not None:
-                newickstring += ','
-    return newickstring
-
-###################yule tree###################################################
-# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree
-# - num_species is the number of leaves (tips) in the tree
-def calcPhi(num_species):
-    phi = sum([1.0/(K+2.0) for K in range(num_species-1)])
-    return phi
-
-# yuleTree creates a species tree in which edge lengths are measured in
-# expected number of substitutions.
-# - num_species is the number of leaves
-# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
-def yuleTree(num_species, mu_over_s):
-    # create num_species nodes numbered 1, 2, ..., num_species
-    nodes = [node(i+1) for i in range(num_species)]
-
-    next_node_number = num_species + 1
-    while len(nodes) > 1:
-        # choose a speciation time in generations
-        K = float(len(nodes))
-        mean_epoch_length = mu_over_s/K
-        t = random.gammavariate(1.0, mean_epoch_length)
-
-        # update each node's edgelen
-        for n in nodes:
-            n.edgelen += t      # same as: n.edgelen = n.edgelen + t
-
-        nodes = joinRandomPair(nodes, next_node_number, False)
-        next_node_number += 1
-
-    return nodes[0]
-
-# calcExpectedHeight returns the expected height of the species tree in terms of
-# expected number of substitutions from the root to one tip.
-# - num_species is the number of leaves
-# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
-def calcExpectedHeight(num_species, mu_over_s):
-    return mu_over_s*calcPhi(num_species)
-
-
-if __name__ == '__main__':
-    random_seed = 348889 # 7632557, 12345
-    number_of_species = 5
-    mutation_speciation_rate_ratio = 0.689655172 # 0.689655172  # yields tree height 1 for 6 species
-    random.seed(random_seed)
-    species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio)
-#     print '#########'
-#     print species_tree_root
-    newick = makeNewick(species_tree_root)
-#     print 'Random number seed: %d' % random_seed
-#     print 'Simulating one tree:'
-#     print '  number of species              = %d' % number_of_species
-#     print '  mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio
-#     print '  actual tree length             =',newick[1]
-    expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio)
-#     print '  expected height                =',expected_height
-    actual_height = calcActualHeight(species_tree_root)
-#     print '  actual height                  =',actual_height
-    print 'true tree: ',newick[0]
-    print '**************************'
-    
-#     yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)'
-    rates_list = gammaRates(alpha)
-    postorder = readnewick(treenewick())
-    result =  prepareTree(postorder, readSeq.patterns(sequence_file), rates_list)
-#     try1 = readSeq.patterns()
-    result2 = mcmcbrn(postorder, readSeq.patterns(sequence_file), rates_list)

From 9368821e7aa6ac32b6968be3d3bbbf0b6a05675e Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:53:36 -0400
Subject: [PATCH 2/9] Delete treeLike_JCGAMMA.py

---
 treeLike_JCGAMMA.py | 413 --------------------------------------------
 1 file changed, 413 deletions(-)
 delete mode 100644 treeLike_JCGAMMA.py

diff --git a/treeLike_JCGAMMA.py b/treeLike_JCGAMMA.py
deleted file mode 100644
index 6a777cf..0000000
--- a/treeLike_JCGAMMA.py
+++ /dev/null
@@ -1,413 +0,0 @@
-##################################
-# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree
-# topology, and computes log-likelihood of the topology under Jukes Cantor+GAMMA model
-###################################
-
-
-import readSeq
-import random
-import re, os, itertools, sys, glob
-from itertools import chain
-from scipy.stats import  gamma
-from math import exp, log
-
-##########################################################################################
-tree_file_name = 'tree.tre'
-sequence_file = 'example3.nex'
-alpha = 0.5 #gamma shape parameter for rate categories
-##########################################################################################
-
-
-
-
-
-class node(object):
-    def __init__(self, ndnum):              # initialization function
-        self.rsib = None                    # right sibling
-        self.lchild = None                  # left child
-        self.par = None                     # parent node
-        self.number = ndnum                 # node number (internals negative, tips 0 or positive)
-        self.edgelen = 0.0                  # branch length
-        self.descendants = set([ndnum])     # set containing descendant leaf set
-        self.partial = None                 # will have length 4*npatterns
-        
-
-
-         
-    def __str__(self):
-        # __str__ is a built-in function that is used by print to show an object
-        descendants_as_string = ','.join(['%d' % d for d in self.descendants])
-
-        lchildstr = 'None'
-        if self.lchild is not None:
-            lchildstr = '%d' % self.lchild.number
-            
-        rsibstr = 'None'
-        if self.rsib is not None:
-            rsibstr = '%d' % self.rsib.number
-            
-        parstr = 'None'
-        if self.par is not None:
-            parstr = '%d' % self.par.number
-            
-        return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)
- 
- 
-def allocatePartial(node, patterns, rates):
-    if node.number > 0:
-        npatterns = len(patterns)
-#             print 'npat', npatterns
-        node.partial = [0.0]*(4*4*npatterns)
-#             print len(node.partial)
-        for i,pattern in enumerate(patterns.keys()):
-            base = pattern[node.number-1]
-            for l in range(4):
-                if base == 'A':
-                    node.partial[i*16+l*4 + 0] = 1.0
-                elif base == 'C':
-                    node.partial[i*16+l*4 + 1] = 1.0
-                elif base == 'G':
-                    node.partial[i*16+l*4 + 2] = 1.0
-                elif base == 'T':
-                    node.partial[i*16+l*4 + 3] = 1.0
-                else:
-                    assert(False), 'oops, something went horribly wrong!'
-                        
-    else:
-#             rt = [0.03338775, 0.25191592, 0.82026848, 2.89442785]
-#             rt = [2.89442785]
-
-#             rt = [1.0, 1.0, 1.0, 1.0]
-
-        npatterns = len(patterns)
-#             print 'patterns=', patterns
-        node.partial = [0.0]*(4*4*npatterns)
-        like_list = []
-        for i,pattern in enumerate(patterns.keys()):
-#                 print i, pattern, patterns.keys()
-            m_list = []
-            num_pattern = patterns[pattern]
-#                 print num_pattern
-
-            for l,m in enumerate(rates):
-            
-
-                psame = (0.25+0.75*exp(-4.0*m*(node.lchild.edgelen)/3.0))
-                pdiff = (0.25-0.25*exp(-4.0*m*(node.lchild.edgelen)/3.0))
-
-                psame2 = (0.25+0.75*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0))
-                pdiff2 = (0.25-0.25*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0))
-
-                num_pattern = patterns[pattern]                    
-                pAA = psame*(node.lchild.partial[i*16+l*4 + 0])
-                pAC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
-                pAG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
-                pAT = pdiff*(node.lchild.partial[i*16+l*4 + 3])
-
-                pAA2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 0])
-                pAC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
-                pAG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
-                pAT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])
-           
-                pfromA_lchild = pAA+pAC+pAG+pAT
-                pfromA_rchild = pAA2+pAC2+pAG2+pAT2
-                node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild
-
-
-                ######################################################
-
-                pCA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
-                pCC = psame*(node.lchild.partial[i*16+l*4 + 1])
-                pCG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
-                pCT = pdiff*(node.lchild.partial[i*16+l*4 + 3])
-
-                pCA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
-                pCC2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 1])
-                pCG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
-                pCT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])
-            
-                pfromC_lchild = pCA+pCC+pCG+pCT
-                pfromC_rchild = pCA2+pCC2+pCG2+pCT2
-                node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild
-            
-                #######################################################
-# 
-                pGA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
-                pGC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
-                pGG = psame*(node.lchild.partial[i*16+l*4 + 2])
-                pGT = pdiff*(node.lchild.partial[i*16+l*4 + 3])
-
-                pGA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
-                pGC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
-                pGG2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 2])
-                pGT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])
-                
-                pfromG_lchild = pGA+pGC+pGG+pGT
-                pfromG_rchild = pGA2+pGC2+pGG2+pGT2
-                node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild
-           
-                #######################################################
-            
-                pTA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
-                pTC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
-                pTG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
-                pTT = psame*(node.lchild.partial[i*16+l*4 + 3])
-
-                pTA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
-                pTC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
-                pTG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
-                pTT2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 3])
-
-                pfromT_lchild = pTA+pTC+pTG+pTT
-                pfromT_rchild = pTA2+pTC2+pTG2+pTT2
-                node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild
-            
-            site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25
-            site_log_like = (log(site_like))*num_pattern
-            like_list.append(site_log_like)
-        log_like = sum(like_list)           
-        return log_like
-
-        
-def treenewick():
-    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
-    path = os.path.join(script_dir, tree_file_name)
-    with open(path, 'r') as content:
-        newick = content.read()
-    return newick
-# 
-
-def gammaRates(alpha):
-    bounds = [0.0, 0.25, 0.50, 0.75, 1.]
-    rates = []
-    for i in range(4):
-#         print i
-        lower = gamma.ppf(bounds[i], alpha, 0, 1./alpha)
-        upper = gamma.ppf(bounds[i+1], alpha, 0, 1./alpha)
-        mean_rate = ((gamma.cdf(upper, alpha+1., 0, 1./alpha) - gamma.cdf(lower, alpha+1., 0, 1./alpha)))*4.
-        rates.append(mean_rate)
-    return rates
-
-def prepareTree(postorder, patterns, rates):
-    likelihood_lists = []
-    for nd in postorder:
-        likelihood_lists.append(allocatePartial(nd, patterns, rates))
-    print 'log-likelihood of the topology =', likelihood_lists[-1]
-
-
-def joinRandomPair(node_list, next_node_number, is_deep_coalescence):
-    # pick first of two lineages to join and delete from node_list
-    i = random.randint(1, len(node_list))
-    ndi = node_list[i-1]
-    del node_list[i-1]
-
-    # pick second of two lineages to join and delete from node_list
-    j = random.randint(1, len(node_list))
-    ndj = node_list[j-1]
-    del node_list[j-1]
-
-    # join selected nodes and add ancestor to node_list
-    ancnd = node(next_node_number)
-    ancnd.deep = is_deep_coalescence
-    ancnd.lchild = ndi
-    ancnd.descendants = set()
-    ancnd.descendants |= ndi.descendants
-    ancnd.descendants |= ndj.descendants
-    ndi.rsib = ndj
-    ndi.par = ancnd
-    ndj.par = ancnd
-    node_list.append(ancnd)
-
-    return node_list
-
-
-def makeNewick(nd, brlen_scaler = 1.0, start = True): #
-    global _newick
-    global _TL
-
-    if start:
-        _newick = ''
-        _TL = 0.0
-
-    if nd.lchild:
-        _newick += '('
-        makeNewick(nd.lchild, brlen_scaler, False)
-
-    else:
-        blen = nd.edgelen*brlen_scaler
-        _TL += blen
-        _newick += '%d:%.5f' % (nd.number, blen)
-
-    if nd.rsib:
-        _newick += ','
-        makeNewick(nd.rsib, brlen_scaler, False)
-    elif nd.par is not None:
-        blen = nd.par.edgelen*brlen_scaler
-        _TL += blen
-        _newick += '):%.3f' % blen
-
-    return _newick, _TL
-
-def calcActualHeight(root): 
-    h = 0.0
-    nd = root
-    while nd.lchild:
-        nd = nd.lchild
-        h += nd.edgelen
-    return h
-
-
-def readnewick(tree):
-    total_length = len(tree)
-    internal_node_number = -1
-
-    root = node(internal_node_number)
-    nd = root
-    i = 0
-    pre = [root]
-    while i < total_length:
-        m = tree[i]
-
-        if m =='(':
-            internal_node_number -= 1
-
-            child = node(internal_node_number)
-            pre.append(child)
-            nd.lchild=child
-
-            child.par=nd
-            nd=child
-        elif m == ',':
-            internal_node_number -= 1
-            rsib = node(internal_node_number)
-            pre.append(rsib)
-            nd.rsib = rsib
-            rsib.par=nd.par
-            nd = rsib
-        elif m == ')':
-            nd = nd.par
-    
-        elif m == ':':
-            edge_len_str = ''
-            i+=1
-            m = tree[i]
-            assert m in ['0','1','2','3','4','5','6','7','8', '9','.']
-            while m in ['0','1','2','3','4','5','6','7','8', '9','.']:
-                edge_len_str += m
-                i+=1
-                m = tree[i]
-            i -=1
-            nd.edgelen = float(edge_len_str)
-
-
-        else:
-            internal_node_number += 1
-
-            if True:
-                assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m
-                mm = ''
-                while m in ['0','1','2','3','4','5','6','7','8', '9' ]:
-
-                    mm += m
-                
-                    i += 1
-                    m = tree[i]
-                nd.number = int(mm)
-                i -= 1
-
-        i += 1
-
-    post = pre[:]
-    post.reverse()
-    for nd in post:
-        print nd.number, nd.edgelen
-    return post
-
-def Makenewick(pre):
-    newickstring = ''
-    for i,nd in enumerate(pre):
-        if nd.lchild:
-            newickstring += '('
-       
-        elif nd.rsib:
-            newickstring += '%d' %(nd.number)
-            newickstring += ':%.1f' % nd.edgelen
-            newickstring += ','
-
-        else:
-            newickstring += '%d' %(nd.number)
-            newickstring += ':%.1f' % nd.edgelen
-            tmpnd = nd
-            while (tmpnd.par is not None) and (tmpnd.rsib is None):
-                newickstring += ')'
-                newickstring += ':%.1f' % tmpnd.par.edgelen
-                tmpnd = tmpnd.par
-
-            if tmpnd.par is not None:
-                newickstring += ','
-    return newickstring
-
-###################yule tree###################################################
-# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree
-# - num_species is the number of leaves (tips) in the tree
-def calcPhi(num_species):
-    phi = sum([1.0/(K+2.0) for K in range(num_species-1)])
-    return phi
-
-# yuleTree creates a species tree in which edge lengths are measured in
-# expected number of substitutions.
-# - num_species is the number of leaves
-# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
-def yuleTree(num_species, mu_over_s):
-    # create num_species nodes numbered 1, 2, ..., num_species
-    nodes = [node(i+1) for i in range(num_species)]
-
-    next_node_number = num_species + 1
-    while len(nodes) > 1:
-        # choose a speciation time in generations
-        K = float(len(nodes))
-        mean_epoch_length = mu_over_s/K
-        t = random.gammavariate(1.0, mean_epoch_length)
-
-        # update each node's edgelen
-        for n in nodes:
-            n.edgelen += t      # same as: n.edgelen = n.edgelen + t
-
-        nodes = joinRandomPair(nodes, next_node_number, False)
-        next_node_number += 1
-
-    return nodes[0]
-
-# calcExpectedHeight returns the expected height of the species tree in terms of
-# expected number of substitutions from the root to one tip.
-# - num_species is the number of leaves
-# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
-def calcExpectedHeight(num_species, mu_over_s):
-    return mu_over_s*calcPhi(num_species)
-
-
-if __name__ == '__main__':
-    random_seed = 24553 # 7632557, 12345
-    number_of_species = 5
-    mutation_speciation_rate_ratio = 0.4 # 0.689655172  # yields tree height 1 for 6 species
-    random.seed(random_seed)
-    species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio)
-#     print '#########'
-#     print species_tree_root
-    newick = makeNewick(species_tree_root)
-#     print 'Random number seed: %d' % random_seed
-#     print 'Simulating one tree:'
-#     print '  number of species              = %d' % number_of_species
-#     print '  mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio
-#     print '  actual tree length             =',newick[1]
-    expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio)
-#     print '  expected height                =',expected_height
-    actual_height = calcActualHeight(species_tree_root)
-#     print '  actual height                  =',actual_height
-#     print '  newick: ',newick[0]
-    
-    alpha = 0.5 ### gamma shape parameter rate categories
-#     yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)'
-    rates_list = gammaRates(alpha)
-    postorder = readnewick(treenewick())
-    result =  prepareTree(postorder, readSeq.patterns(sequence_file), rates_list)

From c51f63b08358e5c2ce494388b4370fb84daac6af Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:53:43 -0400
Subject: [PATCH 3/9] Delete treeLike.py

---
 treeLike.py | 374 ----------------------------------------------------
 1 file changed, 374 deletions(-)
 delete mode 100644 treeLike.py

diff --git a/treeLike.py b/treeLike.py
deleted file mode 100644
index 1142e5e..0000000
--- a/treeLike.py
+++ /dev/null
@@ -1,374 +0,0 @@
-##################################
-# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree
-# topology, and computes log-likelihood of the topology under Jukes Cantor model
-###################################
-
-
-import readSeq
-import random
-import re, os, itertools, sys, glob
-from itertools import chain
-from math import exp, log
-
-##########################################################################################
-tree_file_name = 'tree.tre'
-sequence_file = 'example3.nex'
-##########################################################################################
-
-
-class node(object):
-    def __init__(self, ndnum):              # initialization function
-        self.rsib = None                    # right sibling
-        self.lchild = None                  # left child
-        self.par = None                     # parent node
-        self.number = ndnum                 # node number (internals negative, tips 0 or positive)
-        self.edgelen = 0.0                  # branch length
-        self.descendants = set([ndnum])     # set containing descendant leaf set
-        self.partial = None                 # will have length 4*npatterns
-        
-         
-    def __str__(self):
-        # __str__ is a built-in function that is used by print to show an object
-        descendants_as_string = ','.join(['%d' % d for d in self.descendants])
-
-        lchildstr = 'None'
-        if self.lchild is not None:
-            lchildstr = '%d' % self.lchild.number
-            
-        rsibstr = 'None'
-        if self.rsib is not None:
-            rsibstr = '%d' % self.rsib.number
-            
-        parstr = 'None'
-        if self.par is not None:
-            parstr = '%d' % self.par.number
-            
-        return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)
-
-def allocatePartial(node, patterns):
-    if node.number > 0:
-        npatterns = len(patterns)
-        node.partial = [0.0]*(4*npatterns)
-        for i,pattern in enumerate(patterns.keys()):
-            base = pattern[node.number-1]
-            if base == 'A':
-                node.partial[i*4 + 0] = 1.0
-            elif base == 'C':
-                node.partial[i*4 + 1] = 1.0
-            elif base == 'G':
-                node.partial[i*4 + 2] = 1.0
-            elif base == 'T':
-                node.partial[i*4 + 3] = 1.0
-            else:
-                assert(False), 'oops, something went horribly wrong!'
-                        
-    else:
-        npatterns = len(patterns)
-        node.partial = [0.0]*(4*npatterns)
-        like_list = []
-        for i,pattern in enumerate(patterns.keys()):
-            psame = (0.25+0.75*exp(-4.0*(node.lchild.edgelen)/3.0))
-            pdiff = (0.25-0.25*exp(-4.0*(node.lchild.edgelen)/3.0))
-
-            psame2 = (0.25+0.75*exp(-4.0*(node.lchild.rsib.edgelen)/3.0))
-            pdiff2 = (0.25-0.25*exp(-4.0*(node.lchild.rsib.edgelen)/3.0))
-
-            num_pattern = patterns[pattern]
-            
-            pAA = psame*(node.lchild.partial[i*4 + 0])
-            pAC = pdiff*(node.lchild.partial[i*4 + 1])
-            pAG = pdiff*(node.lchild.partial[i*4 + 2])
-            pAT = pdiff*(node.lchild.partial[i*4 + 3])
-
-            pAA2 = psame2*(node.lchild.rsib.partial[i*4 + 0])
-            pAC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1])
-            pAG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2])
-            pAT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3])
-           
-            pfromA_lchild = pAA+pAC+pAG+pAT
-            pfromA_rchild = pAA2+pAC2+pAG2+pAT2
-            node.partial[i*4 + 0] = pfromA_lchild*pfromA_rchild
-
-            ######################################################
-
-            pCA = pdiff*(node.lchild.partial[i*4 + 0])
-            pCC = psame*(node.lchild.partial[i*4 + 1])
-            pCG = pdiff*(node.lchild.partial[i*4 + 2])
-            pCT = pdiff*(node.lchild.partial[i*4 + 3])
-
-            pCA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0])
-            pCC2 = psame2*(node.lchild.rsib.partial[i*4 + 1])
-            pCG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2])
-            pCT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3])
-            
-            pfromC_lchild = pCA+pCC+pCG+pCT
-            pfromC_rchild = pCA2+pCC2+pCG2+pCT2
-            node.partial[i*4 + 1] = pfromC_lchild*pfromC_rchild
-            
-            #######################################################
-# 
-            pGA = pdiff*(node.lchild.partial[i*4 + 0])
-            pGC = pdiff*(node.lchild.partial[i*4 + 1])
-            pGG = psame*(node.lchild.partial[i*4 + 2])
-            pGT = pdiff*(node.lchild.partial[i*4 + 3])
-# 
-            pGA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0])
-            pGC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1])
-            pGG2 = psame2*(node.lchild.rsib.partial[i*4 + 2])
-            pGT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3])
-#                 
-            pfromG_lchild = pGA+pGC+pGG+pGT
-            pfromG_rchild = pGA2+pGC2+pGG2+pGT2
-            node.partial[i*4 + 2] = pfromG_lchild*pfromG_rchild
-            
-            #######################################################
-            
-            pTA = pdiff*(node.lchild.partial[i*4 + 0])
-            pTC = pdiff*(node.lchild.partial[i*4 + 1])
-            pTG = pdiff*(node.lchild.partial[i*4 + 2])
-            pTT = psame*(node.lchild.partial[i*4 + 3])
-# 
-            pTA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0])
-            pTC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1])
-            pTG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2])
-            pTT2 = psame2*(node.lchild.rsib.partial[i*4 + 3])
-# 
-            pfromT_lchild = pTA+pTC+pTG+pTT
-            pfromT_rchild = pTA2+pTC2+pTG2+pTT2
-            node.partial[i*4 + 3] = pfromT_lchild*pfromT_rchild
-            
-            #########################################################
-        
-            site_log_like = (log((sum(node.partial[i*4:i*4+4]))*0.25))*num_pattern
-            like_list.append(site_log_like)
-
-        log_Like = sum(like_list)
-        return log_Like
-    
-def treenewick():
-    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
-    path = os.path.join(script_dir, tree_file_name)
-    with open(path, 'r') as content:
-        newick = content.read()
-    return newick
-    
-def prepareTree(postorder, patterns):
-    likelihood_lists = []
-    for nd in postorder:
-        likelihood_lists.append(allocatePartial(nd, patterns))
-    print 'log-likelihood of the topology =', likelihood_lists[-1]
-
-def joinRandomPair(node_list, next_node_number, is_deep_coalescence):
-    # pick first of two lineages to join and delete from node_list
-    i = random.randint(1, len(node_list))
-    ndi = node_list[i-1]
-    del node_list[i-1]
-
-    # pick second of two lineages to join and delete from node_list
-    j = random.randint(1, len(node_list))
-    ndj = node_list[j-1]
-    del node_list[j-1]
-
-    # join selected nodes and add ancestor to node_list
-    ancnd = node(next_node_number)
-    ancnd.deep = is_deep_coalescence
-    ancnd.lchild = ndi
-    ancnd.descendants = set()
-    ancnd.descendants |= ndi.descendants
-    ancnd.descendants |= ndj.descendants
-    ndi.rsib = ndj
-    ndi.par = ancnd
-    ndj.par = ancnd
-    node_list.append(ancnd)
-
-    return node_list
-
-
-def makeNewick(nd, brlen_scaler = 1.0, start = True): #
-    global _newick
-    global _TL
-
-    if start:
-        _newick = ''
-        _TL = 0.0
-
-    if nd.lchild:
-        _newick += '('
-        makeNewick(nd.lchild, brlen_scaler, False)
-
-    else:
-        blen = nd.edgelen*brlen_scaler
-        _TL += blen
-        _newick += '%d:%.5f' % (nd.number, blen)
-
-    if nd.rsib:
-        _newick += ','
-        makeNewick(nd.rsib, brlen_scaler, False)
-    elif nd.par is not None:
-        blen = nd.par.edgelen*brlen_scaler
-        _TL += blen
-        _newick += '):%.3f' % blen
-
-    return _newick, _TL
-
-def calcActualHeight(root): 
-    h = 0.0
-    nd = root
-    while nd.lchild:
-        nd = nd.lchild
-        h += nd.edgelen
-    return h
-
-
-def readnewick(tree):
-    total_length = len(tree)
-    internal_node_number = -1
-
-    root = node(internal_node_number)
-    nd = root
-    i = 0
-    pre = [root]
-    while i < total_length:
-        m = tree[i]
-
-        if m =='(':
-            internal_node_number -= 1
-
-            child = node(internal_node_number)
-            pre.append(child)
-            nd.lchild=child
-
-            child.par=nd
-            nd=child
-        elif m == ',':
-            internal_node_number -= 1
-            rsib = node(internal_node_number)
-            pre.append(rsib)
-            nd.rsib = rsib
-            rsib.par=nd.par
-            nd = rsib
-        elif m == ')':
-            nd = nd.par
-    
-        elif m == ':':
-            edge_len_str = ''
-            i+=1
-            m = tree[i]
-            assert m in ['0','1','2','3','4','5','6','7','8', '9','.']
-            while m in ['0','1','2','3','4','5','6','7','8', '9','.']:
-                edge_len_str += m
-                i+=1
-                m = tree[i]
-            i -=1
-            nd.edgelen = float(edge_len_str)
-
-
-        else:
-            internal_node_number += 1
-
-            if True:
-                assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m
-                mm = ''
-                while m in ['0','1','2','3','4','5','6','7','8', '9' ]:
-
-                    mm += m
-                
-                    i += 1
-                    m = tree[i]
-                nd.number = int(mm)
-                i -= 1
-
-        i += 1
-
-    post = pre[:]
-    post.reverse()
-    return post
-
-def Makenewick(pre):
-    newickstring = ''
-    for i,nd in enumerate(pre):
-        if nd.lchild:
-            newickstring += '('
-       
-        elif nd.rsib:
-            newickstring += '%d' %(nd.number)
-            newickstring += ':%.1f' % nd.edgelen
-            newickstring += ','
-
-        else:
-            newickstring += '%d' %(nd.number)
-            newickstring += ':%.1f' % nd.edgelen
-            tmpnd = nd
-            while (tmpnd.par is not None) and (tmpnd.rsib is None):
-                newickstring += ')'
-                newickstring += ':%.1f' % tmpnd.par.edgelen
-                tmpnd = tmpnd.par
-
-            if tmpnd.par is not None:
-                newickstring += ','
-    return newickstring
-
-###################yule tree###################################################
-# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree
-# - num_species is the number of leaves (tips) in the tree
-def calcPhi(num_species):
-    phi = sum([1.0/(K+2.0) for K in range(num_species-1)])
-    return phi
-
-# yuleTree creates a species tree in which edge lengths are measured in
-# expected number of substitutions.
-# - num_species is the number of leaves
-# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
-def yuleTree(num_species, mu_over_s):
-    # create num_species nodes numbered 1, 2, ..., num_species
-    nodes = [node(i+1) for i in range(num_species)]
-
-    next_node_number = num_species + 1
-    while len(nodes) > 1:
-        # choose a speciation time in generations
-        K = float(len(nodes))
-        mean_epoch_length = mu_over_s/K
-        t = random.gammavariate(1.0, mean_epoch_length)
-
-        # update each node's edgelen
-        for n in nodes:
-            n.edgelen += t      # same as: n.edgelen = n.edgelen + t
-
-        nodes = joinRandomPair(nodes, next_node_number, False)
-        next_node_number += 1
-
-    return nodes[0]
-
-# calcExpectedHeight returns the expected height of the species tree in terms of
-# expected number of substitutions from the root to one tip.
-# - num_species is the number of leaves
-# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
-def calcExpectedHeight(num_species, mu_over_s):
-    return mu_over_s*calcPhi(num_species)
-
-
-if __name__ == '__main__':
-    random_seed = 24553 # 7632557, 12345
-    number_of_species = 5
-    mutation_speciation_rate_ratio = 0.4 # 0.689655172  # yields tree height 1 for 6 species
-    random.seed(random_seed)
-    species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio)
-#     print '#########'
-#     print species_tree_root
-    newick = makeNewick(species_tree_root)
-#     print 'Random number seed: %d' % random_seed
-#     print 'Simulating one tree:'
-#     print '  number of species              = %d' % number_of_species
-#     print '  mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio
-#     print '  actual tree length             =',newick[1]
-    expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio)
-#     print '  expected height                =',expected_height
-    actual_height = calcActualHeight(species_tree_root)
-#     print '  actual height                  =',actual_height
-#     print '  newick: ',newick[0]
-    
-    
-#     yuletree = '(((1:0.03915,5:0.03915):0.387,(4:0.42253,2:0.42253):0.004):0.118,3:0.54433)'
-    
-    postorder = readnewick(treenewick())
-    result =  prepareTree(postorder, readSeq.patterns(sequence_file))
\ No newline at end of file

From 306e319a76b19b9bcc8ce22c2caa209d2ac1c1cc Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:53:55 -0400
Subject: [PATCH 4/9] Delete tree.tre

---
 tree.tre | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 tree.tre

diff --git a/tree.tre b/tree.tre
deleted file mode 100644
index 2d84738..0000000
--- a/tree.tre
+++ /dev/null
@@ -1 +0,0 @@
-(5:1.8601,((3:0.47109,2:0.47109):0.492,(4:0.05805,1:0.05805):0.906):0.896)
\ No newline at end of file

From 8d860d53e6fed33382f1f6b27fc596df0fa696a9 Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:54:15 -0400
Subject: [PATCH 5/9] Delete simSequences.py

---
 simSequences.py | 174 ------------------------------------------------
 1 file changed, 174 deletions(-)
 delete mode 100644 simSequences.py

diff --git a/simSequences.py b/simSequences.py
deleted file mode 100644
index 4181da1..0000000
--- a/simSequences.py
+++ /dev/null
@@ -1,174 +0,0 @@
-import readSeq
-import random
-import re, os, itertools, sys, glob
-from itertools import chain
-from math import exp, log
-class node(object):
-    def __init__(self, ndnum):              # initialization function
-        self.rsib = None                    # right sibling
-        self.lchild = None                  # left child
-        self.par = None                     # parent node
-        self.number = ndnum                 # node number (internals negative, tips 0 or positive)
-        self.edgelen = 0.0                  # branch length
-        self.descendants = set([ndnum])     # set containing descendant leaf set
-        self.partial = None                 # will have length 4*npatterns
-        self.state = None
-        self.states = None
-
-
-    def simulateSequences(self, num_sites):
-        self.states = [str]*(num_sites)
-        freq = [0.25, 0.25, 0.25, 0.25]
-        current_states = [ 'A', 'C', 'G', 'T']
-        if self.par is None:
-            
-            for i in range(num_sites):
-        
-                ran_nm = random.random()
-                if ran_nm < freq[0]:
-#                     self.state = 'A'
-                    self.states[i] = 'A'
-                elif ran_nm <= freq[0]+freq[1]:
-#                     self.state = 'C'
-                    self.states[i] = 'C'
-                elif ran_nm <= freq[0]+freq[1]+freq[2]:
-#                     self.state = 'G'
-                    self.states[i] = 'G'
-
-                else: 
-#                     self.state = 'T'
-                    self.states[i] = 'T'
-
-        else:
-            for m in range(num_sites):
-                prob = []
-                ran_nm = random.random()
-
-                for i in current_states:
-                    if self.par.states[m] == i:
-                        p = (0.25+0.75*exp(-4.0*(self.edgelen)/3.0))
-                        prob.append(p)
-                    else:
-                        p = (0.25-0.25*exp(-4.0*(self.edgelen)/3.0))
-                        prob.append(p)
-                for i in prob:
-
-                    if ran_nm <= prob[0]:
-#                         self.state = 'A'
-                        self.states[m] = 'A'
-                   
-                    elif ran_nm <= prob[0]+ prob[1]:
-#                         self.state = 'C'
-                        self.states[m] = 'C'
-
-                    elif ran_nm <= prob[0]+ prob[1]+ prob[2]:
-#                         self.state = 'G'
-                        self.states[m] = 'G'
-
-                    else:
-#                         self.state = 'T'
-                        self.states[m] = 'T'
-                       
-        return self.states
-
-    def __str__(self):
-        # __str__ is a built-in function that is used by print to show an object
-        descendants_as_string = ','.join(['%d' % d for d in self.descendants])
-
-        lchildstr = 'None'
-        if self.lchild is not None:
-            lchildstr = '%d' % self.lchild.number
-            
-        rsibstr = 'None'
-        if self.rsib is not None:
-            rsibstr = '%d' % self.rsib.number
-            
-        parstr = 'None'
-        if self.par is not None:
-            parstr = '%d' % self.par.number
-            
-        return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)
-
-def simulate(preorder, ntax,  num_sites, out):
-    newf = open(out, 'w')
-    newf.write('#nexus\n\n')
-    newf.write('begin data;\n')
-    newf.write('dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites))
-    newf.write('format datatype=dna missing=? gap=-;\n')
-    newf.write('matrix\n')
-    master = {}
-    for nd in preorder:
-        master[nd.number]=nd.simulateSequences(num_sites)
-        if nd.number >0:
-            newf.write('%s  %s\n' % (nd.number, ''.join(nd.simulateSequences(num_sites))))
-    newf.write(';\n')
-    newf.write('end;')
-def readnewick(tree):
-    total_length = len(tree)
-    internal_node_number = -1
-
-    root = node(internal_node_number)
-    nd = root
-    i = 0
-    pre = [root]
-    while i < total_length:
-        m = tree[i]
-
-        if m =='(':
-            internal_node_number -= 1
-
-            child = node(internal_node_number)
-            pre.append(child)
-            nd.lchild=child
-
-            child.par=nd
-            nd=child
-        elif m == ',':
-            internal_node_number -= 1
-            rsib = node(internal_node_number)
-            pre.append(rsib)
-            nd.rsib = rsib
-            rsib.par=nd.par
-            nd = rsib
-        elif m == ')':
-            nd = nd.par
-    
-        elif m == ':':
-            edge_len_str = ''
-            i+=1
-            m = tree[i]
-            assert m in ['0','1','2','3','4','5','6','7','8', '9','.']
-            while m in ['0','1','2','3','4','5','6','7','8', '9','.']:
-                edge_len_str += m
-                i+=1
-                m = tree[i]
-            i -=1
-            nd.edgelen = float(edge_len_str)
-        else:
-            internal_node_number += 1
-
-            if True:
-                assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m
-                mm = ''
-                while m in ['0','1','2','3','4','5','6','7','8', '9' ]:
-
-                    mm += m
-                    i += 1
-                    m = tree[i]
-                nd.number = int(mm)
-                i -= 1
-        i += 1
-
-    return pre
-
-if __name__ == '__main__':
-
-    output_filename = os.path.join('simulated_output.nexus')
-
-    yuletree = '(5:1.86010,((3:0.47109,2:0.47109):0.492,(4:0.05805,1:0.05805):0.906):0.896)'
-    
-    preorder = readnewick(yuletree)
-    ntax = 5
-    num_sites = 100
-    result = simulate(preorder, ntax, num_sites, output_filename)
-

From e205f4efccf756476b1caecdfe953536420d72a5 Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:54:22 -0400
Subject: [PATCH 6/9] Delete readtree.py

---
 readtree.py | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 readtree.py

diff --git a/readtree.py b/readtree.py
deleted file mode 100644
index a7b4943..0000000
--- a/readtree.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# def treenewck():
-#     script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
-#     path = os.path.join(script_dir, 'nexus')
-#     for filename in glob.glob(os.path.join(path, '*.tre*')):
-#         f = open(filename, 'r').read()
-
-import re, os, glob, itertools, fnmatch, sys, shutil
-# dirname, filename = os.path.split(os.path.abspath(__file__))
-def treenewick():
-    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
-#     print script_dir
-    path = os.path.join(script_dir, 'tree.tre')
-    with open(path, 'r') as content:
-        newick = content.read()
-    return newick
-a = treenewick()
-print a
-
-
-# dirname, filename = os.path.split(os.path.abspath(__file__))
-# print "running from", dirname
-# print "file is", filename
\ No newline at end of file

From 6883d2d2d1c7fde7e554be10a6f2e4d2960aa812 Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:54:28 -0400
Subject: [PATCH 7/9] Delete readSeq.py

---
 readSeq.py | 69 ------------------------------------------------------
 1 file changed, 69 deletions(-)
 delete mode 100644 readSeq.py

diff --git a/readSeq.py b/readSeq.py
deleted file mode 100644
index 91c58a5..0000000
--- a/readSeq.py
+++ /dev/null
@@ -1,69 +0,0 @@
-def patterns(sequence_file):
- # 
-    import re, os, glob, itertools, fnmatch, sys, shutil
-    from itertools import combinations
-    from collections import Counter
-
-    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
-#     path = os.path.join(script_dir, 'nexus')
-
-    genes = []
-    data = {}
-#     print 'Reading nexus files...'
-    for filename in glob.glob(os.path.join(script_dir, sequence_file)):
-
-        m = re.match('(.+).nex', os.path.basename(filename))
-        gene_name = m.group(1)
-#         print 'gene_name=', gene_name
-        genes.append(gene_name)
-        f = open(filename, 'r').read()
-
-        m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S)
-        ntax = int(m.group(1))
-#         print 'ntax=', ntax
-
-        m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S)
-        nchar = int(m.group(1))
-#         print 'nchar=', nchar
-
-        m = re.search('Matrix\s+(.+?);', f, re.M | re.S)
-        matrix = m.group(1).strip()
-        matrix_lines = matrix.split('\n')
-
-        taxon_names = []
-        sequences = {}
-        sequences_list = []
-        for line in matrix_lines:
-            parts = line.strip().split()
-            assert len(parts) == 2
-            taxon_name = parts[0]
-            sequence = parts[1]
-        
-            taxon_names.append(taxon_name)
-            sequences_list.append(sequence)
-            sequences[taxon_name] = sequence
-        
-        pattern_list = []
-    
-        k=0
-        while k < nchar:
-            site_pattern = ''
-            for i,m in enumerate(sequences_list):
-                site_pattern += m[k]
-            pattern_list.append(site_pattern)
-            k+=1
-        pattern_dict = dict()
-        for i in pattern_list:
-            pattern_dict[i] = pattern_dict.get(i, 0) + 1 
-    
-        tmp = []
-        for key in pattern_dict.keys(): ###convert dict to key of tupules
-#             print 'key=', key
-            tmp.append((pattern_dict[key],key))
-    
-        sorted_values = sorted(tmp) ###sorted according to key smaller to larger
-        sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order
-    
-    return pattern_dict
-
-

From a2fe87c36ff1c5c21200cb3028189ea757d1b3c4 Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:54:34 -0400
Subject: [PATCH 8/9] Delete randomSeq.py

---
 randomSeq.py | 211 ---------------------------------------------------
 1 file changed, 211 deletions(-)
 delete mode 100644 randomSeq.py

diff --git a/randomSeq.py b/randomSeq.py
deleted file mode 100644
index 4925a91..0000000
--- a/randomSeq.py
+++ /dev/null
@@ -1,211 +0,0 @@
-import sys, re, os, random
-
-
-# number1 = raw_input('number_of_runs? > ')
-# num_run = int(number1)
-# #print num_run
-# 
-# ###specifying number of taxa to be generated
-# number2 = raw_input('number_of_taxa? > ')
-# ntax = int(number2)
-# 
-# ###specifying number of sites to be generated
-# number3 = raw_input('number of sites? > ')
-# num_sites = int(number3)
-# 
-# ###specifying number of dna matrix to be generated
-# number4 = raw_input('number of genes? > ')
-# number_genes = int(number4)
-
-
-
-num_run = 1
-#print num_run
-
-###specifying number of taxa to be generated
-ntax = 6
-
-###specifying number of sites to be generated
-num_sites = 10000
-
-###specifying number of dna matrix to be generated
-number_genes = 10
-
-num_trials_pergene = 2
-
-
-for i in range(num_run):
-    run_name = i+1
-    run_name2 = 'trial'+ str(run_name)
-    print run_name2    
-    master_dir = os.path.join(os.path.abspath(os.curdir), run_name2) 
-    if not os.path.exists(master_dir):
-        os.mkdir(master_dir)
-
-
-    ###creating a list for the directories for each randomly generated dna matrix
-    all_seq = []
-    sets = []
-    for a, b in enumerate(range(number_genes)):
-        a += 1
-        set = "gene"+ str(a)
-        sets.append(set)
-
-    ### creating dna matrix with randomly generated sequences
-    for n, i in enumerate(sets):
-        ### specifying names of the directory and dna matrix 
-        folder_name = i
-        n += 1
-        new_nexus_file = 'randomseq'+ str(n)+'.nex'
-        new_nexus_file2 = 'randomseq'+ str(n)
-    ### creating a list of randomly generated sequences
-        sequences = []
-        for i in range(ntax):
-            myrandom = []
-            for i in range(num_sites):
-                i = random.randint(1,4)
-                myrandom.append(i)
-            dna_list = []   
-            for m in myrandom:
-                if m == 1:
-                    m= "A"
-                if m ==2:
-                    m= "T"
-                if m ==3:
-                    m= "G"
-                if m == 4:
-                    m= "C"
-                dna_list.append(m)
-            dna_list2 = ''.join(dna_list)
-            #print dna_list2
-            sequences.append(dna_list2)
-        all_seq.append(sequences)
-
-
-
-    ### creating a list of taxon_names
-        taxon_names = []
-        k=0
-        for i, m in enumerate(sequences):
-            k += 1
-            taxon = "taxon"+ str(k)
-            taxon_names.append(taxon)
-        
-
-    ### creating a directory 
-        new_dir = os.path.join(master_dir,folder_name)
-        if not os.path.exists(new_dir):
-            os.mkdir(new_dir)
-        gene_sets = []        
-#        for i in range(3):
-        for i in range(num_trials_pergene):
-            m= i+1
-            seed_num = 4648+m
-            m2 = 'run'+str(m)
-            gene_sets.append(m2)        
-            new_subdir = os.path.join(new_dir,m2)    
-            if not os.path.exists(new_subdir):
-                os.mkdir(new_subdir)
-
-            bash_filename = 'qsub.sh'
-            bash_file_content = '''#$ -S /bin/bash 
-#$ -cwd
-#$ -N %s
-#$ -q highpri.q,highmem.q
-python runphycas.py
-/common/galax/rungalax.sh --treefile trees.t --skip 1
-'''% (new_nexus_file2)
-  
-#     #     ### saving dna matrix to the directory
-#         
-            full_path1 = os.path.join(new_subdir, new_nexus_file) 
-            newf = open(full_path1, 'w')
-            newf.write('#nexus\n\n')
-            newf.write('begin data;\n')
-            newf.write('  dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites))
-            newf.write('  format datatype=dna missing=? gap=-;\n')
-            newf.write('  matrix\n')
-            longest_taxon_name = max([len(t) for t in taxon_names])
-            for t,s in zip(taxon_names, sequences):
-                formatstr = '%%%ds' % longest_taxon_name
-                namestr = formatstr % t
-                newf.write('    %s  %s\n' % (namestr, s))
-            newf.write(';\n')
-            newf.write('end;\n')
-            newf.close()
-
-            full_path2 = os.path.join(new_subdir, bash_filename)
-            newf = open(full_path2, 'w')
-            newf.write(bash_file_content)
-            newf.close()
- 
-            full_path8 = os.path.join(new_subdir, 'runphycas.py')
-            newf = open(full_path8, 'w')
-            x = open('runphycas.py', 'r').read()
-            newf.write(x %(int(seed_num), new_nexus_file))
-
-# 
-#     
-#     
-        combined_sequences = map(''.join, zip(*all_seq))
-        num_sites_combined = len(combined_sequences[0])
-        new_dir2 = os.path.join(master_dir,'combinedSeq') 
-        if not os.path.exists(new_dir2):
-            os.mkdir(new_dir2)
-
-        full_path3 = os.path.join(new_dir2, 'combinedSeq.nex') 
-        newf = open(full_path3, 'w')
-        newf.write('#nexus\n\n')
-        newf.write('begin data;\n')
-        newf.write('  dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites_combined))
-        newf.write('  format datatype=dna missing=? gap=-;\n')
-        newf.write('  matrix\n')
-        longest_taxon_name = max([len(t) for t in taxon_names])
-        for t,s in zip(taxon_names, combined_sequences):
-            formatstr = '%%%ds' % longest_taxon_name
-            namestr = formatstr % t
-            newf.write('    %s  %s\n' % (namestr, s))
-        newf.write(';\n')
-        newf.write('end;\n')
-        newf.close()
-
-        full_path4 = os.path.join(new_dir2, bash_filename)
-        newf = open(full_path4, 'w')
-        newf.write(bash_file_content)
-        newf.close()
- 
-
-
-        full_path6 = os.path.join(new_dir2, 'runphycas.py')
-        newf = open(full_path6, 'w')
-        x = open('runphycas.py', 'r').read()
-        newf.write(x %(4648, 'combinedSeq.nex'))
-
-
-        full_path7 = os.path.join(master_dir, 'submitall.sh') 
-        newf = open(full_path7, 'w')
-        newf.write('#!/bin/bash\n')
-        for i in sets:
-            for m in gene_sets:
-                newf.write('cd %s/%s; qsub qsub.sh;cd .. ;cd .. ; \n' %(i, m))
-        newf.write('cd %s; qsub qsub.sh; cd ..\n' %('combinedSeq'))
-        newf.close()
-
-        full_path9 = os.path.join(master_dir, 'treelist.txt')
-        newf = open(full_path9, 'w')
-        for i in sets:
-            for m in gene_sets:
-                newf.write(i+'/'+m+'/trees.t\n')
-        newf.close()
-
-
-
-        full_path10 = os.path.join(master_dir, 'galax.sh')
-        newf = open(full_path10, 'w')
-        newf.write('#!/bin/bash\n')
-        newf.write('/common/galax/rungalax.sh --listfile treelist.txt --skip 1')
-        newf.close()
-
-
-
-

From 654ed682b27c9401b0ab9d3f6d6a374ea41748b0 Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Fri, 26 May 2017 11:54:41 -0400
Subject: [PATCH 9/9] Delete mergeSeq.py

---
 mergeSeq.py | 453 ----------------------------------------------------
 1 file changed, 453 deletions(-)
 delete mode 100644 mergeSeq.py

diff --git a/mergeSeq.py b/mergeSeq.py
deleted file mode 100644
index c28404a..0000000
--- a/mergeSeq.py
+++ /dev/null
@@ -1,453 +0,0 @@
-import re, os, glob, itertools, fnmatch, sys, shutil
-from capture_marglik_pol import capture_script
-
-from itertools import combinations
-
-script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
-path = os.path.join(script_dir, 'nexus')
-#path = '/Users/suman/Documents/Postdoctoral_projects/Algae_data/two_tupules_run/Mar4/combo_script/nexus'
-
-first_combo = 700
-last_combo  = 770
-
-# create output directory, recursively deleting it if it already exists after asking permission
-output = 'g-700-770'
-if os.path.exists(output):
-    print 'Directory "%s" exists' % output
-    answer = raw_input('delete it (y/n)?')
-    if answer in ['y','yes','Y','Yes','YES']:
-        shutil.rmtree(output)
-    else:
-        sys.exit('Aborting because you did not answer "y" or "yes"')
-os.mkdir(output)
-
-number_to_merge = 2
-
-#### counting  number of genes to be analyzed
-number_of_genes = len(fnmatch.filter(os.listdir(path), '*.nex'))
-number_gen = range(1, number_of_genes+1)
-
-#### counting number of nexus files to be written after combinations
-combos = list(itertools.combinations(number_gen, number_to_merge))
-number_of_files = len(combos)
-# print number_of_files
-
-# chunks are pieces of the chloroplast that have not been rearranged
-# The largest chunk (1) has 7 genes, the smallest (16-23) have only 1 gene each
-chunk_map = {
-'rpl23':1,
-'rpl2' :1,
-'rps19':1,
-'rps14':1,
-'atpA' :1,
-'psbI' :1,
-'cemA' :1,
-'psbE' :2,
-'rps9' :2,
-'ycf3' :2,
-'rpl36':2,
-'petD' :2,
-'rpoA' :2,
-'psbJ' :3,
-'atpI' :3,
-'psaJ' :3,
-'rps12':3,
-'rpl16':4,
-'rpl14':4,
-'rpl5' :4,
-'rps8' :4,
-'rpoBa':5,
-'rpoBb':5,
-'rps3' :5,
-'rpoC2':5,
-'psaB' :6,
-'ccsA' :6,
-'psbZ' :6,
-'psbM' :6,
-'psbF' :7,
-'psbL' :7,
-'petG' :7,
-'psbB' :8,
-'psbT' :8,
-'rps4' :9,
-'clpP' :9,
-'rps18':10,
-'petB' :10,
-'rps7' :11,
-'atpE' :11,
-'atpH' :12,
-'atpF' :12,
-'psbH' :13,
-'psbK' :13,
-'psaC' :14,
-'petL' :14,
-'rbcL' :15,
-'rps11':15,
-'rpl20':16,
-'tufA' :17,
-'psaA' :18,
-'psbD' :19,
-'atpB' :20,
-'psbA' :21,
-'psbC' :22,
-'psbN' :23
-}
-
-##### nexus file for each merged pair
-
-new_way = True
-genes = []
-data = {}
-gene_javascript_data = []
-print 'Reading nexus files...'
-for filename in glob.glob(os.path.join(path, '*.nex')):
-#     print 'Reading file:',filename
-    #pol m = re.match('(.+)[.]nex', os.path.basename(filename))
-    m = re.match('(.+)[-]stripped[.]nex', os.path.basename(filename))
-    gene_name = m.group(1)
-
-    genes.append(gene_name)
-    f = open(filename, 'r').read()
-    m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S)
-    ntax = int(m.group(1))
-    m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S)
-    nchar = int(m.group(1))
-
-    gene_javascript_data.append((chunk_map[gene_name], gene_name, nchar))
-
-    m = re.search('Matrix\s+(.+?);', f, re.M | re.S)
-    matrix = m.group(1).strip()
-
-    matrix_lines = matrix.split('\n')
-
-    taxon_names = []
-    sequences = {}
-    for line in matrix_lines:
-        parts = line.strip().split()
-        assert len(parts) == 2
-        taxon_name = parts[0]
-        sequence = parts[1]
-        taxon_names.append(taxon_name)
-        sequences[taxon_name] = sequence
-
-    if not new_way:
-        if len(data) == 0:
-            for t in taxon_names:
-                data[t] = []
-
-    for t in taxon_names:
-        if new_way:
-            group = data.setdefault(t, [])
-            group.append(sequences[t])
-        else:
-            data[t].append(sequences[t])
-
-print 'Found %d genes' % len(genes)
-for c,g,n in sorted(gene_javascript_data):
-    print '    {name:"%s",  chunk:%d, seqlen:%d},' % (g,c,n)
-
-taxa_sorted = sorted(data.keys())
-# print "taxa_sorted=", taxa_sorted
-
-number_of_genes = len(genes)
-number_gen = range(1, number_of_genes+1)
-
-#### counting number of nexus files to be written after combinations
-combos = list(itertools.combinations(number_gen, number_to_merge))
-number_of_combos = len(combos)
-print 'Number of combinations:',number_of_combos
-
-combo_list=[]
-current_combo = 0
-for a,b in combos:
-    current_combo += 1
-    if current_combo <= first_combo or current_combo > last_combo:
-        continue
-    print 'current_combo=', current_combo-1
-
-    new_taxonlist = []
-    se1 = []
-    se2 = []
-    new_sequences = []
-    seq1 = []
-    seq2 = []
-    genes_merged = (genes[a-1]+'_'+genes[b-1])
-#     print 'genes[a-1]==', genes[a-1]
-#     print 'genes[b-1]==', genes[b-1]
-#
-    for t in taxa_sorted:
-        new_taxonlist.append((t,data[t][a-1],data[t][b-1])[0])
-        new_sequences.append((data[t][a-1]+data[t][b-1]))
-        seq1.append((data[t][a-1]))
-        seq2.append((data[t][b-1]))
-#     print 'new_taxonlist==', new_taxonlist
-    ntax = len(new_taxonlist)
-    newnchar = len(new_sequences[0])
-    nchar_gene1 = len(seq1[0])
-    nchar_gene2 = len(seq2[0])
-#     print 'nchar_gene1====', nchar_gene1
-#     print 'nchar_gene2====', nchar_gene2
-
-    new_nexus_file = '%s.nex' % genes_merged
-    gene_dir = os.path.join(output, '%s' % (genes_merged))
-    if not os.path.exists(gene_dir):
-        os.mkdir(gene_dir)
-
-    combo_list.append(genes_merged)
-
-    mrbayes_single_run1 = '''begin mrbayes;
-delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri;
-charset first  = 1-%d\\3;
-charset second = 2-%d\\3;
-charset third  = 3-%d\\3;
-partition mine = 3: first, second, third;
-set partition=mine;
-lset applyto=(all) nst=6 ngammacat=4 rates=gamma;
-prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0);
-unlink shape=(all) statefreq=(all) revmat=(all);
-set seed=9223 swapseed=9223;
-mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss;
-[sumt filename=mcmc;]
-[mcmcp filename=ss;]
-ss alpha=0.3 nsteps=30 burninss=-2;
-end;
-'''% (nchar_gene1, nchar_gene1, nchar_gene1)
-
-    mrbayes_single_run2 = '''begin mrbayes;
-delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri;
-charset first  = 1-%d\\3;
-charset second = 2-%d\\3;
-charset third  = 3-%d\\3;
-partition mine = 3: first, second, third;
-set partition=mine;
-lset applyto=(all) nst=6 ngammacat=4 rates=gamma;
-prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0);
-unlink shape=(all) statefreq=(all) revmat=(all);
-set seed=9223 swapseed=9223;
-mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss;
-[sumt filename=mcmc;]
-[mcmcp filename=ss;]
-ss alpha=0.3 nsteps=30 burninss=-2;
-end;
-'''% (nchar_gene2, nchar_gene2, nchar_gene2)
-
-
-    mrbayes_concat_topo_brlen = '''begin mrbayes;
-delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri;
-charset first  = 1-%d\\3;
-charset second = 2-%d\\3;
-charset third  = 3-%d\\3;
-charset fourth = %d-%d\\3;
-charset fifth  = %d-%d\\3;
-charset sixth  = %d-%d\\3;
-partition mine = 6: first, second, third, fourth, fifth, sixth;
-set partition=mine;
-lset applyto=(all) nst=6 ngammacat=4 rates=gamma;
-prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) ratepr=variable revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0);
-unlink shape=(all) statefreq=(all) revmat=(all);
-set seed=9223 swapseed=9223;
-mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss;
-[sumt filename=mcmc;]
-[mcmcp filename=ss;]
-ss alpha=0.3 nsteps=30 burninss=-2;
-end;
-'''% (nchar_gene1, nchar_gene1, nchar_gene1, nchar_gene1+1, newnchar, nchar_gene1+2, newnchar, nchar_gene1+3,newnchar)
-
-    mrbayes_concat_topo = '''begin mrbayes;
-delete Ankyra_judai Atractomorpha_echinata Bracteacoccus_aerius Bracteacoccus_minor Chlamydomonas_reinhardtii Chromochloris_zofingiensis Dunaliella_salina Floydiella_terrestris Gonium_pectorale Kirchneriella_aperta Mychonastes_homosphaera Oedogonium_cardiacum Ourococcus_multisporus Pleodorina_starrii Pseudomuriella_schumacherensis Rotundella_rotunda Schizomeris_leibleinii Stigeoclonium_helveticum Volvox_carteri;
-charset first = 1-%d\\3;
-charset second = 2-%d\\3;
-charset third = 3-%d\\3;
-charset fourth = %d-%d\\3;
-charset fifth = %d-%d\\3;
-charset sixth = %d-%d\\3;
-partition mine = 6: first, second, third, fourth, fifth, sixth;
-set partition=mine;
-lset applyto=(all) nst=6 ngammacat=4 rates=gamma;
-prset applyto=(all) statefreqpr=Dirichlet(1.0,1.0,1.0,1.0) revmatpr=dirichlet(1,1,1,1,1,1) brlenspr=Unconstrained:GammaDir(1.0,0.100,1.0,1.0) shapepr=exponential(1.0);
-unlink shape=(all) statefreq=(all) revmat=(all) brlens=(all);
-set seed=9223 swapseed=9223;
-mcmcp ngen=8000000 samplefreq=500 printfreq=8000000 starttree=random nruns=1 nchains=1 savebrlens=yes filename=ss;
-[sumt filename=mcmc;]
-[mcmcp filename=ss;]
-ss alpha=0.3 nsteps=30 burninss=-2;
-end;
-'''% (nchar_gene1, nchar_gene1, nchar_gene1, nchar_gene1+1, newnchar, nchar_gene1+2, newnchar, nchar_gene1+3,newnchar)
-
-
-#concat_topo
-###########submitallscript###########
-    submitallscript ='''cd %s; qsub qsub.sh; cd ..
-cd %s; qsub qsub.sh; cd ..
-cd concat_topo_brlen; qsub qsub.sh; cd ..
-cd concat_topo; qsub qsub.sh; cd ..
-'''%(genes[a-1], genes[b-1])
-    submitall = os.path.join(gene_dir, 'submitall.sh')
-    newf = open(submitall, 'w')
-    newf.write(submitallscript)
-###########submitallscript###########
-
-
-###########qsubscript for concatenated set###########
-#     qsubscript = '''#!/bin/bash
-# #$ -S /bin/bash
-# #$ -cwd
-# #$ -N concatenated
-# #$ -q highpri.q,highmem.q
-# mb concatenated.nex
-# rm -f mcmc.* ss.*'''
-###########qsubscript for concatenated set###########
-
-
-###########gene1###########
-    single_gene_dir = os.path.join(gene_dir, genes[a-1])
-    if not os.path.exists(single_gene_dir):
-        os.mkdir(single_gene_dir)
-    output_path = os.path.join(single_gene_dir, ('%s.nex' %(genes[a-1])))
-    newf = open(output_path, 'w')
-    newf.write('#NEXUS\n\n')
-    newf.write('Begin data;\n')
-    newf.write('  Dimensions ntax=%d nchar=%d;\n' % (ntax, nchar_gene1))
-    newf.write('  Format datatype=dna missing=? gap=-;\n')
-    newf.write('  Matrix\n')
-    longest_taxon_name = max([len(t) for t in new_taxonlist])
-    for t,s in zip(new_taxonlist, seq1):
-        formatstr = '%%%ds' % longest_taxon_name
-        namestr = formatstr % t
-        newf.write('    %s  %s\n' % (namestr, s))
-    newf.write(';\n')
-    newf.write('end;\n\n')
-    newf.write(mrbayes_single_run1)
-    newf.close()
- ##qsub_script#####
-    qsub = os.path.join(single_gene_dir, 'qsub.sh')
-    newf = open(qsub, 'w')
-    newf.write('''#!/bin/bash
-#$ -S /bin/bash
-#$ -cwd
-#$ -N %s
-#$ -q highpri.q,highmem.q
-mb %s.nex
-rm -f mcmc.* ss.*'''%(genes[a-1], genes[a-1]))
- ##qsub_script#####
-###########gene1###########
-
-
-###########gene2###########
-    single_gene_dir = os.path.join(gene_dir, genes[b-1])
-    if not os.path.exists(single_gene_dir):
-        os.mkdir(single_gene_dir)
-    output_path = os.path.join(single_gene_dir, ('%s.nex' %(genes[b-1])))
-    newf = open(output_path, 'w')
-    newf.write('#NEXUS\n\n')
-    newf.write('Begin data;\n')
-    newf.write('  Dimensions ntax=%d nchar=%d;\n' % (ntax, nchar_gene2))
-    newf.write('  Format datatype=dna missing=? gap=-;\n')
-    newf.write('  Matrix\n')
-    longest_taxon_name = max([len(t) for t in new_taxonlist])
-    for t,s in zip(new_taxonlist, seq2):
-        formatstr = '%%%ds' % longest_taxon_name
-        namestr = formatstr % t
-        newf.write('    %s  %s\n' % (namestr, s))
-    newf.write(';\n')
-    newf.write('end;\n\n')
-    newf.write(mrbayes_single_run2)
-    newf.close()
- ####qsub_script#####
-    qsub = os.path.join(single_gene_dir, 'qsub.sh')
-    newf = open(qsub, 'w')
-    newf.write('''#!/bin/bash
-#$ -S /bin/bash
-#$ -cwd
-#$ -N %s
-#$ -q highpri.q,highmem.q
-mb %s.nex
-rm -f mcmc.* ss.*'''%(genes[b-1], genes[b-1]))
- ####qsub_script#####
-###########gene2###########
-
-
-###########concat_topo_brlen###########
-    concat_dir =  os.path.join(gene_dir, 'concat_topo_brlen')
-    if not os.path.exists(concat_dir):
-        os.mkdir(concat_dir)
-    output_path = os.path.join(concat_dir, 'concatenated.nex')
-    newf = open(output_path, 'w')
-    newf.write('#NEXUS\n\n')
-    newf.write('Begin data;\n')
-    newf.write('  Dimensions ntax=%d nchar=%d;\n' % (ntax, newnchar))
-    newf.write('  Format datatype=dna missing=? gap=-;\n')
-    newf.write('  Matrix\n')
-    longest_taxon_name = max([len(t) for t in new_taxonlist])
-    for t,s in zip(new_taxonlist, new_sequences):
-        formatstr = '%%%ds' % longest_taxon_name
-        namestr = formatstr % t
-        newf.write('    %s  %s\n' % (namestr, s))
-    newf.write(';\n')
-    newf.write('end;\n\n')
-    newf.write(mrbayes_concat_topo_brlen)
-    newf.close()
- ####qsub_script concat_topo_brlen#####
-    qsub = os.path.join(concat_dir, 'qsub.sh')
-    newf = open(qsub, 'w')
-    newf.write('''#!/bin/bash
-#$ -S /bin/bash
-#$ -cwd
-#$ -N %s_%s_concat_topo_brlen
-#$ -q highpri.q,highmem.q
-mb concatenated.nex
-rm -f mcmc.* ss.*'''%(genes[a-1],genes[b-1]))
- ####qsub_script concat_topo_brlen#####
-###########concat_topo_brlen###########
-
-
-###########concat_topo###########
-    concat_dir =  os.path.join(gene_dir, 'concat_topo')
-    if not os.path.exists(concat_dir):
-        os.mkdir(concat_dir)
-    output_path = os.path.join(concat_dir, 'concatenated.nex')
-    newf = open(output_path, 'w')
-    newf.write('#NEXUS\n\n')
-    newf.write('Begin data;\n')
-    newf.write('  Dimensions ntax=%d nchar=%d;\n' % (ntax, newnchar))
-    newf.write('  Format datatype=dna missing=? gap=-;\n')
-    newf.write('  Matrix\n')
-    longest_taxon_name = max([len(t) for t in new_taxonlist])
-    for t,s in zip(new_taxonlist, new_sequences):
-        formatstr = '%%%ds' % longest_taxon_name
-        namestr = formatstr % t
-        newf.write('    %s  %s\n' % (namestr, s))
-    newf.write(';\n')
-    newf.write('end;\n\n')
-    newf.write(mrbayes_concat_topo)
-    newf.close()
- ####qsub_script concat_topo#####
-    qsub = os.path.join(concat_dir, 'qsub.sh')
-    newf = open(qsub, 'w')
-    newf.write('''#!/bin/bash
-#$ -S /bin/bash
-#$ -cwd
-#$ -N %s_%s_concat_topo
-#$ -q highpri.q,highmem.q
-mb concatenated.nex
-rm -f mcmc.* ss.*'''%(genes[a-1],genes[b-1]))
-  ####qsub_script concat_topo#####
-###########concat_topo###########
-
-
-####### go.sh #########
-jobscript = os.path.join(output, 'go.sh')
-newf = open(jobscript, 'w')
-newf.write('#!/bin/bash\n\n')
-for i in combo_list:
-    newf.write('cd %s\n. submitall.sh\ncd ..\n\n' %(i))
-newf.close()
-
-####### capture_marg.py #########
-jobscript2 = os.path.join(output, 'capture_marg.py')
-newf = open(jobscript2, 'w')
-newf.write(capture_script)
-newf.close()
-
-
-