From becd19bfe07a5f8f8550effb514aaa42d73fb52f Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Fri, 26 May 2017 11:51:35 -0400 Subject: [PATCH] all old brnlenmcmc --- brnlenMCMC/brnlenMCMC.py | 494 +++++++++++++++++++++++++++++++++++++++ brnlenMCMC/readSeq.py | 69 ++++++ brnlenMCMC/readtree.py | 22 ++ brnlenMCMC/tree.tre | 1 + 4 files changed, 586 insertions(+) create mode 100644 brnlenMCMC/brnlenMCMC.py create mode 100644 brnlenMCMC/readSeq.py create mode 100644 brnlenMCMC/readtree.py create mode 100644 brnlenMCMC/tree.tre diff --git a/brnlenMCMC/brnlenMCMC.py b/brnlenMCMC/brnlenMCMC.py new file mode 100644 index 0000000..b63467a --- /dev/null +++ b/brnlenMCMC/brnlenMCMC.py @@ -0,0 +1,494 @@ +########################################################################################## +# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree +# topology, and computes log-likelihood of the topology under Jukes Cantor+GAMMA model, +# and performs MCMC on branch length parameter +########################################################################################## + + +import readSeq +import random +import re, os, itertools, sys, glob +from itertools import chain +from scipy.stats import gamma +from math import exp, log + + +########################################################################################## +tree_file_name = 'tree.tre' +sequence_file = 'example3.nex' +alpha = 0.5 #gamma shape parameter for rate categories +n_gen = 4 +save_every = 1 +mean_expo = 10. #mean_expo = mean of exponential distribution for branch length prior +########################################################################################## + + + + +class node(object): + def __init__(self, ndnum): # initialization function + self.rsib = None # right sibling + self.lchild = None # left child + self.par = None # parent node + self.number = ndnum # node number (internals negative, tips 0 or positive) + self.edgelen = 0.0 # branch length + self.descendants = set([ndnum]) # set containing descendant leaf set + self.partial = None # will have length 4*npatterns + + + def __str__(self): + # __str__ is a built-in function that is used by print to show an object + descendants_as_string = ','.join(['%d' % d for d in self.descendants]) + + lchildstr = 'None' + if self.lchild is not None: + lchildstr = '%d' % self.lchild.number + + rsibstr = 'None' + if self.rsib is not None: + rsibstr = '%d' % self.rsib.number + + parstr = 'None' + if self.par is not None: + parstr = '%d' % self.par.number + + return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string) + + + +def allocatePartial(node, patterns, rates): + if node.number > 0: + npatterns = len(patterns) + + if node.partial is None: + node.partial = [0.0]*(4*4*npatterns) +# print len(node.partial) + for i,pattern in enumerate(patterns.keys()): + base = pattern[node.number-1] + for l in range(4): + if base == 'A': + + + node.partial[i*16+l*4 + 0] = 1.0 + elif base == 'C': + node.partial[i*16+l*4 + 1] = 1.0 + elif base == 'G': + node.partial[i*16+l*4 + 2] = 1.0 + elif base == 'T': + node.partial[i*16+l*4 + 3] = 1.0 + else: + assert(False), 'oops, something went horribly wrong!' + + else: + + npatterns = len(patterns) + if node.partial is None: + node.partial = [0.0]*(4*4*npatterns) + + like_list = [] + for i,pattern in enumerate(patterns.keys()): + m_list = [] + num_pattern = patterns[pattern] + for l,m in enumerate(rates): + + psame = (0.25+0.75*exp(-4.0*m*(node.lchild.edgelen)/3.0)) + pdiff = (0.25-0.25*exp(-4.0*m*(node.lchild.edgelen)/3.0)) + + psame2 = (0.25+0.75*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0)) + pdiff2 = (0.25-0.25*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0)) + + num_pattern = patterns[pattern] + pAA = psame*(node.lchild.partial[i*16+l*4 + 0]) + pAC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) + pAG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) + pAT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) + + pAA2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 0]) + pAC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) + pAG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) + pAT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) + + pfromA_lchild = pAA+pAC+pAG+pAT + pfromA_rchild = pAA2+pAC2+pAG2+pAT2 + node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild + + + ###################################################### + + pCA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) + pCC = psame*(node.lchild.partial[i*16+l*4 + 1]) + pCG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) + pCT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) + + pCA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) + pCC2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 1]) + pCG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) + pCT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) + + pfromC_lchild = pCA+pCC+pCG+pCT + pfromC_rchild = pCA2+pCC2+pCG2+pCT2 + node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild + + ####################################################### +# + pGA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) + pGC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) + pGG = psame*(node.lchild.partial[i*16+l*4 + 2]) + pGT = pdiff*(node.lchild.partial[i*16+l*4 + 3]) + + pGA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) + pGC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) + pGG2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 2]) + pGT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3]) + + pfromG_lchild = pGA+pGC+pGG+pGT + pfromG_rchild = pGA2+pGC2+pGG2+pGT2 + node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild + + ####################################################### + + pTA = pdiff*(node.lchild.partial[i*16+l*4 + 0]) + pTC = pdiff*(node.lchild.partial[i*16+l*4 + 1]) + pTG = pdiff*(node.lchild.partial[i*16+l*4 + 2]) + pTT = psame*(node.lchild.partial[i*16+l*4 + 3]) + + pTA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0]) + pTC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1]) + pTG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2]) + pTT2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 3]) + + pfromT_lchild = pTA+pTC+pTG+pTT + pfromT_rchild = pTA2+pTC2+pTG2+pTT2 + node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild + site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25 + site_log_like = (log(site_like))*num_pattern + like_list.append(site_log_like) + log_like = sum(like_list) + return log_like + + + +def mcmcbrn(postorder, patterns, rates): + nodes = postorder +# nodes = readnewick(treenewick()) + + mcmc = 0 + output = os.path.join('brnlenMCMC_results.txt') + newf = open(output, 'w') + newf.write('%s\t'%('n_gen')) + newf.write( '%s\t%s\t'%('LnL','LnPr')) + for nl in postorder: + newf.write( 'node%s\t'%(nl.number)) + newf.write('\n') + start_log_prior = 0.0 + for nd in nodes: + start_log_prior += (-nd.edgelen/mean_expo)-(log(mean_expo)) + start_log_like = prepareTree(nodes, patterns, rates) + + + newf.write('%s\t'%(mcmc)) + print 'mcmc gen=', mcmc + print start_log_like, start_log_prior, + + newf.write( '%.6f\t%.6f\t'%(start_log_like,start_log_prior)) + for nl in postorder: + newf.write( '%.6f\t'%(nl.edgelen)) + print nl.edgelen, + print + print '**************************' + + newf.write('\n') +# print + for r in range(n_gen): + for i in range(len(postorder)): + preedgelen = nodes[i].edgelen + currentlike = prepareTree(nodes, patterns, rates) +# currentlike = 0.0 + currentprior = 0.0 + for nd in nodes: + currentprior += (-nd.edgelen/mean_expo)-(log(mean_expo)) + current_ln_posterior = currentlike + currentprior + + u = random.random() + m = exp(0.2*(u-0.5)) + nodes[i].edgelen = preedgelen*m + proposedprior = 0.0 + for nd in nodes: + proposedprior += (-nd.edgelen/mean_expo)-(log(mean_expo)) + + proposedlike = prepareTree(nodes, patterns, rates) + proposed_ln_posterior = proposedlike + proposedprior + hastings_ratio = log(m) + logR = proposed_ln_posterior - current_ln_posterior + hastings_ratio + u2 = random.random() + if log(u2) < logR: + nodes[i].edgelen = nodes[i].edgelen + log_prior = proposedprior + log_likelihood = proposedlike +# print 'log(u2) < logR so new proposal accepted..' + else: + nodes[i].edgelen = preedgelen + log_prior = currentprior + log_likelihood = currentlike + +# print 'log(u2) > logR so failed to accept new proposal..' + + if (r+1) % save_every == 0: + newf.write('%s\t'%(mcmc+1)) + print 'mcmc gen=', mcmc+1 + print log_likelihood,log_prior, + newf.write( '%.6f\t%.6f\t'%(log_likelihood,log_prior)) + for j,k in enumerate(nodes): + newf.write( '%.6f\t'%(k.edgelen)) + print k.edgelen, + newf.write('\n') + print + print '**************************' + + newf.flush() + + mcmc+=1 + + +def treenewick(): + script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + path = os.path.join(script_dir, tree_file_name) + with open(path, 'r') as content: + newick = content.read() + return newick +# + +def gammaRates(alpha): + bounds = [0.0, 0.25, 0.50, 0.75, 1.] + rates = [] + for i in range(4): +# print i + lower = gamma.ppf(bounds[i], alpha, 0, 1./alpha) + upper = gamma.ppf(bounds[i+1], alpha, 0, 1./alpha) + mean_rate = ((gamma.cdf(upper, alpha+1., 0, 1./alpha) - gamma.cdf(lower, alpha+1., 0, 1./alpha)))*4. + rates.append(mean_rate) + return rates + +def prepareTree(postorder, patterns, rates): + likelihood_lists = [] + for nd in postorder: + likelihood_lists.append(allocatePartial(nd, patterns, rates)) +# print 'log-likelihood of the topology =', likelihood_lists[-1] + return likelihood_lists[-1] + +def joinRandomPair(node_list, next_node_number, is_deep_coalescence): + # pick first of two lineages to join and delete from node_list + i = random.randint(1, len(node_list)) + ndi = node_list[i-1] + del node_list[i-1] + + # pick second of two lineages to join and delete from node_list + j = random.randint(1, len(node_list)) + ndj = node_list[j-1] + del node_list[j-1] + + # join selected nodes and add ancestor to node_list + ancnd = node(next_node_number) + ancnd.deep = is_deep_coalescence + ancnd.lchild = ndi + ancnd.descendants = set() + ancnd.descendants |= ndi.descendants + ancnd.descendants |= ndj.descendants + ndi.rsib = ndj + ndi.par = ancnd + ndj.par = ancnd + node_list.append(ancnd) + + return node_list + + +def makeNewick(nd, brlen_scaler = 1.0, start = True): # + global _newick + global _TL + + if start: + _newick = '' + _TL = 0.0 + + if nd.lchild: + _newick += '(' + makeNewick(nd.lchild, brlen_scaler, False) + + else: + blen = nd.edgelen*brlen_scaler + _TL += blen + _newick += '%d:%.5f' % (nd.number, blen) + + if nd.rsib: + _newick += ',' + makeNewick(nd.rsib, brlen_scaler, False) + elif nd.par is not None: + blen = nd.par.edgelen*brlen_scaler + _TL += blen + _newick += '):%.3f' % blen + + return _newick, _TL + +def calcActualHeight(root): + h = 0.0 + nd = root + while nd.lchild: + nd = nd.lchild + h += nd.edgelen + return h + + +def readnewick(tree): + total_length = len(tree) + internal_node_number = -1 + + root = node(internal_node_number) + nd = root + i = 0 + pre = [root] + while i < total_length: + m = tree[i] + + if m =='(': + internal_node_number -= 1 + + child = node(internal_node_number) + pre.append(child) + nd.lchild=child + + child.par=nd + nd=child + elif m == ',': + internal_node_number -= 1 + rsib = node(internal_node_number) + pre.append(rsib) + nd.rsib = rsib + rsib.par=nd.par + nd = rsib + elif m == ')': + nd = nd.par + + elif m == ':': + edge_len_str = '' + i+=1 + m = tree[i] + assert m in ['0','1','2','3','4','5','6','7','8', '9','.'] + while m in ['0','1','2','3','4','5','6','7','8', '9','.']: + edge_len_str += m + i+=1 + m = tree[i] + i -=1 + nd.edgelen = float(edge_len_str) + + + else: + internal_node_number += 1 + + if True: + assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m + mm = '' + while m in ['0','1','2','3','4','5','6','7','8', '9' ]: + + mm += m + + i += 1 + m = tree[i] + nd.number = int(mm) + i -= 1 + + i += 1 + + post = pre[:] + post.reverse() + return post + +def Makenewick(pre): + newickstring = '' + for i,nd in enumerate(pre): + if nd.lchild: + newickstring += '(' + + elif nd.rsib: + newickstring += '%d' %(nd.number) + newickstring += ':%.1f' % nd.edgelen + newickstring += ',' + + else: + newickstring += '%d' %(nd.number) + newickstring += ':%.1f' % nd.edgelen + tmpnd = nd + while (tmpnd.par is not None) and (tmpnd.rsib is None): + newickstring += ')' + newickstring += ':%.1f' % tmpnd.par.edgelen + tmpnd = tmpnd.par + + if tmpnd.par is not None: + newickstring += ',' + return newickstring + +###################yule tree################################################### +# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree +# - num_species is the number of leaves (tips) in the tree +def calcPhi(num_species): + phi = sum([1.0/(K+2.0) for K in range(num_species-1)]) + return phi + +# yuleTree creates a species tree in which edge lengths are measured in +# expected number of substitutions. +# - num_species is the number of leaves +# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio +def yuleTree(num_species, mu_over_s): + # create num_species nodes numbered 1, 2, ..., num_species + nodes = [node(i+1) for i in range(num_species)] + + next_node_number = num_species + 1 + while len(nodes) > 1: + # choose a speciation time in generations + K = float(len(nodes)) + mean_epoch_length = mu_over_s/K + t = random.gammavariate(1.0, mean_epoch_length) + + # update each node's edgelen + for n in nodes: + n.edgelen += t # same as: n.edgelen = n.edgelen + t + + nodes = joinRandomPair(nodes, next_node_number, False) + next_node_number += 1 + + return nodes[0] + +# calcExpectedHeight returns the expected height of the species tree in terms of +# expected number of substitutions from the root to one tip. +# - num_species is the number of leaves +# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio +def calcExpectedHeight(num_species, mu_over_s): + return mu_over_s*calcPhi(num_species) + + +if __name__ == '__main__': + random_seed = 348889 # 7632557, 12345 + number_of_species = 5 + mutation_speciation_rate_ratio = 0.689655172 # 0.689655172 # yields tree height 1 for 6 species + random.seed(random_seed) + species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio) +# print '#########' +# print species_tree_root + newick = makeNewick(species_tree_root) +# print 'Random number seed: %d' % random_seed +# print 'Simulating one tree:' +# print ' number of species = %d' % number_of_species +# print ' mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio +# print ' actual tree length =',newick[1] + expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio) +# print ' expected height =',expected_height + actual_height = calcActualHeight(species_tree_root) +# print ' actual height =',actual_height + print 'true tree: ',newick[0] + print '**************************' + +# yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)' + rates_list = gammaRates(alpha) + postorder = readnewick(treenewick()) + result = prepareTree(postorder, readSeq.patterns(sequence_file), rates_list) +# try1 = readSeq.patterns() + result2 = mcmcbrn(postorder, readSeq.patterns(sequence_file), rates_list) diff --git a/brnlenMCMC/readSeq.py b/brnlenMCMC/readSeq.py new file mode 100644 index 0000000..91c58a5 --- /dev/null +++ b/brnlenMCMC/readSeq.py @@ -0,0 +1,69 @@ +def patterns(sequence_file): + # + import re, os, glob, itertools, fnmatch, sys, shutil + from itertools import combinations + from collections import Counter + + script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) +# path = os.path.join(script_dir, 'nexus') + + genes = [] + data = {} +# print 'Reading nexus files...' + for filename in glob.glob(os.path.join(script_dir, sequence_file)): + + m = re.match('(.+).nex', os.path.basename(filename)) + gene_name = m.group(1) +# print 'gene_name=', gene_name + genes.append(gene_name) + f = open(filename, 'r').read() + + m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S) + ntax = int(m.group(1)) +# print 'ntax=', ntax + + m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S) + nchar = int(m.group(1)) +# print 'nchar=', nchar + + m = re.search('Matrix\s+(.+?);', f, re.M | re.S) + matrix = m.group(1).strip() + matrix_lines = matrix.split('\n') + + taxon_names = [] + sequences = {} + sequences_list = [] + for line in matrix_lines: + parts = line.strip().split() + assert len(parts) == 2 + taxon_name = parts[0] + sequence = parts[1] + + taxon_names.append(taxon_name) + sequences_list.append(sequence) + sequences[taxon_name] = sequence + + pattern_list = [] + + k=0 + while k < nchar: + site_pattern = '' + for i,m in enumerate(sequences_list): + site_pattern += m[k] + pattern_list.append(site_pattern) + k+=1 + pattern_dict = dict() + for i in pattern_list: + pattern_dict[i] = pattern_dict.get(i, 0) + 1 + + tmp = [] + for key in pattern_dict.keys(): ###convert dict to key of tupules +# print 'key=', key + tmp.append((pattern_dict[key],key)) + + sorted_values = sorted(tmp) ###sorted according to key smaller to larger + sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order + + return pattern_dict + + diff --git a/brnlenMCMC/readtree.py b/brnlenMCMC/readtree.py new file mode 100644 index 0000000..a7b4943 --- /dev/null +++ b/brnlenMCMC/readtree.py @@ -0,0 +1,22 @@ +# def treenewck(): +# script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) +# path = os.path.join(script_dir, 'nexus') +# for filename in glob.glob(os.path.join(path, '*.tre*')): +# f = open(filename, 'r').read() + +import re, os, glob, itertools, fnmatch, sys, shutil +# dirname, filename = os.path.split(os.path.abspath(__file__)) +def treenewick(): + script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) +# print script_dir + path = os.path.join(script_dir, 'tree.tre') + with open(path, 'r') as content: + newick = content.read() + return newick +a = treenewick() +print a + + +# dirname, filename = os.path.split(os.path.abspath(__file__)) +# print "running from", dirname +# print "file is", filename \ No newline at end of file diff --git a/brnlenMCMC/tree.tre b/brnlenMCMC/tree.tre new file mode 100644 index 0000000..2d84738 --- /dev/null +++ b/brnlenMCMC/tree.tre @@ -0,0 +1 @@ +(5:1.8601,((3:0.47109,2:0.47109):0.492,(4:0.05805,1:0.05805):0.906):0.896) \ No newline at end of file