diff --git a/treeLike.py b/treeLike.py deleted file mode 100644 index 1142e5e..0000000 --- a/treeLike.py +++ /dev/null @@ -1,374 +0,0 @@ -################################## -# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree -# topology, and computes log-likelihood of the topology under Jukes Cantor model -################################### - - -import readSeq -import random -import re, os, itertools, sys, glob -from itertools import chain -from math import exp, log - -########################################################################################## -tree_file_name = 'tree.tre' -sequence_file = 'example3.nex' -########################################################################################## - - -class node(object): - def __init__(self, ndnum): # initialization function - self.rsib = None # right sibling - self.lchild = None # left child - self.par = None # parent node - self.number = ndnum # node number (internals negative, tips 0 or positive) - self.edgelen = 0.0 # branch length - self.descendants = set([ndnum]) # set containing descendant leaf set - self.partial = None # will have length 4*npatterns - - - def __str__(self): - # __str__ is a built-in function that is used by print to show an object - descendants_as_string = ','.join(['%d' % d for d in self.descendants]) - - lchildstr = 'None' - if self.lchild is not None: - lchildstr = '%d' % self.lchild.number - - rsibstr = 'None' - if self.rsib is not None: - rsibstr = '%d' % self.rsib.number - - parstr = 'None' - if self.par is not None: - parstr = '%d' % self.par.number - - return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string) - -def allocatePartial(node, patterns): - if node.number > 0: - npatterns = len(patterns) - node.partial = [0.0]*(4*npatterns) - for i,pattern in enumerate(patterns.keys()): - base = pattern[node.number-1] - if base == 'A': - node.partial[i*4 + 0] = 1.0 - elif base == 'C': - node.partial[i*4 + 1] = 1.0 - elif base == 'G': - node.partial[i*4 + 2] = 1.0 - elif base == 'T': - node.partial[i*4 + 3] = 1.0 - else: - assert(False), 'oops, something went horribly wrong!' - - else: - npatterns = len(patterns) - node.partial = [0.0]*(4*npatterns) - like_list = [] - for i,pattern in enumerate(patterns.keys()): - psame = (0.25+0.75*exp(-4.0*(node.lchild.edgelen)/3.0)) - pdiff = (0.25-0.25*exp(-4.0*(node.lchild.edgelen)/3.0)) - - psame2 = (0.25+0.75*exp(-4.0*(node.lchild.rsib.edgelen)/3.0)) - pdiff2 = (0.25-0.25*exp(-4.0*(node.lchild.rsib.edgelen)/3.0)) - - num_pattern = patterns[pattern] - - pAA = psame*(node.lchild.partial[i*4 + 0]) - pAC = pdiff*(node.lchild.partial[i*4 + 1]) - pAG = pdiff*(node.lchild.partial[i*4 + 2]) - pAT = pdiff*(node.lchild.partial[i*4 + 3]) - - pAA2 = psame2*(node.lchild.rsib.partial[i*4 + 0]) - pAC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1]) - pAG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2]) - pAT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3]) - - pfromA_lchild = pAA+pAC+pAG+pAT - pfromA_rchild = pAA2+pAC2+pAG2+pAT2 - node.partial[i*4 + 0] = pfromA_lchild*pfromA_rchild - - ###################################################### - - pCA = pdiff*(node.lchild.partial[i*4 + 0]) - pCC = psame*(node.lchild.partial[i*4 + 1]) - pCG = pdiff*(node.lchild.partial[i*4 + 2]) - pCT = pdiff*(node.lchild.partial[i*4 + 3]) - - pCA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0]) - pCC2 = psame2*(node.lchild.rsib.partial[i*4 + 1]) - pCG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2]) - pCT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3]) - - pfromC_lchild = pCA+pCC+pCG+pCT - pfromC_rchild = pCA2+pCC2+pCG2+pCT2 - node.partial[i*4 + 1] = pfromC_lchild*pfromC_rchild - - ####################################################### -# - pGA = pdiff*(node.lchild.partial[i*4 + 0]) - pGC = pdiff*(node.lchild.partial[i*4 + 1]) - pGG = psame*(node.lchild.partial[i*4 + 2]) - pGT = pdiff*(node.lchild.partial[i*4 + 3]) -# - pGA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0]) - pGC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1]) - pGG2 = psame2*(node.lchild.rsib.partial[i*4 + 2]) - pGT2 = pdiff2*(node.lchild.rsib.partial[i*4 + 3]) -# - pfromG_lchild = pGA+pGC+pGG+pGT - pfromG_rchild = pGA2+pGC2+pGG2+pGT2 - node.partial[i*4 + 2] = pfromG_lchild*pfromG_rchild - - ####################################################### - - pTA = pdiff*(node.lchild.partial[i*4 + 0]) - pTC = pdiff*(node.lchild.partial[i*4 + 1]) - pTG = pdiff*(node.lchild.partial[i*4 + 2]) - pTT = psame*(node.lchild.partial[i*4 + 3]) -# - pTA2 = pdiff2*(node.lchild.rsib.partial[i*4 + 0]) - pTC2 = pdiff2*(node.lchild.rsib.partial[i*4 + 1]) - pTG2 = pdiff2*(node.lchild.rsib.partial[i*4 + 2]) - pTT2 = psame2*(node.lchild.rsib.partial[i*4 + 3]) -# - pfromT_lchild = pTA+pTC+pTG+pTT - pfromT_rchild = pTA2+pTC2+pTG2+pTT2 - node.partial[i*4 + 3] = pfromT_lchild*pfromT_rchild - - ######################################################### - - site_log_like = (log((sum(node.partial[i*4:i*4+4]))*0.25))*num_pattern - like_list.append(site_log_like) - - log_Like = sum(like_list) - return log_Like - -def treenewick(): - script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) - path = os.path.join(script_dir, tree_file_name) - with open(path, 'r') as content: - newick = content.read() - return newick - -def prepareTree(postorder, patterns): - likelihood_lists = [] - for nd in postorder: - likelihood_lists.append(allocatePartial(nd, patterns)) - print 'log-likelihood of the topology =', likelihood_lists[-1] - -def joinRandomPair(node_list, next_node_number, is_deep_coalescence): - # pick first of two lineages to join and delete from node_list - i = random.randint(1, len(node_list)) - ndi = node_list[i-1] - del node_list[i-1] - - # pick second of two lineages to join and delete from node_list - j = random.randint(1, len(node_list)) - ndj = node_list[j-1] - del node_list[j-1] - - # join selected nodes and add ancestor to node_list - ancnd = node(next_node_number) - ancnd.deep = is_deep_coalescence - ancnd.lchild = ndi - ancnd.descendants = set() - ancnd.descendants |= ndi.descendants - ancnd.descendants |= ndj.descendants - ndi.rsib = ndj - ndi.par = ancnd - ndj.par = ancnd - node_list.append(ancnd) - - return node_list - - -def makeNewick(nd, brlen_scaler = 1.0, start = True): # - global _newick - global _TL - - if start: - _newick = '' - _TL = 0.0 - - if nd.lchild: - _newick += '(' - makeNewick(nd.lchild, brlen_scaler, False) - - else: - blen = nd.edgelen*brlen_scaler - _TL += blen - _newick += '%d:%.5f' % (nd.number, blen) - - if nd.rsib: - _newick += ',' - makeNewick(nd.rsib, brlen_scaler, False) - elif nd.par is not None: - blen = nd.par.edgelen*brlen_scaler - _TL += blen - _newick += '):%.3f' % blen - - return _newick, _TL - -def calcActualHeight(root): - h = 0.0 - nd = root - while nd.lchild: - nd = nd.lchild - h += nd.edgelen - return h - - -def readnewick(tree): - total_length = len(tree) - internal_node_number = -1 - - root = node(internal_node_number) - nd = root - i = 0 - pre = [root] - while i < total_length: - m = tree[i] - - if m =='(': - internal_node_number -= 1 - - child = node(internal_node_number) - pre.append(child) - nd.lchild=child - - child.par=nd - nd=child - elif m == ',': - internal_node_number -= 1 - rsib = node(internal_node_number) - pre.append(rsib) - nd.rsib = rsib - rsib.par=nd.par - nd = rsib - elif m == ')': - nd = nd.par - - elif m == ':': - edge_len_str = '' - i+=1 - m = tree[i] - assert m in ['0','1','2','3','4','5','6','7','8', '9','.'] - while m in ['0','1','2','3','4','5','6','7','8', '9','.']: - edge_len_str += m - i+=1 - m = tree[i] - i -=1 - nd.edgelen = float(edge_len_str) - - - else: - internal_node_number += 1 - - if True: - assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m - mm = '' - while m in ['0','1','2','3','4','5','6','7','8', '9' ]: - - mm += m - - i += 1 - m = tree[i] - nd.number = int(mm) - i -= 1 - - i += 1 - - post = pre[:] - post.reverse() - return post - -def Makenewick(pre): - newickstring = '' - for i,nd in enumerate(pre): - if nd.lchild: - newickstring += '(' - - elif nd.rsib: - newickstring += '%d' %(nd.number) - newickstring += ':%.1f' % nd.edgelen - newickstring += ',' - - else: - newickstring += '%d' %(nd.number) - newickstring += ':%.1f' % nd.edgelen - tmpnd = nd - while (tmpnd.par is not None) and (tmpnd.rsib is None): - newickstring += ')' - newickstring += ':%.1f' % tmpnd.par.edgelen - tmpnd = tmpnd.par - - if tmpnd.par is not None: - newickstring += ',' - return newickstring - -###################yule tree################################################### -# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree -# - num_species is the number of leaves (tips) in the tree -def calcPhi(num_species): - phi = sum([1.0/(K+2.0) for K in range(num_species-1)]) - return phi - -# yuleTree creates a species tree in which edge lengths are measured in -# expected number of substitutions. -# - num_species is the number of leaves -# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio -def yuleTree(num_species, mu_over_s): - # create num_species nodes numbered 1, 2, ..., num_species - nodes = [node(i+1) for i in range(num_species)] - - next_node_number = num_species + 1 - while len(nodes) > 1: - # choose a speciation time in generations - K = float(len(nodes)) - mean_epoch_length = mu_over_s/K - t = random.gammavariate(1.0, mean_epoch_length) - - # update each node's edgelen - for n in nodes: - n.edgelen += t # same as: n.edgelen = n.edgelen + t - - nodes = joinRandomPair(nodes, next_node_number, False) - next_node_number += 1 - - return nodes[0] - -# calcExpectedHeight returns the expected height of the species tree in terms of -# expected number of substitutions from the root to one tip. -# - num_species is the number of leaves -# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio -def calcExpectedHeight(num_species, mu_over_s): - return mu_over_s*calcPhi(num_species) - - -if __name__ == '__main__': - random_seed = 24553 # 7632557, 12345 - number_of_species = 5 - mutation_speciation_rate_ratio = 0.4 # 0.689655172 # yields tree height 1 for 6 species - random.seed(random_seed) - species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio) -# print '#########' -# print species_tree_root - newick = makeNewick(species_tree_root) -# print 'Random number seed: %d' % random_seed -# print 'Simulating one tree:' -# print ' number of species = %d' % number_of_species -# print ' mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio -# print ' actual tree length =',newick[1] - expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio) -# print ' expected height =',expected_height - actual_height = calcActualHeight(species_tree_root) -# print ' actual height =',actual_height -# print ' newick: ',newick[0] - - -# yuletree = '(((1:0.03915,5:0.03915):0.387,(4:0.42253,2:0.42253):0.004):0.118,3:0.54433)' - - postorder = readnewick(treenewick()) - result = prepareTree(postorder, readSeq.patterns(sequence_file)) \ No newline at end of file