Skip to content

Commit

Permalink
final script to compute log-likilhood of a toplogy under Jukes Cantor…
Browse files Browse the repository at this point in the history
… model
  • Loading branch information
sun13005 committed Apr 22, 2017
1 parent 152caa6 commit a6626b4
Show file tree
Hide file tree
Showing 2 changed files with 443 additions and 78 deletions.
377 changes: 377 additions & 0 deletions tree_like.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,377 @@
##################################
# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree
# topology, and computes log-likelihood of the topology under Jukes Cantor model
###################################

import readseq
import random
import re, os, itertools, sys, glob
from itertools import chain
from math import exp, log
class node(object):
def __init__(self, ndnum): # initialization function
self.rsib = None # right sibling
self.lchild = None # left child
self.par = None # parent node
self.number = ndnum # node number (internals negative, tips 0 or positive)
self.edgelen = 0.0 # branch length
self.descendants = set([ndnum]) # set containing descendant leaf set
self.partial = None # will have length 4*npatterns

def allocatePartial(self, patterns):
if self.number > 0:
npatterns = len(patterns)
self.partial = [0.0]*(4*npatterns)
for i,pattern in enumerate(patterns.keys()):
base = pattern[self.number-1]
if base == 'A':
self.partial[i*4 + 0] = 1.0
elif base == 'C':
self.partial[i*4 + 1] = 1.0
elif base == 'G':
self.partial[i*4 + 2] = 1.0
elif base == 'T':
self.partial[i*4 + 3] = 1.0
else:
assert(False), 'oops, something went horribly wrong!'

else:
npatterns = len(patterns)
self.partial = [0.0]*(4*npatterns)
like_list = []
for i,pattern in enumerate(patterns.keys()):
psame = (0.25+0.75*exp(-4.0*(self.lchild.edgelen)/3.0))
pdiff = (0.25-0.25*exp(-4.0*(self.lchild.edgelen)/3.0))

psame2 = (0.25+0.75*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))
pdiff2 = (0.25-0.25*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))

num_pattern = patterns[pattern]

pAA = psame*(self.lchild.partial[i*4 + 0])
pAC = pdiff*(self.lchild.partial[i*4 + 1])
pAG = pdiff*(self.lchild.partial[i*4 + 2])
pAT = pdiff*(self.lchild.partial[i*4 + 3])

pAA2 = psame2*(self.lchild.rsib.partial[i*4 + 0])
pAC2 = pdiff2*(self.lchild.rsib.partial[i*4 + 1])
pAG2 = pdiff2*(self.lchild.rsib.partial[i*4 + 2])
pAT2 = pdiff2*(self.lchild.rsib.partial[i*4 + 3])

pfromA_lchild = pAA+pAC+pAG+pAT
pfromA_rchild = pAA2+pAC2+pAG2+pAT2
self.partial[i*4 + 0] = pfromA_lchild*pfromA_rchild

######################################################

pCA = pdiff*(self.lchild.partial[i*4 + 0])
pCC = psame*(self.lchild.partial[i*4 + 1])
pCG = pdiff*(self.lchild.partial[i*4 + 2])
pCT = pdiff*(self.lchild.partial[i*4 + 3])

pCA2 = pdiff2*(self.lchild.rsib.partial[i*4 + 0])
pCC2 = psame2*(self.lchild.rsib.partial[i*4 + 1])
pCG2 = pdiff2*(self.lchild.rsib.partial[i*4 + 2])
pCT2 = pdiff2*(self.lchild.rsib.partial[i*4 + 3])

pfromC_lchild = pCA+pCC+pCG+pCT
pfromC_rchild = pCA2+pCC2+pCG2+pCT2
self.partial[i*4 + 1] = pfromC_lchild*pfromC_rchild

#######################################################
#
pGA = pdiff*(self.lchild.partial[i*4 + 0])
pGC = pdiff*(self.lchild.partial[i*4 + 1])
pGG = psame*(self.lchild.partial[i*4 + 2])
pGT = pdiff*(self.lchild.partial[i*4 + 3])
#
pGA2 = pdiff2*(self.lchild.rsib.partial[i*4 + 0])
pGC2 = pdiff2*(self.lchild.rsib.partial[i*4 + 1])
pGG2 = psame2*(self.lchild.rsib.partial[i*4 + 2])
pGT2 = pdiff2*(self.lchild.rsib.partial[i*4 + 3])
#
pfromG_lchild = pGA+pGC+pGG+pGT
pfromG_rchild = pGA2+pGC2+pGG2+pGT2
self.partial[i*4 + 2] = pfromG_lchild*pfromG_rchild

#######################################################

pTA = pdiff*(self.lchild.partial[i*4 + 0])
pTC = pdiff*(self.lchild.partial[i*4 + 1])
pTG = pdiff*(self.lchild.partial[i*4 + 2])
pTT = psame*(self.lchild.partial[i*4 + 3])
#
pTA2 = pdiff2*(self.lchild.rsib.partial[i*4 + 0])
pTC2 = pdiff2*(self.lchild.rsib.partial[i*4 + 1])
pTG2 = pdiff2*(self.lchild.rsib.partial[i*4 + 2])
pTT2 = psame2*(self.lchild.rsib.partial[i*4 + 3])
#
pfromT_lchild = pTA+pTC+pTG+pTT
pfromT_rchild = pTA2+pTC2+pTG2+pTT2
self.partial[i*4 + 3] = pfromT_lchild*pfromT_rchild

#########################################################

site_log_like = (log((sum(self.partial[i*4:i*4+4]))*0.25))*num_pattern
like_list.append(site_log_like)

log_Like = sum(like_list)
return log_Like
# print 'log-like=', log_Like

def __str__(self):
# __str__ is a built-in function that is used by print to show an object
descendants_as_string = ','.join(['%d' % d for d in self.descendants])

lchildstr = 'None'
if self.lchild is not None:
lchildstr = '%d' % self.lchild.number

rsibstr = 'None'
if self.rsib is not None:
rsibstr = '%d' % self.rsib.number

parstr = 'None'
if self.par is not None:
parstr = '%d' % self.par.number

return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)


def prepareTree(postorder, patterns):
likelihood_lists = []
for nd in postorder:
likelihood_lists.append(nd.allocatePartial(patterns))
print 'log-likelihood of the topology =', likelihood_lists[-1]

def joinRandomPair(node_list, next_node_number, is_deep_coalescence):
# pick first of two lineages to join and delete from node_list
i = random.randint(1, len(node_list))
ndi = node_list[i-1]
del node_list[i-1]

# pick second of two lineages to join and delete from node_list
j = random.randint(1, len(node_list))
ndj = node_list[j-1]
del node_list[j-1]

# join selected nodes and add ancestor to node_list
ancnd = node(next_node_number)
ancnd.deep = is_deep_coalescence
ancnd.lchild = ndi
ancnd.descendants = set()
ancnd.descendants |= ndi.descendants
ancnd.descendants |= ndj.descendants
ndi.rsib = ndj
ndi.par = ancnd
ndj.par = ancnd
node_list.append(ancnd)

return node_list


def makeNewick(nd, brlen_scaler = 1.0, start = True): #
global _newick
global _TL

if start:
_newick = ''
_TL = 0.0

if nd.lchild:
_newick += '('
makeNewick(nd.lchild, brlen_scaler, False)

else:
blen = nd.edgelen*brlen_scaler
_TL += blen
_newick += '%d:%.5f' % (nd.number, blen)

if nd.rsib:
_newick += ','
makeNewick(nd.rsib, brlen_scaler, False)
elif nd.par is not None:
blen = nd.par.edgelen*brlen_scaler
_TL += blen
_newick += '):%.3f' % blen

return _newick, _TL

def calcActualHeight(root):
h = 0.0
nd = root
while nd.lchild:
nd = nd.lchild
h += nd.edgelen
return h

def getPostorder(nd, start = True): # how to travel across tree
global _postorder
if start:
_postorder = [] # start with an empty list


if nd.lchild:
getPostorder(nd.lchild, False) # recursive function to
_postorder.append(nd)

if nd.rsib:
getPostorder(nd.rsib, False)



return _postorder


def readnewick(tree):
total_length = len(tree)
internal_node_number = -1

root = node(internal_node_number)
nd = root
i = 0
pre = [root]
while i < total_length:
m = tree[i]

if m =='(':
internal_node_number -= 1

child = node(internal_node_number)
pre.append(child)
nd.lchild=child

child.par=nd
nd=child
elif m == ',':
internal_node_number -= 1
rsib = node(internal_node_number)
pre.append(rsib)
nd.rsib = rsib
rsib.par=nd.par
nd = rsib
elif m == ')':
nd = nd.par

elif m == ':':
edge_len_str = ''
i+=1
m = tree[i]
assert m in ['0','1','2','3','4','5','6','7','8', '9','.']
while m in ['0','1','2','3','4','5','6','7','8', '9','.']:
edge_len_str += m
i+=1
m = tree[i]
i -=1
nd.edgelen = float(edge_len_str)


else:
internal_node_number += 1

if True:
assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m
mm = ''
while m in ['0','1','2','3','4','5','6','7','8', '9' ]:

mm += m

i += 1
m = tree[i]
nd.number = int(mm)
i -= 1

i += 1

post = pre[:]
post.reverse()
return post

def Makenewick(pre):
newickstring = ''
for i,nd in enumerate(pre):
if nd.lchild:
newickstring += '('

elif nd.rsib:
newickstring += '%d' %(nd.number)
newickstring += ':%.1f' % nd.edgelen
newickstring += ','

else:
newickstring += '%d' %(nd.number)
newickstring += ':%.1f' % nd.edgelen
tmpnd = nd
while (tmpnd.par is not None) and (tmpnd.rsib is None):
newickstring += ')'
newickstring += ':%.1f' % tmpnd.par.edgelen
tmpnd = tmpnd.par

if tmpnd.par is not None:
newickstring += ','
return newickstring

###################yule tree###################################################
# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree
# - num_species is the number of leaves (tips) in the tree
def calcPhi(num_species):
phi = sum([1.0/(K+2.0) for K in range(num_species-1)])
return phi

# yuleTree creates a species tree in which edge lengths are measured in
# expected number of substitutions.
# - num_species is the number of leaves
# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
def yuleTree(num_species, mu_over_s):
# create num_species nodes numbered 1, 2, ..., num_species
nodes = [node(i+1) for i in range(num_species)]

next_node_number = num_species + 1
while len(nodes) > 1:
# choose a speciation time in generations
K = float(len(nodes))
mean_epoch_length = mu_over_s/K
t = random.gammavariate(1.0, mean_epoch_length)

# update each node's edgelen
for n in nodes:
n.edgelen += t # same as: n.edgelen = n.edgelen + t

nodes = joinRandomPair(nodes, next_node_number, False)
next_node_number += 1

return nodes[0]

# calcExpectedHeight returns the expected height of the species tree in terms of
# expected number of substitutions from the root to one tip.
# - num_species is the number of leaves
# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
def calcExpectedHeight(num_species, mu_over_s):
return mu_over_s*calcPhi(num_species)


if __name__ == '__main__':
random_seed = 24553 # 7632557, 12345
number_of_species = 5
mutation_speciation_rate_ratio = 0.4 # 0.689655172 # yields tree height 1 for 6 species
random.seed(random_seed)
species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio)
# print '#########'
# print species_tree_root
newick = makeNewick(species_tree_root)
# print 'Random number seed: %d' % random_seed
# print 'Simulating one tree:'
# print ' number of species = %d' % number_of_species
# print ' mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio
# print ' actual tree length =',newick[1]
expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio)
# print ' expected height =',expected_height
actual_height = calcActualHeight(species_tree_root)
# print ' actual height =',actual_height
# print ' newick: ',newick[0]


yuletree = '(((1:0.03915,5:0.03915):0.387,(4:0.42253,2:0.42253):0.004):0.118,3:0.54433)'

postorder = readnewick(yuletree)
result = prepareTree(postorder, readseq.patterns())
Loading

0 comments on commit a6626b4

Please sign in to comment.