##########################################################################################
''' This script reads a DNA matrix in nexus format (through the readSeq.py module) and
performs an MCMC tree search under the GTR+GAMMA model. It is extremely slow: a
scripting language such as Python is not well suited to this task. I wrote this script
to run on a small data set (6 taxa) to understand how the full GTR model works, so
please don't use it as full-fledged phylogeny software.
contact: suman.neupane@uconn.edu
'''
##########################################################################################
import readSeq
import random
import re, os, itertools, sys, glob
from itertools import chain
from scipy.stats import gamma
from math import exp, log, lgamma
import operator
import numpy as np
##########################################################################################
# tree_file_name = 'real_treeForSeqSimulation.tre' # if supplied a starting tree
sequence_file = 'simulated_sequences.nex'
# sequence_file = 'fake.nex'
# base_frequences = [0.25, 0.25, 0.25, 0.25] # stationary frequencies of A, C, G, T
kappa = 3.0 # transition/transversion ratio; set kappa to 1.0 for F81+GAMMA (not used by the full-GTR code below)
# base_frequences = [0.37272151039415824, 0.2951859176504787, 0.19949665725757845, 0.13259591469778456] # stationary frequencies of A, C, G, T, set all base_frequences to 0.25 for K80+GAMMA
alpha = 0.5 # gamma shape parameter for rate categories
n_gen = 100
save_every = 5
mean_expo = 10. # mean_expo = mean of exponential distribution for branch length prior
base_prior = [10., 10., 10., 10.] # Dirichlet prior for base frequencies
exrate_prior = [10., 10., 10., 10., 10., 10.] # Dirichlet prior for rate exchangeabilities
##########################################################################################
class node(object):
def __init__(self, ndnum): # initialization function
self.rsib = None # right sibling
self.lchild = None # left child
self.par = None # parent node
self.number = ndnum # node number (internals negative, tips 0 or positive)
self.edgelen = 0.0 # branch length
self.descendants = set([ndnum]) # set containing descendant leaf set
self.partial = None # will have length 4*npatterns
self.freq = [0.0, 0.0, 0.0, 0.0]
self.ex_rate = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
def __str__(self):
# __str__ is a built-in function that is used by print to show an object
descendants_as_string = ','.join(['%d' % d for d in self.descendants])
lchildstr = 'None'
if self.lchild is not None:
lchildstr = '%d' % self.lchild.number
rsibstr = 'None'
if self.rsib is not None:
rsibstr = '%d' % self.rsib.number
parstr = 'None'
if self.par is not None:
parstr = '%d' % self.par.number
return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)
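# Illustrative example (not used by the analysis): linking three nodes into a
# minimal two-leaf tree using the lchild/rsib/par pointers defined above.
def exampleTwoLeafTree():
    root = node(-1)
    left = node(1)
    right = node(2)
    root.lchild = left # a node points only at its leftmost child
    left.rsib = right # further children are chained through rsib
    left.par = root
    right.par = root
    left.edgelen = 0.1
    right.edgelen = 0.2
    return root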
# computing the mean rates of 4 gamma rate categories for among-site rate heterogeneity
def gammaRates(alpha):
bounds = [0.0, 0.25, 0.50, 0.75, 1.]
rates = []
for i in range(4):
# print i
lower = gamma.ppf(bounds[i], alpha, 0, 1./alpha)
upper = gamma.ppf(bounds[i+1], alpha, 0, 1./alpha)
mean_rate = ((gamma.cdf(upper, alpha+1., 0, 1./alpha) - gamma.cdf(lower, alpha+1., 0, 1./alpha)))*4.
rates.append(mean_rate)
return rates
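# Minimal sanity-check sketch (illustrative, not called anywhere): the four
# category means returned by gammaRates() each have weight 1/4, so their
# average should be close to 1.0 for any positive alpha.
def checkGammaRatesMeanOne(alpha_value):
    return sum(gammaRates(alpha_value))/4.0 # expected to be approximately 1.0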
# computing partial (conditional) likelihoods at a node: tips get observed-state partials, internal nodes combine the partials of their two children
def allocatePartial(node, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f):
if node.number > 0:
npatterns = len(patterns)
if node.partial is None:
node.partial = [0.0]*(4*4*npatterns)
# print len(node.partial)
for i,pattern in enumerate(patterns.keys()):
base2 = pattern[node.number-1]
for l in range(4):
if base2 == 'A':
node.partial[i*16+l*4 + 0] = 1.0
elif base2 == 'C':
node.partial[i*16+l*4 + 1] = 1.0
elif base2 == 'G':
node.partial[i*16+l*4 + 2] = 1.0
elif base2 == 'T':
node.partial[i*16+l*4 + 3] = 1.0
# print '>>>>>>>>>>>>>', node.partial
else:
assert(False), 'oops, something went horribly wrong!'
else:
npatterns = len(patterns)
if node.partial is None:
node.partial = [0.0]*(4*4*npatterns)
like_list = []
for i,pattern in enumerate(patterns.keys()):
m_list = []
num_pattern = patterns[pattern]
for l,r in enumerate(rates):
alphaT1 = r*((node.lchild.edgelen)/(2.*((freqA * freqC * _a) + (freqA * freqG * _b) + (freqA * freqT * _c) + (freqC * freqG * _d) + (freqC * freqT * _e) + (freqG * freqT * _f))))
alphaT2 = r*((node.lchild.rsib.edgelen)/(2.*((freqA * freqC * _a) + (freqA * freqG * _b) + (freqA * freqT * _c) + (freqC * freqG * _d) + (freqC * freqT * _e) + (freqG * freqT * _f))))
# GTR rate matrices
A1 = np.mat([[-(alphaT1*freqC*_a + alphaT1*freqG*_b + alphaT1*freqT*_c),alphaT1*freqC*_a, alphaT1*freqG*_b, alphaT1*freqT*_c],
[ alphaT1*freqA*_a,-(alphaT1*freqA*_a + alphaT1*freqG*_d + alphaT1*freqT*_e), alphaT1*freqG*_d, alphaT1*freqT*_e],
[ alphaT1*freqA*_b, alphaT1*freqC*_d,-(alphaT1*freqA*_b + alphaT1*freqC*_d + alphaT1*freqT*_f), alphaT1*freqT*_f],
[ alphaT1*freqA*_c, alphaT1*freqC*_e, alphaT1*freqG*_f,-(alphaT1*freqA*_c + alphaT1*freqC*_e + alphaT1*freqG*_f)]])
A2 = np.mat([[-(alphaT2*freqC*_a + alphaT2*freqG*_b + alphaT2*freqT*_c),alphaT2*freqC*_a, alphaT2*freqG*_b, alphaT2*freqT*_c],
[ alphaT2*freqA*_a,-(alphaT2*freqA*_a + alphaT2*freqG*_d + alphaT2*freqT*_e), alphaT2*freqG*_d, alphaT2*freqT*_e],
[ alphaT2*freqA*_b, alphaT2*freqC*_d,-(alphaT2*freqA*_b + alphaT2*freqC*_d + alphaT2*freqT*_f), alphaT2*freqT*_f],
[ alphaT2*freqA*_c, alphaT2*freqC*_e, alphaT2*freqG*_f,-(alphaT2*freqA*_c + alphaT2*freqC*_e + alphaT2*freqG*_f)]])
eigenvalue1, eigenvector1 = np.linalg.eig(A1)
eigenvalue2, eigenvector2 = np.linalg.eig(A2)
U1 = eigenvector1
Uinv1 = np.linalg.inv(U1)
D1 = np.diag(np.exp(eigenvalue1))
P1 = np.dot(U1,np.dot(D1,Uinv1))
U2 = eigenvector2
Uinv2 = np.linalg.inv(U2)
D2 = np.diag(np.exp(eigenvalue2))
P2 = np.dot(U2,np.dot(D2,Uinv2))
pAA = P1[[0],[0]] * (node.lchild.partial[i*16+l*4 + 0])
pAC = P1[[0],[1]] * (node.lchild.partial[i*16+l*4 + 1])
pAG = P1[[0],[2]] * (node.lchild.partial[i*16+l*4 + 2])
pAT = P1[[0],[3]] * (node.lchild.partial[i*16+l*4 + 3])
pAA2 = P2[[0],[0]] * (node.lchild.rsib.partial[i*16+l*4 + 0])
pAC2 = P2[[0],[1]] * (node.lchild.rsib.partial[i*16+l*4 + 1])
pAG2 = P2[[0],[2]] * (node.lchild.rsib.partial[i*16+l*4 + 2])
pAT2 = P2[[0],[3]] * (node.lchild.rsib.partial[i*16+l*4 + 3])
pfromA_lchild = pAA+pAC+pAG+pAT
pfromA_rchild = pAA2+pAC2+pAG2+pAT2
node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild
######################################################
pCA = P1[[1],[0]] * (node.lchild.partial[i*16+l*4 + 0])
pCC = P1[[1],[1]] * (node.lchild.partial[i*16+l*4 + 1])
pCG = P1[[1],[2]] * (node.lchild.partial[i*16+l*4 + 2])
pCT = P1[[1],[3]] * (node.lchild.partial[i*16+l*4 + 3])
pCA2 = P2[[1],[0]] * (node.lchild.rsib.partial[i*16+l*4 + 0])
pCC2 = P2[[1],[1]] * (node.lchild.rsib.partial[i*16+l*4 + 1])
pCG2 = P2[[1],[2]] * (node.lchild.rsib.partial[i*16+l*4 + 2])
pCT2 = P2[[1],[3]] * (node.lchild.rsib.partial[i*16+l*4 + 3])
pfromC_lchild = pCA+pCC+pCG+pCT
pfromC_rchild = pCA2+pCC2+pCG2+pCT2
node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild
#######################################################
pGA = P1[[2],[0]] * (node.lchild.partial[i*16+l*4 + 0])
pGC = P1[[2],[1]] * (node.lchild.partial[i*16+l*4 + 1])
pGG = P1[[2],[2]] * (node.lchild.partial[i*16+l*4 + 2])
pGT = P1[[2],[3]] * (node.lchild.partial[i*16+l*4 + 3])
pGA2 = P2[[2],[0]] * (node.lchild.rsib.partial[i*16+l*4 + 0])
pGC2 = P2[[2],[1]] * (node.lchild.rsib.partial[i*16+l*4 + 1])
pGG2 = P2[[2],[2]] * (node.lchild.rsib.partial[i*16+l*4 + 2])
pGT2 = P2[[2],[3]] * (node.lchild.rsib.partial[i*16+l*4 + 3])
pfromG_lchild = pGA+pGC+pGG+pGT
pfromG_rchild = pGA2+pGC2+pGG2+pGT2
node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild
#######################################################
pTA = P1[[3],[0]] * (node.lchild.partial[i*16+l*4 + 0])
pTC = P1[[3],[1]] * (node.lchild.partial[i*16+l*4 + 1])
pTG = P1[[3],[2]] * (node.lchild.partial[i*16+l*4 + 2])
pTT = P1[[3],[3]] * (node.lchild.partial[i*16+l*4 + 3])
pTA2 = P2[[3],[0]] * (node.lchild.rsib.partial[i*16+l*4 + 0])
pTC2 = P2[[3],[1]] * (node.lchild.rsib.partial[i*16+l*4 + 1])
pTG2 = P2[[3],[2]] * (node.lchild.rsib.partial[i*16+l*4 + 2])
pTT2 = P2[[3],[3]] * (node.lchild.rsib.partial[i*16+l*4 + 3])
pfromT_lchild = pTA+pTC+pTG+pTT
pfromT_rchild = pTA2+pTC2+pTG2+pTT2
node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild
# site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25
# site_log_like = (log(site_like))*num_pattern
# like_list.append(site_log_like)
# log_like = sum(like_list)
# return log_like
# return node.partial[i*16:i*16+16]
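# A standalone sketch (illustrative names, not part of the original code path)
# of the transition-probability computation used above: build the GTR rate
# matrix Q, scale it by the rate-adjusted branch length alphaT, and compute
# P = U*exp(D)*Uinv by eigendecomposition.
def gtrTransitionMatrix(alphaT, freqs, exchange):
    piA, piC, piG, piT = freqs # stationary base frequencies
    rAC, rAG, rAT, rCG, rCT, rGT = exchange # exchangeabilities _a.._f
    Q = np.array([[0.0, rAC*piC, rAG*piG, rAT*piT],
                  [rAC*piA, 0.0, rCG*piG, rCT*piT],
                  [rAG*piA, rCG*piC, 0.0, rGT*piT],
                  [rAT*piA, rCT*piC, rGT*piG, 0.0]])
    for i in range(4):
        Q[i,i] = -Q[i,:].sum() # each row of a rate matrix sums to zero
    eigenvalues, U = np.linalg.eig(alphaT*Q)
    Uinv = np.linalg.inv(U)
    return np.dot(U, np.dot(np.diag(np.exp(eigenvalues)), Uinv))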
# computing final likelihood at the root of tree
def rootLoglike(postorder, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f):
# computing partial likelihoods for all non-root nodes; the one we need is the node above the root (the root's only child)
prepareTree(postorder, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f)
npatterns = len(patterns)
node = postorder[-1]
if node.partial is None:
node.partial = [0.0]*(4*4*npatterns)
like_list = []
for i,pattern in enumerate(patterns.keys()):
num_pattern = patterns[pattern]
# determining the nucleotide base at the root node; node.lchild is the node above the root, whose partial likelihoods were already computed via prepareTree
base = pattern[node.number-1]
if base == 'A':
final_freq = freqA
elif base == 'C':
final_freq = freqC
elif base == 'G':
final_freq = freqG
elif base == 'T':
final_freq = freqT
for l,r in enumerate(rates):
alphaT3 = r*((node.lchild.edgelen)/(2.*((freqA * freqC * _a) + (freqA * freqG * _b) + (freqA * freqT * _c) + (freqC * freqG * _d) + (freqC * freqT * _e) + (freqG * freqT * _f))))
# GTR rate matrix
A3 = np.mat([[-(alphaT3*freqC*_a + alphaT3*freqG*_b + alphaT3*freqT*_c),alphaT3*freqC*_a, alphaT3*freqG*_b, alphaT3*freqT*_c],
[ alphaT3*freqA*_a,-(alphaT3*freqA*_a + alphaT3*freqG*_d + alphaT3*freqT*_e), alphaT3*freqG*_d, alphaT3*freqT*_e],
[ alphaT3*freqA*_b, alphaT3*freqC*_d,-(alphaT3*freqA*_b + alphaT3*freqC*_d + alphaT3*freqT*_f), alphaT3*freqT*_f],
[ alphaT3*freqA*_c, alphaT3*freqC*_e, alphaT3*freqG*_f,-(alphaT3*freqA*_c + alphaT3*freqC*_e + alphaT3*freqG*_f)]])
# Diagonalizing rate*time (A3) matrix to find eigen values and eigen vector matrices
eigenvalue3, eigenvector3 = np.linalg.eig(A3)
U3 = eigenvector3
Uinv3 = np.linalg.inv(U3)
# exponentiating eigen values
D3 = np.diag(np.exp(eigenvalue3))
# getting probability matrix
P3 = np.dot(U3,np.dot(D3,Uinv3))
if base == 'A':
node.partial[i*16+l*4 + 0] = 1.0
pAA = P3[[0],[0]] * (node.lchild.partial[i*16+l*4 + 0])
pAC = P3[[0],[1]] * (node.lchild.partial[i*16+l*4 + 1])
pAG = P3[[0],[2]] * (node.lchild.partial[i*16+l*4 + 2])
pAT = P3[[0],[3]] * (node.lchild.partial[i*16+l*4 + 3])
pfromA_lchild = pAA+pAC+pAG+pAT
node.partial[i*16+l*4 + 0] = pfromA_lchild
elif base == 'C':
node.partial[i*16+l*4 + 1] = 1.0
pCA = P3[[1],[0]] * (node.lchild.partial[i*16+l*4 + 0])
pCC = P3[[1],[1]] * (node.lchild.partial[i*16+l*4 + 1])
pCG = P3[[1],[2]] * (node.lchild.partial[i*16+l*4 + 2])
pCT = P3[[1],[3]] * (node.lchild.partial[i*16+l*4 + 3])
pfromC_lchild = pCA+pCC+pCG+pCT
node.partial[i*16+l*4 + 1] = pfromC_lchild
elif base == 'G':
node.partial[i*16+l*4 + 2] = 1.0
pGA = P3[[2],[0]] * (node.lchild.partial[i*16+l*4 + 0])
pGC = P3[[2],[1]] * (node.lchild.partial[i*16+l*4 + 1])
pGG = P3[[2],[2]] * (node.lchild.partial[i*16+l*4 + 2])
pGT = P3[[2],[3]] * (node.lchild.partial[i*16+l*4 + 3])
pfromG_lchild = pGA+pGC+pGG+pGT
node.partial[i*16+l*4 + 2] = pfromG_lchild
elif base == 'T':
node.partial[i*16+l*4 + 3] = 1.0
pTA = P3[[3],[0]] * (node.lchild.partial[i*16+l*4 + 0])
pTC = P3[[3],[1]] * (node.lchild.partial[i*16+l*4 + 1])
pTG = P3[[3],[2]] * (node.lchild.partial[i*16+l*4 + 2])
pTT = P3[[3],[3]] * (node.lchild.partial[i*16+l*4 + 3])
pfromT_lchild = pTA+pTC+pTG+pTT
node.partial[i*16+l*4 + 3] = pfromT_lchild
else:
assert(False), 'something went horribly wrong!'
site_like = (sum(node.partial[i*16:i*16+16]))*final_freq*0.25
site_log_like = (log(site_like))*num_pattern
like_list.append(site_log_like)
log_like = sum(like_list)
return log_like
# helper that calls allocatePartial for every node except the root
def prepareTree(postorder, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f):
likelihood_lists = []
for nd in postorder[:-1]:
likelihood_lists.append(allocatePartial(nd, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f))
# print 'likelihood_lists=', likelihood_lists[-1]
return likelihood_lists[-1]
# nearest-neighbor interchange (NNI) node swap for the Larget-Simon move
def nniNodeSwap(a, b):
x = a.par
y = b.par
# Detach a from tree
if a.rsib:
x.lchild = a.rsib
a.par = None
a.rsib = None
else:
x.lchild.rsib = None
a.par = None
# Detach b from tree
if b == x.rsib:
x.rsib = None
b.par = None
else:
y.lchild = x
b.rsib = None
b.par = None
# Reattach a to y
x.rsib = a
a.par = y
#Reattach b to x
x.lchild.rsib = b
b.par = x
# preorder node traversal
def getPreorder(nd, start = True):
global _preorder
if start:
_preorder = []
_preorder.append(nd)
if nd.lchild:
getPreorder(nd.lchild, False)
if nd.rsib:
getPreorder(nd.rsib, False)
return _preorder
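# Usage note (illustrative): throughout this script a postorder sequence is
# obtained by reversing a preorder traversal, e.g.
#   postorder = getPreorder(root)[:]
#   postorder.reverse()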
# re-rooting the starting tree at its first leaf, so the unrooted tree is stored with that tip as the root node
def rootlchild(preorder):
m = preorder[0]
t = preorder[1]
m.edgelen = preorder[1].edgelen
t.edgelen = 0.0
m.lchild = t.rsib
t.par = None
t.lchild = None
t.rsib = None
m.par = t
t.lchild = m
preorder[0] = t
preorder[1] = m
new_pre = getPreorder(preorder[0])
return new_pre
# proposing a new tree state using the LOCAL move of Larget and Simon (1999), with the Hastings ratio correction of Holder, Lewis, Swofford, and Larget (2005)
def proposeNewState(postorder):
global _case
global internal_nodes
global m
global _topology_changed
global _d, _a, _x, _y, _c, _b, _orig_edgelen_top, _orig_edgelen_middle, _orig_edgelen_bottom, _new_edgelen_top, _new_edgelen_middle, _new_edgelen_bottom
# getting the list of all the internal nodes
internal_nodes = []
for nd in postorder:
if (nd.par is not None) and (nd.par.edgelen !=0.0) and (nd.lchild is not None):
internal_nodes.append(nd)
# nodes to be manipulated for the proposed tree state change where _x is the
#random internal middle node of the contiguous 3-edge segment
_x = random.choice(internal_nodes)
_a = _x.lchild
_b = _a.rsib
_y = _x.par
_c = _y.par
if _x == _y.lchild:
_d = _x.rsib
else:
_d = _y.lchild
# randomly deciding which of the two child edges (a or b) lies on the 3-edge path
a_on_path = random.random() < 0.5
if a_on_path:
_orig_edgelen_top = _a.edgelen
else:
_orig_edgelen_top = _b.edgelen
_orig_edgelen_middle = _x.edgelen
c_on_path = random.random() < 0.5
# selecting the other two random edges connected to the middle edge of a 3-edge segment to be changed
if c_on_path:
_orig_edgelen_bottom = _y.edgelen
else:
_orig_edgelen_bottom = _d.edgelen
u2 = random.random()
m = exp(0.2*(u2-0.5)) ## 0.2 is the tuning parameter; larger values make larger changes to the 3-edge segment
# print 'm1====================================', m
# modifying the 3-edge segment
_new_edgelen_top = m*_orig_edgelen_top
_new_edgelen_middle = m*_orig_edgelen_middle
_new_edgelen_bottom = m*_orig_edgelen_bottom
# changed total length of 3-edge segment
new_focal_path_length = _new_edgelen_top + _new_edgelen_middle + _new_edgelen_bottom
# deciding the point along the modified 3-edge path where a random edge attached to
# 3-edge segment is swapped/attached
u3 = random.random()
new_attachment_point = u3*new_focal_path_length
_smallest_edge_length = 1.e-12
if new_attachment_point <= _smallest_edge_length:
new_attachment_point = _smallest_edge_length
elif new_focal_path_length - new_attachment_point <= _smallest_edge_length:
new_attachment_point = new_focal_path_length - _smallest_edge_length
u7 = random.random()
if a_on_path and c_on_path:
if u7 < 0.5:
_case = 1
##################################################################
# CASE = 1
##################################################################
# (a) b*
# \ /
# \ /
# (x) d
# \ /
# \ /
# (y)
# |
# |
# (c)
##################################################################
# CASE = 1
##################################################################
if new_attachment_point > _new_edgelen_top + _new_edgelen_middle:
print 'topology changed True, case 1'
_topology_changed = True
nniNodeSwap(_b, _d)
_a.edgelen = _new_edgelen_top + _new_edgelen_middle
_x.edgelen = new_attachment_point - _a.edgelen
_y.edgelen = new_focal_path_length - new_attachment_point
else:
print 'topology changed False, case 1'
_topology_changed = False
_a.edgelen = new_attachment_point
_x.edgelen = _new_edgelen_top + _new_edgelen_middle - new_attachment_point
_y.edgelen = _new_edgelen_bottom
else:
_case = 2
##################################################################
# CASE = 2
##################################################################
# (a) b
# \ /
# \ /
# (x) d*
# \ /
# \ /
# (y)
# |
# |
# (c)
##################################################################
# CASE = 2
##################################################################
if new_attachment_point < _new_edgelen_top:
print 'topology changed True, case 2'
_topology_changed = True
nniNodeSwap(_b, _d)
_a.edgelen = new_attachment_point
_x.edgelen = _new_edgelen_top - new_attachment_point
_y.edgelen = _new_edgelen_middle + _new_edgelen_bottom
else:
print 'topology changed False, case 2'
_topology_changed = False
_a.edgelen = _new_edgelen_top
_x.edgelen = new_attachment_point - _new_edgelen_top
_y.edgelen = new_focal_path_length - new_attachment_point
elif (a_on_path == False) and (c_on_path):
if u7 < 0.5:
_case = 3
##################################################################
# CASE = 3
##################################################################
# a* (b)
# \ /
# \ /
# (x) d
# \ /
# \ /
# (y)
# |
# |
# (c)
##################################################################
# CASE = 3
##################################################################
if new_attachment_point > _new_edgelen_top + _new_edgelen_middle:
print 'topology changed True, case 3'
_topology_changed = True
nniNodeSwap(_a, _d)
_b.edgelen = _new_edgelen_top + _new_edgelen_middle
_x.edgelen = new_attachment_point - _b.edgelen
_y.edgelen = new_focal_path_length - new_attachment_point
else:
print 'topology changed False, case 3'
_topology_changed = False
_b.edgelen = new_attachment_point
_x.edgelen = _new_edgelen_top + _new_edgelen_middle - new_attachment_point
_y.edgelen = _new_edgelen_bottom
else:
_case = 4
##################################################################
# CASE = 4
##################################################################
# a (b)
# \ /
# \ /
# (x) d*
# \ /
# \ /
# (y)
# |
# |
# (c)
##################################################################
# CASE = 4
##################################################################
if new_attachment_point < _new_edgelen_top:
print 'topology changed True, case 4'
_topology_changed = True
nniNodeSwap(_a, _d)
_b.edgelen = new_attachment_point
_x.edgelen = _new_edgelen_top - new_attachment_point
_y.edgelen = _new_edgelen_middle + _new_edgelen_bottom
else:
print 'topology changed False, case 4'
_topology_changed = False
_b.edgelen = _new_edgelen_top
_x.edgelen = new_attachment_point - _new_edgelen_top
_y.edgelen = new_focal_path_length - new_attachment_point
elif (a_on_path) and (c_on_path==False):
if u7 < 0.5:
_case = 5
##################################################################
# CASE = 5
##################################################################
# (a) b*
# \ /
# \ /
# (x) d
# \ /
# \ /
# (y)
# |
# |
# c
##################################################################
# CASE = 5
##################################################################
if new_attachment_point > _new_edgelen_top + _new_edgelen_middle:
print 'topology changed True, case 5'
_topology_changed = True
nniNodeSwap(_a, _d)
_a.edgelen = _new_edgelen_top + _new_edgelen_middle
_x.edgelen = new_attachment_point - _a.edgelen
_d.edgelen = new_focal_path_length - new_attachment_point
else:
print 'topology changed False, case 5'
_topology_changed = False
_a.edgelen = new_attachment_point
_x.edgelen = _new_edgelen_top + _new_edgelen_middle - new_attachment_point
_d.edgelen = _new_edgelen_bottom
else:
_case = 6
##################################################################
# CASE = 6
##################################################################
# (a) b
# \ /
# \ /
# (x) d
# \ /
# \ /
# (y)
# |
# |
# c*
##################################################################
# CASE = 6
##################################################################
if new_attachment_point < _new_edgelen_top:
print 'topology changed True, case 6'
_topology_changed = True
nniNodeSwap(_a, _d)
_d.edgelen = _new_edgelen_bottom + _new_edgelen_middle
_x.edgelen = _new_edgelen_top - new_attachment_point
_a.edgelen = new_attachment_point
else:
print 'topology changed False, case 6'
_topology_changed = False
_a.edgelen = _new_edgelen_top
_x.edgelen = new_attachment_point - _new_edgelen_top
_d.edgelen = new_focal_path_length - new_attachment_point
else:
if u7 < 0.5:
_case = 7
##################################################################
# CASE = 7
##################################################################
# a* (b)
# \ /
# \ /
# (x) (d)
# \ /
# \ /
# (y)
# |
# |
# c
##################################################################
# CASE = 7
##################################################################
if new_attachment_point > _new_edgelen_top + _new_edgelen_middle:
print 'topology changed True, case 7'
_topology_changed = True
nniNodeSwap(_b, _d)
_d.edgelen = new_focal_path_length - new_attachment_point
_x.edgelen = new_attachment_point - _new_edgelen_top - _new_edgelen_middle
_b.edgelen = _new_edgelen_top + _new_edgelen_middle
else:
print 'topology changed False, case 7'
_topology_changed = False
_b.edgelen = new_attachment_point
_x.edgelen = _new_edgelen_top + _new_edgelen_middle - new_attachment_point
_d.edgelen = _new_edgelen_bottom
else:
_case = 8
##################################################################
# CASE = 8
##################################################################
# a (b)
# \ /
# \ /
# (x) (d)
# \ /
# \ /
# (y)
# |
# |
# c*
##################################################################
# CASE = 8
##################################################################
if new_attachment_point < _new_edgelen_top:
print 'topology changed True, case 8'
_topology_changed = True
nniNodeSwap(_b, _d)
_b.edgelen = new_attachment_point
_x.edgelen = _new_edgelen_top - new_attachment_point
_d.edgelen = _new_edgelen_middle + _new_edgelen_bottom
else:
print 'topology changed False, case 8'
_topology_changed = False
_b.edgelen = _new_edgelen_top
_x.edgelen = new_attachment_point - _new_edgelen_top
_d.edgelen = new_focal_path_length - new_attachment_point
preorder = getPreorder(postorder[-1])
postorder = preorder[:]
postorder.reverse()
return postorder
# function to revert to original state if the proposed move is rejected
def revert_function(postorder):
if (_case == 1 or _case == 2):
if (_topology_changed):
nniNodeSwap(_d, _b);
_a.edgelen= _orig_edgelen_top
_x.edgelen = _orig_edgelen_middle
_y.edgelen = _orig_edgelen_bottom
else:
_a.edgelen = _orig_edgelen_top
_x.edgelen = _orig_edgelen_middle
_y.edgelen = _orig_edgelen_bottom
elif (_case == 3 or _case == 4):
if (_topology_changed):
nniNodeSwap(_d, _a)
_b.edgelen = _orig_edgelen_top
_x.edgelen = _orig_edgelen_middle
_y.edgelen = _orig_edgelen_bottom
else:
_b.edgelen = _orig_edgelen_top
_x.edgelen = _orig_edgelen_middle
_y.edgelen = _orig_edgelen_bottom
elif (_case == 5 or _case == 6):
if (_topology_changed):
nniNodeSwap(_d, _a)
_a.edgelen = _orig_edgelen_top
_x.edgelen = _orig_edgelen_middle
_d.edgelen = _orig_edgelen_bottom
else:
_a.edgelen = _orig_edgelen_top
_x.edgelen = _orig_edgelen_middle
_d.edgelen = _orig_edgelen_bottom
elif (_case == 7 or _case == 8):
if (_topology_changed):
nniNodeSwap(_d, _b)
_b.edgelen = _orig_edgelen_top
_x.edgelen = _orig_edgelen_middle
_d.edgelen = _orig_edgelen_bottom
else:
_b.edgelen = _orig_edgelen_top
_x.edgelen = _orig_edgelen_middle
_d.edgelen = _orig_edgelen_bottom
preorder = getPreorder(postorder[-1])
postorder = preorder[:]
postorder.reverse()
return postorder
# total number of possible unrooted binary topologies, for the topology prior
def binaryTrees():
def factorial(n):
num = 1
while n >= 1:
num = num * n
n = n - 1
return num
total_binary_trees = factorial(2*number_of_species-5)/(factorial(number_of_species-3) * (2**(number_of_species-3)))
return total_binary_trees
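# Worked example: binaryTrees() reads the global number_of_species; for 6 taxa
# it returns (2*6-5)!/((6-3)!*2**(6-3)) = 5040/(6*8) = 105, the usual count of
# unrooted binary topologies (2n-5)!! = 3*5*7.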
# Hastings Ratio for base frequencies
def freqHastings(x1, x2, x3, x4):
tuning_lambda = 100.
theta1 = x1*tuning_lambda
theta2 = x2*tuning_lambda
theta3 = x3*tuning_lambda
theta4 = x4*tuning_lambda
g1 = random.gammavariate(theta1, 1)
g2 = random.gammavariate(theta2, 1)
g3 = random.gammavariate(theta3, 1)
g4 = random.gammavariate(theta4, 1)
gsum = g1 + g2 + g3 + g4
x1star = g1/gsum
x2star = g2/gsum
x3star = g3/gsum
x4star = g4/gsum
# proposed frequencies from Dirichlet distribution
proposed_freq = [x1star, x2star, x3star, x4star]
#forward
log_forward_density = (theta1-1)*log(x1)
log_forward_density += (theta2-1)*log(x2)
log_forward_density += (theta3-1)*log(x3)
log_forward_density += (theta4-1)*log(x4)
log_forward_density += lgamma(theta1 + theta2 + theta3 + theta4)
log_forward_density -= lgamma(theta1)
log_forward_density -= lgamma(theta2)
log_forward_density -= lgamma(theta3)
log_forward_density -= lgamma(theta4)
#reverse
theta1star = x1star*tuning_lambda
theta2star = x2star*tuning_lambda
theta3star = x3star*tuning_lambda
theta4star = x4star*tuning_lambda
log_reverse_density = (theta1star-1)*log(x1star)
log_reverse_density += (theta2star-1)*log(x2star)
log_reverse_density += (theta3star-1)*log(x3star)
log_reverse_density += (theta4star-1)*log(x4star)
log_reverse_density += lgamma(theta1star + theta2star + theta3star + theta4star)
log_reverse_density -= lgamma(theta1star)
log_reverse_density -= lgamma(theta2star)
log_reverse_density -= lgamma(theta3star)
log_reverse_density -= lgamma(theta4star)
log_hastings_ratio = log_reverse_density - log_forward_density
return proposed_freq, log_hastings_ratio
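# Minimal usage sketch (illustrative, not called by the analysis): one
# Dirichlet proposal centered on the current frequencies. Larger values of
# tuning_lambda concentrate the proposal more tightly, giving smaller steps.
def demoFreqProposal():
    new_freqs, log_hr = freqHastings(0.4, 0.3, 0.2, 0.1)
    return new_freqs, log_hr # a point on the 4-simplex plus its log Hastings ratio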
# Hastings ratio for rate exchangeabilities
def exrateHastings2(x1, x2, x3, x4, x5, x6):
tuning_lambda = 100.
theta1 = x1*tuning_lambda
theta2 = x2*tuning_lambda
theta3 = x3*tuning_lambda
theta4 = x4*tuning_lambda
theta5 = x5*tuning_lambda
theta6 = x6*tuning_lambda
g1 = random.gammavariate(theta1, 1)
g2 = random.gammavariate(theta2, 1)
g3 = random.gammavariate(theta3, 1)
g4 = random.gammavariate(theta4, 1)
g5 = random.gammavariate(theta5, 1)
g6 = random.gammavariate(theta6, 1)
gsum = g1 + g2 + g3 + g4 + g5 + g6
x1star = g1/gsum
x2star = g2/gsum
x3star = g3/gsum
x4star = g4/gsum
x5star = g5/gsum
x6star = g6/gsum
# proposed rates from Dirichlet distribution
proposed_rates = [x1star, x2star, x3star, x4star, x5star, x6star]
#forward
log_forward_density = (theta1-1)*log(x1)
log_forward_density += (theta2-1)*log(x2)
log_forward_density += (theta3-1)*log(x3)
log_forward_density += (theta4-1)*log(x4)
log_forward_density += (theta5-1)*log(x5)
log_forward_density += (theta6-1)*log(x6)
log_forward_density += lgamma(theta1 + theta2 + theta3 + theta4 + theta5 + theta6)
log_forward_density -= lgamma(theta1)
log_forward_density -= lgamma(theta2)
log_forward_density -= lgamma(theta3)
log_forward_density -= lgamma(theta4)
log_forward_density -= lgamma(theta5)
log_forward_density -= lgamma(theta6)
#reverse
theta1star = x1star*tuning_lambda
theta2star = x2star*tuning_lambda
theta3star = x3star*tuning_lambda
theta4star = x4star*tuning_lambda
theta5star = x5star*tuning_lambda
theta6star = x6star*tuning_lambda
log_reverse_density = (theta1star-1)*log(x1star)
log_reverse_density += (theta2star-1)*log(x2star)
log_reverse_density += (theta3star-1)*log(x3star)
log_reverse_density += (theta4star-1)*log(x4star)
log_reverse_density += (theta5star-1)*log(x5star)
log_reverse_density += (theta6star-1)*log(x6star)
log_reverse_density += lgamma(theta1star + theta2star + theta3star + theta4star + theta5star + theta6star)
log_reverse_density -= lgamma(theta1star)
log_reverse_density -= lgamma(theta2star)
log_reverse_density -= lgamma(theta3star)
log_reverse_density -= lgamma(theta4star)
log_reverse_density -= lgamma(theta5star)
log_reverse_density -= lgamma(theta6star)
log_hastings_ratio = log_reverse_density - log_forward_density
return proposed_rates, log_hastings_ratio
# computing log prior density of proposed frequencies from Dirichlet prior distribution
def freqprior(x1, x2, x3, x4):
theta1 = base_prior[0]
theta2 = base_prior[1]
theta3 = base_prior[2]
theta4 = base_prior[3]
log_freq_prior = (theta1-1)*log(x1)
log_freq_prior += (theta2-1)*log(x2)
log_freq_prior += (theta3-1)*log(x3)
log_freq_prior += (theta4-1)*log(x4)
log_freq_prior += lgamma(theta1 + theta2 + theta3 + theta4)
log_freq_prior -= lgamma(theta1)
log_freq_prior -= lgamma(theta2)
log_freq_prior -= lgamma(theta3)
log_freq_prior -= lgamma(theta4)
return log_freq_prior
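# Note: with base_prior = [10., 10., 10., 10.] this is a symmetric
# Dirichlet(10) density, which pulls the frequencies toward 0.25 each.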
def exrateprior(x1, x2, x3, x4, x5, x6):
theta1 = exrate_prior[0]
theta2 = exrate_prior[1]
theta3 = exrate_prior[2]
theta4 = exrate_prior[3]
theta5 = exrate_prior[4]
theta6 = exrate_prior[5]
log_exrate_prior = (theta1-1)*log(x1)
log_exrate_prior += (theta2-1)*log(x2)
log_exrate_prior += (theta3-1)*log(x3)
log_exrate_prior += (theta4-1)*log(x4)
log_exrate_prior += (theta5-1)*log(x5)
log_exrate_prior += (theta6-1)*log(x6)
log_exrate_prior += lgamma(theta1 + theta2 + theta3 + theta4 + theta5 + theta6)
log_exrate_prior -= lgamma(theta1)
log_exrate_prior -= lgamma(theta2)
log_exrate_prior -= lgamma(theta3)
log_exrate_prior -= lgamma(theta4)
log_exrate_prior -= lgamma(theta5)
log_exrate_prior -= lgamma(theta6)
return log_exrate_prior
# the main MCMC loop: proposing, then accepting or rejecting, the Larget-Simon LOCAL move plus the base-frequency and exchangeability updates
def mcmc(postorder, patterns, rates):
mcmc = 0
output = os.path.join('mcmc.txt')
output2 = os.path.join('trees.txt')
newf = open(output, 'w')
newf2 = open(output2, 'w')
newf.write('%s\t'%('n_gen'))
newf.write( '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t'%('LnL', 'freq_prior', 'TL', 'freqA','freqC','freqG','freqT','AC','AG','AT','CG','CT','GT'))
a = 0
b = 0
# for n,nl in enumerate(postorder):
# if nl.number > 0.0:
# newf.write( 'node%s\t'%(a+1))
# a+=1
newf.write('\n')
# starting base frequencies (stored on the node class, shared across all nodes)
node.freq = [0.4, 0.3, 0.2, 0.1]
# ex_rate = [0.166666666667, 0.166666666667, 0.166666666667, 0.166666666667, 0.166666666667, 0.166666666667]
node.ex_rate = [0.16, 0.16, 0.17, 0.17, 0.17, 0.17]
for r in range(n_gen):
# print
print
print 'START#####################################################################################>'
print '********* starting postorder *********'
pre = postorder[:]
pre.reverse()
newick = makeNewick(pre[0])
# print 'newick ==', newick
# for i in postorder:
# print i.number,
# print
# for i in postorder:
# print i.edgelen,
#
# print
# print 'start freq1=======================================>', node.freq[0], node.freq[1], node.freq[2], node.freq[3]
#
# print
'''estimating first likelihood'''
##################################################################################
currentlike = rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])
print 'currentlike 1===%.5f'%currentlike
'''computing prior densities for base frequencies, branch lengths, and topologies'''
##################################################################################
#log prior density of starting frequencies
current_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3])
# joint log prior density of branch lengths
current_brln_prior = 0.0
for nd in postorder:
current_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo))
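# (each term is the log density of an Exponential with mean mean_expo:
# log f(t) = -t/mean_expo - log(mean_expo))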
# log prior probability of topologies
total_possible_binary_trees = float(binaryTrees())
topology_prior = -log(total_possible_binary_trees)
# log prior probability of rate exchangeabilities
exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])
# computing joint log-prior of base frequencies, branch lengths, topologies and rate exchangeabilities
print 'current_freq_prior 1 ==%.5f'%current_freq_prior
joint_log_prior = current_brln_prior + topology_prior + current_freq_prior + exchangeabilities_prior
'''computing first log-posterior before topology change'''
##################################################################################
current_ln_posterior = currentlike + joint_log_prior
##############################
'''proposing new topology'''
##############################
postorder = proposeNewState(postorder)
print
# print
print '********* new postorder *********'
# for i in postorder:
# print i.number,
# print
# for i in postorder:
# print i.edgelen,
# print
'''estimating second likelihood after topology change'''
##################################################################################
proposedlike = float(rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]))
print 'proposedlike 1===%.5f' %proposedlike
pre2 = postorder[:]
pre2.reverse()
newick = makeNewick(pre2[0])
# print 'newick2 ==', newick
'''computing prior densities for base frequencies, branch lengths, and topologies'''
##################################################################################
# log prior density of the (unchanged) starting frequencies
current_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3])
# joint log prior density of new branch lengths from the changed topology
proposed_brln_prior = 0.0
for nd in postorder:
proposed_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo))
# log prior probability of topologies
total_possible_binary_trees = float(binaryTrees())
topology_prior = -log(total_possible_binary_trees)
# print 'current_freq_prior 1 ==%.5f'%current_freq_prior
# log prior probability of rate exchangeabilities
exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])
# computing joint log-prior of base frequencies, branch lengths, topologies, and rate exchangeabilities
joint_log_prior = proposed_brln_prior + topology_prior + current_freq_prior + exchangeabilities_prior
'''computing second log-posterior after topology change'''
##################################################################################
proposed_ln_posterior = proposedlike + joint_log_prior
'''calculating acceptance ratio'''
##################################################################################
hastings_ratio = 3.*log(m) # log Hastings ratio: each of the three path edges is scaled by m, so the proposal Jacobian is m**3 (Holder et al. 2005)
logR = proposed_ln_posterior - current_ln_posterior + hastings_ratio
'''decide whether to accept or reject the proposal move'''
##################################################################################
u8 = random.random()
print 'log(u8), logR==', log(u8), logR
# print
if log(u8) < logR:
# keep the proposed tree
log_likelihood = proposedlike
print 'log(u8) < logR, so new proposal accepted..'
else:
postorder = revert_function(postorder)
log_likelihood = currentlike
print 'log(u8) > logR, so failed to accept new proposal..'
####################################################
#####################################################
''' computations after proposing new frequencies'''
####################################################
####################################################
print
print '********* start new frequency proposal *********'
# print
# for i in postorder:
# print i.number,
# print
pre3 = postorder[:]
pre3.reverse()
newick = makeNewick(pre3[0])
# print 'newick3 ==', newick
'''estimating third likelihood before the new frequency proposal'''
##################################################################################
currentlike = float(rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]))
print 'currentlike 2===%.5f'%currentlike
'''computing prior densities for base frequencies, branch lengths, and topologies'''
##################################################################################
#log prior density of current frequencies
current_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3])
print 'pre_node.freq=', node.freq[0], node.freq[1], node.freq[2], node.freq[3]
# joint log prior density of branch lengths
current_brln_prior = 0.0
for nd in postorder:
current_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo))
# log prior probability of topologies
total_possible_binary_trees = float(binaryTrees())
topology_prior = -log(total_possible_binary_trees)
# log prior probability of rate exchangeabilities
exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])
# computing joint log-prior of base frequencies, branch lengths, topologies, and rate exchangeabilities
print 'current_freq_prior 2===%.5f'%current_freq_prior
joint_log_prior = current_brln_prior + topology_prior + current_freq_prior + exchangeabilities_prior
'''computing third log-posterior before the new frequency proposal'''
##################################################################################
current_ln_posterior = currentlike + joint_log_prior
###################################
'''proposing new frequencies'''
###################################
proposed_freq, hastings_ratio1 = freqHastings(node.freq[0], node.freq[1], node.freq[2], node.freq[3]) # a single draw supplies both the proposal and its matching Hastings ratio
# print 'proposed new freq===', proposed_freq
'''estimating fourth likelihood after frequency change'''
##################################################################################
proposedlike = float(rootLoglike(postorder, patterns, rates, proposed_freq[0], proposed_freq[1], proposed_freq[2], proposed_freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]))
print 'proposedlike 2===%.5f'%proposedlike
print 'proposed pre_node.freq=', proposed_freq[0], proposed_freq[1], proposed_freq[2], proposed_freq[3]
'''computing prior densities for base frequencies, branch lengths, and topologies'''
##################################################################################
#log prior density of new proposed frequencies
proposed_freq_prior = freqprior(proposed_freq[0], proposed_freq[1], proposed_freq[2], proposed_freq[3])
print 'proposed_freq_prior 2===%.5f'%proposed_freq_prior
# joint log prior density of branch lengths
proposed_brln_prior = 0.0
for nd in postorder:
proposed_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo))
# log prior probability of topologies
total_possible_binary_trees = float(binaryTrees())
topology_prior = -log(total_possible_binary_trees)
# log prior probability of rate exchangeabilities
exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])
# computing joint log prior density of new proposed frequencies, branch lengths, topologies, and rate exchangeabilities
joint_log_prior = proposed_brln_prior + topology_prior + proposed_freq_prior + exchangeabilities_prior
'''computing fourth log-posterior after the frequency change'''
##################################################################################
proposed_ln_posterior = proposedlike + joint_log_prior
'''calculating acceptance ratio'''
##################################################################################
# hastings_ratio1 was computed above, from the same freqHastings draw that produced proposed_freq
logR1 = proposed_ln_posterior - current_ln_posterior + hastings_ratio1
'''decide whether to accept or reject the proposal move'''
##################################################################################
u9 = random.random()
print 'log(u9), logR1===', log(u9), logR1
if log(u9) < logR1:
node.freq[0] = proposed_freq[0]
node.freq[1] = proposed_freq[1]
node.freq[2] = proposed_freq[2]
node.freq[3] = proposed_freq[3]
log_likelihood = proposedlike
freq_prior = proposed_freq_prior
print 'log(u9) < logR, so new proposal accepted..'
else:
# keep the current frequencies unchanged
log_likelihood = currentlike
freq_prior = current_freq_prior
print 'log(u9) > logR, so failed to accept new proposal..'
##########################################################
##########################################################
''' computations after proposing new exchangeabilities'''
##########################################################
##########################################################
print
print '********* start new exchangeabilities proposal *********'
# print
# for i in postorder:
# print i.number,
# print
pre4 = postorder[:]
pre4.reverse()
newick = makeNewick(pre4[0])
# print 'newick4 ==', newick
'''estimating fifth likelihood before the new ex rate proposal'''
##################################################################################
currentlike = float(rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]))
print 'currentlike 3===%.3f'%currentlike
'''computing prior densities for base frequencies, branch lengths, and topologies'''
##################################################################################
#log prior density of current frequencies
current_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3])
print 'starting frequencies=', node.freq[0], node.freq[1], node.freq[2], node.freq[3]
# joint log prior density of branch lengths
current_brln_prior = 0.0
for nd in postorder:
current_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo))
# log prior probability of topologies
total_possible_binary_trees = float(binaryTrees())
topology_prior = -log(total_possible_binary_trees)
# log prior probability of rate exchangeabilities
current_exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])
print 'current_exchangeabilities=', node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]
print 'current_exchangeabilities_PAUPversion=', node.ex_rate[0]/node.ex_rate[5], node.ex_rate[1]/node.ex_rate[5], node.ex_rate[2]/node.ex_rate[5], node.ex_rate[3]/node.ex_rate[5], node.ex_rate[4]/node.ex_rate[5], node.ex_rate[5]/node.ex_rate[5]
# computing joint log-prior of base frequencies, branch lengths, topologies, and rate exchangeabilities
print 'current_freq_prior 3===%.5f'%current_freq_prior
joint_log_prior = current_brln_prior + topology_prior + current_freq_prior + current_exchangeabilities_prior
'''computing fifth log-posterior before the new exchangeabilities proposal'''
##################################################################################
current_ln_posterior = currentlike + joint_log_prior
#############################################
'''proposing new rate exchangeabilities'''
#############################################
proposed_exchangeability_rates, hastings_ratio2 = exrateHastings2(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]) # a single draw supplies both the proposal and its matching Hastings ratio
# print 'proposed new freq===', proposed_freq
print 'proposed_exchangeability =', proposed_exchangeability_rates[0], proposed_exchangeability_rates[1], proposed_exchangeability_rates[2], proposed_exchangeability_rates[3], proposed_exchangeability_rates[4], proposed_exchangeability_rates[5]
print 'proposed_exchangeability_PAUPform =', proposed_exchangeability_rates[0]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[1]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[2]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[3]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[4]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[5]/proposed_exchangeability_rates[5]
'''estimating sixth likelihood after rate exchangeabilities change'''
##################################################################################
proposedlike = float(rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], proposed_exchangeability_rates[0], proposed_exchangeability_rates[1], proposed_exchangeability_rates[2], proposed_exchangeability_rates[3], proposed_exchangeability_rates[4], proposed_exchangeability_rates[5]))
print 'proposedlike 3===%.3f'%proposedlike
'''computing prior densities for base frequencies, branch lengths, and topologies'''
##################################################################################
# log prior density of the current frequencies (unchanged by this move)
proposed_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3])
print 'freq_prior 3===%.3f'%proposed_freq_prior
print 'current frequencies=', node.freq[0], node.freq[1], node.freq[2], node.freq[3]
# joint log prior density of branch lengths
proposed_brln_prior = 0.0
for nd in postorder:
proposed_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo))
# log prior probability of topologies
total_possible_binary_trees = float(binaryTrees())
topology_prior = -log(total_possible_binary_trees)
# log prior probability of rate exchangeabilities
proposed_exchangeabilities_prior = exrateprior(proposed_exchangeability_rates[0], proposed_exchangeability_rates[1], proposed_exchangeability_rates[2], proposed_exchangeability_rates[3], proposed_exchangeability_rates[4], proposed_exchangeability_rates[5])
# computing joint log prior density of new proposed frequencies, branch lengths, topologies, and rate exchangeabilities
joint_log_prior = proposed_brln_prior + topology_prior + proposed_freq_prior + proposed_exchangeabilities_prior
'''computing sixth log-posterior after rate exchangeabilities change'''
##################################################################################
proposed_ln_posterior = proposedlike + joint_log_prior
'''calculating acceptance ratio'''
##################################################################################
# hastings_ratio2 was computed above, from the same exrateHastings2 draw that produced proposed_exchangeability_rates
logR2 = proposed_ln_posterior - current_ln_posterior + hastings_ratio2
'''decide whether to accept or reject the proposal move'''
##################################################################################
u10 = random.random()
print 'log(u10), logR2===', log(u10), logR2
if log(u10) < logR2:
node.ex_rate[0] = proposed_exchangeability_rates[0]
node.ex_rate[1] = proposed_exchangeability_rates[1]
node.ex_rate[2] = proposed_exchangeability_rates[2]
node.ex_rate[3] = proposed_exchangeability_rates[3]
node.ex_rate[4] = proposed_exchangeability_rates[4]
node.ex_rate[5] = proposed_exchangeability_rates[5]
log_likelihood = proposedlike
exchangeability_rates_prior = proposed_exchangeabilities_prior
print 'log(u10) < logR2, so new proposal accepted..'
else:
# keep the current exchangeabilities unchanged
log_likelihood = currentlike
exchangeability_rates_prior = current_exchangeabilities_prior
print 'log(u10) > logR2, so failed to accept new proposal..'
################################################################
################################################################
''' final accepted topologies and frequencies for the output files'''
###############################################################
###############################################################
if (r+1) % save_every == 0:
newf.write('%s\t'%(mcmc+1))
print 'final log_likelihood==', log_likelihood
newf.write('%.6f\t'%(log_likelihood))
newf.write( '%.6f\t'%(freq_prior))
TL = 0.0
for j,k in enumerate(postorder):
TL += k.edgelen
newf.write('%.6f\t'%(TL))
sorted_postorder = sorted(postorder, key=operator.attrgetter('number'))
for i in node.freq:
newf.write('%.6f\t'%i)
for i in node.ex_rate:
newf.write('%.6f\t'%i)
# i = 0
# while i < len(sorted_postorder):
# if sorted_postorder[i].number > 0:
# newf.write('%.6f\t'%(sorted_postorder[i].edgelen))
# i += 1
newf.write('\n')
print
if (r+1) % save_every == 0:
pre = postorder[:]
pre.reverse()
newick = makeNewick(pre[0])
newf2.write('%s;\n'%(newick))
print 'end of mcmc gen=', mcmc+1
print '**************************'
mcmc+=1
newf.flush()
def joinRandomPair(node_list, next_node_number, is_deep_coalescence):
# pick first of two lineages to join and delete from node_list
i = random.randint(1, len(node_list))
ndi = node_list[i-1]
del node_list[i-1]
# pick second of two lineages to join and delete from node_list
j = random.randint(1, len(node_list))
ndj = node_list[j-1]
del node_list[j-1]
# join selected nodes and add ancestor to node_list
ancnd = node(next_node_number)
ancnd.deep = is_deep_coalescence
ancnd.lchild = ndi
ancnd.descendants = set()
ancnd.descendants |= ndi.descendants
ancnd.descendants |= ndj.descendants
ndi.rsib = ndj
ndi.par = ancnd
ndj.par = ancnd
node_list.append(ancnd)
return node_list
# function to make newick string through preorder node traversal
def makeNewick(nd, brlen_scaler = 1.0, start = True): #
global _newick
global _TL
if start:
assert nd.par is None, 'makeNewick function assumes that nd provided is the root node (which has no parent), but that was not true in this case'
assert nd.lchild is not None, 'makeNewick function assumes that nd provided is the root node (which has at least one child), but that was not true in this case'
assert nd.lchild.rsib is None, 'makeNewick function assumes that nd provided is the root node (which has only one child), but that was not true in this case'
_TL = 0.0
_newick = '(%d:%.5f' % (nd.number, nd.lchild.edgelen)
if not start:
if nd.lchild:
_newick += '('
makeNewick(nd.lchild, brlen_scaler, False)
else:
blen = nd.edgelen*brlen_scaler
_TL += blen
_newick += '%d:%.5f' % (nd.number, blen)
if start:
_newick += ','
makeNewick(nd.lchild.lchild, brlen_scaler, False)
elif nd.rsib:
_newick += ','
makeNewick(nd.rsib, brlen_scaler, False)
elif nd.par is not None and nd.par.par is not None and nd.par.par.par is not None:
blen = nd.par.edgelen*brlen_scaler
_TL += blen
_newick += '):%.5f' % blen
else:
_newick += ')'
return _newick
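# Usage note (illustrative): makeNewick expects the re-rooted tree produced by
# readnewick/rootlchild, i.e. a root with exactly one child; the mcmc loop
# calls it as makeNewick(pre[0]) after reversing the current postorder list.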
def calcActualHeight(root):
h = 0.0
nd = root
while nd.lchild:
nd = nd.lchild
h += nd.edgelen
return h
def readnewick(tree):
total_length = len(tree)
internal_node_number = -1
root = node(internal_node_number)
nd = root
i = 0
pre = [root]
while i < total_length:
m = tree[i]
if m =='(':
internal_node_number -= 1
child = node(internal_node_number)
pre.append(child)
nd.lchild=child
child.par=nd
nd=child
elif m == ',':
internal_node_number -= 1
rsib = node(internal_node_number)
pre.append(rsib)
nd.rsib = rsib
rsib.par=nd.par
nd = rsib
elif m == ')':
nd = nd.par
elif m == ':':
edge_len_str = ''
i+=1
m = tree[i]
assert m in ['0','1','2','3','4','5','6','7','8', '9','.']
while m in ['0','1','2','3','4','5','6','7','8', '9','.']:
edge_len_str += m
i+=1
m = tree[i]
i -=1
nd.edgelen = float(edge_len_str)
else:
internal_node_number += 1
if True:
assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m
mm = ''
while m in ['0','1','2','3','4','5','6','7','8', '9' ]:
mm += m
i += 1
m = tree[i]
nd.number = int(mm)
i -= 1
i += 1
#post = pre[:]
#post.reverse()
# return post
new_pre = rootlchild(pre)
return new_pre
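# Example usage (illustrative; leaf labels must be integers, as this script
# generates them): parsing a small newick string returns the preorder node
# list of the tree re-rooted at its first leaf, e.g.
#   pre = readnewick('(1:0.1,2:0.2,(3:0.3,4:0.4):0.2)')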
def Makenewick(pre):
newickstring = ''
for i,nd in enumerate(pre):
if nd.lchild:
newickstring += '('
elif nd.rsib:
newickstring += '%d' %(nd.number)
newickstring += ':%.4f' % nd.edgelen
newickstring += ','
else:
newickstring += '%d' %(nd.number)
newickstring += ':%.4f' % nd.edgelen
tmpnd = nd
while (tmpnd.par is not None) and (tmpnd.rsib is None):
newickstring += ')'
newickstring += ':%.4f' % tmpnd.par.edgelen
tmpnd = tmpnd.par
if tmpnd.par is not None:
newickstring += ','
return newickstring
# function to read a newick tree
def treenewick():
script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
path = os.path.join(script_dir, tree_file_name)
with open(path, 'r') as content:
newick = content.read()
return newick
################### yule tree simulator ###################################################
# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree
# - num_species is the number of leaves (tips) in the tree
def calcPhi(num_species):
phi = sum([1.0/(K+2.0) for K in range(num_species-1)])
return phi
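# Worked example: for num_species = 3, calcPhi returns 1/2 + 1/3 = 0.8333...,
# i.e. the sum of 1/K for K = 2..3.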
# yuleTree creates a species tree in which edge lengths are measured in
# expected number of substitutions.
# - num_species is the number of leaves
# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
def yuleTree(num_species, mu_over_s):
# create num_species nodes numbered 1, 2, ..., num_species
nodes = [node(i+1) for i in range(num_species)]
next_node_number = num_species + 1
while len(nodes) > 1:
# choose a speciation time in generations
K = float(len(nodes))
mean_epoch_length = mu_over_s/K
t = random.gammavariate(1.0, mean_epoch_length)
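# (a gamma draw with shape 1.0 is an Exponential draw with the given mean,
# matching the exponential waiting times between speciation events)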
# update each node's edgelen
for n in nodes:
n.edgelen += t # same as: n.edgelen = n.edgelen + t
nodes = joinRandomPair(nodes, next_node_number, False)
next_node_number += 1
return nodes[0]
# calcExpectedHeight returns the expected height of the species tree in terms of
# expected number of substitutions from the root to one tip.
# - num_species is the number of leaves
# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
def calcExpectedHeight(num_species, mu_over_s):
return mu_over_s*calcPhi(num_species)
def makeStartNewick(nd, brlen_scaler = 1.0, start = True): #
global _newick
global _TL
if start:
_newick = ''
_TL = 0.0
if nd.lchild:
_newick += '('
makeNewick(nd.lchild, brlen_scaler, False)
else:
blen = nd.edgelen*brlen_scaler
_TL += blen
_newick += '%d:%.5f' % (nd.number, blen)
if nd.rsib:
_newick += ','
makeNewick(nd.rsib, brlen_scaler, False)
elif nd.par is not None:
blen = nd.par.edgelen*brlen_scaler
_TL += blen
_newick += '):%.5f' % blen
# return _newick, _TL
return _newick
def createRandomTree(number_of_species):
root_node_length = random.lognormvariate(1, 0.05)
species_tree_root = yuleTree(int(number_of_species)-1, 0.689655172)
newick = makeStartNewick(species_tree_root)
_newick = '(%s:%.5f,' %(number_of_species,root_node_length)
_newick += newick[1:-1]
_newick += ')'
print 'random starting tree =', _newick
return _newick
if __name__ == '__main__':
random_seed = 2551189 # 7632557, 12345
# number_of_species = 5
# mutation_speciation_rate_ratio = 0.689655172 # 0.689655172 # yields tree height 1 for 6 species
random.seed(random_seed)
# species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio)
# newick = makeStartNewick(species_tree_root)
# print 'Random number seed: %d' % random_seed
# print 'Simulating one tree:'
# print ' number of species = %d' % number_of_species
# print ' mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio
# print ' actual tree length =',newick[1]
# expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio)
# print ' expected height =',expected_height
# actual_height = calcActualHeight(species_tree_root)
# print ' actual height =',actual_height
# print 'true tree: ',newick[0]
# print '**************************'
number_of_species = readSeq.patterns(sequence_file)[0]
random_start_tree = createRandomTree(number_of_species)
preorder = readnewick(random_start_tree)
postorder = preorder[:]
postorder.reverse()
rates_list = gammaRates(alpha)
result = mcmc(postorder, readSeq.patterns(sequence_file)[1], rates_list)