Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
BayesPhyloInPython2/GTR_MCMC.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1696 lines (1388 sloc)
69.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################################################################## | |
''' This script reads a DNA matrix in nexus format (through the readSeq.py module) and performs
an MCMC tree search under the GTR+GAMMA model. It is extremely slow!! - a scripting language such as
Python is not well suited for this purpose. I wrote this script to run on a small data set
(6 taxa) to understand how the full GTR model works, so please don't use it as a full-fledged
phylogeny program.
contact: suman.neupane@uconn.edu
'''
########################################################################################## | |
import readSeq | |
import random | |
import re, os, itertools, sys, glob | |
from itertools import chain | |
from scipy.stats import gamma | |
from math import exp, log, lgamma | |
import operator | |
import numpy as np | |
########################################################################################## | |
# tree_file_name = 'real_treeForSeqSimulation.tre' # if supplied a starting tree | |
sequence_file = 'simulated_sequences.nex' | |
# sequence_file = 'fake.nex' | |
# base_frequences = [0.25, 0.25, 0.25, 0.25] # stationary frequencies of A, C, G, T | |
kappa = 3.0 # set kappa to 1.0 for F81+GAMMA | |
# base_frequences = [0.37272151039415824, 0.2951859176504787, 0.19949665725757845, 0.13259591469778456] # stationary frequencies of A, C, G, T, set all base_frequences to 0.25 for K80+GAMMA | |
alpha = 0.5 # gamma shape parameter for rate categories | |
n_gen = 100 | |
save_every = 5 | |
mean_expo = 10. # mean_expo = mean of exponential distribution for branch length prior | |
base_prior = [10., 10., 10., 10.] # Dirichlet prior for base frequencies | |
exrate_prior = [10., 10., 10., 10., 10., 10] # Dirichlet prior for rate rate exchangeabilites | |
########################################################################################## | |
class node(object): | |
def __init__(self, ndnum): # initialization function | |
self.rsib = None # right sibling | |
self.lchild = None # left child | |
self.par = None # parent node | |
self.number = ndnum # node number (internals negative, tips 0 or positive) | |
self.edgelen = 0.0 # branch length | |
self.descendants = set([ndnum]) # set containing descendant leaf set | |
self.partial = None # will have length 4*npatterns | |
self.freq = [0.0, 0.0, 0.0, 0.0] | |
self.ex_rate = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | |
def __str__(self): | |
# __str__ is a built-in function that is used by print to show an object | |
descendants_as_string = ','.join(['%d' % d for d in self.descendants]) | |
lchildstr = 'None' | |
if self.lchild is not None: | |
lchildstr = '%d' % self.lchild.number | |
rsibstr = 'None' | |
if self.rsib is not None: | |
rsibstr = '%d' % self.rsib.number | |
parstr = 'None' | |
if self.par is not None: | |
parstr = '%d' % self.par.number | |
return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string) | |
# getting 4 gamma rates categories for among site rate heterogeneity | |
def gammaRates(alpha): | |
bounds = [0.0, 0.25, 0.50, 0.75, 1.] | |
rates = [] | |
for i in range(4): | |
# print i | |
lower = gamma.ppf(bounds[i], alpha, 0, 1./alpha) | |
upper = gamma.ppf(bounds[i+1], alpha, 0, 1./alpha) | |
mean_rate = ((gamma.cdf(upper, alpha+1., 0, 1./alpha) - gamma.cdf(lower, alpha+1., 0, 1./alpha)))*4. | |
rates.append(mean_rate) | |
return rates | |
# computing partial likelihoods at the node above the root node | |
def allocatePartial(node, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f): | |
if node.number > 0: | |
npatterns = len(patterns) | |
if node.partial is None: | |
node.partial = [0.0]*(4*4*npatterns) | |
# print len(node.partial) | |
for i,pattern in enumerate(patterns.keys()): | |
base2 = pattern[node.number-1] | |
for l in range(4): | |
if base2 == 'A': | |
node.partial[i*16+l*4 + 0] = 1.0 | |
elif base2 == 'C': | |
node.partial[i*16+l*4 + 1] = 1.0 | |
elif base2 == 'G': | |
node.partial[i*16+l*4 + 2] = 1.0 | |
elif base2 == 'T': | |
node.partial[i*16+l*4 + 3] = 1.0 | |
# print '>>>>>>>>>>>>>', node.partial | |
else: | |
assert(False), 'oops, something went horribly wrong!' | |
else: | |
npatterns = len(patterns) | |
if node.partial is None: | |
node.partial = [0.0]*(4*4*npatterns) | |
like_list = [] | |
for i,pattern in enumerate(patterns.keys()): | |
m_list = [] | |
num_pattern = patterns[pattern] | |
for l,r in enumerate(rates): | |
num_pattern = patterns[pattern] | |
alphaT1 = r*((node.lchild.edgelen)/(2.*((freqA * freqC * _a) + (freqA * freqG * _b) + (freqA * freqT * _c) + (freqC * freqG * _d) + (freqC * freqT * _e) + (freqG * freqT * _f)))) | |
alphaT2 = r*((node.lchild.rsib.edgelen)/(2.*((freqA * freqC * _a) + (freqA * freqG * _b) + (freqA * freqT * _c) + (freqC * freqG * _d) + (freqC * freqT * _e) + (freqG * freqT * _f)))) | |
# GTR rate matrices | |
A1 = np.mat([[-(alphaT1*freqC*_a + alphaT1*freqG*_b + alphaT1*freqT*_c),alphaT1*freqC*_a, alphaT1*freqG*_b, alphaT1*freqT*_c], | |
[ alphaT1*freqA*_a,-(alphaT1*freqA*_a + alphaT1*freqG*_d + alphaT1*freqT*_e), alphaT1*freqG*_d, alphaT1*freqT*_e], | |
[ alphaT1*freqA*_b, alphaT1*freqC*_d,-(alphaT1*freqA*_b + alphaT1*freqC*_d + alphaT1*freqT*_f), alphaT1*freqT*_f], | |
[ alphaT1*freqA*_c, alphaT1*freqC*_e, alphaT1*freqG*_f,-(alphaT1*freqA*_c + alphaT1*freqC*_e + alphaT1*freqG*_f)]]) | |
A2 = np.mat([[-(alphaT2*freqC*_a + alphaT2*freqG*_b + alphaT2*freqT*_c),alphaT2*freqC*_a, alphaT2*freqG*_b, alphaT2*freqT*_c], | |
[ alphaT2*freqA*_a,-(alphaT2*freqA*_a + alphaT2*freqG*_d + alphaT2*freqT*_e), alphaT2*freqG*_d, alphaT2*freqT*_e], | |
[ alphaT2*freqA*_b, alphaT2*freqC*_d,-(alphaT2*freqA*_b + alphaT2*freqC*_d + alphaT2*freqT*_f), alphaT2*freqT*_f], | |
[ alphaT2*freqA*_c, alphaT2*freqC*_e, alphaT2*freqG*_f,-(alphaT2*freqA*_c + alphaT2*freqC*_e + alphaT2*freqG*_f)]]) | |
eigenvalue1, eigenvector1 = np.linalg.eig(A1) | |
eigenvalue2, eigenvector2 = np.linalg.eig(A2) | |
U1 = eigenvector1 | |
Uinv1 = np.linalg.inv(U1) | |
D1 = np.diag(np.exp(eigenvalue1)) | |
P1 = np.dot(U1,np.dot(D1,Uinv1)) | |
U2 = eigenvector2 | |
Uinv2 = np.linalg.inv(U2) | |
D2 = np.diag(np.exp(eigenvalue2)) | |
P2 = np.dot(U2,np.dot(D2,Uinv2)) | |
pAA = P1[[0],[0]] * (node.lchild.partial[i*16+l*4 + 0]) | |
pAC = P1[[0],[1]] * (node.lchild.partial[i*16+l*4 + 1]) | |
pAG = P1[[0],[2]] * (node.lchild.partial[i*16+l*4 + 2]) | |
pAT = P1[[0],[3]] * (node.lchild.partial[i*16+l*4 + 3]) | |
pAA2 = P2[[0],[0]] * (node.lchild.rsib.partial[i*16+l*4 + 0]) | |
pAC2 = P2[[0],[1]] * (node.lchild.rsib.partial[i*16+l*4 + 1]) | |
pAG2 = P2[[0],[2]] * (node.lchild.rsib.partial[i*16+l*4 + 2]) | |
pAT2 = P2[[0],[3]] * (node.lchild.rsib.partial[i*16+l*4 + 3]) | |
pfromA_lchild = pAA+pAC+pAG+pAT | |
pfromA_rchild = pAA2+pAC2+pAG2+pAT2 | |
node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild | |
###################################################### | |
pCA = P1[[1],[0]] * (node.lchild.partial[i*16+l*4 + 0]) | |
pCC = P1[[1],[1]] * (node.lchild.partial[i*16+l*4 + 1]) | |
pCG = P1[[1],[2]] * (node.lchild.partial[i*16+l*4 + 2]) | |
pCT = P1[[1],[3]] * (node.lchild.partial[i*16+l*4 + 3]) | |
pCA2 = P2[[1],[0]] * (node.lchild.rsib.partial[i*16+l*4 + 0]) | |
pCC2 = P2[[1],[1]] * (node.lchild.rsib.partial[i*16+l*4 + 1]) | |
pCG2 = P2[[1],[2]] * (node.lchild.rsib.partial[i*16+l*4 + 2]) | |
pCT2 = P2[[1],[3]] * (node.lchild.rsib.partial[i*16+l*4 + 3]) | |
pfromC_lchild = pCA+pCC+pCG+pCT | |
pfromC_rchild = pCA2+pCC2+pCG2+pCT2 | |
node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild | |
####################################################### | |
pGA = P1[[2],[0]] * (node.lchild.partial[i*16+l*4 + 0]) | |
pGC = P1[[2],[1]] * (node.lchild.partial[i*16+l*4 + 1]) | |
pGG = P1[[2],[2]] * (node.lchild.partial[i*16+l*4 + 2]) | |
pGT = P1[[2],[3]] * (node.lchild.partial[i*16+l*4 + 3]) | |
pGA2 = P2[[2],[0]] * (node.lchild.rsib.partial[i*16+l*4 + 0]) | |
pGC2 = P2[[2],[1]] * (node.lchild.rsib.partial[i*16+l*4 + 1]) | |
pGG2 = P2[[2],[2]] * (node.lchild.rsib.partial[i*16+l*4 + 2]) | |
pGT2 = P2[[2],[3]] * (node.lchild.rsib.partial[i*16+l*4 + 3]) | |
pfromG_lchild = pGA+pGC+pGG+pGT | |
pfromG_rchild = pGA2+pGC2+pGG2+pGT2 | |
node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild | |
####################################################### | |
pTA = P1[[3],[0]] * (node.lchild.partial[i*16+l*4 + 0]) | |
pTC = P1[[3],[1]] * (node.lchild.partial[i*16+l*4 + 1]) | |
pTG = P1[[3],[2]] * (node.lchild.partial[i*16+l*4 + 2]) | |
pTT = P1[[3],[3]] * (node.lchild.partial[i*16+l*4 + 3]) | |
pTA2 = P2[[3],[0]] * (node.lchild.rsib.partial[i*16+l*4 + 0]) | |
pTC2 = P2[[3],[1]] * (node.lchild.rsib.partial[i*16+l*4 + 1]) | |
pTG2 = P2[[3],[2]] * (node.lchild.rsib.partial[i*16+l*4 + 2]) | |
pTT2 = P2[[3],[3]] * (node.lchild.rsib.partial[i*16+l*4 + 3]) | |
pfromT_lchild = pTA+pTC+pTG+pTT | |
pfromT_rchild = pTA2+pTC2+pTG2+pTT2 | |
node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild | |
# site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25 | |
# site_log_like = (log(site_like))*num_pattern | |
# like_list.append(site_log_like) | |
# log_like = sum(like_list) | |
# return log_like | |
# return node.partial[i*16:i*16+16] | |
# computing final likelihood at the root of tree | |
def rootLoglike(postorder, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f): | |
# getting partial likelihoods of all the nodes, but our interest is on the node above the root which is postorder[-1] | |
prepareTree(postorder, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f) | |
npatterns = len(patterns) | |
node = postorder[-1] | |
if node.partial is None: | |
node.partial = [0.0]*(4*4*npatterns) | |
like_list = [] | |
for i,pattern in enumerate(patterns.keys()): | |
num_pattern = patterns[pattern] | |
# determining the nucleotide base for the root node, node.lchild is the node above the root node whose partial likelihood has already been computed using preparTree function | |
base = pattern[node.number-1] | |
if base == 'A': | |
final_freq = freqA | |
elif base == 'C': | |
final_freq = freqC | |
elif base == 'G': | |
final_freq = freqG | |
elif base == 'T': | |
final_freq = freqT | |
for l,r in enumerate(rates): | |
alphaT3 = r*((node.lchild.edgelen)/(2.*((freqA * freqC * _a) + (freqA * freqG * _b) + (freqA * freqT * _c) + (freqC * freqG * _d) + (freqC * freqT * _e) + (freqG * freqT * _f)))) | |
# GTR rate matrix | |
A3 = np.mat([[-(alphaT3*freqC*_a + alphaT3*freqG*_b + alphaT3*freqT*_c),alphaT3*freqC*_a, alphaT3*freqG*_b, alphaT3*freqT*_c], | |
[ alphaT3*freqA*_a,-(alphaT3*freqA*_a + alphaT3*freqG*_d + alphaT3*freqT*_e), alphaT3*freqG*_d, alphaT3*freqT*_e], | |
[ alphaT3*freqA*_b, alphaT3*freqC*_d,-(alphaT3*freqA*_b + alphaT3*freqC*_d + alphaT3*freqT*_f), alphaT3*freqT*_f], | |
[ alphaT3*freqA*_c, alphaT3*freqC*_e, alphaT3*freqG*_f,-(alphaT3*freqA*_c + alphaT3*freqC*_e + alphaT3*freqG*_f)]]) | |
# Diagonalizing rate*time (A3) matrix to find eigen values and eigen vector matrices | |
eigenvalue3, eigenvector3 = np.linalg.eig(A3) | |
U3 = eigenvector3 | |
Uinv3 = np.linalg.inv(U3) | |
# exponentiating eigen values | |
D3 = np.diag(np.exp(eigenvalue3)) | |
# getting probability matrix | |
P3 = np.dot(U3,np.dot(D3,Uinv3)) | |
if base == 'A': | |
node.partial[i*16+l*4 + 0] = 1.0 | |
pAA = P3[[0],[0]] * (node.lchild.partial[i*16+l*4 + 0]) | |
pAC = P3[[0],[1]] * (node.lchild.partial[i*16+l*4 + 1]) | |
pAG = P3[[0],[2]] * (node.lchild.partial[i*16+l*4 + 2]) | |
pAT = P3[[0],[3]] * (node.lchild.partial[i*16+l*4 + 3]) | |
pfromA_lchild = pAA+pAC+pAG+pAT | |
node.partial[i*16+l*4 + 0] = pfromA_lchild | |
elif base == 'C': | |
node.partial[i*16+l*4 + 1] = 1.0 | |
pCA = P3[[1],[0]] * (node.lchild.partial[i*16+l*4 + 0]) | |
pCC = P3[[1],[1]] * (node.lchild.partial[i*16+l*4 + 1]) | |
pCG = P3[[1],[2]] * (node.lchild.partial[i*16+l*4 + 2]) | |
pCT = P3[[1],[3]] * (node.lchild.partial[i*16+l*4 + 3]) | |
pfromC_lchild = pCA+pCC+pCG+pCT | |
node.partial[i*16+l*4 + 1] = pfromC_lchild | |
elif base == 'G': | |
node.partial[i*16+l*4 + 2] = 1.0 | |
pGA = P3[[2],[0]] * (node.lchild.partial[i*16+l*4 + 0]) | |
pGC = P3[[2],[1]] * (node.lchild.partial[i*16+l*4 + 1]) | |
pGG = P3[[2],[2]] * (node.lchild.partial[i*16+l*4 + 2]) | |
pGT = P3[[2],[3]] * (node.lchild.partial[i*16+l*4 + 3]) | |
pfromG_lchild = pGA+pGC+pGG+pGT | |
node.partial[i*16+l*4 + 2] = pfromG_lchild | |
elif base == 'T': | |
node.partial[i*16+l*4 + 3] = 1.0 | |
pTA = P3[[3],[0]] * (node.lchild.partial[i*16+l*4 + 0]) | |
pTC = P3[[3],[1]] * (node.lchild.partial[i*16+l*4 + 1]) | |
pTG = P3[[3],[2]] * (node.lchild.partial[i*16+l*4 + 2]) | |
pTT = P3[[3],[3]] * (node.lchild.partial[i*16+l*4 + 3]) | |
pfromT_lchild = pTA+pTC+pTG+pTT | |
node.partial[i*16+l*4 + 3] = pfromT_lchild | |
else: | |
assert(False), 'something went horribly wrong!' | |
site_like = (sum(node.partial[i*16:i*16+16]))*final_freq*0.25 | |
site_log_like = (log(site_like))*num_pattern | |
like_list.append(site_log_like) | |
log_like = sum(like_list) | |
return log_like | |
# function to call allocatePartial function | |
def prepareTree(postorder, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f): | |
likelihood_lists = [] | |
for nd in postorder[:-1]: | |
likelihood_lists.append(allocatePartial(nd, patterns, rates, freqA, freqC, freqG, freqT, _a, _b, _c, _d, _e, _f)) | |
# print 'likelihood_lists=', likelihood_lists[-1] | |
return likelihood_lists[-1] | |
# nearest node interchange function for Larget and Simon move | |
def nniNodeSwap(a, b): | |
x = a.par | |
y = b.par | |
# Detach a from tree | |
if a.rsib: | |
x.lchild = a.rsib | |
a.par = None | |
a.rsib = None | |
else: | |
x.lchild.rsib = None | |
a.par = None | |
# Detach b from tree | |
if b == x.rsib: | |
x.rsib = None | |
b.par = None | |
else: | |
y.lchild = x | |
b.rsib = None | |
b.par = None | |
# Reattach a to y | |
x.rsib = a | |
a.par = y | |
#Reattach b to x | |
x.lchild.rsib = b | |
b.par = x | |
# preorder node traversal | |
def getPreorder(nd, start = True): | |
global _preorder | |
if start: | |
_preorder = [] | |
_preorder.append(nd) | |
if nd.lchild: | |
getPreorder(nd.lchild, False) | |
if nd.rsib: | |
getPreorder(nd.rsib, False) | |
return _preorder | |
# creating unrooted starting tree with starting node as the outermost leaf | |
def rootlchild(preorder): | |
m = preorder[0] | |
t = preorder[1] | |
m.edgelen = preorder[1].edgelen | |
t.edgelen = 0.0 | |
m.lchild = t.rsib | |
t.par = None | |
t.lchild = None | |
t.rsib = None | |
m.par = t | |
t.lchild = m | |
preorder[0] = t | |
preorder[1] = m | |
new_pre = getPreorder(preorder[0]) | |
return new_pre | |
# proposing new tree state using Larget and Simon (1999) and Holder, Lewis, Swofford, Larget (2005). | |
def proposeNewState(postorder): | |
global _case | |
global internal_nodes | |
global m | |
global _topology_changed | |
global _d, _a, _x, _y, _c, _b, _orig_edgelen_top, _orig_edgelen_middle, _orig_edgelen_bottom, _new_edgelen_top, _new_edgelen_middle, _new_edgelen_bottom | |
# getting the list of all the internal nodes | |
internal_nodes = [] | |
for nd in postorder: | |
if (nd.par is not None) and (nd.par.edgelen !=0.0) and (nd.lchild is not None): | |
internal_nodes.append(nd) | |
# nodes to be manipulated for the proposed tree state change where _x is the | |
#random internal middle node of the contiguous 3-edge segment | |
_x = random.choice(internal_nodes) | |
_a = _x.lchild | |
_b = _a.rsib | |
_y = _x.par | |
_c = _y.par | |
if _x == _y.lchild: | |
_d = _x.rsib | |
else: | |
_d = _y.lchild | |
# selecting the internal node randomly | |
u1 = random.random() | |
a_on_path = random.random() <0.5 | |
if a_on_path: | |
_orig_edgelen_top = _a.edgelen | |
else: | |
_orig_edgelen_top = _b.edgelen | |
_orig_edgelen_middle = _x.edgelen | |
c_on_path = random.random() <0.5 | |
# selecting the other two random edges connected to the middle edge of a 3-edge segment to be changed | |
if c_on_path: | |
_orig_edgelen_bottom = _y.edgelen | |
else: | |
_orig_edgelen_bottom = _d.edgelen | |
_orig_edgelen_middle = _x.edgelen | |
u2 = random.random() | |
m = exp(0.2*(u2-0.5)) ## 0.2 is the tuning parameter, larger values make larger changes to the 3-edge sement | |
# print 'm1====================================', m | |
# modifying 3-edge sement | |
_new_edgelen_top = m*_orig_edgelen_top | |
_new_edgelen_middle = m*_orig_edgelen_middle | |
_new_edgelen_bottom = m*_orig_edgelen_bottom | |
# changed total length of 3-edge segment | |
new_focal_path_length = _new_edgelen_top + _new_edgelen_middle + _new_edgelen_bottom | |
# deciding the point along the modified 3-edge path where a random edge attached to | |
# 3-edge segment is swapped/attached | |
u3 = random.random() | |
new_attachment_point = u3*new_focal_path_length | |
_smallest_edge_length = 1.e-12 | |
if new_attachment_point <= _smallest_edge_length: | |
new_attachment_point = _smallest_edge_length | |
elif new_focal_path_length - new_attachment_point <= _smallest_edge_length: | |
new_attachment_point = new_focal_path_length - _smallest_edge_length | |
u7 = random.random() | |
if a_on_path and c_on_path: | |
if u7 < 0.5: | |
_case = 1 | |
################################################################## | |
# CASE = 1 | |
################################################################## | |
# (a) b* | |
# \ / | |
# \ / | |
# (x) d | |
# \ / | |
# \ / | |
# (y) | |
# | | |
# | | |
# (c) | |
################################################################## | |
# CASE = 1 | |
################################################################## | |
if new_attachment_point > _new_edgelen_top + _new_edgelen_middle: | |
print 'topology changed True, case 1' | |
_topology_changed = True | |
nniNodeSwap(_b, _d) | |
_a.edgelen = _new_edgelen_top + _new_edgelen_middle | |
_x.edgelen = new_attachment_point - _a.edgelen | |
_y.edgelen = new_focal_path_length - new_attachment_point | |
else: | |
print 'topology changed False, case 1' | |
_topology_changed = False; | |
_a.edgelen = new_attachment_point | |
_x.edgelen = _new_edgelen_top + _new_edgelen_middle - new_attachment_point | |
_y.edgelen = _new_edgelen_bottom | |
else: | |
_case = 2 | |
################################################################## | |
# CASE = 2 | |
################################################################## | |
# (a) b | |
# \ / | |
# \ / | |
# (x) d* | |
# \ / | |
# \ / | |
# (y) | |
# | | |
# | | |
# (c) | |
################################################################## | |
# CASE = 2 | |
################################################################## | |
if new_attachment_point < _new_edgelen_top: | |
print 'topology changed True, case 2' | |
_topology_changed = True | |
nniNodeSwap(_b, _d) | |
_a.edgelen = new_attachment_point | |
_x.edgelen = _new_edgelen_top - new_attachment_point | |
_y.edgelen = _new_edgelen_middle + _new_edgelen_bottom | |
else: | |
print 'topology changed False, case 2' | |
_topology_changed = False; | |
_a.edgelen = _new_edgelen_top | |
_x.edgelen = new_attachment_point - _new_edgelen_top | |
_y.edgelen = new_focal_path_length - new_attachment_point | |
elif (a_on_path == False) and (c_on_path): | |
if u7 < 0.5: | |
_case = 3 | |
################################################################## | |
# CASE = 3 | |
################################################################## | |
# a* (b) | |
# \ / | |
# \ / | |
# (x) d | |
# \ / | |
# \ / | |
# (y) | |
# | | |
# | | |
# (c) | |
################################################################## | |
# CASE = 3 | |
################################################################## | |
if new_attachment_point > _new_edgelen_top + _new_edgelen_middle: | |
print 'topology changed True, case 3' | |
_topology_changed = True | |
nniNodeSwap(_a, _d) | |
_b.edgelen = _new_edgelen_top + _new_edgelen_middle | |
_x.edgelen = new_attachment_point - _b.edgelen | |
_y.edgelen = new_focal_path_length - new_attachment_point | |
else: | |
print 'topology changed False, case 3' | |
_topology_changed = False; | |
_b.edgelen = new_attachment_point | |
_x.edgelen = _new_edgelen_top + _new_edgelen_middle - new_attachment_point | |
_y.edgelen = _new_edgelen_bottom | |
else: | |
_case = 4 | |
################################################################## | |
# CASE = 4 | |
################################################################## | |
# a (b) | |
# \ / | |
# \ / | |
# (x) d* | |
# \ / | |
# \ / | |
# (y) | |
# | | |
# | | |
# (c) | |
################################################################## | |
# CASE = 4 | |
################################################################## | |
if new_attachment_point < _new_edgelen_top: | |
print 'topology changed True, case 4' | |
_topology_changed = True | |
nniNodeSwap(_a, _d) | |
_b.edgelen = new_attachment_point | |
_x.edgelen = _new_edgelen_top - new_attachment_point | |
_y.edgelen = _new_edgelen_middle + _new_edgelen_bottom | |
else: | |
print 'topology changed False, case 4' | |
_topology_changed = False; | |
_b.edgelen = _new_edgelen_top | |
_x.edgelen = new_attachment_point - _new_edgelen_top | |
_y.edgelen = new_focal_path_length - new_attachment_point | |
elif (a_on_path) and (c_on_path==False): | |
if u7 < 0.5: | |
_case = 5 | |
################################################################## | |
# CASE = 5 | |
################################################################## | |
# (a) b* | |
# \ / | |
# \ / | |
# (x) d | |
# \ / | |
# \ / | |
# (y) | |
# | | |
# | | |
# c | |
################################################################## | |
# CASE = 5 | |
################################################################## | |
if new_attachment_point > _new_edgelen_top + _new_edgelen_middle: | |
print 'topology changed True, case 5' | |
_topology_changed = True | |
nniNodeSwap(_a, _d) | |
_a.edgelen = _new_edgelen_top + _new_edgelen_middle | |
_x.edgelen = new_attachment_point - _a.edgelen | |
_d.edgelen = new_focal_path_length - new_attachment_point | |
else: | |
print 'topology changed False, case 5' | |
_topology_changed = False; | |
_a.edgelen = new_attachment_point | |
_x.edgelen = _new_edgelen_top + _new_edgelen_middle - new_attachment_point | |
_d.edgelen = _new_edgelen_bottom | |
else: | |
_case = 6 | |
################################################################## | |
# CASE = 6 | |
################################################################## | |
# (a) b | |
# \ / | |
# \ / | |
# (x) d | |
# \ / | |
# \ / | |
# (y) | |
# | | |
# | | |
# c* | |
################################################################## | |
# CASE = 6 | |
################################################################## | |
if new_attachment_point < _new_edgelen_top: | |
print 'topology changed True, case 6' | |
_topology_changed = True | |
nniNodeSwap(_a, _d) | |
_d.edgelen = _new_edgelen_bottom + _new_edgelen_middle | |
_x.edgelen = _new_edgelen_top - new_attachment_point | |
_a.edgelen = new_attachment_point | |
else: | |
print 'topology changed False, case 6' | |
_topology_changed = False; | |
_a.edgelen = _new_edgelen_top | |
_x.edgelen = new_attachment_point - _new_edgelen_top | |
_d.edgelen = new_focal_path_length - new_attachment_point | |
else: | |
if u7 < 0.5: | |
_case = 7 | |
################################################################## | |
# CASE = 7 | |
################################################################## | |
# a* (b) | |
# \ / | |
# \ / | |
# (x) (d) | |
# \ / | |
# \ / | |
# (y) | |
# | | |
# | | |
# c | |
################################################################## | |
# CASE = 7 | |
################################################################## | |
if new_attachment_point > _new_edgelen_top + _new_edgelen_middle: | |
print 'topology changed True, case 7' | |
_topology_changed = True | |
nniNodeSwap(_b, _d) | |
_d.edgelen = new_focal_path_length - new_attachment_point | |
_x.edgelen = new_attachment_point - _new_edgelen_top - _new_edgelen_middle | |
_b.edgelen = _new_edgelen_top + _new_edgelen_middle | |
else: | |
print 'topology changed False, case 7' | |
_topology_changed = False; | |
_b.edgelen = new_attachment_point | |
_x.edgelen = _new_edgelen_top + _new_edgelen_middle - new_attachment_point | |
_d.edgelen = _new_edgelen_bottom | |
else: | |
_case = 8 | |
################################################################## | |
# CASE = 8 | |
################################################################## | |
# a (b) | |
# \ / | |
# \ / | |
# (x) (d) | |
# \ / | |
# \ / | |
# (y) | |
# | | |
# | | |
# c* | |
################################################################## | |
# CASE = 8 | |
################################################################## | |
if new_attachment_point < _new_edgelen_top: | |
print 'topology changed True, case 8' | |
_topology_changed = True | |
nniNodeSwap(_b, _d) | |
_b.edgelen = new_attachment_point | |
_x.edgelen = _new_edgelen_top - new_attachment_point | |
_d.edgelen = _new_edgelen_middle + _new_edgelen_bottom | |
else: | |
print 'topology changed False, case 8' | |
_topology_changed = False; | |
_b.edgelen = _new_edgelen_top | |
_x.edgelen = new_attachment_point - _new_edgelen_top | |
_d.edgelen = new_focal_path_length - new_attachment_point | |
preorder = getPreorder(postorder[-1]) | |
postorder = preorder[:] | |
postorder.reverse() | |
return postorder | |
# function to revert to original state if the proposed move is rejected | |
def revert_function(postorder): | |
if (_case == 1 or _case == 2): | |
if (_topology_changed): | |
nniNodeSwap(_d, _b); | |
_a.edgelen= _orig_edgelen_top | |
_x.edgelen = _orig_edgelen_middle | |
_y.edgelen = _orig_edgelen_bottom | |
else: | |
_a.edgelen = _orig_edgelen_top | |
_x.edgelen = _orig_edgelen_middle | |
_y.edgelen = _orig_edgelen_bottom | |
elif (_case == 3 or _case == 4): | |
if (_topology_changed): | |
nniNodeSwap(_d, _a) | |
_b.edgelen = _orig_edgelen_top | |
_x.edgelen = _orig_edgelen_middle | |
_y.edgelen = _orig_edgelen_bottom | |
else: | |
_b.edgelen = _orig_edgelen_top | |
_x.edgelen = _orig_edgelen_middle | |
_y.edgelen = _orig_edgelen_bottom | |
elif (_case == 5 or _case == 6): | |
if (_topology_changed): | |
nniNodeSwap(_d, _a) | |
_a.edgelen = _orig_edgelen_top | |
_x.edgelen = _orig_edgelen_middle | |
_d.edgelen = _orig_edgelen_bottom | |
else: | |
_a.edgelen = _orig_edgelen_top | |
_x.edgelen = _orig_edgelen_middle | |
_d.edgelen = _orig_edgelen_bottom | |
elif (_case == 7 or _case == 8): | |
if (_topology_changed): | |
nniNodeSwap(_d, _b) | |
_b.edgelen = _orig_edgelen_top | |
_x.edgelen = _orig_edgelen_middle | |
_d.edgelen = _orig_edgelen_bottom | |
else: | |
_b.edgelen = _orig_edgelen_top | |
_x.edgelen = _orig_edgelen_middle | |
_d.edgelen = _orig_edgelen_bottom | |
preorder = getPreorder(postorder[-1]) | |
postorder = preorder[:] | |
postorder.reverse() | |
return postorder | |
# total possible unrooted topology for prior | |
def binaryTrees(): | |
def factorial(n): | |
num = 1 | |
while n >= 1: | |
num = num * n | |
n = n - 1 | |
return num | |
total_binary_trees = factorial(2*number_of_species-5)/(factorial(number_of_species-3) * (2**(number_of_species-3))) | |
return total_binary_trees | |
# Hastings Ratio for base frequencies | |
def freqHastings(x1, x2, x3, x4): | |
tuning_lambda = 100. | |
theta1 = x1*tuning_lambda | |
theta2 = x2*tuning_lambda | |
theta3 = x3*tuning_lambda | |
theta4 = x4*tuning_lambda | |
g1 = random.gammavariate(theta1, 1) # 38.513225771664025 | |
g2 = random.gammavariate(theta2, 1) # 36.717529156834885 | |
g3 = random.gammavariate(theta3, 1) # 18.346105461005955 | |
g4 = random.gammavariate(theta4, 1) # 10.628233156657865 | |
gsum = g1 + g2 + g3 + g4 # 104.20509354616274 | |
x1star = g1/gsum # 0.36959062614921706 | |
x2star = g2/gsum # 0.3523582956198687 | |
x3star = g3/gsum # 0.17605766509751897 | |
x4star = g4/gsum # 0.10199341313339515 | |
# proposed frequencies from Dirichlet distribution | |
proposed_freq = [x1star, x2star, x3star, x4star] | |
#forward | |
log_forward_density = (theta1-1)*log(x1) | |
log_forward_density += (theta2-1)*log(x2) | |
log_forward_density += (theta3-1)*log(x3) | |
log_forward_density += (theta4-1)*log(x4) | |
log_forward_density += lgamma(theta1 + theta2 + theta3 + theta4) | |
log_forward_density -= lgamma(theta1) | |
log_forward_density -= lgamma(theta2) | |
log_forward_density -= lgamma(theta3) | |
log_forward_density -= lgamma(theta4) | |
#reverse | |
theta1star = x1star*tuning_lambda | |
theta2star = x2star*tuning_lambda | |
theta3star = x3star*tuning_lambda | |
theta4star = x4star*tuning_lambda | |
log_reverse_density = (theta1star-1)*log(x1star) | |
log_reverse_density += (theta2star-1)*log(x2star) | |
log_reverse_density += (theta3star-1)*log(x3star) | |
log_reverse_density += (theta4star-1)*log(x4star) | |
log_reverse_density += lgamma(theta1star + theta2star + theta3star + theta4star) | |
log_reverse_density -= lgamma(theta1star) | |
log_reverse_density -= lgamma(theta2star) | |
log_reverse_density -= lgamma(theta3star) | |
log_reverse_density -= lgamma(theta4star) | |
log_hastings_ratio = log_reverse_density - log_forward_density | |
return proposed_freq, log_hastings_ratio | |
# Hastings Ratio for rate exchangeabilites | |
def exrateHastings2(x1, x2, x3, x4, x5, x6): | |
tuning_lambda = 100. | |
theta1 = x1*tuning_lambda | |
theta2 = x2*tuning_lambda | |
theta3 = x3*tuning_lambda | |
theta4 = x4*tuning_lambda | |
theta5 = x5*tuning_lambda | |
theta6 = x6*tuning_lambda | |
g1 = random.gammavariate(theta1, 1) | |
g2 = random.gammavariate(theta2, 1) | |
g3 = random.gammavariate(theta3, 1) | |
g4 = random.gammavariate(theta4, 1) | |
g5 = random.gammavariate(theta5, 1) | |
g6 = random.gammavariate(theta6, 1) | |
gsum = g1 + g2 + g3 + g4 + g5 + g6 | |
x1star = g1/gsum | |
x2star = g2/gsum | |
x3star = g3/gsum | |
x4star = g4/gsum | |
x5star = g5/gsum | |
x6star = g6/gsum | |
# proposed rates from Dirichlet distribution | |
proposed_rates = [x1star, x2star, x3star, x4star, x5star, x6star] | |
#forward | |
log_forward_density = (theta1-1)*log(x1) | |
log_forward_density += (theta2-1)*log(x2) | |
log_forward_density += (theta3-1)*log(x3) | |
log_forward_density += (theta4-1)*log(x4) | |
log_forward_density += (theta5-1)*log(x5) | |
log_forward_density += (theta6-1)*log(x6) | |
log_forward_density += lgamma(theta1 + theta2 + theta3 + theta4 + theta5 + theta6) | |
log_forward_density -= lgamma(theta1) | |
log_forward_density -= lgamma(theta2) | |
log_forward_density -= lgamma(theta3) | |
log_forward_density -= lgamma(theta4) | |
log_forward_density -= lgamma(theta5) | |
log_forward_density -= lgamma(theta6) | |
#reverse | |
theta1star = x1star*tuning_lambda | |
theta2star = x2star*tuning_lambda | |
theta3star = x3star*tuning_lambda | |
theta4star = x4star*tuning_lambda | |
theta5star = x5star*tuning_lambda | |
theta6star = x6star*tuning_lambda | |
log_reverse_density = (theta1star-1)*log(x1star) | |
log_reverse_density += (theta2star-1)*log(x2star) | |
log_reverse_density += (theta3star-1)*log(x3star) | |
log_reverse_density += (theta4star-1)*log(x4star) | |
log_reverse_density += (theta5star-1)*log(x5star) | |
log_reverse_density += (theta6star-1)*log(x6star) | |
log_reverse_density += lgamma(theta1star + theta2star + theta3star + theta4star + theta5star + theta6star) | |
log_reverse_density -= lgamma(theta1star) | |
log_reverse_density -= lgamma(theta2star) | |
log_reverse_density -= lgamma(theta3star) | |
log_reverse_density -= lgamma(theta4star) | |
log_reverse_density -= lgamma(theta5star) | |
log_reverse_density -= lgamma(theta6star) | |
log_hastings_ratio = log_reverse_density - log_forward_density | |
return proposed_rates, log_hastings_ratio | |
# computing log prior density of proposed frequencies from Dirichlet prior distribution
def freqprior(x1, x2, x3, x4):
    '''Return the log Dirichlet(base_prior) density of base frequencies x1..x4.

    base_prior is the module-level list of the four Dirichlet parameters.
    '''
    thetas = (base_prior[0], base_prior[1], base_prior[2], base_prior[3])
    freqs = (x1, x2, x3, x4)
    # Dirichlet log kernel: sum_i (theta_i - 1) * log(x_i)
    density = 0.0
    for theta, freq in zip(thetas, freqs):
        density += (theta - 1) * log(freq)
    # add the log normalizing constant: lgamma(sum theta) - sum lgamma(theta)
    density += lgamma(thetas[0] + thetas[1] + thetas[2] + thetas[3])
    for theta in thetas:
        density -= lgamma(theta)
    return density
def exrateprior(x1, x2, x3, x4, x5, x6):
    '''Return the log prior density of the six GTR exchangeabilities x1..x6
    under a Dirichlet(exrate_prior) distribution.

    exrate_prior is the module-level list of the six Dirichlet parameters.
    '''
    theta1 = exrate_prior[0]
    theta2 = exrate_prior[1]
    theta3 = exrate_prior[2]
    theta4 = exrate_prior[3]
    theta5 = exrate_prior[4]
    theta6 = exrate_prior[5]
    # Dirichlet log kernel: sum_i (theta_i - 1) * log(x_i)
    log_exrate_prior = (theta1-1)*log(x1)
    log_exrate_prior += (theta2-1)*log(x2)
    log_exrate_prior += (theta3-1)*log(x3)
    log_exrate_prior += (theta4-1)*log(x4)
    # BUG FIX: the original used theta4 for both the x5 and x6 terms,
    # which gives a wrong density whenever the prior is not symmetric.
    log_exrate_prior += (theta5-1)*log(x5)
    log_exrate_prior += (theta6-1)*log(x6)
    # log normalizing constant: lgamma(sum theta) - sum lgamma(theta)
    log_exrate_prior += lgamma(theta1 + theta2 + theta3 + theta4 + theta5 + theta6)
    log_exrate_prior -= lgamma(theta1)
    log_exrate_prior -= lgamma(theta2)
    log_exrate_prior -= lgamma(theta3)
    log_exrate_prior -= lgamma(theta4)
    log_exrate_prior -= lgamma(theta5)
    log_exrate_prior -= lgamma(theta6)
    return log_exrate_prior
# the final step to mcmc analysis for proposing and then accepting or rejecting the Larget-Simon LOCAL move | |
def mcmc(postorder, patterns, rates): | |
mcmc = 0 | |
output = os.path.join('mcmc.txt') | |
output2 = os.path.join('trees.txt') | |
newf = open(output, 'w') | |
newf2 = open(output2, 'w') | |
newf.write('%s\t'%('n_gen')) | |
newf.write( '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t'%('LnL', 'freq_prior', 'TL', 'freqA','freqC','freqG','freqT','AC','AG','AT','CG','CT','GT')) | |
a = 0 | |
b = 0 | |
# for n,nl in enumerate(postorder): | |
# if nl.number > 0.0: | |
# newf.write( 'node%s\t'%(a+1)) | |
# a+=1 | |
newf.write('\n') | |
#starting frequency | |
node.freq = [0.4, 0.3, 0.2, 0.1] | |
# ex_rate = [0.166666666667, 0.166666666667, 0.166666666667, 0.166666666667, 0.166666666667, 0.166666666667] | |
node.ex_rate = [0.16, 0.16, 0.17, 0.17, 0.17, 0.17] | |
for r in range(n_gen): | |
print 'START#####################################################################################>' | |
print '********* starting postorder *********' | |
pre = postorder[:] | |
pre.reverse() | |
newick = makeNewick(pre[0]) | |
# print 'newick ==', newick | |
# for i in postorder: | |
# print i.number, | |
# for i in postorder: | |
# print i.edgelen, | |
# | |
# print 'start freq1=======================================>', node.freq[0], node.freq[1], node.freq[2], node.freq[3] | |
# | |
'''estimating first likelihood''' | |
################################################################################## | |
currentlike = rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]) | |
print 'currentlike 1===%.5f'%currentlike | |
'''computing prior densities for base frequencies, branch lengths, and topologies''' | |
################################################################################## | |
#log prior density of starting frequencies | |
current_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3]) | |
# joint log prior density of branch lengths | |
current_brln_prior = 0.0 | |
for nd in postorder: | |
current_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo)) | |
# log prior probability of topologies | |
total_possible_binary_trees = float(binaryTrees()) | |
topology_prior = -log(total_possible_binary_trees) | |
# log prior probability of rate exchangeabilities | |
exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]) | |
# computing joint log-prior of base frequencies, branch lengths, topologies and rate exchangeabilities | |
print 'current_freq_prior 1 ==%.5f'%current_freq_prior | |
joint_log_prior = current_brln_prior + topology_prior + current_freq_prior + exchangeabilities_prior | |
'''computing first log-posterior before topology change''' | |
################################################################################## | |
current_ln_posterior = currentlike + joint_log_prior | |
############################## | |
'''proposing new topology''' | |
############################## | |
postorder = proposeNewState(postorder) | |
print '********* new postorder *********' | |
# for i in postorder: | |
# print i.number, | |
# for i in postorder: | |
# print i.edgelen, | |
'''estimating second likelihood after topology change''' | |
################################################################################## | |
proposedlike = float(rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])) | |
print 'proposedlike 1===%.5f' %proposedlike | |
pre2 = postorder[:] | |
pre2.reverse() | |
newick = makeNewick(pre2[0]) | |
# print 'newick2 ==', newick | |
'''computing prior densities for base frequencies, branch lengths, and topologies''' | |
################################################################################## | |
#log prior density of same starting frequencies | |
current_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3]) | |
# joint log prior density of new branch lengths from the changed topology | |
proposed_brln_prior = 0.0 | |
for nd in postorder: | |
proposed_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo)) | |
# log prior probability of topologies | |
total_possible_binary_trees = float(binaryTrees()) | |
topology_prior = -log(total_possible_binary_trees) | |
# print 'current_freq_prior 1 ==%.5f'%current_freq_prior | |
# log prior probability of rate exchangeabilities | |
exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]) | |
# computing joint log-prior of base frequencies, branch lengths, topologies, and rate exchangeabilities | |
joint_log_prior = proposed_brln_prior + topology_prior + current_freq_prior + exchangeabilities_prior | |
'''computing second log-posterior after topology change''' | |
################################################################################## | |
proposed_ln_posterior = proposedlike + joint_log_prior | |
'''calculating acceptance ratio''' | |
################################################################################## | |
hastings_ratio = 3.*log(m) # log hasting_ratio | |
logR = proposed_ln_posterior - current_ln_posterior + hastings_ratio | |
'''decide whether to accept or reject the proposal move''' | |
################################################################################## | |
u8 = random.random() | |
print 'log(u8), logR==', log(u8), logR | |
if log(u8) < logR: | |
postorder = postorder | |
log_likelihood = proposedlike | |
print 'log(u8) < logR, so new proposal accepted..' | |
else: | |
postorder = revert_function(postorder) | |
log_likelihood = currentlike | |
print 'log(u8) > logR, so failed to accept new proposal..' | |
#################################################### | |
##################################################### | |
''' computations after proposing new frequencies''' | |
#################################################### | |
#################################################### | |
print '********* start new frequency proposal *********' | |
# for i in postorder: | |
# print i.number, | |
pre3 = postorder[:] | |
pre3.reverse() | |
newick = makeNewick(pre3[0]) | |
# print 'newick3 ==', newick | |
'''estimating third likelihood before the new frequency proposal''' | |
################################################################################## | |
currentlike = float(rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])) | |
print 'currentlike 2===%.5f'%currentlike | |
'''computing prior densities for base frequencies, branch lengths, and topologies''' | |
################################################################################## | |
#log prior density of current frequencies | |
current_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3]) | |
print 'pre_node.freq=', node.freq[0], node.freq[1], node.freq[2], node.freq[3] | |
# joint log prior density of branch lengths | |
current_brln_prior = 0.0 | |
for nd in postorder: | |
current_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo)) | |
# log prior probability of topologies | |
total_possible_binary_trees = float(binaryTrees()) | |
topology_prior = -log(total_possible_binary_trees) | |
# log prior probability of rate exchangeabilities | |
exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]) | |
# computing joint log-prior of base frequencies, branch lengths, topologies, and rate exchangeabilities | |
print 'current_freq_prior 2===%.5f'%current_freq_prior | |
joint_log_prior = current_brln_prior + topology_prior + current_freq_prior + exchangeabilities_prior | |
'''computing third log-posterior before the new frequency proposal''' | |
################################################################################## | |
current_ln_posterior = currentlike + joint_log_prior | |
################################### | |
'''proposing new new frequencies''' | |
################################### | |
proposed_freq = freqHastings(node.freq[0], node.freq[1], node.freq[2], node.freq[3])[0] | |
# print 'proposed new freq===', proposed_freq | |
'''estimating fourth likelihood after frequency change''' | |
################################################################################## | |
proposedlike = float(rootLoglike(postorder, patterns, rates, proposed_freq[0], proposed_freq[1], proposed_freq[2], proposed_freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])) | |
print 'proposedlike 2===%.5f'%proposedlike | |
print 'proposed pre_node.freq=', proposed_freq[0], proposed_freq[1], proposed_freq[2], proposed_freq[3] | |
'''computing prior densities for base frequencies, branch lengths, and topologies''' | |
################################################################################## | |
#log prior density of new proposed frequencies | |
proposed_freq_prior = freqprior(proposed_freq[0], proposed_freq[1], proposed_freq[2], proposed_freq[3]) | |
print 'proposed_freq_prior 2===%.5f'%proposed_freq_prior | |
# joint log prior density of branch lengths | |
proposed_brln_prior = 0.0 | |
for nd in postorder: | |
proposed_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo)) | |
# log prior probability of topologies | |
total_possible_binary_trees = float(binaryTrees()) | |
topology_prior = -log(total_possible_binary_trees) | |
# log prior probability of rate exchangeabilities | |
exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]) | |
# computing joint log prior density of new proposed frequencies, branch lengths, topologies, and rate exchangeabilities | |
joint_log_prior = proposed_brln_prior + topology_prior + proposed_freq_prior + exchangeabilities_prior | |
'''computing fourth log-posterior after frequency changed''' | |
################################################################################## | |
proposed_ln_posterior = proposedlike + joint_log_prior | |
'''calculating acceptance ratio''' | |
################################################################################## | |
hastings_ratio1 = freqHastings(node.freq[0], node.freq[1], node.freq[2], node.freq[3])[1] # hastings_ratio for frequency change | |
logR1 = proposed_ln_posterior - current_ln_posterior + hastings_ratio1 | |
'''decide whether to accept or reject the proposal move''' | |
################################################################################## | |
u9 = random.random() | |
print 'log(u9), logR1===', log(u9), logR1 | |
if log(u9) < logR1: | |
node.freq[0] = proposed_freq[0] | |
node.freq[1] = proposed_freq[1] | |
node.freq[2] = proposed_freq[2] | |
node.freq[3] = proposed_freq[3] | |
log_likelihood = proposedlike | |
freq_prior = proposed_freq_prior | |
print 'log(u9) < logR, so new proposal accepted..' | |
else: | |
node.freq[0] = node.freq[0] | |
node.freq[1] = node.freq[1] | |
node.freq[2] = node.freq[2] | |
node.freq[3] = node.freq[3] | |
log_likelihood = currentlike | |
freq_prior = current_freq_prior | |
print 'log(u9) > logR, so failed to accept new proposal..' | |
########################################################## | |
########################################################## | |
''' computations after proposing new exchangeabilities''' | |
########################################################## | |
########################################################## | |
print '********* start new exchangeabilities proposal *********' | |
# for i in postorder: | |
# print i.number, | |
pre4 = postorder[:] | |
pre4.reverse() | |
newick = makeNewick(pre4[0]) | |
print 'newick44 ==', newick | |
'''estimating fifth likelihood before the new ex rate proposal''' | |
################################################################################## | |
currentlike = float(rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])) | |
print 'currentlike 3===%.3f'%currentlike | |
'''computing prior densities for base frequencies, branch lengths, and topologies''' | |
################################################################################## | |
#log prior density of current frequencies | |
current_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3]) | |
print 'starting frequencies=', node.freq[0], node.freq[1], node.freq[2], node.freq[3] | |
# joint log prior density of branch lengths | |
current_brln_prior = 0.0 | |
for nd in postorder: | |
current_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo)) | |
# log prior probability of topologies | |
total_possible_binary_trees = float(binaryTrees()) | |
topology_prior = -log(total_possible_binary_trees) | |
# log prior probability of rate exchangeabilities | |
current_exchangeabilities_prior = exrateprior(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5]) | |
print 'current_exchangeabilities=', node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5] | |
print 'current_exchangeabilities_PAUPversion=', node.ex_rate[0]/node.ex_rate[5], node.ex_rate[1]/node.ex_rate[5], node.ex_rate[2]/node.ex_rate[5], node.ex_rate[3]/node.ex_rate[5], node.ex_rate[4]/node.ex_rate[5], node.ex_rate[5]/node.ex_rate[5] | |
# computing joint log-prior of base frequencies, branch lengths, topologies, and rate exchangeabilities | |
print 'current_freq_prior 2===%.5f'%current_freq_prior | |
joint_log_prior = current_brln_prior + topology_prior + current_freq_prior + current_exchangeabilities_prior | |
'''computing fifth log-posterior before the new frequency proposal''' | |
################################################################################## | |
current_ln_posterior = currentlike + joint_log_prior | |
############################################# | |
'''proposing new new rate exchangeabilities''' | |
############################################# | |
proposed_exchangeability_rates = exrateHastings2(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])[0] | |
# print 'proposed new freq===', proposed_freq | |
print 'proposed_exchangeability =', proposed_exchangeability_rates[0], proposed_exchangeability_rates[1], proposed_exchangeability_rates[2], proposed_exchangeability_rates[3], proposed_exchangeability_rates[4], proposed_exchangeability_rates[5] | |
print 'proposed_exchangeability_PAUPform =', proposed_exchangeability_rates[0]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[1]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[2]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[3]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[4]/proposed_exchangeability_rates[5], proposed_exchangeability_rates[5]/proposed_exchangeability_rates[5] | |
'''estimating sixth likelihood after rate exchangeabilities change''' | |
################################################################################## | |
proposedlike = float(rootLoglike(postorder, patterns, rates, node.freq[0], node.freq[1], node.freq[2], node.freq[3], proposed_exchangeability_rates[0], proposed_exchangeability_rates[1], proposed_exchangeability_rates[2], proposed_exchangeability_rates[3], proposed_exchangeability_rates[4], proposed_exchangeability_rates[5])) | |
print 'proposedlike 2===%.3f'%proposedlike | |
'''computing prior densities for base frequencies, branch lengths, and topologies''' | |
################################################################################## | |
#log prior density of new proposed frequencies | |
proposed_freq_prior = freqprior(node.freq[0], node.freq[1], node.freq[2], node.freq[3]) | |
print 'proposed_freq_prior 2===%.3f'%proposed_freq_prior | |
print 'proposed frequencies=', node.freq[0], node.freq[1], node.freq[2], node.freq[3] | |
# joint log prior density of branch lengths | |
proposed_brln_prior = 0.0 | |
for nd in postorder: | |
proposed_brln_prior += (-nd.edgelen/mean_expo)-(log(mean_expo)) | |
# log prior probability of topologies | |
total_possible_binary_trees = float(binaryTrees()) | |
topology_prior = -log(total_possible_binary_trees) | |
# log prior probability of rate exchangeabilities | |
proposed_exchangeabilities_prior = exrateprior(proposed_exchangeability_rates[0], proposed_exchangeability_rates[1], proposed_exchangeability_rates[2], proposed_exchangeability_rates[3], proposed_exchangeability_rates[4], proposed_exchangeability_rates[5]) | |
# computing joint log prior density of new proposed frequencies, branch lengths, topologies, and rate exchangeabilities | |
joint_log_prior = proposed_brln_prior + topology_prior + proposed_freq_prior + proposed_exchangeabilities_prior | |
'''computing fourth log-posterior after rate exchangeabilities change''' | |
################################################################################## | |
proposed_ln_posterior = proposedlike + joint_log_prior | |
'''calculating acceptance ratio''' | |
################################################################################## | |
hastings_ratio2 = exrateHastings2(node.ex_rate[0], node.ex_rate[1], node.ex_rate[2], node.ex_rate[3], node.ex_rate[4], node.ex_rate[5])[1] # hastings_ratio for rate exchangeabilities change | |
logR2 = proposed_ln_posterior - current_ln_posterior + hastings_ratio2 | |
'''decide whether to accept or reject the proposal move''' | |
################################################################################## | |
u10 = random.random() | |
print 'log(u10), logR2===', log(u10), logR2 | |
if log(u10) < logR2: | |
node.ex_rate[0] = proposed_exchangeability_rates[0] | |
node.ex_rate[1] = proposed_exchangeability_rates[1] | |
node.ex_rate[2] = proposed_exchangeability_rates[2] | |
node.ex_rate[3] = proposed_exchangeability_rates[3] | |
node.ex_rate[4] = proposed_exchangeability_rates[4] | |
node.ex_rate[5] = proposed_exchangeability_rates[5] | |
log_likelihood = proposedlike | |
exchangeability_rates_prior = proposed_exchangeabilities_prior | |
print 'log(u10) < logR2, so new proposal accepted..' | |
else: | |
node.ex_rate[0] = node.ex_rate[0] | |
node.ex_rate[1] = node.ex_rate[1] | |
node.ex_rate[2] = node.ex_rate[2] | |
node.ex_rate[3] = node.ex_rate[3] | |
node.ex_rate[4] = node.ex_rate[4] | |
node.ex_rate[5] = node.ex_rate[5] | |
log_likelihood = currentlike | |
exchangeability_rates_prior = current_exchangeabilities_prior | |
print 'log(u10) > logR2, so failed to accept new proposal..' | |
################################################################ | |
################################################################ | |
''' final accepted toplogies and frequencies for output files''' | |
############################################################### | |
############################################################### | |
if (r+1) % save_every == 0: | |
newf.write('%s\t'%(mcmc+1)) | |
print 'final log_likelihood==', log_likelihood | |
newf.write('%.6f\t'%(log_likelihood)) | |
newf.write( '%.6f\t'%(freq_prior)) | |
TL = 0.0 | |
for j,k in enumerate(postorder): | |
TL += k.edgelen | |
newf.write('%.6f\t'%(TL)) | |
sorted_postorder = sorted(postorder, key=operator.attrgetter('number')) | |
for i in node.freq: | |
newf.write('%.6f\t'%i) | |
for i in node.ex_rate: | |
newf.write('%.6f\t'%i) | |
# i = 0 | |
# while i < len(sorted_postorder): | |
# if sorted_postorder[i].number > 0: | |
# newf.write('%.6f\t'%(sorted_postorder[i].edgelen)) | |
# i += 1 | |
newf.write('\n') | |
if (r+1) % save_every == 0: | |
pre = postorder[:] | |
pre.reverse() | |
newick = makeNewick(pre[0]) | |
newf2.write('%s;\n'%(newick)) | |
print 'end of mcmc gen=', mcmc+1 | |
print '**************************' | |
mcmc+=1 | |
newf.flush() | |
def joinRandomPair(node_list, next_node_number, is_deep_coalescence):
    '''Pick two distinct lineages at random, join them under a new ancestor
    node numbered next_node_number, and return the updated lineage list.

    The ancestor's descendant set is the union of the two children's sets;
    is_deep_coalescence is recorded on the ancestor's .deep attribute.
    '''
    # first lineage: draw an index, remove it from the pool
    first_idx = random.randint(1, len(node_list)) - 1
    left_child = node_list.pop(first_idx)
    # second lineage: drawn from the now-smaller pool, so always distinct
    second_idx = random.randint(1, len(node_list)) - 1
    right_child = node_list.pop(second_idx)
    # build the ancestor and wire up the family links
    ancestor = node(next_node_number)
    ancestor.deep = is_deep_coalescence
    ancestor.lchild = left_child
    ancestor.descendants = set() | left_child.descendants | right_child.descendants
    left_child.rsib = right_child
    left_child.par = ancestor
    right_child.par = ancestor
    node_list.append(ancestor)
    return node_list
# function to make newick string through preorder node traversal
def makeNewick(nd, brlen_scaler = 1.0, start = True): #
    '''Build a newick string for a tree whose root has exactly one child
    (the rooting convention used elsewhere in this script).

    The string is accumulated in the module-level _newick; _TL accumulates the
    total tree length (sum of scaled edge lengths added so far).  Recursive
    calls pass start=False; only the outermost call passes start=True.
    Returns _newick (complete only after the outermost call finishes).
    '''
    global _newick
    global _TL
    if start:
        # outermost call must receive the root: no parent, exactly one child
        assert nd.par is None, 'makeNewick function assumes that nd provided is the root node (which has no parent), but that was not true in this case'
        assert nd.lchild is not None, 'makeNewick function assumes that nd provided is the root node (which has at least one child), but that was not true in this case'
        assert nd.lchild.rsib is None, 'makeNewick function assumes that nd provided is the root node (which has only one child), but that was not true in this case'
        _TL = 0.0
        # open the tree with the root's label and the edge to its single child
        _newick = '(%d:%.5f' % (nd.number, nd.lchild.edgelen)
    if not start:
        if nd.lchild:
            # internal node: open a clade and recurse into the first child
            _newick += '('
            makeNewick(nd.lchild, brlen_scaler, False)
        else:
            # tip: emit "number:branch_length" and count its length in _TL
            blen = nd.edgelen*brlen_scaler
            _TL += blen
            _newick += '%d:%.5f' % (nd.number, blen)
    if start:
        # descend past the root's single child into the rest of the tree
        _newick += ','
        makeNewick(nd.lchild.lchild, brlen_scaler, False)
    elif nd.rsib:
        # a sibling follows at this level
        _newick += ','
        makeNewick(nd.rsib, brlen_scaler, False)
    elif nd.par is not None and nd.par.par is not None and nd.par.par.par is not None:
        # last child of an inner clade: close it with the parent's branch length
        blen = nd.par.edgelen*brlen_scaler
        _TL += blen
        _newick += '):%.5f' % blen
    else:
        # closing the outermost clade: no branch length at the root
        _newick += ')'
    return _newick
def calcActualHeight(root):
    '''Return the sum of edge lengths along the leftmost root-to-tip path.'''
    height = 0.0
    current = root.lchild
    # follow lchild pointers to a tip, accumulating each edge we cross
    while current:
        height += current.edgelen
        current = current.lchild
    return height
def readnewick(tree):
    '''Parse a newick string into a preorder list of node objects.

    Nodes are created with temporary negative numbers as the string is
    scanned; tip labels (runs of digits) overwrite the number of the node
    they belong to, and branch lengths follow ':'.  The list is passed
    through rootlchild (defined elsewhere in this file) before returning.

    NOTE(review): a branch length or label at the very end of the string
    would read past the end (tree[i] after the last digit) -- callers appear
    to always terminate the string with ')'; confirm before reuse.
    '''
    total_length = len(tree)
    internal_node_number = -1
    root = node(internal_node_number)
    nd = root
    i = 0
    pre = [root]
    while i < total_length:
        m = tree[i]
        if m =='(':
            # start of a clade: create a left child and descend into it
            internal_node_number -= 1
            child = node(internal_node_number)
            pre.append(child)
            nd.lchild=child
            child.par=nd
            nd=child
        elif m == ',':
            # sibling at the current level
            internal_node_number -= 1
            rsib = node(internal_node_number)
            pre.append(rsib)
            nd.rsib = rsib
            rsib.par=nd.par
            nd = rsib
        elif m == ')':
            # end of a clade: climb back to the parent
            nd = nd.par
        elif m == ':':
            # read the branch length that follows the colon
            edge_len_str = ''
            i+=1
            m = tree[i]
            assert m in ['0','1','2','3','4','5','6','7','8', '9','.']
            while m in ['0','1','2','3','4','5','6','7','8', '9','.']:
                edge_len_str += m
                i+=1
                m = tree[i]
            # step back so the outer loop's i += 1 lands on the delimiter
            i -=1
            nd.edgelen = float(edge_len_str)
        else:
            # a run of digits is a tip label: store it as this node's number
            internal_node_number += 1
            if True:
                assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m
                mm = ''
                while m in ['0','1','2','3','4','5','6','7','8', '9' ]:
                    mm += m
                    i += 1
                    m = tree[i]
                nd.number = int(mm)
                # step back so the outer loop's i += 1 lands on the delimiter
                i -= 1
        i += 1
    #post = pre[:]
    #post.reverse()
    # return post
    new_pre = rootlchild(pre)
    return new_pre
def Makenewick(pre):
    '''Build a newick string from a preorder list of nodes.

    Internal nodes contribute '('; tips contribute "number:edgelen" followed
    either by ',' (if a sibling follows) or by the closing parentheses of
    every clade they terminate, each with its parent's branch length.
    '''
    pieces = []
    for current in pre:
        if current.lchild:
            # internal node opens a new clade
            pieces.append('(')
            continue
        # tip: label plus its own branch length
        pieces.append('%d:%.4f' % (current.number, current.edgelen))
        if current.rsib:
            pieces.append(',')
        else:
            # last child: close every clade this tip terminates
            climber = current
            while (climber.par is not None) and (climber.rsib is None):
                pieces.append('):%.4f' % climber.par.edgelen)
                climber = climber.par
            if climber.par is not None:
                pieces.append(',')
    return ''.join(pieces)
# function to read a newick tree
def treenewick():
    '''Read and return the newick string from tree_file_name, which is
    expected to sit in the same directory as this script.'''
    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    tree_path = os.path.join(script_dir, tree_file_name)
    handle = open(tree_path, 'r')
    try:
        return handle.read()
    finally:
        handle.close()
################### yule tree simulator ###################################################
# calcPhi computes sum_{K=2}^S 1/K, where S is the number of leaves in the tree
# - num_species is the number of leaves (tips) in the tree
def calcPhi(num_species):
    '''Return 1/2 + 1/3 + ... + 1/num_species (used for expected Yule tree height).'''
    total = 0.0
    for K in range(num_species - 1):
        total += 1.0 / (K + 2.0)
    return total
# yuleTree creates a species tree in which edge lengths are measured in
# expected number of substitutions.
# - num_species is the number of leaves
# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
def yuleTree(num_species, mu_over_s):
    '''Simulate a Yule (pure-birth) tree with num_species tips.

    Tips are numbered 1..num_species and internal nodes continue the
    numbering.  Returns the root node of the finished tree.
    '''
    lineages = [node(i + 1) for i in range(num_species)]
    next_number = num_species + 1
    while len(lineages) > 1:
        # epoch length is exponential with mean mu_over_s / (number of lineages)
        K = float(len(lineages))
        epoch = random.gammavariate(1.0, mu_over_s / K)
        # every surviving lineage is extended by this epoch
        for lineage in lineages:
            lineage.edgelen += epoch
        # merge a random pair under a new ancestor
        lineages = joinRandomPair(lineages, next_number, False)
        next_number += 1
    return lineages[0]
# calcExpectedHeight returns the expected height of the species tree in terms of
# expected number of substitutions from the root to one tip.
# - num_species is the number of leaves
# - mu_over_s is the mutations-per-generation/speciations-per-generation rate ratio
def calcExpectedHeight(num_species, mu_over_s):
    '''Expected root-to-tip height: mu_over_s * sum_{K=2}^{num_species} 1/K.'''
    phi = calcPhi(num_species)
    return phi * mu_over_s
def makeStartNewick(nd, brlen_scaler = 1.0, start = True): #
    '''Build a newick string for the subtree rooted at nd into the
    module-level _newick buffer; _TL accumulates the total (scaled) length.

    NOTE(review): the recursive calls delegate to makeNewick, not to this
    function; makeNewick's start=False branch performs the same expansion,
    so this looks intentional -- confirm before refactoring.
    '''
    global _newick
    global _TL
    if start:
        # reset the module-level accumulators on the outermost call
        _newick = ''
        _TL = 0.0
    if nd.lchild:
        # internal node: open a clade and recurse into the first child
        _newick += '('
        makeNewick(nd.lchild, brlen_scaler, False)
    else:
        # tip: emit "number:branch_length" and count its length in _TL
        blen = nd.edgelen*brlen_scaler
        _TL += blen
        _newick += '%d:%.5f' % (nd.number, blen)
    if nd.rsib:
        # a sibling follows at this level
        _newick += ','
        makeNewick(nd.rsib, brlen_scaler, False)
    elif nd.par is not None:
        # last child: close the clade with the parent's branch length
        blen = nd.par.edgelen*brlen_scaler
        _TL += blen
        _newick += '):%.5f' % blen
    # return _newick, _TL
    return _newick
def createRandomTree(number_of_species): | |
root_node_length = random.lognormvariate(1, 0.05) | |
species_tree_root = yuleTree(int(number_of_species)-1, 0.689655172) | |
newick = makeStartNewick(species_tree_root) | |
_newick = '(%s:%.5f,' %(number_of_species,root_node_length) | |
_newick += newick[1:-1] | |
_newick += ')' | |
print 'random starting tree =', _newick | |
return _newick | |
if __name__ == '__main__':
    # fix the pseudo-random seed so runs are reproducible
    random_seed = 2551189 # 7632557, 12345
    # number_of_species = 5
    # mutation_speciation_rate_ratio = 0.689655172 # 0.689655172 # yields tree height 1 for 6 species
    random.seed(random_seed)
    # species_tree_root = yuleTree(number_of_species, mutation_speciation_rate_ratio)
    # newick = makeStartNewick(species_tree_root)
    # print 'Random number seed: %d' % random_seed
    # print 'Simulating one tree:'
    # print '  number of species = %d' % number_of_species
    # print '  mutation-speciation rate ratio = %g' % mutation_speciation_rate_ratio
    # print '  actual tree length =',newick[1]
    # expected_height = calcExpectedHeight(number_of_species, mutation_speciation_rate_ratio)
    # print '  expected height =',expected_height
    # actual_height = calcActualHeight(species_tree_root)
    # print '  actual height =',actual_height
    # print 'true tree: ',newick[0]
    # print '**************************'
    # taxon count comes from the alignment; readSeq.patterns returns
    # (num_taxa, site_patterns, ...) -- element [0] is the taxon count
    number_of_species = readSeq.patterns(sequence_file)[0]
    # random Yule starting tree, then convert newick -> preorder -> postorder
    random_start_tree = createRandomTree(number_of_species)
    preorder = readnewick(random_start_tree)
    postorder = preorder[:]
    postorder.reverse()
    # discrete gamma rate categories for the chosen shape parameter alpha
    rates_list = gammaRates(alpha)
    # run the chain; element [1] of readSeq.patterns is the site-pattern data
    result = mcmc(postorder, readSeq.patterns(sequence_file)[1], rates_list)