Skip to content

Brnlen mcmc #2

Merged
merged 12 commits into from
May 26, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions JC+GAMMA/readSeq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
def patterns():
#
import re, os, glob, itertools, fnmatch, sys, shutil
from itertools import combinations
from collections import Counter

script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
# path = os.path.join(script_dir, 'nexus')

genes = []
data = {}
# print 'Reading nexus files...'
for filename in glob.glob(os.path.join(script_dir, '*.nex')):

m = re.match('(.+).nex', os.path.basename(filename))
gene_name = m.group(1)
# print 'gene_name=', gene_name
genes.append(gene_name)
f = open(filename, 'r').read()

m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S)
ntax = int(m.group(1))
# print 'ntax=', ntax

m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S)
nchar = int(m.group(1))
# print 'nchar=', nchar

m = re.search('Matrix\s+(.+?);', f, re.M | re.S)
matrix = m.group(1).strip()
matrix_lines = matrix.split('\n')

taxon_names = []
sequences = {}
sequences_list = []
for line in matrix_lines:
parts = line.strip().split()
assert len(parts) == 2
taxon_name = parts[0]
sequence = parts[1]

taxon_names.append(taxon_name)
sequences_list.append(sequence)
sequences[taxon_name] = sequence

pattern_list = []

k=0
while k < nchar:
site_pattern = ''
for i,m in enumerate(sequences_list):
site_pattern += m[k]
pattern_list.append(site_pattern)
k+=1
pattern_dict = dict()
for i in pattern_list:
pattern_dict[i] = pattern_dict.get(i, 0) + 1

tmp = []
for key in pattern_dict.keys(): ###convert dict to key of tupules
# print 'key=', key
tmp.append((pattern_dict[key],key))

sorted_values = sorted(tmp) ###sorted according to key smaller to larger
sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order

return pattern_dict


Binary file added JC+GAMMA/readSeq.pyc
Binary file not shown.
File renamed without changes.
20 changes: 20 additions & 0 deletions JC+GAMMA/simulated.nex

Large diffs are not rendered by default.

259 changes: 122 additions & 137 deletions treeLike_JCGAMMA.py → JC+GAMMA/test3.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
##################################
# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree
# topology, and computes log-likelihood of the topology under Jukes Cantor+GAMMA model
# topology, and computes log-likelihood of the topology under Jukes Cantor model
###################################


Expand All @@ -10,17 +10,6 @@
from itertools import chain
from scipy.stats import gamma
from math import exp, log

##########################################################################################
tree_file_name = 'tree.tre'
sequence_file = 'example3.nex'
alpha = 0.5 #gamma shape parameter for rate categories
##########################################################################################





class node(object):
def __init__(self, ndnum): # initialization function
self.rsib = None # right sibling
Expand All @@ -31,7 +20,120 @@ def __init__(self, ndnum): # initialization function
self.descendants = set([ndnum]) # set containing descendant leaf set
self.partial = None # will have length 4*npatterns

def allocatePartial(self, patterns, rates):
if self.number > 0:
npatterns = len(patterns)
# print 'npat', npatterns
self.partial = [0.0]*(4*4*npatterns)
# print len(self.partial)
for i,pattern in enumerate(patterns.keys()):
base = pattern[self.number-1]
for l in range(4):
if base == 'A':
self.partial[i*16+l*4 + 0] = 1.0
elif base == 'C':
self.partial[i*16+l*4 + 1] = 1.0
elif base == 'G':
self.partial[i*16+l*4 + 2] = 1.0
elif base == 'T':
self.partial[i*16+l*4 + 3] = 1.0
else:
assert(False), 'oops, something went horribly wrong!'

else:
# rt = [0.03338775, 0.25191592, 0.82026848, 2.89442785]
# rt = [2.89442785]

# rt = [1.0, 1.0, 1.0, 1.0]

npatterns = len(patterns)
# print 'patterns=', patterns
self.partial = [0.0]*(4*4*npatterns)
like_list = []
for i,pattern in enumerate(patterns.keys()):
# print i, pattern, patterns.keys()
m_list = []
num_pattern = patterns[pattern]
# print num_pattern

for l,m in enumerate(rates):


psame = (0.25+0.75*exp(-4.0*m*(self.lchild.edgelen)/3.0))
pdiff = (0.25-0.25*exp(-4.0*m*(self.lchild.edgelen)/3.0))

psame2 = (0.25+0.75*exp(-4.0*m*(self.lchild.rsib.edgelen)/3.0))
pdiff2 = (0.25-0.25*exp(-4.0*m*(self.lchild.rsib.edgelen)/3.0))

num_pattern = patterns[pattern]
pAA = psame*(self.lchild.partial[i*16+l*4 + 0])
pAC = pdiff*(self.lchild.partial[i*16+l*4 + 1])
pAG = pdiff*(self.lchild.partial[i*16+l*4 + 2])
pAT = pdiff*(self.lchild.partial[i*16+l*4 + 3])

pAA2 = psame2*(self.lchild.rsib.partial[i*16+l*4 + 0])
pAC2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 1])
pAG2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 2])
pAT2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 3])

pfromA_lchild = pAA+pAC+pAG+pAT
pfromA_rchild = pAA2+pAC2+pAG2+pAT2
self.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild


######################################################

pCA = pdiff*(self.lchild.partial[i*16+l*4 + 0])
pCC = psame*(self.lchild.partial[i*16+l*4 + 1])
pCG = pdiff*(self.lchild.partial[i*16+l*4 + 2])
pCT = pdiff*(self.lchild.partial[i*16+l*4 + 3])

pCA2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 0])
pCC2 = psame2*(self.lchild.rsib.partial[i*16+l*4 + 1])
pCG2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 2])
pCT2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 3])

pfromC_lchild = pCA+pCC+pCG+pCT
pfromC_rchild = pCA2+pCC2+pCG2+pCT2
self.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild

#######################################################
#
pGA = pdiff*(self.lchild.partial[i*16+l*4 + 0])
pGC = pdiff*(self.lchild.partial[i*16+l*4 + 1])
pGG = psame*(self.lchild.partial[i*16+l*4 + 2])
pGT = pdiff*(self.lchild.partial[i*16+l*4 + 3])

pGA2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 0])
pGC2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 1])
pGG2 = psame2*(self.lchild.rsib.partial[i*16+l*4 + 2])
pGT2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 3])

pfromG_lchild = pGA+pGC+pGG+pGT
pfromG_rchild = pGA2+pGC2+pGG2+pGT2
self.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild

#######################################################

pTA = pdiff*(self.lchild.partial[i*16+l*4 + 0])
pTC = pdiff*(self.lchild.partial[i*16+l*4 + 1])
pTG = pdiff*(self.lchild.partial[i*16+l*4 + 2])
pTT = psame*(self.lchild.partial[i*16+l*4 + 3])

pTA2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 0])
pTC2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 1])
pTG2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 2])
pTT2 = psame2*(self.lchild.rsib.partial[i*16+l*4 + 3])

pfromT_lchild = pTA+pTC+pTG+pTT
pfromT_rchild = pTA2+pTC2+pTG2+pTT2
self.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild

site_like = (sum(self.partial[i*16:i*16+16]))*0.25*0.25
site_log_like = (log(site_like))*num_pattern
like_list.append(site_log_like)
log_like = sum(like_list)
return log_like


def __str__(self):
Expand All @@ -51,131 +153,16 @@ def __str__(self):
parstr = '%d' % self.par.number

return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)


def allocatePartial(node, patterns, rates):
if node.number > 0:
npatterns = len(patterns)
# print 'npat', npatterns
node.partial = [0.0]*(4*4*npatterns)
# print len(node.partial)
for i,pattern in enumerate(patterns.keys()):
base = pattern[node.number-1]
for l in range(4):
if base == 'A':
node.partial[i*16+l*4 + 0] = 1.0
elif base == 'C':
node.partial[i*16+l*4 + 1] = 1.0
elif base == 'G':
node.partial[i*16+l*4 + 2] = 1.0
elif base == 'T':
node.partial[i*16+l*4 + 3] = 1.0
else:
assert(False), 'oops, something went horribly wrong!'

else:
# rt = [0.03338775, 0.25191592, 0.82026848, 2.89442785]
# rt = [2.89442785]

# rt = [1.0, 1.0, 1.0, 1.0]

npatterns = len(patterns)
# print 'patterns=', patterns
node.partial = [0.0]*(4*4*npatterns)
like_list = []
for i,pattern in enumerate(patterns.keys()):
# print i, pattern, patterns.keys()
m_list = []
num_pattern = patterns[pattern]
# print num_pattern

for l,m in enumerate(rates):


psame = (0.25+0.75*exp(-4.0*m*(node.lchild.edgelen)/3.0))
pdiff = (0.25-0.25*exp(-4.0*m*(node.lchild.edgelen)/3.0))

psame2 = (0.25+0.75*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0))
pdiff2 = (0.25-0.25*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0))

num_pattern = patterns[pattern]
pAA = psame*(node.lchild.partial[i*16+l*4 + 0])
pAC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
pAG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
pAT = pdiff*(node.lchild.partial[i*16+l*4 + 3])

pAA2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 0])
pAC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
pAG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
pAT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])

pfromA_lchild = pAA+pAC+pAG+pAT
pfromA_rchild = pAA2+pAC2+pAG2+pAT2
node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild


######################################################

pCA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
pCC = psame*(node.lchild.partial[i*16+l*4 + 1])
pCG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
pCT = pdiff*(node.lchild.partial[i*16+l*4 + 3])

pCA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
pCC2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 1])
pCG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
pCT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])

pfromC_lchild = pCA+pCC+pCG+pCT
pfromC_rchild = pCA2+pCC2+pCG2+pCT2
node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild

#######################################################
#
pGA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
pGC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
pGG = psame*(node.lchild.partial[i*16+l*4 + 2])
pGT = pdiff*(node.lchild.partial[i*16+l*4 + 3])

pGA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
pGC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
pGG2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 2])
pGT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])

pfromG_lchild = pGA+pGC+pGG+pGT
pfromG_rchild = pGA2+pGC2+pGG2+pGT2
node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild

#######################################################

pTA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
pTC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
pTG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
pTT = psame*(node.lchild.partial[i*16+l*4 + 3])

pTA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
pTC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
pTG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
pTT2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 3])

pfromT_lchild = pTA+pTC+pTG+pTT
pfromT_rchild = pTA2+pTC2+pTG2+pTT2
node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild

site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25
site_log_like = (log(site_like))*num_pattern
like_list.append(site_log_like)
log_like = sum(like_list)
return log_like


def treenewick():
script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
path = os.path.join(script_dir, tree_file_name)
# print script_dir
path = os.path.join(script_dir, 'tree.tre')
with open(path, 'r') as content:
newick = content.read()
return newick
#
a = treenewick()


def gammaRates(alpha):
bounds = [0.0, 0.25, 0.50, 0.75, 1.]
Expand All @@ -191,7 +178,7 @@ def gammaRates(alpha):
def prepareTree(postorder, patterns, rates):
likelihood_lists = []
for nd in postorder:
likelihood_lists.append(allocatePartial(nd, patterns, rates))
likelihood_lists.append(nd.allocatePartial(patterns, rates))
print 'log-likelihood of the topology =', likelihood_lists[-1]


Expand Down Expand Up @@ -319,8 +306,6 @@ def readnewick(tree):

post = pre[:]
post.reverse()
for nd in post:
print nd.number, nd.edgelen
return post

def Makenewick(pre):
Expand Down Expand Up @@ -407,7 +392,7 @@ def calcExpectedHeight(num_species, mu_over_s):
# print ' newick: ',newick[0]

alpha = 0.5 ### gamma shape parameter rate categories
# yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)'
yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)'
rates_list = gammaRates(alpha)
postorder = readnewick(treenewick())
result = prepareTree(postorder, readSeq.patterns(sequence_file), rates_list)
postorder = readnewick(yuletree)
result = prepareTree(postorder, readSeq.patterns(), rates_list)
1 change: 1 addition & 0 deletions JC+GAMMA/tree.tre
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)
Loading