Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #2 from sun13005/brnlenMCMC
Brnlen mcmc
  • Loading branch information
sun13005 committed May 26, 2017
2 parents fdc28d8 + b5d929a commit d51978f
Show file tree
Hide file tree
Showing 14 changed files with 241 additions and 989 deletions.
69 changes: 69 additions & 0 deletions JC+GAMMA/readSeq.py
@@ -0,0 +1,69 @@
def patterns():
#
import re, os, glob, itertools, fnmatch, sys, shutil
from itertools import combinations
from collections import Counter

script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
# path = os.path.join(script_dir, 'nexus')

genes = []
data = {}
# print 'Reading nexus files...'
for filename in glob.glob(os.path.join(script_dir, '*.nex')):

m = re.match('(.+).nex', os.path.basename(filename))
gene_name = m.group(1)
# print 'gene_name=', gene_name
genes.append(gene_name)
f = open(filename, 'r').read()

m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S)
ntax = int(m.group(1))
# print 'ntax=', ntax

m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S)
nchar = int(m.group(1))
# print 'nchar=', nchar

m = re.search('Matrix\s+(.+?);', f, re.M | re.S)
matrix = m.group(1).strip()
matrix_lines = matrix.split('\n')

taxon_names = []
sequences = {}
sequences_list = []
for line in matrix_lines:
parts = line.strip().split()
assert len(parts) == 2
taxon_name = parts[0]
sequence = parts[1]

taxon_names.append(taxon_name)
sequences_list.append(sequence)
sequences[taxon_name] = sequence

pattern_list = []

k=0
while k < nchar:
site_pattern = ''
for i,m in enumerate(sequences_list):
site_pattern += m[k]
pattern_list.append(site_pattern)
k+=1
pattern_dict = dict()
for i in pattern_list:
pattern_dict[i] = pattern_dict.get(i, 0) + 1

tmp = []
for key in pattern_dict.keys(): ###convert dict to key of tupules
# print 'key=', key
tmp.append((pattern_dict[key],key))

sorted_values = sorted(tmp) ###sorted according to key smaller to larger
sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order

return pattern_dict


Binary file added JC+GAMMA/readSeq.pyc
Binary file not shown.
File renamed without changes.
20 changes: 20 additions & 0 deletions JC+GAMMA/simulated.nex

Large diffs are not rendered by default.

259 changes: 122 additions & 137 deletions treeLike_JCGAMMA.py → JC+GAMMA/test3.py
@@ -1,6 +1,6 @@
##################################
# This script reads a nexus DNA matrix (through module readseq.py) and a newick tree
# topology, and computes log-likelihood of the topology under Jukes Cantor+GAMMA model
# topology, and computes log-likelihood of the topology under Jukes Cantor model
###################################


Expand All @@ -10,17 +10,6 @@ import re, os, itertools, sys, glob
from itertools import chain
from scipy.stats import gamma
from math import exp, log

##########################################################################################
tree_file_name = 'tree.tre'
sequence_file = 'example3.nex'
alpha = 0.5 #gamma shape parameter for rate categories
##########################################################################################





class node(object):
def __init__(self, ndnum): # initialization function
self.rsib = None # right sibling
Expand All @@ -31,7 +20,120 @@ class node(object):
self.descendants = set([ndnum]) # set containing descendant leaf set
self.partial = None # will have length 4*npatterns

def allocatePartial(self, patterns, rates):
if self.number > 0:
npatterns = len(patterns)
# print 'npat', npatterns
self.partial = [0.0]*(4*4*npatterns)
# print len(self.partial)
for i,pattern in enumerate(patterns.keys()):
base = pattern[self.number-1]
for l in range(4):
if base == 'A':
self.partial[i*16+l*4 + 0] = 1.0
elif base == 'C':
self.partial[i*16+l*4 + 1] = 1.0
elif base == 'G':
self.partial[i*16+l*4 + 2] = 1.0
elif base == 'T':
self.partial[i*16+l*4 + 3] = 1.0
else:
assert(False), 'oops, something went horribly wrong!'

else:
# rt = [0.03338775, 0.25191592, 0.82026848, 2.89442785]
# rt = [2.89442785]

# rt = [1.0, 1.0, 1.0, 1.0]

npatterns = len(patterns)
# print 'patterns=', patterns
self.partial = [0.0]*(4*4*npatterns)
like_list = []
for i,pattern in enumerate(patterns.keys()):
# print i, pattern, patterns.keys()
m_list = []
num_pattern = patterns[pattern]
# print num_pattern

for l,m in enumerate(rates):


psame = (0.25+0.75*exp(-4.0*m*(self.lchild.edgelen)/3.0))
pdiff = (0.25-0.25*exp(-4.0*m*(self.lchild.edgelen)/3.0))

psame2 = (0.25+0.75*exp(-4.0*m*(self.lchild.rsib.edgelen)/3.0))
pdiff2 = (0.25-0.25*exp(-4.0*m*(self.lchild.rsib.edgelen)/3.0))

num_pattern = patterns[pattern]
pAA = psame*(self.lchild.partial[i*16+l*4 + 0])
pAC = pdiff*(self.lchild.partial[i*16+l*4 + 1])
pAG = pdiff*(self.lchild.partial[i*16+l*4 + 2])
pAT = pdiff*(self.lchild.partial[i*16+l*4 + 3])

pAA2 = psame2*(self.lchild.rsib.partial[i*16+l*4 + 0])
pAC2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 1])
pAG2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 2])
pAT2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 3])

pfromA_lchild = pAA+pAC+pAG+pAT
pfromA_rchild = pAA2+pAC2+pAG2+pAT2
self.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild


######################################################

pCA = pdiff*(self.lchild.partial[i*16+l*4 + 0])
pCC = psame*(self.lchild.partial[i*16+l*4 + 1])
pCG = pdiff*(self.lchild.partial[i*16+l*4 + 2])
pCT = pdiff*(self.lchild.partial[i*16+l*4 + 3])

pCA2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 0])
pCC2 = psame2*(self.lchild.rsib.partial[i*16+l*4 + 1])
pCG2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 2])
pCT2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 3])

pfromC_lchild = pCA+pCC+pCG+pCT
pfromC_rchild = pCA2+pCC2+pCG2+pCT2
self.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild

#######################################################
#
pGA = pdiff*(self.lchild.partial[i*16+l*4 + 0])
pGC = pdiff*(self.lchild.partial[i*16+l*4 + 1])
pGG = psame*(self.lchild.partial[i*16+l*4 + 2])
pGT = pdiff*(self.lchild.partial[i*16+l*4 + 3])

pGA2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 0])
pGC2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 1])
pGG2 = psame2*(self.lchild.rsib.partial[i*16+l*4 + 2])
pGT2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 3])

pfromG_lchild = pGA+pGC+pGG+pGT
pfromG_rchild = pGA2+pGC2+pGG2+pGT2
self.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild

#######################################################

pTA = pdiff*(self.lchild.partial[i*16+l*4 + 0])
pTC = pdiff*(self.lchild.partial[i*16+l*4 + 1])
pTG = pdiff*(self.lchild.partial[i*16+l*4 + 2])
pTT = psame*(self.lchild.partial[i*16+l*4 + 3])

pTA2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 0])
pTC2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 1])
pTG2 = pdiff2*(self.lchild.rsib.partial[i*16+l*4 + 2])
pTT2 = psame2*(self.lchild.rsib.partial[i*16+l*4 + 3])

pfromT_lchild = pTA+pTC+pTG+pTT
pfromT_rchild = pTA2+pTC2+pTG2+pTT2
self.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild

site_like = (sum(self.partial[i*16:i*16+16]))*0.25*0.25
site_log_like = (log(site_like))*num_pattern
like_list.append(site_log_like)
log_like = sum(like_list)
return log_like


def __str__(self):
Expand All @@ -51,131 +153,16 @@ class node(object):
parstr = '%d' % self.par.number

return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)


def allocatePartial(node, patterns, rates):
if node.number > 0:
npatterns = len(patterns)
# print 'npat', npatterns
node.partial = [0.0]*(4*4*npatterns)
# print len(node.partial)
for i,pattern in enumerate(patterns.keys()):
base = pattern[node.number-1]
for l in range(4):
if base == 'A':
node.partial[i*16+l*4 + 0] = 1.0
elif base == 'C':
node.partial[i*16+l*4 + 1] = 1.0
elif base == 'G':
node.partial[i*16+l*4 + 2] = 1.0
elif base == 'T':
node.partial[i*16+l*4 + 3] = 1.0
else:
assert(False), 'oops, something went horribly wrong!'

else:
# rt = [0.03338775, 0.25191592, 0.82026848, 2.89442785]
# rt = [2.89442785]

# rt = [1.0, 1.0, 1.0, 1.0]

npatterns = len(patterns)
# print 'patterns=', patterns
node.partial = [0.0]*(4*4*npatterns)
like_list = []
for i,pattern in enumerate(patterns.keys()):
# print i, pattern, patterns.keys()
m_list = []
num_pattern = patterns[pattern]
# print num_pattern

for l,m in enumerate(rates):


psame = (0.25+0.75*exp(-4.0*m*(node.lchild.edgelen)/3.0))
pdiff = (0.25-0.25*exp(-4.0*m*(node.lchild.edgelen)/3.0))

psame2 = (0.25+0.75*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0))
pdiff2 = (0.25-0.25*exp(-4.0*m*(node.lchild.rsib.edgelen)/3.0))

num_pattern = patterns[pattern]
pAA = psame*(node.lchild.partial[i*16+l*4 + 0])
pAC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
pAG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
pAT = pdiff*(node.lchild.partial[i*16+l*4 + 3])

pAA2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 0])
pAC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
pAG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
pAT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])

pfromA_lchild = pAA+pAC+pAG+pAT
pfromA_rchild = pAA2+pAC2+pAG2+pAT2
node.partial[i*16+l*4 + 0] = pfromA_lchild*pfromA_rchild


######################################################

pCA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
pCC = psame*(node.lchild.partial[i*16+l*4 + 1])
pCG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
pCT = pdiff*(node.lchild.partial[i*16+l*4 + 3])

pCA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
pCC2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 1])
pCG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
pCT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])

pfromC_lchild = pCA+pCC+pCG+pCT
pfromC_rchild = pCA2+pCC2+pCG2+pCT2
node.partial[i*16+l*4 + 1] = pfromC_lchild*pfromC_rchild

#######################################################
#
pGA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
pGC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
pGG = psame*(node.lchild.partial[i*16+l*4 + 2])
pGT = pdiff*(node.lchild.partial[i*16+l*4 + 3])

pGA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
pGC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
pGG2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 2])
pGT2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 3])

pfromG_lchild = pGA+pGC+pGG+pGT
pfromG_rchild = pGA2+pGC2+pGG2+pGT2
node.partial[i*16+l*4 + 2] = pfromG_lchild*pfromG_rchild

#######################################################

pTA = pdiff*(node.lchild.partial[i*16+l*4 + 0])
pTC = pdiff*(node.lchild.partial[i*16+l*4 + 1])
pTG = pdiff*(node.lchild.partial[i*16+l*4 + 2])
pTT = psame*(node.lchild.partial[i*16+l*4 + 3])

pTA2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 0])
pTC2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 1])
pTG2 = pdiff2*(node.lchild.rsib.partial[i*16+l*4 + 2])
pTT2 = psame2*(node.lchild.rsib.partial[i*16+l*4 + 3])

pfromT_lchild = pTA+pTC+pTG+pTT
pfromT_rchild = pTA2+pTC2+pTG2+pTT2
node.partial[i*16+l*4 + 3] = pfromT_lchild*pfromT_rchild

site_like = (sum(node.partial[i*16:i*16+16]))*0.25*0.25
site_log_like = (log(site_like))*num_pattern
like_list.append(site_log_like)
log_like = sum(like_list)
return log_like


def treenewick():
script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
path = os.path.join(script_dir, tree_file_name)
# print script_dir
path = os.path.join(script_dir, 'tree.tre')
with open(path, 'r') as content:
newick = content.read()
return newick
#
a = treenewick()


def gammaRates(alpha):
bounds = [0.0, 0.25, 0.50, 0.75, 1.]
Expand All @@ -191,7 +178,7 @@ def gammaRates(alpha):
def prepareTree(postorder, patterns, rates):
likelihood_lists = []
for nd in postorder:
likelihood_lists.append(allocatePartial(nd, patterns, rates))
likelihood_lists.append(nd.allocatePartial(patterns, rates))
print 'log-likelihood of the topology =', likelihood_lists[-1]


Expand Down Expand Up @@ -319,8 +306,6 @@ def readnewick(tree):

post = pre[:]
post.reverse()
for nd in post:
print nd.number, nd.edgelen
return post

def Makenewick(pre):
Expand Down Expand Up @@ -407,7 +392,7 @@ if __name__ == '__main__':
# print ' newick: ',newick[0]

alpha = 0.5 ### gamma shape parameter rate categories
# yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)'
yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)'
rates_list = gammaRates(alpha)
postorder = readnewick(treenewick())
result = prepareTree(postorder, readSeq.patterns(sequence_file), rates_list)
postorder = readnewick(yuletree)
result = prepareTree(postorder, readSeq.patterns(), rates_list)
1 change: 1 addition & 0 deletions JC+GAMMA/tree.tre
@@ -0,0 +1 @@
(((1:0.54019,(5:0.40299,10:0.40299):0.1372):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.1112)

0 comments on commit d51978f

Please sign in to comment.