Skip to content

Commit

Permalink
simple script to simulate nucleotide sequences under a tree and Jukes…
Browse files Browse the repository at this point in the history
… Cantor model
  • Loading branch information
sun13005 committed Apr 21, 2017
1 parent 35eb323 commit 00e8a42
Show file tree
Hide file tree
Showing 2 changed files with 287 additions and 10 deletions.
191 changes: 191 additions & 0 deletions simSequences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@

# git clone https://github.uconn.edu/sun13005/sntree.git .
# git status
# git add <file name>
# git commit -am "some message"
# git push
# git log
# git show fe580563:tree.py > tmp.py --> to extract older version
import readseq
import random
import re, os, itertools, sys, glob
from itertools import chain
from math import exp, log
class node(object):
def __init__(self, ndnum): # initialization function
self.rsib = None # right sibling
self.lchild = None # left child
self.par = None # parent node
self.number = ndnum # node number (internals negative, tips 0 or positive)
self.edgelen = 0.0 # branch length
self.descendants = set([ndnum]) # set containing descendant leaf set
self.partial = None # will have length 4*npatterns
self.state = None
self.states = None


def simulateSequences(self, num_sites):
self.states = [str]*(num_sites)
freq = [0.25, 0.25, 0.25, 0.25]
current_states = [ 'A', 'C', 'G', 'T']
if self.par is None:

for i in range(num_sites):

ran_nm = random.random()
if ran_nm < freq[0]:
self.state = 'A'
self.states[i] = 'A'
elif ran_nm <= freq[0]+freq[1]:
self.state = 'C'
self.states[i] = 'C'
elif ran_nm <= freq[0]+freq[1]+freq[2]:
self.state = 'G'
self.states[i] = 'G'

else:
self.state = 'T'
self.states[i] = 'T'

else:
for m in range(num_sites):
prob = []
ran_nm = random.random()

for i in current_states:
if self.par.states[m] == i:
p = (0.25+0.75*exp(-4.0*(self.edgelen)/3.0))
prob.append(p)
else:
p = (0.25-0.25*exp(-4.0*(self.edgelen)/3.0))
prob.append(p)
for i in prob:

if ran_nm <= prob[0]:
self.state = 'A'
self.states[m] = 'A'

elif ran_nm <= prob[0]+ prob[1]:
self.state = 'C'
self.states[m] = 'C'

elif ran_nm <= prob[0]+ prob[1]+ prob[2]:
self.state = 'G'
self.states[m] = 'G'

else:
self.state = 'T'
self.states[m] = 'T'

return self.states


def __str__(self):
# __str__ is a built-in function that is used by print to show an object
descendants_as_string = ','.join(['%d' % d for d in self.descendants])

lchildstr = 'None'
if self.lchild is not None:
lchildstr = '%d' % self.lchild.number

rsibstr = 'None'
if self.rsib is not None:
rsibstr = '%d' % self.rsib.number

parstr = 'None'
if self.par is not None:
parstr = '%d' % self.par.number

return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string)




def simulate2(preorder, ntax, num_sites, out):
newf = open(out, 'w')
newf.write('#nexus\n\n')
newf.write('begin data;\n')
newf.write('dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites))
newf.write('format datatype=dna missing=? gap=-;\n')
newf.write('matrix\n')
master = {}
for nd in preorder:
master[nd.number]=nd.simulateSequences(num_sites)
if nd.number >0:
newf.write('%s %s\n' % (nd.number, ''.join(nd.simulateSequences(num_sites))))
newf.write(';\n')
newf.write('end;')




def readnewick(tree):
total_length = len(tree)
internal_node_number = -1

root = node(internal_node_number)
nd = root
i = 0
pre = [root]
while i < total_length:
m = tree[i]

if m =='(':
internal_node_number -= 1

child = node(internal_node_number)
pre.append(child)
nd.lchild=child

child.par=nd
nd=child
elif m == ',':
internal_node_number -= 1
rsib = node(internal_node_number)
pre.append(rsib)
nd.rsib = rsib
rsib.par=nd.par
nd = rsib
elif m == ')':
nd = nd.par

elif m == ':':
edge_len_str = ''
i+=1
m = tree[i]
assert m in ['0','1','2','3','4','5','6','7','8', '9','.']
while m in ['0','1','2','3','4','5','6','7','8', '9','.']:
edge_len_str += m
i+=1
m = tree[i]
i -=1
nd.edgelen = float(edge_len_str)
else:
internal_node_number += 1

if True:
assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m
mm = ''
while m in ['0','1','2','3','4','5','6','7','8', '9' ]:

mm += m
i += 1
m = tree[i]
nd.number = int(mm)
i -= 1
i += 1

return pre

if __name__ == '__main__':

output_filename = os.path.join('simulated_output.nexus')

yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.13720):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.11120)'

preorder = readnewick(yuletree)
ntax = 10
num_sites = 10000

result = simulate2(preorder, ntax, num_sites, output_filename)

106 changes: 96 additions & 10 deletions tree_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self, ndnum): # initialization function
self.edgelen = 0.0 # branch length
self.descendants = set([ndnum]) # set containing descendant leaf set
self.partial = None # will have length 4*npatterns
self.state = None
# self.probsame = 0.0
# self.probdiff = 0.0

Expand Down Expand Up @@ -151,6 +152,74 @@ def allocatePartial(self, patterns):
print 'Like=', Like

print '**************************************************'
###############################################################
###############################################################

def simulateSequences(self, ran_nm):
print '*****************START***********************'

# print self.number
freq = [0.25, 0.25, 0.25, 0.25]
# ran_num = []
# for i in range(8):
# u = random.random()
# ran_num.append(u)
# print 'ran_num=', ran_num
# print p[0]
current_states = [ 'A', 'C', 'G', 'T']
collect = []
# r = 0
print 'ran_nm=0', ran_nm
if self.par is None:
def root():
if ran_nm < freq[0]:
self.state = 'A'
elif ran_nm <= freq[0]+freq[1]:
self.state = 'C'
elif ran_nm <= freq[0]+freq[1]+freq[2]:
self.state = 'G'
else:
self.state = 'T'
# print 'self.state', self.state

root2 = root()
# print 'ran_num1=', ran_num[r]
# print 'ran_num2=', ran_num[r]
else:
print 'self.number=', self.number
print 'ran_nm=', ran_nm

prob = []
for i in current_states:
# print 'self.par.state=', self.par.state
if self.par.state == i:
p = (0.25+0.75*exp(-4.0*(self.edgelen)/3.0))
prob.append(p)
else:
p = (0.25-0.25*exp(-4.0*(self.edgelen)/3.0))
prob.append(p)
# print 'prob=', prob
#
for i in prob:
# print 'r=', r
# print 'ran_num[r]', ran_num[r]
if ran_nm <= prob[0]:
self.state = 'A'
# print self.state
elif ran_nm <= prob[0]+ prob[1]:
self.state = 'C'
# print self.state
elif ran_nm <= prob[0]+ prob[1]+ prob[2]:
self.state = 'G'
# print self.state
else:
self.state = 'T'
print self.state
# print 'ran_num_other=', ran_num[r]

print '********************END**********************************'
# print 'ran_num==', ran_num


def __str__(self):
# __str__ is a built-in function that is used by print to show an object
Expand All @@ -175,9 +244,23 @@ def __str__(self):

def prepareTree(postorder, patterns):
for nd in postorder:

nd.allocatePartial(patterns)

def random1():
ran_num = []
for i in range(9):
u = random.random()
ran_num.append(u)
print 'ran_num=', ran_num
return ran_num


def simulate(preorder, ran_num):
print 'ran_num_list=', ran_num
r = 0
for nd in preorder:
nd.simulateSequences(ran_num[r])
r+=1

def joinRandomPair(node_list, next_node_number, is_deep_coalescence):
# pick first of two lineages to join and delete from node_list
Expand Down Expand Up @@ -350,22 +433,22 @@ def readnewick(tree):
# print nd.number

# print '############################ postorder1 ############################'
post = pre[:]
post.reverse()
# post = pre[:]
# post.reverse()
# for nd in post:
# print nd.number

# print '############################ postorder2 ############################'
# for i in range(len(pre)):
# print pre[-i-1].number

# for nd in pre:
# print nd.number

return post
return pre
# print post
# y= getPostorder(root)

# for nd in pre:
# print nd.number



Expand Down Expand Up @@ -441,7 +524,7 @@ def calcExpectedHeight(num_species, mu_over_s):


if __name__ == '__main__':
random_seed = 24223 # 7632557, 12345
random_seed = 24553 # 7632557, 12345
number_of_species = 5
mutation_speciation_rate_ratio = 0.4 # 0.689655172 # yields tree height 1 for 6 species
random.seed(random_seed)
Expand All @@ -463,12 +546,15 @@ def calcExpectedHeight(num_species, mu_over_s):

yuletree = '(((1:0.03915,5:0.03915):0.387,(4:0.42253,2:0.42253):0.004):0.118,3:0.54433)'

postorder = readnewick(yuletree)
# postorder = readnewick(yuletree)
preorder = readnewick(yuletree)

# for nd in postorder:
# print nd.number
y = prepareTree(postorder, readseq.patterns())
z = readnewick(yuletree)
# y = prepareTree(postorder, readseq.patterns())
# z = readnewick(yuletree)
a = simulate(preorder, random1())
# print a
# for nd in z:
# print nd.number, nd.edgelen
# for nd in postorder:
Expand Down

0 comments on commit 00e8a42

Please sign in to comment.