diff --git a/simSequences.py b/simSequences.py new file mode 100644 index 0000000..f753091 --- /dev/null +++ b/simSequences.py @@ -0,0 +1,191 @@ + +# git clone https://github.uconn.edu/sun13005/sntree.git . +# git status +# git add +# git commit -am "some message" +# git push +# git log +# git show fe580563:tree.py > tmp.py --> to extract older version +import readseq +import random +import re, os, itertools, sys, glob +from itertools import chain +from math import exp, log +class node(object): + def __init__(self, ndnum): # initialization function + self.rsib = None # right sibling + self.lchild = None # left child + self.par = None # parent node + self.number = ndnum # node number (internals negative, tips 0 or positive) + self.edgelen = 0.0 # branch length + self.descendants = set([ndnum]) # set containing descendant leaf set + self.partial = None # will have length 4*npatterns + self.state = None + self.states = None + + + def simulateSequences(self, num_sites): + self.states = [str]*(num_sites) + freq = [0.25, 0.25, 0.25, 0.25] + current_states = [ 'A', 'C', 'G', 'T'] + if self.par is None: + + for i in range(num_sites): + + ran_nm = random.random() + if ran_nm < freq[0]: + self.state = 'A' + self.states[i] = 'A' + elif ran_nm <= freq[0]+freq[1]: + self.state = 'C' + self.states[i] = 'C' + elif ran_nm <= freq[0]+freq[1]+freq[2]: + self.state = 'G' + self.states[i] = 'G' + + else: + self.state = 'T' + self.states[i] = 'T' + + else: + for m in range(num_sites): + prob = [] + ran_nm = random.random() + + for i in current_states: + if self.par.states[m] == i: + p = (0.25+0.75*exp(-4.0*(self.edgelen)/3.0)) + prob.append(p) + else: + p = (0.25-0.25*exp(-4.0*(self.edgelen)/3.0)) + prob.append(p) + for i in prob: + + if ran_nm <= prob[0]: + self.state = 'A' + self.states[m] = 'A' + + elif ran_nm <= prob[0]+ prob[1]: + self.state = 'C' + self.states[m] = 'C' + + elif ran_nm <= prob[0]+ prob[1]+ prob[2]: + self.state = 'G' + self.states[m] = 'G' + + else: + self.state = 'T' + self.states[m] = 'T' + + return self.states + + + def __str__(self): + # __str__ is a built-in function that is used by print to show an object + descendants_as_string = ','.join(['%d' % d for d in self.descendants]) + + lchildstr = 'None' + if self.lchild is not None: + lchildstr = '%d' % self.lchild.number + + rsibstr = 'None' + if self.rsib is not None: + rsibstr = '%d' % self.rsib.number + + parstr = 'None' + if self.par is not None: + parstr = '%d' % self.par.number + + return 'node: number=%d edgelen=%g lchild=%s rsib=%s parent=%s descendants=[%s]' % (self.number, self.edgelen, lchildstr, rsibstr, parstr, descendants_as_string) + + + + +def simulate2(preorder, ntax, num_sites, out): + newf = open(out, 'w') + newf.write('#nexus\n\n') + newf.write('begin data;\n') + newf.write('dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites)) + newf.write('format datatype=dna missing=? gap=-;\n') + newf.write('matrix\n') + master = {} + for nd in preorder: + master[nd.number]=nd.simulateSequences(num_sites) + if nd.number >0: + newf.write('%s %s\n' % (nd.number, ''.join(nd.simulateSequences(num_sites)))) + newf.write(';\n') + newf.write('end;') + + + + +def readnewick(tree): + total_length = len(tree) + internal_node_number = -1 + + root = node(internal_node_number) + nd = root + i = 0 + pre = [root] + while i < total_length: + m = tree[i] + + if m =='(': + internal_node_number -= 1 + + child = node(internal_node_number) + pre.append(child) + nd.lchild=child + + child.par=nd + nd=child + elif m == ',': + internal_node_number -= 1 + rsib = node(internal_node_number) + pre.append(rsib) + nd.rsib = rsib + rsib.par=nd.par + nd = rsib + elif m == ')': + nd = nd.par + + elif m == ':': + edge_len_str = '' + i+=1 + m = tree[i] + assert m in ['0','1','2','3','4','5','6','7','8', '9','.'] + while m in ['0','1','2','3','4','5','6','7','8', '9','.']: + edge_len_str += m + i+=1 + m = tree[i] + i -=1 + nd.edgelen = float(edge_len_str) + else: + internal_node_number += 1 + + if True: + assert m in ['0','1','2','3','4','5','6','7','8', '9'], 'Error : expecting m to be a digit when in fact it was "%s"' % m + mm = '' + while m in ['0','1','2','3','4','5','6','7','8', '9' ]: + + mm += m + i += 1 + m = tree[i] + nd.number = int(mm) + i -= 1 + i += 1 + + return pre + +if __name__ == '__main__': + + output_filename = os.path.join('simulated_output.nexus') + + yuletree = '(((1:0.54019,(5:0.40299,10:0.40299):0.13720):0.72686,(6:0.10576,4:0.10576):1.16129):0.42537,(2:0.58122,(9:0.21295,(7:0.16691,(8:0.14622,3:0.14622):0.02069):0.04604):0.36827):1.11120)' + + preorder = readnewick(yuletree) + ntax = 10 + num_sites = 10000 + + result = simulate2(preorder, ntax, num_sites, output_filename) + diff --git a/tree_test.py b/tree_test.py index c013706..2cbf27e 100644 --- a/tree_test.py +++ b/tree_test.py @@ -20,6 +20,7 @@ def __init__(self, ndnum): # initialization function self.edgelen = 0.0 # branch length self.descendants = set([ndnum]) # set containing descendant leaf set self.partial = None # will have length 4*npatterns + self.state = None # self.probsame = 0.0 # self.probdiff = 0.0 @@ -151,6 +152,74 @@ def allocatePartial(self, patterns): print 'Like=', Like print '**************************************************' + ############################################################### + ############################################################### + + def simulateSequences(self, ran_nm): + print '*****************START***********************' + +# print self.number + freq = [0.25, 0.25, 0.25, 0.25] +# ran_num = [] +# for i in range(8): +# u = random.random() +# ran_num.append(u) +# print 'ran_num=', ran_num +# print p[0] + current_states = [ 'A', 'C', 'G', 'T'] + collect = [] +# r = 0 + print 'ran_nm=0', ran_nm + if self.par is None: + def root(): + if ran_nm < freq[0]: + self.state = 'A' + elif ran_nm <= freq[0]+freq[1]: + self.state = 'C' + elif ran_nm <= freq[0]+freq[1]+freq[2]: + self.state = 'G' + else: + self.state = 'T' +# print 'self.state', self.state + + root2 = root() +# print 'ran_num1=', ran_num[r] +# print 'ran_num2=', ran_num[r] + else: + print 'self.number=', self.number + print 'ran_nm=', ran_nm + + prob = [] + for i in current_states: +# print 'self.par.state=', self.par.state + if self.par.state == i: + p = (0.25+0.75*exp(-4.0*(self.edgelen)/3.0)) + prob.append(p) + else: + p = (0.25-0.25*exp(-4.0*(self.edgelen)/3.0)) + prob.append(p) +# print 'prob=', prob +# + for i in prob: +# print 'r=', r +# print 'ran_num[r]', ran_num[r] + if ran_nm <= prob[0]: + self.state = 'A' +# print self.state + elif ran_nm <= prob[0]+ prob[1]: + self.state = 'C' +# print self.state + elif ran_nm <= prob[0]+ prob[1]+ prob[2]: + self.state = 'G' +# print self.state + else: + self.state = 'T' + print self.state +# print 'ran_num_other=', ran_num[r] + + print '********************END**********************************' +# print 'ran_num==', ran_num + def __str__(self): # __str__ is a built-in function that is used by print to show an object @@ -175,9 +244,23 @@ def __str__(self): def prepareTree(postorder, patterns): for nd in postorder: - nd.allocatePartial(patterns) +def random1(): + ran_num = [] + for i in range(9): + u = random.random() + ran_num.append(u) + print 'ran_num=', ran_num + return ran_num + + +def simulate(preorder, ran_num): + print 'ran_num_list=', ran_num + r = 0 + for nd in preorder: + nd.simulateSequences(ran_num[r]) + r+=1 def joinRandomPair(node_list, next_node_number, is_deep_coalescence): # pick first of two lineages to join and delete from node_list @@ -350,8 +433,8 @@ def readnewick(tree): # print nd.number # print '############################ postorder1 ############################' - post = pre[:] - post.reverse() +# post = pre[:] +# post.reverse() # for nd in post: # print nd.number @@ -359,13 +442,13 @@ def readnewick(tree): # for i in range(len(pre)): # print pre[-i-1].number +# for nd in pre: +# print nd.number - return post + return pre # print post # y= getPostorder(root) -# for nd in pre: -# print nd.number @@ -441,7 +524,7 @@ def calcExpectedHeight(num_species, mu_over_s): if __name__ == '__main__': - random_seed = 24223 # 7632557, 12345 + random_seed = 24553 # 7632557, 12345 number_of_species = 5 mutation_speciation_rate_ratio = 0.4 # 0.689655172 # yields tree height 1 for 6 species random.seed(random_seed) @@ -463,12 +546,15 @@ def calcExpectedHeight(num_species, mu_over_s): yuletree = '(((1:0.03915,5:0.03915):0.387,(4:0.42253,2:0.42253):0.004):0.118,3:0.54433)' - postorder = readnewick(yuletree) +# postorder = readnewick(yuletree) + preorder = readnewick(yuletree) # for nd in postorder: # print nd.number - y = prepareTree(postorder, readseq.patterns()) - z = readnewick(yuletree) +# y = prepareTree(postorder, readseq.patterns()) +# z = readnewick(yuletree) + a = simulate(preorder, random1()) +# print a # for nd in z: # print nd.number, nd.edgelen # for nd in postorder: