diff --git a/readseqs.py b/readseqs.py deleted file mode 100644 index 8667f86..0000000 --- a/readseqs.py +++ /dev/null @@ -1,72 +0,0 @@ -def patterns(): - - - import re, os, glob, itertools, fnmatch, sys, shutil - from itertools import combinations - from collections import Counter - - script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) - path = os.path.join(script_dir, 'nexus') - - - genes = [] - data = {} -# print 'Reading nexus files...' - for filename in glob.glob(os.path.join(path, '*.nex')): - - m = re.match('(.+).nex', os.path.basename(filename)) - gene_name = m.group(1) -# print 'gene_name=', gene_name - genes.append(gene_name) - f = open(filename, 'r').read() - - m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S) - ntax = int(m.group(1)) -# print 'ntax=', ntax - - m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S) - nchar = int(m.group(1)) -# print 'nchar=', nchar - - - m = re.search('Matrix\s+(.+?);', f, re.M | re.S) - matrix = m.group(1).strip() - matrix_lines = matrix.split('\n') - - taxon_names = [] - sequences = {} - sequences_list = [] - for line in matrix_lines: - parts = line.strip().split() - assert len(parts) == 2 - taxon_name = parts[0] - sequence = parts[1] - - taxon_names.append(taxon_name) - sequences_list.append(sequence) - sequences[taxon_name] = sequence - - pattern_list = [] - - k=0 - while k < nchar: - site_pattern = '' - for i,m in enumerate(sequences_list): - site_pattern += m[k] - pattern_list.append(site_pattern) - k+=1 - pattern_dict = dict() - for i in pattern_list: - pattern_dict[i] = pattern_dict.get(i, 0) + 1 - - tmp = [] - for key in pattern_dict.keys(): ###convert dict to key of tupules -# print 'key=', key - tmp.append((pattern_dict[key],key)) - - sorted_values = sorted(tmp) ###sorted according to key smaller to larger - sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order - - return pattern_dict - - diff --git a/tree_test.py b/tree_test.py index 2cbf27e..ea4d760 100644 --- a/tree_test.py +++ b/tree_test.py @@ -21,28 +21,14 @@ def __init__(self, ndnum): # initialization function self.descendants = set([ndnum]) # set containing descendant leaf set self.partial = None # will have length 4*npatterns self.state = None -# self.probsame = 0.0 -# self.probdiff = 0.0 def allocatePartial(self, patterns): - # skip internals for now -# print 'self.number, self.edgelen =',self.number, self.edgelen if self.number > 0: - # patterns['ACGTT'] = 51 npatterns = len(patterns) self.partial = [0.0]*(4*npatterns) - -# print 'self.partial=', self.partial - for i,pattern in enumerate(patterns.keys()): -# print 'patterns=', patterns -# print 'pattern=', pattern -# print patterns[pattern] - - base = pattern[self.number-1] -# print 'i = %d, pattern = %s, base = %s' % (i,pattern,base) if base == 'A': self.partial[i*4 + 0] = 1.0 elif base == 'C': @@ -54,28 +40,12 @@ def allocatePartial(self, patterns): else: assert(False), 'oops, something went horribly wrong!' - -# print self.partial -# print '****************************************************************************************************************' - else: npatterns = len(patterns) -# print 'npatterns=', npatterns self.partial = [0.0]*(4*npatterns) like_list = [] for i,pattern in enumerate(patterns.keys()): -# print 'patterns=', patterns -# print 'pattern=', pattern num_pattern = patterns[pattern] -# print 'edge_length==', self.lchild.edgelen -# print 'test1=', self.lchild.partial[i*4 + 0], self.lchild.rsib.partial[i*4 + 0] -# print 'test2=', self.lchild.partial[i*4 + 1], self.lchild.rsib.partial[i*4 + 1] -# print 'test3=', self.lchild.partial[i*4 + 2], self.lchild.rsib.partial[i*4 + 2] -# print 'test4=', self.lchild.partial[i*4 + 3], self.lchild.rsib.partial[i*4 + 3] -# print'____________________' -# print 'test5=', self.lchild.edgelen -# print 'test6=', self.lchild.rsib.edgelen -# print'____________________' pAA = (0.25+0.75*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 0]) pAC = (0.25-0.25*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 1]) pAG = (0.25-0.25*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 2]) @@ -106,17 +76,17 @@ def allocatePartial(self, patterns): pfromC_rchild = pCA2+pCC2+pCG2+pCT2 self.partial[i*4 + 1] = pfromC_lchild*pfromC_rchild ############################################################################################################################## -# + pGA = (0.25-0.25*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 0]) pGC = (0.25-0.25*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 1]) pGG = (0.25+0.75*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 2]) pGT = (0.25-0.25*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 3]) -# + pGA2 = (0.25-0.25*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))*(self.lchild.rsib.partial[i*4 + 0]) pGC2 = (0.25-0.25*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))*(self.lchild.rsib.partial[i*4 + 1]) pGG2 = (0.25+0.75*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))*(self.lchild.rsib.partial[i*4 + 2]) pGT2 = (0.25-0.25*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))*(self.lchild.rsib.partial[i*4 + 3]) -# + pfromG_lchild = pGA+pGC+pGG+pGT pfromG_rchild = pGA2+pGC2+pGG2+pGT2 self.partial[i*4 + 2] = pfromG_lchild*pfromG_rchild @@ -125,102 +95,24 @@ def allocatePartial(self, patterns): pTC = (0.25-0.25*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 1]) pTG = (0.25-0.25*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 2]) pTT = (0.25+0.75*exp(-4.0*(self.lchild.edgelen)/3.0))*(self.lchild.partial[i*4 + 3]) -# + pTA2 = (0.25-0.25*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))*(self.lchild.rsib.partial[i*4 + 0]) pTC2 = (0.25-0.25*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))*(self.lchild.rsib.partial[i*4 + 1]) pTG2 = (0.25-0.25*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))*(self.lchild.rsib.partial[i*4 + 2]) pTT2 = (0.25+0.75*exp(-4.0*(self.lchild.rsib.edgelen)/3.0))*(self.lchild.rsib.partial[i*4 + 3]) -# + pfromT_lchild = pTA+pTC+pTG+pTT pfromT_rchild = pTA2+pTC2+pTG2+pTT2 self.partial[i*4 + 3] = pfromT_lchild*pfromT_rchild ############################################################################################################################## -# print '~~~~~~~~~~~~~~~~~~~~' -# print 'num_pattern=', num_pattern -# print self.partial -# print self.partial[i*4:i*4+4] -# site_like2 = sum(self.partial[i:i+4])*num_pattern site_like = (log((sum(self.partial[i*4:i*4+4]))*0.25))*num_pattern # print 'site_like=', site_like like_list.append(site_like) -# print '~~~~~~~~~~~~~~~~~~~~' - - Like = sum(like_list) print 'Like=', Like - - print '**************************************************' - ############################################################### - ############################################################### - - def simulateSequences(self, ran_nm): - print '*****************START***********************' - -# print self.number - freq = [0.25, 0.25, 0.25, 0.25] -# ran_num = [] -# for i in range(8): -# u = random.random() -# ran_num.append(u) -# print 'ran_num=', ran_num -# print p[0] - current_states = [ 'A', 'C', 'G', 'T'] - collect = [] -# r = 0 - print 'ran_nm=0', ran_nm - if self.par is None: - def root(): - if ran_nm < freq[0]: - self.state = 'A' - elif ran_nm <= freq[0]+freq[1]: - self.state = 'C' - elif ran_nm <= freq[0]+freq[1]+freq[2]: - self.state = 'G' - else: - self.state = 'T' -# print 'self.state', self.state - root2 = root() -# print 'ran_num1=', ran_num[r] -# print 'ran_num2=', ran_num[r] - else: - print 'self.number=', self.number - print 'ran_nm=', ran_nm - - prob = [] - for i in current_states: -# print 'self.par.state=', self.par.state - if self.par.state == i: - p = (0.25+0.75*exp(-4.0*(self.edgelen)/3.0)) - prob.append(p) - else: - p = (0.25-0.25*exp(-4.0*(self.edgelen)/3.0)) - prob.append(p) -# print 'prob=', prob -# - for i in prob: -# print 'r=', r -# print 'ran_num[r]', ran_num[r] - if ran_nm <= prob[0]: - self.state = 'A' -# print self.state - elif ran_nm <= prob[0]+ prob[1]: - self.state = 'C' -# print self.state - elif ran_nm <= prob[0]+ prob[1]+ prob[2]: - self.state = 'G' -# print self.state - else: - self.state = 'T' - print self.state -# print 'ran_num_other=', ran_num[r] - - print '********************END**********************************' -# print 'ran_num==', ran_num - - def __str__(self): # __str__ is a built-in function that is used by print to show an object descendants_as_string = ','.join(['%d' % d for d in self.descendants]) @@ -246,21 +138,6 @@ def prepareTree(postorder, patterns): for nd in postorder: nd.allocatePartial(patterns) -def random1(): - ran_num = [] - for i in range(9): - u = random.random() - ran_num.append(u) - print 'ran_num=', ran_num - return ran_num - - -def simulate(preorder, ran_num): - print 'ran_num_list=', ran_num - r = 0 - for nd in preorder: - nd.simulateSequences(ran_num[r]) - r+=1 def joinRandomPair(node_list, next_node_number, is_deep_coalescence): # pick first of two lineages to join and delete from node_list @@ -296,8 +173,6 @@ def makeNewick(nd, brlen_scaler = 1.0, start = True): # _newick = '' _TL = 0.0 -# print - if nd.lchild: _newick += '(' makeNewick(nd.lchild, brlen_scaler, False) @@ -365,13 +240,6 @@ def readnewick(tree): child.par=nd nd=child - # nd.edgelen = int_br2[len_int_br] - # len_int_br -=1 - - # print 'nd=', nd.number - # print 'nd.edgelen', nd.edgelen - -# print '*******************' elif m == ',': internal_node_number -= 1 # print 'internal_node_number=', internal_node_number @@ -380,16 +248,8 @@ def readnewick(tree): nd.rsib = rsib rsib.par=nd.par nd = rsib - # len_int_br +=1 - - # nd.edgelen = int_br2[len_int_br] - # len_int_br -=1 - - # print 'nd=', nd.number -# print '...................' elif m == ')': nd = nd.par -# print '+++++++++++++++++++' elif m == ':': edge_len_str = '' @@ -403,7 +263,6 @@ def readnewick(tree): i -=1 nd.edgelen = float(edge_len_str) -# print '<<<<<<<<<<<<<<<<<<<<' else: internal_node_number += 1 @@ -417,40 +276,14 @@ def readnewick(tree): i += 1 m = tree[i] - # nd.edgelen = ext_br2[t] nd.number = int(mm) - # print 'ext_node', mm i -= 1 -# print '^^^^^^^^^^^^^^^^^^^^^^^^' - -# print 'nd.number=', nd.number -# print 'nd.edgelen=', nd.edgelen i += 1 -# print '############################ preorder ############################' -# for nd in pre: -# print nd.number - -# print '############################ postorder1 ############################' -# post = pre[:] -# post.reverse() -# for nd in post: -# print nd.number - -# print '############################ postorder2 ############################' -# for i in range(len(pre)): -# print pre[-i-1].number - -# for nd in pre: -# print nd.number - - return pre -# print post -# y= getPostorder(root) - - - + post = pre[:] + post.reverse() + return post def Makenewick(pre): newickstring = '' @@ -546,21 +379,11 @@ def calcExpectedHeight(num_species, mu_over_s): yuletree = '(((1:0.03915,5:0.03915):0.387,(4:0.42253,2:0.42253):0.004):0.118,3:0.54433)' -# postorder = readnewick(yuletree) - preorder = readnewick(yuletree) + postorder = readnewick(yuletree) +# preorder = readnewick(yuletree) # for nd in postorder: # print nd.number -# y = prepareTree(postorder, readseq.patterns()) + y = prepareTree(postorder, readseq.patterns()) # z = readnewick(yuletree) - a = simulate(preorder, random1()) -# print a -# for nd in z: -# print nd.number, nd.edgelen -# for nd in postorder: -# nd.allocatePartial(patterns) -# -# prepareTree = prepareTree(readnewick, readseq.patterns()) -# print prepareTree - -###########################read sequences######### +# a = simulate(preorder, random1())