creates randomly generted sequences with no underlying topology

sun13005 · Apr 22, 2017 · 1a72916 · 1a72916
1 parent 60a48bf
commit 1a72916
Showing 1 changed file with 212 additions and 0 deletions.
diff --git a/randomseq.py b/randomseq.py
@@ -0,0 +1,212 @@
+import sys, re, os, random
+
+
+# number1 = raw_input('number_of_runs? > ')
+# num_run = int(number1)
+# #print num_run
+# 
+# ###specifying number of taxa to be generated
+# number2 = raw_input('number_of_taxa? > ')
+# ntax = int(number2)
+# 
+# ###specifying number of sites to be generated
+# number3 = raw_input('number of sites? > ')
+# num_sites = int(number3)
+# 
+# ###specifying number of dna matrix to be generated
+# number4 = raw_input('number of genes? > ')
+# number_genes = int(number4)
+
+
+
+
+num_run = 1
+#print num_run
+
+###specifying number of taxa to be generated
+ntax = 6
+
+###specifying number of sites to be generated
+num_sites = 10000
+
+###specifying number of dna matrix to be generated
+number_genes = 10
+
+num_trials_pergene = 2
+
+
+for i in range(num_run):
+    run_name = i+1
+    run_name2 = 'trial'+ str(run_name)
+    print run_name2    
+    master_dir = os.path.join(os.path.abspath(os.curdir), run_name2) 
+    if not os.path.exists(master_dir):
+        os.mkdir(master_dir)
+
+
+    ###creating a list for the directories for each randomly generated dna matrix
+    all_seq = []
+    sets = []
+    for a, b in enumerate(range(number_genes)):
+        a += 1
+        set = "gene"+ str(a)
+        sets.append(set)
+
+    ### creating dna matrix with randomly generated sequences
+    for n, i in enumerate(sets):
+        ### specifying names of the directory and dna matrix 
+        folder_name = i
+        n += 1
+        new_nexus_file = 'randomseq'+ str(n)+'.nex'
+        new_nexus_file2 = 'randomseq'+ str(n)
+    ### creating a list of randomly generated sequences
+        sequences = []
+        for i in range(ntax):
+            myrandom = []
+            for i in range(num_sites):
+                i = random.randint(1,4)
+                myrandom.append(i)
+            dna_list = []   
+            for m in myrandom:
+                if m == 1:
+                    m= "A"
+                if m ==2:
+                    m= "T"
+                if m ==3:
+                    m= "G"
+                if m == 4:
+                    m= "C"
+                dna_list.append(m)
+            dna_list2 = ''.join(dna_list)
+            #print dna_list2
+            sequences.append(dna_list2)
+        all_seq.append(sequences)
+
+
+
+    ### creating a list of taxon_names
+        taxon_names = []
+        k=0
+        for i, m in enumerate(sequences):
+            k += 1
+            taxon = "taxon"+ str(k)
+            taxon_names.append(taxon)
+
+
+    ### creating a directory 
+        new_dir = os.path.join(master_dir,folder_name)
+        if not os.path.exists(new_dir):
+            os.mkdir(new_dir)
+        gene_sets = []        
+#        for i in range(3):
+        for i in range(num_trials_pergene):
+            m= i+1
+            seed_num = 4648+m
+            m2 = 'run'+str(m)
+            gene_sets.append(m2)        
+            new_subdir = os.path.join(new_dir,m2)    
+            if not os.path.exists(new_subdir):
+                os.mkdir(new_subdir)
+
+            bash_filename = 'qsub.sh'
+            bash_file_content = '''#$ -S /bin/bash 
+#$ -cwd
+#$ -N %s
+#$ -q highpri.q,highmem.q
+python runphycas.py
+/common/galax/rungalax.sh --treefile trees.t --skip 1
+'''% (new_nexus_file2)
+
+#     #     ### saving dna matrix to the directory
+#         
+            full_path1 = os.path.join(new_subdir, new_nexus_file) 
+            newf = open(full_path1, 'w')
+            newf.write('#nexus\n\n')
+            newf.write('begin data;\n')
+            newf.write('  dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites))
+            newf.write('  format datatype=dna missing=? gap=-;\n')
+            newf.write('  matrix\n')
+            longest_taxon_name = max([len(t) for t in taxon_names])
+            for t,s in zip(taxon_names, sequences):
+                formatstr = '%%%ds' % longest_taxon_name
+                namestr = formatstr % t
+                newf.write('    %s  %s\n' % (namestr, s))
+            newf.write(';\n')
+            newf.write('end;\n')
+            newf.close()
+
+            full_path2 = os.path.join(new_subdir, bash_filename)
+            newf = open(full_path2, 'w')
+            newf.write(bash_file_content)
+            newf.close()
+
+            full_path8 = os.path.join(new_subdir, 'runphycas.py')
+            newf = open(full_path8, 'w')
+            x = open('runphycas.py', 'r').read()
+            newf.write(x %(int(seed_num), new_nexus_file))
+
+# 
+#     
+#     
+        combined_sequences = map(''.join, zip(*all_seq))
+        num_sites_combined = len(combined_sequences[0])
+        new_dir2 = os.path.join(master_dir,'combinedSeq') 
+        if not os.path.exists(new_dir2):
+            os.mkdir(new_dir2)
+
+        full_path3 = os.path.join(new_dir2, 'combinedSeq.nex') 
+        newf = open(full_path3, 'w')
+        newf.write('#nexus\n\n')
+        newf.write('begin data;\n')
+        newf.write('  dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites_combined))
+        newf.write('  format datatype=dna missing=? gap=-;\n')
+        newf.write('  matrix\n')
+        longest_taxon_name = max([len(t) for t in taxon_names])
+        for t,s in zip(taxon_names, combined_sequences):
+            formatstr = '%%%ds' % longest_taxon_name
+            namestr = formatstr % t
+            newf.write('    %s  %s\n' % (namestr, s))
+        newf.write(';\n')
+        newf.write('end;\n')
+        newf.close()
+
+        full_path4 = os.path.join(new_dir2, bash_filename)
+        newf = open(full_path4, 'w')
+        newf.write(bash_file_content)
+        newf.close()
+
+
+
+        full_path6 = os.path.join(new_dir2, 'runphycas.py')
+        newf = open(full_path6, 'w')
+        x = open('runphycas.py', 'r').read()
+        newf.write(x %(4648, 'combinedSeq.nex'))
+
+
+        full_path7 = os.path.join(master_dir, 'submitall.sh') 
+        newf = open(full_path7, 'w')
+        newf.write('#!/bin/bash\n')
+        for i in sets:
+            for m in gene_sets:
+                newf.write('cd %s/%s; qsub qsub.sh;cd .. ;cd .. ; \n' %(i, m))
+        newf.write('cd %s; qsub qsub.sh; cd ..\n' %('combinedSeq'))
+        newf.close()
+
+        full_path9 = os.path.join(master_dir, 'treelist.txt')
+        newf = open(full_path9, 'w')
+        for i in sets:
+            for m in gene_sets:
+                newf.write(i+'/'+m+'/trees.t\n')
+        newf.close()
+
+
+
+        full_path10 = os.path.join(master_dir, 'galax.sh')
+        newf = open(full_path10, 'w')
+        newf.write('#!/bin/bash\n')
+        newf.write('/common/galax/rungalax.sh --listfile treelist.txt --skip 1')
+        newf.close()
+
+
+
+