Skip to content

Commit

Permalink
creates randomly generated sequences with no underlying topology
Browse files Browse the repository at this point in the history
  • Loading branch information
sun13005 committed Apr 22, 2017
1 parent 60a48bf commit 1a72916
Showing 1 changed file with 212 additions and 0 deletions.
212 changes: 212 additions & 0 deletions randomseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import sys, re, os, random


# number1 = raw_input('number_of_runs? > ')
# num_run = int(number1)
# #print num_run
#
# ###specifying number of taxa to be generated
# number2 = raw_input('number_of_taxa? > ')
# ntax = int(number2)
#
# ###specifying number of sites to be generated
# number3 = raw_input('number of sites? > ')
# num_sites = int(number3)
#
# ###specifying number of dna matrix to be generated
# number4 = raw_input('number of genes? > ')
# number_genes = int(number4)




# ---------------------------------------------------------------------------
# Run configuration: edit these values to change what gets generated.
# ---------------------------------------------------------------------------

# Number of independent trials; each one gets its own "trialN" directory.
num_run = 1

# Number of taxa (sequences) in every DNA matrix.
ntax = 6

# Number of sites (characters) in each randomly generated sequence.
num_sites = 10000

# Number of DNA matrices ("genes") generated per trial.
number_genes = 10

# Number of analysis runs set up for each gene; every run gets its own seed.
num_trials_pergene = 2


# Seed written into the combined analysis; run N of a gene uses this value + N.
_BASE_SEED = 4648

# randint(1, 4) plus this table is kept (instead of random.choice) so the
# sequence of random draws is the same as in earlier versions of the script.
_NUCLEOTIDES = {1: 'A', 2: 'T', 3: 'G', 4: 'C'}

_QSUB_FILENAME = 'qsub.sh'

# Job script written next to every data file; %s is the job name (-N).
_QSUB_TEMPLATE = (
    '#$ -S /bin/bash\n'
    '#$ -cwd\n'
    '#$ -N %s\n'
    '#$ -q highpri.q,highmem.q\n'
    'python runphycas.py\n'
    '/common/galax/rungalax.sh --treefile trees.t --skip 1\n'
)


def _ensure_dir(path):
    """Create directory `path` unless something already exists there."""
    if not os.path.exists(path):
        os.mkdir(path)


def _read_text(path):
    """Return the full contents of the text file at `path`."""
    with open(path, 'r') as handle:
        return handle.read()


def _write_text(path, text):
    """Write `text` to `path`, closing (and flushing) the file afterwards."""
    with open(path, 'w') as handle:
        handle.write(text)


def _random_dna(length):
    """Return a string of `length` bases drawn uniformly from A, T, G, C."""
    return ''.join([_NUCLEOTIDES[random.randint(1, 4)] for _ in range(length)])


def _nexus_text(taxon_names, sequences):
    """Return a NEXUS data block for `sequences`, one row per taxon name.

    Taxon names are right-justified to the width of the longest name so the
    sequence columns line up. `nchar` is taken from the first sequence.
    """
    width = max(len(name) for name in taxon_names)
    parts = [
        '#nexus\n\n',
        'begin data;\n',
        ' dimensions ntax=%d nchar=%d;\n' % (len(taxon_names), len(sequences[0])),
        ' format datatype=dna missing=? gap=-;\n',
        ' matrix\n',
    ]
    for name, sequence in zip(taxon_names, sequences):
        parts.append(' %s %s\n' % (name.rjust(width), sequence))
    parts.append(';\n')
    parts.append('end;\n')
    return ''.join(parts)


def generate_trial(master_dir, taxa_count, site_count, gene_count,
                   runs_per_gene, template_path='runphycas.py'):
    """Populate `master_dir` with random DNA matrices and their job scripts.

    Layout written below `master_dir` (which must already exist):

    * ``geneG/runR/`` for every gene G and run R, each holding
      ``randomseqG.nex``, ``qsub.sh`` and ``runphycas.py``. All runs of a
      gene share the same matrix; run R is given seed ``4648 + R``.
    * ``combinedSeq/`` holding ``combinedSeq.nex`` (each taxon's gene
      sequences concatenated in gene order), ``qsub.sh`` and
      ``runphycas.py`` (seed 4648).
    * ``submitall.sh``, ``treelist.txt`` and ``galax.sh``.

    Parameters:
        master_dir: directory that receives the output.
        taxa_count: number of sequences per matrix (>= 1).
        site_count: number of sites per gene sequence.
        gene_count: number of random matrices to generate (>= 1).
        runs_per_gene: number of run directories per gene (>= 1).
        template_path: file whose text is filled in with
            ``% (seed, nexus_file_name)`` to produce each ``runphycas.py``.

    Raises:
        ValueError: if taxa_count, gene_count or runs_per_gene is below 1.
            Raised before anything is written, so no partial output remains.
    """
    if taxa_count < 1 or gene_count < 1 or runs_per_gene < 1:
        raise ValueError(
            'taxa_count, gene_count and runs_per_gene must all be at least 1')

    # Read the template once up front: a missing template now fails before
    # any directory is created, and no file handle is left open.
    phycas_template = _read_text(template_path)

    taxon_names = ['taxon%d' % number for number in range(1, taxa_count + 1)]
    gene_dirs = ['gene%d' % number for number in range(1, gene_count + 1)]
    run_dirs = ['run%d' % number for number in range(1, runs_per_gene + 1)]

    all_seq = []  # all_seq[gene][taxon] -> sequence string
    qsub_text = None
    for gene_number, gene_dir in enumerate(gene_dirs, 1):
        nexus_name = 'randomseq%d.nex' % gene_number
        sequences = [_random_dna(site_count) for _ in range(taxa_count)]
        all_seq.append(sequences)

        nexus_text = _nexus_text(taxon_names, sequences)
        qsub_text = _QSUB_TEMPLATE % ('randomseq%d' % gene_number)

        gene_path = os.path.join(master_dir, gene_dir)
        _ensure_dir(gene_path)
        for run_number, run_dir in enumerate(run_dirs, 1):
            run_path = os.path.join(gene_path, run_dir)
            _ensure_dir(run_path)
            _write_text(os.path.join(run_path, nexus_name), nexus_text)
            _write_text(os.path.join(run_path, _QSUB_FILENAME), qsub_text)
            _write_text(
                os.path.join(run_path, 'runphycas.py'),
                phycas_template % (_BASE_SEED + run_number, nexus_name))

    # Concatenate, taxon by taxon, the sequences of every gene.
    combined_sequences = [''.join(parts) for parts in zip(*all_seq)]
    combined_dir = os.path.join(master_dir, 'combinedSeq')
    _ensure_dir(combined_dir)
    _write_text(os.path.join(combined_dir, 'combinedSeq.nex'),
                _nexus_text(taxon_names, combined_sequences))
    # NOTE(review): the combined job reuses the qsub script of the LAST gene,
    # so its job name is "randomseq<gene_count>". Kept because the script has
    # always behaved this way - confirm whether a distinct name is wanted.
    _write_text(os.path.join(combined_dir, _QSUB_FILENAME), qsub_text)
    _write_text(os.path.join(combined_dir, 'runphycas.py'),
                phycas_template % (_BASE_SEED, 'combinedSeq.nex'))

    # Script that submits every per-gene run and then the combined analysis.
    submit_lines = ['#!/bin/bash\n']
    for gene_dir in gene_dirs:
        for run_dir in run_dirs:
            submit_lines.append(
                'cd %s/%s; qsub qsub.sh;cd .. ;cd .. ; \n' % (gene_dir, run_dir))
    submit_lines.append('cd %s; qsub qsub.sh; cd ..\n' % ('combinedSeq'))
    _write_text(os.path.join(master_dir, 'submitall.sh'), ''.join(submit_lines))

    # One tree-file path per gene/run, as read by galax.sh's --listfile.
    tree_lines = [gene_dir + '/' + run_dir + '/trees.t\n'
                  for gene_dir in gene_dirs
                  for run_dir in run_dirs]
    _write_text(os.path.join(master_dir, 'treelist.txt'), ''.join(tree_lines))

    _write_text(
        os.path.join(master_dir, 'galax.sh'),
        '#!/bin/bash\n'
        '/common/galax/rungalax.sh --listfile treelist.txt --skip 1')


def main():
    """Generate `num_run` trial directories under the current directory."""
    for run_index in range(num_run):
        run_label = 'trial' + str(run_index + 1)
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print(run_label)
        master_dir = os.path.join(os.path.abspath(os.curdir), run_label)
        _ensure_dir(master_dir)
        generate_trial(master_dir, ntax, num_sites, number_genes,
                       num_trials_pergene)


# Guarded so that importing this module does not start writing files;
# running it as a script behaves as before.
if __name__ == '__main__':
    main()




0 comments on commit 1a72916

Please sign in to comment.