diff --git a/randomseq.py b/randomseq.py new file mode 100644 index 0000000..280b307 --- /dev/null +++ b/randomseq.py @@ -0,0 +1,212 @@ +import sys, re, os, random + + +# number1 = raw_input('number_of_runs? > ') +# num_run = int(number1) +# #print num_run +# +# ###specifying number of taxa to be generated +# number2 = raw_input('number_of_taxa? > ') +# ntax = int(number2) +# +# ###specifying number of sites to be generated +# number3 = raw_input('number of sites? > ') +# num_sites = int(number3) +# +# ###specifying number of dna matrix to be generated +# number4 = raw_input('number of genes? > ') +# number_genes = int(number4) + + + + +num_run = 1 +#print num_run + +###specifying number of taxa to be generated +ntax = 6 + +###specifying number of sites to be generated +num_sites = 10000 + +###specifying number of dna matrix to be generated +number_genes = 10 + +num_trials_pergene = 2 + + +for i in range(num_run): + run_name = i+1 + run_name2 = 'trial'+ str(run_name) + print run_name2 + master_dir = os.path.join(os.path.abspath(os.curdir), run_name2) + if not os.path.exists(master_dir): + os.mkdir(master_dir) + + + ###creating a list for the directories for each randomly generated dna matrix + all_seq = [] + sets = [] + for a, b in enumerate(range(number_genes)): + a += 1 + set = "gene"+ str(a) + sets.append(set) + + ### creating dna matrix with randomly generated sequences + for n, i in enumerate(sets): + ### specifying names of the directory and dna matrix + folder_name = i + n += 1 + new_nexus_file = 'randomseq'+ str(n)+'.nex' + new_nexus_file2 = 'randomseq'+ str(n) + ### creating a list of randomly generated sequences + sequences = [] + for i in range(ntax): + myrandom = [] + for i in range(num_sites): + i = random.randint(1,4) + myrandom.append(i) + dna_list = [] + for m in myrandom: + if m == 1: + m= "A" + if m ==2: + m= "T" + if m ==3: + m= "G" + if m == 4: + m= "C" + dna_list.append(m) + dna_list2 = ''.join(dna_list) + #print dna_list2 + sequences.append(dna_list2) + all_seq.append(sequences) + + + + ### creating a list of taxon_names + taxon_names = [] + k=0 + for i, m in enumerate(sequences): + k += 1 + taxon = "taxon"+ str(k) + taxon_names.append(taxon) + + + ### creating a directory + new_dir = os.path.join(master_dir,folder_name) + if not os.path.exists(new_dir): + os.mkdir(new_dir) + gene_sets = [] +# for i in range(3): + for i in range(num_trials_pergene): + m= i+1 + seed_num = 4648+m + m2 = 'run'+str(m) + gene_sets.append(m2) + new_subdir = os.path.join(new_dir,m2) + if not os.path.exists(new_subdir): + os.mkdir(new_subdir) + + bash_filename = 'qsub.sh' + bash_file_content = '''#$ -S /bin/bash +#$ -cwd +#$ -N %s +#$ -q highpri.q,highmem.q +python runphycas.py +/common/galax/rungalax.sh --treefile trees.t --skip 1 +'''% (new_nexus_file2) + +# # ### saving dna matrix to the directory +# + full_path1 = os.path.join(new_subdir, new_nexus_file) + newf = open(full_path1, 'w') + newf.write('#nexus\n\n') + newf.write('begin data;\n') + newf.write(' dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites)) + newf.write(' format datatype=dna missing=? gap=-;\n') + newf.write(' matrix\n') + longest_taxon_name = max([len(t) for t in taxon_names]) + for t,s in zip(taxon_names, sequences): + formatstr = '%%%ds' % longest_taxon_name + namestr = formatstr % t + newf.write(' %s %s\n' % (namestr, s)) + newf.write(';\n') + newf.write('end;\n') + newf.close() + + full_path2 = os.path.join(new_subdir, bash_filename) + newf = open(full_path2, 'w') + newf.write(bash_file_content) + newf.close() + + full_path8 = os.path.join(new_subdir, 'runphycas.py') + newf = open(full_path8, 'w') + x = open('runphycas.py', 'r').read() + newf.write(x %(int(seed_num), new_nexus_file)) + +# +# +# + combined_sequences = map(''.join, zip(*all_seq)) + num_sites_combined = len(combined_sequences[0]) + new_dir2 = os.path.join(master_dir,'combinedSeq') + if not os.path.exists(new_dir2): + os.mkdir(new_dir2) + + full_path3 = os.path.join(new_dir2, 'combinedSeq.nex') + newf = open(full_path3, 'w') + newf.write('#nexus\n\n') + newf.write('begin data;\n') + newf.write(' dimensions ntax=%d nchar=%d;\n' % (ntax, num_sites_combined)) + newf.write(' format datatype=dna missing=? gap=-;\n') + newf.write(' matrix\n') + longest_taxon_name = max([len(t) for t in taxon_names]) + for t,s in zip(taxon_names, combined_sequences): + formatstr = '%%%ds' % longest_taxon_name + namestr = formatstr % t + newf.write(' %s %s\n' % (namestr, s)) + newf.write(';\n') + newf.write('end;\n') + newf.close() + + full_path4 = os.path.join(new_dir2, bash_filename) + newf = open(full_path4, 'w') + newf.write(bash_file_content) + newf.close() + + + + full_path6 = os.path.join(new_dir2, 'runphycas.py') + newf = open(full_path6, 'w') + x = open('runphycas.py', 'r').read() + newf.write(x %(4648, 'combinedSeq.nex')) + + + full_path7 = os.path.join(master_dir, 'submitall.sh') + newf = open(full_path7, 'w') + newf.write('#!/bin/bash\n') + for i in sets: + for m in gene_sets: + newf.write('cd %s/%s; qsub qsub.sh;cd .. ;cd .. ; \n' %(i, m)) + newf.write('cd %s; qsub qsub.sh; cd ..\n' %('combinedSeq')) + newf.close() + + full_path9 = os.path.join(master_dir, 'treelist.txt') + newf = open(full_path9, 'w') + for i in sets: + for m in gene_sets: + newf.write(i+'/'+m+'/trees.t\n') + newf.close() + + + + full_path10 = os.path.join(master_dir, 'galax.sh') + newf = open(full_path10, 'w') + newf.write('#!/bin/bash\n') + newf.write('/common/galax/rungalax.sh --listfile treelist.txt --skip 1') + newf.close() + + + +