Skip to content

Commit

Permalink
script that reads nexus data and creates list of site patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
sun13005 committed Apr 19, 2017
1 parent 3173c3e commit 35eb323
Showing 1 changed file with 89 additions and 0 deletions.
89 changes: 89 additions & 0 deletions readseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
def patterns():


import re, os, glob, itertools, fnmatch, sys, shutil
from itertools import combinations
from collections import Counter

script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
path = os.path.join(script_dir, 'nexus')


genes = []
data = {}
# print 'Reading nexus files...'
for filename in glob.glob(os.path.join(path, '*.nex')):

m = re.match('(.+).nex', os.path.basename(filename))
gene_name = m.group(1)
# print 'gene_name=', gene_name
genes.append(gene_name)
f = open(filename, 'r').read()

m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S)
ntax = int(m.group(1))
# print 'ntax=', ntax

m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S)
nchar = int(m.group(1))
# print 'nchar=', nchar


m = re.search('Matrix\s+(.+?);', f, re.M | re.S)
matrix = m.group(1).strip()
matrix_lines = matrix.split('\n')

taxon_names = []
sequences = {}
sequences_list = []
for line in matrix_lines:
parts = line.strip().split()
assert len(parts) == 2
taxon_name = parts[0]
sequence = parts[1]
# print 'sequence===', sequence

taxon_names.append(taxon_name)
sequences_list.append(sequence)
sequences[taxon_name] = sequence



# print 'sequences dict==', sequences
# print 'sequences list==', sequences_list

pattern_list = []

k=0
while k < nchar:
site_pattern = ''
for i,m in enumerate(sequences_list):
site_pattern += m[k]
# print 'site_pattern=', site_pattern
pattern_list.append(site_pattern)
k+=1
# print '!!!!!!!!'
# print pattern_list
# print
pattern_dict = dict()
for i in pattern_list:
pattern_dict[i] = pattern_dict.get(i, 0) + 1 #http://www.pythonlearn.com/html-008/cfbook010.html#

# print'------------'
# print pattern_dict
# print
tmp = []
for key in pattern_dict.keys(): ###convert dict to key of tupules
# print 'key=', key
tmp.append((pattern_dict[key],key))

sorted_values = sorted(tmp) ###sorted according to key smaller to larger
# print sorted_values
# print '**********'
# sorted_values.reverse() ###sorted according to key values larger to smaller
sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order
# print sorted_values

return pattern_dict


0 comments on commit 35eb323

Please sign in to comment.