From 35eb323d027ed9f37ae3f87e7883864980a98b08 Mon Sep 17 00:00:00 2001 From: Suman Neupane Date: Wed, 19 Apr 2017 12:30:07 -0400 Subject: [PATCH] script that reads nexus data and creates list of site patterns --- readseq.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 readseq.py diff --git a/readseq.py b/readseq.py new file mode 100644 index 0000000..75bdb56 --- /dev/null +++ b/readseq.py @@ -0,0 +1,89 @@ +def patterns(): + + + import re, os, glob, itertools, fnmatch, sys, shutil + from itertools import combinations + from collections import Counter + + script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) + path = os.path.join(script_dir, 'nexus') + + + genes = [] + data = {} +# print 'Reading nexus files...' + for filename in glob.glob(os.path.join(path, '*.nex')): + + m = re.match('(.+).nex', os.path.basename(filename)) + gene_name = m.group(1) +# print 'gene_name=', gene_name + genes.append(gene_name) + f = open(filename, 'r').read() + + m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S) + ntax = int(m.group(1)) +# print 'ntax=', ntax + + m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S) + nchar = int(m.group(1)) +# print 'nchar=', nchar + + + m = re.search('Matrix\s+(.+?);', f, re.M | re.S) + matrix = m.group(1).strip() + matrix_lines = matrix.split('\n') + + taxon_names = [] + sequences = {} + sequences_list = [] + for line in matrix_lines: + parts = line.strip().split() + assert len(parts) == 2 + taxon_name = parts[0] + sequence = parts[1] + # print 'sequence===', sequence + + taxon_names.append(taxon_name) + sequences_list.append(sequence) + sequences[taxon_name] = sequence + + + + # print 'sequences dict==', sequences +# print 'sequences list==', sequences_list + + pattern_list = [] + + k=0 + while k < nchar: + site_pattern = '' + for i,m in enumerate(sequences_list): + site_pattern += m[k] + # print 'site_pattern=', site_pattern + pattern_list.append(site_pattern) + k+=1 +# print '!!!!!!!!' +# print pattern_list +# print + pattern_dict = dict() + for i in pattern_list: + pattern_dict[i] = pattern_dict.get(i, 0) + 1 #http://www.pythonlearn.com/html-008/cfbook010.html# + +# print'------------' +# print pattern_dict +# print + tmp = [] + for key in pattern_dict.keys(): ###convert dict to key of tupules +# print 'key=', key + tmp.append((pattern_dict[key],key)) + + sorted_values = sorted(tmp) ###sorted according to key smaller to larger +# print sorted_values +# print '**********' + # sorted_values.reverse() ###sorted according to key values larger to smaller + sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order +# print sorted_values + + return pattern_dict + +