From 35eb323d027ed9f37ae3f87e7883864980a98b08 Mon Sep 17 00:00:00 2001
From: Suman Neupane <suman.neupane@uconn.edu>
Date: Wed, 19 Apr 2017 12:30:07 -0400
Subject: [PATCH] script that reads nexus data and creates list of site
 patterns

---
 readseq.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 readseq.py

diff --git a/readseq.py b/readseq.py
new file mode 100644
index 0000000..75bdb56
--- /dev/null
+++ b/readseq.py
@@ -0,0 +1,89 @@
+def patterns():
+ 
+ 
+    import re, os, glob, itertools, fnmatch, sys, shutil
+    from itertools import combinations
+    from collections import Counter
+
+    script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
+    path = os.path.join(script_dir, 'nexus')
+
+
+    genes = []
+    data = {}
+#     print 'Reading nexus files...'
+    for filename in glob.glob(os.path.join(path, '*.nex')):
+
+        m = re.match('(.+).nex', os.path.basename(filename))
+        gene_name = m.group(1)
+#         print 'gene_name=', gene_name
+        genes.append(gene_name)
+        f = open(filename, 'r').read()
+
+        m = re.search('ntax\s*=\s*(\d+)', f, re.M | re.S)
+        ntax = int(m.group(1))
+#         print 'ntax=', ntax
+
+        m = re.search('nchar\s*=\s*(\d+)', f, re.M | re.S)
+        nchar = int(m.group(1))
+#         print 'nchar=', nchar
+
+
+        m = re.search('Matrix\s+(.+?);', f, re.M | re.S)
+        matrix = m.group(1).strip()
+        matrix_lines = matrix.split('\n')
+
+        taxon_names = []
+        sequences = {}
+        sequences_list = []
+        for line in matrix_lines:
+            parts = line.strip().split()
+            assert len(parts) == 2
+            taxon_name = parts[0]
+            sequence = parts[1]
+    #         print 'sequence===', sequence
+        
+            taxon_names.append(taxon_name)
+            sequences_list.append(sequence)
+            sequences[taxon_name] = sequence
+        
+        
+        
+    #     print 'sequences dict==', sequences
+#         print 'sequences list==', sequences_list
+
+        pattern_list = []
+    
+        k=0
+        while k < nchar:
+            site_pattern = ''
+            for i,m in enumerate(sequences_list):
+                site_pattern += m[k]
+    #         print 'site_pattern=', site_pattern 
+            pattern_list.append(site_pattern)
+            k+=1
+#         print '!!!!!!!!'
+#         print pattern_list
+#         print 
+        pattern_dict = dict()
+        for i in pattern_list:
+            pattern_dict[i] = pattern_dict.get(i, 0) + 1 #http://www.pythonlearn.com/html-008/cfbook010.html#
+    
+#         print'------------'
+#         print pattern_dict 
+#         print
+        tmp = []
+        for key in pattern_dict.keys(): ###convert dict to key of tupules
+#             print 'key=', key
+            tmp.append((pattern_dict[key],key))
+    
+        sorted_values = sorted(tmp) ###sorted according to key smaller to larger
+#         print sorted_values
+#         print '**********'
+    #     sorted_values.reverse() ###sorted according to key values larger to smaller
+        sorted_values.sort(cmp = lambda x,y:cmp(x[1],y[1])) ###sorted according to values in alphabetical order
+#         print sorted_values
+    
+    return pattern_dict
+
+