Skip to content
Permalink
6d5cc671f5
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
387 lines (302 sloc) 14 KB
#!/usr/bin/python
import sys
import string
class Node(object):
"""A node in the suffix tree.
suffix_node
the index of a node with a matching suffix, representing a suffix link.
-1 indicates this node has no suffix link.
label
label is used to hold the label of a leaf node (the start of the corresponding suffix)
depth
depth is the length of the substring from the root to the node
parent_index
index of this nodes parent
"""
def __init__(self,label=0,depth=0,parent=-1):
self.suffix_node = -1
self.label=label
self.depth=depth
self.parent=parent
def __repr__(self):
return "Node(suffix link: %d, label: %d)"%(self.suffix_node,self.label)
class Edge(object):
"""An edge in the suffix tree.
first_char_index
index of start of string part represented by this edge
last_char_index
index of end of string part represented by this edge
source_node_index
index of source node of edge
dest_node_index
index of destination node of edge
"""
def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index):
self.first_char_index = first_char_index
self.last_char_index = last_char_index
self.source_node_index = source_node_index
self.dest_node_index = dest_node_index
@property
def length(self):
return self.last_char_index - self.first_char_index
def __repr__(self):
return 'Edge(%d, %d, %d %d)'% (self.source_node_index, self.dest_node_index
,self.first_char_index, self.last_char_index )
class Suffix(object):
"""Represents a suffix from first_char_index to last_char_index.
source_node_index
index of node where this suffix starts
first_char_index
index of start of suffix in string
last_char_index
index of end of suffix in string
ext
keep track of which extension we are in
"""
def __init__(self, source_node_index, first_char_index, last_char_index,ext):
self.source_node_index = source_node_index
self.first_char_index = first_char_index
self.last_char_index = last_char_index
self.ext=ext
def __repr__(self):
return 'Suffix(%d, %d, %d, %d)'%(self.source_node_index,self.first_char_index,self.last_char_index,self.ext)
@property
def length(self):
return self.last_char_index - self.first_char_index
def explicit(self):
"""A suffix is explicit if it ends on a node. first_char_index
is set greater than last_char_index to indicate this.
"""
return self.first_char_index > self.last_char_index
def implicit(self):
return self.last_char_index >= self.first_char_index
class SuffixTree(object):
"""A suffix tree for string matching. Uses Ukkonen's algorithm
for construction.
"""
def __init__(self, string, case_insensitive=False,sep_chars=[]):
"""
string
the string for which to construct a suffix tree
"""
self.string = string
self.sep_chars=sep_chars
self.case_insensitive = case_insensitive
self.N = len(string) - 1
self.nodes = [Node()]
self.edges = {}
self.alphabet={}
self.active = Suffix(0, 0, -1,0)
if self.case_insensitive:
self.string = self.string.lower()
for i in range(len(string)):
self._add_prefix(i)
self._add_to_alphabet(self.string[i])
def __repr__(self):
"""
Lists edges in the suffix tree
"""
s=""
for x in self.alphabet.keys():
s+=x
curr_index = self.N
s = "\tLabel \tStart \tEnd \tDepth \tSuf \tParent \tFirst \tLast \tString\n"
values = self.edges.values()
values.sort(key=lambda x: (self.nodes[x.dest_node_index].label,self.nodes[x.source_node_index]))
for edge in values:
if edge.source_node_index == -1:
continue
s += "\t%s \t%s \t%s \t%s \t%s \t%s \t%s \t%s \t"%(self.nodes[edge.dest_node_index].label, edge.source_node_index
,edge.dest_node_index ,self.nodes[edge.dest_node_index].depth
,self.nodes[edge.dest_node_index].suffix_node, self.nodes[edge.dest_node_index].parent
,edge.first_char_index
,edge.last_char_index)
top = min(curr_index, edge.last_char_index)
s += self.string[edge.first_char_index:top+1] + "\n"
return s
def _add_to_alphabet(self,char):
self.alphabet[char]=1
def _add_prefix(self, last_char_index):
"""The core construction method.
"""
last_parent_node = -1
while True:
# the current phase of the algorithm is given by last_char_index
# the current suffix being inserted is given by the path from the root to self.active.source_node_index
# followed by the string from self.active.first_char_index to the last_char_index in self.string
# So it is of the form (A)(string[self.active.first_char_index..self.active.last_char_index])(self.string[last_char_index]) [***]
# and the path (A) ends at the node self.active.source_node_index
# any splits will occur in an edge from self.active.source_node_index
# we need to remember this node for the purposes of creating suffix links
parent_node = self.active.source_node_index
if self.active.explicit():
# active is at a node (meaning that the middle piece of the form [***] is missing
if (self.active.source_node_index, self.string[last_char_index]) in self.edges:
# Rule 3 (do nothing): edge leaving source_node_index is marked with the last character of active suffix
# prefix is already in tree
break
else:
# active is in the middle of an edge from source node in the direction of active.first_char_index
e = self.edges[self.active.source_node_index, self.string[self.active.first_char_index]]
if self.string[e.first_char_index + self.active.length + 1] == self.string[last_char_index]:
# Rule 3 (do nothing): suffix lies in middle of an edge
# prefix is already in tree
break
# Rule 2 in the middle of an edge, we know that at least one path continues, we must split edge
# Also, this new internal node will have a suffix link to it from "last_parent_node"
# assuming last parent node isn't the root (this happens a bit later)
parent_node = self._split_edge(e, self.active)
# either active is at a node, but there is no edge marked with the last character of active suffix
# OR we're in Rule 2, we just split the edge, and parent_node is now the new node
# we add the new leaf node in either case (Rule 2)
self.nodes.append(Node(self.active.ext))
e = Edge(last_char_index, self.N, parent_node, len(self.nodes) - 1)
self.nodes[len(self.nodes)-1].parent=(parent_node,self.string[e.first_char_index])
self._insert_edge(e)
# here's where we put in the suffix link to the parent of the newly created node
# from the previous parent
if last_parent_node > 0:
self.nodes[last_parent_node].suffix_node = parent_node
# Remember the parent for future suffix links
last_parent_node = parent_node
# if the active node is the root, then the next prefix to consider is obtained by just incrementing
# the first_char_index; otherwise, the next prefix to consider is obtained by following a suffix
# link. Suppose the suffix link goes from v to s(v). Write the path to v in the form XA with X a single character and A a string
# then the path to the node s(v) is A. In other words, either way (i.e. from the root or via a suffix link)
# we have moved to the next extension in this phase.
if self.active.source_node_index == 0:
self.active.ext+=1
self.active.first_char_index += 1
else:
self.active.ext+=1
self.active.source_node_index = self.nodes[self.active.source_node_index].suffix_node
# traverse the tree along the active suffix, sucking up edges until the either
# the suffix becomes explicit or the end of the prefix lies in the middle of an edge
# leaving the active node
self._canonize_suffix(self.active)
# we get here after a "Rule 3" situation, meaning that all of "suffix" is
# implicitly in the tree. We have to finish up by creating the last suffix link
# and moving down the tree so the end of the suffix lies within an edge leaving
# the active node.
if last_parent_node > 0:
self.nodes[last_parent_node].suffix_node = parent_node
self.active.last_char_index += 1
self._canonize_suffix(self.active)
def _insert_edge(self, edge):
self.edges[(edge.source_node_index, self.string[edge.first_char_index])] = edge
self.nodes[edge.dest_node_index].depth=self.nodes[edge.source_node_index].depth+edge.length+1
def _remove_edge(self, edge):
self.edges.pop((edge.source_node_index, self.string[edge.first_char_index]))
def _split_edge(self, edge, suffix):
self.nodes.append(Node(-1))
e = Edge(edge.first_char_index, edge.first_char_index + suffix.length, suffix.source_node_index, len(self.nodes) - 1)
self.nodes[len(self.nodes)-1].parent=(suffix.source_node_index,self.string[e.first_char_index])
self._remove_edge(edge)
self._insert_edge(e)
self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index ### need to add node for each edge
edge.first_char_index += suffix.length + 1
self.nodes[edge.dest_node_index].parent=(len(self.nodes)-1,self.string[edge.first_char_index])
edge.source_node_index = e.dest_node_index
self._insert_edge(edge)
return e.dest_node_index
def _canonize_suffix(self, suffix):
"""This canonizes the suffix, walking along its suffix string until it
is explicit or there are no more matched nodes.
"""
if not suffix.explicit():
e = self.edges[suffix.source_node_index, self.string[suffix.first_char_index]]
if e.length <= suffix.length:
suffix.first_char_index += e.length + 1
suffix.source_node_index = e.dest_node_index
self._canonize_suffix(suffix)
# Public methods
def find_substring(self, substring,nodelist=[]):
"""Returns the index of substring in string or -1 if it
is not found.
"""
if not substring:
return -1
if self.case_insensitive:
substring = substring.lower()
curr_node = 0
i = 0
while i < len(substring):
edge = self.edges.get((curr_node, substring[i]))
if not edge:
return -1
ln = min(edge.length + 1, len(substring) - i)
if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]:
return -1
i += edge.length + 1
curr_node = edge.dest_node_index
return self.count_leaves_below(curr_node,nodelist)
def count_leaves_below(self,node,nodelist):
j=0 ; n=1
for x in self.alphabet.keys():
e=self.edges.get((node,x))
if e:
j+=self.count_leaves_below(e.dest_node_index,nodelist)
n=0
if n==1:
nodelist.append(self.nodes[node].label)
return max(n,j)
def common_substring_nodes(self,curr_node=0,answers=[]):
n=1; l={}
for x in self.alphabet.keys():
e=self.edges.get((curr_node,x))
if e:
l.update(self.common_substring_nodes(e.dest_node_index,answers))
n=0
if n==1:
for t in self.sep_chars:
if t in self.string[self.nodes[curr_node].label:]:
l[t]=1
break
if len(l.keys())==len(self.sep_chars):
answers.append(curr_node)
return l
def prefix_of_node(self,curr_node):
prefix=''
while True:
n=self.nodes[curr_node].parent
e=self.edges.get(n)
if e:
prefix=self.string[e.first_char_index:e.last_char_index+1]+prefix
curr_node=e.source_node_index
else:
break
return prefix
def has_substring(self, substring):
return self.find_substring(substring) != -1
value,key,s="","",{}
with open("datafile.txt","rU") as f:
for x in f:
if x[0]!=">":
value=value+x.strip()
else:
if key!="": s[key]=value
key=x[1:].strip()
value=""
big_string=''
j=0
sepchars=[]
for x in s.values():
big_string+=x
big_string+=str(len(sepchars))
sepchars.append(str(len(sepchars)))
if len(sepchars)==10:
n=SuffixTree(big_string,case_insensitive=True,sep_chars=sepchars)
a=[]
n.common_substring_nodes(answers=a)
a.sort(key=lambda x: -(n.nodes[x].depth))
b=n.prefix_of_node(a[0])
print b.upper()
bigstring=""
sepchars=[]
if len(sepchars)>0:
n=SuffixTree(big_string,case_insensitive=True,sep_chars=sepchars)
a=[]
n.common_substring_nodes(answers=a)
a.sort(key=lambda x: -(n.nodes[x].depth))
b=n.prefix_of_node(a[0])
print b.upper()