Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
305 lines (236 sloc) 11.5 KB
#!/usr/bin/python
class Node(object):
"""A node in the suffix tree.
suffix_node
the index of a node with a matching suffix, representing a suffix link.
-1 indicates this node has no suffix link.
depth
depth of the node in the tree
"""
def __init__(self,depth=0):
self.suffix_node = -1
self.depth=depth
def __repr__(self):
return "Node(suffix link: %d, depth: %d)"%(self.suffix_node,self.depth)
class Edge(object):
"""An edge in the suffix tree.
first_char_index
index of start of string part represented by this edge
last_char_index
index of end of string part represented by this edge
source_node_index
index of source node of edge
dest_node_index
index of destination node of edge
"""
def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index):
self.first_char_index = first_char_index
self.last_char_index = last_char_index
self.source_node_index = source_node_index
self.dest_node_index = dest_node_index
@property
def length(self):
return self.last_char_index - self.first_char_index
def __repr__(self):
return 'Edge(%d, %d, %d %d)'% (self.source_node_index, self.dest_node_index
,self.first_char_index, self.last_char_index )
class Suffix(object):
"""Represents a suffix from first_char_index to last_char_index.
source_node_index
index of node where this suffix starts
first_char_index
index of start of suffix in string
last_char_index
index of end of suffix in string
"""
def __init__(self, source_node_index, first_char_index, last_char_index):
self.source_node_index = source_node_index
self.first_char_index = first_char_index
self.last_char_index = last_char_index
def __repr__(self):
return 'Suffix(%d, %d, %d)'%(self.source_node_index,self.first_char_index,self.last_char_index)
@property
def length(self):
return self.last_char_index - self.first_char_index
def explicit(self):
"""A suffix is explicit if it ends on a node. first_char_index
is set greater than last_char_index to indicate this.
"""
return self.first_char_index > self.last_char_index
def implicit(self):
return self.last_char_index >= self.first_char_index
class SuffixTree(object):
"""A suffix tree for string matching. Uses Ukkonen's algorithm
for construction.
"""
def __init__(self, string, case_insensitive=False):
"""
string
the string for which to construct a suffix tree
"""
self.string = string
self.case_insensitive = case_insensitive
self.N = len(string) - 1
self.nodes = [Node()]
self.edges = {}
self.alphabet={}
self.active = Suffix(0, 0, -1)
if self.case_insensitive:
self.string = self.string.lower()
for i in range(len(string)):
self._add_prefix(i)
self._add_to_alphabet(self.string[i])
def __repr__(self):
"""
Lists edges in the suffix tree
"""
s=""
for x in self.alphabet.keys():
s+=x
curr_index = self.N
s = "\tDepth \tStart \tEnd \tSuf \tFirst \tLast \tString\n"
values = self.edges.values()
values.sort(key=lambda x: (self.nodes[x.source_node_index].depth,self.nodes[x.source_node_index]))
for edge in values:
if edge.source_node_index == -1:
continue
s += "\t%s \t%s \t%s \t%s \t%s \t%s \t"%(self.nodes[edge.source_node_index].depth, edge.source_node_index
,edge.dest_node_index
,self.nodes[edge.dest_node_index].suffix_node
,edge.first_char_index
,edge.last_char_index)
top = min(curr_index, edge.last_char_index)
s += self.string[edge.first_char_index:top+1] + "\n"
return s
def _add_to_alphabet(self,char):
self.alphabet[char]=1
def _add_prefix(self, last_char_index):
"""The core construction method.
"""
last_parent_node = -1
while True:
# the current phase of the algorithm is given by last_char_index
# the current suffix being inserted is given by the path from the root to self.active.source_node_index
# followed by the string from self.active.first_char_index to the last_char_index in self.string
# So it is of the form (A)(string[self.active.first_char_index..self.active.last_char_index])(self.string[last_char_index]) [***]
# and the path (A) ends at the node self.active.source_node_index
# any splits will occur in an edge from self.active.source_node_index
# we need to remember this node for the purposes of creating suffix links
parent_node = self.active.source_node_index
if self.active.explicit():
# active is at a node (meaning that the middle piece of the form [***] is missing
if (self.active.source_node_index, self.string[last_char_index]) in self.edges:
# Rule 3 (do nothing): edge leaving source_node_index is marked with the last character of active suffix
# prefix is already in tree
break
else:
# active is in the middle of an edge from source node in the direction of active.first_char_index
e = self.edges[self.active.source_node_index, self.string[self.active.first_char_index]]
if self.string[e.first_char_index + self.active.length + 1] == self.string[last_char_index]:
# Rule 3 (do nothing): suffix lies in middle of an edge
# prefix is already in tree
break
# Rule 2 in the middle of an edge, we know that at least one path continues, we must split edge
# Also, this new internal node will have a suffix link to it from "last_parent_node"
# assuming last parent node isn't the root (this happens a bit later)
parent_node = self._split_edge(e, self.active)
# either active is at a node, but there is no edge marked with the last character of active suffix
# OR we're in Rule 2, we just split the edge, and parent_node is now the new node
# we add the new leaf node in either case (Rule 2)
self.nodes.append(Node(self.nodes[parent_node].depth+1))
e = Edge(last_char_index, self.N, parent_node, len(self.nodes) - 1)
self._insert_edge(e)
# here's where we put in the suffix link to the parent of the newly created node
# from the previous parent
if last_parent_node > 0:
self.nodes[last_parent_node].suffix_node = parent_node
# Remember the parent for future suffix links
last_parent_node = parent_node
# if the active node is the root, then the next prefix to consider is obtained by just incrementing
# the first_char_index; otherwise, the next prefix to consider is obtained by following a suffix
# link. Suppose the suffix link goes from v to s(v). Write the path to v in the form XA with X a single character and A a string
# then the path to the node s(v) is A. In other words, either way (i.e. from the root or via a suffix link)
# we have moved to the next extension in this phase.
if self.active.source_node_index == 0:
self.active.first_char_index += 1
else:
self.active.source_node_index = self.nodes[self.active.source_node_index].suffix_node
# traverse the tree along the active suffix, sucking up edges until the either
# the suffix becomes explicit or the end of the prefix lies in the middle of an edge
# leaving the active node
self._canonize_suffix(self.active)
# we get here after a "Rule 3" situation, meaning that all of "suffix" is
# implicitly in the tree. We have to finish up by creating the last suffix link
# and moving down the tree so the end of the suffix lies within an edge leaving
# the active node.
if last_parent_node > 0:
self.nodes[last_parent_node].suffix_node = parent_node
self.active.last_char_index += 1
self._canonize_suffix(self.active)
def _insert_edge(self, edge):
self.edges[(edge.source_node_index, self.string[edge.first_char_index])] = edge
def _remove_edge(self, edge):
self.edges.pop((edge.source_node_index, self.string[edge.first_char_index]))
def _split_edge(self, edge, suffix):
self.nodes.append(Node(self.nodes[suffix.source_node_index].depth+1))
e = Edge(edge.first_char_index, edge.first_char_index + suffix.length, suffix.source_node_index, len(self.nodes) - 1)
self._remove_edge(edge)
self._insert_edge(e)
self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index ### need to add node for each edge
edge.first_char_index += suffix.length + 1
edge.source_node_index = e.dest_node_index
self._insert_edge(edge)
return e.dest_node_index
def _canonize_suffix(self, suffix):
"""This canonizes the suffix, walking along its suffix string until it
is explicit or there are no more matched nodes.
"""
if not suffix.explicit():
e = self.edges[suffix.source_node_index, self.string[suffix.first_char_index]]
if e.length <= suffix.length:
suffix.first_char_index += e.length + 1
suffix.source_node_index = e.dest_node_index
self._canonize_suffix(suffix)
def _count_leaves_below(self,node):
children=[]; j=0 ; n=1
for x in self.alphabet.keys():
e=self.edges.get((node,x))
if e:
j+=self.count_leaves_below(e.dest_node_index)
n=0
return max(n,j)
# Public methods
def find_substring(self, substring):
"""Returns the index of substring in string or -1 if it
is not found.
"""
if not substring:
return -1
if self.case_insensitive:
substring = substring.lower()
curr_node = 0
i = 0
while i < len(substring):
edge = self.edges.get((curr_node, substring[i]))
if not edge:
return -1
ln = min(edge.length + 1, len(substring) - i)
if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]:
return -1
i += edge.length + 1
curr_node = edge.dest_node_index
return self.count_leaves_below(curr_node)
def count_leaves_below(self,node):
children=[]; j=0 ; n=1
for x in self.alphabet.keys():
e=self.edges.get((node,x))
if e:
j+=self.count_leaves_below(e.dest_node_index)
n=0
return max(n,j)
def has_substring(self, substring):
return self.find_substring(substring) != -1
t='banana$'
s='ab'
n=SuffixTree(t)
print n
You can’t perform that action at this time.