Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 278 lines (215 sloc) 10.8 KB
class Node(object):
"""A node in the suffix tree.
the index of a node with a matching suffix, representing a suffix link.
-1 indicates this node has no suffix link.
depth of the node in the tree
def __init__(self,depth=0):
self.suffix_node = -1
def __repr__(self):
return "Node(suffix link: %d, depth: %d)"%(self.suffix_node,self.depth)
class Edge(object):
"""An edge in the suffix tree.
index of start of string part represented by this edge
index of end of string part represented by this edge
index of source node of edge
index of destination node of edge
def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index):
self.first_char_index = first_char_index
self.last_char_index = last_char_index
self.source_node_index = source_node_index
self.dest_node_index = dest_node_index
def length(self):
return self.last_char_index - self.first_char_index
def __repr__(self):
return 'Edge(%d, %d, %d %d)'% (self.source_node_index, self.dest_node_index
,self.first_char_index, self.last_char_index )
class Suffix(object):
"""Represents a suffix from first_char_index to last_char_index.
index of node where this suffix starts
index of start of suffix in string
index of end of suffix in string
def __init__(self, source_node_index, first_char_index, last_char_index):
self.source_node_index = source_node_index
self.first_char_index = first_char_index
self.last_char_index = last_char_index
def __repr__(self):
return 'Suffix(%d, %d, %d)'%(self.source_node_index,self.first_char_index,self.last_char_index)
def length(self):
return self.last_char_index - self.first_char_index
def explicit(self):
"""A suffix is explicit if it ends on a node. first_char_index
is set greater than last_char_index to indicate this.
return self.first_char_index > self.last_char_index
def implicit(self):
return self.last_char_index >= self.first_char_index
class SuffixTree(object):
"""A suffix tree for string matching. Uses Ukkonen's algorithm
for construction.
def __init__(self, string, case_insensitive=False):
the string for which to construct a suffix tree
self.string = string
self.case_insensitive = case_insensitive
self.N = len(string) - 1
self.nodes = [Node()]
self.edges = {} = Suffix(0, 0, -1)
if self.case_insensitive:
self.string = self.string.lower()
for i in range(len(string)):
print self
def __repr__(self):
Lists edges in the suffix tree
curr_index = self.N
s = "\tDepth \tStart \tEnd \tSuf \tFirst \tLast \tString\n"
values = self.edges.values()
values.sort(key=lambda x: (self.nodes[x.source_node_index].depth,self.nodes[x.source_node_index]))
for edge in values:
if edge.source_node_index == -1:
s += "\t%s \t%s \t%s \t%s \t%s \t%s \t"%(self.nodes[edge.source_node_index].depth, edge.source_node_index
top = min(curr_index, edge.last_char_index)
s += self.string[edge.first_char_index:top+1] + "\n"
return s
def _add_prefix(self, last_char_index):
"""The core construction method.
last_parent_node = -1
while True:
# the current phase of the algorithm is given by last_char_index
# the current suffix being inserted is given by the path from the root to
# followed by the string from to the last_char_index in self.string
# So it is of the form (A)(string[])(self.string[last_char_index]) [***]
# and the path (A) ends at the node
# any splits will occur in an edge from
# we need to remember this node for the purposes of creating suffix links
parent_node =
# active is at a node (meaning that the middle piece of the form [***] is missing
if (, self.string[last_char_index]) in self.edges:
# Rule 3 (do nothing): edge leaving source_node_index is marked with the last character of active suffix
# prefix is already in tree
# active is in the middle of an edge from source node in the direction of active.first_char_index
e = self.edges[, self.string[]]
if self.string[e.first_char_index + + 1] == self.string[last_char_index]:
# Rule 3 (do nothing): suffix lies in middle of an edge
# prefix is already in tree
# Rule 2 in the middle of an edge, we know that at least one path continues, we must split edge
# Also, this new internal node will have a suffix link to it from "last_parent_node"
# assuming last parent node isn't the root (this happens a bit later)
parent_node = self._split_edge(e,
# either active is at a node, but there is no edge marked with the last character of active suffix
# OR we're in Rule 2, we just split the edge, and parent_node is now the new node
# we add the new leaf node in either case (Rule 2)
e = Edge(last_char_index, self.N, parent_node, len(self.nodes) - 1)
# here's where we put in the suffix link to the parent of the newly created node
# from the previous parent
if last_parent_node > 0:
self.nodes[last_parent_node].suffix_node = parent_node
# Remember the parent for future suffix links
last_parent_node = parent_node
# if the active node is the root, then the next prefix to consider is obtained by just incrementing
# the first_char_index; otherwise, the next prefix to consider is obtained by following a suffix
# link. Suppose the suffix link goes from v to s(v). Write the path to v in the form XA with X a single character and A a string
# then the path to the node s(v) is A. In other words, either way (i.e. from the root or via a suffix link)
# we have moved to the next extension in this phase.
if == 0: += 1
else: = self.nodes[].suffix_node
# traverse the tree along the active suffix, sucking up edges until the either
# the suffix becomes explicit or the end of the prefix lies in the middle of an edge
# leaving the active node
# we get here after a "Rule 3" situation, meaning that all of "suffix" is
# implicitly in the tree. We have to finish up by creating the last suffix link
# and moving down the tree so the end of the suffix lies within an edge leaving
# the active node.
if last_parent_node > 0:
self.nodes[last_parent_node].suffix_node = parent_node += 1
def _insert_edge(self, edge):
self.edges[(edge.source_node_index, self.string[edge.first_char_index])] = edge
def _remove_edge(self, edge):
self.edges.pop((edge.source_node_index, self.string[edge.first_char_index]))
def _split_edge(self, edge, suffix):
e = Edge(edge.first_char_index, edge.first_char_index + suffix.length, suffix.source_node_index, len(self.nodes) - 1)
self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index ### need to add node for each edge
edge.first_char_index += suffix.length + 1
edge.source_node_index = e.dest_node_index
return e.dest_node_index
def _canonize_suffix(self, suffix):
"""This canonizes the suffix, walking along its suffix string until it
is explicit or there are no more matched nodes.
if not suffix.explicit():
e = self.edges[suffix.source_node_index, self.string[suffix.first_char_index]]
if e.length <= suffix.length:
suffix.first_char_index += e.length + 1
suffix.source_node_index = e.dest_node_index
# Public methods
def find_substring(self, substring):
"""Returns the index of substring in string or -1 if it
is not found.
if not substring:
return -1
if self.case_insensitive:
substring = substring.lower()
curr_node = 0
i = 0
while i < len(substring):
edge = self.edges.get((curr_node, substring[i]))
if not edge:
return -1
ln = min(edge.length + 1, len(substring) - i)
if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]:
return -1
i += edge.length + 1
curr_node = edge.dest_node_index
return edge.first_char_index - len(substring) + ln
def has_substring(self, substring):
return self.find_substring(substring) != -1
print n
print s,"occurs at",n.find_substring(s),"in",t
You can’t perform that action at this time.