Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
355 lines (276 sloc) 13.3 KB
#!/usr/bin/python
import sys
class Node(object):
"""A node in the suffix tree.
suffix_node
the index of a node with a matching suffix, representing a suffix link.
-1 indicates this node has no suffix link.
label
label is used to hold the label of a leaf node (the start of the corresponding suffix)
depth
depth is the length of the substring from the root to the node
parent_index
index of this nodes parent
"""
def __init__(self,label=0,depth=0,parent=-1):
self.suffix_node = -1
self.label=label
self.depth=depth
self.parent=parent
def __repr__(self):
return "Node(suffix link: %d, label: %d)"%(self.suffix_node,self.label)
class Edge(object):
"""An edge in the suffix tree.
first_char_index
index of start of string part represented by this edge
last_char_index
index of end of string part represented by this edge
source_node_index
index of source node of edge
dest_node_index
index of destination node of edge
"""
def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index):
self.first_char_index = first_char_index
self.last_char_index = last_char_index
self.source_node_index = source_node_index
self.dest_node_index = dest_node_index
@property
def length(self):
return self.last_char_index - self.first_char_index
def __repr__(self):
return 'Edge(%d, %d, %d %d)'% (self.source_node_index, self.dest_node_index
,self.first_char_index, self.last_char_index )
class Suffix(object):
"""Represents a suffix from first_char_index to last_char_index.
source_node_index
index of node where this suffix starts
first_char_index
index of start of suffix in string
last_char_index
index of end of suffix in string
ext
keep track of which extension we are in
"""
def __init__(self, source_node_index, first_char_index, last_char_index,ext):
self.source_node_index = source_node_index
self.first_char_index = first_char_index
self.last_char_index = last_char_index
self.ext=ext
def __repr__(self):
return 'Suffix(%d, %d, %d, %d)'%(self.source_node_index,self.first_char_index,self.last_char_index,self.ext)
@property
def length(self):
return self.last_char_index - self.first_char_index
def explicit(self):
"""A suffix is explicit if it ends on a node. first_char_index
is set greater than last_char_index to indicate this.
"""
return self.first_char_index > self.last_char_index
def implicit(self):
return self.last_char_index >= self.first_char_index
class SuffixTree(object):
"""A suffix tree for string matching. Uses Ukkonen's algorithm
for construction.
"""
def __init__(self, string, case_insensitive=False,sep_chars=[]):
"""
string
the string for which to construct a suffix tree
"""
self.string = string
self.sep_chars=sep_chars
self.case_insensitive = case_insensitive
self.N = len(string) - 1
self.nodes = [Node()]
self.edges = {}
self.alphabet={}
self.active = Suffix(0, 0, -1,0)
if self.case_insensitive:
self.string = self.string.lower()
for i in range(len(string)):
self._add_prefix(i)
self._add_to_alphabet(self.string[i])
def __repr__(self):
"""
Lists edges in the suffix tree
"""
s=""
for x in self.alphabet.keys():
s+=x
curr_index = self.N
s = "\tLabel \tStart \tEnd \tDepth \tSuf \tParent \tFirst \tLast \tString\n"
values = self.edges.values()
values.sort(key=lambda x: (self.nodes[x.dest_node_index].label,self.nodes[x.source_node_index]))
for edge in values:
if edge.source_node_index == -1:
continue
s += "\t%s \t%s \t%s \t%s \t%s \t%s \t%s \t%s \t"%(self.nodes[edge.dest_node_index].label, edge.source_node_index
,edge.dest_node_index ,self.nodes[edge.dest_node_index].depth
,self.nodes[edge.dest_node_index].suffix_node, self.nodes[edge.dest_node_index].parent
,edge.first_char_index
,edge.last_char_index)
top = min(curr_index, edge.last_char_index)
s += self.string[edge.first_char_index:top+1] + "\n"
return s
def _add_to_alphabet(self,char):
self.alphabet[char]=1
def _add_prefix(self, last_char_index):
"""The core construction method.
"""
last_parent_node = -1
while True:
# the current phase of the algorithm is given by last_char_index
# the current suffix being inserted is given by the path from the root to self.active.source_node_index
# followed by the string from self.active.first_char_index to the last_char_index in self.string
# So it is of the form (A)(string[self.active.first_char_index..self.active.last_char_index])(self.string[last_char_index]) [***]
# and the path (A) ends at the node self.active.source_node_index
# any splits will occur in an edge from self.active.source_node_index
# we need to remember this node for the purposes of creating suffix links
parent_node = self.active.source_node_index
if self.active.explicit():
# active is at a node (meaning that the middle piece of the form [***] is missing
if (self.active.source_node_index, self.string[last_char_index]) in self.edges:
# Rule 3 (do nothing): edge leaving source_node_index is marked with the last character of active suffix
# prefix is already in tree
break
else:
# active is in the middle of an edge from source node in the direction of active.first_char_index
e = self.edges[self.active.source_node_index, self.string[self.active.first_char_index]]
if self.string[e.first_char_index + self.active.length + 1] == self.string[last_char_index]:
# Rule 3 (do nothing): suffix lies in middle of an edge
# prefix is already in tree
break
# Rule 2 in the middle of an edge, we know that at least one path continues, we must split edge
# Also, this new internal node will have a suffix link to it from "last_parent_node"
# assuming last parent node isn't the root (this happens a bit later)
parent_node = self._split_edge(e, self.active)
# either active is at a node, but there is no edge marked with the last character of active suffix
# OR we're in Rule 2, we just split the edge, and parent_node is now the new node
# we add the new leaf node in either case (Rule 2)
self.nodes.append(Node(self.active.ext))
e = Edge(last_char_index, self.N, parent_node, len(self.nodes) - 1)
self.nodes[len(self.nodes)-1].parent=(parent_node,self.string[e.first_char_index])
self._insert_edge(e)
# here's where we put in the suffix link to the parent of the newly created node
# from the previous parent
if last_parent_node > 0:
self.nodes[last_parent_node].suffix_node = parent_node
# Remember the parent for future suffix links
last_parent_node = parent_node
# if the active node is the root, then the next prefix to consider is obtained by just incrementing
# the first_char_index; otherwise, the next prefix to consider is obtained by following a suffix
# link. Suppose the suffix link goes from v to s(v). Write the path to v in the form XA with X a single character and A a string
# then the path to the node s(v) is A. In other words, either way (i.e. from the root or via a suffix link)
# we have moved to the next extension in this phase.
if self.active.source_node_index == 0:
self.active.ext+=1
self.active.first_char_index += 1
else:
self.active.ext+=1
self.active.source_node_index = self.nodes[self.active.source_node_index].suffix_node
# traverse the tree along the active suffix, sucking up edges until the either
# the suffix becomes explicit or the end of the prefix lies in the middle of an edge
# leaving the active node
self._canonize_suffix(self.active)
# we get here after a "Rule 3" situation, meaning that all of "suffix" is
# implicitly in the tree. We have to finish up by creating the last suffix link
# and moving down the tree so the end of the suffix lies within an edge leaving
# the active node.
if last_parent_node > 0:
self.nodes[last_parent_node].suffix_node = parent_node
self.active.last_char_index += 1
self._canonize_suffix(self.active)
def _insert_edge(self, edge):
self.edges[(edge.source_node_index, self.string[edge.first_char_index])] = edge
self.nodes[edge.dest_node_index].depth=self.nodes[edge.source_node_index].depth+edge.length+1
def _remove_edge(self, edge):
self.edges.pop((edge.source_node_index, self.string[edge.first_char_index]))
def _split_edge(self, edge, suffix):
self.nodes.append(Node(-1))
e = Edge(edge.first_char_index, edge.first_char_index + suffix.length, suffix.source_node_index, len(self.nodes) - 1)
self.nodes[len(self.nodes)-1].parent=(suffix.source_node_index,self.string[e.first_char_index])
self._remove_edge(edge)
self._insert_edge(e)
self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index ### need to add node for each edge
edge.first_char_index += suffix.length + 1
self.nodes[edge.dest_node_index].parent=(len(self.nodes)-1,self.string[edge.first_char_index])
edge.source_node_index = e.dest_node_index
self._insert_edge(edge)
return e.dest_node_index
def _canonize_suffix(self, suffix):
"""This canonizes the suffix, walking along its suffix string until it
is explicit or there are no more matched nodes.
"""
if not suffix.explicit():
e = self.edges[suffix.source_node_index, self.string[suffix.first_char_index]]
if e.length <= suffix.length:
suffix.first_char_index += e.length + 1
suffix.source_node_index = e.dest_node_index
self._canonize_suffix(suffix)
# Public methods
def find_substring(self, substring,nodelist=[]):
"""Returns the index of substring in string or -1 if it
is not found.
"""
if not substring:
return -1
if self.case_insensitive:
substring = substring.lower()
curr_node = 0
i = 0
while i < len(substring):
edge = self.edges.get((curr_node, substring[i]))
if not edge:
return -1
ln = min(edge.length + 1, len(substring) - i)
if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]:
return -1
i += edge.length + 1
curr_node = edge.dest_node_index
return self.count_leaves_below(curr_node,nodelist)
def count_leaves_below(self,node,nodelist):
j=0 ; n=1
for x in self.alphabet.keys():
e=self.edges.get((node,x))
if e:
j+=self.count_leaves_below(e.dest_node_index,nodelist)
n=0
if n==1:
nodelist.append(self.nodes[node].label)
return max(n,j)
def common_substring_nodes(self,curr_node=0,answers=[]):
n=1; l={}
for x in self.alphabet.keys():
e=self.edges.get((curr_node,x))
if e:
l.update(self.common_substring_nodes(e.dest_node_index,answers))
n=0
if n==1:
for t in self.sep_chars:
if t in self.string[self.nodes[curr_node].label:]:
l[t]=1
break
if len(l.keys())==len(self.sep_chars):
answers.append(curr_node)
return l
def prefix_of_node(self,curr_node):
prefix=''
while True:
n=self.nodes[curr_node].parent
e=self.edges.get(n)
if e:
prefix=self.string[e.first_char_index:e.last_char_index+1]+prefix
curr_node=e.source_node_index
else:
break
return prefix
def has_substring(self, substring):
return self.find_substring(substring) != -1
s=sys.stdin.read()
#s='banana$andes#ran*'
n=SuffixTree(s,case_insensitive=True,sep_chars=['*','$','#'])
a=[]
n.common_substring_nodes(answers=a)
a.sort(key=lambda x: -(n.nodes[x].depth))
print n.prefix_of_node(a[0])
You can’t perform that action at this time.