Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Genomics/jtsuffix.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
304 lines (236 sloc)
11.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
class Node(object): | |
"""A node in the suffix tree. | |
suffix_node | |
the index of a node with a matching suffix, representing a suffix link. | |
-1 indicates this node has no suffix link. | |
depth | |
depth of the node in the tree | |
""" | |
def __init__(self,depth=0): | |
self.suffix_node = -1 | |
self.depth=depth | |
def __repr__(self): | |
return "Node(suffix link: %d, depth: %d)"%(self.suffix_node,self.depth) | |
class Edge(object): | |
"""An edge in the suffix tree. | |
first_char_index | |
index of start of string part represented by this edge | |
last_char_index | |
index of end of string part represented by this edge | |
source_node_index | |
index of source node of edge | |
dest_node_index | |
index of destination node of edge | |
""" | |
def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index): | |
self.first_char_index = first_char_index | |
self.last_char_index = last_char_index | |
self.source_node_index = source_node_index | |
self.dest_node_index = dest_node_index | |
@property | |
def length(self): | |
return self.last_char_index - self.first_char_index | |
def __repr__(self): | |
return 'Edge(%d, %d, %d %d)'% (self.source_node_index, self.dest_node_index | |
,self.first_char_index, self.last_char_index ) | |
class Suffix(object): | |
"""Represents a suffix from first_char_index to last_char_index. | |
source_node_index | |
index of node where this suffix starts | |
first_char_index | |
index of start of suffix in string | |
last_char_index | |
index of end of suffix in string | |
""" | |
def __init__(self, source_node_index, first_char_index, last_char_index): | |
self.source_node_index = source_node_index | |
self.first_char_index = first_char_index | |
self.last_char_index = last_char_index | |
def __repr__(self): | |
return 'Suffix(%d, %d, %d)'%(self.source_node_index,self.first_char_index,self.last_char_index) | |
@property | |
def length(self): | |
return self.last_char_index - self.first_char_index | |
def explicit(self): | |
"""A suffix is explicit if it ends on a node. first_char_index | |
is set greater than last_char_index to indicate this. | |
""" | |
return self.first_char_index > self.last_char_index | |
def implicit(self): | |
return self.last_char_index >= self.first_char_index | |
class SuffixTree(object): | |
"""A suffix tree for string matching. Uses Ukkonen's algorithm | |
for construction. | |
""" | |
def __init__(self, string, case_insensitive=False): | |
""" | |
string | |
the string for which to construct a suffix tree | |
""" | |
self.string = string | |
self.case_insensitive = case_insensitive | |
self.N = len(string) - 1 | |
self.nodes = [Node()] | |
self.edges = {} | |
self.alphabet={} | |
self.active = Suffix(0, 0, -1) | |
if self.case_insensitive: | |
self.string = self.string.lower() | |
for i in range(len(string)): | |
self._add_prefix(i) | |
self._add_to_alphabet(self.string[i]) | |
def __repr__(self): | |
""" | |
Lists edges in the suffix tree | |
""" | |
s="" | |
for x in self.alphabet.keys(): | |
s+=x | |
curr_index = self.N | |
s = "\tDepth \tStart \tEnd \tSuf \tFirst \tLast \tString\n" | |
values = self.edges.values() | |
values.sort(key=lambda x: (self.nodes[x.source_node_index].depth,self.nodes[x.source_node_index])) | |
for edge in values: | |
if edge.source_node_index == -1: | |
continue | |
s += "\t%s \t%s \t%s \t%s \t%s \t%s \t"%(self.nodes[edge.source_node_index].depth, edge.source_node_index | |
,edge.dest_node_index | |
,self.nodes[edge.dest_node_index].suffix_node | |
,edge.first_char_index | |
,edge.last_char_index) | |
top = min(curr_index, edge.last_char_index) | |
s += self.string[edge.first_char_index:top+1] + "\n" | |
return s | |
def _add_to_alphabet(self,char): | |
self.alphabet[char]=1 | |
def _add_prefix(self, last_char_index): | |
"""The core construction method. | |
""" | |
last_parent_node = -1 | |
while True: | |
# the current phase of the algorithm is given by last_char_index | |
# the current suffix being inserted is given by the path from the root to self.active.source_node_index | |
# followed by the string from self.active.first_char_index to the last_char_index in self.string | |
# So it is of the form (A)(string[self.active.first_char_index..self.active.last_char_index])(self.string[last_char_index]) [***] | |
# and the path (A) ends at the node self.active.source_node_index | |
# any splits will occur in an edge from self.active.source_node_index | |
# we need to remember this node for the purposes of creating suffix links | |
parent_node = self.active.source_node_index | |
if self.active.explicit(): | |
# active is at a node (meaning that the middle piece of the form [***] is missing | |
if (self.active.source_node_index, self.string[last_char_index]) in self.edges: | |
# Rule 3 (do nothing): edge leaving source_node_index is marked with the last character of active suffix | |
# prefix is already in tree | |
break | |
else: | |
# active is in the middle of an edge from source node in the direction of active.first_char_index | |
e = self.edges[self.active.source_node_index, self.string[self.active.first_char_index]] | |
if self.string[e.first_char_index + self.active.length + 1] == self.string[last_char_index]: | |
# Rule 3 (do nothing): suffix lies in middle of an edge | |
# prefix is already in tree | |
break | |
# Rule 2 in the middle of an edge, we know that at least one path continues, we must split edge | |
# Also, this new internal node will have a suffix link to it from "last_parent_node" | |
# assuming last parent node isn't the root (this happens a bit later) | |
parent_node = self._split_edge(e, self.active) | |
# either active is at a node, but there is no edge marked with the last character of active suffix | |
# OR we're in Rule 2, we just split the edge, and parent_node is now the new node | |
# we add the new leaf node in either case (Rule 2) | |
self.nodes.append(Node(self.nodes[parent_node].depth+1)) | |
e = Edge(last_char_index, self.N, parent_node, len(self.nodes) - 1) | |
self._insert_edge(e) | |
# here's where we put in the suffix link to the parent of the newly created node | |
# from the previous parent | |
if last_parent_node > 0: | |
self.nodes[last_parent_node].suffix_node = parent_node | |
# Remember the parent for future suffix links | |
last_parent_node = parent_node | |
# if the active node is the root, then the next prefix to consider is obtained by just incrementing | |
# the first_char_index; otherwise, the next prefix to consider is obtained by following a suffix | |
# link. Suppose the suffix link goes from v to s(v). Write the path to v in the form XA with X a single character and A a string | |
# then the path to the node s(v) is A. In other words, either way (i.e. from the root or via a suffix link) | |
# we have moved to the next extension in this phase. | |
if self.active.source_node_index == 0: | |
self.active.first_char_index += 1 | |
else: | |
self.active.source_node_index = self.nodes[self.active.source_node_index].suffix_node | |
# traverse the tree along the active suffix, sucking up edges until the either | |
# the suffix becomes explicit or the end of the prefix lies in the middle of an edge | |
# leaving the active node | |
self._canonize_suffix(self.active) | |
# we get here after a "Rule 3" situation, meaning that all of "suffix" is | |
# implicitly in the tree. We have to finish up by creating the last suffix link | |
# and moving down the tree so the end of the suffix lies within an edge leaving | |
# the active node. | |
if last_parent_node > 0: | |
self.nodes[last_parent_node].suffix_node = parent_node | |
self.active.last_char_index += 1 | |
self._canonize_suffix(self.active) | |
def _insert_edge(self, edge): | |
self.edges[(edge.source_node_index, self.string[edge.first_char_index])] = edge | |
def _remove_edge(self, edge): | |
self.edges.pop((edge.source_node_index, self.string[edge.first_char_index])) | |
def _split_edge(self, edge, suffix): | |
self.nodes.append(Node(self.nodes[suffix.source_node_index].depth+1)) | |
e = Edge(edge.first_char_index, edge.first_char_index + suffix.length, suffix.source_node_index, len(self.nodes) - 1) | |
self._remove_edge(edge) | |
self._insert_edge(e) | |
self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index ### need to add node for each edge | |
edge.first_char_index += suffix.length + 1 | |
edge.source_node_index = e.dest_node_index | |
self._insert_edge(edge) | |
return e.dest_node_index | |
def _canonize_suffix(self, suffix): | |
"""This canonizes the suffix, walking along its suffix string until it | |
is explicit or there are no more matched nodes. | |
""" | |
if not suffix.explicit(): | |
e = self.edges[suffix.source_node_index, self.string[suffix.first_char_index]] | |
if e.length <= suffix.length: | |
suffix.first_char_index += e.length + 1 | |
suffix.source_node_index = e.dest_node_index | |
self._canonize_suffix(suffix) | |
def _count_leaves_below(self,node): | |
children=[]; j=0 ; n=1 | |
for x in self.alphabet.keys(): | |
e=self.edges.get((node,x)) | |
if e: | |
j+=self.count_leaves_below(e.dest_node_index) | |
n=0 | |
return max(n,j) | |
# Public methods | |
def find_substring(self, substring): | |
"""Returns the index of substring in string or -1 if it | |
is not found. | |
""" | |
if not substring: | |
return -1 | |
if self.case_insensitive: | |
substring = substring.lower() | |
curr_node = 0 | |
i = 0 | |
while i < len(substring): | |
edge = self.edges.get((curr_node, substring[i])) | |
if not edge: | |
return -1 | |
ln = min(edge.length + 1, len(substring) - i) | |
if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]: | |
return -1 | |
i += edge.length + 1 | |
curr_node = edge.dest_node_index | |
return self.count_leaves_below(curr_node) | |
def count_leaves_below(self,node): | |
children=[]; j=0 ; n=1 | |
for x in self.alphabet.keys(): | |
e=self.edges.get((node,x)) | |
if e: | |
j+=self.count_leaves_below(e.dest_node_index) | |
n=0 | |
return max(n,j) | |
def has_substring(self, substring): | |
return self.find_substring(substring) != -1 | |
t='banana$' | |
s='ab' | |
n=SuffixTree(t) | |
print n |