longestsubstring.py

#!/usr/bin/python

import sys
import string

class Node(object):
    """A node in the suffix tree.

    suffix_node
        the index of a node with a matching suffix, representing a suffix link.
        -1 indicates this node has no suffix link.

    label
        label is used to hold the label of a leaf node (the start of the corresponding suffix)

    depth
        depth is the length of the substring from the root to the node


    parent_index
         index of this nodes parent
    """
    def __init__(self,label=0,depth=0,parent=-1):
        self.suffix_node = -1
        self.label=label
        self.depth=depth
        self.parent=parent

    def __repr__(self):
        return "Node(suffix link: %d, label: %d)"%(self.suffix_node,self.label)

class Edge(object):
    """An edge in the suffix tree.

    first_char_index
        index of start of string part represented by this edge

    last_char_index
        index of end of string part represented by this edge

    source_node_index
        index of source node of edge

    dest_node_index
        index of destination node of edge
    """
    def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index):
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index
        self.source_node_index = source_node_index
        self.dest_node_index = dest_node_index

    @property
    def length(self):
        return self.last_char_index - self.first_char_index

    def __repr__(self):
        return 'Edge(%d, %d, %d %d)'% (self.source_node_index, self.dest_node_index
                                        ,self.first_char_index, self.last_char_index )


class Suffix(object):
    """Represents a suffix from first_char_index to last_char_index.

    source_node_index
        index of node where this suffix starts

    first_char_index
        index of start of suffix in string

    last_char_index
        index of end of suffix in string

    ext
        keep track of which extension we are in

    """
    def __init__(self, source_node_index, first_char_index, last_char_index,ext):
        self.source_node_index = source_node_index
        self.first_char_index = first_char_index
        self.last_char_index = last_char_index
        self.ext=ext


    def __repr__(self):
        return 'Suffix(%d, %d, %d, %d)'%(self.source_node_index,self.first_char_index,self.last_char_index,self.ext)

    @property
    def length(self):
        return self.last_char_index - self.first_char_index

    def explicit(self):
        """A suffix is explicit if it ends on a node. first_char_index
        is set greater than last_char_index to indicate this.
        """
        return self.first_char_index > self.last_char_index

    def implicit(self):
        return self.last_char_index >= self.first_char_index


class SuffixTree(object):
    """A suffix tree for string matching. Uses Ukkonen's algorithm
    for construction.
    """
    def __init__(self, string, case_insensitive=False,sep_chars=[]):
        """
        string
            the string for which to construct a suffix tree
        """
        self.string = string
        self.sep_chars=sep_chars
        self.case_insensitive = case_insensitive
        self.N = len(string) - 1
        self.nodes = [Node()]
        self.edges = {}
        self.alphabet={}
        self.active = Suffix(0, 0, -1,0)
        if self.case_insensitive:
            self.string = self.string.lower()
        for i in range(len(string)):
            self._add_prefix(i)
            self._add_to_alphabet(self.string[i])


    def __repr__(self):
        """
        Lists edges in the suffix tree
        """
        s=""
        for x in self.alphabet.keys():
            s+=x
        curr_index = self.N
        s = "\tLabel \tStart \tEnd \tDepth \tSuf \tParent \tFirst \tLast \tString\n"
        values = self.edges.values()
        values.sort(key=lambda x: (self.nodes[x.dest_node_index].label,self.nodes[x.source_node_index]))
        for edge in values:
            if edge.source_node_index == -1:
                continue
            s += "\t%s \t%s \t%s \t%s \t%s \t%s \t%s \t%s \t"%(self.nodes[edge.dest_node_index].label,  edge.source_node_index
                    ,edge.dest_node_index ,self.nodes[edge.dest_node_index].depth
                    ,self.nodes[edge.dest_node_index].suffix_node, self.nodes[edge.dest_node_index].parent
                    ,edge.first_char_index
                    ,edge.last_char_index)


            top = min(curr_index, edge.last_char_index)
            s += self.string[edge.first_char_index:top+1] + "\n"
        return s

    def _add_to_alphabet(self,char):
        self.alphabet[char]=1

    def _add_prefix(self, last_char_index):
        """The core construction method.
        """
        last_parent_node = -1

        while True:
            # the current phase of the algorithm is given by last_char_index
            # the current suffix being inserted is given by the path from the root to self.active.source_node_index
            # followed by the string from self.active.first_char_index to the last_char_index in self.string
            # So it is of the form (A)(string[self.active.first_char_index..self.active.last_char_index])(self.string[last_char_index]) [***]
            # and the path (A) ends at the node self.active.source_node_index

            # any splits will occur in an edge from self.active.source_node_index
            # we need to remember this node for the purposes of creating suffix links


            parent_node = self.active.source_node_index

            if self.active.explicit():
                # active is at a node (meaning that the middle piece of the form [***] is missing
                if (self.active.source_node_index, self.string[last_char_index]) in self.edges:
                    # Rule 3 (do nothing): edge leaving source_node_index is marked with the last character of active suffix
                    # prefix is already in tree
                    break
            else:
                # active is in the middle of an edge from source node in the direction of active.first_char_index
                e = self.edges[self.active.source_node_index, self.string[self.active.first_char_index]]
                if self.string[e.first_char_index + self.active.length + 1] == self.string[last_char_index]:
                    # Rule 3 (do nothing): suffix lies in middle of an edge
                    # prefix is already in tree
                    break

                # Rule 2 in the middle of an edge, we know that at least one path continues, we must split edge

                # Also, this new internal node will have a suffix link to it from "last_parent_node"
                # assuming last parent node isn't the root (this happens a bit later)

                parent_node = self._split_edge(e, self.active)

            # either active is at a node, but there is no edge marked with the last character of active suffix
            # OR we're in Rule 2, we just split the edge, and parent_node is now the new node
            # we add the new leaf node in either case (Rule 2)


            self.nodes.append(Node(self.active.ext))
            e = Edge(last_char_index, self.N, parent_node, len(self.nodes) - 1)
            self.nodes[len(self.nodes)-1].parent=(parent_node,self.string[e.first_char_index])
            self._insert_edge(e)

            # here's where we put in the suffix link to the parent of the newly created node
            # from the previous parent

            if last_parent_node > 0:
                self.nodes[last_parent_node].suffix_node = parent_node

            # Remember the parent for future suffix links

            last_parent_node = parent_node

            # if the active node is the root, then the next prefix to consider is obtained by just incrementing
            # the first_char_index; otherwise, the next prefix to consider is obtained by following a suffix
            # link.  Suppose the suffix link goes from v to s(v). Write the path to v in the form XA with X a single character and A a string
            # then the path to the node s(v) is A.  In other words, either way (i.e. from the root or via a suffix link)
            # we have moved to the next extension in this phase.

            if self.active.source_node_index == 0:
                self.active.ext+=1
                self.active.first_char_index += 1
            else:
                self.active.ext+=1
                self.active.source_node_index = self.nodes[self.active.source_node_index].suffix_node


            # traverse the tree along the active suffix, sucking up edges until the either
            # the suffix becomes explicit or the end of the prefix lies in the middle of an edge
            # leaving the active node

            self._canonize_suffix(self.active)

        # we get here after a "Rule 3" situation, meaning that all of "suffix" is
        # implicitly in the tree.  We have to finish up by creating the last suffix link
        # and moving down the tree so the end of the suffix lies within an edge leaving
        # the active node.

        if last_parent_node > 0:
            self.nodes[last_parent_node].suffix_node = parent_node
        self.active.last_char_index += 1
        self._canonize_suffix(self.active)

    def _insert_edge(self, edge):
        self.edges[(edge.source_node_index, self.string[edge.first_char_index])] = edge
        self.nodes[edge.dest_node_index].depth=self.nodes[edge.source_node_index].depth+edge.length+1

    def _remove_edge(self, edge):
        self.edges.pop((edge.source_node_index, self.string[edge.first_char_index]))

    def _split_edge(self, edge, suffix):
        self.nodes.append(Node(-1))
        e = Edge(edge.first_char_index, edge.first_char_index + suffix.length, suffix.source_node_index, len(self.nodes) - 1)
        self.nodes[len(self.nodes)-1].parent=(suffix.source_node_index,self.string[e.first_char_index])
        self._remove_edge(edge)
        self._insert_edge(e)
        self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index  ### need to add node for each edge

        edge.first_char_index += suffix.length + 1
        self.nodes[edge.dest_node_index].parent=(len(self.nodes)-1,self.string[edge.first_char_index])
        edge.source_node_index = e.dest_node_index
        self._insert_edge(edge)
        return e.dest_node_index

    def _canonize_suffix(self, suffix):
        """This canonizes the suffix, walking along its suffix string until it
        is explicit or there are no more matched nodes.
        """
        if not suffix.explicit():
            e = self.edges[suffix.source_node_index, self.string[suffix.first_char_index]]
            if e.length <= suffix.length:
                suffix.first_char_index += e.length + 1
                suffix.source_node_index = e.dest_node_index
                self._canonize_suffix(suffix)


    # Public methods
    def find_substring(self, substring,nodelist=[]):
        """Returns the index of substring in string or -1 if it
        is not found.
        """
        if not substring:
            return -1
        if self.case_insensitive:
            substring = substring.lower()
        curr_node = 0
        i = 0
        while i < len(substring):
            edge = self.edges.get((curr_node, substring[i]))
            if not edge:
                return -1
            ln = min(edge.length + 1, len(substring) - i)
            if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]:
                return -1
            i += edge.length + 1
            curr_node = edge.dest_node_index
        return self.count_leaves_below(curr_node,nodelist)


    def count_leaves_below(self,node,nodelist):
        j=0 ; n=1
        for x in self.alphabet.keys():
            e=self.edges.get((node,x))
            if e:
                j+=self.count_leaves_below(e.dest_node_index,nodelist)
                n=0
        if n==1:
            nodelist.append(self.nodes[node].label)
        return max(n,j)


    def common_substring_nodes(self,curr_node=0,answers=[]):
        n=1; l={}
        for x in self.alphabet.keys():
            e=self.edges.get((curr_node,x))
            if e:
                l.update(self.common_substring_nodes(e.dest_node_index,answers))
                n=0
        if n==1:
            for t in self.sep_chars:
                if t in self.string[self.nodes[curr_node].label:]:
                    l[t]=1
                    break
        if len(l.keys())==len(self.sep_chars):
            answers.append(curr_node)
        return l


    def prefix_of_node(self,curr_node):
        prefix=''
        while True:
            n=self.nodes[curr_node].parent
            e=self.edges.get(n)
            if e:
                prefix=self.string[e.first_char_index:e.last_char_index+1]+prefix
                curr_node=e.source_node_index
            else:
                break
        return prefix

    def has_substring(self, substring):
        return self.find_substring(substring) != -1


value,key,s="","",{}

with open("datafile.txt","rU") as f:
    for x in f:
        if x[0]!=">":
            value=value+x.strip()
        else:
            if key!="": s[key]=value
            key=x[1:].strip()
            value=""

big_string=''
j=0
sepchars=[]


for x in s.values():
    big_string+=x
    big_string+=str(len(sepchars))
    sepchars.append(str(len(sepchars)))
    if len(sepchars)==10:
        n=SuffixTree(big_string,case_insensitive=True,sep_chars=sepchars)
        a=[]
        n.common_substring_nodes(answers=a)
        a.sort(key=lambda x: -(n.nodes[x].depth))
        b=n.prefix_of_node(a[0])
        print b.upper()
        bigstring=""
        sepchars=[]

if len(sepchars)>0:
    n=SuffixTree(big_string,case_insensitive=True,sep_chars=sepchars)
    a=[]
    n.common_substring_nodes(answers=a)
    a.sort(key=lambda x: -(n.nodes[x].depth))
    b=n.prefix_of_node(a[0])
    print b.upper()
	#!/usr/bin/python

	import sys
	import string

	class Node(object):
	"""A node in the suffix tree.

	suffix_node
	the index of a node with a matching suffix, representing a suffix link.
	-1 indicates this node has no suffix link.

	label
	label is used to hold the label of a leaf node (the start of the corresponding suffix)

	depth
	depth is the length of the substring from the root to the node


	parent_index
	index of this nodes parent
	"""
	def __init__(self,label=0,depth=0,parent=-1):
	self.suffix_node = -1
	self.label=label
	self.depth=depth
	self.parent=parent

	def __repr__(self):
	return "Node(suffix link: %d, label: %d)"%(self.suffix_node,self.label)

	class Edge(object):
	"""An edge in the suffix tree.

	first_char_index
	index of start of string part represented by this edge

	last_char_index
	index of end of string part represented by this edge

	source_node_index
	index of source node of edge

	dest_node_index
	index of destination node of edge
	"""
	def __init__(self, first_char_index, last_char_index, source_node_index, dest_node_index):
	self.first_char_index = first_char_index
	self.last_char_index = last_char_index
	self.source_node_index = source_node_index
	self.dest_node_index = dest_node_index

	@property
	def length(self):
	return self.last_char_index - self.first_char_index

	def __repr__(self):
	return 'Edge(%d, %d, %d %d)'% (self.source_node_index, self.dest_node_index
	,self.first_char_index, self.last_char_index )


	class Suffix(object):
	"""Represents a suffix from first_char_index to last_char_index.

	source_node_index
	index of node where this suffix starts

	first_char_index
	index of start of suffix in string

	last_char_index
	index of end of suffix in string

	ext
	keep track of which extension we are in

	"""
	def __init__(self, source_node_index, first_char_index, last_char_index,ext):
	self.source_node_index = source_node_index
	self.first_char_index = first_char_index
	self.last_char_index = last_char_index
	self.ext=ext


	def __repr__(self):
	return 'Suffix(%d, %d, %d, %d)'%(self.source_node_index,self.first_char_index,self.last_char_index,self.ext)

	@property
	def length(self):
	return self.last_char_index - self.first_char_index

	def explicit(self):
	"""A suffix is explicit if it ends on a node. first_char_index
	is set greater than last_char_index to indicate this.
	"""
	return self.first_char_index > self.last_char_index

	def implicit(self):
	return self.last_char_index >= self.first_char_index


	class SuffixTree(object):
	"""A suffix tree for string matching. Uses Ukkonen's algorithm
	for construction.
	"""
	def __init__(self, string, case_insensitive=False,sep_chars=[]):
	"""
	string
	the string for which to construct a suffix tree
	"""
	self.string = string
	self.sep_chars=sep_chars
	self.case_insensitive = case_insensitive
	self.N = len(string) - 1
	self.nodes = [Node()]
	self.edges = {}
	self.alphabet={}
	self.active = Suffix(0, 0, -1,0)
	if self.case_insensitive:
	self.string = self.string.lower()
	for i in range(len(string)):
	self._add_prefix(i)
	self._add_to_alphabet(self.string[i])




	def __repr__(self):
	"""
	Lists edges in the suffix tree
	"""
	s=""
	for x in self.alphabet.keys():
	s+=x
	curr_index = self.N
	s = "\tLabel \tStart \tEnd \tDepth \tSuf \tParent \tFirst \tLast \tString\n"
	values = self.edges.values()
	values.sort(key=lambda x: (self.nodes[x.dest_node_index].label,self.nodes[x.source_node_index]))
	for edge in values:
	if edge.source_node_index == -1:
	continue
	s += "\t%s \t%s \t%s \t%s \t%s \t%s \t%s \t%s \t"%(self.nodes[edge.dest_node_index].label, edge.source_node_index
	,edge.dest_node_index ,self.nodes[edge.dest_node_index].depth
	,self.nodes[edge.dest_node_index].suffix_node, self.nodes[edge.dest_node_index].parent
	,edge.first_char_index
	,edge.last_char_index)


	top = min(curr_index, edge.last_char_index)
	s += self.string[edge.first_char_index:top+1] + "\n"
	return s

	def _add_to_alphabet(self,char):
	self.alphabet[char]=1

	def _add_prefix(self, last_char_index):
	"""The core construction method.
	"""
	last_parent_node = -1

	while True:
	# the current phase of the algorithm is given by last_char_index
	# the current suffix being inserted is given by the path from the root to self.active.source_node_index
	# followed by the string from self.active.first_char_index to the last_char_index in self.string
	# So it is of the form (A)(string[self.active.first_char_index..self.active.last_char_index])(self.string[last_char_index]) [***]
	# and the path (A) ends at the node self.active.source_node_index

	# any splits will occur in an edge from self.active.source_node_index
	# we need to remember this node for the purposes of creating suffix links


	parent_node = self.active.source_node_index

	if self.active.explicit():
	# active is at a node (meaning that the middle piece of the form [***] is missing
	if (self.active.source_node_index, self.string[last_char_index]) in self.edges:
	# Rule 3 (do nothing): edge leaving source_node_index is marked with the last character of active suffix
	# prefix is already in tree
	break
	else:
	# active is in the middle of an edge from source node in the direction of active.first_char_index
	e = self.edges[self.active.source_node_index, self.string[self.active.first_char_index]]
	if self.string[e.first_char_index + self.active.length + 1] == self.string[last_char_index]:
	# Rule 3 (do nothing): suffix lies in middle of an edge
	# prefix is already in tree
	break

	# Rule 2 in the middle of an edge, we know that at least one path continues, we must split edge

	# Also, this new internal node will have a suffix link to it from "last_parent_node"
	# assuming last parent node isn't the root (this happens a bit later)

	parent_node = self._split_edge(e, self.active)

	# either active is at a node, but there is no edge marked with the last character of active suffix
	# OR we're in Rule 2, we just split the edge, and parent_node is now the new node
	# we add the new leaf node in either case (Rule 2)


	self.nodes.append(Node(self.active.ext))
	e = Edge(last_char_index, self.N, parent_node, len(self.nodes) - 1)
	self.nodes[len(self.nodes)-1].parent=(parent_node,self.string[e.first_char_index])
	self._insert_edge(e)

	# here's where we put in the suffix link to the parent of the newly created node
	# from the previous parent

	if last_parent_node > 0:
	self.nodes[last_parent_node].suffix_node = parent_node

	# Remember the parent for future suffix links

	last_parent_node = parent_node

	# if the active node is the root, then the next prefix to consider is obtained by just incrementing
	# the first_char_index; otherwise, the next prefix to consider is obtained by following a suffix
	# link. Suppose the suffix link goes from v to s(v). Write the path to v in the form XA with X a single character and A a string
	# then the path to the node s(v) is A. In other words, either way (i.e. from the root or via a suffix link)
	# we have moved to the next extension in this phase.

	if self.active.source_node_index == 0:
	self.active.ext+=1
	self.active.first_char_index += 1
	else:
	self.active.ext+=1
	self.active.source_node_index = self.nodes[self.active.source_node_index].suffix_node


	# traverse the tree along the active suffix, sucking up edges until the either
	# the suffix becomes explicit or the end of the prefix lies in the middle of an edge
	# leaving the active node

	self._canonize_suffix(self.active)

	# we get here after a "Rule 3" situation, meaning that all of "suffix" is
	# implicitly in the tree. We have to finish up by creating the last suffix link
	# and moving down the tree so the end of the suffix lies within an edge leaving
	# the active node.

	if last_parent_node > 0:
	self.nodes[last_parent_node].suffix_node = parent_node
	self.active.last_char_index += 1
	self._canonize_suffix(self.active)

	def _insert_edge(self, edge):
	self.edges[(edge.source_node_index, self.string[edge.first_char_index])] = edge
	self.nodes[edge.dest_node_index].depth=self.nodes[edge.source_node_index].depth+edge.length+1

	def _remove_edge(self, edge):
	self.edges.pop((edge.source_node_index, self.string[edge.first_char_index]))

	def _split_edge(self, edge, suffix):
	self.nodes.append(Node(-1))
	e = Edge(edge.first_char_index, edge.first_char_index + suffix.length, suffix.source_node_index, len(self.nodes) - 1)
	self.nodes[len(self.nodes)-1].parent=(suffix.source_node_index,self.string[e.first_char_index])
	self._remove_edge(edge)
	self._insert_edge(e)
	self.nodes[e.dest_node_index].suffix_node = suffix.source_node_index ### need to add node for each edge

	edge.first_char_index += suffix.length + 1
	self.nodes[edge.dest_node_index].parent=(len(self.nodes)-1,self.string[edge.first_char_index])
	edge.source_node_index = e.dest_node_index
	self._insert_edge(edge)
	return e.dest_node_index

	def _canonize_suffix(self, suffix):
	"""This canonizes the suffix, walking along its suffix string until it
	is explicit or there are no more matched nodes.
	"""
	if not suffix.explicit():
	e = self.edges[suffix.source_node_index, self.string[suffix.first_char_index]]
	if e.length <= suffix.length:
	suffix.first_char_index += e.length + 1
	suffix.source_node_index = e.dest_node_index
	self._canonize_suffix(suffix)


	# Public methods
	def find_substring(self, substring,nodelist=[]):
	"""Returns the index of substring in string or -1 if it
	is not found.
	"""
	if not substring:
	return -1
	if self.case_insensitive:
	substring = substring.lower()
	curr_node = 0
	i = 0
	while i < len(substring):
	edge = self.edges.get((curr_node, substring[i]))
	if not edge:
	return -1
	ln = min(edge.length + 1, len(substring) - i)
	if substring[i:i + ln] != self.string[edge.first_char_index:edge.first_char_index + ln]:
	return -1
	i += edge.length + 1
	curr_node = edge.dest_node_index
	return self.count_leaves_below(curr_node,nodelist)


	def count_leaves_below(self,node,nodelist):
	j=0 ; n=1
	for x in self.alphabet.keys():
	e=self.edges.get((node,x))
	if e:
	j+=self.count_leaves_below(e.dest_node_index,nodelist)
	n=0
	if n==1:
	nodelist.append(self.nodes[node].label)
	return max(n,j)


	def common_substring_nodes(self,curr_node=0,answers=[]):
	n=1; l={}
	for x in self.alphabet.keys():
	e=self.edges.get((curr_node,x))
	if e:
	l.update(self.common_substring_nodes(e.dest_node_index,answers))
	n=0
	if n==1:
	for t in self.sep_chars:
	if t in self.string[self.nodes[curr_node].label:]:
	l[t]=1
	break
	if len(l.keys())==len(self.sep_chars):
	answers.append(curr_node)
	return l


	def prefix_of_node(self,curr_node):
	prefix=''
	while True:
	n=self.nodes[curr_node].parent
	e=self.edges.get(n)
	if e:
	prefix=self.string[e.first_char_index:e.last_char_index+1]+prefix
	curr_node=e.source_node_index
	else:
	break
	return prefix

	def has_substring(self, substring):
	return self.find_substring(substring) != -1



	value,key,s="","",{}

	with open("datafile.txt","rU") as f:
	for x in f:
	if x[0]!=">":
	value=value+x.strip()
	else:
	if key!="": s[key]=value
	key=x[1:].strip()
	value=""

	big_string=''
	j=0
	sepchars=[]




	for x in s.values():
	big_string+=x
	big_string+=str(len(sepchars))
	sepchars.append(str(len(sepchars)))
	if len(sepchars)==10:
	n=SuffixTree(big_string,case_insensitive=True,sep_chars=sepchars)
	a=[]
	n.common_substring_nodes(answers=a)
	a.sort(key=lambda x: -(n.nodes[x].depth))
	b=n.prefix_of_node(a[0])
	print b.upper()
	bigstring=""
	sepchars=[]

	if len(sepchars)>0:
	n=SuffixTree(big_string,case_insensitive=True,sep_chars=sepchars)
	a=[]
	n.common_substring_nodes(answers=a)
	a.sort(key=lambda x: -(n.nodes[x].depth))
	b=n.prefix_of_node(a[0])
	print b.upper()