# input_parser.py

#!/usr/bin/env python
"""
@author Peter Zaffetti 2018
"""
from logger import get_logger
from primatives.micelle_connection import MicelleConnection
from primatives.micelle import Micelle
from primatives.point import Point
from enum import Enum
import re


def parse_micelle_input_file(filename="test.dat"):
    """
    Parses a file of comma separated X,Y,Z points into a single Micelle.

    Parentheses and all whitespace are removed from each row before it is interpreted. Rows that are blank, that
    start with a comment marker (% or #), or that do not start with a digit or a minus sign are skipped. Anything
    following a % or # on a data row is treated as a trailing comment and ignored.

    :param filename: the file to be opened and parsed.
    :return: a single-element list holding an (identifier, Micelle) tuple; the identifier is currently always 1.
    :raises ValueError: if a data row does not hold exactly three comma separated values, or a value is not a number.
    """
    logger = get_logger(parse_micelle_input_file.__name__)
    micelle_points = []

    # the context manager guarantees the file is closed, even when a malformed row raises.
    with open(filename, "r") as input_file:
        for line in input_file:
            line = line.replace("(", "").replace(")", "")  # PDZ- remove any parens
            line = "".join(line.split())  # PDZ- Remove all whitespace in the row

            # skip blank rows and rows which do not begin with a digit or a minus sign; this also covers rows that
            # begin with a comment marker (% or #).
            if not line or not (line[0].isdigit() or line[0] == "-"):
                continue

            # drop a trailing comment introduced by either marker before splitting out the coordinates.
            point_values = line.partition("%")[0].partition("#")[0]
            coordinates = point_values.split(",")
            logger.debug(coordinates)

            if len(coordinates) != 3:
                raise ValueError("The micelle point file is not formatted properly")

            x_val, y_val, z_val = (float(value) for value in coordinates)
            # indices are implicit when the list points don't have indices. They end up being the index in the list,
            # which ends up being the current length.
            micelle_points.append(Point(len(micelle_points), x_val, y_val, z_val))

    # TODO: PDZ- figure out what the identifier should be for X,Y,Z input files. Right now it is hardcoded to 1.
    return [(1, Micelle(micelle_points))]


def parse_connection_indices_file(filename):
    """
    Parses a file listing, for each source point index, the indices of the points it is connected to (the file is
    not required for a Micelle input file).

    Each data row is whitespace separated and is read as: the source index, the number of connections, and then
    the connected indices. Rows that are blank, or whose first non-whitespace character is not a digit (which
    includes % and # comment rows), are skipped. Indices are passed on as the strings read from the file.

    :param filename: the file to be opened and parsed.
    :return: a list with one MicelleConnection per data row, in file order.
    :raises RuntimeError: if a data row holds only a single value.
    """
    logger = get_logger(parse_connection_indices_file.__name__)
    connections = []

    # the context manager guarantees the file is closed, even when a malformed row raises.
    with open(filename, "r") as input_file:
        for line in input_file:
            logger.debug(line)
            split_line = line.split()

            # Skip blank rows and rows that do not start with a digit, which covers % and # comment rows. The test
            # is made on the first token rather than the first character of the raw row so that an indented data
            # row is parsed instead of being silently dropped.
            if not split_line or not split_line[0][0].isdigit():
                continue

            # make sure that a connection consists of 1 source index and at least 1 other value.
            # NOTE(review): a row holding only a source index and a count passes this check and produces a
            # connection without any connected indices - confirm whether that is intended.
            if len(split_line) <= 1:
                raise RuntimeError("The connection line: {} is malformed. It should have a source index and at least 1 other connection index.".format(line))

            # select the first value as the source index and all values from index 2 to the end as the connection
            # indices. NOTE: Skip index 1 since it is just the number of connections for this index.
            connections.append(MicelleConnection(split_line[0], split_line[2:]))

    return connections


class PDBIndex(Enum):
    """
    Names the capture groups of the ATOM line regex built in parse_micelle_from_pdb_file.

    Each member's value is the group number that is handed to the regex match object's group() call.
    """

    # record descriptors
    TAG = 1  # the "ATOM" tag
    ATOM_NUM = 2
    ATOM_NAME = 3
    RESIDUE_NAME = 4
    CLUSTER_SHAPE_ID = 5

    # position of the atom
    X_COORD = 6
    Y_COORD = 7
    Z_COORD = 8

    # cluster the atom belongs to
    CLUSTER_ID = 9
    CLUSTER_SIZE = 10

    # trailing annotation columns
    BETA = 11  # 1st annotation (beta/temperature_factor?)
    OCCUPANCY = 12  # 2nd annotation (occupancy?)


def parse_micelle_from_pdb_file(filename="test.dat"):
    """
    Parses micelles from a file in PDB format.

    Only rows whose first word is ATOM are used. Each such row becomes a Point and the points are grouped into
    micelles by the cluster id column of the row.

    :param filename: the file containing the micelles.
    :return: a list of (cluster id, Micelle) tuples, one per distinct cluster id. The cluster id is kept as the
             string read from the file.
    :raises ValueError: if an ATOM row is missing one of the numeric columns that are read here.
    """
    # The groups of the regex below are:
    #  1. the "ATOM" tag
    #  2. atom num
    #  3. atom name
    #  4. residue name
    #  5. cluster shape id (currently identified as a word but might just be digits)
    #  6. x coord
    #  7. y coord
    #  8. z coord
    #  9. cluster id
    #  10. size of cluster
    #  11. 1st annotation (beta/temperature_factor?)
    #  12. 2nd annotation (occupancy?)
    #
    # The regex below should match and group the following example line:
    # ATOM  69901 CH2C SLI  13        18.010  29.315  33.694   110     8           1 0
    #
    # The decimal point of each coordinate is escaped. Left unescaped it matches any character, so a coordinate
    # without a fractional part could swallow the separating space and the digits of the next column.
    atom_line_regex = re.compile(r"(\w*)\s*(\d*)\s*(\w*)\s*(\w*)\s*(\w*)\s*(-*\d*\.?\d*)\s*(-*\d*\.?\d*)\s*(-*\d*\.?\d*)\s*(\d*)\s*(\d*)\s*(\d*)\s*(\d*)")

    micelle_id_dict = dict()

    # the context manager guarantees the file is closed, even when a malformed row raises.
    with open(filename, "r") as input_file:
        for line in input_file:
            # if the line is a header or a remark, skip over it.
            if line.startswith(("HEADER", "REMARK")):
                continue

            match_results = atom_line_regex.match(line)
            if not match_results or match_results.group(PDBIndex.TAG.value) != "ATOM":
                continue

            beta = int(match_results.group(PDBIndex.BETA.value))
            num = int(match_results.group(PDBIndex.ATOM_NUM.value))
            # presumably beta counts how often the atom number column has wrapped around, which would make the
            # combined id unique across the file - TODO confirm.
            point_id = (beta * 100000) + num
            micelle_point = Point(
                point_id,
                float(match_results.group(PDBIndex.X_COORD.value)),
                float(match_results.group(PDBIndex.Y_COORD.value)),
                float(match_results.group(PDBIndex.Z_COORD.value)),
            )

            # get the id of the micelle which contains the point and append the point to the points collected for
            # that id, starting a new list the first time the id is seen.
            micelle_id = match_results.group(PDBIndex.CLUSTER_ID.value)
            micelle_id_dict.setdefault(micelle_id, []).append(micelle_point)

    # convert all the lists of micelle points to objects
    # TODO: handle the id of the micelle
    return [(micelle_id, Micelle(points)) for micelle_id, points in micelle_id_dict.items()]
	#!/usr/bin/env python
	"""
	@author Peter Zaffetti 2018
	"""
	from logger import get_logger
	from primatives.micelle_connection import MicelleConnection
	from primatives.micelle import Micelle
	from primatives.point import Point
	from enum import Enum
	import re


	def parse_micelle_input_file(filename="test.dat"):
	    """
	    Parses a file of comma separated X,Y,Z points into a single Micelle.

	    Parentheses and all whitespace are removed from each row before it is interpreted. Rows that are blank, that
	    start with a comment marker (% or #), or that do not start with a digit or a minus sign are skipped. Anything
	    following a % or # on a data row is treated as a trailing comment and ignored.

	    :param filename: the file to be opened and parsed.
	    :return: a single-element list holding an (identifier, Micelle) tuple; the identifier is currently always 1.
	    :raises ValueError: if a data row does not hold exactly three comma separated values, or a value is not a number.
	    """
	    logger = get_logger(parse_micelle_input_file.__name__)
	    micelle_points = []

	    # the context manager guarantees the file is closed, even when a malformed row raises.
	    with open(filename, "r") as input_file:
	        for line in input_file:
	            line = line.replace("(", "").replace(")", "")  # PDZ- remove any parens
	            line = "".join(line.split())  # PDZ- Remove all whitespace in the row

	            # skip blank rows and rows which do not begin with a digit or a minus sign; this also covers rows that
	            # begin with a comment marker (% or #).
	            if not line or not (line[0].isdigit() or line[0] == "-"):
	                continue

	            # drop a trailing comment introduced by either marker before splitting out the coordinates.
	            point_values = line.partition("%")[0].partition("#")[0]
	            coordinates = point_values.split(",")
	            logger.debug(coordinates)

	            if len(coordinates) != 3:
	                raise ValueError("The micelle point file is not formatted properly")

	            x_val, y_val, z_val = (float(value) for value in coordinates)
	            # indices are implicit when the list points don't have indices. They end up being the index in the list,
	            # which ends up being the current length.
	            micelle_points.append(Point(len(micelle_points), x_val, y_val, z_val))

	    # TODO: PDZ- figure out what the identifier should be for X,Y,Z input files. Right now it is hardcoded to 1.
	    return [(1, Micelle(micelle_points))]


	def parse_connection_indices_file(filename):
	    """
	    Parses a file listing, for each source point index, the indices of the points it is connected to (the file is
	    not required for a Micelle input file).

	    Each data row is whitespace separated and is read as: the source index, the number of connections, and then
	    the connected indices. Rows that are blank, or whose first non-whitespace character is not a digit (which
	    includes % and # comment rows), are skipped. Indices are passed on as the strings read from the file.

	    :param filename: the file to be opened and parsed.
	    :return: a list with one MicelleConnection per data row, in file order.
	    :raises RuntimeError: if a data row holds only a single value.
	    """
	    logger = get_logger(parse_connection_indices_file.__name__)
	    connections = []

	    # the context manager guarantees the file is closed, even when a malformed row raises.
	    with open(filename, "r") as input_file:
	        for line in input_file:
	            logger.debug(line)
	            split_line = line.split()

	            # Skip blank rows and rows that do not start with a digit, which covers % and # comment rows. The test
	            # is made on the first token rather than the first character of the raw row so that an indented data
	            # row is parsed instead of being silently dropped.
	            if not split_line or not split_line[0][0].isdigit():
	                continue

	            # make sure that a connection consists of 1 source index and at least 1 other value.
	            # NOTE(review): a row holding only a source index and a count passes this check and produces a
	            # connection without any connected indices - confirm whether that is intended.
	            if len(split_line) <= 1:
	                raise RuntimeError("The connection line: {} is malformed. It should have a source index and at least 1 other connection index.".format(line))

	            # select the first value as the source index and all values from index 2 to the end as the connection
	            # indices. NOTE: Skip index 1 since it is just the number of connections for this index.
	            connections.append(MicelleConnection(split_line[0], split_line[2:]))

	    return connections


	class PDBIndex(Enum):
	    """
	    Names the capture groups of the ATOM line regex built in parse_micelle_from_pdb_file.

	    Each member's value is the group number that is handed to the regex match object's group() call.
	    """

	    TAG = 1  # the "ATOM" tag
	    ATOM_NUM = 2
	    ATOM_NAME = 3
	    RESIDUE_NAME = 4
	    CLUSTER_SHAPE_ID = 5
	    X_COORD = 6
	    Y_COORD = 7
	    Z_COORD = 8
	    CLUSTER_ID = 9
	    CLUSTER_SIZE = 10
	    BETA = 11  # 1st annotation (beta/temperature_factor?)
	    OCCUPANCY = 12  # 2nd annotation (occupancy?)


	def parse_micelle_from_pdb_file(filename="test.dat"):
	    """
	    Parses micelles from a file in PDB format.

	    Only rows whose first word is ATOM are used. Each such row becomes a Point and the points are grouped into
	    micelles by the cluster id column of the row.

	    :param filename: the file containing the micelles.
	    :return: a list of (cluster id, Micelle) tuples, one per distinct cluster id. The cluster id is kept as the
	             string read from the file.
	    :raises ValueError: if an ATOM row is missing one of the numeric columns that are read here.
	    """
	    # The groups of the regex below are:
	    #  1. the "ATOM" tag
	    #  2. atom num
	    #  3. atom name
	    #  4. residue name
	    #  5. cluster shape id (currently identified as a word but might just be digits)
	    #  6. x coord
	    #  7. y coord
	    #  8. z coord
	    #  9. cluster id
	    #  10. size of cluster
	    #  11. 1st annotation (beta/temperature_factor?)
	    #  12. 2nd annotation (occupancy?)
	    #
	    # The regex below should match and group the following example line:
	    # ATOM  69901 CH2C SLI  13        18.010  29.315  33.694   110     8           1 0
	    #
	    # Every group and separator carries a repetition quantifier so that multi character columns and runs of
	    # spaces are matched. The decimal point of each coordinate is escaped. Left unescaped it matches any
	    # character, so a coordinate without a fractional part could swallow the separating space and the digits
	    # of the next column.
	    atom_line_regex = re.compile(r"(\w*)\s*(\d*)\s*(\w*)\s*(\w*)\s*(\w*)\s*(-*\d*\.?\d*)\s*(-*\d*\.?\d*)\s*(-*\d*\.?\d*)\s*(\d*)\s*(\d*)\s*(\d*)\s*(\d*)")

	    micelle_id_dict = dict()

	    # the context manager guarantees the file is closed, even when a malformed row raises.
	    with open(filename, "r") as input_file:
	        for line in input_file:
	            # if the line is a header or a remark, skip over it.
	            if line.startswith(("HEADER", "REMARK")):
	                continue

	            match_results = atom_line_regex.match(line)
	            if not match_results or match_results.group(PDBIndex.TAG.value) != "ATOM":
	                continue

	            beta = int(match_results.group(PDBIndex.BETA.value))
	            num = int(match_results.group(PDBIndex.ATOM_NUM.value))
	            # presumably beta counts how often the atom number column has wrapped around, which would make the
	            # combined id unique across the file - TODO confirm.
	            point_id = (beta * 100000) + num
	            micelle_point = Point(
	                point_id,
	                float(match_results.group(PDBIndex.X_COORD.value)),
	                float(match_results.group(PDBIndex.Y_COORD.value)),
	                float(match_results.group(PDBIndex.Z_COORD.value)),
	            )

	            # get the id of the micelle which contains the point and append the point to the points collected for
	            # that id, starting a new list the first time the id is seen.
	            micelle_id = match_results.group(PDBIndex.CLUSTER_ID.value)
	            micelle_id_dict.setdefault(micelle_id, []).append(micelle_point)

	    # convert all the lists of micelle points to objects
	    # TODO: handle the id of the micelle
	    return [(micelle_id, Micelle(points)) for micelle_id, points in micelle_id_dict.items()]