Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Micelle_Shape_Identifier/input_parser.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
160 lines (130 sloc)
6.05 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
@author Peter Zaffetti 2018 | |
""" | |
from logger import get_logger | |
from primatives.micelle_connection import MicelleConnection | |
from primatives.micelle import Micelle | |
from primatives.point import Point | |
from enum import Enum | |
import re | |
def parse_micelle_input_file(filename="test.dat"):
    """
    Takes a file containing a set of points that make up a Micelle and parses the file contents into a Micelle object.

    Each data line holds one point as three comma-separated values ("x, y, z"). Parentheses and all whitespace are
    stripped before parsing, and anything after a "%" on a data line is ignored. Lines that (after stripping) are empty
    or do not begin with a digit or a minus sign are skipped; this covers "%" and "#" comment lines.

    :param filename: the file to be opened and parsed.
    :return: a single-element list holding the tuple (1, Micelle) built from every point in the file.
    :raises ValueError: if a data line does not contain exactly three comma-separated values, or if one of the values
        cannot be converted to a float.
    """
    logger = get_logger(parse_micelle_input_file.__name__)
    micelle_points = []

    # The context manager guarantees the file is closed, even when a malformed line raises.
    with open(filename, "r") as input_file:
        for line in input_file:
            line = line.replace("(", "").replace(")", "")  # PDZ- remove any parens
            line = "".join(line.split())  # PDZ- Remove all whitespace in the row

            # Only lines starting with a digit or "-" carry a point. Blank lines and comment lines (% or #) fail this
            # test as well, so they need no separate check.
            if not line or not (line[0].isdigit() or line[0] == "-"):
                continue

            # Drop a trailing "%" comment, then split what is left into the coordinate fields.
            point_values = line.split("%")[0]
            comma_split_point_values = point_values.split(",")
            logger.debug(comma_split_point_values)

            if len(comma_split_point_values) != 3:
                raise ValueError("The micelle point file is not formatted properly")

            x_val, y_val, z_val = (float(value) for value in comma_split_point_values)

            # indices are implicit when the list points don't have indices. They end up being the index in the list,
            # which ends up being the current length.
            micelle_points.append(Point(len(micelle_points), x_val, y_val, z_val))

    # TODO: PDZ- figure out what the identifier should be for X,Y,Z input files. Right now it is hardcoded to 1.
    return [(1, Micelle(micelle_points))]
def parse_connection_indices_file(filename):
    """
    Takes a file containing the connection indices for a Micelle input file (not required) and generates one
    MicelleConnection per data line.

    A data line is whitespace separated: the source index, then the number of connections, then the connected indices.
    The count field is not used. Any line whose first character is not a digit is skipped, which covers "%" and "#"
    comment lines, blank lines and lines with leading whitespace.

    :param filename: the file to be opened and parsed.
    :return: a list of MicelleConnection objects, one per data line, in file order. The indices are passed to
        MicelleConnection as the strings read from the file.
    :raises RuntimeError: if a data line has only a single field.
    """
    logger = get_logger(parse_connection_indices_file.__name__)
    connections = []

    # The context manager guarantees the file is closed, even when a malformed line raises.
    with open(filename, "r") as input_file:
        for line in input_file:
            logger.debug(line)

            # skip any comment lines or empty lines or lines that don't start with a digit. Slicing (rather than
            # indexing) keeps the test safe for an empty string.
            if not line[:1].isdigit():
                continue

            split_line = line.split()

            # make sure that a connection consists of 1 source index and at least 1 other field.
            if len(split_line) <= 1:
                raise RuntimeError("The connection line: {} is malformed. It should have a source index and at least 1 other connection index.".format(line))

            # select the first point as the source index and all points from index 2 to the end for the connection
            # indices.
            # NOTE: Skip index 1 since it is just the number of connections for this index.
            connections.append(MicelleConnection(split_line[0], split_line[2:]))

    return connections
class PDBIndex(Enum):
    """
    Names the columns of an "ATOM" line in the PDB-style micelle files.

    Each value is the 1-based capture group number of the atom-line regex in parse_micelle_from_pdb_file, so a member's
    value can be passed straight to match.group().
    """

    # Record identification.
    TAG = 1  # the "ATOM" tag
    ATOM_NUM = 2  # atom num
    ATOM_NAME = 3  # atom name
    RESIDUE_NAME = 4  # residue name
    CLUSTER_SHAPE_ID = 5  # cluster shape id

    # Position of the atom.
    X_COORD = 6  # x coord
    Y_COORD = 7  # y coord
    Z_COORD = 8  # z coord

    # Cluster membership and trailing annotations.
    CLUSTER_ID = 9  # cluster id
    CLUSTER_SIZE = 10  # size of cluster
    BETA = 11  # 1st annotation (beta/temperature_factor?)
    OCCUPANCY = 12  # 2nd annotation (occupancy?)
def parse_micelle_from_pdb_file(filename="test.dat"):
    """
    Parses micelles from a file in PDB format.

    Only lines whose first field is "ATOM" contribute points; HEADER and REMARK lines and every other record are
    skipped. Each ATOM line becomes one Point, and the points are grouped by the value of the cluster id column.

    :param filename: the file containing the micelles.
    :return: a list of (cluster id, Micelle) tuples, one per distinct cluster id found in the file. The cluster id is
        the string captured from the file, not an int.
    :raises ValueError: if an ATOM line lacks one of the numeric fields that are converted with int()/float().
    """
    # The groups of the regex below are:
    #  1. the "ATOM" tag
    #  2. atom num
    #  3. atom name
    #  4. residue name
    #  5. cluster shape id (currently identified as a word but might just be digits)
    #  6. x coord
    #  7. y coord
    #  8. z coord
    #  9. cluster id
    # 10. size of cluster
    # 11. 1st annotation (beta/temperature_factor?)
    # 12. 2nd annotation (occupancy?)
    #
    # The regex below should match and group the following example line:
    # ATOM 69901 CH2C SLI 13 18.010 29.315 33.694 110 8 1 0
    #
    # The decimal point in the coordinate groups is escaped on purpose. An unescaped "." matches any character,
    # including the blank between two integer-valued coordinates, which would merge them into one group.
    atom_line_regex = re.compile(r"(\w*)\s*(\d*)\s*(\w*)\s*(\w*)\s*(\w*)\s*(-*\d*\.?\d*)\s*(-*\d*\.?\d*)\s*(-*\d*\.?\d*)\s*(\d*)\s*(\d*)\s*(\d*)\s*(\d*)")

    micelle_id_dict = dict()

    # The context manager guarantees the file is closed, even when a malformed line raises.
    with open(filename, "r") as input_file:
        for line in input_file:
            # if the line is a header or a remark, skip over it.
            if line.startswith(("HEADER", "REMARK")):
                continue

            match_results = atom_line_regex.match(line)
            if not match_results or match_results.group(PDBIndex.TAG.value) != "ATOM":
                continue

            field = match_results.group

            # The point id combines the beta column with the atom number.
            # NOTE(review): presumably this keeps ids distinct when atom numbers repeat - confirm.
            beta = int(field(PDBIndex.BETA.value))
            atom_num = int(field(PDBIndex.ATOM_NUM.value))
            point_id = (beta * 100000) + atom_num

            micelle_point = Point(
                point_id,
                float(field(PDBIndex.X_COORD.value)),
                float(field(PDBIndex.Y_COORD.value)),
                float(field(PDBIndex.Z_COORD.value)),
            )

            # get the id of the micelle which contains the point and append the point to that micelle's points
            micelle_id = field(PDBIndex.CLUSTER_ID.value)
            micelle_id_dict.setdefault(micelle_id, []).append(micelle_point)

    # convert all the lists of micelle points to objects
    # TODO: handle the id of the micelle
    return [(micelle_id, Micelle(points)) for micelle_id, points in micelle_id_dict.items()]