# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""Mol2 format reading.
Read a Tripos ``.mol2`` file into a dictionary of ``pandas DataFrames``. Different
categories like 'ATOM' and 'BOND' are read into different DataFrame objects.
"""
from __future__ import annotations
import os
import warnings
from collections import defaultdict
import pandas as pd # type: ignore
IMPLEMENTED_MOL2_CATS = ["ATOM", "MOLECULE", "BOND", "HEADER"]
"""Names of the MOL2 categories this module is able to read."""

# Column order follows the order of the fields on an ``ATOM`` record line.
# Trailing fields are optional; readers slice this tuple to the number of
# fields actually present in the file.
ATOM_COL_NAMES = (
    "atom_id",  # cast to int32
    "atom_name",  # kept as str
    "x",  # cast to float32
    "y",  # cast to float32
    "z",  # cast to float32
    "atom_type",  # kept as str
    "subst_id",  # optional, cast to int32
    "subst_name",  # optional, kept as str
    "charge",  # optional, cast to float32
    "status_bit",  # optional, kept as str
)
"""Column names of the MOL2 ``ATOM`` category, in record order."""

# Column order follows the order of the fields on a ``BOND`` record line.
BOND_COL_NAMES = (
    "bond_id",  # cast to int32
    "origin_atom_id",  # cast to int32
    "target_atom_id",  # cast to int32
    "bond_type",  # kept as str
    "status_bit",  # optional, kept as str
)
"""Column names of the MOL2 ``BOND`` category, in record order."""
[docs]
def _collect_mol2_block_lines(
    mol2_file: str | os.PathLike,
    category_names: list,
) -> dict[str, list]:
    """Scans a ``.mol2`` file once and groups its raw lines by category.

    Args:
        mol2_file (required): path of the ``.mol2`` file to scan.
        category_names (required): the ``@<TRIPOS>`` categories to collect.

    Returns:
        A dict mapping a category name to a list of tuples, one tuple of
        whitespace-separated fields per line. ``#`` lines found outside of a
        collected block are stored under ``'HEADER'`` as tuples split on
        ``': '``. Categories without any collected line are absent.
    """
    block_lines: dict[str, list] = defaultdict(list)
    current: str | None = None  # requested category being collected, if any
    with open(mol2_file, "r", encoding="utf-8") as mol_f:
        for line in mol_f:
            if current is not None:
                if not line.strip():
                    # A blank (or whitespace-only) line closes the block; it
                    # must not be stored as an empty record.
                    current = None
                    continue
                if line[0] != "@":
                    block_lines[current].append(tuple(line.split()))
                    continue
                # An '@' line closes the block and is itself examined below.
                current = None
            if line.startswith("#"):
                text = line.lstrip("#").strip()
                if text:
                    block_lines["HEADER"].append(tuple(text.split(": ")))
            elif line.startswith("@<TRIPOS>"):
                name = line.strip()[9:]  # drop the '@<TRIPOS>' prefix
                if name in category_names:
                    current = name
            # Any other line outside of a collected block is ignored.
    return dict(block_lines)


def read_mol2(
    mol2_file: str | os.PathLike,
    category_names: list | None = None,
) -> dict[str, pd.DataFrame]:
    """Reads a ``.mol2`` file's categories into a ``dict`` of ``Pandas DataFrame`` s.

    A category block starts at its ``@<TRIPOS>`` line and ends at the next
    blank line, the next line starting with ``@``, or the end of the file.
    Lines starting with ``#`` outside of a block are read as ``HEADER`` lines.

    Args:
        mol2_file (required): file name for a MOL2 file.
        category_names (optional): a list of categories as to the ``.mol2`` file format.
            If ``None``, [``'ATOM'``, ``'MOLECULE'``, ``'BOND'``, ``'HEADER'``] is used.
            Defaults to **None**.

    Returns:
        A dict of ``category_name`` as key(s) and ``pd.DataFrame`` as values.
        A requested category that is missing from the file maps to an empty
        ``pd.DataFrame`` and a ``RuntimeWarning`` is issued for it.

    Raises:
        NotImplementedError: if ``category_names`` not a subset of
            [``'ATOM'``, ``'MOLECULE'``, ``'BOND'``, ``'HEADER'``]
    """
    data: dict[str, pd.DataFrame] = {}
    if category_names is None:
        # Copy so the module-level constant is never handed out for mutation.
        category_names = list(IMPLEMENTED_MOL2_CATS)
    for category_name in category_names:
        if category_name not in IMPLEMENTED_MOL2_CATS:
            implemented = ", ".join(IMPLEMENTED_MOL2_CATS)
            raise NotImplementedError(
                f"Only {implemented} categories are implemented for the MOL2 format.\n"
                "Create an issue at https://github.com/Ruibin-Liu/moldf if\n"
                f"you want the {category_name} category to be implemented.\n"
            )
        data[category_name] = pd.DataFrame()

    category_block_lines = _collect_mol2_block_lines(mol2_file, category_names)

    for category_name in category_names:
        if category_name not in category_block_lines:
            warnings.warn(
                f"The required category {category_name} is not in the file {mol2_file}.",
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        rows = category_block_lines[category_name]
        if category_name == "ATOM":
            # Optional trailing columns may be absent; the first record
            # decides how many column names are used.
            atom_df = pd.DataFrame(rows, columns=ATOM_COL_NAMES[0 : len(rows[0])])
            data[category_name] = _set_atom_df_dtypes(atom_df)
        elif category_name == "BOND":
            bond_df = pd.DataFrame(rows, columns=BOND_COL_NAMES[0 : len(rows[0])])
            data[category_name] = _set_bond_df_dtypes(bond_df)
        elif category_name == "MOLECULE":
            data[category_name] = _get_molecule_df(rows)
        elif category_name == "HEADER":
            data[category_name] = _get_header_df(rows)
    return data
[docs]
def _get_molecule_df(molecule_lines: list[tuple]) -> pd.DataFrame:
    """Turns the ``MOLECULE`` lines into a ``Pandas DataFrame``.

    The lines are interpreted by position: molecule name, the counts line,
    molecule type, charge type and, when present, status bits and a comment.

    Args:
        molecule_lines (required): a list of tuples corresponding to
            each line's content (whitespace-separated fields). At least
            the first four lines must be present.

    Returns:
        A one-row ``Pandas DataFrame`` of the ``MOLECULE`` category with the
        columns ``mol_name``, the available counts (``num_atoms``, ``num_bonds``,
        ``num_subst``, ``num_feat``, ``num_sets``), ``mol_type``, ``charge_type``
        and, if present in the input, ``status_bits`` and ``mol_comment``.

    Raises:
        IndexError: if fewer than four lines are given.
        ValueError: if a value on the counts line is not an integer.
    """
    count_names = ("num_atoms", "num_bonds", "num_subst", "num_feat", "num_sets")

    # The name line was split on whitespace by the reader; join it back.
    molecule_attrs: dict[str, list] = {"mol_name": [" ".join(molecule_lines[0])]}
    # ``zip`` stops at the shorter input, so missing trailing counts are
    # simply left out.
    for name, value in zip(count_names, molecule_lines[1]):
        molecule_attrs[name] = [int(value)]
    molecule_attrs["mol_type"] = [molecule_lines[2][0]]
    molecule_attrs["charge_type"] = [molecule_lines[3][0]]
    if len(molecule_lines) > 4:
        molecule_attrs["status_bits"] = [molecule_lines[4][0]]
    if len(molecule_lines) > 5:
        # The comment is free text: keep every word, not only the first one.
        molecule_attrs["mol_comment"] = [" ".join(molecule_lines[5])]
    return pd.DataFrame(molecule_attrs)
[docs]
def _set_atom_df_dtypes(data_df: pd.DataFrame) -> pd.DataFrame:
    """Casts the numeric ``ATOM`` columns from strings to numbers.

    ``atom_id``, ``x``, ``y`` and ``z`` are always converted; ``subst_id``
    and ``charge`` are converted only when those columns exist. All other
    columns are left as they are.

    Args:
        data_df (required): ``Pandas DataFrame`` of the ``ATOM`` category
            whose values are all strings. It is modified in place.

    Returns:
        The same ``Pandas DataFrame`` object with the converted columns.
    """
    mandatory_dtypes = {
        "atom_id": "int32",
        "x": "float32",
        "y": "float32",
        "z": "float32",
    }
    mandatory_cols = list(mandatory_dtypes)
    data_df[mandatory_cols] = data_df[mandatory_cols].astype(mandatory_dtypes)

    # These columns are optional in the ATOM record, so cast them only if present.
    for col, dtype in (("subst_id", "int32"), ("charge", "float32")):
        if col in data_df.columns:
            data_df[col] = data_df[col].astype(dtype)
    return data_df
[docs]
def _set_bond_df_dtypes(data_df: pd.DataFrame) -> pd.DataFrame:
    """Casts the id columns of the ``BOND`` category to ``int32``.

    Args:
        data_df (required): ``Pandas DataFrame`` of the ``BOND`` category
            whose values are all strings. It is modified in place.

    Returns:
        The same ``Pandas DataFrame`` object, with ``bond_id``,
        ``origin_atom_id`` and ``target_atom_id`` converted to ``int32``.
    """
    id_columns = ["bond_id", "origin_atom_id", "target_atom_id"]
    int_dtypes = dict.fromkeys(id_columns, "int32")
    data_df[id_columns] = data_df[id_columns].astype(int_dtypes)
    return data_df