Source code for moldf.read_mol2

# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""Mol2 format reading.

Read a Tripos ``.mol2`` file into a dictionary of ``pandas DataFrames``. Different
categories like 'ATOM' and 'BOND' are read into different DataFrame objects.

"""
from __future__ import annotations

import os
import warnings
from collections import defaultdict

import pandas as pd  # type: ignore

IMPLEMENTED_MOL2_CATS = ["ATOM", "MOLECULE", "BOND", "HEADER"]
"""MOL2 categories that are currently implemented."""


ATOM_COL_NAMES = (
    "atom_id",  # int
    "atom_name",  # str
    "x",  # float
    "y",  # float
    "z",  # float
    "atom_type",  # str
    "subst_id",  # int, optional
    "subst_name",  # str, optional
    "charge",  # float, optional
    "status_bit",  # str, optional
)
"""MOL2 ``ATOM`` column names."""


BOND_COL_NAMES = (
    "bond_id",  # int
    "origin_atom_id",  # int
    "target_atom_id",  # int
    "bond_type",  # str
    "status_bit",  # str, optional
)
"""MOL2 ``BOND`` column names."""


[docs] def read_mol2( mol2_file: str | os.PathLike, category_names: list | None = None, ) -> dict[str, pd.DataFrame]: """Reads a ``.mol2`` file's categories into a ``dict`` of ``Pandas DataFrame`` s. Args: mol2_file(required): file name for a PDB file. category_names (optional): a list of categories as to the ``.mol2`` file format. If ``None``, [``'ATOM'``, ``'MOLECULE'``, ``'BOND'``, ``'HEADER'``] is used. Defaults to **None**. Returns: A dict of ``category_name`` as keys(s) and ``pd.DataFrame`` as values. Raises: NotImplementedError: if ``category_names`` not a subset of [``'ATOM'``, ``'MOLECULE'``, ``'BOND'``, ``'HEADER'``] """ data: dict[str, pd.DataFrame] = {} if category_names is None: category_names = ["ATOM", "MOLECULE", "BOND", "HEADER"] for category_name in category_names: if category_name not in IMPLEMENTED_MOL2_CATS: implemented = ", ".join(IMPLEMENTED_MOL2_CATS) raise NotImplementedError( f"""Only {implemented} categories are implemented for the MOL2 format. Create an issue at https://github.com/Ruibin-Liu/moldf if you want the {category_name} category to be implemented. """ ) data[category_name] = pd.DataFrame() category_block_lines: dict[str, list] = defaultdict(list) with open(mol2_file, "r", encoding="utf-8") as mol_f: line = mol_f.readline() while line: if line.startswith("#"): category_name = "HEADER" line = line.lstrip("#").strip() if line: category_block_lines[category_name].append(tuple(line.split(": "))) elif line.startswith("@<TRIPOS>"): category_name = line.strip()[9:] if category_name not in category_names: line = mol_f.readline() continue line = mol_f.readline() while line and line != "\n" and line[0] != "@": category_block_lines[category_name].append( tuple(line.strip().split()) ) line = mol_f.readline() if line and line[0] == "@": continue line = mol_f.readline() for category_name in category_names: if category_name not in category_block_lines: warnings.warn( f"The required category {category_name} is not in the file {mol2_file}.", RuntimeWarning, stacklevel=2, ) elif category_name == "ATOM": width = len(category_block_lines[category_name][0]) data[category_name] = pd.DataFrame( category_block_lines[category_name], columns=ATOM_COL_NAMES[0:width], ) data[category_name] = _set_atom_df_dtypes(data[category_name]) elif category_name == "BOND": width = len(category_block_lines[category_name][0]) data[category_name] = pd.DataFrame( category_block_lines[category_name], columns=BOND_COL_NAMES[0:width], ) data[category_name] = _set_bond_df_dtypes(data[category_name]) elif category_name == "MOLECULE": data[category_name] = _get_molecule_df(category_block_lines[category_name]) elif category_name == "HEADER": data[category_name] = _get_header_df(category_block_lines[category_name]) return data
[docs] def _get_header_df(header_lines: list[tuple]) -> pd.DataFrame: """Turns the ``HEADER`` lines into a ``Pandas DataFrame``. The ``HEADER`` lines are those starting with one or multiple ``#`` symbols. Args: header_lines (required): a list of tuples corresponding to each line's content. Tuples are generate by splitting the lines by ``:``. Returns: ``Pandas DataFrame`` of The ``HEADER`` category """ header_attrs: dict[str, list[str]] = defaultdict(list) n_no_name = 0 for header_line in header_lines: header_line = tuple([i.strip() for i in header_line]) if len(header_line) == 1: header_attrs[f"info_{n_no_name}"] = [header_line[0]] elif len(header_line) == 2: header_attrs[header_line[0]] = [header_line[1]] else: message = f"The line {header_line} has > 2 items separated" message += " by ':'. moldf uses the first as column name" message += " and concatenate the rest as value." warnings.warn( message, RuntimeWarning, stacklevel=2, ) value = ";".join(header_line[1:]) header_attrs[header_line[0]] = [value] return pd.DataFrame(header_attrs)
[docs] def _get_molecule_df(molecule_lines: list[tuple]) -> pd.DataFrame: """Turns the ``MOLECULE`` lines into a ``Pandas DataFrame``. Args: molecule_lines (required): a list of tuples corresponding to each line's content. Returns: ``Pandas DataFrame`` of The ``MOLECULE`` category """ molecule_attrs: dict[str, list[str] | list[int]] = {} line_0 = {"mol_name": [" ".join(molecule_lines[0])]} line_1_names = ["num_atoms", "num_bonds", "num_subst", "num_feat", "num_sets"] line_1 = { name: [int(value)] for name, value in zip(line_1_names, molecule_lines[1]) } line_2 = {"mol_type": [molecule_lines[2][0]]} line_3 = {"charge_type": [molecule_lines[3][0]]} molecule_attrs = {**line_0, **line_1, **line_2, **line_3} if len(molecule_lines) > 4: line_4 = {"status_bits": [molecule_lines[4][0]]} molecule_attrs = {**molecule_attrs, **line_4} if len(molecule_lines) > 5: line_5 = {"mol_comment": [molecule_lines[5][0]]} molecule_attrs = {**molecule_attrs, **line_5} return pd.DataFrame(molecule_attrs)
[docs] def _set_atom_df_dtypes(data_df: pd.DataFrame) -> pd.DataFrame: """Sets the data types for the ``ATOM`` category. Args: data_df (required): original ``Pandas DataFrame`` for the ``ATOM`` category with all strings. Returns: ``Pandas DataFrame`` of The ``ATOM`` category """ data_df[["atom_id", "x", "y", "z"]] = data_df[["atom_id", "x", "y", "z"]].astype( {"atom_id": "int32", "x": "float32", "y": "float32", "z": "float32"} ) if "subst_id" in data_df.columns: data_df["subst_id"] = data_df["subst_id"].astype("int32") if "charge" in data_df.columns: data_df["charge"] = data_df["charge"].astype("float32") return data_df
[docs] def _set_bond_df_dtypes(data_df: pd.DataFrame) -> pd.DataFrame: """Sets the data types for the ``BOND`` category Args: data_df (required): original ``Pandas DataFrame`` for the ``BOND`` category with all strings. Returns: ``Pandas DataFrame`` of The ``BOND`` category """ data_df[["bond_id", "origin_atom_id", "target_atom_id"]] = data_df[ ["bond_id", "origin_atom_id", "target_atom_id"] ].astype({"bond_id": "int32", "origin_atom_id": "int32", "target_atom_id": "int32"}) return data_df