Source code for moldf.read_pdbx

# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""PDBx/mmCIF format reading.

Reads a PDBx file into a dict of ``Pandas DataFrame`` s.
The dict keys are read from the mmCIF category names automatically.
For each category, the column names of the corresponding ``DataFrame``
are read form the category attributes automatically.

For example:
::

  _audit_conform.dict_name       mmcif_pdbx.dic
  _audit_conform.dict_version    5.355
  _audit_conform.dict_location   http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx.dic

In this category, the category name is ``_audit_conform``, so the returned dict
key is ``_audit_conform``. The category attributes are ``dict_name``,
``dict_version``, and ``dict_location``, so the returned dict value as a
``DataFrame`` has the exact column names.
"""
from __future__ import annotations

import io
import os
import urllib.request
from collections import defaultdict
from pathlib import Path

import pandas as pd  # type: ignore

from .split_line import split_line

AF2_MODEL = 4
"""For AlphaFold structures, the version to use."""



[docs]
def read_pdbx(
    pdbx_file: str | os.PathLike[str] | None = None,
    pdb_id: str | None = None,
    save_pdbx_file: bool = True,
    pdbx_file_dir: str | os.PathLike | None = None,
    category_names: list | None = None,
    convert_dtype: bool = False,
) -> dict[str, pd.DataFrame]:
    """Reads a ``.cif`` file's categories into a ``dict`` of ``Pandas DataFrame`` s.

    Args:
        pdb_id (optional): PDB/Uniprot ID. Required if ``pdbx_file`` is ``None``.
            Defaults to **None**.
        pdbx_file (optional): file name for a PDBx/mmCIF file. Used over `pdb_id`.
            Defaults to **None**.
        category_names (optional): a list of categories in the mmCIF file format.
            If ``None``, ``all`` is used and all categories will be processed.
            Defaults to **None**.
        save_pdbx_file (optional): whether to save the fetched PDBx file from RCSB
            to ``pdbx_file_dir``. Defaults to **False**.
        pdbx_file_dir (optional): directory to save fetched PDBx files. If ``None`` but
            ``save_pdbx_file`` is ``True``, './PDBx_files' is used.
            Defaults to **None**.
        convert_dtype (optional): whether to convert the data types according to the
            RCSB mmcif specifications. Defaults to **False**.

    Returns:
        A dict of ``Pandas DataFrame`` s corresponding to required categories.

    Raises:
        ValueError: if none of ``pdb_id`` or ``pdbx_file`` is provided, or if ``pdb_id``
            is given but cannot the PDB file cannot be downloaded from RCSB, or the
            PDB file is corrupted like no end-line symbol, or some content is irregular.
        FileNotFoundError: if ``pdbx_file`` cannot be found.
    """
    data: dict[str, pd.DataFrame] = {}
    if pdb_id is None and pdbx_file is None:
        raise ValueError("At least one of pdb_id and pdbx_file has to be given.")
    elif pdbx_file is None:
        pdb_id = str(pdb_id).upper()

        if pdbx_file_dir is None:
            pdbx_file_dir = "./PDBx_files"
        pdbx_file_dir = Path(pdbx_file_dir)
        file_path_1 = Path(pdbx_file_dir, f"{pdb_id}.cif")
        file_path_2 = Path(pdbx_file_dir, f"{pdb_id.lower()}.cif")
        if os.path.exists(file_path_1):
            pdbx_file_handle: io.TextIOWrapper | io.StringIO = open(
                file_path_1, "r", encoding="utf-8"
            )
        elif os.path.exists(file_path_2):
            pdbx_file_handle = open(file_path_2, "r", encoding="utf-8")
        else:
            if len(pdb_id) == 4:
                pdbx_file_url = f"https://files.rcsb.org/view/{pdb_id}.cif"
            elif len(pdb_id) < 4:
                pdbx_file_url = f"https://files.rcsb.org/ligands/view/{pdb_id}.cif"
            else:
                pdbx_file_url = f"https://alphafold.ebi.ac.uk/files/AF-{pdb_id}-F1-model_v{AF2_MODEL}.cif"
            try:
                with urllib.request.urlopen(pdbx_file_url) as response:
                    raw_data = response.read()
                text = raw_data.decode("utf-8")
                pdbx_file_handle = io.StringIO(text)
                if save_pdbx_file:
                    if not pdbx_file_dir.exists():
                        pdbx_file_dir.mkdir(parents=True, exist_ok=True)
                    file_path = Path(pdbx_file_dir, f"{pdb_id}.cif")
                    with open(file_path, "w", encoding="utf-8") as p_file:
                        p_file.write(text)
            except urllib.error.HTTPError as http_error:
                raise ValueError(
                    f"Cannot download PDBx file from url {pdbx_file_url}."
                ) from http_error
    else:
        pdbx_file = Path(pdbx_file)
        if not pdbx_file.exists():
            raise FileNotFoundError(f"File {pdbx_file} not found.")
        pdbx_file_handle = open(pdbx_file, "r", encoding="utf-8")
    if not category_names:
        category_names = ["all"]
    category_name = ""
    with pdbx_file_handle:
        line = pdbx_file_handle.readline()  # title line
        _ = line.strip().split("_")[1]
        processing_category = False
        while line:
            if line[0] == "#":
                processing_category = True
                loop_category = False
                category_lines = ""
                category_cols = []
                category_dict = defaultdict(list)
                line = pdbx_file_handle.readline()  # category first line
                if not line:
                    break
                if line == "loop_\n":
                    loop_category = True
                    line = pdbx_file_handle.readline()
                in_multiple_line = (
                    False  # lines quoted by ';', they need to be specially treated
                )
                while line.strip() not in ["#", "#   #", "##"]:
                    if line[0] == "_" and not in_multiple_line:
                        category_name, category_nth_col = (
                            line.strip().split(" ")[0].split(".")
                        )
                        category_cols.append(category_nth_col)
                    if (
                        category_name in category_names or "all" in category_names
                    ):  # in a required category
                        if line[0] == ";":
                            in_multiple_line = True  # a ';' quoted record begins
                            line = line.replace(
                                '"', "'"
                            )  # to avoid conflict with the next action
                            line = '"' + line[1:]
                            # these ; symbols are not contents but a pair of container
                            # symbols for a single record
                            # but other ';' are parts of a single record, so we use "
                            # to make it easier to parse
                            if len(line.strip()) == 1:
                                in_multiple_line = False  # a ';' quoted record ends
                        elif in_multiple_line:
                            line = line.replace(
                                '"', "'"
                            )  # to avoid conflict with the above action
                        category_lines += line
                    line = pdbx_file_handle.readline()
                    if not line:
                        raise ValueError(
                            f"{pdbx_file} not normally ended. Download it again?"
                        )
                if "all" not in category_names and category_name not in category_names:
                    continue
                category_lines = category_lines.replace(
                    "\n", " "
                )  # cannot rely on newline to parse anyway.
                records = split_line(category_lines)
                if not loop_category:
                    if len(records) != 2 * len(category_cols):
                        raise ValueError(
                            f"{pdbx_file} category {category_name} has irregular contents: {records}"  # noqa
                        )
                    for i, rec in enumerate(records):
                        if i % 2 == 0:
                            col_name = rec.split(".")[1]
                            if col_name in [
                                "pdbx_seq_one_letter_code",
                                "pdbx_seq_one_letter_code_can",
                            ]:
                                records[i + 1] = records[i + 1].replace(" ", "")
                            category_dict[rec.split(".")[1]] = [records[i + 1]]
                else:
                    if (
                        records[len(category_cols)][0] == "_"
                        or records[len(category_cols) - 1][0] != "_"
                        or not len(records) % len(category_cols) == 0
                    ):
                        raise ValueError(
                            f"""{pdbx_file} category {category_name} has irregular contents:
                            {len(records)} for {len(category_cols)}"""  # noqa
                        )
                    record_data = records[len(category_cols) :]
                    for i, i_data in enumerate(record_data):
                        col = i % len(category_cols)
                        if category_cols[col] in [
                            "pdbx_seq_one_letter_code",
                            "pdbx_seq_one_letter_code_can",
                        ]:
                            i_data = i_data.replace(
                                " ", ""
                            )  # delete the gaps in the one_letter_code
                        category_dict[category_cols[col]].append(i_data)
                data[category_name] = pd.DataFrame(category_dict)
                for c_name in category_names:
                    if c_name not in data:
                        break
                        # at least one required_category is not processed,
                        # continue the main while loop
                else:
                    break
                    # all required_categories are processed,
                    # break the main while loop

            if not processing_category:
                line = pdbx_file_handle.readline()
    if "_atom_site" in data and convert_dtype:
        col_dtypes = {
            "id": "int",
            "label_seq_id": "int",
            "Cartn_x": "float",
            "Cartn_y": "float",
            "Cartn_z": "float",
            "occupancy": "float",
            "B_iso_or_equiv": "float",
            "auth_seq_id": "int",
            "pdbx_PDB_model_num": "int",
            "pdbx_sifts_xref_db_num": "int",
        }
        col_dtypes = {
            col_name: data_type
            for col_name, data_type in col_dtypes.items()
            if col_name in data["_atom_site"]
        }
        data["_atom_site"] = data["_atom_site"].astype(col_dtypes)

    return data