Source code for moldf.read_pdbx

# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""PDBx/mmCIF format reading.

Reads a PDBx file into a dict of ``Pandas DataFrame`` s.
The dict keys are read from the mmCIF category names automatically.
For each category, the column names of the corresponding ``DataFrame``
are read from the category attributes automatically.

For example:
::

  _audit_conform.dict_name       mmcif_pdbx.dic
  _audit_conform.dict_version    5.355
  _audit_conform.dict_location   http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx.dic

In this category, the category name is ``_audit_conform``, so the returned dict
key is ``_audit_conform``. The category attributes are ``dict_name``,
``dict_version``, and ``dict_location``, so the returned dict value as a
``DataFrame`` has the exact column names.
"""
from __future__ import annotations

import io
import os
import urllib.request
from collections import defaultdict
from pathlib import Path

import pandas as pd  # type: ignore

from .split_line import split_line

# Model version number interpolated into the AlphaFold download URL
# (``.../AF-{id}-F1-model_v{AF2_MODEL}.cif``) that ``read_pdbx`` builds when
# the given ID is longer than four characters.
AF2_MODEL = 4
"""For AlphaFold structures, the version to use."""


def read_pdbx(
    pdbx_file: str | os.PathLike[str] | None = None,
    pdb_id: str | None = None,
    save_pdbx_file: bool = True,
    pdbx_file_dir: str | os.PathLike | None = None,
    category_names: list | None = None,
    convert_dtype: bool = False,
) -> dict[str, pd.DataFrame]:
    """Reads a ``.cif`` file's categories into a ``dict`` of ``Pandas DataFrame`` s.

    When only ``pdb_id`` is given, a cached ``<ID>.cif`` (upper- or lower-case
    file name) is looked up in ``pdbx_file_dir`` first. If none exists the file
    is downloaded: 4-character IDs from the RCSB entry endpoint, shorter IDs
    from the RCSB ligand endpoint, and longer IDs from the AlphaFold database
    (model version ``AF2_MODEL``).

    Args:
        pdb_id (optional): PDB/Uniprot ID. Required if ``pdbx_file`` is ``None``.
            Defaults to **None**.
        pdbx_file (optional): file name for a PDBx/mmCIF file. Used over `pdb_id`.
            Defaults to **None**.
        category_names (optional): a list of categories in the mmCIF file format.
            If ``None`` (or empty), ``all`` is used and all categories will be
            processed. Defaults to **None**.
        save_pdbx_file (optional): whether to save the fetched PDBx file
            to ``pdbx_file_dir``. Defaults to **True**.
        pdbx_file_dir (optional): directory to look for cached files in and to
            save fetched PDBx files to. If ``None``, './PDBx_files' is used.
            Defaults to **None**.
        convert_dtype (optional): whether to convert the data types of the
            ``_atom_site`` columns according to the RCSB mmcif specifications.
            Defaults to **False**.

    Returns:
        A dict of ``Pandas DataFrame`` s corresponding to required categories.

    Raises:
        ValueError: if none of ``pdb_id`` or ``pdbx_file`` is provided, or
            if ``pdb_id`` is given but the file cannot be downloaded,
            or the file is corrupted (no title line, no end-line symbol),
            or some content is irregular.
        FileNotFoundError: if ``pdbx_file`` cannot be found.
    """
    data: dict[str, pd.DataFrame] = {}
    if pdb_id is None and pdbx_file is None:
        raise ValueError("At least one of pdb_id and pdbx_file has to be given.")

    pdbx_file_handle: io.TextIOWrapper | io.StringIO
    if pdbx_file is None:
        pdb_id = str(pdb_id).upper()
        if pdbx_file_dir is None:
            pdbx_file_dir = "./PDBx_files"
        pdbx_file_dir = Path(pdbx_file_dir)
        file_path_1 = Path(pdbx_file_dir, f"{pdb_id}.cif")
        file_path_2 = Path(pdbx_file_dir, f"{pdb_id.lower()}.cif")
        if os.path.exists(file_path_1):
            pdbx_file_handle = open(file_path_1, "r", encoding="utf-8")
        elif os.path.exists(file_path_2):
            pdbx_file_handle = open(file_path_2, "r", encoding="utf-8")
        else:
            # The ID length decides which server holds the file.
            if len(pdb_id) == 4:
                pdbx_file_url = f"https://files.rcsb.org/view/{pdb_id}.cif"
            elif len(pdb_id) < 4:
                pdbx_file_url = f"https://files.rcsb.org/ligands/view/{pdb_id}.cif"
            else:
                pdbx_file_url = f"https://alphafold.ebi.ac.uk/files/AF-{pdb_id}-F1-model_v{AF2_MODEL}.cif"
            try:
                with urllib.request.urlopen(pdbx_file_url) as response:
                    text = response.read().decode("utf-8")
            except urllib.error.HTTPError as http_error:
                raise ValueError(
                    f"Cannot download PDBx file from url {pdbx_file_url}."
                ) from http_error
            pdbx_file_handle = io.StringIO(text)
            if save_pdbx_file:
                if not pdbx_file_dir.exists():
                    pdbx_file_dir.mkdir(parents=True, exist_ok=True)
                file_path = Path(pdbx_file_dir, f"{pdb_id}.cif")
                with open(file_path, "w", encoding="utf-8") as p_file:
                    p_file.write(text)
    else:
        pdbx_file = Path(pdbx_file)
        if not pdbx_file.exists():
            raise FileNotFoundError(f"File {pdbx_file} not found.")
        pdbx_file_handle = open(pdbx_file, "r", encoding="utf-8")

    # Name used in error messages: the file path when one was given, otherwise
    # the ID (so messages never read "None ...").
    source_label = pdbx_file if pdbx_file is not None else pdb_id

    if not category_names:
        category_names = ["all"]
    read_all = "all" in category_names
    separators = ("#", "# #", "##")  # lines that close a category block
    gapless_cols = ("pdbx_seq_one_letter_code", "pdbx_seq_one_letter_code_can")

    category_name = ""
    with pdbx_file_handle:
        line = pdbx_file_handle.readline()  # title line
        if "_" not in line:
            raise ValueError(
                f"{source_label} has no title line (expected something like 'data_XXXX')."
            )
        while line:
            if not line.lstrip().startswith("#"):
                # Not at a category separator yet: keep scanning.
                line = pdbx_file_handle.readline()
                continue

            loop_category = False
            category_parts: list[str] = []
            category_cols: list[str] = []
            category_dict = defaultdict(list)
            line = pdbx_file_handle.readline()  # category first line
            if not line:
                break
            if line.strip() == "loop_":
                loop_category = True
                line = pdbx_file_handle.readline()

            # Lines quoted by ';' form one multi-line record and need special
            # treatment.
            in_multiple_line = False
            while line.strip() not in separators:
                if not line:
                    raise ValueError(
                        f"{source_label} not normally ended. Download it again?"
                    )
                if line[0] == ";":
                    # A leading ';' opens or closes a multi-line record. These
                    # ';' are container symbols, not content, so swap them for
                    # '"' (after neutralizing any '"' in the text) to make the
                    # record easy to split later.
                    in_multiple_line = not in_multiple_line
                    line = '"' + line.replace('"', "'")[1:]
                elif in_multiple_line:
                    line = line.replace('"', "'")
                elif line[0] == "_":
                    category_name, category_nth_col = (
                        line.strip().split(" ")[0].split(".")
                    )
                    category_cols.append(category_nth_col)
                if read_all or category_name in category_names:
                    category_parts.append(line)
                line = pdbx_file_handle.readline()

            if not category_cols:
                # Empty block (e.g. consecutive separator lines): there is no
                # category here and ``category_name`` is stale, so never store.
                if any(part.strip() for part in category_parts):
                    raise ValueError(
                        f"{source_label} has a block without attribute names after category {category_name}"  # noqa
                    )
                continue
            if not read_all and category_name not in category_names:
                continue

            # Cannot rely on newlines to parse anyway.
            category_lines = "".join(category_parts).replace("\n", " ")
            records = split_line(category_lines)
            if not loop_category:
                # Key-value layout: records alternate "_cat.attr", value.
                if len(records) != 2 * len(category_cols):
                    raise ValueError(
                        f"{source_label} category {category_name} has irregular contents: {records}"  # noqa
                    )
                for attr_token, value in zip(records[::2], records[1::2]):
                    col_name = attr_token.split(".")[1]
                    if col_name in gapless_cols:
                        value = value.replace(" ", "")
                    category_dict[col_name] = [value]
            else:
                # Loop layout: n_cols attribute names, then rows of n_cols values.
                n_cols = len(category_cols)
                if (
                    len(records) <= n_cols
                    or records[n_cols].startswith("_")
                    or not records[n_cols - 1].startswith("_")
                    or len(records) % n_cols != 0
                ):
                    raise ValueError(
                        f"{source_label} category {category_name} has irregular contents: {len(records)} for {n_cols}"  # noqa
                    )
                for i, i_data in enumerate(records[n_cols:]):
                    col_name = category_cols[i % n_cols]
                    if col_name in gapless_cols:
                        # delete the gaps in the one_letter_code
                        i_data = i_data.replace(" ", "")
                    category_dict[col_name].append(i_data)
            data[category_name] = pd.DataFrame(category_dict)

            # Stop early once every requested category has been read ("all"
            # is never a key of ``data``, so a full read runs to the end).
            if all(c_name in data for c_name in category_names):
                break

    if "_atom_site" in data and convert_dtype:
        col_dtypes = {
            "id": "int",
            "label_seq_id": "int",
            "Cartn_x": "float",
            "Cartn_y": "float",
            "Cartn_z": "float",
            "occupancy": "float",
            "B_iso_or_equiv": "float",
            "auth_seq_id": "int",
            "pdbx_PDB_model_num": "int",
            "pdbx_sifts_xref_db_num": "int",
        }
        # Only convert the columns that are present in this file.
        # NOTE(review): ``astype`` raises if a column holds a non-numeric
        # placeholder -- confirm callers only enable this for suitable files.
        col_dtypes = {
            col_name: data_type
            for col_name, data_type in col_dtypes.items()
            if col_name in data["_atom_site"]
        }
        data["_atom_site"] = data["_atom_site"].astype(col_dtypes)
    return data