# Source code for moldf.write_pdbx

# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""PDBx/mmCIF format writing.

Write a dict of ``Pandas DataFrame`` back to a PDBx file.

"""
from __future__ import annotations

import os
from collections import defaultdict
from pathlib import Path

import pandas as pd  # type: ignore


# Float columns that are written with a fixed number of decimals.
_THREE_DECIMAL_COLUMNS = ("Cartn_x", "Cartn_y", "Cartn_z")
_TWO_DECIMAL_COLUMNS = ("occupancy", "B_iso_or_equiv")
# A single-record line longer than this is written as a ``;`` text field.
_MAX_LINE_LENGTH = 130
# Chunk width used when wrapping long ``_struct_ref`` text fields.
_TEXT_FIELD_WRAP = 80


def _pdbx_text(content: object, col: object) -> str:
    """Return the unquoted text written for one cell value of column ``col``."""
    if isinstance(content, str):
        return content
    if pd.api.types.is_scalar(content) and pd.isna(content):
        # "?" is the mmCIF placeholder for an unknown value.
        return "?"
    if pd.api.types.is_float(content):
        if col in _THREE_DECIMAL_COLUMNS:
            return f"{content:.3f}"
        if col in _TWO_DECIMAL_COLUMNS:
            return f"{content:.2f}"
    # Integers and floats of any other column keep their default text form.
    return str(content)


def _quote_pdbx_text(text: str) -> str:
    """Quote ``text`` if it holds a space or a quote character.

    Raises:
        ValueError: if ``text`` contains both ``'`` and ``"``.
    """
    has_single = "'" in text
    has_double = '"' in text
    if has_single and has_double:
        raise ValueError(f"'{text}' cannot be written into a PDBx file.")
    if has_single:
        return f'"{text}"'
    if has_double or " " in text:
        return f"'{text}'"
    return text


def _single_record_lines(category_name: str, records: pd.DataFrame) -> list[str]:
    """Render a one-row category as aligned ``tag value`` lines."""
    tags = [f"{category_name}.{col}" for col in records.columns]
    tag_width = max(len(tag) for tag in tags) + 3
    pieces: list[str] = []
    for position, (tag, col) in enumerate(zip(tags, records.columns)):
        raw = _pdbx_text(records.iloc[0, position], col)
        quoted = _quote_pdbx_text(raw)
        pieces.append(f"{tag:{tag_width}}")
        # The length check uses the tag that is actually being written.
        if len(tag) + len(quoted) <= _MAX_LINE_LENGTH:
            pieces.append(f"{quoted}\n")
            continue
        # Too long for one line: write the unquoted value as a text field.
        pieces.append("\n;")
        if category_name == "_struct_ref":
            # Stepping by the chunk width keeps the trailing partial chunk.
            pieces.extend(
                f"{raw[start:start + _TEXT_FIELD_WRAP]}\n"
                for start in range(0, len(raw), _TEXT_FIELD_WRAP)
            )
        else:
            pieces.append(f"{raw}\n")
        pieces.append(";\n")
    return pieces


def _multi_record_lines(category_name: str, records: pd.DataFrame) -> list[str]:
    """Render a multi-row category as a ``loop_`` block with aligned columns."""
    pieces = ["loop_\n"]
    pieces.extend(f"{category_name}.{col}\n" for col in records.columns)
    # Format column by column so widths come from the text actually written.
    columns = [
        [
            _quote_pdbx_text(_pdbx_text(value, col))
            for value in records.iloc[:, position]
        ]
        for position, col in enumerate(records.columns)
    ]
    # One extra character guarantees a separator after the widest cell.
    widths = [max(len(cell) for cell in cells) + 1 for cells in columns]
    for row in zip(*columns):
        cells = "".join(f"{cell:<{width}}" for cell, width in zip(row, widths))
        pieces.append(f"{cells}\n")
    return pieces


def write_pdbx(
    pdbx: dict[str, pd.DataFrame], file_name: str | os.PathLike | None = None
) -> None:
    """Writes a ``dict`` of ``Pandas DataFrame`` s into a PDBx file.

    Each key is written as a category. A category with one row is written as
    ``tag value`` lines; a category with more rows is written as a ``loop_``
    block. Categories without rows or without columns are skipped. The whole
    output is rendered in memory first, so no file is created when a value
    is rejected.

    Args:
        pdbx (required): a ``dict`` of ``Pandas DataFrame`` s to write.
        file_name (optional): file name to write a PDBx file. If ``None``,
            ``moldf_output.cif`` will be used as the file name.
            Defaults to **None**.

    Raises:
        TypeError: if ``pdbx`` is not a valid dict of ``DataFrame``.
        ValueError: if a value contains both ``'`` and ``"``.
    """
    if not file_name:
        file_name = "moldf_output.cif"
    if not isinstance(pdbx, dict):
        raise TypeError(f"pdbx has to be a dict but {type(pdbx)} is provided.")
    for category_name, records in pdbx.items():
        if not isinstance(records, pd.DataFrame):
            raise TypeError(
                f"pdbx values have to be Pandas DataFrames but {category_name} is a {type(records)}."  # noqa
            )

    # The data block is named after the file, without a trailing ".cif".
    target_name = Path(file_name).name
    if target_name.endswith(".cif"):
        target_name = target_name[:-4]
    chunks = [f"data_{target_name}\n"]

    for category_name, records in pdbx.items():
        if records.empty:
            # Nothing to write for a category without rows or columns.
            continue
        chunks.append("#\n")
        if len(records) > 1:
            chunks.extend(_multi_record_lines(category_name, records))
        else:
            chunks.extend(_single_record_lines(category_name, records))
    chunks.append("#")

    with open(file_name, "w", encoding="utf-8") as f:
        f.write("".join(chunks))