Source code for moldf.pdb_dataframe

# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
""" ``PDBDataFrame`` as a subclass of ``Pandas DataFrame``.

Several features are added to make PDB data more accessible and selectable:

1. Properties like ``sequences``, ``heavy_atoms``, ``backbone``, and ``water`` are
directly accessed by ``.`` operation.

2. Atom selection by using methods whose names are just the column names plus ``s``
(plural form). For example, selecting atoms by names is simply
``df.atom_names([names])`` where ``atom_name`` is the column name
and ``atom_names`` is the selection function. Each selection returns a
``PDBDataFrame`` object as well, which means we can chain selections one by one
like ``df.atom_names([names]).residue_numbers([numbers])``.

3. Distance matrix as a ``@property`` and ``@classmethod``.
"""
from __future__ import annotations

import copy
import functools
import warnings
from collections import defaultdict
from collections.abc import Iterable
from itertools import combinations

import numpy as np  # type: ignore
import pandas as pd  # type: ignore
from scipy.spatial.distance import cdist, pdist, squareform  # type: ignore
from scipy.spatial.transform import Rotation  # type: ignore
from typing_extensions import Self

from .constants import AMINO_ACIDS, ELEMENT_MASSES
from .covalent_bond import get_covalent_bond_cutoffs, get_residue_template

RESIDUE_CODES = AMINO_ACIDS
"""dict[str, str], turn 3-, 2-, and 1-letter residue codes to 1-letter codes."""


PDBX_COLS = {
    "record_name": "group_PDB",
    "atom_number": "id",
    "atom_name": "label_atom_id",
    "alt_loc": "label_alt_id",
    "residue_name": "label_comp_id",
    "chain_id": "label_asym_id",
    "residue_number": "label_seq_id",
    "insertion": "pdbx_PDB_ins_code",
    "x_coord": "Cartn_x",
    "y_coord": "Cartn_y",
    "z_coord": "Cartn_z",
    "occupancy": "occupancy",
    "b_factor": "B_iso_or_equiv",
    "segment_id": "label_entity_id",
    "element_symbol": "type_symbol",
    "charge": "pdbx_formal_charge",
    "nmr_model": "pdbx_PDB_model_num",
}
"""dict[str, str], PDB and mmCIF column name dictionary."""


[docs] class PDBDataFrame(pd.DataFrame): """Pandas DataFrame with extended attributes and methods for PDB data. It enables Pythonic atom selection methods and convenient ``.`` accessing to common PDB structure properties. Args: *args: all ``pd.DataFrame`` positional arguments. For example, the ``_atom_site`` dataframe returned by reading a PDB file. pdb_format (optional): PDB format in the underlying provided data. If ``None``, ``PDB`` is assumed. Defaults to **None**. use_squared_distance (optional): whether to use squared distance when calculating distance matrix. Defaults to **True**. use_square_form (optional): whether to use a square matrix for the distance matrix. Defaults to **False**. **kwargs: all ``pd.DataFrame`` acceptable keyword arguments. Returns: A ``PDBDataFrame`` instance. Example ------- >>> from moldf import read_pdb, PDBDataFrame >>> pdb = read_pdb(pdb_id='1vii') >>> pdb_df = pdb['_atom_site'] >>> pdb_df = PDBDataFrame(pdb_df) Warnings -------- This subclass uses a custom ``__hash__`` function for caching some calculations. And thus a custom ``__eq__`` function is also implemented. For other typical ``DataFrame`` operations, use those ``.all()``, ``.any()``, ``.bool()`` functions to do comparison. """ _metadata = [ "_use_squared_distance", "_use_square_form", "_is_chimera", "_RESIDUE_CODES", "_ELEMENT_MASSES", "_pdb_format", ] def __init__( self, *args, pdb_format: str | None = None, use_squared_distance: bool = True, use_square_form: bool = False, **kwargs, ) -> None: super().__init__(*args, **kwargs) self._pdb_format: str | None = pdb_format if self._pdb_format is None: self._pdb_format = "PDB" if self._pdb_format.lower() in ["mmcif", "pdbx"]: self._pdbx_to_pdb() self._use_squared_distance: bool = use_squared_distance self._use_square_form: bool = use_square_form self._hash_random_state: int = 0 self._is_chimera = False self._RESIDUE_CODES: dict[str, str] = {} self._ELEMENT_MASSES: dict[str, float] = {} self._ter_line_removed: bool = False self._atoms: Self | None = None @property def _constructor(self): return PDBDataFrame def __hash__(self) -> int: """Uses head X coords to hash; for distance matrix calculation cache.""" try: sample_atom_numbers = self.sample( 5, random_state=self.hash_random_state, replace=True )["atom_number"] except ValueError: sample_atom_numbers = [] return hash(tuple(self[self.atom_number.isin(sample_atom_numbers)].x_coord)) def __eq__(self, other) -> bool: """Uses head X coords to compare; for distance matrix calculation cache.""" return self.__hash__() == other.__hash__()
[docs] def _pdbx_to_pdb(self, keep_original: bool = False): """Converts PDBx '_atom_site' DataFrame to PDB format. Args: keep_original (optional): whether to keep the original columns in the PDBx '_atom_site' DataFrame. Defaults to **False**. """ pdbx_cols = {k: v for k, v in PDBX_COLS.items() if v in self.columns} for pdb_name, pdbx_name in pdbx_cols.items(): self[pdb_name] = self[pdbx_name] if not keep_original: drop_columns = [col for col in self.columns if col not in pdbx_cols.keys()] self.drop(columns=drop_columns, inplace=True) self._pdb_format = "PDBx"
@property def pdb_format(self) -> str: """ The format of the current PDBDataFrame. """ return self._pdb_format # type: ignore @property def RESIDUE_CODES(self) -> dict[str, str]: """ A dict of ``residue_name`` as keys and ``residue_code`` as values, where ``residue_code`` is a 1-character code used in sequences. **Settable**. """ if not self._RESIDUE_CODES: res_name_width = 3 if self.is_chimera: res_name_width = 4 self._RESIDUE_CODES = { res.upper().ljust(res_name_width): code for res, code in RESIDUE_CODES.items() } return self._RESIDUE_CODES @RESIDUE_CODES.setter def RESIDUE_CODES(self, residue_codes: dict[str, str]) -> None: res_name_width = 3 if self.is_chimera: res_name_width = 4 self._RESIDUE_CODES = { res.upper().ljust(res_name_width): code for res, code in residue_codes.items() } @property def ELEMENT_MASSES(self) -> dict[str, float]: """ A dict of ``element_symbol`` as keys and ``element_mass`` as values, where ``element_mass`` is taken from NIST. **Settable**. """ if not self._ELEMENT_MASSES: self._ELEMENT_MASSES = { e.upper().rjust(2): mass for e, mass in ELEMENT_MASSES.items() } return self._ELEMENT_MASSES @ELEMENT_MASSES.setter def ELEMENT_MASSES(self, element_masses: dict[str, float]): self._ELEMENT_MASSES = { e.upper().rjust(2): mass for e, mass in element_masses.items() } @property def is_chimera(self) -> bool: """ Whether the original read-in PDB was Chimera compatible format. The main effect is the ``residue_name`` str width is 4 in the Chimera compatible format instead of 3 as in the standard PDB format. **Not settable**. """ residue_name_set = self.atoms.residue_name.unique() try: residue_name_set = self.atoms.residue_name.unique() if len(residue_name_set) > 0 and len(residue_name_set[0]) == 4: self._is_chimera = True except AttributeError: pass # 'residue_name' not in self.coords return self._is_chimera @property def hash_random_state(self) -> int: """The ``random_state`` used in the ``__hash__`` function. **Settable**.""" return self._hash_random_state @hash_random_state.setter def hash_random_state(self, random_state: int) -> None: self._hash_random_state = random_state @property def use_squared_distance(self) -> bool: """ Whether R or R^2 is used in distance matrix calculations. Using R^2 saves computation time. **Settable**. """ return self._use_squared_distance @use_squared_distance.setter def use_squared_distance(self, use_r2: bool) -> None: self._use_squared_distance = use_r2 @property def use_square_form(self) -> bool: """ Whether the distance matrix will be in a square form. Using square form consumes less memory. **Settable**. """ return self._use_square_form @use_square_form.setter def use_square_form(self, square_form: bool) -> None: self._use_square_form = square_form @property def atoms(self) -> Self: """Gets atoms in the ``ATOM`` and ``HETATM`` entries. In other words, removing 'TER' lines. Returns: sub ``PDBDataFrame``. """ if self._atoms is None: if self._ter_line_removed: self._atoms = self else: self._atoms = self.record_names(["ATOM ", "HETATM"]) self._ter_line_removed = True return self._atoms @property def coords(self) -> Self: """ Gets the ``x_coord``, ``y_coord``, and ``z_coord`` columns only. Use ``pdb_df.coords.values`` to get the underlying Numpy array of the coordinates. """ return self.atoms[["x_coord", "y_coord", "z_coord"]] @property def element_set(self) -> set: """Gets the set of element symbols.""" elements = self.atoms.element_symbol.unique() return set([e.strip().upper() for e in elements]) @property @functools.lru_cache() def bonds(self) -> dict: """Gets the list of bonds. Each bond is represented as a pair of ``atom_number`` values. Raises: ValueError: if the list of ``atom_number`` is not a set. """ return self.get_bonds_by_template()
[docs] def get_bonds_by_distance( self, single_radii_set: str | None = None, need_non_covalent: bool = False, non_covalent_cutoff: float = 4.5, ) -> dict: """Gets all the bonds purely by covalent radii constraints. Args: single_radii_set (optional): radii sets to use. If ``None``, ``single_C`` is used as to Cordero (PMID 18478144). Another option is ``single_PA`` which refers to Pyykkö's studies (PMID 19058281;19856342;15832398, and doi:10.1103/PhysRevB.85.024115). Defaults to **None**. need_non_covalent (optional): whether non-covalent 'bonding' should be included. Defaults to **False**. non_covalent_cutoff (optional): distance cutoff for non-covalent 'bonding'. Raises: ValueError: if the list of ``atom_number`` is not unique or if the ``single_radii_set`` is not valid. Returns: a dictionary of bonds with tuple of ``atom_number`` as keys and bond types as values. """ atoms = self.atoms if len(atoms) != len(atoms.atom_number.unique()): raise ValueError("The 'atom_number' list has repetitive values.") if single_radii_set is None: single_radii_set = "single_C" if single_radii_set not in ["single_C", "single_PA"]: message = "The 'single_radii_set' has to be one of " message += "'single_C' and 'single_PA', but " message += f"{single_radii_set} is provided." raise ValueError(message) results: dict = {} element_set = self.element_set if "D" in element_set or "T" in element_set: element_set.update({"H"}) single_bonds, double_bonds, triple_bonds = get_covalent_bond_cutoffs( element_set ) # No-bond, triple-bond, and default single bond self.use_square_form = False non_covalent_cutoff = non_covalent_cutoff**2 dis_matrix = self.atoms.distance_matrix element_matrix = combinations(self.atoms.element_symbol.to_list(), 2) atom_number_matrix = combinations(self.atoms.atom_number.to_list(), 2) for i, (elements, atom_numbers) in enumerate( zip(element_matrix, atom_number_matrix) ): bond_type: int | float = 0 dis = dis_matrix[i] first_element, second_element = elements if dis <= triple_bonds[(first_element, second_element)]: bond_type = 3 elif dis <= double_bonds[(first_element, second_element)]: bond_type = 2 elif dis <= single_bonds[(first_element, second_element)]: bond_type = 1 elif need_non_covalent and dis < non_covalent_cutoff: bond_type = 0.5 atom_number_1, atom_number_2 = atom_numbers if bond_type > 0: results[(atom_number_1, atom_number_2)] = bond_type return results
[docs] def get_bonds_by_template(self) -> dict: """Gets covalent bonds based on residue/ligand templates.""" residue_name_sets = self.residue_name.unique() intra_bonds_dict = {} for residue_name in residue_name_sets: intra_bonds_dict[residue_name] = get_residue_template( residue_name=residue_name.strip() ) bonds: dict = {} # intro bonds all_residues = PDBDataFrame.get_residue_list(self.atoms, include_heteros=True) for chain_id, residue_name, residue_number in all_residues: residue = ( self.atoms.chain_ids([chain_id]) .residue_numbers(residue_number) .residue_names([residue_name]) ) name_matrix = combinations(residue.atom_name, 2) number_matrix = combinations(residue.atom_number, 2) for names, numbers in zip(name_matrix, number_matrix): first_name, second_name = names first_name = first_name.strip() second_name = second_name.strip() first_number, second_number = numbers bond_type = intra_bonds_dict[residue_name].get( (first_name, second_name) ) if bond_type: bonds[(first_number, second_number)] = bond_type[0] # inter bonds for peptide bonds carb_atoms = self.atoms.atom_names(["C"]) nitro_atoms = self.atoms.atom_names(["N"]) if len(carb_atoms) != len(nitro_atoms): cn_matrix = PDBDataFrame.get_distance_matrix(carb_atoms, nitro_atoms) for carb_index, carb_number in enumerate(carb_atoms.atom_number): for nitro_index, nitro_number in enumerate(nitro_atoms.atom_number): if cn_matrix[carb_index, nitro_index] < 2.7889: bonds[(carb_number, nitro_number)] = "SING" else: for i in range(len(carb_atoms) - 1): carb_atom = carb_atoms.iloc[i] carb_xyz = (carb_atom.x_coord, carb_atom.y_coord, carb_atom.z_coord) carb_number = carb_atom.atom_number nitro_atom = nitro_atoms.iloc[i + 1] nitro_xyz = (nitro_atom.x_coord, nitro_atom.y_coord, nitro_atom.z_coord) nitro_number = nitro_atom.atom_number dis = ( (carb_xyz[0] - nitro_xyz[0]) ** 2 + (carb_xyz[1] - nitro_xyz[1]) ** 2 + (carb_xyz[2] - nitro_xyz[2]) ** 2 ) if dis < 2.7889: bonds[(carb_number, nitro_number)] = "SING" # inter bonds for disulfide bonds sulfur_atoms = self.residues.residue_names(["CYS"]).atom_names(["SG"]) s_dis_matrix = sulfur_atoms.distance_matrix s_atom_matrix = combinations(sulfur_atoms.atom_number, 2) for i, numbers in enumerate(s_atom_matrix): if s_dis_matrix[i] < 9.0: # 3.0 A is used as cutoff bonds[(numbers[0], numbers[1])] = "SING" # inter bonds for other pairs dis_matrix = self.atoms.distance_matrix res_name_matrix = combinations(self.atoms.residue_name, 2) atom_number_matrix = combinations(self.atoms.atom_number, 2) element_symbol_matrix = combinations(self.atoms.element_symbol, 2) element_set = self.element_set if "D" in element_set or "T" in element_set: element_set.update({"H"}) single_bonds, double_bonds, triple_bonds = get_covalent_bond_cutoffs( element_set ) for i, (names, numbers, symbols) in enumerate( zip(res_name_matrix, atom_number_matrix, element_symbol_matrix) ): first_res = names[0].strip() second_res = names[1].strip() first_ele = symbols[0] second_ele = symbols[1] if first_res == "HOH" or second_res == "HOH": continue if not (first_res in RESIDUE_CODES and second_res in RESIDUE_CODES): if dis_matrix[i] <= triple_bonds[(first_ele, second_ele)]: bonds[numbers] = "TRIP" elif dis_matrix[i] <= double_bonds[(first_ele, second_ele)]: bonds[numbers] = "DOUB" elif dis_matrix[i] <= single_bonds[(first_ele, second_ele)]: bonds[numbers] = "SING" return bonds
@property @functools.lru_cache() def sequences(self) -> dict[str, str]: """ Gets the sequences for each chain as a dict of ``chain_id`` as key(s) and ``chain_sequence`` as value(s). """ chain_sequence: dict[str, str] = defaultdict(str) for resi_info in self.residue_list: chain_sequence[resi_info[0]] += self.RESIDUE_CODES[resi_info[1]] return chain_sequence @property def chain_list(self) -> list: """ Gets all chain ids as a list. """ return PDBDataFrame.get_chain_list(self) @property def residue_list(self) -> list[tuple]: """ Gets all residues as a list of tuple (``chain_id``, ``residue_name``, ``residue_number``). """ return PDBDataFrame.get_residue_list(self) @property def backbone(self) -> Self: """Gets backbone or N+CA+C+O atoms. Returns: sub ``PDBDataFrame`` """ return self.residues.atom_names( ["N", "CA", "C", "O"], suppress_warning=True, ).element_symbols(["C", "N", "O"]) @property def side_chain(self) -> Self: """Gets side chain or NOT N+CA+C+O atoms. Returns: sub ``PDBDataFrame``. """ return self.residues.atom_names( ["N", "CA", "C", "O"], invert=True, suppress_warning=True, ).element_symbols(["C", "N", "O", "S"]) @property def ca_atoms(self) -> Self: """Gets the alpha carbon (CA) atoms. Returns: sub ``PDBDataFrame``. """ return self.residues.atom_names( ["CA"], suppress_warning=True, ).element_symbols(["C"]) @property def heavy_atoms(self) -> Self: """Gets the heavy or NOT hydrogen atoms. Returns: sub ``PDBDataFrame``. """ return self.element_symbols(["H", "D", "T"], invert=True) @property def hetero_atoms(self) -> Self: """Gets the hetero (``HETATM``) atoms. Returns: sub ``PDBDataFrame``. """ return self.record_names(["HETATM"]) @property def residues(self) -> Self: """Gets the residue (``ATOM``) atoms. Returns: sub ``PDBDataFrame``. """ return self.record_names(["ATOM "]) @property def water(self) -> Self: """Gets all water atoms. Returns: sub ``PDBDataFrame``. """ return self.atoms.residue_names(["HOH"]) @property def n_atoms(self) -> int: """Gets the number of atoms.""" return len(self.atoms) @property def n_residues(self) -> int: """Gets the number of residues.""" return len(self.residues) @property def n_chains(self) -> int: """Gets the number of chains.""" ter_lines = self[self.record_name == "TER "] ter_residues = PDBDataFrame.get_residue_list(ter_lines) oxt_lines = self[self.atom_name == " OXT"] oxt_residues = PDBDataFrame.get_residue_list(oxt_lines) n_chain_ids = len(self.chain_id.unique()) ter_oxt_residues = copy.deepcopy(ter_residues) for oxt_residue in oxt_residues: if oxt_residue not in ter_oxt_residues or not ter_residues: ter_oxt_residues.append(oxt_residue) return max(len(ter_oxt_residues), n_chain_ids) @property def n_segments(self) -> int: """Gets the number of segments.""" return len(self.atoms.segment_id.unique()) @property def n_models(self) -> int: """Gets the number of models.""" if "nmr_model" in self.columns: return len(self.nmr_model.unique()) return 1 @property @functools.lru_cache() def center_of_geometry(self) -> np.ndarray: """Gets the center of geometry as a ``(3, )`` ``np.ndarray``.""" return np.mean(self.coords.values, axis=0) @property @functools.lru_cache() def center_of_mass(self) -> np.ndarray: """Gets the center of mass as a ``(3, )`` ``np.ndarray``.""" masses = self.atoms.get_masses() masses = masses / masses.sum() return np.sum(self.coords.values * masses[:, None], axis=0) @property @functools.lru_cache() def radius_of_gyration(self) -> float: """Gets the radius of gyration""" com = self.center_of_mass com_t = (com[0], com[1], com[2]) # type: ignore dist_to_com = PDBDataFrame.get_distance_matrix(self.atoms, com_t, use_r2=True) masses = self.atoms.get_masses() masses = masses / masses.sum() return np.sum(dist_to_com * masses[:, None], axis=0)[0]
[docs] @functools.lru_cache() def get_masses(self) -> np.ndarray: """Gets the masses for all atoms in the current dataframe.""" masses = np.zeros(len(self.atoms), dtype="float32") for i, element in enumerate(self.atoms.element_symbol): masses[i] = self.ELEMENT_MASSES[element] return masses
@property def distance_matrix(self) -> np.ndarray: """Gets the distance matrix.""" return PDBDataFrame.get_distance_matrix( self.atoms, use_r2=self.use_squared_distance, square_form=self.use_square_form, )
[docs] def rmsd( self, other: Self | np.ndarray | None = None, align: bool = True, weights: list | None = None, selection: Self | list | None = None, ) -> list | float: """Calculates RMSD 1) among sets of coordinates in one ``PDBDataFrame`` with multiple ``nmr_model`` s or 2) two sets of coordinates in two ``PDBDataFrames``. Args: other (optional): the other ``PDBDataFrame`` or (N, 3) ``numpy.ndarray`` to calculate RMSD against. If ``None``, ``self`` should contain at least two sets of coordinates (``nmr_model`` has >= 2 unique values). Defaults to **None**. align (optional): whether to align the structures before calculating RMSD. If ``False``, the ``weights`` and ``selection`` keywords are ignored. Defaults to **True**. weights (optional): a list of weights for all the atoms in ``selection`` to do structure alignment. If ``None``, all coordinates in the ``selection`` or ``self`` have the same weights. Defaults to **None**. selection (optional): a list of ``atom_number`` s in ``self`` or a PDBDataFrame after the filtering methods. If ``None``, all coordinates in ``self`` are used for structure alignment. Defaults to **None**. Returns: RMSD or a list of RMSD's. Raises: ValueError: if dimensionalities mismatch among ``self``, ``other``, ``weights``, and ``selection`` if they are not ``None``; or ``atom_number`` s in ``self`` are not unique. TypeError: if ``other``, ``weights``, and ``selection`` have unsupported types. """ result: list = [] other_coords_list: list = [] if "nmr_model" in self.columns and len(self.nmr_model.unique()) >= 2: all_models = self.nmr_model.unique() first_model = self.nmr_models(int(all_models[0])) if len(first_model.atom_number.unique()) != len(first_model): raise ValueError("'atom_number's in 'self' are not unique.") n_atoms = len(first_model) if other is None: for other_index in all_models[1:]: other_model = self.nmr_models(int(other_index)) other_coords_list.append(other_model.coords.values) else: first_model = self if len(first_model.atom_number.unique()) != len(first_model): raise ValueError("'atom_number's in 'self' are not unique.") n_atoms = len(first_model) if other is None: message = "'self' has only one set of coordinates and 'other' " message += "is not provided." raise ValueError(message) # message = "'other' is not a PDBDataFrame or np.ndarray instance " # message += f"but a {type(other)}." # raise ValueError(message) elif isinstance(other, type(self)): if (self.atom_number != other.atom_number).any(): message = "'self' and 'other' have mismatched 'atom_number' values." raise ValueError(message) other_coords_list.append(other.coords.values) elif isinstance(other, np.ndarray): if other.shape != (n_atoms, 3): message = f"'other' shape is {other.shape} but expected to be" message += f" ({n_atoms}, 3)." raise ValueError(message) other_coords_list.append(other) else: raise TypeError(f"Unsupported type {type(other)} for 'other'.") first_coords = first_model.coords.values align_weights = np.ones(n_atoms) if align: if selection is not None: align_weights = np.zeros(n_atoms) if not isinstance(selection, (type(self), list)): message = f"'selection' type is {type(selection)}, but list" message += " or PDBDataFrame is expected." raise TypeError(message) n_selection = len(selection) if len(selection) > n_atoms: message = f"'selection' length is {len(selection)}, " message += f"larger than the number of atoms {n_atoms}." raise ValueError(message) if isinstance(selection, type(self)): selection = self.atom_number.to_list() mask = first_model.atom_number.isin(selection) align_weights[mask] = 1.0 if weights is not None: if not isinstance(weights, list): message = f"'weights' type is {type(weights)}, but list" message += " is expected." raise TypeError(message) if len(weights) != n_selection: message = f"'weights' length is {len(weights)}, " message += "not equal to the selection length" message += f" {n_selection}." raise ValueError(message) align_weights[mask] = np.array(weights) elif weights is not None: if not isinstance(weights, list): message = f"'weights' type is {type(weights)}, but list" message += " is expected." raise TypeError(message) if len(weights) != n_atoms: message = f"'weights' length is {len(weights)}, " message += "not equal to the number of atoms" message += f" {n_atoms} without 'selection'." raise ValueError(message) align_weights = np.array(weights) first_centered = False for other_coords in other_coords_list: if align: if not first_centered: first_coords = first_coords - first_model.center_of_geometry first_centered = True other_cog = np.mean(other_coords, axis=0) other_coords = other_coords - other_cog rot = Rotation.align_vectors( first_coords, other_coords, weights=align_weights )[0] other_coords = rot.apply(other_coords) rms = np.sqrt(np.sum((first_coords - other_coords) ** 2) / n_atoms) result.append(rms) if len(result) == 1: return result[0] return result
[docs] def record_names(self, names: list[str], invert: bool = False) -> Self: """Filter by ``record_name``. Args: names (required): a list of ``record_name`` s. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ names = [name.strip().upper() for name in names] if self.pdb_format.upper() == "PDB": # type: ignore names = [name.ljust(6) for name in names] if invert: return self[~self.record_name.isin(names)] return self[self.record_name.isin(names)]
[docs] def atom_numbers( self, numbers: list[int] | int, relation: str | None = None, invert: bool = False, ) -> Self: """Filter by ``atom_number``. Args: numbers (required): one or a list of ``atom_number`` s. relation (optional): ``atom_number`` relationship to ``numbers``. If ``numbers`` is an integer, it has to be one of ``<``, ``<=``, ``=``, ``>=``, and ``>``. If ``None``, ``<=`` is used. Ignored if a list of integers are provided to ``numbers``. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ return self._filter_num_col( numbers, "atom_number", relation=relation, invert=invert )
[docs] def atom_names( self, names: list[str], names_2c: list[str] | None = None, invert: bool = False, suppress_warning: bool = False, ) -> Self: """Filter by ``atom_name``. Args: names (required): a list of ``atom_name`` s whose ``element_symbols`` have only one character. Atoms in common residues and ligands should be provide here like ``C, H, O, N, S, P, F``. names_2c (optional): a list of ``atom_name`` s whose ``element_symbols`` have two characters like ion (``FE``) and chloride (``CL``). Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. suppress_warning: whether to suppress the warning message about possible conflicts between ``names`` and ``names_2c``. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ atom_name_strings = [name.strip().upper() for name in names] if self.pdb_format.upper() == "PDB": # type: ignore atom_name_strings = [] for name in names: if len(name) == 4: atom_name_strings.append(name) else: atom_name_strings.append(f" {name}".ljust(4)) if ( len(name) == 2 and f"{name[0]}{name[1].lower()}" in self.ELEMENT_MASSES ): if f" {name[0]}" not in self.ELEMENT_MASSES: atom_name_strings.append(f"{name}".ljust(4)) # eg 'MG' where ' M' is not a legal ele in self.ELEMENT_MASSES. continue if suppress_warning: continue message = f"Atom name {name} is an atom of element {name[0]} " message += f"but not element {name[0]}{name[1].lower()}." message += "If you want the latter, put it in the 'names_2c' list." warnings.warn( message, RuntimeWarning, stacklevel=2, ) if names_2c is not None: for name in names_2c: atom_name_strings.append(f"{name}".ljust(4)) if invert: return self[~self.atom_name.isin(atom_name_strings)] return self[self.atom_name.isin(atom_name_strings)]
[docs] def alt_locs(self, locs: list[str], invert: bool = False) -> Self: """Filter by ``alt_loc``. Args: locs (required): a list of ``alt_loc`` s. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub PDBDataFrame """ if invert: return self[~self.alt_loc.isin(locs)] return self[self.alt_loc.isin(list(locs) + [" "])]
[docs] def residue_names(self, names: list[str], invert: bool = False) -> Self: """Filter by ``residue_names``. Args: names (required): a list of ``residue_name`` s invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ names = [name.strip().upper() for name in names] if self.pdb_format.upper() == "PDB" and self.is_chimera: # type: ignore names = [(name + " ").upper().rjust(4) for name in names] if invert: return self[~self.residue_name.isin(names)] return self[self.residue_name.isin(names)]
[docs] def chain_ids(self, ids: list[str], invert: bool = False) -> Self: """Filter by ``chain_id``. Args: ids (required): a list of ``chain_id`` s. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ if invert: return self[~self.chain_id.isin(ids)] return self[self.chain_id.isin(ids)]
[docs] def residue_numbers( self, numbers: list[int] | int, relation: str | None = None, invert: bool = False, ) -> Self: """Filter by ``residue_number``. Args: numbers (required): one or a list of ``residue_number`` s. relation (optional): ``residue_number`` relationship to ``numbers``. If ``numbers`` is an integer, it has to be one of ``<``, ``<=``, ``=``, ``>=``, and ``>``. If ``None``, '<=' is used. Ignored if a list of integers are provided to ``numbers``. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ return self._filter_num_col( numbers, "residue_number", relation=relation, invert=invert )
[docs] def insertions(self, codes: list[str], invert: bool = False) -> Self: """Filter by ``insertion``. Args: codes (required): a list of ``insertion`` codes. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ if invert: return self[~self.insertion.isin(codes)] return self[self.insertion.isin(codes)]
[docs] def x_coords( self, value: float, relation: str | None = None, invert: bool = False, epsilon: float = 0.01, ) -> Self: """Filter by ``x_coord``. Args: value (required): value to select ``x_coord`` s. relation (optional): ``x_coord`` relationship to ``value``. It has to be one of ``'<'``, ``'<='``, ``'='``, ``'>='``, and ``'>'``. If ``None``, ``'<='`` is used. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. epsilon (optional): atoms ``abs(x_coord - value)`` <= ``epsilon`` are selected when ``invert`` = ``False``. Defaults to **0.01**. Returns: sub ``PDBDataFrame`` """ return self._filter_num_col( value, "x_coord", relation=relation, invert=invert, epsilon=epsilon )
[docs] def y_coords( self, value: float, relation: str | None = None, invert: bool = False, epsilon: float = 0.01, ) -> Self: """Filter by ``y_coord``. Args: value (required): value to select ``y_coord`` s. relation (optional): ``y_coord`` relationship to ``value``. It has to be one of ``'<'``, ``'<='``, ``'='``, ``'>='``, and ``'>'``. If ``None``, ``'<='`` is used. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. epsilon (optional): atoms ``abs(y_coord - value)`` <= ``epsilon`` are selected when ``invert`` = ``False``. Defaults to **0.01**. Returns: sub ``PDBDataFrame`` """ return self._filter_num_col( value, "y_coord", relation=relation, invert=invert, epsilon=epsilon )
[docs] def z_coords( self, value: float, relation: str | None = None, invert: bool = False, epsilon: float = 0.01, ) -> Self: """Filter by ``z_coord``. Args: value (required): value to select ``z_coord`` s. relation (optional): ``z_coord`` relationship to ``value``. It has to be one of ``'<'``, ``'<='``, ``'='``, ``'>='``, and ``'>'``. If ``None``, ``'<='`` is used. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. epsilon (optional): atoms ``abs(z_coord - value)`` <= ``epsilon`` are selected when ``invert`` = ``False``. Defaults to **0.01**. Returns: sub ``PDBDataFrame`` """ return self._filter_num_col( value, "z_coord", relation=relation, invert=invert, epsilon=epsilon )
[docs] def occupancies( self, value: float, relation: str | None = None, invert: bool = False, epsilon: float = 0.01, ) -> Self: """Filter by ``occupancy``. Args: value (required): value to select ``occupancy`` s. relation (optional): ``occupancy`` relationship to ``value``. It has to be one of ``'<'``, ``'<='``, ``'='``, ``'>='``, and ``'>'``. If ``None``, ``'<='`` is used. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. epsilon (optional): atoms ``abs(occupancy - value)`` <= ``epsilon`` are selected when ``invert`` = ``False``. Defaults to **0.01**. Returns: sub ``PDBDataFrame`` """ return self._filter_num_col( value, "occupancy", relation=relation, invert=invert, epsilon=epsilon )
[docs] def b_factors( self, value: float, relation: str | None = None, invert: bool = False, epsilon: float = 0.01, ) -> Self: """Filter by ``b_factor``. Args: value (required): value to select ``b_factor`` s. relation (optional): ``b_factor`` relationship to ``value``. It has to be one of ``'<'``, ``'<='``, ``'='``, ``'>='``, and ``'>'``. If ``None``, ``'<='`` is used. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. epsilon (optional): atoms ``abs(b_factor - value)`` <= ``epsilon`` are selected when ``invert`` = ``False``. Defaults to **0.01**. Returns: sub ``PDBDataFrame`` """ return self._filter_num_col( value, "b_factor", relation=relation, invert=invert, epsilon=epsilon )
[docs] def segment_ids(self, ids: list[str], invert: bool = False) -> Self: """Filter by ``segment_id``. Args: ids (required): a list of ``segment_id`` s. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ ids = [i.strip() for i in ids] if self.pdb_format.upper() == "PDB": # type: ignore ids = [i.ljust(4) for i in ids] if invert: return self[~self.segment_id.isin(ids)] return self[self.segment_id.isin(ids)]
[docs] def element_symbols(self, symbols: list[str], invert: bool = False) -> Self: """Filter by ``element_symbol``. Args: symbols (required): a list of ``element_symbol`` s. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ symbols = [symbol.strip().upper() for symbol in symbols] if self.pdb_format.upper() == "PDB": # type: ignore symbols = [symbol.rjust(2) for symbol in symbols] if invert: return self[~self.element_symbol.isin(symbols)] return self[self.element_symbol.isin(symbols)]
[docs] def charges(self, charges: list[str], invert: bool = False) -> Self: """Filter by ``charge``. Args: charges (required): a list of ``charge`` s. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` Notes: ``charge`` is ``2-char`` string in the PDB specifications. """ charges = [charge.strip() for charge in charges] if self.pdb_format.upper() == "PDB": # type: ignore charges = [charge.rjust(2) for charge in charges] if invert: return self[~self.charge.isin(charges)] return self[self.charge.isin(charges)]
[docs] def nmr_models( self, models: list[int] | int, relation: str | None = None, invert: bool = False ) -> Self: """Filter by ``nmr_model``. Args: models (required): one or a list of ``nmr_model`` ids. relation (optional): ``nmr_model`` relationship to ``models``. If ``models`` is an integer, it has to be one of ``'<'``, ``'<='``, ``'='``, ``'>='``, and ``'>'``. If ``None``, ``'<='`` is used. Ignored if a list of integers are provided to ``models``. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ if "nmr_model" in self.columns: return self._filter_num_col( models, "nmr_model", relation=relation, invert=invert ) return self
[docs] def distances( self, other: np.ndarray | Self | Iterable, cut_off: float = np.inf, to: str | None = None, invert: bool = False, ) -> Self: """Filter by ``distance`` to a reference point or group of atoms. Args: other: the other group's coordinate(s). cut_off: the distance cutoff to filter. to: if ``other`` is a group atoms, using which method to determine whether the atoms meet the cut_off distance. If None, ``COM`` or center of mass is used if ``other`` is ``PDBDataFrame``, and ``COG`` or center of geometry is used if ``other`` is ``np.ndarray`` or ``Iterable``. The following are allowed: ``com``, ``center of mass``, ``center_of_mass``: use the center of mass for the ``other``. ``cog``, ``center of geometry``, ``center_of_geometry``: use the center of geometry for the ``other``. ``all``: whether all the pair-distances meet the ``cut_off`` distance criteria. ``any``: whether any of the pair-distances meets the ``cut_off`` distance criteria. invert: whether to invert the selection. Defaults to **False**. Returns: sub ``PDBDataFrame`` """ if not (isinstance(other, type(self)) or isinstance(other, Iterable)): message = "Only 'PDBDataFrame', 'np.ndarray', and 'Iterable' types " message += f"are supported, not {type(other)} for 'other'." raise TypeError(message) if isinstance(other, Iterable) and not isinstance(other, type(self)): other = np.asanyarray(other, dtype="float32") if other.shape != (3,): if len(other.shape) != 2 or other.shape[1] != 3: message = "An 'Iterable' input should have a (N, 3) or (3,) shape, " message += f"not '{other.shape}' in 'other'." raise TypeError(message) else: other = other.reshape(1, 3) cut_off = cut_off**2 allowed_tos = [ "com", "center of mass", "center_of_mass", "cog", "center of geometry", "center_of_geometry", "any", "all", ] if to is None or to.lower() not in allowed_tos: if to is not None: message = "Only center of mass (COM), center of geometry (COG) " message += f"'all', or 'any' is supported. '{to}' is reset to " if isinstance(other, type(self)): message += "'COM' for 'to'." else: message += "'COG' for 'to'." warnings.warn( message, RuntimeWarning, stacklevel=2, ) to = "COG" if isinstance(other, type(self)): to = "COM" elif to.lower() in [ "com", "center of mass", "center_of_mass", ] and not isinstance(other, type(self)): message = "Only center of geometry (COG), 'all', or 'any' is supported " message += f"if 'other' is 'Iterable'. '{to}' is reset to 'COG' for 'to'." to = "COG" warnings.warn( message, RuntimeWarning, stacklevel=2, ) other_data: np.ndarray | tuple | Self = np.asanyarray(other) if isinstance(other, type(self)): if to.lower() in ["com", "center of mass", "center_of_mass"]: to = "COM" other_data = np.asanyarray(other.center_of_mass) other_data = (other_data[0], other_data[1], other_data[2]) elif to.lower() in ["cog", "center of geometry", "center_of_geometry"]: to = "COG" other_data = np.asanyarray(other.center_of_geometry) other_data = (other_data[0], other_data[1], other_data[2]) else: # 'any' or 'all' to = to.upper() other_data = other else: if to.lower() in ["cog", "center of geometry", "center_of_geometry"]: to = "COG" other_data = np.mean(other, axis=0) other_data = (other_data[0], other_data[1], other_data[2]) else: # 'any' or 'all' to = to.upper() other_data = tuple( [ (other[i, 0], other[i, 1], other[i, 2]) for i in range(other.shape[0]) ] ) distance_matrix = PDBDataFrame.get_distance_matrix( self, other_data=other_data, ) mask = distance_matrix <= cut_off if to in ["COG", "COM"]: if invert: return self.atoms[~mask[:, 0]] return self.atoms[mask[:, 0]] if (to == "ALL" and not invert) or (to == "ANY" and invert): return self.atoms[mask.all(axis=1)] return self.atoms[mask.any(axis=1)]
[docs] def _filter_num_col( self, value: int | float | list[int], num_col_name: str, relation: str | None = None, invert: bool = False, epsilon: float | int = 0.01, suppress_warning: bool = False, ) -> Self: """Generic function to do filter by a numerical column. Args: value (required): value(s) to select by the column given by the ``num_col_name`` input. num_col_name (required): one of ``atom_number``, ``residue_number``, ``x_coord``, ``y_coord``, or ``z_coord``, ``occupancy``, ``b_factor``, and ``nmr_model``. Note: the ``charge`` column is not numerical by ``PDB`` format. relation (optional): ``x/y/z_coord`` relationship to ``value``. It has to be one of ``'<'``, ``'<='``, ``'='``, ``'>='``, and ``'>'``. If ``None``, ``'<='`` is used. Ignored if a list of integers are provided to ``value``. Defaults to **None**. invert (optional): whether to invert the selection. Defaults to **False**. epsilon (optional): atoms ``abs``(``num_col_value`` - ``value``) <= ``epsilon`` are selected when ``invert`` = ``False`` and ``relation`` = ``'='``. Ignored if a list of integers are provided to ``value``. Defaults to **0.01**. suppress_warning (optional): whether to suppress warnings. Defaults to **False**. Returns: sub ``PDBDataFrame`` Raises: ValueError: if xyz not in [``atom_number``, ``residue_number``, ``x_coord``, ``y_coord``, or ``z_coord``, ``occupancy``, ``b_factor``, ``nmr_model``] or ``relation`` not in [``'<'``, ``'<='``, ``'='``, ``'>='``, ``'>'``] when selecting on float cols. """ allowed_num_col_names = [ "atom_number", "residue_number", "x_coord", "y_coord", "z_coord", "occupancy", "b_factor", "nmr_model", ] if num_col_name not in allowed_num_col_names: message = f"Only '{allowed_num_col_names}' are allowed in 'num_col_name' " message += f"but {num_col_name} was put." raise ValueError(message) allowed_relations = ["<", "<=", "=", ">=", ">"] if relation is None and not isinstance(value, list): if isinstance(value, float): relation = "<=" elif isinstance(value, int): relation = "=" else: message = "Only 'int', 'float', or 'list[int]' are allowed in 'value' " message += f"but {type(value)} was put." raise ValueError(message) elif not isinstance(value, list) and relation not in allowed_relations: message = f"Only '{allowed_relations}' are allowed in 'relation' " message += f"but {relation} was put." raise ValueError(message) elif isinstance(value, list): for v in value: if not isinstance(v, int): message = "Only 'int' is allowed in 'value' if it is a list, " message += f"but {type(v)} was in {value}." raise ValueError(message) if relation is not None: relation = None if not suppress_warning: message = "'relation' is ignored " message += "when a list is provided to 'value'." warnings.warn( message, RuntimeWarning, stacklevel=2, ) if relation == "<": if invert: return self.atoms[self.atoms[num_col_name].values >= value] return self.atoms[self.atoms[num_col_name].values < value] elif relation == "<=": if invert: return self.atoms[self.atoms[num_col_name].values > value] return self.atoms[self.atoms[num_col_name].values <= value] elif relation == "=": if invert: return self.atoms[ np.abs(self.atoms[num_col_name].values - value) >= epsilon ] return self.atoms[np.abs(self.atoms[num_col_name].values - value) < epsilon] elif relation == ">=": if invert: return self.atoms[self.atoms[num_col_name].values < value] return self.atoms[self.atoms[num_col_name].values >= value] elif relation == ">": if invert: return self.atoms[self.atoms[num_col_name].values <= value] return self.atoms[self.atoms[num_col_name].values > value] # relation is None -> a list of numbers if invert: return self.atoms[~np.isin(self.atoms[num_col_name].values, value)] return self.atoms[np.isin(self.atoms[num_col_name].values, value)]
[docs] @classmethod def get_chain_list( cls, pdb_df: Self, ) -> list: """Gets the list of chain ids given a ``PDBDataFrame`` object. Args: pdb_df (required): a ``PDBDataFrame`` object. Returns: a list of chain ids. Warning: This method relies on the original and standard numbering of the atoms and residues in the DataFrame. Therefore, any selecting or sorting of the it can lead to wrong results. """ chains: list = [] pre_residue_number = 0 for chain_id, _, residue_number in pdb_df.residue_list: if not chains: chains.append(chain_id) if chain_id != chains[-1]: chains.append(chain_id) elif residue_number < pre_residue_number: chains.append(chain_id) pre_residue_number = residue_number return chains
[docs] @classmethod def get_residue_list( cls, pdb_df: Self, include_heteros: bool = False ) -> list[tuple]: """Gets the list of residues given a ``PDBDataFrame`` object. Args: pdb_df (required): a ``PDBDataFrame`` object. include_heteros (optional): whether to include hetero ligands. Defaults to **False**. Returns: a list of residues as (``chain_id``, ``residue_name``, ``residue_number``). Warning: This method relies on the original and standard numbering of the atoms and residues in the DataFrame. Therefore, any selecting or sorting of the it can lead to wrong results. """ all_residues: list[tuple] = [] prev_atom_number = 0 for atom_number, chain, residue_name, residue_number in zip( pdb_df["atom_number"], pdb_df["chain_id"], pdb_df["residue_name"], pdb_df["residue_number"], ): if not include_heteros and residue_name not in pdb_df.RESIDUE_CODES: continue residue = (chain, residue_name, residue_number) if len(all_residues) == 0: all_residues.append(residue) elif residue != all_residues[-1]: all_residues.append(residue) elif residue == all_residues[-1] and (atom_number - prev_atom_number) > 1: all_residues.append(residue) prev_atom_number = atom_number return all_residues
[docs] @classmethod @functools.lru_cache() def get_distance_matrix( cls, pdb_df: Self, other_data: Self | tuple | None = None, use_r2: bool = True, square_form: bool = False, ) -> np.ndarray: """Calculates the distance matrix given a ``PDBDataFrame`` object and (optional) reference data. Args: pdb_df (required): a ``PDBDataFrame`` object. other_data (optional): the coordinates of to calculate the distances against. Defaults to **None**. use_r2 (optional): whether to use r^2 or r for distance matrix. Defaults to **True**. square_form (optional): whether to output a square form of the density matrix. If two ``PDBDataFrame``s are different or ``other_data`` is not a ``PDBDataFrame``, ``square_form`` is ignored. Defaults to **False**. Returns: distance matrix (squared or condensed form) Raises: ValueError: if ``other_data`` is not of ``PDBDataFrame|tuple|None`` type or wrong shape if it is a ``tuple``. """ cols = ["x_coord", "y_coord", "z_coord"] if other_data is None or ( isinstance(other_data, cls) and pdb_df.atoms == other_data.atoms ): if not square_form: if use_r2: return pdist(pdb_df.atoms[cols].values, "sqeuclidean") return pdist(pdb_df.atoms[cols].values) if use_r2: return squareform(pdist(pdb_df.atoms[cols].values, "sqeuclidean")) return squareform(pdist(pdb_df.atoms[cols].values)) elif isinstance(other_data, cls): if use_r2: return cdist( pdb_df.atoms[cols].values, other_data.atoms[cols].values, "sqeuclidean", ) return cdist( pdb_df.atoms[cols].values, other_data.atoms[cols].values, "euclidean" ) elif isinstance(other_data, tuple): other_array = np.asanyarray(other_data) if other_array.shape != (3,): if not (len(other_array.shape) == 2 and other_array.shape[1] == 3): message = "'other_data' expects a shape of (N, 3) or (3,) " message += f"if given a tuple, but {other_array.shape} was given." raise ValueError(message) else: other_array = other_array.reshape(1, 3) if use_r2: return cdist(pdb_df.atoms[cols].values, other_array, "sqeuclidean") return cdist(pdb_df.atoms[cols].values, other_array, "euclidean") raise ValueError("'other_data' type has to be tuple or PDBDataFrame or None.")