# Source code for moldf.write_jcsv

# MolDF
# Author: Ruibin Liu <ruibinliuphd@gmail.com>
# License: MIT
# Code Repository: https://github.com/Ruibin-Liu/MolDF
"""Write any dict of Pandas DataFrame to JCSV."""
from __future__ import annotations

import inspect
import os

import pandas as pd  # type: ignore



# [docs]
def write_jcsv(
    data: dict[str, pd.DataFrame],
    file_name: str | os.PathLike | None = None,
    write_meta: bool = True,
    **kwargs,
) -> None:
    """Write a dict of ``Pandas DataFrame`` s into a JCSV file.
    See https://github.com/Ruibin-Liu/JCSV for definitions.

    Each category is written as a ``#<category name>`` line followed by the
    CSV text of its ``DataFrame``. When ``write_meta`` is true, a leading
    ``#jcsv_meta`` category lists the 1-based line on which every category starts.
    Those start lines are computed as ``len(df) + 2`` lines per category, i.e. they
    assume a header row and one physical line per ``DataFrame`` row.

    Args:
        data (required): a ``dict`` of ``Pandas DataFrame`` s to write.
        file_name (optional): file name to write a JCSV file. If ``None``,
            ``moldf_output.jcsv`` will be used as the file name if ``path_or_buf`` is not
            specified in ``**kwargs``. Defaults to **None**.
        write_meta (optional): whether to write meta data into the **first** category.
            Currently, only the first line number for each category is recorded.
            Defaults to **True**.
        **kwargs: keyword arguments for ``pd.DataFrame.to_csv``. Invalid ones are ignored.
            ``index`` defaults to ``False`` unless given explicitly.
            Check https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html

    Raises:
        TypeError: if ``data`` is not a valid dict of ``DataFrame``.
        ValueError: if ``"`` in any of the category names and ``write_meta`` is true.
            Raised before the output file is opened.
    """
    if file_name is None:
        file_name = kwargs.get("path_or_buf")
        if file_name is None:
            file_name = "moldf_output.jcsv"

    if not isinstance(data, dict):
        raise TypeError(f"'data' has to be a dict but {type(data)} is provided.")
    for frame in data.values():
        if not isinstance(frame, pd.DataFrame):
            raise TypeError(f"{frame} is not a Pandas DataFrame object.")

    # Use the signature rather than ``__code__.co_varnames``: when ``to_csv`` is
    # wrapped by a decorator, ``__code__`` belongs to the wrapper and lists none
    # of the real parameters, whereas ``inspect.signature`` follows the wrapper.
    accepted = set(inspect.signature(pd.DataFrame.to_csv).parameters)
    accepted -= {"self", "path_or_buf"}
    kwargs = {k: v for k, v in kwargs.items() if k in accepted}
    # A default instead of a hard-coded argument, so a caller-supplied ``index``
    # does not collide with it as a duplicate keyword.
    kwargs.setdefault("index", False)

    line_terminator = kwargs.get("lineterminator")
    if line_terminator is None:
        line_terminator = kwargs.get("line_terminator")  # old Pandas versions
    if line_terminator is None:
        line_terminator = os.linesep

    # Build (and validate) the meta category before touching the file so that an
    # invalid category name does not leave a truncated file behind.
    meta_lines: list[str] = []
    if write_meta:
        meta_lines = [
            "#jcsv_meta",
            "category,start_line_index",
            "meta,1",  # just to be self-consistent
        ]
        # Three fixed meta lines plus one line per category precede the data.
        next_start = len(data) + 4
        for key, df in data.items():
            # Check for '"' first: a name holding both quote characters cannot
            # be represented by wrapping it in double quotes.
            if '"' in key:
                message = '" is not supported in category names, but'
                message += f" {key} has it."
                raise ValueError(message)
            if "'" in key:
                key = f'"{key}"'
            meta_lines.append(f"{key},{next_start}")
            # One '#key' line, one header line, and one line per row.
            next_start += len(df) + 2

    # newline="" disables newline translation; otherwise a "\r\n" terminator
    # would be written as "\r\r\n" on platforms that translate "\n".
    with open(file_name, "w", encoding="utf-8", newline="") as out_file:
        for line in meta_lines:
            out_file.write(f"{line}{line_terminator}")
        for key, df in data.items():
            out_file.write(f"#{key}{line_terminator}")
            out_file.write(df.to_csv(**kwargs))