Skip to content

Input/Output Utilities

This module provides functions for loading and saving datasets, as well as converting between different data formats. It is useful for preparing data for training and testing DirectMultiStep models.

Example Use

The most useful functions are load_dataset_sm, load_dataset_nosm, save_dataset_sm, and load_pharma_compounds. These functions allow you to load and save datasets in a variety of formats.

from pathlib import Path
from directmultistep.utils.io import load_pharma_compounds

data_path = Path.cwd() / "data"

_products, _sms, _path_strings, _steps_list, nameToIdx = load_pharma_compounds(data_path / "pharma_compounds.json")

Source Code

directmultistep.utils.io

DatasetDict

Bases: TypedDict

A dictionary type for storing dataset information.

Attributes:

Name Type Description
products list[str]

List of product SMILES strings.

starting_materials list[str]

List of starting material SMILES strings.

path_strings list[str]

List of string representations of reaction paths.

n_steps_list list[int]

List of integers representing the number of steps in each path.

ds_name str

Name of the dataset.

nameToIdx dict[str, list[int]] | None

A dictionary mapping names to lists of indices.

Source code in src/directmultistep/utils/io.py
class DatasetDict(TypedDict, total=False):
    """
    A dictionary type for storing dataset information.

    Attributes:
        products: List of product SMILES strings.
        starting_materials: List of starting material SMILES strings.
        path_strings: List of string representations of reaction paths.
        n_steps_list: List of integers representing the number of steps in each path.
        ds_name: Name of the dataset.
        nameToIdx: A dictionary mapping names to lists of indices.
    """

    products: list[str]
    starting_materials: list[str]
    path_strings: list[str]
    n_steps_list: list[int]
    ds_name: str
    nameToIdx: dict[str, list[int]] | None

load_dataset_sm(path)

Loads a dataset from a pickle file containing starting materials.

Parameters:

Name Type Description Default
path Path

The path to the pickle file.

required

Returns:

Type Description
DatasetDict

A dictionary containing the loaded dataset.

Source code in src/directmultistep/utils/io.py
def load_dataset_sm(path: Path) -> DatasetDict:
    """Loads a dataset from a pickle file containing starting materials.

    Args:
        path: The path to the pickle file.

    Returns:
        A dictionary containing the loaded dataset.
    """
    with open(path, "rb") as file:
        products, starting_materials, path_strings, n_steps_list = pickle.load(file)
    ds_name = path.stem.split("_")[0]
    return {
        "products": products,
        "starting_materials": starting_materials,
        "path_strings": path_strings,
        "n_steps_list": n_steps_list,
        "ds_name": ds_name,
    }

load_dataset_nosm(path)

Loads a dataset from a pickle file without starting materials.

Parameters:

Name Type Description Default
path Path

The path to the pickle file.

required

Returns:

Type Description
DatasetDict

A dictionary containing the loaded dataset.

Source code in src/directmultistep/utils/io.py
def load_dataset_nosm(path: Path) -> DatasetDict:
    """Loads a dataset from a pickle file without starting materials.

    Args:
        path: The path to the pickle file.

    Returns:
        A dictionary containing the loaded dataset.
    """
    with open(path, "rb") as file:
        products, _, path_strings, n_steps_list = pickle.load(file)
    ds_name = path.stem.split("_")[0]
    return {
        "products": products,
        "path_strings": path_strings,
        "n_steps_list": n_steps_list,
        "ds_name": ds_name,
    }

save_dataset_sm(data, path)

Saves a dataset to a pickle file, including starting materials.

Parameters:

Name Type Description Default
data dict[str, Any]

The dataset dictionary to save.

required
path Path

The path to save the pickle file.

required
Source code in src/directmultistep/utils/io.py
def save_dataset_sm(data: dict[str, Any], path: Path) -> None:
    """Saves a dataset to a pickle file, including starting materials.

    Args:
        data: The dataset dictionary to save.
        path: The path to save the pickle file.
    """
    with open(path, "wb") as file:
        p, sm, ps, ns = data["products"], data.get("starting_materials", []), data["path_strings"], data["n_steps_list"]
        pickle.dump((p, sm, ps, ns), file)

convert_dict_of_lists_to_list_of_dicts(dict_of_lists)

Converts a dictionary of lists to a list of dictionaries.

Parameters:

Name Type Description Default
dict_of_lists DatasetDict

The dictionary of lists to convert.

required

Returns:

Type Description
list[dict[str, str]]

A list of dictionaries.

Source code in src/directmultistep/utils/io.py
def convert_dict_of_lists_to_list_of_dicts(dict_of_lists: DatasetDict) -> list[dict[str, str]]:
    """Converts a dictionary of lists to a list of dictionaries.

    Args:
        dict_of_lists: The dictionary of lists to convert.

    Returns:
        A list of dictionaries.
    """
    return [dict(zip(dict_of_lists.keys(), values)) for values in zip(*dict_of_lists.values())]

convert_list_of_dicts_to_dict_of_lists(list_of_dicts)

Converts a list of dictionaries to a dictionary of lists.

Parameters:

Name Type Description Default
list_of_dicts list[dict[str, str]]

The list of dictionaries to convert.

required

Returns:

Type Description
dict[str, list[str]]

A dictionary of lists.

Source code in src/directmultistep/utils/io.py
def convert_list_of_dicts_to_dict_of_lists(list_of_dicts: list[dict[str, str]]) -> dict[str, list[str]]:
    """Converts a list of dictionaries to a dictionary of lists.

    Args:
        list_of_dicts: The list of dictionaries to convert.

    Returns:
        A dictionary of lists.
    """
    return {key: [item[key] for item in list_of_dicts] for key in list_of_dicts[0].keys()}

load_pharma_compounds(path_to_json, load_sm=True)

Loads pharmaceutical compounds from a JSON file.

Parameters:

Name Type Description Default
path_to_json Path

The path to the JSON file.

required
load_sm bool

Whether to load starting materials.

True

Returns:

Type Description
DatasetDict

A dictionary containing the loaded dataset.

Source code in src/directmultistep/utils/io.py
def load_pharma_compounds(
    path_to_json: Path,
    load_sm: bool = True,
) -> DatasetDict:
    """Loads pharmaceutical compounds from a JSON file.

    Args:
        path_to_json: The path to the JSON file.
        load_sm: Whether to load starting materials.

    Returns:
        A dictionary containing the loaded dataset.
    """
    with open(path_to_json, "r") as file:
        data = json.load(file)
    _products, _sms, _path_strings, _steps_list = [], [], [], []
    name_idx: dict[str, list[int]] = {}
    idx = 0
    for item in data:
        path_dict = eval(item["path"])
        all_sm = find_leaves(path_dict)
        if load_sm:
            for sm in all_sm:
                name_idx.setdefault(item["name"], []).append(idx)
                _path_strings.append(item["path"])
                _products.append(eval(item["path"])["smiles"])
                _sms.append(sm)
                _steps_list.append(max_tree_depth(path_dict))
                idx += 1
        else:
            name_idx.setdefault(item["name"], []).append(idx)
            _path_strings.append(item["path"])
            _products.append(eval(item["path"])["smiles"])
            _steps_list.append(max_tree_depth(path_dict))
            idx += 1

    if load_sm:
        return {
            "products": _products,
            "starting_materials": _sms,
            "path_strings": _path_strings,
            "n_steps_list": _steps_list,
            "nameToIdx": name_idx,
        }
    else:
        return {
            "products": _products,
            "path_strings": _path_strings,
            "n_steps_list": _steps_list,
            "nameToIdx": name_idx,
        }

load_commercial_stock(path)

Loads a set of molecules from a file, canonicalizes them, and returns a set.

Parameters:

Name Type Description Default
path Path

The path to the file containing molecules.

required

Returns:

Type Description
set[str]

A set of canonicalized SMILES strings.

Source code in src/directmultistep/utils/io.py
def load_commercial_stock(path: Path) -> set[str]:
    """Loads a set of molecules from a file, canonicalizes them, and returns a set.

    Args:
        path: The path to the file containing molecules.

    Returns:
        A set of canonicalized SMILES strings.
    """
    with open(path, "r") as file:
        stock = file.readlines()
    canonical_stock = set()
    for molecule in stock:
        canonical_stock.add(canonicalize_smiles(molecule.strip()))
    return canonical_stock