# Source code for rxn.chemutils.smiles_standardization

import logging
from typing import Optional

from .conversion import inchi_to_mol, mol_to_inchi, mol_to_smiles, smiles_to_mol
from .exceptions import InvalidInchi, InvalidSmiles

# Module-level logger named after this module. A NullHandler is attached so the
# logger always has at least one handler, even if the application configures none.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Separator token for reaction SMILES. It is not referenced in the visible part
# of this module -- presumably it is consumed by importers (TODO confirm).
RXN_SMILES_SEPARATOR = ">>"


def standardize_smiles(
    smiles: str,
    canonicalize: bool = True,
    sanitize: bool = True,
    find_radicals: bool = True,
    inchify: bool = False,
) -> str:
    """Ensure that a SMILES follows a desired standard.

    It allows canonicalization, sanitization and inchification, keeping
    stereochemistry (``isomericSmiles=True``).
    It can process multiple molecules separated by ".".
    Note that inchify set to True will also canonicalize the molecule,
    regardless of the value of ``canonicalize``.

    Args:
        smiles: SMILES representation of a molecule.
        canonicalize: canonicalize SMILES. Defaults to True. When False (and
            ``inchify`` is False), the input string is returned unchanged
            after having been parsed successfully.
        sanitize: sanitize SMILES; forwarded to ``smiles_to_mol``.
            Defaults to True.
        find_radicals: forwarded to ``smiles_to_mol``. Defaults to True.
        inchify: inchify the SMILES, i.e. round-trip the molecule through
            InChI. Defaults to False.

    Returns:
        a SMILES following the desired standard. If the conversion to or from
        InChI fails, the failure is logged and the SMILES generated directly
        from the parsed molecule is returned instead.

    Raises:
        InvalidSmiles: if the SMILES cannot be parsed (logged, then re-raised).
    """
    try:
        molecule = smiles_to_mol(smiles, sanitize=sanitize, find_radicals=find_radicals)
    except InvalidSmiles:
        logger.error(f"SMILES parsing failure: {smiles}.")
        raise

    if not inchify:
        # Without canonicalization the input is only validated (parsed above)
        # and handed back verbatim.
        if canonicalize:
            return mol_to_smiles(molecule, isomericSmiles=True)
        return smiles

    # InChI round-trip: any failure falls back to the SMILES of the molecule
    # parsed from the input, so inchification is best-effort and never raises.
    try:
        inchi_string = mol_to_inchi(molecule)
    except InvalidInchi:
        logger.error(
            f"Inchification failure for SMILES: {smiles}. Returning its canonical version."
        )
        return mol_to_smiles(molecule, isomericSmiles=True)

    try:
        molecule_from_inchi = inchi_to_mol(inchi_string)
    except InvalidInchi:
        logger.error(
            f"De-inchification failure for InChi: {inchi_string}. Returning its canonical version."
        )
        return mol_to_smiles(molecule, isomericSmiles=True)

    # The atom order of a molecule rebuilt from InChI is unrelated to the input
    # SMILES, so a non-canonical output is not meaningful here: canonicalization
    # is always requested, whatever the value of `canonicalize`.
    return mol_to_smiles(molecule_from_inchi, canonical=True)


def standardize_molecules(
    molecules: str,
    canonicalize: bool = True,
    sanitize: bool = True,
    inchify: bool = False,
    fragment_bond: str = "~",
    ordered_precursors: bool = True,
    molecule_token_delimiter: Optional[str] = None,
    is_enzymatic: bool = False,
    enzyme_separator: str = "|",
) -> str:
    """Ensure that a set of molecules represented by a string follows a desired standard.

    Args:
        molecules: molecules SMILES. Molecules can be separated via a ".".
            Fragments are supported with a custom `fragment_bond`.
        canonicalize: canonicalize SMILES. Defaults to True.
        sanitize: sanitize SMILES. Defaults to True.
        inchify: inchify the SMILES. Defaults to False.
        fragment_bond: fragment bond. Defaults to '~'.
        ordered_precursors: order precursors. Defaults to True. When no
            fragment bond is present, the ordering is the one produced by
            ``standardize_smiles``; with both ``canonicalize`` and ``inchify``
            set to False, that function returns its input unchanged and the
            original order is therefore kept.
        molecule_token_delimiter: delimiter for big molecule tokens; all its
            occurrences are removed from the molecules (not from the enzyme
            part) before standardization. Defaults to None.
        is_enzymatic: the molecules are representing an enzymatic reaction. Defaults to False.
        enzyme_separator: separator for molecules and the enzyme. Defaults to '|'.
            Everything from the first separator on is treated as enzyme
            information and appended, unmodified, to the result.

    Returns:
        standardized molecules.

    Raises:
        InvalidSmiles: propagated from ``standardize_smiles`` when a SMILES
            cannot be parsed.

    Examples:
        Standardize multiple molecules:
        >>> standardize_molecules('CCO.CC')
        'CC.CCO'
        Standardize multiple molecules including fragment information:
        >>> standardize_molecules('CCO.CC~C')
        'CCO.C~CC'
    """

    def _standardize(smiles: str) -> str:
        # Single place forwarding the standardization options.
        return standardize_smiles(
            smiles,
            canonicalize=canonicalize,
            sanitize=sanitize,
            inchify=inchify,
        )

    enzyme = ""
    if is_enzymatic:
        # Split at the FIRST separator only, so that enzyme information that
        # itself contains the separator is kept in full instead of truncated.
        molecules, separator, enzyme_info = molecules.partition(enzyme_separator)
        # `separator` is empty when no enzyme part is present.
        enzyme = f"{separator}{enzyme_info}"

    if molecule_token_delimiter is not None:
        molecules = molecules.replace(molecule_token_delimiter, "")

    if fragment_bond in molecules:
        # Each "."-separated entry may be a multi-fragment compound: swap the
        # fragment bond for "." to get a valid SMILES, standardize, swap back.
        standardized_molecules_list = [
            _standardize(molecule.replace(fragment_bond, ".")).replace(
                ".", fragment_bond
            )
            for molecule in molecules.split(".")
        ]
        if ordered_precursors:
            standardized_molecules_list.sort()
        standardized_molecules = ".".join(standardized_molecules_list)
    elif ordered_precursors:
        # The whole string is standardized at once; the order of the molecules
        # is the one returned by standardize_smiles (see `ordered_precursors`
        # in the docstring for the non-canonicalizing case).
        standardized_molecules = _standardize(molecules)
    else:
        # Standardize molecule by molecule to preserve the input order.
        standardized_molecules = ".".join(
            [_standardize(molecule) for molecule in molecules.split(".")]
        )

    # add optional enzyme information
    return f"{standardized_molecules}{enzyme}"