# Source code for rxn.reaction_preprocessing.molecule_standardizer

from typing import List, Optional, Tuple

from rxn.chemutils.conversion import canonicalize_smiles
from rxn.chemutils.exceptions import InvalidSmiles
from rxn.chemutils.reaction_equation import ReactionEquation

from rxn.reaction_preprocessing.annotations.missing_annotation_detector import (
    MissingAnnotationDetector,
)
from rxn.reaction_preprocessing.annotations.molecule_annotation import (
    MoleculeAnnotation,
)
from rxn.reaction_preprocessing.annotations.molecule_replacer import MoleculeReplacer
from rxn.reaction_preprocessing.annotations.rejected_molecules_filter import (
    RejectedMoleculesFilter,
)
from rxn.reaction_preprocessing.cleaner import remove_isotope_information


class MoleculeStandardizationError(ValueError):
    """
    Base class for the exceptions raised during molecule standardization.

    Derives from ValueError, so that callers catching ValueError also
    catch the standardization failures.
    """


class RejectedMolecule(MoleculeStandardizationError):
    """Exception raised when standardizing a molecule annotated as "Rejected"."""

    def __init__(self, smiles: str):
        """
        Args:
            smiles: rejected SMILES string.
        """
        super().__init__(f'Cannot standardize: rejected molecule "{smiles}"')
        # Keep the offending SMILES available to callers, in the same way
        # as MissingAnnotation does.
        self.smiles = smiles


class MissingAnnotation(MoleculeStandardizationError):
    """Exception raised when standardizing a molecule that should be annotated."""

    def __init__(self, smiles: str):
        """
        Args:
            smiles: SMILES string of the molecule that requires an annotation.
        """
        super().__init__(f'Cannot standardize: molecule "{smiles}" must be annotated.')
        # Keep the offending SMILES available to callers.
        self.smiles = smiles


class MoleculeStandardizer:
    """
    Class to standardize standalone molecules (reactions are standardized with
    the Standardizer class).

    Note that the standardization of one molecule may lead to a combination
    of molecules, hence the functions return lists of strings.
    """

    def __init__(
        self,
        annotations: Optional[List[MoleculeAnnotation]] = None,
        discard_missing_annotations: bool = False,
        canonicalize: bool = True,
    ):
        """
        Args:
            annotations: A list of MoleculeAnnotation objects used to perform
                the substitutions / rejections. Defaults to an empty list.
            discard_missing_annotations: whether molecules that should be
                annotated, but are not, must be rejected (standardize() then
                raises MissingAnnotation for them).
            canonicalize: whether to canonicalize the compounds.
        """
        if annotations is None:
            annotations = []

        # NOTE: the attribute name differs from the constructor argument; it
        # is left unchanged so that code reading the attribute keeps working.
        self.discard_unannotated_metals = discard_missing_annotations
        self.canonicalize = canonicalize

        # The three helpers below are all built from the same annotations.
        self.rejection_filter = RejectedMoleculesFilter.from_molecule_annotations(
            annotations
        )
        self.missing_annotation_detector = (
            MissingAnnotationDetector.from_molecule_annotations(annotations)
        )
        self.molecule_replacer = MoleculeReplacer.from_molecule_annotations(annotations)

    def __call__(self, smiles: str) -> List[str]:
        """See doc for standardize()."""
        return self.standardize(smiles)

    def standardize(self, smiles: str) -> List[str]:
        """
        Standardize a molecule.

        The returned value is a list, because in some cases standardization
        returns two independent molecules.

        Args:
            smiles: SMILES string to standardize. Use dots for fragment bonds!

        Raises:
            SanitizationError or one of its subclasses: error in sanitization.
            InvalidSmiles: Invalid SMILES.
            RejectedMolecule: the molecule does not pass the rejection filter.
            MissingAnnotation: the molecule needs an annotation (checked only
                if discard_missing_annotations was set in the constructor).
            ValueError: "~" being used for fragment bonds.

        Returns:
            List of standardized SMILES strings.
        """
        if "~" in smiles:
            raise ValueError(f'MoleculeStandardizer must be used without "~": {smiles}')

        # Discard isotope information
        smiles = remove_isotope_information(smiles)

        # Always canonicalize, since this is also what checks the validity of
        # the SMILES (may raise InvalidSmiles). The result is kept only if
        # canonicalization is required.
        canonical_smiles = canonicalize_smiles(smiles)
        if self.canonicalize:
            smiles = canonical_smiles

        # Check for rejected molecules
        if not self.rejection_filter.is_valid_molecule_smiles(smiles):
            raise RejectedMolecule(smiles)

        # Check for non-annotated molecules
        if self.discard_unannotated_metals:
            if self.missing_annotation_detector.molecule_needs_annotation(smiles):
                raise MissingAnnotation(smiles)

        # Replace annotated molecules
        return self.molecule_replacer.replace_molecule_smiles(smiles)

    def standardize_in_equation(self, reaction: ReactionEquation) -> ReactionEquation:
        """
        Do the molecule-wise standardization for a reaction equation.

        Relies on standardize_in_equation_with_errors(), for modularity purposes.
        Will propagate the exceptions raised in that function.

        Args:
            reaction: reaction to standardize.

        Returns:
            The standardized reaction equation.
        """

        # Ignoring the lists of SMILES returned in the tuple (which, by construction,
        # will always be empty: if not, an exception will have been raised earlier).
        reaction, *_ = self.standardize_in_equation_with_errors(
            reaction, propagate_exceptions=True
        )
        return reaction

    def standardize_in_equation_with_errors(
        self, reaction: ReactionEquation, propagate_exceptions: bool = False
    ) -> Tuple[ReactionEquation, List[str], List[str], List[str]]:
        """
        Do the molecule-wise standardization for a reaction equation, and get the reasons for
        potential failures.

        This function was originally implemented in Standardizer, and then moved here for more
        modularity.

        Only InvalidSmiles, RejectedMolecule and MissingAnnotation are collected;
        any other exception raised by standardize() is always propagated.

        Args:
            reaction: reaction to standardize.
            propagate_exceptions: if True, will stop execution and raise directly
                instead of collecting the SMILES leading to the failure. Not ideal,
                but probably the only way (?) to not have duplicated code in the
                function standardize_in_equation().

        Returns:
            Tuple:
                - the standardized reaction equation (or an empty one if there was a failure).
                - list of invalid SMILES in the reaction.
                - list of rejected SMILES in the reaction.
                - list of missing annotations in the reaction.
        """

        missing_annotations: List[str] = []
        invalid_smiles: List[str] = []
        rejected_smiles: List[str] = []

        # Which list collects the SMILES for which kind of failure. The order
        # matters: the first matching exception type wins.
        failures_by_error = (
            (InvalidSmiles, invalid_smiles),
            (RejectedMolecule, rejected_smiles),
            (MissingAnnotation, missing_annotations),
        )

        # Iterate over the reactants, agents, products and update the
        # standardized reaction at the same time
        standardized_reaction = ReactionEquation([], [], [])
        for original_role_group, new_role_group in zip(reaction, standardized_reaction):
            for smiles in original_role_group:
                try:
                    new_role_group.extend(self.standardize(smiles))
                except (InvalidSmiles, RejectedMolecule, MissingAnnotation) as error:
                    if propagate_exceptions:
                        raise
                    for error_type, collected_smiles in failures_by_error:
                        if isinstance(error, error_type):
                            collected_smiles.append(smiles)
                            break

        # If there was any error: replace by empty reaction equation (">>")
        if invalid_smiles or rejected_smiles or missing_annotations:
            standardized_reaction = ReactionEquation([], [], [])

        return (
            standardized_reaction,
            invalid_smiles,
            rejected_smiles,
            missing_annotations,
        )