from typing import List, Optional, Tuple
from rxn.chemutils.conversion import canonicalize_smiles
from rxn.chemutils.exceptions import InvalidSmiles
from rxn.chemutils.reaction_equation import ReactionEquation
from rxn.reaction_preprocessing.annotations.missing_annotation_detector import (
MissingAnnotationDetector,
)
from rxn.reaction_preprocessing.annotations.molecule_annotation import (
MoleculeAnnotation,
)
from rxn.reaction_preprocessing.annotations.molecule_replacer import MoleculeReplacer
from rxn.reaction_preprocessing.annotations.rejected_molecules_filter import (
RejectedMoleculesFilter,
)
from rxn.reaction_preprocessing.cleaner import remove_isotope_information
class MoleculeStandardizationError(ValueError):
    """Base class for standardization exceptions.

    Derives from ValueError, so handlers written for ValueError also catch
    every standardization exception built on top of this class.
    """
class RejectedMolecule(MoleculeStandardizationError):
    """Exception raised when standardizing a molecule annotated as "Rejected".

    Attributes:
        smiles: the SMILES string that was rejected.
    """

    def __init__(self, smiles: str):
        """
        Args:
            smiles: rejected SMILES string.
        """
        super().__init__(f'Cannot standardize: rejected molecule "{smiles}"')
        # Keep the offending SMILES on the instance, like MissingAnnotation does,
        # so that handlers can read it without parsing the message.
        self.smiles = smiles
class MissingAnnotation(MoleculeStandardizationError):
    """Exception raised when standardizing a molecule that should be annotated.

    Attributes:
        smiles: the SMILES string for which an annotation is missing.
    """

    def __init__(self, smiles: str):
        """
        Args:
            smiles: SMILES string that must be annotated but is not.
        """
        super().__init__(f'Cannot standardize: molecule "{smiles}" must be annotated.')
        self.smiles = smiles
class MoleculeStandardizer:
    """
    Class to standardize standalone molecules (reactions are standardized with
    the Standardizer class).

    Note that the standardization of one molecule may lead to a combination
    of molecules, hence the functions return lists of strings.
    """

    def __init__(
        self,
        annotations: Optional[List[MoleculeAnnotation]] = None,
        discard_missing_annotations: bool = False,
        canonicalize: bool = True,
    ):
        """
        Args:
            annotations: A list of MoleculeAnnotation objects used to perform
                the substitutions / rejections. Defaults to an empty list.
            discard_missing_annotations: if True, standardize() raises
                MissingAnnotation for molecules that the missing-annotation
                detector reports as needing an annotation, so that reactions
                containing them are rejected.
            canonicalize: whether to canonicalize the compounds.
        """
        if annotations is None:
            annotations = []

        # NOTE: the attribute name differs from the constructor argument; it is
        # left as is because external code may already read it.
        self.discard_unannotated_metals = discard_missing_annotations
        self.canonicalize = canonicalize

        # The three helpers below are all built from the same annotations.
        self.rejection_filter = RejectedMoleculesFilter.from_molecule_annotations(
            annotations
        )
        self.missing_annotation_detector = (
            MissingAnnotationDetector.from_molecule_annotations(annotations)
        )
        self.molecule_replacer = MoleculeReplacer.from_molecule_annotations(annotations)

    def __call__(self, smiles: str) -> List[str]:
        """See doc for standardize()."""
        return self.standardize(smiles)

    def standardize(self, smiles: str) -> List[str]:
        """
        Standardize a molecule.

        The returned value is a list, because in some cases standardization
        returns two independent molecules.

        Args:
            smiles: SMILES string to standardize. Use dots for fragment bonds!

        Raises:
            SanitizationError or one of its subclasses: error in sanitization.
            InvalidSmiles: Invalid SMILES.
            ValueError: "~" being used for fragment bonds.
            RejectedMolecule: the molecule does not pass the rejection filter.
            MissingAnnotation: the molecule needs an annotation, and the
                standardizer was set up to discard missing annotations.

        Returns:
            List of standardized SMILES strings.
        """
        if "~" in smiles:
            raise ValueError(f'MoleculeStandardizer must be used without "~": {smiles}')

        # Discard isotope information
        smiles = remove_isotope_information(smiles)

        # Always called, even when canonicalization is disabled: this is what
        # checks the validity of the SMILES (may raise InvalidSmiles). The
        # result only overwrites the SMILES if canonicalization is required.
        canonical_smiles = canonicalize_smiles(smiles)
        if self.canonicalize:
            smiles = canonical_smiles

        # Check for rejected molecules
        if not self.rejection_filter.is_valid_molecule_smiles(smiles):
            raise RejectedMolecule(smiles)

        # Check for non-annotated molecules (only if requested at construction)
        if self.discard_unannotated_metals:
            if self.missing_annotation_detector.molecule_needs_annotation(smiles):
                raise MissingAnnotation(smiles)

        # Replace annotated molecules
        return self.molecule_replacer.replace_molecule_smiles(smiles)

    def standardize_in_equation(self, reaction: ReactionEquation) -> ReactionEquation:
        """
        Do the molecule-wise standardization for a reaction equation.

        Relies on standardize_in_equation_with_errors(), for modularity purposes.
        Will propagate the exceptions raised in that function.
        """
        # Ignoring the lists of SMILES returned in the tuple (which, by construction,
        # will always be empty: if not, an exception will have been raised earlier).
        reaction, *_ = self.standardize_in_equation_with_errors(
            reaction, propagate_exceptions=True
        )
        return reaction

    def standardize_in_equation_with_errors(
        self, reaction: ReactionEquation, propagate_exceptions: bool = False
    ) -> Tuple[ReactionEquation, List[str], List[str], List[str]]:
        """
        Do the molecule-wise standardization for a reaction equation, and get the
        reasons for potential failures.

        This function was originally implemented in Standardizer, and then moved
        here for more modularity.

        Only InvalidSmiles, RejectedMolecule and MissingAnnotation are collected;
        any other exception raised by standardize() propagates to the caller.

        Args:
            reaction: reaction to standardize.
            propagate_exceptions: if True, will stop execution and raise directly
                instead of collecting the SMILES leading to the failure. Not ideal,
                but probably the only way (?) to not have duplicated code in the
                function standardize_in_equation().

        Returns:
            Tuple:
                - the standardized reaction equation (or an empty one if there
                  was a failure).
                - list of invalid SMILES in the reaction.
                - list of rejected SMILES in the reaction.
                - list of missing annotations in the reaction.
        """
        missing_annotations: List[str] = []
        invalid_smiles: List[str] = []
        rejected_smiles: List[str] = []

        # Iterate over the reactants, agents, products and update the
        # standardized reaction at the same time
        standardized_reaction = ReactionEquation([], [], [])
        for original_role_group, new_role_group in zip(reaction, standardized_reaction):
            for smiles in original_role_group:
                try:
                    standardized = self.standardize(smiles)
                except (InvalidSmiles, RejectedMolecule, MissingAnnotation) as error:
                    if propagate_exceptions:
                        raise
                    # Keep going after a failure, so that all the problematic
                    # SMILES of the reaction get reported, not only the first.
                    if isinstance(error, InvalidSmiles):
                        invalid_smiles.append(smiles)
                    elif isinstance(error, RejectedMolecule):
                        rejected_smiles.append(smiles)
                    else:
                        missing_annotations.append(smiles)
                else:
                    new_role_group.extend(standardized)

        # If there was any error: replace by empty reaction equation (">>")
        if invalid_smiles or rejected_smiles or missing_annotations:
            standardized_reaction = ReactionEquation([], [], [])

        return (
            standardized_reaction,
            invalid_smiles,
            rejected_smiles,
            missing_annotations,
        )