Source code for rxn.reaction_preprocessing.standardizer

# LICENSED INTERNAL CODE. PROPERTY OF IBM.
# IBM Research Zurich Licensed Internal Code
# (C) Copyright IBM Corp. 2021
# ALL RIGHTS RESERVED
""" A utility class to apply standardization to the data """

from pathlib import Path
from typing import List, Optional

from attr import define
from rxn.chemutils.miscellaneous import remove_chiral_centers
from rxn.chemutils.reaction_smiles import parse_any_reaction_smiles
from rxn.utilities.csv import CsvIterator, StreamingCsvEditor

from rxn.reaction_preprocessing.annotations.molecule_annotation import (
    MoleculeAnnotation,
    load_annotations_multiple,
)
from rxn.reaction_preprocessing.config import StandardizeConfig
from rxn.reaction_preprocessing.molecule_standardizer import MoleculeStandardizer


class Standardizer:
    """Standardize reaction SMILES, one at a time or for a whole CSV file.

    The molecule-level work is delegated to a ``MoleculeStandardizer`` built
    from the annotations given to the constructor; this class adds the
    optional stereo clean-up of the products and the CSV plumbing.
    """

    def __init__(
        self,
        annotations: List[MoleculeAnnotation],
        discard_unannotated_metals: bool,
        reaction_column_name: str,
        fragment_bond: Optional[str] = None,
        remove_stereo_if_not_defined_in_precursors: bool = False,
        keep_intermediate_columns: bool = False,
    ):
        """Creates a new instance of the Standardizer class.

        Args:
            annotations: A list of MoleculeAnnotation objects used to perform
                the substitutions/rejections.
            discard_unannotated_metals: whether reactions containing
                unannotated molecules with transition metals must be rejected.
            reaction_column_name: The name of the CSV column containing the
                reaction SMILES.
            fragment_bond: the fragment bond used when writing the
                standardized reaction SMILES.
            remove_stereo_if_not_defined_in_precursors: Remove chiral centers
                from products when no precursor carries stereo information.
            keep_intermediate_columns: Whether the columns generated during
                preprocessing should be kept.
        """
        self.molecule_standardizer = MoleculeStandardizer(
            annotations=annotations,
            discard_missing_annotations=discard_unannotated_metals,
            canonicalize=True,
        )
        self.remove_stereo_if_not_defined_in_precursors = (
            remove_stereo_if_not_defined_in_precursors
        )
        self.fragment_bond = fragment_bond
        self.keep_intermediate_columns = keep_intermediate_columns
        self.rxn_column = reaction_column_name

    def standardize_file(self, input_csv: Path, output_csv: Path) -> None:
        """Standardize the reactions in a CSV file.

        Args:
            input_csv: CSV with the reactions to standardize.
            output_csv: CSV where to save the standardized reactions.
        """
        # The csv module documentation asks for files handed to its readers
        # and writers to be opened with ``newline=""``: otherwise newlines
        # embedded in quoted fields are misread, and an extra "\r" is written
        # on platforms using "\r\n" line endings.
        # NOTE(review): assumes CsvIterator is built on the csv module - confirm.
        with open(input_csv, "rt", newline="") as f_in, open(
            output_csv, "wt", newline=""
        ) as f_out:
            csv_iterator = CsvIterator.from_stream(f_in)
            csv_iterator = self.standardize_iterator(csv_iterator)
            csv_iterator.to_stream(f_out)

    def standardize_iterator(self, csv_iterator: CsvIterator) -> CsvIterator:
        """Standardize the reactions in a CSV iterator.

        Same as ``standardize_file``, except that it acts directly on the
        iterator.

        Args:
            csv_iterator: input CSV iterator for the reactions to standardize.

        Returns:
            CsvIterator with reactions after the standardization step.
        """
        editor = self._instantiate_csv_editor()
        return editor.process(csv_iterator)

    def _instantiate_csv_editor(self) -> StreamingCsvEditor:
        """Build the CSV editor matching ``keep_intermediate_columns``.

        When intermediate columns are kept, the editor writes all the columns
        listed by ``StandardizationOutput.keys``; otherwise it only overwrites
        the reaction column with the standardized reaction SMILES.
        """
        if self.keep_intermediate_columns:
            return StreamingCsvEditor(
                columns_in=[self.rxn_column],
                columns_out=StandardizationOutput.keys(self.rxn_column),
                transformation=self._standardize_big,
            )

        return StreamingCsvEditor(
            columns_in=[self.rxn_column],
            columns_out=[self.rxn_column],
            transformation=self._standardize_small,
        )

    def standardize_one(self, rxn_smiles: str) -> "StandardizationOutput":
        """Standardize a single reaction SMILES.

        Args:
            rxn_smiles: reaction SMILES to standardize.

        Returns:
            StandardizationOutput holding the standardized reaction SMILES,
            the untouched input, and the invalid SMILES, rejected SMILES and
            missing annotations reported by the molecule standardizer.
        """
        original_rxn_smiles = rxn_smiles

        # Remove stereo information from products, if needed
        rxn_smiles = self._remove_stereo_if_not_defined_in_precursors(rxn_smiles)

        # Read the reaction SMILES while allowing for different formats (with
        # fragment bond, extended reaction SMILES, etc.).
        reaction_equation = parse_any_reaction_smiles(rxn_smiles)

        (
            standardized_reaction,
            invalid_smiles,
            rejected_smiles,
            missing_annotations,
        ) = self.molecule_standardizer.standardize_in_equation_with_errors(
            reaction_equation, propagate_exceptions=False
        )

        standardized_smiles = standardized_reaction.to_string(self.fragment_bond)
        return StandardizationOutput(
            standardized_rxn_smiles=standardized_smiles,
            original_rxn_smiles=original_rxn_smiles,
            invalid_smiles=invalid_smiles,
            rejected_smiles=rejected_smiles,
            missing_annotations=missing_annotations,
        )

    def _standardize_small(self, rxn_smiles: str) -> str:
        """Return only the standardized reaction SMILES."""
        return self.standardize_one(rxn_smiles).standardized_rxn_smiles

    def _standardize_big(self, rxn_smiles: str) -> List[str]:
        """Return all the output values, in the order of ``StandardizationOutput.keys``."""
        return self.standardize_one(rxn_smiles).values()

    def _remove_stereo_if_not_defined_in_precursors(self, rxn_smiles: str) -> str:
        """
        Remove stereocenters from products if not explainable by precursors.

        Nothing is done unless ``remove_stereo_if_not_defined_in_precursors``
        was enabled. The check is purely textual: it looks for "@" in the
        three ">"-separated parts of the reaction SMILES.
        """
        if not self.remove_stereo_if_not_defined_in_precursors:
            return rxn_smiles

        reactants, reagents, products = rxn_smiles.split(">")
        if "@" in products and not ("@" in reactants or "@" in reagents):
            # The helper receives the full reaction SMILES; at this point only
            # the products part contains any "@".
            rxn_smiles = remove_chiral_centers(rxn_smiles)
        return rxn_smiles
@define
class StandardizationOutput:
    """Result of standardizing one reaction SMILES, with diagnostic details.

    ``keys`` and ``values`` list the same five entries in the same order, so
    that they can be used together as CSV column names and cell values.
    """

    standardized_rxn_smiles: str
    original_rxn_smiles: str
    invalid_smiles: List[str]
    rejected_smiles: List[str]
    missing_annotations: List[str]

    @staticmethod
    def keys(rxn_column_name: str) -> List[str]:
        """Return the column names, ordered like the result of ``values``.

        The first one is the reaction column itself; the others are derived
        from it by appending a suffix.
        """
        suffixes = (
            "before_std",
            "invalid_smiles",
            "rejected_smiles",
            "missing_annotations",
        )
        derived_columns = [f"{rxn_column_name}_{suffix}" for suffix in suffixes]
        return [rxn_column_name] + derived_columns

    def values(self) -> List[str]:
        """Return the cell values, ordered like the result of ``keys``.

        The three lists are converted with ``str`` so that every value is a
        string.
        """
        diagnostics = (
            self.invalid_smiles,
            self.rejected_smiles,
            self.missing_annotations,
        )
        return [
            self.standardized_rxn_smiles,
            self.original_rxn_smiles,
            *map(str, diagnostics),
        ]
def standardize(cfg: StandardizeConfig) -> None:
    """Run the standardization step described by a configuration object.

    Loads the molecule annotations, builds a ``Standardizer`` from the
    settings in ``cfg`` and standardizes the input CSV into the output CSV.

    Args:
        cfg: configuration providing the input/output file paths, the
            annotation files and the standardization options.

    Raises:
        ValueError: if the input file does not exist.
    """
    destination = Path(cfg.output_file_path)
    source = Path(cfg.input_file_path)

    # Fail early, before loading any annotation file.
    if not source.exists():
        message = (
            f"Input file for standardization does not exist: {cfg.input_file_path}"
        )
        raise ValueError(message)

    # The annotations come from the json files listed in the configuration.
    molecule_annotations = load_annotations_multiple(cfg.annotation_file_paths)

    standardizer = Standardizer(
        annotations=molecule_annotations,
        discard_unannotated_metals=cfg.discard_unannotated_metals,
        reaction_column_name=cfg.reaction_column_name,
        fragment_bond=cfg.fragment_bond.value,
        remove_stereo_if_not_defined_in_precursors=(
            cfg.remove_stereo_if_not_defined_in_precursors
        ),
        keep_intermediate_columns=cfg.keep_intermediate_columns,
    )
    standardizer.standardize_file(source, destination)