Source code for rxn.chemutils.smiles_standardization

import logging
from typing import Optional

from .conversion import inchi_to_mol, mol_to_inchi, mol_to_smiles, smiles_to_mol
from .exceptions import InvalidInchi, InvalidSmiles

# Module-level logger named after this module. A NullHandler is attached so
# that records emitted here are silently dropped unless the importing
# application configures logging itself.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Not referenced by the functions visible in this part of the module.
# NOTE(review): presumably the token separating the two sides of a reaction
# SMILES -- confirm against callers.
RXN_SMILES_SEPARATOR = ">>"


def standardize_smiles(
    smiles: str,
    canonicalize: bool = True,
    sanitize: bool = True,
    find_radicals: bool = True,
    inchify: bool = False,
) -> str:
    """Bring a SMILES string into a requested standard form.

    The input is always parsed first, so an invalid SMILES is rejected even
    when no transformation is requested. Depending on the flags, the parsed
    molecule is then either round-tripped through InChI or written back as a
    SMILES with ``isomericSmiles=True`` (keeping stereochemistry). Several
    molecules separated by "." may be given in one string.

    Args:
        smiles: SMILES representation of a molecule.
        canonicalize: when True (and ``inchify`` is False), return the SMILES
            regenerated from the parsed molecule; when False, return the
            input string unchanged. Defaults to True.
        sanitize: forwarded to ``smiles_to_mol``. Defaults to True.
        find_radicals: forwarded to ``smiles_to_mol``. Defaults to True.
        inchify: convert the molecule to InChI and back before writing the
            SMILES. This takes precedence over ``canonicalize`` and always
            yields a canonical SMILES. Defaults to False.

    Returns:
        a SMILES following the desired standard. If the InChI conversion
        fails in either direction, the error is logged and the SMILES
        regenerated from the originally parsed molecule is returned instead.

    Raises:
        InvalidSmiles: if ``smiles`` cannot be parsed (logged, then re-raised).
    """
    try:
        mol = smiles_to_mol(smiles, sanitize=sanitize, find_radicals=find_radicals)
    except InvalidSmiles:
        logger.error(f"SMILES parsing failure: {smiles}.")
        raise

    # Without inchification the only choice left is whether to regenerate the
    # SMILES from the parsed molecule or to hand the input back untouched.
    if not inchify:
        return mol_to_smiles(mol, isomericSmiles=True) if canonicalize else smiles

    # SMILES -> InChI; fall back to the plain regenerated SMILES on failure.
    try:
        inchi = mol_to_inchi(mol)
    except InvalidInchi:
        logger.error(
            f"Inchification failure for SMILES: {smiles}. Returning its canonical version."
        )
        return mol_to_smiles(mol, isomericSmiles=True)

    # InChI -> molecule; same fallback if the way back fails.
    try:
        mol_after_roundtrip = inchi_to_mol(inchi)
    except InvalidInchi:
        logger.error(
            f"De-inchification failure for InChi: {inchi}. Returning its canonical version."
        )
        return mol_to_smiles(mol, isomericSmiles=True)

    # canonical=True regardless of `canonicalize`: after an InChI round trip
    # there is no way to guarantee that no canonicalization took place.
    return mol_to_smiles(mol_after_roundtrip, canonical=True)
def standardize_molecules(
    molecules: str,
    canonicalize: bool = True,
    sanitize: bool = True,
    inchify: bool = False,
    fragment_bond: str = "~",
    ordered_precursors: bool = True,
    molecule_token_delimiter: Optional[str] = None,
    is_enzymatic: bool = False,
    enzyme_separator: str = "|",
) -> str:
    """Standardize a string holding one or more molecules.

    Args:
        molecules: molecules SMILES. Molecules can be separated via a ".".
            Fragments are supported with a custom `fragment_bond`.
        canonicalize: canonicalize SMILES. Defaults to True.
        sanitize: sanitize SMILES. Defaults to True.
        inchify: inchify the SMILES. Defaults to False.
        fragment_bond: token joining the fragments of one compound.
            Defaults to '~'.
        ordered_precursors: order precursors. Defaults to True.
        molecule_token_delimiter: delimiter for big molecule tokens; every
            occurrence is removed before standardization. Defaults to None.
        is_enzymatic: the molecules are representing an enzymatic reaction.
            Defaults to False.
        enzyme_separator: separator for molecules and the enzyme.
            Defaults to '|'.

    Returns:
        standardized molecules, followed by the separator and the enzyme
        when an enzyme part was present in the input.

    Examples:
        Standardize multiple molecules:

        >>> standardize_molecules('CCO.CC')
        'CC.CCO'

        Standardize multiple molecules including fragment information:

        >>> standardize_molecules('CCO.CC~C')
        'CCO.C~CC'
    """
    # Detach the enzyme part (if any) so only the molecules get standardized.
    # Only the first piece after the separator is kept as the enzyme.
    enzyme_suffix = ""
    if is_enzymatic:
        molecules, *after_separator = molecules.split(enzyme_separator)
        if after_separator:
            enzyme_suffix = f"{enzyme_separator}{after_separator[0]}"

    if molecule_token_delimiter is not None:
        molecules = molecules.replace(molecule_token_delimiter, "")

    smiles_options = {
        "canonicalize": canonicalize,
        "sanitize": sanitize,
        "inchify": inchify,
    }
    has_fragments = fragment_bond in molecules

    if ordered_precursors and not has_fragments:
        # One pass over the whole string: RDKit guarantees ordered precursors.
        # NOTE: with canonicalize=False and inchify=False, standardize_smiles
        # hands the input back unchanged, so the order is left as given.
        whole = standardize_smiles(molecules, **smiles_options)
        return f"{whole}{enzyme_suffix}"

    # Otherwise every "."-separated compound is standardized on its own.
    compounds = []
    for compound in molecules.split("."):
        if has_fragments:
            # Swap the fragment bond for "." to obtain a valid SMILES, then
            # restore the fragment bond in the standardized result.
            standardized = standardize_smiles(
                compound.replace(fragment_bond, "."), **smiles_options
            ).replace(".", fragment_bond)
        else:
            standardized = standardize_smiles(compound, **smiles_options)
        compounds.append(standardized)

    if has_fragments and ordered_precursors:
        compounds.sort()

    # add optional enzyme information
    joined = ".".join(compounds)
    return f"{joined}{enzyme_suffix}"