Source code for rxn.chemutils.smiles_randomization

import random

from rdkit import Chem

from .conversion import mol_to_smiles, smiles_to_mol

# Highest value to give as a random seed for RDKit (2**31 - 1, i.e. the
# largest signed 32-bit integer). Any value higher than that will cause
# problems.
_MAX_RDKIT_RANDOM_SEED = 2**31 - 1


def randomize_smiles_rotated(smiles: str, with_order_reversal: bool = True) -> str:
    """
    Randomize a SMILES string by cyclically rotating the atom indices.

    A random rotation offset is drawn, the atom indices are rotated by that
    offset and, optionally, the resulting order is reversed. The molecule is
    then renumbered accordingly and written out as a non-canonical SMILES.

    Adapted from
    https://github.com/GLambard/SMILES-X/blob/758478663030580a363a9ee61c11f6d6448e18a1/SMILESX/augm.py#L19.

    The outputs of this function can be reproduced by setting the seed with
    random.seed().

    Raises:
        InvalidSmiles: for invalid molecules.

    Args:
        smiles: SMILES string to randomize.
        with_order_reversal: whether to reverse the atom order with 50% chance.

    Returns:
        Randomized SMILES string.
    """
    molecule = smiles_to_mol(smiles, sanitize=False)
    atom_count = molecule.GetNumAtoms()

    # Draw the random values first and in a fixed order (offset, then the
    # reversal coin flip) so that a given random.seed() reproduces the output.
    # The coin is only tossed when reversal is enabled.
    offset = random.randint(0, atom_count - 1)
    flip = with_order_reversal and random.choice([True, False])

    # Build the rotated index order: tail starting at the offset, then the head.
    indices = list(range(atom_count))
    split_at = offset % len(indices)
    rotated = indices[split_at:] + indices[:split_at]
    if flip:
        rotated = rotated[::-1]

    renumbered = Chem.RenumberAtoms(molecule, rotated)
    return mol_to_smiles(renumbered, canonical=False)
def randomize_smiles_restricted(smiles: str) -> str:
    """
    Randomize a SMILES string in a restricted fashion.

    The atom indices are shuffled at random, the molecule is renumbered with
    that new order, and the result is written as a non-canonical SMILES.

    The outputs of this function can be reproduced by setting the seed with
    random.seed().

    Raises:
        InvalidSmiles: for invalid molecules.

    Args:
        smiles: SMILES string to randomize.

    Returns:
        Randomized SMILES string.
    """
    molecule = smiles_to_mol(smiles, sanitize=False)

    # In-place shuffle driven by the global `random` state, which is what
    # makes the result reproducible through random.seed().
    permutation = list(range(molecule.GetNumAtoms()))
    random.shuffle(permutation)

    shuffled_molecule = Chem.RenumberAtoms(molecule, newOrder=permutation)
    return mol_to_smiles(shuffled_molecule, canonical=False)
def randomize_smiles_unrestricted(smiles: str) -> str:
    """
    Randomize a SMILES string in an unrestricted fashion.

    The input is split on "." into fragments; each fragment is randomized
    separately (all with the same RDKit seed), and the randomized fragments
    are then shuffled and joined back with ".".

    The outputs of this function can be reproduced by setting the seed with
    random.seed().

    Raises:
        InvalidSmiles: for invalid molecules.

    Args:
        smiles: SMILES string to randomize.

    Returns:
        Randomized SMILES string.
    """
    # The seed handed to RDKit is itself sampled from Python's `random`, so
    # calling random.seed() beforehand makes the whole call reproducible.
    rdkit_seed = random.randint(1, _MAX_RDKIT_RANDOM_SEED)

    # Unlike the other randomizations, the unrestricted one does not work on
    # compounds with multiple fragments, so every fragment is parsed (and
    # later randomized) on its own.
    fragment_mols = [
        smiles_to_mol(fragment, sanitize=False) for fragment in smiles.split(".")
    ]

    # Note: for reproducibility we do not rely on
    # Chem.MolToSmiles(mol, canonical=False, doRandom=True).
    # See https://www.rdkit.org/docs/Cookbook.html#enumerate-smiles
    randomized_fragments = [
        Chem.MolToRandomSmilesVect(fragment_mol, 1, rdkit_seed)[0]
        for fragment_mol in fragment_mols
    ]

    # Randomize the fragment order as well before stitching them together.
    random.shuffle(randomized_fragments)
    return ".".join(randomized_fragments)