import re
from typing import List, Tuple
from .conversion import split_smiles_and_fragment_info
from .reaction_equation import ReactionEquation, cleanup_compounds
from .utils import remove_atom_mapping
# Matches the fragment section inside the extended info of a reaction SMILES:
# the literal "f:" followed by a run of digits, dots and commas (e.g. "f:0.2,5.6").
EXTENDED_FRAGMENT_REGEX = re.compile(r"f:[\d\.,]+")
# Matches one fragment group inside the fragment section: integers joined by
# dots (e.g. "0.2"). The commas separating the groups are not part of a match.
FRAGMENT_GROUP_REGEX = re.compile(r"(\d+(?:\.\d+)*)")
class UnsupportedExtendedReactionSmiles(ValueError):
    """Error for an extended reaction SMILES whose syntax is not supported by RDKit."""

    def __init__(self, reaction_smiles: str):
        """
        Args:
            reaction_smiles: the reaction SMILES with the unsupported syntax;
                it is quoted in the error message.
        """
        # Hand the message to ValueError so that str(exc) and tracebacks show it.
        super().__init__(
            f'The syntax of "{reaction_smiles}" is not supported by RDKit.'
        )
def parse_extended_reaction_smiles(
    extended_reaction_smiles: str, remove_atom_maps: bool = True
) -> ReactionEquation:
    """
    Convert an extended reaction SMILES (with potential fragment information)
    to a ReactionEquation instance.

    Args:
        extended_reaction_smiles: extended reaction SMILES
        remove_atom_maps: whether to remove the atom mapping information.

    Returns:
        ReactionEquation instance
    """
    return _Importer.convert(
        extended_reaction_smiles, remove_atom_maps=remove_atom_maps
    )
def to_extended_reaction_smiles(reaction: ReactionEquation) -> str:
    """
    Convert a ReactionEquation instance to an extended reaction SMILES (with
    potential fragment information).

    Args:
        reaction: reaction equation to convert

    Returns:
        The extended reaction SMILES string.
    """
    return _Exporter.convert(reaction)
class _Importer:
    """
    Convert extended reaction SMILES to ReactionEquation instances.
    """

    @staticmethod
    def convert(
        extended_reaction_smiles: str, remove_atom_maps: bool
    ) -> ReactionEquation:
        """
        Convert an extended SMILES to a reaction equation.

        Used to rely more on RDKit; now does as few RDKit operations as necessary.

        Args:
            extended_reaction_smiles: extended reaction SMILES
            remove_atom_maps: whether to remove the atom mapping information.

        Returns:
            The reaction equation built from the SMILES groups, after merging
            the fragments that the fragment information declares as belonging
            together.
        """
        pure_smiles, fragment_info = split_smiles_and_fragment_info(
            extended_reaction_smiles
        )

        if remove_atom_maps:
            pure_smiles = remove_atom_mapping(pure_smiles)

        # We split at the ">" characters, only if they are not preceded by a "-",
        # which would indicate a dative bond.
        reactant_groups = re.split(r"(?<!-)>", pure_smiles)
        # Split every group into its molecules; an empty group (e.g. no agents
        # in "A>>B") splits to [""], hence the filtering of empty strings.
        mols_groups = [
            [mol for mol in group.split(".") if mol] for group in reactant_groups
        ]

        fragment_groups = determine_fragment_groups(fragment_info)
        groups = _Importer.group_fragments(mols_groups, fragment_groups)

        reaction_equation = ReactionEquation(*groups)

        # If the atom maps were removed, cleaning up the compounds will
        # return a much nicer string.
        if remove_atom_maps:
            reaction_equation = cleanup_compounds(reaction_equation)

        return reaction_equation

    @staticmethod
    def group_fragments(
        raw_smiles_groups: List[List[str]], fragment_groups: List[List[int]]
    ) -> List[List[str]]:
        """
        Merge the reaction fragments belonging together.

        Args:
            raw_smiles_groups: one list of fragment SMILES per reaction group,
                in the order in which they appear in the reaction SMILES.
            fragment_groups: groups of fragment indices belonging together; the
                indices count the fragments across all the groups.

        Returns:
            One list of SMILES per input group, with the grouped fragments merged.
        """
        merged_groups = []
        # Index, in the full reaction SMILES, of the first fragment of the
        # group currently being processed.
        offset = 0
        for raw_smiles_group in raw_smiles_groups:
            merged_smiles_group = merge_molecules_from_fragment_groups(
                raw_smiles_group, fragment_groups, offset
            )
            merged_groups.append(merged_smiles_group)
            offset += len(raw_smiles_group)

        return merged_groups
class _Exporter:
Convert ReactionEquation to extended reaction SMILES with fragment information.
def convert(reaction: ReactionEquation) -> str:
offset = 0
reactants, reactant_groups = _Exporter.fragment_group(
reaction.reactants, offset
offset += len(reactants)
agents, agent_groups = _Exporter.fragment_group(reaction.agents, offset)
offset += len(agents)
products, product_groups = _Exporter.fragment_group(reaction.products, offset)
groups = reactant_groups + agent_groups + product_groups
smiles_groups = (
".".join(smiles for smiles in group)
for group in (reactants, agents, products)
smiles_without_fragment_info = ">".join(smiles_groups)
if not groups:
return smiles_without_fragment_info
fragment_info = _Exporter.generate_fragment_info(groups)
return f"{smiles_without_fragment_info} {fragment_info}"
def fragment_group(
compounds: List[str], offset: int
) -> Tuple[List[str], List[List[int]]]:
Converts a group of molecules, some of which possibly are composed of several fragments,
to the list of SMILES to put in the final reaction SMILES, and a list of groups of molecules
belonging together.
['O', '[Na+].[OH-]'] -> (['O', '[Na+]', '[OH-]'], [[1, 2]])
compounds: SMILES string for molecules of the same group (f.i. reactants)
offset: index, in the final reaction SMILES, of the first molecule of that group
smiles_list: List[str] = []
groups: List[List[int]] = []
current_index = offset
for c in compounds:
molecules = c.split(".")
number_fragments = len(molecules)
if number_fragments > 1:
list(range(current_index, current_index + number_fragments))
current_index += number_fragments
return smiles_list, groups
def generate_fragment_info(groups: List[List[int]]) -> str:
if not groups:
return ""
group_strings = [".".join(str(number) for number in g) for g in groups]
all_groups = ",".join(group_strings)
return f"|f:{all_groups}|"
def determine_fragment_groups(extended_reaction_info: str) -> List[List[int]]:
    """
    From the fragment info string (such as ``|f:0.2,5.6|``), determine the groups
    of indices that must be grouped.

    Args:
        extended_reaction_info: Extended reaction info, potentially containing
            information about fragments, stereochemistry, etc.

    Returns:
        List of groups (f.i. [[0,2], [5,6]]); empty if the info contains no
        fragment part.
    """
    # Extract the part related to fragments, starting with "f:"
    fragment_subpart_match = EXTENDED_FRAGMENT_REGEX.search(extended_reaction_info)
    if fragment_subpart_match is None:
        return []
    fragment_info = fragment_subpart_match.group()

    # Extract the groups of associated numbers as strings
    fragment_group_matches = FRAGMENT_GROUP_REGEX.findall(fragment_info)

    # Convert to lists of integers
    return [[int(i) for i in match.split(".")] for match in fragment_group_matches]
def merge_molecules_from_fragment_groups(
    smiles_list: List[str], fragment_groups: List[List[int]], offset: int
) -> List[str]:
    """
    Combine molecules according to the fragment definition.

    Args:
        smiles_list: SMILES strings to potentially merge
        fragment_groups: indices for groups of molecules belonging together; some may be out of range
        offset: what index the first molecule in smiles_list corresponds to

    Raises:
        ValueError: if a fragment group is only partially covered by smiles_list.

    Returns:
        List of molecules, some of which were potentially merged. Non-merged molecules come first.
    """
    allowed_indices = set(range(len(smiles_list)))
    merged_indices = set()
    merged_molecules = []

    for group in fragment_groups:
        # Indices in fragment_groups refer to the full reaction SMILES; shift
        # them so that they index into smiles_list.
        relative_indices = [i - offset for i in group]
        in_range = [e in allowed_indices for e in relative_indices]

        # either all indices are in the range, or none is
        all_in_range = all(in_range)
        none_in_range = not any(in_range)
        if not (all_in_range or none_in_range):
            raise ValueError(
                f"Fragment group {group} is only partially covered by the "
                f"{len(smiles_list)} molecule(s) starting at index {offset}."
            )

        if all_in_range:
            merged_indices.update(relative_indices)
            merged_molecules.append(
                ".".join(smiles_list[i] for i in relative_indices)
            )

    # Molecules not consumed by any group keep their original relative order.
    remaining_molecule_indices = sorted(allowed_indices - merged_indices)
    unmerged_molecules = [smiles_list[i] for i in remaining_molecule_indices]

    return unmerged_molecules + merged_molecules