"""Source code for rxn.chemutils.extended_reaction_smiles."""

import re
from typing import List, Tuple

from .conversion import split_smiles_and_fragment_info
from .reaction_equation import ReactionEquation, cleanup_compounds
from .utils import remove_atom_mapping

# Regex pattern to extract the fragment info (the "f:" field, such as "f:0.2,5.6")
# from the extended info of a reaction SMILES
EXTENDED_FRAGMENT_REGEX = re.compile(r"f:[\d\.,]+")

# Regex pattern to extract the fragment groups from the fragment info: each match
# is a run of dot-separated integers (such as "0.2")
FRAGMENT_GROUP_REGEX = re.compile(r"(\d+(?:\.\d+)*)")


class UnsupportedExtendedReactionSmiles(ValueError):
    """
    Exception for extended reaction SMILES with a syntax not supported by RDKit.

    The offending reaction SMILES string is embedded in the error message.
    """

    def __init__(self, reaction_smiles: str):
        """
        Args:
            reaction_smiles: the reaction SMILES string that is not supported.
        """
        super().__init__(
            f'The syntax of "{reaction_smiles}" is not supported by RDKit.'
        )


def parse_extended_reaction_smiles(
    extended_reaction_smiles: str, remove_atom_maps: bool = True
) -> ReactionEquation:
    """
    Convert an extended reaction SMILES (with potential fragment information)
    to a ReactionEquation instance.

    This is a thin public wrapper; the actual work is delegated to
    ``_Importer.convert``.

    Args:
        extended_reaction_smiles: extended reaction SMILES
        remove_atom_maps: whether to remove the atom mapping information.

    Returns:
        ReactionEquation instance
    """
    return _Importer.convert(
        extended_reaction_smiles, remove_atom_maps=remove_atom_maps
    )


def to_extended_reaction_smiles(reaction: ReactionEquation) -> str:
    """
    Convert a ReactionEquation instance to an extended reaction SMILES (with
    potential fragment information).

    This is a thin public wrapper; the actual work is delegated to
    ``_Exporter.convert``.

    Args:
        reaction: reaction equation to convert

    Returns:
        The extended reaction SMILES string.
    """
    return _Exporter.convert(reaction)


class _Importer:
    """
    Build ReactionEquation instances from extended reaction SMILES strings.
    """

    @staticmethod
    def convert(
        extended_reaction_smiles: str, remove_atom_maps: bool
    ) -> ReactionEquation:
        """
        Turn an extended reaction SMILES into a reaction equation.

        The string is processed mostly with plain string operations: the
        fragment info is split off, the remaining SMILES is cut into its
        ">"-separated groups, and the fragments belonging together are merged.

        Args:
            extended_reaction_smiles: extended reaction SMILES
            remove_atom_maps: whether to remove the atom mapping information.

        Returns:
            The reaction equation; its compounds are cleaned up only when the
            atom maps were removed.
        """
        pure_smiles, fragment_info = split_smiles_and_fragment_info(
            extended_reaction_smiles
        )
        if remove_atom_maps:
            pure_smiles = remove_atom_mapping(pure_smiles)

        # A ">" that directly follows a "-" belongs to a dative bond ("->") and
        # must not be treated as a separator, hence the negative lookbehind.
        mols_groups = []
        for group_smiles in re.split(r"(?<!-)>", pure_smiles):
            # Drop the empty strings produced by empty groups
            mols_groups.append([mol for mol in group_smiles.split(".") if mol])

        groups = _Importer.group_fragments(
            mols_groups, determine_fragment_groups(fragment_info)
        )
        reaction_equation = ReactionEquation(*groups)

        if not remove_atom_maps:
            return reaction_equation

        # Without atom maps, cleaning up the compounds gives a much nicer string.
        return cleanup_compounds(reaction_equation)

    @staticmethod
    def group_fragments(
        raw_smiles_groups: List[List[str]], fragment_groups: List[List[int]]
    ) -> List[List[str]]:
        """
        Merge, group by group, the molecules that the fragment groups tie together.

        Args:
            raw_smiles_groups: the SMILES strings of each group, in the order in
                which they appear in the reaction SMILES
            fragment_groups: indices (counted over the whole reaction SMILES)
                of the molecules belonging together

        Returns:
            One list of (potentially merged) SMILES strings per input group.
        """
        merged_groups = []

        # Index, in the whole reaction SMILES, of the first molecule of the group
        first_index = 0
        for smiles_group in raw_smiles_groups:
            merged_groups.append(
                merge_molecules_from_fragment_groups(
                    smiles_group, fragment_groups, first_index
                )
            )
            first_index += len(smiles_group)

        return merged_groups


class _Exporter:
    """
    Write ReactionEquation instances as extended reaction SMILES strings,
    including the fragment information when it is needed.
    """

    @staticmethod
    def convert(reaction: ReactionEquation) -> str:
        """
        Convert a reaction equation to an extended reaction SMILES.

        Args:
            reaction: reaction equation to convert

        Returns:
            The reaction SMILES, followed by a space and the fragment info if
            at least one compound consists of several fragments.
        """
        # The offset given for each group is the number of molecules written before it
        reactants, reactant_groups = _Exporter.fragment_group(reaction.reactants, 0)
        agents, agent_groups = _Exporter.fragment_group(
            reaction.agents, len(reactants)
        )
        products, product_groups = _Exporter.fragment_group(
            reaction.products, len(reactants) + len(agents)
        )

        reaction_smiles = ">".join(
            [".".join(part) for part in (reactants, agents, products)]
        )

        groups = reactant_groups + agent_groups + product_groups
        if groups:
            return f"{reaction_smiles} {_Exporter.generate_fragment_info(groups)}"
        return reaction_smiles

    @staticmethod
    def fragment_group(
        compounds: List[str], offset: int
    ) -> Tuple[List[str], List[List[int]]]:
        """
        Split the compounds of one group into their individual fragments, and
        record which fragment indices belong to the same compound.

        Example:
            ['O', '[Na+].[OH-]'] -> (['O', '[Na+]', '[OH-]'], [[1, 2]])

        Args:
            compounds: SMILES string for molecules of the same group (f.i. reactants)
            offset: index, in the final reaction SMILES, of the first molecule of that group

        Returns:
            Tuple with the list of SMILES to put in the final reaction SMILES,
            and the list of index groups for the multi-fragment compounds.
        """
        smiles_list: List[str] = []
        groups: List[List[int]] = []

        for compound in compounds:
            fragments = compound.split(".")

            # Indices are counted from the offset, over all fragments seen so far
            start = offset + len(smiles_list)
            smiles_list.extend(fragments)
            end = offset + len(smiles_list)

            # Only compounds made of several fragments need a group
            if len(fragments) > 1:
                groups.append(list(range(start, end)))

        return smiles_list, groups

    @staticmethod
    def generate_fragment_info(groups: List[List[int]]) -> str:
        """
        Format the fragment groups as a fragment info string.

        Args:
            groups: groups of indices belonging together (f.i. [[0, 2], [5, 6]])

        Returns:
            The fragment info (f.i. ``|f:0.2,5.6|``), or an empty string if
            there are no groups.
        """
        if not groups:
            return ""

        joined_groups = ",".join(".".join(map(str, group)) for group in groups)
        return f"|f:{joined_groups}|"


def determine_fragment_groups(extended_reaction_info: str) -> List[List[int]]:
    """
    From the fragment info string (such as ``|f:0.2,5.6|``), determine the groups
    of indices that must be grouped.

    Args:
        extended_reaction_info: Extended reaction info, potentially containing
            information about fragments, stereochemistry, etc. See documentation on
            https://docs.chemaxon.com/display/docs/chemaxon-extended-smiles-and-smarts-cxsmiles-and-cxsmarts.md

    Returns:
        List of groups (f.i. [[0,2], [5,6]]); empty if the info contains no
        fragment part.
    """

    # Extract the part related to fragments, starting with "f:"
    fragment_subpart_match = EXTENDED_FRAGMENT_REGEX.search(extended_reaction_info)
    if fragment_subpart_match is None:
        return []
    fragment_info = fragment_subpart_match.group(0)

    # Each match is a string of dot-separated indices (such as "0.2"), which
    # is converted to a list of integers
    return [
        [int(index) for index in group.split(".")]
        for group in FRAGMENT_GROUP_REGEX.findall(fragment_info)
    ]


def merge_molecules_from_fragment_groups(
    smiles_list: List[str], fragment_groups: List[List[int]], offset: int
) -> List[str]:
    """
    Combine molecules according to the fragment definition.

    Args:
        smiles_list: SMILES strings to potentially merge
        fragment_groups: indices for groups of molecules belonging together; some may be out of range
        offset: what index the first molecule in smiles_list corresponds to

    Raises:
        ValueError: if a fragment group has some of its indices inside the
            range covered by smiles_list, and some outside of it.

    Returns:
        List of molecules, some of which were potentially merged. Non-merged molecules come first.
    """

    number_molecules = len(smiles_list)
    merged_indices = set()
    merged_molecules = []

    for group in fragment_groups:
        # An empty group carries no information; merging it would add an
        # empty-string molecule to the result.
        if not group:
            continue

        relative_indices = [i - offset for i in group]
        in_range = [0 <= i < number_molecules for i in relative_indices]

        # Groups entirely out of range belong to another part of the reaction
        if not any(in_range):
            continue

        # either all indices are in the range, or none is
        if not all(in_range):
            raise ValueError(
                f"Fragment group {group} mixes indices inside and outside of "
                f"the range [{offset}, {offset + number_molecules})."
            )

        merged_molecules.append(".".join(smiles_list[i] for i in relative_indices))
        merged_indices.update(relative_indices)

    # The molecules not part of any group keep their original order
    unmerged_molecules = [
        smiles for i, smiles in enumerate(smiles_list) if i not in merged_indices
    ]

    return unmerged_molecules + merged_molecules