Source code for rxn.chemutils.tokenization

import logging
import re
import shutil
from typing import List, Optional

from rxn.utilities.files import (
    PathLike,
    dump_list_to_file,
    iterate_lines_from_file,
    raise_if_paths_are_identical,
)

from .exceptions import UnclearWhetherTokenized

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


SMILES_TOKENIZER_PATTERN = r"(\%\([0-9]{3}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\||\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
SMILES_REGEX = re.compile(SMILES_TOKENIZER_PATTERN)
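# The pattern above splits a SMILES string into chemically meaningful tokens:
# bracket atoms, two-letter halogens (Cl, Br), ring-closure labels, bonds, etc.
# Illustrative examples (added for clarity, not part of the original module):
#
#   >>> SMILES_REGEX.findall("ClCCBr")
#   ['Cl', 'C', 'C', 'Br']
#   >>> SMILES_REGEX.findall("[NH4+].[Cl-]")
#   ['[NH4+]', '.', '[Cl-]']
#   >>> SMILES_REGEX.findall("C%12CCC%12")
#   ['C', '%12', 'C', 'C', 'C', '%12']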


class TokenizationError(ValueError):
    """Exception raised when a SMILES string cannot be tokenized."""

    def __init__(self, title: str, detail: str):
        """
        Initialize TokenizationError.

        Args:
            title: title of the error.
            detail: description of the error.
        """
        self.type = "TokenizationError"
        self.title = title
        self.detail = detail

def to_tokens(smiles: str) -> List[str]:
    """
    Tokenize a SMILES molecule or reaction into a list of tokens.

    Args:
        smiles: SMILES string to tokenize.

    Raises:
        TokenizationError: in case of mismatch between the SMILES and the joined tokens.

    Returns:
        List of tokens (joining them gives back the original SMILES string).
    """
    tokens = [token for token in SMILES_REGEX.findall(smiles)]
    if smiles != "".join(tokens):
        raise TokenizationError(
            "SmilesJoinedTokensMismatch",
            f'SMILES="{smiles}" != joined_tokens="{"".join(tokens)}"',
        )
    return tokens

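# Illustrative usage of to_tokens (examples added for clarity, not part of the
# original module); the expected values follow from the regex defined above:
#
#   >>> to_tokens("CC(CO)=N>>CC(C=O)N")
#   ['C', 'C', '(', 'C', 'O', ')', '=', 'N', '>>', 'C', 'C', '(', 'C', '=', 'O', ')', 'N']
#
# Characters that the regex cannot match are simply skipped by findall(), so the
# joined-tokens comparison above is what turns such input into an error:
#
#   >>> to_tokens("CA")  # "A" is not a valid token -> raises TokenizationError
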
def tokenize_smiles(smiles: str, fallback_value: Optional[str] = None) -> str:
    """
    Tokenize a SMILES molecule or reaction, and join the tokens with spaces.

    Args:
        smiles: SMILES string to tokenize, for instance 'CC(CO)=N>>CC(C=O)N'.
        fallback_value: what value to return when the tokenization is unsuccessful.
            Default: no fallback, will propagate the TokenizationError exception.

    Returns:
        SMILES string after tokenization, for instance 'C C ( C O ) = N >> C C ( C = O ) N'.
    """

    try:
        tokens = to_tokens(smiles)
        return " ".join(tokens)
    except TokenizationError:
        if fallback_value is not None:
            logger.debug(f'Error when tokenizing "{smiles}"')
            return fallback_value
        raise

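# Illustrative usage of tokenize_smiles (not part of the original module):
#
#   >>> tokenize_smiles("CC(CO)=N>>CC(C=O)N")
#   'C C ( C O ) = N >> C C ( C = O ) N'
#   >>> tokenize_smiles("CA", fallback_value="")  # invalid input, returns the fallback
#   ''
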
def detokenize_smiles(tokenized_smiles: str) -> str:
    """
    Detokenize a tokenized SMILES string (i.e. remove the spaces between the tokens).

    Args:
        tokenized_smiles: tokenized SMILES, for instance 'C C ( C O ) = N >> C C ( C = O ) N'

    Returns:
        SMILES after detokenization, for instance 'CC(CO)=N>>CC(C=O)N'
    """
    return tokenized_smiles.replace(" ", "")

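# Detokenization simply strips all spaces, so it round-trips with tokenize_smiles
# (illustrative example, not part of the original module):
#
#   >>> detokenize_smiles(tokenize_smiles("CC(CO)=N>>CC(C=O)N"))
#   'CC(CO)=N>>CC(C=O)N'
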
def string_is_tokenized(smiles_line: str) -> bool:
    """
    Whether a string is a tokenized SMILES or not.

    Args:
        smiles_line: string to inspect.

    Raises:
        UnclearWhetherTokenized: if it is not possible to determine whether the
            string is tokenized or not (fewer than two tokens).
        TokenizationError: propagated directly from to_tokens().

    Returns:
        Whether the string is a tokenized SMILES.
    """
    detokenized = detokenize_smiles(smiles_line)
    tokens = to_tokens(detokenized)
    if len(tokens) < 2:
        raise UnclearWhetherTokenized(smiles_line)
    return " ".join(tokens) == smiles_line

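# Illustrative behavior of string_is_tokenized (not part of the original module):
#
#   >>> string_is_tokenized("C C ( C ) O")
#   True
#   >>> string_is_tokenized("CC(C)O")
#   False
#   >>> string_is_tokenized("C")  # a single token -> raises UnclearWhetherTokenized
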
def tokenize_file(
    input_file: PathLike, output_file: PathLike, fallback_value: str = ""
) -> None:
    """
    Tokenize a file containing SMILES strings.

    Args:
        input_file: file to tokenize.
        output_file: where to save the tokenized file.
        fallback_value: placeholder for strings that cannot be tokenized.
    """
    raise_if_paths_are_identical(input_file, output_file)
    logger.info(f'Tokenizing "{input_file}" -> "{output_file}".')

    tokenized = (
        tokenize_smiles(line, fallback_value)
        for line in iterate_lines_from_file(input_file)
    )
    dump_list_to_file(tokenized, output_file)

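# Hypothetical usage (the file names below are placeholders, not part of the
# original module): each line of the input file is tokenized independently, and
# lines that cannot be tokenized are replaced by the fallback value.
#
#   tokenize_file("reactions.txt", "reactions.tokenized.txt", fallback_value="")
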
def detokenize_file(
    input_file: PathLike,
    output_file: PathLike,
) -> None:
    """
    Detokenize a file containing tokenized SMILES strings.

    Args:
        input_file: file to detokenize.
        output_file: where to save the detokenized file.
    """
    raise_if_paths_are_identical(input_file, output_file)
    logger.info(f'Detokenizing "{input_file}" -> "{output_file}".')
    detokenized = (
        detokenize_smiles(line) for line in iterate_lines_from_file(input_file)
    )
    dump_list_to_file(detokenized, output_file)

def ensure_tokenized_file(
    file: PathLike, postfix: str = ".tokenized", fallback_value: str = ""
) -> str:
    """
    Ensure that a file is tokenized: do nothing if the file is already tokenized,
    create a tokenized copy otherwise.

    Args:
        file: path to the file that we want to ensure is tokenized.
        postfix: postfix to add to the tokenized copy (if applicable).
        fallback_value: placeholder for strings that cannot be tokenized (if applicable).

    Returns:
        The path to the tokenized file (original path, or path to new file).
    """
    if file_is_tokenized(file):
        return str(file)

    tokenized_copy = str(file) + postfix
    tokenize_file(file, tokenized_copy, fallback_value=fallback_value)
    return tokenized_copy

def file_is_tokenized(filepath: PathLike) -> bool:
    """
    Whether a file contains tokenized SMILES or not.

    This inspects only the first line for which it can be determined
    unambiguously whether it is tokenized or not.

    Raises:
        TokenizationError: propagated from string_is_tokenized().
        RuntimeError: if no line allows for a decision (empty file, or empty /
            single-token lines only).

    Args:
        filepath: path to the file.
    """
    # Iterative formulation in case the first line(s) of the file don't make it
    # clear whether tokenized or not.
    for line in iterate_lines_from_file(filepath):
        try:
            return string_is_tokenized(line)
        except UnclearWhetherTokenized:
            continue
    raise RuntimeError(
        f'Could not determine whether "{filepath}" is tokenized: empty lines only.'
    )

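# Hypothetical usage (file names are placeholders, not part of the original
# module): ensure_tokenized_file relies on file_is_tokenized to decide whether a
# tokenized copy must be produced.
#
#   path = ensure_tokenized_file("reactions.txt")
#   # -> "reactions.txt" if already tokenized, "reactions.txt.tokenized" otherwise
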
def copy_as_detokenized(src: PathLike, dest: PathLike) -> None:
    """
    Copy a source file to a destination, while making sure that it is not tokenized.
    """
    if file_is_tokenized(src):
        logger.info(f'Copying and detokenizing "{src}" -> "{dest}".')
        detokenize_file(src, dest)
    else:
        logger.info(f'Copying "{src}" -> "{dest}".')
        shutil.copy(src, dest)

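# Hypothetical usage (paths are placeholders, not part of the original module):
# copy a data file so that the destination is guaranteed to be detokenized,
# whatever the format of the source.
#
#   copy_as_detokenized("precursors.tokenized.txt", "precursors.txt")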