Source code for rxn.reaction_preprocessing.smiles_tokenizer

# LICENSED INTERNAL CODE. PROPERTY OF IBM.
# IBM Research Zurich Licensed Internal Code
# (C) Copyright IBM Corp. 2022
# ALL RIGHTS RESERVED
from rxn.chemutils.tokenization import tokenize_smiles
from rxn.utilities.csv import iterate_csv_column

from rxn.reaction_preprocessing.config import TokenizeConfig


[docs]class SmilesTokenizer: # # Public Methods #
[docs] def tokenize(self, smiles: str) -> str: """ Tokenize a SMILES molecule or reaction, and join the tokens with spaces. Args: smiles: SMILES string to tokenize, for instance 'CC(CO)=N>>CC(C=O)N'. Returns: SMILES string after tokenization, for instance 'C C ( C O ) = N >> C C ( C = O ) N'. """ return tokenize_smiles(smiles)
[docs]def tokenize(cfg: TokenizeConfig) -> None: # Tokenize the reactions tokenizer = SmilesTokenizer() tokenize_fn = tokenizer.tokenize for pair in cfg.input_output_pairs: precursors = f"{pair.out}.precursors_tokens" products = f"{pair.out}.products_tokens" with open(precursors, "wt") as f_precursors, open(products, "wt") as f_products: for rxn in iterate_csv_column(pair.inp, pair.reaction_column_name): precursors_smiles, products_smiles = rxn.split(">>") f_precursors.write(f"{tokenize_fn(precursors_smiles)}\n") f_products.write(f"{tokenize_fn(products_smiles)}\n")