# Source code for rxn.reaction_preprocessing.main

#!/usr/bin/env python
# LICENSED INTERNAL CODE. PROPERTY OF IBM.
# IBM Research Zurich Licensed Internal Code
# (C) Copyright IBM Corp. 2021
# ALL RIGHTS RESERVED
import logging
from pathlib import Path

import hydra
from omegaconf import OmegaConf

from rxn.reaction_preprocessing import __version__
from rxn.reaction_preprocessing.augmenter import augment
from rxn.reaction_preprocessing.config import Config, Step
from rxn.reaction_preprocessing.importer import rxn_import
from rxn.reaction_preprocessing.preprocessor import preprocess
from rxn.reaction_preprocessing.smiles_tokenizer import tokenize
from rxn.reaction_preprocessing.stable_data_splitter import split
from rxn.reaction_preprocessing.standardizer import standardize
from rxn.reaction_preprocessing.utils import (
    add_custom_logger_to_file,
    overwrite_logging_format,
)

# Module-level logger, named after this module.
# A NullHandler is attached so that this module does not itself configure any
# log output; handlers are added elsewhere (e.g. by the entry-point script).
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def preprocess_data(cfg: Config) -> None:
    """Preprocess data to generate a dataset for training transformer models.

    The configuration is first checked against the ``Config`` schema, then
    logged and saved as ``preprocessing_config.yml`` in the processing
    directory (``cfg.data.proc_dir``). Afterwards, the steps listed in
    ``cfg.common.sequence`` are executed in order, each one receiving its own
    section of the configuration.

    Args:
        cfg: Configuration of the preprocessing pipeline.
    """
    # Enforce config schema. Will also convert strings to Enums when necessary.
    cfg = Config.from_generic_config(cfg)

    logger.info(
        "Preprocessing reaction data with rxn-reaction-preprocessing, version %s.",
        __version__,
    )

    # Render the configuration only once, so that what is logged and what is
    # saved to disk are guaranteed to be the same text.
    config_yaml = OmegaConf.to_yaml(cfg, resolve=True)
    logger.info("Running with the following configuration:\n%s", config_yaml)

    # Create the output directory (may exist already if this function was
    # called from the main script).
    processing_dir = Path(cfg.data.proc_dir)
    processing_dir.mkdir(parents=True, exist_ok=True)

    # Save the config. The encoding is explicit so that the file content does
    # not depend on the locale of the machine running the preprocessing.
    config_path = processing_dir / "preprocessing_config.yml"
    with open(config_path, "wt", encoding="utf-8") as f:
        f.write(config_yaml)

    for step in cfg.common.sequence:
        logger.info(f"Running step: {step.name}")

        if step is Step.IMPORT:
            rxn_import(cfg.rxn_import)
        elif step is Step.STANDARDIZE:
            standardize(cfg.standardize)
        elif step is Step.PREPROCESS:
            preprocess(cfg.preprocess)
        elif step is Step.AUGMENT:
            augment(cfg.augment)
        elif step is Step.SPLIT:
            split(cfg.split)
        elif step is Step.TOKENIZE:
            tokenize(cfg.tokenize)
        else:
            # Not an error (keeps the previous "do nothing" behavior), but
            # make it visible instead of skipping the step silently.
            logger.warning(
                "Step %s is not handled by preprocess_data and was skipped.",
                step.name,
            )
@hydra.main(config_name="base_config", config_path=None)
def data_pipeline(cfg: Config) -> None:
    """Hydra entry point: set up logging to file, then run the preprocessing.

    Args:
        cfg: Configuration handed over by hydra (config name "base_config").
    """
    # Check the config against the schema; strings become Enums where needed.
    cfg = Config.from_generic_config(cfg)

    # The log file is placed in the processing directory, so create that
    # directory before attaching the file logger.
    output_dir = Path(cfg.data.proc_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    log_file = output_dir / "log.txt"
    add_custom_logger_to_file(log_file)

    # Replace the log format once the file logger is in place.
    overwrite_logging_format()

    preprocess_data(cfg)
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    data_pipeline()