Source code for rxn.reaction_preprocessing.scripts.standardize_pistachio_records

#!/usr/bin/env python
# LICENSED INTERNAL CODE. PROPERTY OF IBM.
# IBM Research Zurich Licensed Internal Code
# (C) Copyright IBM Corp. 2021
# ALL RIGHTS RESERVED
import json
import logging
from pathlib import Path

import hydra
from rxn.utilities.files import iterate_lines_from_file

from rxn.reaction_preprocessing.config import Config, Step
from rxn.reaction_preprocessing.pistachio_record_standardizer import (
    PistachioRecordStandardizer,
)

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


@hydra.main(config_name="base_config", config_path=None)
def main(cfg: Config) -> None:
    """Standardize reactions in a Pistachio JSON.

    This will do steps similar to the STANDARDIZE and PREPROCESS steps from the
    normal (CSV-based) reaction preprocessing. Relies on the same config format
    as the main pipeline, although not all the values from there are needed.

    The input is read line by line from ``cfg.data.path``, each line being
    parsed as one JSON record. Every record that the standardizer accepts is
    written as one JSON document per line to ``processed.jsonl`` inside
    ``cfg.data.proc_dir``. Records for which the standardizer raises are
    logged and left out of the output. A line that is not valid JSON is not
    skipped: the error raised by ``json.loads`` propagates.

    Args:
        cfg: configuration object handed over by hydra; it is validated and
            converted with ``Config.from_generic_config`` before use.

    Raises:
        SystemExit: if ``cfg.common.sequence`` is not exactly
            ``[Step.STANDARDIZE, Step.PREPROCESS]``.
    """
    # Enforce config schema. Will also convert strings to Enums when necessary.
    cfg = Config.from_generic_config(cfg)

    # Make sure that the required output directories exist.
    Path(cfg.data.proc_dir).mkdir(parents=True, exist_ok=True)

    if cfg.common.sequence != [Step.STANDARDIZE, Step.PREPROCESS]:
        raise SystemExit(
            "Standardization of Pistachio: steps must be STANDARDIZE and PREPROCESS exactly"
        )

    pistachio_standardizer = PistachioRecordStandardizer(
        cfg_standardize=cfg.standardize, cfg_preprocess=cfg.preprocess
    )

    jsonl_file = cfg.data.path
    output_jsonl = Path(cfg.data.proc_dir) / "processed.jsonl"

    n_written = 0
    n_ignored = 0

    # json.dumps escapes non-ASCII characters by default, so pinning the
    # encoding does not change the bytes written; it only removes the
    # dependency on the platform's default encoding.
    with open(output_jsonl, "wt", encoding="utf-8") as f:
        for json_line in iterate_lines_from_file(jsonl_file):
            # Parsing stays outside the try block on purpose: only failures of
            # the standardization itself are treated as "ignorable".
            reaction_record = json.loads(json_line)

            try:
                updated_record = pistachio_standardizer.standardize(reaction_record)
            except Exception as e:
                # Best-effort processing: one bad record must not stop the run.
                # Lazy %-formatting avoids building the message when INFO is
                # disabled.
                logger.info("Ignoring record: %s", e)
                n_ignored += 1
                continue

            f.write(json.dumps(updated_record) + "\n")
            n_written += 1

    # One summary line so that dropped records are visible without having to
    # count the individual "Ignoring record" messages.
    logger.info(
        "Wrote %d records to %s (%d ignored)", n_written, output_jsonl, n_ignored
    )
# Run the hydra-decorated entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()