Source code for rxn.metrics.tokenize_file

import logging

from rxn.utilities.files import (
    PathLike,
    dump_list_to_file,
    iterate_lines_from_file,
    raise_if_paths_are_identical,
)

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def detokenize_class(tokenized_class: str) -> str:
    """
    Detokenize a reaction class as used in the Transformer classification model.
    E.g. '1 1.2 1.2.3' -> '1.2.3'.

    Args:
        tokenized_class: str to detokenize.

    Raises:
        ValueError: if the input string format is not correct.
    """
    if tokenized_class == "0":
        return tokenized_class

    splitted_class = tokenized_class.split(" ")
    if len(splitted_class) == 1 and len(splitted_class[0].split(".")) == 3:
        # The class is already detokenized.
        return tokenized_class
    if len(splitted_class) != 3:
        raise ValueError(
            f'The class to be detokenized, "{tokenized_class}", is probably not in the correct format.'
        )
    return splitted_class[-1]
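
# Illustrative examples (not part of the library source; the values follow
# from the checks above):
#
#     >>> detokenize_class("1 1.2 1.2.3")
#     '1.2.3'
#     >>> detokenize_class("1.2.3")  # already detokenized: returned unchanged
#     '1.2.3'
#     >>> detokenize_class("0")      # the special class "0" passes through
#     '0'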

def tokenize_class(detokenized_class: str) -> str:
    """
    Tokenize a reaction class as used in the Transformer classification model.
    E.g. '1.2.3' -> '1 1.2 1.2.3'.

    Args:
        detokenized_class: str to tokenize.

    Raises:
        ValueError: if the input string format is not correct.
    """
    if detokenized_class == "0":
        return detokenized_class

    splitted_class = detokenized_class.split(".")
    if len(splitted_class) == 4 and len(detokenized_class.split(" ")) == 3:
        # The class is already tokenized.
        return detokenized_class
    if len(splitted_class) != 3:
        raise ValueError(
            f'The class to be tokenized, "{detokenized_class}", is probably not in the correct format.'
        )
    a, b, _ = splitted_class
    return f"{a} {a}.{b} {detokenized_class}"
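
# Illustrative examples for the inverse direction (not part of the library
# source):
#
#     >>> tokenize_class("1.2.3")
#     '1 1.2 1.2.3'
#     >>> tokenize_class("1 1.2 1.2.3")  # already tokenized: returned unchanged
#     '1 1.2 1.2.3'
#     >>> tokenize_class("1.2")          # wrong format
#     Traceback (most recent call last):
#         ...
#     ValueError: The class to be tokenized, "1.2", is probably not in the correct format.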

def tokenize_class_line(class_line: str, invalid_placeholder: str) -> str:
    """Tokenize one class line, replacing invalid input by a placeholder."""
    try:
        return tokenize_class(class_line)
    except ValueError:
        logger.debug(f'Error when tokenizing the class "{class_line}"')
        return invalid_placeholder

def detokenize_class_line(class_line: str, invalid_placeholder: str) -> str:
    """Detokenize one class line, replacing invalid input by a placeholder."""
    try:
        return detokenize_class(class_line)
    except ValueError:
        logger.debug(f'Error when detokenizing the class "{class_line}"')
        return invalid_placeholder
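
# Unlike tokenize_class / detokenize_class, the *_line wrappers never raise;
# an invalid line becomes the placeholder (illustrative, not part of the
# library source):
#
#     >>> tokenize_class_line("1.2.3", invalid_placeholder="")
#     '1 1.2 1.2.3'
#     >>> tokenize_class_line("not a class", invalid_placeholder="")
#     ''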

def detokenize_classification_file(
    input_file: PathLike, output_file: PathLike, invalid_placeholder: str = ""
) -> None:
    """Detokenize a classification file line by line."""
    raise_if_paths_are_identical(input_file, output_file)
    logger.info(f'Detokenizing "{input_file}" -> "{output_file}".')

    detokenized = (
        detokenize_class_line(line, invalid_placeholder)
        for line in iterate_lines_from_file(input_file)
    )
    dump_list_to_file(detokenized, output_file)

def tokenize_classification_file(
    input_file: PathLike, output_file: PathLike, invalid_placeholder: str = ""
) -> None:
    """Tokenize a classification file line by line."""
    raise_if_paths_are_identical(input_file, output_file)
    logger.info(f'Tokenizing "{input_file}" -> "{output_file}".')

    tokenized = (
        tokenize_class_line(line, invalid_placeholder)
        for line in iterate_lines_from_file(input_file)
    )
    dump_list_to_file(tokenized, output_file)
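
# File-level usage sketch (file names are hypothetical): both functions keep a
# one-to-one line correspondence with the input, writing the placeholder for
# lines that cannot be converted.
#
#     tokenize_classification_file("classes.txt", "classes.tok.txt")
#     detokenize_classification_file("classes.tok.txt", "classes.detok.txt")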

def classification_string_is_tokenized(classification_line: str) -> bool:
    """
    Whether a classification line is tokenized or not.

    Args:
        classification_line: line to inspect.

    Raises:
        ValueError: for errors in tokenization or detokenization.
    """
    detokenized = detokenize_class(classification_line)
    tokenized = tokenize_class(detokenized)
    return classification_line == tokenized
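
# The round-trip check in action (illustrative, not part of the library
# source):
#
#     >>> classification_string_is_tokenized("1 1.2 1.2.3")
#     True
#     >>> classification_string_is_tokenized("1.2.3")
#     False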

def classification_file_is_tokenized(filepath: PathLike) -> bool:
    """
    Whether a file contains tokenized classes or not ('1.2.3' vs '1 1.2 1.2.3').

    Note: this looks at the first non-empty line of the file only!

    Args:
        filepath: path to the file.

    Raises:
        ValueError: for errors in tokenization or detokenization.
        RuntimeError: for empty files or files with empty lines only.
    """
    for line in iterate_lines_from_file(filepath):
        # Ignore empty lines.
        if line == "":
            continue
        return classification_string_is_tokenized(line)
    raise RuntimeError(
        f'Could not determine whether "{filepath}" is class-tokenized: empty lines only.'
    )
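
# Putting the pieces together (hypothetical file names): detokenize a file
# only if it is actually tokenized.
#
#     if classification_file_is_tokenized("classes.txt"):
#         detokenize_classification_file("classes.txt", "classes.detok.txt")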