"""Source code for rxn.metrics.scripts.ensure_data_dimension."""

import logging
import math
from pathlib import Path
from typing import Iterable, Tuple

import click
from rxn.utilities.files import (
    PathLike,
    count_lines,
    dump_list_to_file,
    load_list_from_file,
)
from rxn.utilities.logging import setup_console_logger

# Module-level logger. The NullHandler silences "No handler could be found"
# warnings when this module is used as a library without logging configured;
# applications opt in via setup_console_logger() in main().
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def ensure_data_dimension(
    txt_files: Iterable[PathLike], output_dir: PathLike, max_dimension: int
) -> None:
    """Split parallel text files into chunks of at most ``max_dimension`` lines.

    All input files must have the same number of lines and distinct basenames.
    For every chunk index ``i``, a subdirectory ``output_dir/chunk_i`` is
    created containing one file per input, saved under its original basename.

    Args:
        txt_files: files to split. May be any iterable (it is materialized
            internally, so generators are safe).
        output_dir: directory under which the ``chunk_i`` subdirectories are
            created (created if missing).
        max_dimension: maximum number of lines per chunk file.

    Raises:
        ValueError: if no files are given, if the files do not all have the
            same number of lines, or if two files share the same basename.
    """
    # Materialize once: the iterable is traversed several times below, and a
    # generator would be exhausted after the first pass.
    txt_files = list(txt_files)
    if not txt_files:
        raise ValueError("No input files were provided.")

    # Check the lengths of the files and ensure they are all the same
    file_length = [count_lines(txt_file) for txt_file in txt_files]
    if len(set(file_length)) != 1:
        raise ValueError("The files provided have not the same number of lines.")

    # Check that there are no files with the same name: each chunk directory
    # stores files by basename, so duplicates would overwrite each other.
    filenames = [Path(txt_file).name for txt_file in txt_files]
    if len(set(filenames)) != len(filenames):
        raise ValueError("Found files with the same name. Aborting")

    n_lines = file_length[0]
    split_no = math.ceil(n_lines / max_dimension)

    new_output_dir = Path(output_dir)
    new_output_dir.mkdir(parents=True, exist_ok=True)
    logger.info(
        f"Splitting in {split_no} files with the same name of the original ones . "
        f"Saving in {new_output_dir} ."
    )

    for txt_file in txt_files:
        file_content = load_list_from_file(txt_file)
        file_name = Path(txt_file).name
        for chunk_no, chunk_start in enumerate(range(0, n_lines, max_dimension)):
            # Create (or reuse) the per-chunk subdirectory.
            sub_directory = new_output_dir / f"chunk_{chunk_no}"
            sub_directory.mkdir(parents=True, exist_ok=True)
            logger.info(f"Created directory {sub_directory} . Saving files .")

            # Save this file's slice for the current chunk.
            dump_list_to_file(
                file_content[chunk_start : chunk_start + max_dimension],
                sub_directory / file_name,
            )
@click.command(context_settings={"show_default": True})
@click.argument("txt_files", nargs=-1)
@click.option("--output_dir", required=True, help="Where to save all the files")
@click.option(
    "--max_dimension",
    default=50000,
    type=int,
    help="Maximum file length allowed without splitting",
)
def main(txt_files: Tuple[str, ...], output_dir: str, max_dimension: int) -> None:
    """Split files that are too big into sub-chunks (useful for class token
    translations).

    Accepts an arbitrary number of input files; the resulting pieces are
    written to output_dir/chunk_i, where i ranges from 0 up to the number of
    splits required.
    """
    # Enable console logging for this CLI run, then delegate all the work.
    setup_console_logger()
    ensure_data_dimension(txt_files, output_dir, max_dimension)


if __name__ == "__main__":
    main()