diff --git a/file.root b/file.root
new file mode 100644
index 0000000..12b19d1
Binary files /dev/null and b/file.root differ
diff --git a/root_parquet.root b/root_parquet.root
new file mode 100644
index 0000000..c5d81a5
Binary files /dev/null and b/root_parquet.root differ
diff --git a/src/odapt/parquet_to_parquet.py b/src/odapt/parquet_to_parquet.py
new file mode 100644
index 0000000..04d2184
--- /dev/null
+++ b/src/odapt/parquet_to_parquet.py
@@ -0,0 +1,225 @@
+from __future__ import annotations
+
+from os import fsdecode
+from pathlib import Path
+
+import awkward as ak
+
+
+def parquet_to_parquet(
+    in_file=None,
+    out_file=None,
+    *,
+    force=False,
+    # step_size=1, maybe add this for the number of row groups to deal with at once?
+    list_to32=False,
+    string_to32=True,
+    bytestring_to32=True,
+    emptyarray_to=None,
+    categorical_as_dictionary=False,
+    extensionarray=True,
+    count_nulls=True,
+    compression="zstd",
+    compression_level=None,
+    row_group_size=64 * 1024 * 1024,
+    data_page_size=None,
+    parquet_flavor=None,
+    parquet_version="2.4",
+    parquet_page_version="1.0",
+    parquet_metadata_statistics=True,
+    parquet_dictionary_encoding=False,
+    parquet_byte_stream_split=False,
+    parquet_coerce_timestamps=None,
+    parquet_old_int96_timestamps=None,
+    parquet_compliant_nested=False,
+    parquet_extra_options=None,
+    storage_options=None,
+):
+    """Converts a Parquet file to a new Parquet file using ak.from_parquet and
+    ak.to_parquet_row_groups. Data is read one row group at a time and written
+    to a single output Parquet file.
+
+    :param in_file: Local Parquet file to copy. May contain glob patterns.
+    :type in_file: str
+    :param out_file: Name of the output file or file path.
+    :type out_file: path-like
+    :param force: If True, replaces the output file if it already exists. Default is False.
+        Command line options: ``-f`` or ``--force``.
+    :type force: bool, optional
+    :param list_to32: If True, convert Awkward lists into 32-bit Arrow lists
+        if they're small enough, even if it means an extra conversion.
+        Otherwise, signed 32-bit ak.types.ListType maps to Arrow ListType,
+        signed 64-bit ak.types.ListType maps to Arrow LargeListType, and
+        unsigned 32-bit ak.types.ListType picks whichever Arrow type its
+        values fit into. Command line option: ``--list-to32``.
+    :type list_to32: bool
+    :param string_to32: Same as the above for Arrow string and ``large_string``.
+        Command line option: ``--string-to32``.
+    :type string_to32: bool
+    :param bytestring_to32: Same as the above for Arrow binary and ``large_binary``.
+        Command line option: ``--bytestring-to32``.
+    :type bytestring_to32: bool
+    :param emptyarray_to: If None, #ak.types.UnknownType maps to Arrow's
+        null type; otherwise, it is converted to the given numeric dtype.
+        Command line option: ``--emptyarray-to``.
+    :type emptyarray_to: None or dtype
+    :param categorical_as_dictionary: If True, #ak.contents.IndexedArray and
+        #ak.contents.IndexedOptionArray labeled with ``__array__ = "categorical"``
+        are mapped to Arrow `DictionaryArray`; otherwise, the projection is
+        evaluated before conversion (always the case without
+        `__array__ = "categorical"`). Command line option: ``--categorical-as-dictionary``.
+    :type categorical_as_dictionary: bool
+    :param extensionarray: If True, this function returns extended Arrow arrays
+        (at all levels of nesting), which preserve metadata so that Awkward →
+        Arrow → Awkward preserves the array's #ak.types.Type (though not
+        the #ak.forms.Form). If False, this function returns generic Arrow arrays
+        that might be needed for third-party tools that don't recognize Arrow's
+        extensions. Even with `extensionarray=False`, the values produced by
+        Arrow's `to_pylist` method are the same as the values produced by Awkward's
+        #ak.to_list. Command line option: ``--extensionarray``.
+    :type extensionarray: bool
+    :param count_nulls: If True, count the number of missing values at each level
+        and include these in the resulting Arrow array, which makes some downstream
+        applications faster. If False, skip the up-front cost of counting them.
+        Command line option: ``--count-nulls``.
+    :type count_nulls: bool
+    :param compression: Compression algorithm name, passed to
+        [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+        Parquet supports `{"NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"}`
+        (where `"GZIP"` is also known as "zlib" or "deflate"). If a dict, the keys
+        are column names (the same column names that #ak.forms.Form.columns returns
+        and #ak.forms.Form.select_columns accepts) and the values are compression
+        algorithm names, to compress each column differently. Command line option: ``--compression``.
+    :type compression: None, str, or dict
+    :param compression_level: Compression level, passed to
+        [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+        Compression levels have different meanings for different compression
+        algorithms: GZIP ranges from 1 to 9, but ZSTD ranges from -7 to 22, for
+        example. Generally, higher numbers provide slower but smaller compression.
+        Command line option: ``--compression-level``.
+    :type compression_level: None, int, or dict
+    :param row_group_size: Maximum number of entries in each row group,
+        passed to [pyarrow.parquet.ParquetWriter.write_table](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write_table).
+        If None, the Parquet default of 64 MiB is used. Command line options: ``-rg`` or ``--row-group-size``.
+    :type row_group_size: int or None
+    :param data_page_size: Number of bytes in each data page, passed to
+        [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+        If None, the Parquet default of 1 MiB is used. Command line option: ``--data-page-size``.
+    :type data_page_size: None or int
+    :param parquet_flavor: If None, the output Parquet file will follow
+        Arrow conventions; if `"spark"`, it will follow Spark conventions. Some
+        systems, such as Spark and Google BigQuery, might need Spark conventions,
+        while others might need Arrow conventions. Passed to
+        [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `flavor`. Command line option: ``--parquet-flavor``.
+    :type parquet_flavor: None or `"spark"`
+    :param parquet_version: Parquet file format version.
+        Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `version`. Command line option: ``--parquet-version``.
+    :type parquet_version: `"1.0"`, `"2.4"`, or `"2.6"`
+    :param parquet_page_version: Parquet page format version.
+        Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `data_page_version`. Command line option: ``--parquet-page-version``.
+    :type parquet_page_version: `"1.0"` or `"2.0"`
+    :param parquet_metadata_statistics: If True, include summary
+        statistics for each data page in the Parquet metadata, which lets some
+        applications search for data more quickly (by skipping pages). If a dict
+        mapping column names to bool, include summary statistics on only the
+        specified columns. Passed to
+        [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `write_statistics`. Command line option: ``--parquet-metadata-statistics``.
+    :type parquet_metadata_statistics: bool or dict
+    :param parquet_dictionary_encoding: If True, allow Parquet to pre-compress
+        with dictionary encoding. If a dict mapping column names to bool, only
+        use dictionary encoding on the specified columns. Passed to
+        [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `use_dictionary`. Command line option: ``--parquet-dictionary-encoding``.
+    :type parquet_dictionary_encoding: bool or dict
+    :param parquet_byte_stream_split: If True, pre-compress floating
+        point fields (`float32` or `float64`) with byte stream splitting, which
+        collects all mantissas in one part of the stream and exponents in another.
+        Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `use_byte_stream_split`. Command line option: ``--parquet-byte-stream-split``.
+    :type parquet_byte_stream_split: bool or dict
+    :param parquet_coerce_timestamps: If None, any timestamps
+        (`datetime64` data) are coerced to a given resolution depending on
+        `parquet_version`: versions `"1.0"` and `"2.4"` are coerced to microseconds,
+        but later versions use the `datetime64`'s own units. If `"ms"` is explicitly
+        specified, timestamps are coerced to milliseconds; if `"us"`, microseconds.
+        Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `coerce_timestamps`. Command line option: ``--parquet-coerce-timestamps``.
+    :type parquet_coerce_timestamps: None, `"ms"`, or `"us"`
+    :param parquet_old_int96_timestamps: If True, use Parquet's INT96 format
+        for any timestamps (`datetime64` data), taking priority over `parquet_coerce_timestamps`.
+        If None, let the `parquet_flavor` decide. Passed to
+        [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `use_deprecated_int96_timestamps`. Command line option: ``--parquet-old-int96-timestamps``.
+    :type parquet_old_int96_timestamps: None or bool
+    :param parquet_compliant_nested: If True, use the Spark/BigQuery/Parquet
+        [convention for nested lists](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types),
+        in which each list is a one-field record with field name "`element`";
+        otherwise, use the Arrow convention, in which the field name is "`item`".
+        Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html)
+        as `use_compliant_nested_type`. Command line option: ``--parquet-compliant-nested``.
+    :type parquet_compliant_nested: bool
+    :param parquet_extra_options: Any additional options to pass to
+        [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).
+    :type parquet_extra_options: None or dict
+    :param storage_options: Any additional options to pass to
+        [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs)
+        to open a remote file for writing.
+    :type storage_options: None or dict
+
+    Examples:
+    ---------
+    Copies the data from one Parquet file to a new Parquet file.
+
+    >>> odapt.parquet_to_parquet(in_file="file.parquet", out_file="new_file.parquet")
+
+    Command Line Instructions:
+    --------------------------
+    This function can be run from the command line. Use command
+
+    >>> odapt parquet-to-parquet [options] [OUT_FILE] [IN_FILE]
+
+    """
+    # Validate the output path first so a bad argument fails before any reading.
+    try:
+        out_file = fsdecode(out_file)
+    except TypeError:
+        msg = f"'out_file' argument of 'parquet_to_parquet' must be path-like, not {type(out_file).__name__}"
+        raise TypeError(msg) from None
+    path = Path(out_file)
+    if path.is_file() and not force:
+        msg = f"file {path} already exists; pass force=True to overwrite it"
+        raise FileExistsError(msg)
+    try:
+        metadata = ak.metadata_from_parquet(in_file)
+    except FileNotFoundError:
+        msg = f"file {in_file} does not exist or is corrupt; could not retrieve metadata"
+        raise FileNotFoundError(msg) from None
+
+    # Stream the copy: a generator hands ak.to_parquet_row_groups one row group
+    # at a time, so the whole file is never held in memory at once.
+    ak.to_parquet_row_groups(
+        (
+            ak.from_parquet(in_file, row_groups=[i])
+            for i in range(metadata["num_row_groups"])
+        ),
+        out_file,
+        list_to32=list_to32,
+        string_to32=string_to32,
+        bytestring_to32=bytestring_to32,
+        emptyarray_to=emptyarray_to,
+        categorical_as_dictionary=categorical_as_dictionary,
+        extensionarray=extensionarray,
+        count_nulls=count_nulls,
+        compression=compression,
+        compression_level=compression_level,
+        row_group_size=row_group_size,
+        data_page_size=data_page_size,
+        parquet_flavor=parquet_flavor,
+        parquet_version=parquet_version,
+        parquet_page_version=parquet_page_version,
+        parquet_metadata_statistics=parquet_metadata_statistics,
+        parquet_dictionary_encoding=parquet_dictionary_encoding,
+        parquet_byte_stream_split=parquet_byte_stream_split,
+        parquet_coerce_timestamps=parquet_coerce_timestamps,
+        parquet_old_int96_timestamps=parquet_old_int96_timestamps,
+        parquet_compliant_nested=parquet_compliant_nested,
+        parquet_extra_options=parquet_extra_options,
+        storage_options=storage_options,
+    )
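A minimal sketch of how the new function can be exercised against one of the sample files added in this diff. The output path is hypothetical, and the top-level import assumes odapt re-exports the function; otherwise import it from odapt.parquet_to_parquet.

>>> from odapt import parquet_to_parquet
>>> parquet_to_parquet(
...     in_file="tests/samples/uproot-HZZ.parquet",
...     out_file="/tmp/uproot-HZZ-copy.parquet",
...     force=True,
... )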
diff --git a/test.parquet b/test.parquet
new file mode 100644
index 0000000..c6083a9
Binary files /dev/null and b/test.parquet differ
diff --git a/test.root b/test.root
new file mode 100644
index 0000000..a553389
Binary files /dev/null and b/test.root differ
diff --git a/test1.parquet b/test1.parquet
new file mode 100644
index 0000000..3be1b0d
Binary files /dev/null and b/test1.parquet differ
diff --git a/test1.root b/test1.root
new file mode 100644
index 0000000..a802299
Binary files /dev/null and b/test1.root differ
diff --git a/tests/samples/copy.root b/tests/samples/copy.root
new file mode 100644
index 0000000..2460fd6
Binary files /dev/null and b/tests/samples/copy.root differ
diff --git a/tests/samples/destination.root b/tests/samples/destination.root
new file mode 100644
index 0000000..1fdd606
Binary files /dev/null and b/tests/samples/destination.root differ
diff --git a/tests/samples/mytestfile.hdf5 b/tests/samples/mytestfile.hdf5
new file mode 100644
index 0000000..ebb6f90
Binary files /dev/null and b/tests/samples/mytestfile.hdf5 differ
diff --git a/tests/samples/nanoAOD_2015_CMS_Open_Data_ttbar.root b/tests/samples/nanoAOD_2015_CMS_Open_Data_ttbar.root
new file mode 100644
index 0000000..c0f9b99
Binary files /dev/null and b/tests/samples/nanoAOD_2015_CMS_Open_Data_ttbar.root differ
diff --git a/tests/samples/parquet_HZZ.parquet b/tests/samples/parquet_HZZ.parquet
new file mode 100644
index 0000000..f6ba238
Binary files /dev/null and b/tests/samples/parquet_HZZ.parquet differ
diff --git a/tests/samples/parquet_HZZ.root b/tests/samples/parquet_HZZ.root
new file mode 100644
index 0000000..91edbff
Binary files /dev/null and b/tests/samples/parquet_HZZ.root differ
diff --git a/tests/samples/parquet_hepdata-example.parquet b/tests/samples/parquet_hepdata-example.parquet
new file mode 100644
index 0000000..6bef6c9
Binary files /dev/null and b/tests/samples/parquet_hepdata-example.parquet differ
diff --git a/tests/samples/parquet_test.parquet b/tests/samples/parquet_test.parquet
new file mode 100644
index 0000000..58174de
Binary files /dev/null and b/tests/samples/parquet_test.parquet differ
diff --git a/tests/samples/uproot-HZZ.parquet b/tests/samples/uproot-HZZ.parquet
new file mode 100644
index 0000000..89fddcd
Binary files /dev/null and b/tests/samples/uproot-HZZ.parquet differ
diff --git a/tests/samples/uproot-HZZ.root b/tests/samples/uproot-HZZ.root
new file mode 100644
index 0000000..a99b94c
Binary files /dev/null and b/tests/samples/uproot-HZZ.root differ
diff --git a/uproot-HZZ.parquet b/uproot-HZZ.parquet
new file mode 100644
index 0000000..b41ca01
Binary files /dev/null and b/uproot-HZZ.parquet differ
diff --git a/uproot-HZZ.root b/uproot-HZZ.root
new file mode 100644
index 0000000..831fac5
Binary files /dev/null and b/uproot-HZZ.root differ
diff --git a/uproot-hepdata-example.parquet b/uproot-hepdata-example.parquet
new file mode 100644
index 0000000..f2e2617
Binary files /dev/null and b/uproot-hepdata-example.parquet differ
diff --git a/uproot-hepdata-example.root b/uproot-hepdata-example.root
new file mode 100644
index 0000000..41b765a
Binary files /dev/null and b/uproot-hepdata-example.root differ
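To spot-check that the row-group streaming preserved the data, the copy can be read back and compared with the source. A sketch under the same assumptions as the example above; ak.metadata_from_parquet returns the same "num_row_groups" entry the implementation iterates over.

>>> import awkward as ak
>>> ak.metadata_from_parquet("/tmp/uproot-HZZ-copy.parquet")["num_row_groups"]
>>> original = ak.from_parquet("tests/samples/uproot-HZZ.parquet")
>>> copied = ak.from_parquet("/tmp/uproot-HZZ-copy.parquet")
>>> assert ak.to_list(original) == ak.to_list(copied)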