feat: add converter between HDF5 and ROOT files #9

Open · wants to merge 7 commits into base: main
5 changes: 4 additions & 1 deletion src/odapt/__init__.py
@@ -6,6 +6,9 @@
from __future__ import annotations

from odapt._version import __version__
from odapt.operations import hadd # noqa: F401
from odapt.operations import (
hadd, # noqa: F401
hdf5_to_root, # noqa: F401
)

__all__ = ["__version__"]
38 changes: 0 additions & 38 deletions src/odapt/operations/hadd.py
@@ -476,41 +476,3 @@ def main():
skip_bad_files=args.skip_bad_files,
union=args.union,
)


# def merge_ttree(ttree1, ttree2, name): #hadd includes
# # Use tmpdir? Or just do two at a time, tree reduction style...
# if ttree1.name != ttree2.name:
# print("Names must be the same")

# #title must be the same as the file name? maybe is just a tChain thing

# # Get keys
# t1_keys = ttree1.keys(recursive=True)
# t2_keys = ttree2.keys(recursive=True)

# all_keys = np.union1d(t1_keys, t2_keys)

# for t1_key in t1_keys:
# class_name = ttree1[t1_key].class_name()
# if class_name == "ttree":
# branches = ttree1[t1_key].branches

# # for t2_key in t2_keys:
# if :


# merge_inputs()

# # Check if histograms

# write...
# # read key - get get class name
# # inputs(?) = tlist()
# # if isTree:
# # obj = obj.CloneTree?
# # branches = obj.branches
# # for f2 in files[1:]:
# # other_obj = f2.getListOfKeys().readObj()
# # inputs.Add(other_obj)
# #
77 changes: 77 additions & 0 deletions src/odapt/operations/hdf5_to_root.py
@@ -0,0 +1,77 @@
from __future__ import annotations

import h5py
import uproot


def hdf5_to_root(
    read_path,
    write_path,
    *,
    chunk_shape=True,  # currently unused; reserved for custom chunking
    compression=None,  # currently unused; see the note below
):
    # "r" opens read-only; the file must already exist (it is also the default mode).
    with h5py.File(read_path, "r") as f:
        out_file = uproot.recreate(write_path)
        for key in f.keys():
            if isinstance(f[key], h5py.Group):
                recur_in_group(f[key], out_file)
            else:
                # The first chunk creates the TTree and its branch; later
                # chunks are appended, so the whole dataset never has to be
                # held in memory at once.
                first = True
                for chunk in f[key].iter_chunks():
                    if first:
                        out_file[key] = {key: f[key][chunk]}
                        first = False
                    else:
                        out_file[key].extend({key: f[key][chunk]})
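
# Note: a minimal sketch of wiring the (currently unused) compression argument
# through to uproot. uproot.recreate accepts a compression object such as
# uproot.ZLIB(n) or uproot.LZ4(n); the level 4 here is illustrative only:
#
#     out_file = uproot.recreate(write_path, compression=uproot.ZLIB(4))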


def recur_in_group(group, out_file_directory):
    # Datasets in this group become branches of one TTree; subgroups get
    # their own TTrees (groups have no dtype, so they are handled separately).
    dataset_keys = [k for k in group.keys() if isinstance(group[k], h5py.Dataset)]
    tree = out_file_directory.mktree(
        group.name.lstrip("/"), {k: group[k].dtype for k in dataset_keys}
    )
    for key in group.keys():
        if isinstance(group[key], h5py.Group):
            recur_in_group(group[key], out_file_directory)
    if dataset_keys:
        # Fill all branches one HDF5 chunk at a time; this assumes the
        # datasets in a group share the same length and chunk layout.
        for chunk in group[dataset_keys[0]].iter_chunks():
            tree.extend({k: group[k][chunk] for k in dataset_keys})
    return tree


from numpy.random import PCG64, Generator

f = h5py.File("/Users/zobil/Documents/odapt/tests/samples/mytestfile.hdf5", "w")

rng = Generator(PCG64())
group = f.create_group("datasets", track_order=True)

array = rng.standard_normal(10000)
dset = group.create_dataset("mydataset", data=array, dtype="f", chunks=True)

array1 = rng.standard_normal(10000)
dset1 = group.create_dataset("mydataset1", data=array1, dtype="f", chunks=True)
f.close()  # close the write handle before reopening the same file for conversion

hdf5_to_root(
    "/Users/zobil/Documents/odapt/tests/samples/mytestfile.hdf5",
    "/Users/zobil/Documents/odapt/tests/samples/destination.root",
)
# Spot-check the converted file: one TTree per HDF5 group, one branch per dataset.
with uproot.open("/Users/zobil/Documents/odapt/tests/samples/destination.root") as file:
    keys = file.keys(cycle=False)
    print(keys)
    ttree = file["datasets"]
    print(ttree.keys(cycle=False))
    print(ttree["mydataset"].array())
    print(ttree["mydataset1"].array())
71 changes: 71 additions & 0 deletions src/odapt/operations/root_to_hdf5.py
@@ -0,0 +1,71 @@
from __future__ import annotations

import h5py
import uproot
from skhep_testdata import data_path


def root_to_hdf5(read_path, write_path, *, compression=None, max_step_size=10000):
    with h5py.File(write_path, "w") as out_file, uproot.open(read_path) as in_file:
        for key in in_file.keys(cycle=False):
            if in_file[key].classname == "TTree":
                sub_group = out_file.create_group(in_file[key].name)
                recur_write_hdf5(in_file[key], sub_group, compression, max_step_size)
            # Non-TTree objects (e.g. histograms) are not handled yet.


def recur_write_hdf5(tree, group, compression, step_size):
    # HDF5 attributes are not filled in automatically; see the sketch after
    # this function for attaching metadata by hand.
    for branch in tree.branches:
        if len(branch.branches) > 0:
            # Branches with sub-branches become a nested HDF5 group.
            sub_group = group.create_group(branch.name)
            recur_write_hdf5(branch, sub_group, compression, step_size)
        else:
            # Flat, fixed-width branches become one chunked dataset each;
            # the dtype is taken from a one-entry read of the branch.
            sample = branch.array(entry_start=0, entry_stop=1, library="np")
            dset = group.create_dataset(
                branch.name,
                shape=(branch.num_entries,),
                dtype=sample.dtype,
                chunks=True,
                compression=compression,
            )
            # Copy in fixed-size steps so large branches never have to be
            # materialized in memory all at once.
            for start in range(0, branch.num_entries, step_size):
                stop = min(start + step_size, branch.num_entries)
                dset[start:stop] = branch.array(
                    entry_start=start, entry_stop=stop, library="np"
                )
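
# Note: group/dataset attributes are not set automatically. A minimal sketch
# (the attribute names here are hypothetical, not an odapt convention) of
# attaching provenance metadata by hand through h5py's attrs mapping:
#
#     group.attrs["source_classname"] = tree.classname
#     dset.attrs["num_entries"] = branch.num_entries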


# A fixed step size is arbitrary; reading basket-by-basket may be better,
# since baskets are ROOT's natural I/O unit, though very small baskets would
# mean many small reads. Relevant TBranch members:
#   .entries_to_ranges_or_baskets
#   .num_baskets
#   .basket(basket_num)
#   .basket_compressed_bytes / .basket_uncompressed_bytes
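
# A minimal sketch of the basket-aligned alternative (assuming flat numeric
# branches; TBranch.entry_offsets gives the entry boundaries of the baskets):
#
#     offsets = branch.entry_offsets
#     for start, stop in zip(offsets, offsets[1:]):
#         dset[start:stop] = branch.array(
#             entry_start=start, entry_stop=stop, library="np"
#         )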

root_to_hdf5(
    data_path("uproot-HZZ.root"),
    "/Users/zobil/Documents/odapt/tests/samples/mytestfile.hdf5",
)