feat: add converter between HDF5 and ROOT files #9

Open · wants to merge 7 commits into base: main
5 changes: 4 additions & 1 deletion src/odapt/__init__.py
@@ -6,6 +6,9 @@
from __future__ import annotations

from odapt._version import __version__
from odapt.operations import hadd # noqa: F401
from odapt.operations import (
hadd, # noqa: F401
hdf5_to_root, # noqa: F401
)

__all__ = ["__version__"]
38 changes: 0 additions & 38 deletions src/odapt/operations/hadd.py
@@ -476,41 +476,3 @@ def main():
skip_bad_files=args.skip_bad_files,
union=args.union,
)


# def merge_ttree(ttree1, ttree2, name): #hadd includes
# # Use tmpdir? Or just do two at a time, tree reduction style...
# if ttree1.name != ttree2.name:
# print("Names must be the same")

# #title must be the same as the file name? maybe is just a tChain thing

# # Get keys
# t1_keys = ttree1.keys(recursive=True)
# t2_keys = ttree2.keys(recursive=True)

# all_keys = np.union1d(t1_keys, t2_keys)

# for t1_key in t1_keys:
# class_name = ttree1[t1_key].class_name()
# if class_name == "ttree":
# branches = ttree1[t1_key].branches

# # for t2_key in t2_keys:
# if :


# merge_inputs()

# # Check if histograms

# write...
# # read key - get get class name
# # inputs(?) = tlist()
# # if isTree:
# # obj = obj.CloneTree?
# # branches = obj.branches
# # for f2 in files[1:]:
# # other_obj = f2.getListOfKeys().readObj()
# # inputs.Add(other_obj)
# #
77 changes: 77 additions & 0 deletions src/odapt/operations/hdf5_to_root.py
@@ -0,0 +1,77 @@
from __future__ import annotations

import h5py
import uproot


def hdf5_to_root(
    read_path,
    write_path,
    *,
    chunk_shape=True,  # currently unused; reserved for custom chunking
    compression=None,  # currently unused; see the note below
):
    # "r" opens read-only; the file must already exist (it is also the default mode).
    with h5py.File(read_path, "r") as f:
        out_file = uproot.recreate(write_path)
        for key in f.keys():
            if isinstance(f[key], h5py.Group):
                recur_in_group(f[key], out_file)
            else:
                # The first chunk creates the TTree and its branch; later
                # chunks are appended, so the whole dataset never has to be
                # held in memory at once.
                first = True
                for chunk in f[key].iter_chunks():
                    if first:
                        out_file[key] = {key: f[key][chunk]}
                        first = False
                    else:
                        out_file[key].extend({key: f[key][chunk]})
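
# Note: a minimal sketch of wiring the (currently unused) compression argument
# through to uproot. uproot.recreate accepts a compression object such as
# uproot.ZLIB(n) or uproot.LZ4(n); the level 4 here is illustrative only:
#
#     out_file = uproot.recreate(write_path, compression=uproot.ZLIB(4))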


def recur_in_group(group, out_file_directory):
    # Datasets in this group become branches of one TTree; subgroups get
    # their own TTrees (groups have no dtype, so they are handled separately).
    dataset_keys = [k for k in group.keys() if isinstance(group[k], h5py.Dataset)]
    tree = out_file_directory.mktree(
        group.name.lstrip("/"), {k: group[k].dtype for k in dataset_keys}
    )
    for key in group.keys():
        if isinstance(group[key], h5py.Group):
            recur_in_group(group[key], out_file_directory)
    if dataset_keys:
        # Fill all branches one HDF5 chunk at a time; this assumes the
        # datasets in a group share the same length and chunk layout.
        for chunk in group[dataset_keys[0]].iter_chunks():
            tree.extend({k: group[k][chunk] for k in dataset_keys})
    return tree


from numpy.random import PCG64, Generator

f = h5py.File("/Users/zobil/Documents/odapt/tests/samples/mytestfile.hdf5", "w")

rng = Generator(PCG64())
group = f.create_group("datasets", track_order=True)

array = rng.standard_normal(10000)
dset = group.create_dataset("mydataset", data=array, dtype="f", chunks=True)

array1 = rng.standard_normal(10000)
dset1 = group.create_dataset("mydataset1", data=array1, dtype="f", chunks=True)
f.close()  # close the write handle before reopening the same file for conversion

hdf5_to_root(
    "/Users/zobil/Documents/odapt/tests/samples/mytestfile.hdf5",
    "/Users/zobil/Documents/odapt/tests/samples/destination.root",
)
# Spot-check the converted file: one TTree per HDF5 group, one branch per dataset.
with uproot.open("/Users/zobil/Documents/odapt/tests/samples/destination.root") as file:
    keys = file.keys(cycle=False)
    print(keys)
    ttree = file["datasets"]
    print(ttree.keys(cycle=False))
    print(ttree["mydataset"].array())
    print(ttree["mydataset1"].array())
71 changes: 71 additions & 0 deletions src/odapt/operations/root_to_hdf5.py
@@ -0,0 +1,71 @@
from __future__ import annotations

import h5py
import uproot
from skhep_testdata import data_path


def root_to_hdf5(read_path, write_path, *, compression=None, max_step_size=10000):
    with h5py.File(write_path, "w") as out_file, uproot.open(read_path) as in_file:
        for key in in_file.keys(cycle=False):
            if in_file[key].classname == "TTree":
                sub_group = out_file.create_group(in_file[key].name)
                recur_write_hdf5(in_file[key], sub_group, compression, max_step_size)
            # Non-TTree objects (e.g. histograms) are not handled yet.


def recur_write_hdf5(tree, group, compression, step_size):
    # HDF5 attributes are not filled in automatically; see the sketch after
    # this function for attaching metadata by hand.
    for branch in tree.branches:
        if len(branch.branches) > 0:
            # Branches with sub-branches become a nested HDF5 group.
            sub_group = group.create_group(branch.name)
            recur_write_hdf5(branch, sub_group, compression, step_size)
        else:
            # Flat, fixed-width branches become one chunked dataset each;
            # the dtype is taken from a one-entry read of the branch.
            sample = branch.array(entry_start=0, entry_stop=1, library="np")
            dset = group.create_dataset(
                branch.name,
                shape=(branch.num_entries,),
                dtype=sample.dtype,
                chunks=True,
                compression=compression,
            )
            # Copy in fixed-size steps so large branches never have to be
            # materialized in memory all at once.
            for start in range(0, branch.num_entries, step_size):
                stop = min(start + step_size, branch.num_entries)
                dset[start:stop] = branch.array(
                    entry_start=start, entry_stop=stop, library="np"
                )
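
# Note: group/dataset attributes are not set automatically. A minimal sketch
# (the attribute names here are hypothetical, not an odapt convention) of
# attaching provenance metadata by hand through h5py's attrs mapping:
#
#     group.attrs["source_classname"] = tree.classname
#     dset.attrs["num_entries"] = branch.num_entries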


# A fixed step size is arbitrary; reading basket-by-basket may be better,
# since baskets are ROOT's natural I/O unit, though very small baskets would
# mean many small reads. Relevant TBranch members:
#   .entries_to_ranges_or_baskets
#   .num_baskets
#   .basket(basket_num)
#   .basket_compressed_bytes / .basket_uncompressed_bytes
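
# A minimal sketch of the basket-aligned alternative (assuming flat numeric
# branches; TBranch.entry_offsets gives the entry boundaries of the baskets):
#
#     offsets = branch.entry_offsets
#     for start, stop in zip(offsets, offsets[1:]):
#         dset[start:stop] = branch.array(
#             entry_start=start, entry_stop=stop, library="np"
#         )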

root_to_hdf5(
    data_path("uproot-HZZ.root"),
    "/Users/zobil/Documents/odapt/tests/samples/mytestfile.hdf5",
)