Skip to content

Commit

Permalink
Merge pull request ilastik#2895 from btbest/ome-zarr-in-headless-and-api
Browse files Browse the repository at this point in the history
Support loading OME-Zarr datasets through headless mode and API
  • Loading branch information
btbest authored Sep 13, 2024
2 parents 07e1078 + d6014b2 commit 380d831
Show file tree
Hide file tree
Showing 14 changed files with 459 additions and 137 deletions.
8 changes: 6 additions & 2 deletions ilastik/applets/dataSelection/dataSelectionGui.py
Original file line number Diff line number Diff line change
def _get_dataset_full_path(self, filePath: Path, roleIndex: int) -> Path:
    """Return filePath extended with the internal dataset path to open.

    If the file holds exactly one dataset it is chosen automatically and
    remembered as the default for this role. Otherwise a previously
    remembered inner path for this role is reused when it matches, or the
    user is asked via SubvolumeSelectionDlg.

    Raises RuntimeError when the file contains no image datasets, and
    DataSelectionGui.UserCancelledError when the user cancels the dialog.
    """
    datasetNames = DatasetInfo.getPossibleInternalPathsFor(filePath.absolute())
    if len(datasetNames) == 0:
        raise RuntimeError(f"File {filePath} has no image datasets")
    keep_selected_as_default = False
    if len(datasetNames) == 1:
        selected_dataset = datasetNames.pop()
        keep_selected_as_default = True
    else:
        auto_inner_paths = self._get_previously_used_inner_paths(roleIndex).intersection(set(datasetNames))
        if len(auto_inner_paths) == 1:
            selected_dataset = auto_inner_paths.pop()
        else:
            # Ask the user which dataset to choose
            dlg = SubvolumeSelectionDlg(datasetNames, self, offer_remember_dataset=True)
            if dlg.exec_() != QDialog.Accepted:
                raise DataSelectionGui.UserCancelledError()
            selected_index = dlg.combo.currentIndex()
            # dlg.checkbox exists because offer_remember_dataset=True above
            keep_selected_as_default = dlg.checkbox.isChecked()
            selected_dataset = str(datasetNames[selected_index])
    if keep_selected_as_default:
        self._add_default_inner_path(roleIndex=roleIndex, inner_path=selected_dataset)
    return filePath / selected_dataset.lstrip("/")

def _get_custom_axistags_from_previous_lane(self, role: Union[str, int], info: DatasetInfo) -> Optional[AxisTags]:
Expand Down
23 changes: 20 additions & 3 deletions ilastik/applets/dataSelection/opDataSelection.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,14 @@ def globInternalPaths(cls, file_path: str, glob_str: str, cwd: str = None) -> Li
elif cls.pathIsHdf5(path):
f = h5py.File(path, "r")
elif cls.pathIsN5(path):
f = z5py.N5File(path) # FIXME
try:
f = z5py.N5File(path)
except AttributeError as e:
# z5py.file doesn't check metadata cleanly:
# `metadata.get('n5')` raises AttributeError: 'NoneType' object has no attribute 'get'
raise ValueError(f'N5 metadata at "{path}" has incompatible format') from e
elif cls.pathIsZarr(path):
f = z5py.ZarrFile(path)
else:
raise ValueError(f"{path} is not an 'n5' or 'h5' file")
internal_paths |= set(globH5N5(f, glob_str))
Expand All @@ -317,9 +324,13 @@ def pathIsNpz(cls, path: Path) -> bool:
def pathIsN5(cls, path: Path) -> bool:
return PathComponents(Path(path).as_posix()).extension in [".n5"]

@classmethod
def pathIsZarr(cls, path: Path) -> bool:
    """True when the external path component of *path* carries a ".zarr" extension."""
    ext = PathComponents(Path(path).as_posix()).extension
    return ext == ".zarr"

@classmethod
def fileHasInternalPaths(cls, path: str) -> bool:
    """True when *path* points at a container format that holds datasets under internal paths."""
    container_checks = (cls.pathIsHdf5, cls.pathIsN5, cls.pathIsNpz, cls.pathIsZarr)
    return any(check(path) for check in container_checks)

@classmethod
def getPossibleInternalPathsFor(cls, file_path: Path, min_ndim=2, max_ndim=5) -> List[str]:
Expand All @@ -335,6 +346,9 @@ def accumulateInternalPaths(name, val):
elif cls.pathIsN5(file_path):
with z5py.N5File(file_path, mode="r+") as f:
f.visititems(accumulateInternalPaths)
elif cls.pathIsZarr(file_path):
with z5py.ZarrFile(file_path, mode="r+") as f:
f.visititems(accumulateInternalPaths)

return datasetNames

Expand Down Expand Up @@ -689,8 +703,11 @@ def isNpz(self) -> bool:
def isN5(self) -> bool:
    """True if at least one of the external paths is an N5 store."""
    for external_path in self.external_paths:
        if self.pathIsN5(external_path):
            return True
    return False

def isZarr(self) -> bool:
    """True if at least one of the external paths is a Zarr store."""
    for external_path in self.external_paths:
        if self.pathIsZarr(external_path):
            return True
    return False

def is_hierarchical(self):
    """True for container formats (h5/npz/n5/zarr) that hold datasets under internal paths."""
    return any(check() for check in (self.isHdf5, self.isNpz, self.isN5, self.isZarr))

def is_in_filesystem(self) -> bool:
return True
Expand Down
15 changes: 15 additions & 0 deletions ilastik/experimental/api/_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,21 @@ class PixelClassificationPipeline:
prob_maps = pipeline.get_probabilities(img)
```
Example using an OME-Zarr dataset stored remotely:
```Python
import xarray
from ilastik.experimental.api import PixelClassificationPipeline
from lazyflow.utility.io_util.OMEZarrStore import OMEZarrStore
store = OMEZarrStore("<https://example.com/data.zarr>")
zarray = store.get_zarr_array("<scale name>")
dims = tuple(store.axistags.keys())
img = xarray.DataArray(zarray, dims=dims) # downloads the entire image
pipeline = PixelClassificationPipeline.from_ilp_file("<path/to/project.ilp>")
prob_maps = pipeline.get_probabilities(img)
```
"""

@classmethod
Expand Down
4 changes: 3 additions & 1 deletion ilastik/widgets/ImageFileDialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def getSelectedPaths(self) -> List[Path]:
filePaths = []
for selected_file in self.selectedFiles():
path = Path(selected_file)
if path.name.lower() == "attributes.json" and any(p.suffix.lower() == ".n5" for p in path.parents):
if (path.name.lower() == "attributes.json" and any(p.suffix.lower() == ".n5" for p in path.parents)) or (
path.name.lower() == ".zgroup" and any(p.suffix.lower() == ".zarr" for p in path.parents)
):
# For n5 the attributes.json file (and for zarr the .zgroup file) has to be selected in the file dialog.
# However we need just the *.n5 / *.zarr directory-file.
filePaths.append(path.parent)
Expand Down
8 changes: 7 additions & 1 deletion ilastik/widgets/hdf5SubvolumeSelectionDialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
QTextEdit,
QVBoxLayout,
QWidget,
QCheckBox,
)

from PyQt5.QtCore import Qt
Expand All @@ -40,7 +41,7 @@ class SubvolumeSelectionDlg(QDialog):
A window to ask the user to choose between multiple HDF5 datasets in a single file.
"""

def __init__(self, datasetNames, parent):
def __init__(self, datasetNames, parent, offer_remember_dataset=False):
super().__init__(parent)
label = QLabel(
"Your HDF5/N5 File contains multiple image volumes.\nPlease select the one you would like to open."
Expand All @@ -50,6 +51,9 @@ def __init__(self, datasetNames, parent):
for name in datasetNames:
self.combo.addItem(name)

if offer_remember_dataset:
self.checkbox = QCheckBox("Always use this dataset in this file (until next ilastik restart)")

buttonbox = QDialogButtonBox(Qt.Horizontal, parent=self)
buttonbox.setStandardButtons(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
buttonbox.accepted.connect(self.accept)
Expand All @@ -58,6 +62,8 @@ def __init__(self, datasetNames, parent):
layout = QVBoxLayout()
layout.addWidget(label)
layout.addWidget(self.combo)
if offer_remember_dataset:
layout.addWidget(self.checkbox)
layout.addWidget(buttonbox)

self.setLayout(layout)
Expand Down
47 changes: 30 additions & 17 deletions lazyflow/operators/ioOperators/opInputDataReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,11 @@ class OpInputDataReader(Operator):
category = "Input"

videoExts = ["ufmf", "mmf"]
h5_n5_Exts = ["h5", "hdf5", "ilp", "n5"]
n5Selection = ["json"] # n5 stores data in a directory, containing a json-file which we use to select the n5-file
h5_n5_Exts = ["h5", "hdf5", "ilp", "n5", "zarr"]
n5Selection = [
"json",
"zgroup",
]  # n5/zarr store data in a directory; the json/zgroup file inside is used to select the store
klbExts = ["klb"]
npyExts = ["npy"]
npzExts = ["npz"]
Expand Down Expand Up @@ -137,6 +140,7 @@ def __init__(
super(OpInputDataReader, self).__init__(*args, **kwargs)
self.internalOperators = []
self.internalOutput = None
self.opInjector = None
self._file = None

self.WorkingDirectory.setOrConnectIfAvailable(WorkingDirectory)
Expand All @@ -151,7 +155,9 @@ def cleanUp(self):

def internalCleanup(self):
    """Disconnect the output and release the injector operator and any open file handle.

    Safe to call repeatedly: both resources are None-guarded and reset to None
    after cleanup.
    """
    self.Output.disconnect()
    if self.opInjector:
        self.opInjector.cleanUp()
        self.opInjector = None
    if self._file is not None:
        self._file.close()
        self._file = None
Expand Down Expand Up @@ -186,7 +192,7 @@ def setupOutputs(self):
self._attemptOpenAsKlb,
self._attemptOpenAsUfmf,
self._attemptOpenAsMmf,
self._attemptOpenAsOmeZarrMultiscale,
self._attemptOpenAsOmeZarrUri,
self._attemptOpenAsRESTfulPrecomputedChunkedVolume,
self._attemptOpenAsDvidVolume,
self._attemptOpenAsH5N5Stack,
Expand Down Expand Up @@ -288,16 +294,22 @@ def _attemptOpenAsUfmf(self, filePath):
else:
return ([], None)

def _attemptOpenAsOmeZarrUri(self, filePath):
    """Try to open *filePath* as an OME-Zarr URI.

    Returns ([reader], reader.Output) on success, or ([], None) when the path
    is not a http/file URI ending in .zarr.
    """
    # Local file system paths with .zarr are handled in _attemptOpenAsH5N5
    path = PathComponents(filePath)
    if path.extension != ".zarr":
        return ([], None)
    if not (filePath.startswith("http") or filePath.startswith("file")):
        return ([], None)
    # DatasetInfo instantiates a standalone OpInputDataReader to obtain laneShape and dtype.
    # We pass this down to the loader so that it can avoid loading scale metadata unnecessarily.
    reader = OpOMEZarrMultiscaleReader(parent=self, metadata_only_mode=self.parent is None)
    if path.internalPath and self.parent:
        # Headless/batch: the internal path selects the scale directly
        reader.Scale.setValue(path.internalPath.lstrip("/"))
    else:
        reader.Scale.connect(self.ActiveScale)
    reader.BaseUri.setValue(path.externalPath)
    return [reader], reader.Output

def _attemptOpenAsRESTfulPrecomputedChunkedVolume(self, filePath):
Expand Down Expand Up @@ -408,22 +420,23 @@ def _attemptOpenAsH5N5(self, filePath):
"No internal path provided for dataset in file: {}".format(externalPath)
)
raise OpInputDataReader.DatasetReadError(msg)
try:
compression_setting = h5N5File[internalPath].compression
except Exception as e:
h5N5File.close()
msg = "Error reading H5/N5 File: {}\n{}".format(externalPath, e)
raise OpInputDataReader.DatasetReadError(msg) from e

# If the h5 dataset is compressed, we'll have better performance
# with a multi-process hdf5 access object.
# (Otherwise, single-process is faster.)
allow_multiprocess_hdf5 = (
"LAZYFLOW_MULTIPROCESS_HDF5" in os.environ and os.environ["LAZYFLOW_MULTIPROCESS_HDF5"] != ""
)
if compression_setting is not None and allow_multiprocess_hdf5 and isinstance(h5N5File, h5py.File):
h5N5File.close()
h5N5File = MultiProcessHdf5File(externalPath, "r")
if isinstance(h5N5File, h5py.File) and allow_multiprocess_hdf5:
try:
compression_setting = h5N5File[internalPath].compression
except Exception as e:
h5N5File.close()
msg = "Error reading H5/N5 File: {}\n{}".format(externalPath, e)
raise OpInputDataReader.DatasetReadError(msg) from e
if compression_setting is not None:
h5N5File.close()
h5N5File = MultiProcessHdf5File(externalPath, "r")

self._file = h5N5File

Expand Down
71 changes: 50 additions & 21 deletions lazyflow/operators/ioOperators/opStreamingH5N5Reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,65 @@
# http://ilastik.org/license/
###############################################################################
# Python
import contextlib
import logging
import time
import numpy
from typing import Union

import vigra
import h5py
import z5py
import json
import os
import numpy as np

from lazyflow.graph import Operator, InputSlot, OutputSlot
from lazyflow.utility import Timer
from lazyflow.utility.helpers import get_default_axisordering, bigintprod
from lazyflow.utility.io_util.OMEZarrStore import get_axistags_from_spec as get_ome_zarr_axistags

logger = logging.getLogger(__name__)


def _find_or_infer_axistags(file: Union[h5py.File, z5py.N5File, z5py.ZarrFile], internalPath: str) -> vigra.AxisTags:
    """Determine axis tags for the dataset at *internalPath* inside *file*.

    Resolution order:
      1. ilastik-style "axistags" attribute stored on the dataset itself.
      2. Zarr only: OME-Zarr "multiscales" metadata at the store root;
         raises ValueError if absent (Zarr without OME metadata is unsupported).
      3. h5/n5 only: Neuroglancer-style "axes" attribute on the dataset.
      4. Fallback: axis order inferred from the dataset's shape.
    """
    assert internalPath in file, "Existence of dataset must be checked earlier"
    with contextlib.suppress(KeyError):
        # Look for ilastik-style axistags property.
        axistagsJson = file[internalPath].attrs["axistags"]
        axistags = vigra.AxisTags.fromJSON(axistagsJson)
        axisorder = "".join(tag.key for tag in axistags)
        if "?" not in axisorder:
            return axistags

    if isinstance(file, z5py.ZarrFile):
        # Look for OME-Zarr metadata (found at store root, not in dataset).
        # This branch always returns or raises, so the code below it only ever
        # runs for h5/n5 files -- no isinstance re-check needed there.
        try:
            # OME-Zarr stores with more than one multiscale don't exist in public, but the spec allows it
            multiscale_index = None
            for i, scale in enumerate(file.attrs["multiscales"]):
                if any(d.get("path", "") == internalPath.lstrip("/") for d in scale.get("datasets", [])):
                    multiscale_index = i
                    break  # take the first multiscale that lists this dataset
            if multiscale_index is None:
                raise KeyError("no spec for dataset path")
            return get_ome_zarr_axistags(file.attrs["multiscales"][multiscale_index])
        except KeyError as e:
            msg = (
                f"Could not find axis information according to OME-Zarr standard "
                f"for dataset {internalPath} in {file.filename}. "
                f"Zarr is only supported with OME-format metadata."
            )
            raise ValueError(msg) from e

    with contextlib.suppress(KeyError):
        # Look for metadata at dataset level (Neuroglancer-style N5 ["x", "y", "z"])
        axisorder = "".join(reversed(file[internalPath].attrs["axes"])).lower()
        return vigra.defaultAxistags(axisorder)

    # Infer from shape
    axisorder = get_default_axisordering(file[internalPath].shape)
    logger.info(f"Could not find stored axistags. Inferred {axisorder} from dataset shape.")
    return vigra.defaultAxistags(str(axisorder))


class OpStreamingH5N5Reader(Operator):
"""
The top-level operator for the data selection applet.
Expand All @@ -56,6 +98,7 @@ class OpStreamingH5N5Reader(Operator):

H5EXTS = [".h5", ".hdf5", ".ilp"]
N5EXTS = [".n5"]
ZARREXTS = [".zarr"]

class DatasetReadError(Exception):
def __init__(self, internalPath):
Expand All @@ -76,24 +119,8 @@ def setupOutputs(self):
raise OpStreamingH5N5Reader.DatasetReadError(internalPath)

dataset = self._h5N5File[internalPath]

try:
# Read the axistags property without actually importing the data
# Throws KeyError if 'axistags' can't be found
axistagsJson = self._h5N5File[internalPath].attrs["axistags"]
axistags = vigra.AxisTags.fromJSON(axistagsJson)
axisorder = "".join(tag.key for tag in axistags)
if "?" in axisorder:
raise KeyError("?")
except KeyError:
# No axistags found.
if "axes" in dataset.attrs:
axisorder = "".join(dataset.attrs["axes"][::-1]).lower()
else:
axisorder = get_default_axisordering(dataset.shape)
axistags = vigra.defaultAxistags(str(axisorder))

assert len(axistags) == len(dataset.shape), f"Mismatch between shape {dataset.shape} and axisorder {axisorder}"
axistags = _find_or_infer_axistags(self._h5N5File, internalPath)
assert len(axistags) == len(dataset.shape), f"Mismatch between shape {dataset.shape} and axis tags {axistags}"

# Configure our slot meta-info
self.OutputImage.meta.dtype = dataset.dtype.type
Expand Down Expand Up @@ -161,3 +188,5 @@ def get_h5_n5_file(filepath, mode="a"):
return z5py.N5File(filepath, mode)
elif ext in OpStreamingH5N5Reader.H5EXTS:
return h5py.File(filepath, mode)
elif ext in OpStreamingH5N5Reader.ZARREXTS:
return z5py.ZarrFile(filepath, mode)
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ def checkGlobString(globString):
pathComponents = [PathComponents(p.strip()) for p in pathStrings]
assert len(pathComponents) > 0

if not all(p.extension in OpStreamingH5N5Reader.H5EXTS + OpStreamingH5N5Reader.N5EXTS for p in pathComponents):
known_exts = OpStreamingH5N5Reader.H5EXTS + OpStreamingH5N5Reader.N5EXTS + OpStreamingH5N5Reader.ZARREXTS
if not all(p.extension in known_exts for p in pathComponents):
raise OpStreamingH5N5SequenceReaderM.WrongFileTypeError(globString)

if len(pathComponents) == 1:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,8 @@ def checkGlobString(globString):
pathComponents = [PathComponents(p.strip()) for p in pathStrings]
assert len(pathComponents) > 0

if not all(p.extension in OpStreamingH5N5Reader.H5EXTS + OpStreamingH5N5Reader.N5EXTS for p in pathComponents):
known_exts = OpStreamingH5N5Reader.H5EXTS + OpStreamingH5N5Reader.N5EXTS + OpStreamingH5N5Reader.ZARREXTS
if not all(p.extension in known_exts for p in pathComponents):
raise OpStreamingH5N5SequenceReaderS.WrongFileTypeError(globString)

if len(pathComponents) == 1:
Expand Down
Loading

0 comments on commit 380d831

Please sign in to comment.