Skip to content

Commit

Permalink
Merge pull request ilastik#2895 from btbest/ome-zarr-in-headless-and-api
Browse files Browse the repository at this point in the history
Support loading OME-Zarr datasets through headless mode and API
  • Loading branch information
btbest authored Sep 13, 2024
2 parents 07e1078 + d6014b2 commit 380d831
Show file tree
Hide file tree
Showing 14 changed files with 459 additions and 137 deletions.
8 changes: 6 additions & 2 deletions ilastik/applets/dataSelection/dataSelectionGui.py
Original file line number Diff line number Diff line change
def _get_dataset_full_path(self, filePath: Path, roleIndex: int) -> Path:
    """Return filePath extended with the internal dataset path to open.

    If the file holds exactly one dataset it is chosen automatically and
    remembered as the default for this role. Otherwise a previously
    remembered inner path for this role is reused when it matches, or the
    user is asked via SubvolumeSelectionDlg.

    Raises RuntimeError when the file contains no image datasets, and
    DataSelectionGui.UserCancelledError when the user cancels the dialog.
    """
    datasetNames = DatasetInfo.getPossibleInternalPathsFor(filePath.absolute())
    if len(datasetNames) == 0:
        raise RuntimeError(f"File {filePath} has no image datasets")
    keep_selected_as_default = False
    if len(datasetNames) == 1:
        selected_dataset = datasetNames.pop()
        keep_selected_as_default = True
    else:
        auto_inner_paths = self._get_previously_used_inner_paths(roleIndex).intersection(set(datasetNames))
        if len(auto_inner_paths) == 1:
            selected_dataset = auto_inner_paths.pop()
        else:
            # Ask the user which dataset to choose
            dlg = SubvolumeSelectionDlg(datasetNames, self, offer_remember_dataset=True)
            if dlg.exec_() != QDialog.Accepted:
                raise DataSelectionGui.UserCancelledError()
            selected_index = dlg.combo.currentIndex()
            # dlg.checkbox exists because offer_remember_dataset=True above
            keep_selected_as_default = dlg.checkbox.isChecked()
            selected_dataset = str(datasetNames[selected_index])
    if keep_selected_as_default:
        self._add_default_inner_path(roleIndex=roleIndex, inner_path=selected_dataset)
    return filePath / selected_dataset.lstrip("/")

def _get_custom_axistags_from_previous_lane(self, role: Union[str, int], info: DatasetInfo) -> Optional[AxisTags]:
Expand Down
23 changes: 20 additions & 3 deletions ilastik/applets/dataSelection/opDataSelection.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,14 @@ def globInternalPaths(cls, file_path: str, glob_str: str, cwd: str = None) -> Li
elif cls.pathIsHdf5(path):
f = h5py.File(path, "r")
elif cls.pathIsN5(path):
f = z5py.N5File(path) # FIXME
try:
f = z5py.N5File(path)
except AttributeError as e:
# z5py.file doesn't check metadata cleanly:
# `metadata.get('n5')` raises AttributeError: 'NoneType' object has no attribute 'get'
raise ValueError(f'N5 metadata at "{path}" has incompatible format') from e
elif cls.pathIsZarr(path):
f = z5py.ZarrFile(path)
else:
raise ValueError(f"{path} is not an 'n5' or 'h5' file")
internal_paths |= set(globH5N5(f, glob_str))
Expand All @@ -317,9 +324,13 @@ def pathIsNpz(cls, path: Path) -> bool:
def pathIsN5(cls, path: Path) -> bool:
return PathComponents(Path(path).as_posix()).extension in [".n5"]

@classmethod
def pathIsZarr(cls, path: Path) -> bool:
    """True when the external path component of *path* carries a ".zarr" extension."""
    ext = PathComponents(Path(path).as_posix()).extension
    return ext == ".zarr"

@classmethod
def fileHasInternalPaths(cls, path: str) -> bool:
    """True when *path* points at a container format that holds datasets under internal paths."""
    container_checks = (cls.pathIsHdf5, cls.pathIsN5, cls.pathIsNpz, cls.pathIsZarr)
    return any(check(path) for check in container_checks)

@classmethod
def getPossibleInternalPathsFor(cls, file_path: Path, min_ndim=2, max_ndim=5) -> List[str]:
Expand All @@ -335,6 +346,9 @@ def accumulateInternalPaths(name, val):
elif cls.pathIsN5(file_path):
with z5py.N5File(file_path, mode="r+") as f:
f.visititems(accumulateInternalPaths)
elif cls.pathIsZarr(file_path):
with z5py.ZarrFile(file_path, mode="r+") as f:
f.visititems(accumulateInternalPaths)

return datasetNames

Expand Down Expand Up @@ -689,8 +703,11 @@ def isNpz(self) -> bool:
def isN5(self) -> bool:
    """True if at least one of the external paths is an N5 store."""
    for external_path in self.external_paths:
        if self.pathIsN5(external_path):
            return True
    return False

def isZarr(self) -> bool:
    """True if at least one of the external paths is a Zarr store."""
    for external_path in self.external_paths:
        if self.pathIsZarr(external_path):
            return True
    return False

def is_hierarchical(self):
    """True for container formats (h5/npz/n5/zarr) that hold datasets under internal paths."""
    return any(check() for check in (self.isHdf5, self.isNpz, self.isN5, self.isZarr))

def is_in_filesystem(self) -> bool:
return True
Expand Down
15 changes: 15 additions & 0 deletions ilastik/experimental/api/_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,21 @@ class PixelClassificationPipeline:
prob_maps = pipeline.get_probabilities(img)
```
Example using an OME-Zarr dataset stored remotely:
```Python
import xarray
from ilastik.experimental.api import PixelClassificationPipeline
from lazyflow.utility.io_util.OMEZarrStore import OMEZarrStore
store = OMEZarrStore("<https://example.com/data.zarr>")
zarray = store.get_zarr_array("<scale name>")
dims = tuple(store.axistags.keys())
img = xarray.DataArray(zarray, dims=dims) # downloads the entire image
pipeline = PixelClassificationPipeline.from_ilp_file("<path/to/project.ilp>")
prob_maps = pipeline.get_probabilities(img)
```
"""

@classmethod
Expand Down
4 changes: 3 additions & 1 deletion ilastik/widgets/ImageFileDialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def getSelectedPaths(self) -> List[Path]:
filePaths = []
for selected_file in self.selectedFiles():
path = Path(selected_file)
if path.name.lower() == "attributes.json" and any(p.suffix.lower() == ".n5" for p in path.parents):
if (path.name.lower() == "attributes.json" and any(p.suffix.lower() == ".n5" for p in path.parents)) or (
path.name.lower() == ".zgroup" and any(p.suffix.lower() == ".zarr" for p in path.parents)
):
# For n5 the attributes.json file (and for zarr the .zgroup file) has to be selected in the file dialog.
# However we need just the *.n5 / *.zarr directory-file.
filePaths.append(path.parent)
Expand Down
8 changes: 7 additions & 1 deletion ilastik/widgets/hdf5SubvolumeSelectionDialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
QTextEdit,
QVBoxLayout,
QWidget,
QCheckBox,
)

from PyQt5.QtCore import Qt
Expand All @@ -40,7 +41,7 @@ class SubvolumeSelectionDlg(QDialog):
A window to ask the user to choose between multiple HDF5 datasets in a single file.
"""

def __init__(self, datasetNames, parent):
def __init__(self, datasetNames, parent, offer_remember_dataset=False):
super().__init__(parent)
label = QLabel(
"Your HDF5/N5 File contains multiple image volumes.\nPlease select the one you would like to open."
Expand All @@ -50,6 +51,9 @@ def __init__(self, datasetNames, parent):
for name in datasetNames:
self.combo.addItem(name)

if offer_remember_dataset:
self.checkbox = QCheckBox("Always use this dataset in this file (until next ilastik restart)")

buttonbox = QDialogButtonBox(Qt.Horizontal, parent=self)
buttonbox.setStandardButtons(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
buttonbox.accepted.connect(self.accept)
Expand All @@ -58,6 +62,8 @@ def __init__(self, datasetNames, parent):
layout = QVBoxLayout()
layout.addWidget(label)
layout.addWidget(self.combo)
if offer_remember_dataset:
layout.addWidget(self.checkbox)
layout.addWidget(buttonbox)

self.setLayout(layout)
Expand Down
47 changes: 30 additions & 17 deletions lazyflow/operators/ioOperators/opInputDataReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,11 @@ class OpInputDataReader(Operator):
category = "Input"

videoExts = ["ufmf", "mmf"]
h5_n5_Exts = ["h5", "hdf5", "ilp", "n5"]
n5Selection = ["json"] # n5 stores data in a directory, containing a json-file which we use to select the n5-file
h5_n5_Exts = ["h5", "hdf5", "ilp", "n5", "zarr"]
n5Selection = [
"json",
"zgroup",
]  # n5/zarr store data in a directory; the json/zgroup file inside is used to select the store
klbExts = ["klb"]
npyExts = ["npy"]
npzExts = ["npz"]
Expand Down Expand Up @@ -137,6 +140,7 @@ def __init__(
super(OpInputDataReader, self).__init__(*args, **kwargs)
self.internalOperators = []
self.internalOutput = None
self.opInjector = None
self._file = None

self.WorkingDirectory.setOrConnectIfAvailable(WorkingDirectory)
Expand All @@ -151,7 +155,9 @@ def cleanUp(self):

def internalCleanup(self):
    """Disconnect the output and release the injector operator and any open file handle.

    Safe to call repeatedly: both resources are None-guarded and reset to None
    after cleanup.
    """
    self.Output.disconnect()
    if self.opInjector:
        self.opInjector.cleanUp()
        self.opInjector = None
    if self._file is not None:
        self._file.close()
        self._file = None
Expand Down Expand Up @@ -186,7 +192,7 @@ def setupOutputs(self):
self._attemptOpenAsKlb,
self._attemptOpenAsUfmf,
self._attemptOpenAsMmf,
self._attemptOpenAsOmeZarrMultiscale,
self._attemptOpenAsOmeZarrUri,
self._attemptOpenAsRESTfulPrecomputedChunkedVolume,
self._attemptOpenAsDvidVolume,
self._attemptOpenAsH5N5Stack,
Expand Down Expand Up @@ -288,16 +294,22 @@ def _attemptOpenAsUfmf(self, filePath):
else:
return ([], None)

def _attemptOpenAsOmeZarrUri(self, filePath):
    """Try to open *filePath* as an OME-Zarr URI.

    Returns ([reader], reader.Output) on success, or ([], None) when the path
    is not a http/file URI ending in .zarr.
    """
    # Local file system paths with .zarr are handled in _attemptOpenAsH5N5
    path = PathComponents(filePath)
    if path.extension != ".zarr":
        return ([], None)
    if not (filePath.startswith("http") or filePath.startswith("file")):
        return ([], None)
    # DatasetInfo instantiates a standalone OpInputDataReader to obtain laneShape and dtype.
    # We pass this down to the loader so that it can avoid loading scale metadata unnecessarily.
    reader = OpOMEZarrMultiscaleReader(parent=self, metadata_only_mode=self.parent is None)
    if path.internalPath and self.parent:
        # Headless/batch: the internal path selects the scale directly
        reader.Scale.setValue(path.internalPath.lstrip("/"))
    else:
        reader.Scale.connect(self.ActiveScale)
    reader.BaseUri.setValue(path.externalPath)
    return [reader], reader.Output

def _attemptOpenAsRESTfulPrecomputedChunkedVolume(self, filePath):
Expand Down Expand Up @@ -408,22 +420,23 @@ def _attemptOpenAsH5N5(self, filePath):
"No internal path provided for dataset in file: {}".format(externalPath)
)
raise OpInputDataReader.DatasetReadError(msg)
try:
compression_setting = h5N5File[internalPath].compression
except Exception as e:
h5N5File.close()
msg = "Error reading H5/N5 File: {}\n{}".format(externalPath, e)
raise OpInputDataReader.DatasetReadError(msg) from e

# If the h5 dataset is compressed, we'll have better performance
# with a multi-process hdf5 access object.
# (Otherwise, single-process is faster.)
allow_multiprocess_hdf5 = (
"LAZYFLOW_MULTIPROCESS_HDF5" in os.environ and os.environ["LAZYFLOW_MULTIPROCESS_HDF5"] != ""
)
if compression_setting is not None and allow_multiprocess_hdf5 and isinstance(h5N5File, h5py.File):
h5N5File.close()
h5N5File = MultiProcessHdf5File(externalPath, "r")
if isinstance(h5N5File, h5py.File) and allow_multiprocess_hdf5:
try:
compression_setting = h5N5File[internalPath].compression
except Exception as e:
h5N5File.close()
msg = "Error reading H5/N5 File: {}\n{}".format(externalPath, e)
raise OpInputDataReader.DatasetReadError(msg) from e
if compression_setting is not None:
h5N5File.close()
h5N5File = MultiProcessHdf5File(externalPath, "r")

self._file = h5N5File

Expand Down
71 changes: 50 additions & 21 deletions lazyflow/operators/ioOperators/opStreamingH5N5Reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,65 @@
# http://ilastik.org/license/
###############################################################################
# Python
import contextlib
import logging
import time
import numpy
from typing import Union

import vigra
import h5py
import z5py
import json
import os
import numpy as np

from lazyflow.graph import Operator, InputSlot, OutputSlot
from lazyflow.utility import Timer
from lazyflow.utility.helpers import get_default_axisordering, bigintprod
from lazyflow.utility.io_util.OMEZarrStore import get_axistags_from_spec as get_ome_zarr_axistags

logger = logging.getLogger(__name__)


def _find_or_infer_axistags(file: Union[h5py.File, z5py.N5File, z5py.ZarrFile], internalPath: str) -> vigra.AxisTags:
    """Determine axis tags for the dataset at *internalPath* inside *file*.

    Resolution order:
      1. ilastik-style "axistags" attribute stored on the dataset itself.
      2. Zarr only: OME-Zarr "multiscales" metadata at the store root;
         raises ValueError if absent (Zarr without OME metadata is unsupported).
      3. h5/n5 only: Neuroglancer-style "axes" attribute on the dataset.
      4. Fallback: axis order inferred from the dataset's shape.
    """
    assert internalPath in file, "Existence of dataset must be checked earlier"
    with contextlib.suppress(KeyError):
        # Look for ilastik-style axistags property.
        axistagsJson = file[internalPath].attrs["axistags"]
        axistags = vigra.AxisTags.fromJSON(axistagsJson)
        axisorder = "".join(tag.key for tag in axistags)
        if "?" not in axisorder:
            return axistags

    if isinstance(file, z5py.ZarrFile):
        # Look for OME-Zarr metadata (found at store root, not in dataset).
        # This branch always returns or raises, so the code below it only ever
        # runs for h5/n5 files -- no isinstance re-check needed there.
        try:
            # OME-Zarr stores with more than one multiscale don't exist in public, but the spec allows it
            multiscale_index = None
            for i, scale in enumerate(file.attrs["multiscales"]):
                if any(d.get("path", "") == internalPath.lstrip("/") for d in scale.get("datasets", [])):
                    multiscale_index = i
                    break  # take the first multiscale that lists this dataset
            if multiscale_index is None:
                raise KeyError("no spec for dataset path")
            return get_ome_zarr_axistags(file.attrs["multiscales"][multiscale_index])
        except KeyError as e:
            msg = (
                f"Could not find axis information according to OME-Zarr standard "
                f"for dataset {internalPath} in {file.filename}. "
                f"Zarr is only supported with OME-format metadata."
            )
            raise ValueError(msg) from e

    with contextlib.suppress(KeyError):
        # Look for metadata at dataset level (Neuroglancer-style N5 ["x", "y", "z"])
        axisorder = "".join(reversed(file[internalPath].attrs["axes"])).lower()
        return vigra.defaultAxistags(axisorder)

    # Infer from shape
    axisorder = get_default_axisordering(file[internalPath].shape)
    logger.info(f"Could not find stored axistags. Inferred {axisorder} from dataset shape.")
    return vigra.defaultAxistags(str(axisorder))


class OpStreamingH5N5Reader(Operator):
"""
The top-level operator for the data selection applet.
Expand All @@ -56,6 +98,7 @@ class OpStreamingH5N5Reader(Operator):

H5EXTS = [".h5", ".hdf5", ".ilp"]
N5EXTS = [".n5"]
ZARREXTS = [".zarr"]

class DatasetReadError(Exception):
def __init__(self, internalPath):
Expand All @@ -76,24 +119,8 @@ def setupOutputs(self):
raise OpStreamingH5N5Reader.DatasetReadError(internalPath)

dataset = self._h5N5File[internalPath]

try:
# Read the axistags property without actually importing the data
# Throws KeyError if 'axistags' can't be found
axistagsJson = self._h5N5File[internalPath].attrs["axistags"]
axistags = vigra.AxisTags.fromJSON(axistagsJson)
axisorder = "".join(tag.key for tag in axistags)
if "?" in axisorder:
raise KeyError("?")
except KeyError:
# No axistags found.
if "axes" in dataset.attrs:
axisorder = "".join(dataset.attrs["axes"][::-1]).lower()
else:
axisorder = get_default_axisordering(dataset.shape)
axistags = vigra.defaultAxistags(str(axisorder))

assert len(axistags) == len(dataset.shape), f"Mismatch between shape {dataset.shape} and axisorder {axisorder}"
axistags = _find_or_infer_axistags(self._h5N5File, internalPath)
assert len(axistags) == len(dataset.shape), f"Mismatch between shape {dataset.shape} and axis tags {axistags}"

# Configure our slot meta-info
self.OutputImage.meta.dtype = dataset.dtype.type
Expand Down Expand Up @@ -161,3 +188,5 @@ def get_h5_n5_file(filepath, mode="a"):
return z5py.N5File(filepath, mode)
elif ext in OpStreamingH5N5Reader.H5EXTS:
return h5py.File(filepath, mode)
elif ext in OpStreamingH5N5Reader.ZARREXTS:
return z5py.ZarrFile(filepath, mode)
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ def checkGlobString(globString):
pathComponents = [PathComponents(p.strip()) for p in pathStrings]
assert len(pathComponents) > 0

if not all(p.extension in OpStreamingH5N5Reader.H5EXTS + OpStreamingH5N5Reader.N5EXTS for p in pathComponents):
known_exts = OpStreamingH5N5Reader.H5EXTS + OpStreamingH5N5Reader.N5EXTS + OpStreamingH5N5Reader.ZARREXTS
if not all(p.extension in known_exts for p in pathComponents):
raise OpStreamingH5N5SequenceReaderM.WrongFileTypeError(globString)

if len(pathComponents) == 1:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,8 @@ def checkGlobString(globString):
pathComponents = [PathComponents(p.strip()) for p in pathStrings]
assert len(pathComponents) > 0

if not all(p.extension in OpStreamingH5N5Reader.H5EXTS + OpStreamingH5N5Reader.N5EXTS for p in pathComponents):
known_exts = OpStreamingH5N5Reader.H5EXTS + OpStreamingH5N5Reader.N5EXTS + OpStreamingH5N5Reader.ZARREXTS
if not all(p.extension in known_exts for p in pathComponents):
raise OpStreamingH5N5SequenceReaderS.WrongFileTypeError(globString)

if len(pathComponents) == 1:
Expand Down
Loading

0 comments on commit 380d831

Please sign in to comment.