Skip to content

Commit

Permalink
Merge branch 'main' into encode_cf_datetime-refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
kmuehlbauer authored Feb 25, 2025
2 parents 1d2d035 + 2475d49 commit 76de275
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 10 deletions.
8 changes: 6 additions & 2 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,13 @@ Breaking changes
with xarray's handling of standard integers, and interfered with encoding
times with small integer dtypes and missing values (:pull:`9498`). By
`Spencer Clark <https://github.com/spencerkclark>`_.

- Warn instead of raise if phony_dims are detected when using h5netcdf-backend and ``phony_dims=None`` (:issue:`10049`, :pull:`10058`)
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Deprecations
~~~~~~~~~~~~

- Move from phony_dims=None to phony_dims="access" for h5netcdf-backend (:issue:`10049`, :pull:`10058`)
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Bug fixes
~~~~~~~~~
Expand All @@ -69,6 +71,8 @@ Bug fixes
integer dtype is specified with units that do not allow for an exact round
trip (:pull:`9498`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Fix ``isel`` for multi-coordinate Xarray indexes (:issue:`10063`, :pull:`10066`).
By `Benoit Bovy <https://github.com/benbovy>`_.


Documentation
Expand Down
39 changes: 39 additions & 0 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,23 @@ def close(self, **kwargs):
self._manager.close(**kwargs)


def _check_phony_dims(phony_dims):
emit_phony_dims_warning = False
if phony_dims is None:
emit_phony_dims_warning = True
phony_dims = "access"
return emit_phony_dims_warning, phony_dims


def _emit_phony_dims_warning():
    # Transitional deprecation notice for the changed default of the
    # ``phony_dims`` kwarg (None -> "access"). Callers emit this only
    # when phony dimensions are actually present in the opened file.
    emit_user_level_warning(
        "The 'phony_dims' kwarg now defaults to 'access'. "
        "Previously 'phony_dims=None' would raise an error. "
        "For full netcdf equivalence please use phony_dims='sort'.",
        UserWarning,
    )


class H5netcdfBackendEntrypoint(BackendEntrypoint):
"""
Backend for netCDF files based on the h5netcdf package.
Expand Down Expand Up @@ -434,6 +451,10 @@ def open_dataset(
driver_kwds=None,
storage_options: dict[str, Any] | None = None,
) -> Dataset:
# Keep this message for some versions
# remove and set phony_dims="access" above
emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)

filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
filename_or_obj,
Expand All @@ -460,6 +481,13 @@ def open_dataset(
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)

# only warn if phony_dims exist in file
# remove together with the above check
# after some versions
if store.ds._root._phony_dim_count > 0 and emit_phony_dims_warning:
_emit_phony_dims_warning()

return ds

def open_datatree(
Expand Down Expand Up @@ -530,6 +558,10 @@ def open_groups_as_dict(
from xarray.core.treenode import NodePath
from xarray.core.utils import close_on_error

# Keep this message for some versions
# remove and set phony_dims="access" above
emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)

filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
filename_or_obj,
Expand All @@ -542,6 +574,7 @@ def open_groups_as_dict(
driver=driver,
driver_kwds=driver_kwds,
)

# Check for a group and make it a parent if it exists
if group:
parent = NodePath("/") / NodePath(group)
Expand Down Expand Up @@ -571,6 +604,12 @@ def open_groups_as_dict(
group_name = str(NodePath(path_group))
groups_dict[group_name] = group_ds

# only warn if phony_dims exist in file
# remove together with the above check
# after some versions
if store.ds._phony_dim_count > 0 and emit_phony_dims_warning:
_emit_phony_dims_warning()

return groups_dict


Expand Down
19 changes: 11 additions & 8 deletions xarray/core/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def to_pandas_index(self) -> pd.Index:

def isel(
self, indexers: Mapping[Any, int | slice | np.ndarray | Variable]
) -> Self | None:
) -> Index | None:
"""Maybe returns a new index from the current index itself indexed by
positional indexers.
Expand Down Expand Up @@ -304,7 +304,7 @@ def reindex_like(self, other: Self) -> dict[Hashable, Any]:
"""
raise NotImplementedError(f"{self!r} doesn't support re-indexing labels")

def equals(self, other: Self) -> bool:
def equals(self, other: Index) -> bool:
"""Compare this index with another index of the same type.
Implementation is optional but required in order to support alignment.
Expand Down Expand Up @@ -1424,7 +1424,7 @@ def create_variables(

def isel(
self, indexers: Mapping[Any, int | slice | np.ndarray | Variable]
) -> Self | None:
) -> Index | None:
# TODO: support returning a new index (e.g., possible to re-calculate the
# the transform or calculate another transform on a reduced dimension space)
return None
Expand Down Expand Up @@ -1486,7 +1486,9 @@ def sel(

return IndexSelResult(results)

def equals(self, other: Self) -> bool:
def equals(self, other: Index) -> bool:
    """Return True if *other* is a CoordinateTransformIndex whose
    transform compares equal to this index's transform."""
    if isinstance(other, CoordinateTransformIndex):
        return self.transform.equals(other.transform)
    return False

def rename(
Expand Down Expand Up @@ -1989,10 +1991,11 @@ def isel_indexes(
indexes: Indexes[Index],
indexers: Mapping[Any, Any],
) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]:
# TODO: remove if clause in the future. It should be unnecessary.
# See failure introduced when removed
# https://github.com/pydata/xarray/pull/9002#discussion_r1590443756
if any(isinstance(v, PandasMultiIndex) for v in indexes._indexes.values()):
# Fast path function _apply_indexes_fast does not work with multi-coordinate
# Xarray indexes (see https://github.com/pydata/xarray/issues/10063).
# -> call it only in the most common case where all indexes are default
# PandasIndex each associated to a single 1-dimensional coordinate.
if any(type(idx) is not PandasIndex for idx in indexes._indexes.values()):
return _apply_indexes(indexes, indexers, "isel")
else:
return _apply_indexes_fast(indexes, indexers, "isel")
Expand Down
22 changes: 22 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4170,6 +4170,28 @@ def test_roundtrip_complex(self):
with self.roundtrip(expected) as actual:
assert_equal(expected, actual)

def test_phony_dims_warning(self) -> None:
import h5py

foo_data = np.arange(125).reshape(5, 5, 5)
bar_data = np.arange(625).reshape(25, 5, 5)
var = {"foo1": foo_data, "foo2": bar_data, "foo3": foo_data, "foo4": bar_data}
with create_tmp_file() as tmp_file:
with h5py.File(tmp_file, "w") as f:
grps = ["bar", "baz"]
for grp in grps:
fx = f.create_group(grp)
for k, v in var.items():
fx.create_dataset(k, data=v)
with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
with xr.open_dataset(tmp_file, engine="h5netcdf", group="bar") as ds:
assert ds.dims == {

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.
"phony_dim_0": 5,
"phony_dim_1": 5,
"phony_dim_2": 5,
"phony_dim_3": 25,
}


@requires_h5netcdf
@requires_netCDF4
Expand Down
23 changes: 23 additions & 0 deletions xarray/tests/test_backends_datatree.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,29 @@ def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None:
class TestH5NetCDFDatatreeIO(DatatreeIOBase):
engine: T_DataTreeNetcdfEngine | None = "h5netcdf"

def test_phony_dims_warning(self, tmpdir) -> None:
    """Opening a raw HDF5 file (no dimension scales) as a datatree should
    emit the transitional phony_dims warning and fall back to phony dims."""
    import h5py

    filepath = tmpdir + "/phony_dims.nc"
    small = np.arange(125).reshape(5, 5, 5)
    large = np.arange(625).reshape(25, 5, 5)
    datasets = {"foo1": small, "foo2": large, "foo3": small, "foo4": large}
    # Build a plain h5py file: two groups, four datasets each, no netCDF
    # dimension metadata -> h5netcdf must invent phony dimensions.
    with h5py.File(filepath, "w") as f:
        for group_name in ["bar", "baz"]:
            grp = f.create_group(group_name)
            for name, data in datasets.items():
                grp.create_dataset(name, data=data)

    with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
        with open_datatree(filepath, engine=self.engine) as tree:
            assert tree.bar.dims == {
                "phony_dim_0": 5,
                "phony_dim_1": 5,
                "phony_dim_2": 5,
                "phony_dim_3": 25,
            }


@pytest.mark.skipif(
have_zarr_v3, reason="datatree support for zarr 3 is not implemented yet"
Expand Down
34 changes: 34 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1593,6 +1593,40 @@ def test_isel_fancy_convert_index_variable(self) -> None:
assert "x" not in actual.xindexes
assert not isinstance(actual.x.variable, IndexVariable)

def test_isel_multicoord_index(self) -> None:
    # regression test https://github.com/pydata/xarray/issues/10063
    # isel on a multi-coordinate index should return a unique index associated
    # to each coordinate
    class MultiCoordIndex(xr.Index):
        # Minimal custom index wrapping two PandasIndex instances, one
        # per coordinate ("x" and "y").
        def __init__(self, idx1, idx2):
            self.idx1 = idx1
            self.idx2 = idx2

        @classmethod
        def from_variables(cls, variables, *, options=None):
            return cls(
                PandasIndex.from_variables({"x": variables["x"]}, options=options),
                PandasIndex.from_variables({"y": variables["y"]}, options=options),
            )

        def create_variables(self, variables=None):
            new_vars = dict(self.idx1.create_variables())
            new_vars.update(self.idx2.create_variables())
            return new_vars

        def isel(self, indexers):
            return MultiCoordIndex(
                self.idx1.isel({"x": indexers.get("x", slice(None))}),
                self.idx2.isel({"y": indexers.get("y", slice(None))}),
            )

    coords = xr.Coordinates(coords={"x": [0, 1], "y": [1, 2]}, indexes={})
    ds = xr.Dataset(coords=coords).set_xindex(["x", "y"], MultiCoordIndex)

    # Both coordinates must keep sharing a single index object after isel.
    ds2 = ds.isel(x=slice(None), y=slice(None))
    assert ds2.xindexes["x"] is ds2.xindexes["y"]

def test_sel(self) -> None:
data = create_test_data()
int_slicers = {"dim1": slice(None, None, 2), "dim2": slice(2), "dim3": slice(3)}
Expand Down

0 comments on commit 76de275

Please sign in to comment.