Skip to content

Commit

Permalink
Merge branch 'main' into encode_cf_datetime-refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
kmuehlbauer authored Feb 25, 2025
2 parents 1d2d035 + 2475d49 commit 76de275
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 10 deletions.
8 changes: 6 additions & 2 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,13 @@ Breaking changes
with xarray's handling of standard integers, and interfered with encoding
times with small integer dtypes and missing values (:pull:`9498`). By
`Spencer Clark <https://github.com/spencerkclark>`_.

- Warn instead of raise if phony_dims are detected when using h5netcdf-backend and ``phony_dims=None`` (:issue:`10049`, :pull:`10058`)
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Deprecations
~~~~~~~~~~~~

- Move from phony_dims=None to phony_dims="access" for h5netcdf-backend (:issue:`10049`, :pull:`10058`)
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Bug fixes
~~~~~~~~~
Expand All @@ -69,6 +71,8 @@ Bug fixes
integer dtype is specified with units that do not allow for an exact round
trip (:pull:`9498`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Fix ``isel`` for multi-coordinate Xarray indexes (:issue:`10063`, :pull:`10066`).
By `Benoit Bovy <https://github.com/benbovy>`_.


Documentation
Expand Down
39 changes: 39 additions & 0 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,23 @@ def close(self, **kwargs):
self._manager.close(**kwargs)


def _check_phony_dims(phony_dims):
emit_phony_dims_warning = False
if phony_dims is None:
emit_phony_dims_warning = True
phony_dims = "access"
return emit_phony_dims_warning, phony_dims


def _emit_phony_dims_warning():
    # Transitional deprecation notice for the changed default of the
    # ``phony_dims`` kwarg (None -> "access"). Callers emit this only
    # when phony dimensions are actually present in the opened file.
    emit_user_level_warning(
        "The 'phony_dims' kwarg now defaults to 'access'. "
        "Previously 'phony_dims=None' would raise an error. "
        "For full netcdf equivalence please use phony_dims='sort'.",
        UserWarning,
    )


class H5netcdfBackendEntrypoint(BackendEntrypoint):
"""
Backend for netCDF files based on the h5netcdf package.
Expand Down Expand Up @@ -434,6 +451,10 @@ def open_dataset(
driver_kwds=None,
storage_options: dict[str, Any] | None = None,
) -> Dataset:
# Keep this message for some versions
# remove and set phony_dims="access" above
emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)

filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
filename_or_obj,
Expand All @@ -460,6 +481,13 @@ def open_dataset(
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)

# only warn if phony_dims exist in file
# remove together with the above check
# after some versions
if store.ds._root._phony_dim_count > 0 and emit_phony_dims_warning:
_emit_phony_dims_warning()

return ds

def open_datatree(
Expand Down Expand Up @@ -530,6 +558,10 @@ def open_groups_as_dict(
from xarray.core.treenode import NodePath
from xarray.core.utils import close_on_error

# Keep this message for some versions
# remove and set phony_dims="access" above
emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)

filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
filename_or_obj,
Expand All @@ -542,6 +574,7 @@ def open_groups_as_dict(
driver=driver,
driver_kwds=driver_kwds,
)

# Check for a group and make it a parent if it exists
if group:
parent = NodePath("/") / NodePath(group)
Expand Down Expand Up @@ -571,6 +604,12 @@ def open_groups_as_dict(
group_name = str(NodePath(path_group))
groups_dict[group_name] = group_ds

# only warn if phony_dims exist in file
# remove together with the above check
# after some versions
if store.ds._phony_dim_count > 0 and emit_phony_dims_warning:
_emit_phony_dims_warning()

return groups_dict


Expand Down
19 changes: 11 additions & 8 deletions xarray/core/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def to_pandas_index(self) -> pd.Index:

def isel(
self, indexers: Mapping[Any, int | slice | np.ndarray | Variable]
) -> Self | None:
) -> Index | None:
"""Maybe returns a new index from the current index itself indexed by
positional indexers.
Expand Down Expand Up @@ -304,7 +304,7 @@ def reindex_like(self, other: Self) -> dict[Hashable, Any]:
"""
raise NotImplementedError(f"{self!r} doesn't support re-indexing labels")

def equals(self, other: Self) -> bool:
def equals(self, other: Index) -> bool:
"""Compare this index with another index of the same type.
Implementation is optional but required in order to support alignment.
Expand Down Expand Up @@ -1424,7 +1424,7 @@ def create_variables(

def isel(
self, indexers: Mapping[Any, int | slice | np.ndarray | Variable]
) -> Self | None:
) -> Index | None:
# TODO: support returning a new index (e.g., possible to re-calculate the
# the transform or calculate another transform on a reduced dimension space)
return None
Expand Down Expand Up @@ -1486,7 +1486,9 @@ def sel(

return IndexSelResult(results)

def equals(self, other: Self) -> bool:
def equals(self, other: Index) -> bool:
    """Return True if *other* is a CoordinateTransformIndex whose
    transform compares equal to this index's transform."""
    if isinstance(other, CoordinateTransformIndex):
        return self.transform.equals(other.transform)
    return False

def rename(
Expand Down Expand Up @@ -1989,10 +1991,11 @@ def isel_indexes(
indexes: Indexes[Index],
indexers: Mapping[Any, Any],
) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]:
# TODO: remove if clause in the future. It should be unnecessary.
# See failure introduced when removed
# https://github.com/pydata/xarray/pull/9002#discussion_r1590443756
if any(isinstance(v, PandasMultiIndex) for v in indexes._indexes.values()):
# Fast path function _apply_indexes_fast does not work with multi-coordinate
# Xarray indexes (see https://github.com/pydata/xarray/issues/10063).
# -> call it only in the most common case where all indexes are default
# PandasIndex each associated to a single 1-dimensional coordinate.
if any(type(idx) is not PandasIndex for idx in indexes._indexes.values()):
return _apply_indexes(indexes, indexers, "isel")
else:
return _apply_indexes_fast(indexes, indexers, "isel")
Expand Down
22 changes: 22 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4170,6 +4170,28 @@ def test_roundtrip_complex(self):
with self.roundtrip(expected) as actual:
assert_equal(expected, actual)

def test_phony_dims_warning(self) -> None:
import h5py

foo_data = np.arange(125).reshape(5, 5, 5)
bar_data = np.arange(625).reshape(25, 5, 5)
var = {"foo1": foo_data, "foo2": bar_data, "foo3": foo_data, "foo4": bar_data}
with create_tmp_file() as tmp_file:
with h5py.File(tmp_file, "w") as f:
grps = ["bar", "baz"]
for grp in grps:
fx = f.create_group(grp)
for k, v in var.items():
fx.create_dataset(k, data=v)
with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
with xr.open_dataset(tmp_file, engine="h5netcdf", group="bar") as ds:
assert ds.dims == {

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.
"phony_dim_0": 5,
"phony_dim_1": 5,
"phony_dim_2": 5,
"phony_dim_3": 25,
}


@requires_h5netcdf
@requires_netCDF4
Expand Down
23 changes: 23 additions & 0 deletions xarray/tests/test_backends_datatree.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,29 @@ def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None:
class TestH5NetCDFDatatreeIO(DatatreeIOBase):
engine: T_DataTreeNetcdfEngine | None = "h5netcdf"

def test_phony_dims_warning(self, tmpdir) -> None:
    """Opening a raw HDF5 file (no dimension scales) as a datatree should
    emit the transitional phony_dims warning and fall back to phony dims."""
    import h5py

    filepath = tmpdir + "/phony_dims.nc"
    small = np.arange(125).reshape(5, 5, 5)
    large = np.arange(625).reshape(25, 5, 5)
    datasets = {"foo1": small, "foo2": large, "foo3": small, "foo4": large}
    # Build a plain h5py file: two groups, four datasets each, no netCDF
    # dimension metadata -> h5netcdf must invent phony dimensions.
    with h5py.File(filepath, "w") as f:
        for group_name in ["bar", "baz"]:
            grp = f.create_group(group_name)
            for name, data in datasets.items():
                grp.create_dataset(name, data=data)

    with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
        with open_datatree(filepath, engine=self.engine) as tree:
            assert tree.bar.dims == {
                "phony_dim_0": 5,
                "phony_dim_1": 5,
                "phony_dim_2": 5,
                "phony_dim_3": 25,
            }


@pytest.mark.skipif(
have_zarr_v3, reason="datatree support for zarr 3 is not implemented yet"
Expand Down
34 changes: 34 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1593,6 +1593,40 @@ def test_isel_fancy_convert_index_variable(self) -> None:
assert "x" not in actual.xindexes
assert not isinstance(actual.x.variable, IndexVariable)

def test_isel_multicoord_index(self) -> None:
    # regression test https://github.com/pydata/xarray/issues/10063
    # isel on a multi-coordinate index should return a unique index associated
    # to each coordinate
    class MultiCoordIndex(xr.Index):
        # Minimal custom index wrapping two PandasIndex instances, one
        # per coordinate ("x" and "y").
        def __init__(self, idx1, idx2):
            self.idx1 = idx1
            self.idx2 = idx2

        @classmethod
        def from_variables(cls, variables, *, options=None):
            return cls(
                PandasIndex.from_variables({"x": variables["x"]}, options=options),
                PandasIndex.from_variables({"y": variables["y"]}, options=options),
            )

        def create_variables(self, variables=None):
            new_vars = dict(self.idx1.create_variables())
            new_vars.update(self.idx2.create_variables())
            return new_vars

        def isel(self, indexers):
            return MultiCoordIndex(
                self.idx1.isel({"x": indexers.get("x", slice(None))}),
                self.idx2.isel({"y": indexers.get("y", slice(None))}),
            )

    coords = xr.Coordinates(coords={"x": [0, 1], "y": [1, 2]}, indexes={})
    ds = xr.Dataset(coords=coords).set_xindex(["x", "y"], MultiCoordIndex)

    # Both coordinates must keep sharing a single index object after isel.
    ds2 = ds.isel(x=slice(None), y=slice(None))
    assert ds2.xindexes["x"] is ds2.xindexes["y"]

def test_sel(self) -> None:
data = create_test_data()
int_slicers = {"dim1": slice(None, None, 2), "dim2": slice(2), "dim3": slice(3)}
Expand Down

0 comments on commit 76de275

Please sign in to comment.