diff --git a/python-xarray.changes b/python-xarray.changes
index 800c9d6..abf21b6 100644
--- a/python-xarray.changes
+++ b/python-xarray.changes
@@ -1,3 +1,9 @@
+-------------------------------------------------------------------
+Mon Mar 18 19:47:16 UTC 2024 - Ben Greiner
+
+- Add xarray-pr8797-tokenize.patch
+  * gh#pydata/xarray#8797 fixes gh#pydata/xarray#8788
+
 -------------------------------------------------------------------
 Fri Mar 1 21:04:08 UTC 2024 - Matej Cepl
 
diff --git a/python-xarray.spec b/python-xarray.spec
index 6c94115..d3f58f4 100644
--- a/python-xarray.spec
+++ b/python-xarray.spec
@@ -37,6 +37,8 @@ Source:         https://files.pythonhosted.org/packages/source/x/xarray/xarray-%
 # PATCH-FEATURE-UPSTREAM local_dataset.patch gh#pydata/xarray#5377 mcepl@suse.com
 # fix xr.tutorial.open_dataset to work with the preloaded cache.
 Patch0:         local_dataset.patch
+# PATCH-FIX-UPSTREAM xarray-pr8797-tokenize.patch gh#pydata/xarray#8797 fixes gh#pydata/xarray#8788
+Patch1:         https://github.com/pydata/xarray/pull/8797.patch#/xarray-pr8797-tokenize.patch
 BuildRequires:  %{python_module base >= 3.9}
 BuildRequires:  %{python_module pip}
 BuildRequires:  %{python_module setuptools_scm}
diff --git a/xarray-pr8797-tokenize.patch b/xarray-pr8797-tokenize.patch
new file mode 100644
index 0000000..163c399
--- /dev/null
+++ b/xarray-pr8797-tokenize.patch
@@ -0,0 +1,195 @@
+From 4eb05f0f73c535455f457e650036c86cdfaf4aa2 Mon Sep 17 00:00:00 2001
+From: crusaderky
+Date: Thu, 29 Feb 2024 12:21:18 +0000
+Subject: [PATCH] tokenize() should ignore difference between None and {} attrs
+
+---
+ xarray/core/dataarray.py    |  2 +-
+ xarray/core/dataset.py      |  8 ++++----
+ xarray/core/variable.py     |  6 ++++--
+ xarray/namedarray/core.py   |  7 +++----
+ xarray/namedarray/utils.py  |  4 ++--
+ xarray/tests/test_dask.py   | 35 ++++++++++++++++++++++++-----------
+ xarray/tests/test_sparse.py |  4 ----
+ 7 files changed, 38 insertions(+), 28 deletions(-)
+
+diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
+index c00fe1a9e6..aeb6b2217c 100644
+--- a/xarray/core/dataarray.py
++++ b/xarray/core/dataarray.py
+@@ -1070,7 +1070,7 @@ def reset_coords(
+         dataset[self.name] = self.variable
+         return dataset
+ 
+-    def __dask_tokenize__(self):
++    def __dask_tokenize__(self) -> object:
+         from dask.base import normalize_token
+ 
+         return normalize_token((type(self), self._variable, self._coords, self._name))
+diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
+index 884e302b8b..e1fd9e025f 100644
+--- a/xarray/core/dataset.py
++++ b/xarray/core/dataset.py
+@@ -694,7 +694,7 @@ def __init__(
+             data_vars, coords
+         )
+ 
+-        self._attrs = dict(attrs) if attrs is not None else None
++        self._attrs = dict(attrs) if attrs else None
+         self._close = None
+         self._encoding = None
+         self._variables = variables
+@@ -739,7 +739,7 @@ def attrs(self) -> dict[Any, Any]:
+ 
+     @attrs.setter
+     def attrs(self, value: Mapping[Any, Any]) -> None:
+-        self._attrs = dict(value)
++        self._attrs = dict(value) if value else None
+ 
+     @property
+     def encoding(self) -> dict[Any, Any]:
+@@ -856,11 +856,11 @@ def load(self, **kwargs) -> Self:
+ 
+         return self
+ 
+-    def __dask_tokenize__(self):
++    def __dask_tokenize__(self) -> object:
+         from dask.base import normalize_token
+ 
+         return normalize_token(
+-            (type(self), self._variables, self._coord_names, self._attrs)
++            (type(self), self._variables, self._coord_names, self._attrs or None)
+         )
+ 
+     def __dask_graph__(self):
+diff --git a/xarray/core/variable.py b/xarray/core/variable.py
+index cd0c022d70..315c46369b 100644
+--- a/xarray/core/variable.py
++++ b/xarray/core/variable.py
+@@ -2592,11 +2592,13 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
+         if not isinstance(self._data, PandasIndexingAdapter):
+             self._data = PandasIndexingAdapter(self._data)
+ 
+-    def __dask_tokenize__(self):
++    def __dask_tokenize__(self) -> object:
+         from dask.base import normalize_token
+ 
+         # Don't waste time converting pd.Index to np.ndarray
+-        return normalize_token((type(self), self._dims, self._data.array, self._attrs))
++        return normalize_token(
++            (type(self), self._dims, self._data.array, self._attrs or None)
++        )
+ 
+     def load(self):
+         # data is already loaded into memory for IndexVariable
+diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py
+index 2972269043..fd209bc273 100644
+--- a/xarray/namedarray/core.py
++++ b/xarray/namedarray/core.py
+@@ -511,7 +511,7 @@ def attrs(self) -> dict[Any, Any]:
+ 
+     @attrs.setter
+     def attrs(self, value: Mapping[Any, Any]) -> None:
+-        self._attrs = dict(value)
++        self._attrs = dict(value) if value else None
+ 
+     def _check_shape(self, new_data: duckarray[Any, _DType_co]) -> None:
+         if new_data.shape != self.shape:
+@@ -570,13 +570,12 @@ def real(
+             return real(self)
+         return self._new(data=self._data.real)
+ 
+-    def __dask_tokenize__(self) -> Hashable:
++    def __dask_tokenize__(self) -> object:
+         # Use v.data, instead of v._data, in order to cope with the wrappers
+         # around NetCDF and the like
+         from dask.base import normalize_token
+ 
+-        s, d, a, attrs = type(self), self._dims, self.data, self.attrs
+-        return normalize_token((s, d, a, attrs))  # type: ignore[no-any-return]
++        return normalize_token((type(self), self._dims, self.data, self._attrs or None))
+ 
+     def __dask_graph__(self) -> Graph | None:
+         if is_duck_dask_array(self._data):
+diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py
+index 0326a6173c..b82a80b546 100644
+--- a/xarray/namedarray/utils.py
++++ b/xarray/namedarray/utils.py
+@@ -218,7 +218,7 @@ def __eq__(self, other: ReprObject | Any) -> bool:
+     def __hash__(self) -> int:
+         return hash((type(self), self._value))
+ 
+-    def __dask_tokenize__(self) -> Hashable:
++    def __dask_tokenize__(self) -> object:
+         from dask.base import normalize_token
+ 
+-        return normalize_token((type(self), self._value))  # type: ignore[no-any-return]
++        return normalize_token((type(self), self._value))
+diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
+index 07bf773cc8..517fc0c2d6 100644
+--- a/xarray/tests/test_dask.py
++++ b/xarray/tests/test_dask.py
+@@ -299,17 +299,6 @@ def test_persist(self):
+         self.assertLazyAndAllClose(u + 1, v)
+         self.assertLazyAndAllClose(u + 1, v2)
+ 
+-    def test_tokenize_empty_attrs(self) -> None:
+-        # Issue #6970
+-        assert self.eager_var._attrs is None
+-        expected = dask.base.tokenize(self.eager_var)
+-        assert self.eager_var.attrs == self.eager_var._attrs == {}
+-        assert (
+-            expected
+-            == dask.base.tokenize(self.eager_var)
+-            == dask.base.tokenize(self.lazy_var.compute())
+-        )
+-
+     @requires_pint
+     def test_tokenize_duck_dask_array(self):
+         import pint
+@@ -1573,6 +1562,30 @@ def test_token_identical(obj, transform):
+     )
+ 
+ 
++@pytest.mark.parametrize(
++    "obj",
++    [
++        make_ds(),  # Dataset
++        make_ds().variables["c2"],  # Variable
++        make_ds().variables["x"],  # IndexVariable
++    ],
++)
++def test_tokenize_empty_attrs(obj):
++    """Issues #6970 and #8788"""
++    obj.attrs = {}
++    assert obj._attrs is None
++    a = dask.base.tokenize(obj)
++
++    assert obj.attrs == {}
++    assert obj._attrs == {}  # attrs getter changed None to dict
++    b = dask.base.tokenize(obj)
++    assert a == b
++
++    obj2 = obj.copy()
++    c = dask.base.tokenize(obj2)
++    assert a == c
++
++
+ def test_recursive_token():
+     """Test that tokenization is invoked recursively, and doesn't just rely on the
+     output of str()
+diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py
+index 289149bdd6..09c1281875 100644
+--- a/xarray/tests/test_sparse.py
++++ b/xarray/tests/test_sparse.py
+@@ -878,10 +878,6 @@ def test_dask_token():
+     import dask
+ 
+     s = sparse.COO.from_numpy(np.array([0, 0, 1, 2]))
+-
+-    # https://github.com/pydata/sparse/issues/300
+-    s.__dask_tokenize__ = lambda: dask.base.normalize_token(s.__dict__)
+-
+     a = DataArray(s)
+     t1 = dask.base.tokenize(a)
+     t2 = dask.base.tokenize(a)
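
A minimal illustration (not part of the package diff above) of the bug that
xarray-pr8797-tokenize.patch addresses, per gh#pydata/xarray#8788: before the
fix, __dask_tokenize__ used the internal _attrs directly, so an object whose
attrs had never been touched (_attrs is None) tokenized differently from the
same object after the attrs getter materialized an empty dict. Sketch only;
the dataset contents are arbitrary:

    import dask.base
    import xarray as xr

    ds = xr.Dataset({"v": ("t", [1, 2, 3])})  # fresh object: ds._attrs is None
    t1 = dask.base.tokenize(ds)
    _ = ds.attrs           # getter replaces the internal None with {}
    t2 = dask.base.tokenize(ds)
    assert t1 == t2        # failed before the patch, passes with it applied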