OBS-URL: https://build.opensuse.org/package/show/devel:languages:python:numeric/python-pandas?expand=0&rev=144
317 lines
13 KiB
Diff
317 lines
13 KiB
Diff
From 1e899afbd9ca20f4ce9d6f93e1f62c072be0ed23 Mon Sep 17 00:00:00 2001
|
|
From: Gen Sato <52241300+halogen22@users.noreply.github.com>
|
|
Date: Tue, 18 Mar 2025 01:33:40 +0900
|
|
Subject: [PATCH] BUG: .mode(dropna=False) doesn't work with nullable integers
|
|
(#61132)
|
|
|
|
* Fix dropna bug when mode
|
|
|
|
* Fix test cases
|
|
|
|
* Fix data type incompatible
|
|
---
|
|
doc/source/whatsnew/v3.0.0.rst | 1 +
|
|
pandas/_libs/hashtable_func_helper.pxi.in | 2 +-
|
|
pandas/core/algorithms.py | 12 +++---
|
|
pandas/core/arrays/base.py | 5 ++-
|
|
pandas/core/arrays/categorical.py | 2 +-
|
|
pandas/core/arrays/datetimelike.py | 2 +-
|
|
pandas/core/arrays/masked.py | 8 +---
|
|
pandas/core/series.py | 2 +-
|
|
pandas/tests/series/test_reductions.py | 23 +++++++++++
|
|
pandas/tests/test_algos.py | 47 +++++++++++++++--------
|
|
10 files changed, 71 insertions(+), 33 deletions(-)
|
|
|
|
Index: pandas-2.2.3/pandas/_libs/hashtable_func_helper.pxi.in
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/_libs/hashtable_func_helper.pxi.in
|
|
+++ pandas-2.2.3/pandas/_libs/hashtable_func_helper.pxi.in
|
|
@@ -443,7 +443,7 @@ def mode(ndarray[htfunc_t] values, bint
|
|
|
|
if na_counter > 0:
|
|
res_mask = np.zeros(j+1, dtype=np.bool_)
|
|
- res_mask[j] = True
|
|
+ res_mask[j] = (na_counter == max_count)
|
|
return modes[:j + 1], res_mask
|
|
|
|
|
|
Index: pandas-2.2.3/pandas/core/algorithms.py
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/core/algorithms.py
|
|
+++ pandas-2.2.3/pandas/core/algorithms.py
|
|
@@ -1022,7 +1022,7 @@ def duplicated(
|
|
|
|
def mode(
|
|
values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
|
|
-) -> ArrayLike:
|
|
+) -> tuple[np.ndarray, npt.NDArray[np.bool_]] | ExtensionArray:
|
|
"""
|
|
Returns the mode(s) of an array.
|
|
|
|
@@ -1035,7 +1035,7 @@ def mode(
|
|
|
|
Returns
|
|
-------
|
|
- np.ndarray or ExtensionArray
|
|
+ Union[Tuple[np.ndarray, npt.NDArray[np.bool_]], ExtensionArray]
|
|
"""
|
|
values = _ensure_arraylike(values, func_name="mode")
|
|
original = values
|
|
@@ -1049,8 +1049,10 @@ def mode(
|
|
values = _ensure_data(values)
|
|
|
|
npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask)
|
|
- if res_mask is not None:
|
|
- return npresult, res_mask # type: ignore[return-value]
|
|
+ if res_mask is None:
|
|
+ res_mask = np.zeros(npresult.shape, dtype=np.bool_)
|
|
+ else:
|
|
+ return npresult, res_mask
|
|
|
|
try:
|
|
npresult = np.sort(npresult)
|
|
@@ -1061,7 +1063,7 @@ def mode(
|
|
)
|
|
|
|
result = _reconstruct_data(npresult, original.dtype, original)
|
|
- return result
|
|
+ return result, res_mask
|
|
|
|
|
|
def rank(
|
|
Index: pandas-2.2.3/pandas/core/arrays/base.py
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/core/arrays/base.py
|
|
+++ pandas-2.2.3/pandas/core/arrays/base.py
|
|
@@ -2270,8 +2270,9 @@ class ExtensionArray:
|
|
Sorted, if possible.
|
|
"""
|
|
# error: Incompatible return value type (got "Union[ExtensionArray,
|
|
- # ndarray[Any, Any]]", expected "Self")
|
|
- return mode(self, dropna=dropna) # type: ignore[return-value]
|
|
+ # Tuple[np.ndarray, npt.NDArray[np.bool_]]", expected "Self")
|
|
+ result, _ = mode(self, dropna=dropna)
|
|
+ return result # type: ignore[return-value]
|
|
|
|
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
|
|
if any(
|
|
Index: pandas-2.2.3/pandas/core/arrays/categorical.py
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/core/arrays/categorical.py
|
|
+++ pandas-2.2.3/pandas/core/arrays/categorical.py
|
|
@@ -2435,7 +2435,7 @@ class Categorical(NDArrayBackedExtension
|
|
if dropna:
|
|
mask = self.isna()
|
|
|
|
- res_codes = algorithms.mode(codes, mask=mask)
|
|
+ res_codes, _ = algorithms.mode(codes, mask=mask)
|
|
res_codes = cast(np.ndarray, res_codes)
|
|
assert res_codes.dtype == codes.dtype
|
|
res = self._from_backing_data(res_codes)
|
|
Index: pandas-2.2.3/pandas/core/arrays/datetimelike.py
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/core/arrays/datetimelike.py
|
|
+++ pandas-2.2.3/pandas/core/arrays/datetimelike.py
|
|
@@ -1646,7 +1646,7 @@ class DatetimeLikeArrayMixin( # type: i
|
|
if dropna:
|
|
mask = self.isna()
|
|
|
|
- i8modes = algorithms.mode(self.view("i8"), mask=mask)
|
|
+ i8modes, _ = algorithms.mode(self.view("i8"), mask=mask)
|
|
npmodes = i8modes.view(self._ndarray.dtype)
|
|
npmodes = cast(np.ndarray, npmodes)
|
|
return self._from_backing_data(npmodes)
|
|
Index: pandas-2.2.3/pandas/core/arrays/masked.py
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/core/arrays/masked.py
|
|
+++ pandas-2.2.3/pandas/core/arrays/masked.py
|
|
@@ -1105,12 +1105,8 @@ class BaseMaskedArray(OpsMixin, Extensio
|
|
return Series(arr, index=index, name="count", copy=False)
|
|
|
|
def _mode(self, dropna: bool = True) -> Self:
|
|
- if dropna:
|
|
- result = mode(self._data, dropna=dropna, mask=self._mask)
|
|
- res_mask = np.zeros(result.shape, dtype=np.bool_)
|
|
- else:
|
|
- result, res_mask = mode(self._data, dropna=dropna, mask=self._mask)
|
|
- result = type(self)(result, res_mask) # type: ignore[arg-type]
|
|
+ result, res_mask = mode(self._data, dropna=dropna, mask=self._mask)
|
|
+ result = type(self)(result, res_mask)
|
|
return result[result.argsort()]
|
|
|
|
@doc(ExtensionArray.equals)
|
|
Index: pandas-2.2.3/pandas/core/series.py
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/core/series.py
|
|
+++ pandas-2.2.3/pandas/core/series.py
|
|
@@ -2328,7 +2328,7 @@ class Series(base.IndexOpsMixin, NDFrame
|
|
# TODO: Add option for bins like value_counts()
|
|
values = self._values
|
|
if isinstance(values, np.ndarray):
|
|
- res_values = algorithms.mode(values, dropna=dropna)
|
|
+ res_values, _ = algorithms.mode(values, dropna=dropna)
|
|
else:
|
|
res_values = values._mode(dropna=dropna)
|
|
|
|
Index: pandas-2.2.3/pandas/tests/series/test_reductions.py
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/tests/series/test_reductions.py
|
|
+++ pandas-2.2.3/pandas/tests/series/test_reductions.py
|
|
@@ -51,6 +51,29 @@ def test_mode_nullable_dtype(any_numeric
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
+def test_mode_nullable_dtype_edge_case(any_numeric_ea_dtype):
|
|
+ # GH#58926
|
|
+ ser = Series([1, 2, 3, 1], dtype=any_numeric_ea_dtype)
|
|
+ result = ser.mode(dropna=False)
|
|
+ expected = Series([1], dtype=any_numeric_ea_dtype)
|
|
+ tm.assert_series_equal(result, expected)
|
|
+
|
|
+ ser2 = Series([1, 1, 2, 3, pd.NA], dtype=any_numeric_ea_dtype)
|
|
+ result = ser2.mode(dropna=False)
|
|
+ expected = Series([1], dtype=any_numeric_ea_dtype)
|
|
+ tm.assert_series_equal(result, expected)
|
|
+
|
|
+ ser3 = Series([1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype)
|
|
+ result = ser3.mode(dropna=False)
|
|
+ expected = Series([pd.NA], dtype=any_numeric_ea_dtype)
|
|
+ tm.assert_series_equal(result, expected)
|
|
+
|
|
+ ser4 = Series([1, 1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype)
|
|
+ result = ser4.mode(dropna=False)
|
|
+ expected = Series([1, pd.NA], dtype=any_numeric_ea_dtype)
|
|
+ tm.assert_series_equal(result, expected)
|
|
+
|
|
+
|
|
def test_mode_infer_string():
|
|
# GH#56183
|
|
pytest.importorskip("pyarrow")
|
|
Index: pandas-2.2.3/pandas/tests/test_algos.py
|
|
===================================================================
|
|
--- pandas-2.2.3.orig/pandas/tests/test_algos.py
|
|
+++ pandas-2.2.3/pandas/tests/test_algos.py
|
|
@@ -1840,7 +1840,8 @@ class TestRank:
|
|
class TestMode:
|
|
def test_no_mode(self):
|
|
exp = Series([], dtype=np.float64, index=Index([], dtype=int))
|
|
- tm.assert_numpy_array_equal(algos.mode(np.array([])), exp.values)
|
|
+ result, _ = algos.mode(np.array([]))
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
|
|
@pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"])
|
|
def test_mode_single(self, dt):
|
|
@@ -1853,20 +1854,24 @@ class TestMode:
|
|
|
|
ser = Series(data_single, dtype=dt)
|
|
exp = Series(exp_single, dtype=dt)
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
ser = Series(data_multi, dtype=dt)
|
|
exp = Series(exp_multi, dtype=dt)
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
def test_mode_obj_int(self):
|
|
exp = Series([1], dtype=int)
|
|
- tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values)
|
|
+ result, _ = algos.mode(exp.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
|
|
exp = Series(["a", "b", "c"], dtype=object)
|
|
- tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values)
|
|
+ result, _ = algos.mode(exp.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
|
|
@pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"])
|
|
def test_number_mode(self, dt):
|
|
@@ -1878,12 +1883,14 @@ class TestMode:
|
|
|
|
ser = Series(data_single, dtype=dt)
|
|
exp = Series(exp_single, dtype=dt)
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
ser = Series(data_multi, dtype=dt)
|
|
exp = Series(exp_multi, dtype=dt)
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
def test_strobj_mode(self):
|
|
@@ -1892,7 +1899,8 @@ class TestMode:
|
|
|
|
ser = Series(data, dtype="c")
|
|
exp = Series(exp, dtype="c")
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
@pytest.mark.parametrize("dt", [str, object])
|
|
@@ -1902,7 +1910,8 @@ class TestMode:
|
|
|
|
ser = Series(data, dtype=dt)
|
|
exp = Series(exp, dtype=dt)
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
def test_datelike_mode(self):
|
|
@@ -1936,18 +1945,21 @@ class TestMode:
|
|
def test_mixed_dtype(self):
|
|
exp = Series(["foo"], dtype=object)
|
|
ser = Series([1, "foo", "foo"])
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
def test_uint64_overflow(self):
|
|
exp = Series([2**63], dtype=np.uint64)
|
|
ser = Series([1, 2**63, 2**63], dtype=np.uint64)
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
exp = Series([1, 2**63], dtype=np.uint64)
|
|
ser = Series([1, 2**63], dtype=np.uint64)
|
|
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
|
|
+ result, _ = algos.mode(ser.values)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
tm.assert_series_equal(ser.mode(), exp)
|
|
|
|
def test_categorical(self):
|
|
@@ -1969,15 +1981,18 @@ class TestMode:
|
|
def test_index(self):
|
|
idx = Index([1, 2, 3])
|
|
exp = Series([1, 2, 3], dtype=np.int64)
|
|
- tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
|
|
+ result, _ = algos.mode(idx)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
|
|
idx = Index([1, "a", "a"])
|
|
exp = Series(["a"], dtype=object)
|
|
- tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
|
|
+ result, _ = algos.mode(idx)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
|
|
idx = Index([1, 1, 2, 3, 3])
|
|
exp = Series([1, 3], dtype=np.int64)
|
|
- tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
|
|
+ result, _ = algos.mode(idx)
|
|
+ tm.assert_numpy_array_equal(result, exp.values)
|
|
|
|
idx = Index(
|
|
["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
|