From 1e899afbd9ca20f4ce9d6f93e1f62c072be0ed23 Mon Sep 17 00:00:00 2001 From: Gen Sato <52241300+halogen22@users.noreply.github.com> Date: Tue, 18 Mar 2025 01:33:40 +0900 Subject: [PATCH] BUG: .mode(dropna=False) doesn't work with nullable integers (#61132) * Fix dropna bug when mode * Fix test cases * Fix data type incompatible --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/core/algorithms.py | 12 +++--- pandas/core/arrays/base.py | 5 ++- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/masked.py | 8 +--- pandas/core/series.py | 2 +- pandas/tests/series/test_reductions.py | 23 +++++++++++ pandas/tests/test_algos.py | 47 +++++++++++++++-------- 10 files changed, 71 insertions(+), 33 deletions(-) Index: pandas-2.3.1/pandas/_libs/hashtable_func_helper.pxi.in =================================================================== --- pandas-2.3.1.orig/pandas/_libs/hashtable_func_helper.pxi.in +++ pandas-2.3.1/pandas/_libs/hashtable_func_helper.pxi.in @@ -443,7 +443,7 @@ def mode(ndarray[htfunc_t] values, bint if na_counter > 0: res_mask = np.zeros(j+1, dtype=np.bool_) - res_mask[j] = True + res_mask[j] = (na_counter == max_count) return modes[:j + 1], res_mask Index: pandas-2.3.1/pandas/core/algorithms.py =================================================================== --- pandas-2.3.1.orig/pandas/core/algorithms.py +++ pandas-2.3.1/pandas/core/algorithms.py @@ -1022,7 +1022,7 @@ def duplicated( def mode( values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None -) -> ArrayLike: +) -> tuple[np.ndarray, npt.NDArray[np.bool_]] | ExtensionArray: """ Returns the mode(s) of an array. 
@@ -1035,7 +1035,7 @@ def mode( Returns ------- - np.ndarray or ExtensionArray + Union[Tuple[np.ndarray, npt.NDArray[np.bool_]], ExtensionArray] """ values = _ensure_arraylike(values, func_name="mode") original = values @@ -1049,8 +1049,10 @@ def mode( values = _ensure_data(values) npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask) - if res_mask is not None: - return npresult, res_mask # type: ignore[return-value] + if res_mask is None: + res_mask = np.zeros(npresult.shape, dtype=np.bool_) + else: + return npresult, res_mask try: npresult = safe_sort(npresult) @@ -1061,7 +1063,7 @@ def mode( ) result = _reconstruct_data(npresult, original.dtype, original) - return result + return result, res_mask def rank( Index: pandas-2.3.1/pandas/core/arrays/base.py =================================================================== --- pandas-2.3.1.orig/pandas/core/arrays/base.py +++ pandas-2.3.1/pandas/core/arrays/base.py @@ -2270,8 +2270,9 @@ class ExtensionArray: Sorted, if possible. 
""" # error: Incompatible return value type (got "Union[ExtensionArray, - # ndarray[Any, Any]]", expected "Self") - return mode(self, dropna=dropna) # type: ignore[return-value] + # Tuple[np.ndarray, npt.NDArray[np.bool_]]", expected "Self") + result, _ = mode(self, dropna=dropna) + return result # type: ignore[return-value] def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): if any( Index: pandas-2.3.1/pandas/core/arrays/categorical.py =================================================================== --- pandas-2.3.1.orig/pandas/core/arrays/categorical.py +++ pandas-2.3.1/pandas/core/arrays/categorical.py @@ -2459,7 +2459,7 @@ class Categorical(NDArrayBackedExtension if dropna: mask = self.isna() - res_codes = algorithms.mode(codes, mask=mask) + res_codes, _ = algorithms.mode(codes, mask=mask) res_codes = cast(np.ndarray, res_codes) assert res_codes.dtype == codes.dtype res = self._from_backing_data(res_codes) Index: pandas-2.3.1/pandas/core/arrays/datetimelike.py =================================================================== --- pandas-2.3.1.orig/pandas/core/arrays/datetimelike.py +++ pandas-2.3.1/pandas/core/arrays/datetimelike.py @@ -1669,7 +1669,7 @@ class DatetimeLikeArrayMixin( # type: i if dropna: mask = self.isna() - i8modes = algorithms.mode(self.view("i8"), mask=mask) + i8modes, _ = algorithms.mode(self.view("i8"), mask=mask) npmodes = i8modes.view(self._ndarray.dtype) npmodes = cast(np.ndarray, npmodes) return self._from_backing_data(npmodes) Index: pandas-2.3.1/pandas/core/arrays/masked.py =================================================================== --- pandas-2.3.1.orig/pandas/core/arrays/masked.py +++ pandas-2.3.1/pandas/core/arrays/masked.py @@ -1124,12 +1124,8 @@ class BaseMaskedArray(OpsMixin, Extensio return Series(arr, index=index, name="count", copy=False) def _mode(self, dropna: bool = True) -> Self: - if dropna: - result = mode(self._data, dropna=dropna, mask=self._mask) - res_mask = 
np.zeros(result.shape, dtype=np.bool_) - else: - result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) - result = type(self)(result, res_mask) # type: ignore[arg-type] + result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) + result = type(self)(result, res_mask) return result[result.argsort()] @doc(ExtensionArray.equals) Index: pandas-2.3.1/pandas/core/series.py =================================================================== --- pandas-2.3.1.orig/pandas/core/series.py +++ pandas-2.3.1/pandas/core/series.py @@ -2337,7 +2337,7 @@ class Series(base.IndexOpsMixin, NDFrame # TODO: Add option for bins like value_counts() values = self._values if isinstance(values, np.ndarray): - res_values = algorithms.mode(values, dropna=dropna) + res_values, _ = algorithms.mode(values, dropna=dropna) else: res_values = values._mode(dropna=dropna) Index: pandas-2.3.1/pandas/tests/series/test_reductions.py =================================================================== --- pandas-2.3.1.orig/pandas/tests/series/test_reductions.py +++ pandas-2.3.1/pandas/tests/series/test_reductions.py @@ -51,6 +51,29 @@ def test_mode_nullable_dtype(any_numeric tm.assert_series_equal(result, expected) +def test_mode_nullable_dtype_edge_case(any_numeric_ea_dtype): + # GH#58926 + ser = Series([1, 2, 3, 1], dtype=any_numeric_ea_dtype) + result = ser.mode(dropna=False) + expected = Series([1], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser2 = Series([1, 1, 2, 3, pd.NA], dtype=any_numeric_ea_dtype) + result = ser2.mode(dropna=False) + expected = Series([1], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser3 = Series([1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype) + result = ser3.mode(dropna=False) + expected = Series([pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser4 = Series([1, 1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype) + result = ser4.mode(dropna=False) + expected = 
Series([1, pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + def test_mode_infer_string(): # GH#56183 pytest.importorskip("pyarrow") Index: pandas-2.3.1/pandas/tests/test_algos.py =================================================================== --- pandas-2.3.1.orig/pandas/tests/test_algos.py +++ pandas-2.3.1/pandas/tests/test_algos.py @@ -1855,7 +1855,8 @@ class TestRank: class TestMode: def test_no_mode(self): exp = Series([], dtype=np.float64, index=Index([], dtype=int)) - tm.assert_numpy_array_equal(algos.mode(np.array([])), exp.values) + result, _ = algos.mode(np.array([])) + tm.assert_numpy_array_equal(result, exp.values) @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"]) def test_mode_single(self, dt): @@ -1868,20 +1869,24 @@ class TestMode: ser = Series(data_single, dtype=dt) exp = Series(exp_single, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) ser = Series(data_multi, dtype=dt) exp = Series(exp_multi, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_mode_obj_int(self): exp = Series([1], dtype=int) - tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values) + result, _ = algos.mode(exp.values) + tm.assert_numpy_array_equal(result, exp.values) exp = Series(["a", "b", "c"], dtype=object) - tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values) + result, _ = algos.mode(exp.values) + tm.assert_numpy_array_equal(result, exp.values) @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"]) def test_number_mode(self, dt): @@ -1893,12 +1898,14 @@ class TestMode: ser = Series(data_single, dtype=dt) exp = Series(exp_single, dtype=dt) - 
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) ser = Series(data_multi, dtype=dt) exp = Series(exp_multi, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_strobj_mode(self): @@ -1907,7 +1914,8 @@ class TestMode: ser = Series(data, dtype="c") exp = Series(exp, dtype="c") - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) @@ -1920,7 +1928,8 @@ class TestMode: if using_infer_string and dt is str: tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) else: - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): @@ -1954,18 +1963,21 @@ class TestMode: def test_mixed_dtype(self): exp = Series(["foo"], dtype=object) ser = Series([1, "foo", "foo"]) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_uint64_overflow(self): exp = Series([2**63], dtype=np.uint64) ser = Series([1, 2**63, 2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) exp = Series([1, 2**63], dtype=np.uint64) ser = Series([1, 2**63], dtype=np.uint64) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + result, _ = 
algos.mode(ser.values) + tm.assert_numpy_array_equal(result, exp.values) tm.assert_series_equal(ser.mode(), exp) def test_categorical(self): @@ -1987,15 +1999,18 @@ class TestMode: def test_index(self): idx = Index([1, 2, 3]) exp = Series([1, 2, 3], dtype=np.int64) - tm.assert_numpy_array_equal(algos.mode(idx), exp.values) + result, _ = algos.mode(idx) + tm.assert_numpy_array_equal(result, exp.values) idx = Index([1, "a", "a"]) exp = Series(["a"], dtype=object) - tm.assert_numpy_array_equal(algos.mode(idx), exp.values) + result, _ = algos.mode(idx) + tm.assert_numpy_array_equal(result, exp.values) idx = Index([1, 1, 2, 3, 3]) exp = Series([1, 3], dtype=np.int64) - tm.assert_numpy_array_equal(algos.mode(idx), exp.values) + result, _ = algos.mode(idx) + tm.assert_numpy_array_equal(result, exp.values) idx = Index( ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],