diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index ed5c7806b2e23..be49f95fed26a 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1456,24 +1456,17 @@ default value. .. _indexing.lookup: -Looking up values by index/column labels ----------------------------------------- +The :meth:`~pandas.DataFrame.lookup` method +------------------------------------------- Sometimes you want to extract a set of values given a sequence of row labels -and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing. -For instance: +and column labels. The ``lookup`` method does this and returns the selected +values as a :class:`Series`. For instance: .. ipython:: python - df = pd.DataFrame({'col': ["A", "A", "B", "B"], - 'A': [80, 23, np.nan, 22], - 'B': [80, 55, 76, 67]}) - df - idx, cols = pd.factorize(df['col']) - df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] - -Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method -which was deprecated in version 1.2.0 and removed in version 2.0.0. + dflookup = pd.DataFrame(np.random.rand(20, 4), columns = ['A', 'B', 'C', 'D']) + dflookup.lookup(list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D']) ..
def lookup(self, row_labels, col_labels) -> Series:
    """
    Label-based "fancy indexing" function for DataFrame.

    Given equal-length sequences of row and column labels, return the
    values corresponding to each (row, col) pair.

    Parameters
    ----------
    row_labels : sequence
        The row labels to use for lookup.
    col_labels : sequence
        The column labels to use for lookup.

    Returns
    -------
    Series
        The found values, in the order of the requested pairs. The index
        of the result holds positions within an internal flattened
        intermediate and carries no meaning; call ``to_numpy()`` on the
        result to obtain a plain array.

    Raises
    ------
    ValueError
        If ``row_labels`` and ``col_labels`` differ in length, or if the
        index or the columns of the DataFrame are not unique.
    KeyError
        If any row label or any column label is not found.

    Examples
    --------
    >>> grades = pd.DataFrame(
    ...     {
    ...         "Math": [85, 92, 78, 88, 95],
    ...         "Science": [90, 85, 92, 79, 87],
    ...     },
    ...     index=["Alice", "Bob", "Charlie", "David", "Eve"],
    ... )
    >>> feedback = pd.DataFrame(
    ...     {
    ...         "Math": [
    ...             "Strong analytical skills",
    ...             "Excellent problem-solving",
    ...             "Needs more practice",
    ...             "Solid understanding",
    ...             "Exceptional reasoning",
    ...         ],
    ...         "Science": [
    ...             "Excellent inquiry skills",
    ...             "Good theoretical concepts",
    ...             "Strong methodological interest",
    ...             "Needs focus",
    ...             "Outstanding curiosity",
    ...         ],
    ...     },
    ...     index=["Alice", "Bob", "Charlie", "David", "Eve"],
    ... )
    >>> student_top = grades.rank(1).idxmax(1)  # student's top score
    >>> feedback.lookup(student_top.index, student_top).to_numpy()
    array(['Excellent inquiry skills', 'Excellent problem-solving',
           'Strong methodological interest', 'Solid understanding',
           'Exceptional reasoning'], dtype=object)
    """
    if len(row_labels) != len(col_labels):
        raise ValueError("Row labels must have same size as column labels")
    if not (self.index.is_unique and self.columns.is_unique):
        # GH#33041
        raise ValueError("DataFrame.lookup requires unique index and columns")

    ridx = self.index.get_indexer(row_labels)
    cidx = self.columns.get_indexer(col_labels)
    if (ridx == -1).any():
        raise KeyError("One or more row labels was not found")
    if (cidx == -1).any():
        raise KeyError("One or more column labels was not found")

    # Restrict the frame to the rows/columns actually requested. The
    # "inverse" arrays give, for every requested label, its position inside
    # that restricted frame, so no second label lookup is needed.
    row_pos, row_inv = np.unique(ridx, return_inverse=True)
    col_pos, col_inv = np.unique(cidx, return_inverse=True)
    sub = self.take(col_pos, axis=1).take(row_pos, axis=0)

    # melt() refuses to run when a column is labelled "value" (its default
    # value_name). Column labels are no longer needed at this point, so
    # replace them with plain positions to make the flattening label-agnostic.
    sub = sub.set_axis(np.arange(len(col_pos)), axis=1)

    # melt() stacks the columns one after another (column-major order), so
    # the element at (row r, column c) of ``sub`` sits at r + c * len(sub).
    values = sub.melt()["value"]
    flat_index = row_inv + col_inv * len(row_pos)

    return values.iloc[flat_index]
def test_lookup_float(self, float_frame):
    # equal-length label lists built by repeating each axis
    frame = float_frame
    row_labels = list(frame.index) * len(frame.columns)
    col_labels = list(frame.columns) * len(frame.index)

    expected = Series(
        [frame.loc[row, col] for row, col in zip(row_labels, col_labels)]
    )
    result = frame.lookup(row_labels, col_labels)

    tm.assert_series_equal(result, expected, check_index=False, check_names=False)


def test_lookup_mixed(self, float_string_frame):
    # mixed float/string columns are compared as object dtype
    frame = float_string_frame
    row_labels = list(frame.index) * len(frame.columns)
    col_labels = list(frame.columns) * len(frame.index)

    expected = Series(
        [frame.loc[row, col] for row, col in zip(row_labels, col_labels)],
        dtype=np.object_,
    )
    result = frame.lookup(row_labels, col_labels)

    tm.assert_series_equal(result, expected, check_index=False, check_names=False)


def test_lookup_bool(self):
    # each row selects the boolean column named after its own label
    df = DataFrame(
        {
            "label": ["a", "b", "a", "c"],
            "mask_a": [True, True, False, True],
            "mask_b": [True, False, False, False],
            "mask_c": [False, True, False, True],
        }
    )
    mask_columns = "mask_" + df["label"]

    result = df.lookup(df.index, mask_columns)
    expected = Series(
        [df.loc[row, col] for row, col in zip(df.index, mask_columns)],
        name="mask",
    )

    tm.assert_series_equal(result, expected, check_index=False, check_names=False)
    assert result.dtype == np.bool_


def test_lookup_raises(self, float_frame):
    # unknown row label
    with pytest.raises(KeyError, match="'One or more row labels was not found'"):
        float_frame.lookup(["xyz"], ["A"])

    # unknown column label
    first_row = float_frame.index[0]
    with pytest.raises(KeyError, match="'One or more column labels was not found'"):
        float_frame.lookup([first_row], ["xyz"])

    # label sequences of different length
    with pytest.raises(ValueError, match="same size"):
        float_frame.lookup(["a", "b", "c"], ["a"])


def test_lookup_requires_unique_axes(self):
    # GH#33041 raise with a helpful error message
    msg = "requires unique index and columns"
    data = np.random.default_rng(2).standard_normal((3, 2))
    df = DataFrame(data, columns=["A", "A"])

    row_labels = [0, 1]
    col_labels = ["A", "A"]

    # homogeneous-dtype case: duplicated columns, then duplicated index
    with pytest.raises(ValueError, match=msg):
        df.lookup(row_labels, col_labels)
    with pytest.raises(ValueError, match=msg):
        df.T.lookup(col_labels, row_labels)

    # heterogeneous dtype
    df["B"] = 0
    with pytest.raises(ValueError, match=msg):
        df.lookup(row_labels, col_labels)