Merge pull request #350 from lvgig/feature/remove_no_nulls_error_nearest_mean

alphanumericmale · web-flow · commit d656528c7edb · 2024-12-02T13:36:21.000Z
adjusted behaviour of NearestMeanResponse transformer to be able to c…
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -16,22 +16,32 @@ Subsections for each version can be one of the following;
 
 Each individual change should have a link to the pull request after the description of the change.
 
-1.4.1 (unreleased)
+1.4.2 (unreleased)
 ------------------
 
 Changed
 ^^^^^^^
 
-- Refactored BaseImputer to utilise narwhals `#314 <https://github.com/lvgig/tubular/issues/314>_`
-- Converted test dfs to flexible pandas/polars setup
-- Converted BaseNominalTransformer to utilise narwhals `#334 <https://github.com/lvgig/tubular/issues/334>_`
-- narwhalified CheckNumericMixin `#336 <https://github.com/lvgig/tubular/issues/336>_`
 - placeholder
 - placeholder
 - placeholder
 - placeholder
 - placeholder
 
+1.4.1 (2024-12-02)
+------------------
+
+Changed
+^^^^^^^
+
+- Refactored BaseImputer to utilise narwhals `#314 <https://github.com/lvgig/tubular/issues/314>`_
+- Converted test dfs to flexible pandas/polars setup
+- Converted BaseNominalTransformer to utilise narwhals `#334 <https://github.com/lvgig/tubular/issues/334>`_
+- narwhalified CheckNumericMixin `#336 <https://github.com/lvgig/tubular/issues/336>`_
+- Changed behaviour of NearestMeanResponseImputer so that if there are no nulls at fit,
+  it warns and has no effect at transform, as opposed to erroring. The error was problematic for e.g.
+  lightweight test runs where nulls are less likely to be present. `#350 <https://github.com/lvgig/tubular/pull/350>`_
+
 1.4.0 (2024-10-15)
 ------------------
 
diff --git a/tests/imputers/test_NearestMeanResponseImputer.py b/tests/imputers/test_NearestMeanResponseImputer.py
@@ -1,3 +1,4 @@
+import narwhals as nw
 import numpy as np
 import pytest
 
@@ -10,6 +11,7 @@
 from tests.imputers.test_BaseImputer import (
     GenericImputerTransformTests,
 )
+from tests.utils import assert_frame_equal_dispatch, dataframe_init_dispatch
 from tubular.imputers import NearestMeanResponseImputer
 
 
@@ -42,18 +44,23 @@ def test_null_values_in_response_error(self, library):
             transformer.fit(df, df["a"])
 
     @pytest.mark.parametrize("library", ["pandas", "polars"])
-    def test_columns_with_no_nulls_error(self, library):
-        """Test an error is raised if a non-response column contains no nulls."""
+    def test_columns_with_no_nulls_warning(self, library):
+        """Test a warning is raised if a non-response column contains no nulls."""
         df = d.create_numeric_df_1(library=library)
 
-        transformer = NearestMeanResponseImputer(columns=["b", "c"])
+        transformer = NearestMeanResponseImputer(columns=["c"])
 
-        with pytest.raises(
-            ValueError,
-            match="NearestMeanResponseImputer: Column b has no missing values, cannot use this transformer.",
+        with pytest.warns(
+            UserWarning,
+            match="NearestMeanResponseImputer: Column c has no missing values, this transformer will have no effect for this column.",
         ):
             transformer.fit(df, df["c"])
 
+        expected_impute_values = {"c": None}
+        assert (
+            transformer.impute_values_ == expected_impute_values
+        ), f"impute_values_ attr not as expected, expected {expected_impute_values} but got {transformer.impute_values_}"
+
     @pytest.mark.parametrize("library", ["pandas", "polars"])
     def test_learnt_values(self, library):
         """Test that the nearest response values learnt during fit are expected."""
@@ -78,3 +85,48 @@ class TestTransform(
     @classmethod
     def setup_class(cls):
         cls.transformer_name = "NearestMeanResponseImputer"
+
+    @pytest.mark.parametrize("library", ["pandas", "polars"])
+    @pytest.mark.parametrize(
+        ("fit_col", "transform_col"),
+        [
+            # try a few types, with and without nulls in transform col
+            ([1, 2, 3], [1.0, np.nan, np.nan]),
+            ([4, 5, 6], [7, 8, 9]),
+            (["a", "b", "c"], ["a", None, "d"]),
+            (["c", "d", "e"], ["f", "g", "h"]),
+            ([4.0, 5.0, 6.0], [8.0, np.nan, 6.0]),
+            ([1.0, 2.0, 3.0], [4.0, 3.0, 2.0]),
+            ([True, False, False], [True, True, None]),
+            ([True, False, True], [True, False, True]),
+        ],
+    )
+    def test_no_effect_when_fit_on_null_free_col(self, fit_col, transform_col, library):
+        "test that when transformer fits on a col with no nulls, transform has no effect"
+
+        df_fit_dict = {
+            "a": fit_col,
+            "b": [1] * len(fit_col),
+        }
+
+        df_fit = dataframe_init_dispatch(df_fit_dict, library=library)
+
+        df_transform_dict = {
+            "a": transform_col,
+        }
+
+        df_transform = dataframe_init_dispatch(df_transform_dict, library=library)
+
+        transformer = NearestMeanResponseImputer(columns=["a"])
+
+        transformer.fit(df_fit, df_fit["b"])
+
+        df_transform = nw.from_native(df_transform)
+
+        expected_output = df_transform.clone().to_native()
+
+        df_transform = nw.to_native(df_transform)
+
+        output = transformer.transform(df_transform)
+
+        assert_frame_equal_dispatch(output, expected_output)
diff --git a/tubular/imputers.py b/tubular/imputers.py
@@ -55,7 +55,10 @@ def transform(self, X: FrameT) -> FrameT:
         X = nw.from_native(super().transform(X))
 
         new_col_expressions = [
-            nw.col(c).fill_null(self.impute_values_[c]) for c in self.columns
+            nw.col(c).fill_null(self.impute_values_[c])
+            if self.impute_values_[c] is not None  # None = nothing learnt at fit; 0/False/"" are valid fill values
+            else nw.col(c)
+            for c in self.columns
         ]
 
         return X.with_columns(
@@ -424,7 +427,8 @@ class NearestMeanResponseImputer(BaseImputer):
     ----------
     columns : None or str or list, default = None
         Columns to impute, if the default of None is supplied all columns in X are used
-        when the transform method is called.
+        when the transform method is called. If the column does not contain nulls at fit,
+        a warning will be issued and this transformer will have no effect on that column.
 
     Attributes
     ----------
@@ -478,26 +482,28 @@ def fit(self, X: FrameT, y: nw.Series) -> FrameT:
             c_nulls = X.select(nw.col(c).is_null())[c]
 
             if c_nulls.sum() == 0:
-                msg = f"{self.classname()}: Column {c} has no missing values, cannot use this transformer."
-                raise ValueError(msg)
+                msg = f"{self.classname()}: Column {c} has no missing values, this transformer will have no effect for this column."
+                warnings.warn(msg, stacklevel=2)
+                self.impute_values_[c] = None
 
-            mean_response_by_levels = (
-                X_y.filter(~c_nulls).group_by(c).agg(nw.col(response_column).mean())
-            )
+            else:
+                mean_response_by_levels = (
+                    X_y.filter(~c_nulls).group_by(c).agg(nw.col(response_column).mean())
+                )
 
-            mean_response_nulls = X_y.filter(c_nulls)[response_column].mean()
+                mean_response_nulls = X_y.filter(c_nulls)[response_column].mean()
 
-            mean_response_by_levels = mean_response_by_levels.with_columns(
-                (nw.col(response_column) - mean_response_nulls)
-                .abs()
-                .alias("abs_diff_response"),
-            )
+                mean_response_by_levels = mean_response_by_levels.with_columns(
+                    (nw.col(response_column) - mean_response_nulls)
+                    .abs()
+                    .alias("abs_diff_response"),
+                )
 
-            # take first value having the minimum difference in terms of average response
-            self.impute_values_[c] = mean_response_by_levels.filter(
-                mean_response_by_levels["abs_diff_response"]
-                == mean_response_by_levels["abs_diff_response"].min(),
-            )[c].item(index=0)
+                # take first value having the minimum difference in terms of average response
+                self.impute_values_[c] = mean_response_by_levels.filter(
+                    mean_response_by_levels["abs_diff_response"]
+                    == mean_response_by_levels["abs_diff_response"].min(),
+                )[c].item(index=0)
 
         return self