Skip to content

Commit 8efb2af

Browse files
authored
Merge pull request #177 from lvgig/feature/rare_encoder_forget_rare_cats
edited GroupRareLevelsTransformer to forget rare categories when work…
2 parents 81003b9 + 66034ae commit 8efb2af

File tree

4 files changed

+52
-10
lines changed

4 files changed

+52
-10
lines changed

CHANGELOG.rst

+6-1
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,18 @@ Subsections for each version can be one of the following;
1616

1717
Each individual change should have a link to the pull request after the description of the change.
1818

19+
1.2.1 (2024-02-08)
20+
------------------
21+
Added
22+
^^^^^
23+
- Updated GroupRareLevelsTransformer so that when working with category dtypes it forgets categories encoded as rare (this is wanted behaviour as these categories are no longer present in the data) `#177 <https://github.com/lvgig/tubular/pull/177>`_
24+
1925
1.2.0 (2024-02-06)
2026
------------------
2127
Added
2228
^^^^^
2329
- Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_
2430
- Updated NullIndicator to return int8 columns `#173 <https://github.com/lvgig/tubular/pull/173>`_
25-
- Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_
2631
- Updated MeanResponseTransformer to coerce return to float (useful behaviour for category type features) `#174 <https://github.com/lvgig/tubular/pull/174>`_
2732

2833
1.1.1 (2024-01-18)

tests/nominal/test_GroupRareLevelsTransformer.py

+30-5
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ def expected_df_1():
227227
df["c"] = pd.Series(
228228
["a", "a", "c", "c", "e", "e", "rare", "rare", "rare", "rare"],
229229
dtype=pd.CategoricalDtype(
230-
categories=["a", "c", "e", "f", "g", "h", "rare"],
230+
categories=["a", "c", "e", "rare"],
231231
ordered=False,
232232
),
233233
)
@@ -361,19 +361,19 @@ def test_expected_output_no_weight_single_row_na_category_column(self):
361361
one_row_df = pd.DataFrame({"b": [np.nan], "c": [np.NaN]})
362362
one_row_df["c"] = one_row_df["c"].astype("category")
363363

364-
# add rare as a category in dataframe
365-
one_row_df["c"] = one_row_df["c"].cat.add_categories("rare")
366-
367364
x = GroupRareLevelsTransformer(columns=["b", "c"], cut_off_percent=0.2)
368365

369366
# set the mapping dict directly rather than fitting x on df so test works with decorators
370367
x.non_rare_levels = {"b": ["a", np.NaN], "c": ["e", "c", "a", np.NaN]}
371368

372369
one_row_df_transformed = x.transform(one_row_df)
373370

371+
expected_df = one_row_df.copy()
372+
expected_df["c"] = expected_df["c"].cat.add_categories(x.rare_level_name)
373+
374374
ta.equality.assert_frame_equal_msg(
375375
actual=one_row_df_transformed,
376-
expected=one_row_df,
376+
expected=expected_df,
377377
msg_tag="Unexpected values in GroupRareLevelsTransformer.transform",
378378
)
379379

@@ -434,3 +434,28 @@ def test_expected_output_unseen_levels_not_encoded(self):
434434
actual=list(df_transformed["b"]),
435435
msg="Unseen levels are not left unchanged when unseen_levels_to_rare is set to false",
436436
)
437+
438+
def test_rare_categories_forgotten(self):
439+
"test that for category dtype, categories encoded as rare are forgotten by series"
440+
441+
df = d.create_df_8()
442+
443+
column = "c"
444+
445+
x = GroupRareLevelsTransformer(
446+
columns=column,
447+
cut_off_percent=0.25,
448+
)
449+
450+
expected_removed_cats = ["c", "b"]
451+
452+
x.fit(df)
453+
454+
output_df = x.transform(df)
455+
456+
output_categories = output_df[column].dtype.categories
457+
458+
for cat in expected_removed_cats:
459+
assert (
460+
cat not in output_categories
461+
), f"{x.classname} output columns should forget rare encoded categories, expected {cat} to be forgotten from column {column}"

tubular/_version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.2.0"
1+
__version__ = "1.2.1"

tubular/nominal.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -445,19 +445,31 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
445445
# for categorical dtypes have to set new category for the impute values first
446446
# and convert back to the categorical type, otherwise it will convert to object
447447
if "category" in X[c].dtype.name:
448+
449+
categories_before = X[c].dtype.categories
450+
448451
if self.rare_level_name not in X[c].cat.categories:
449452
X[c] = X[c].cat.add_categories(self.rare_level_name)
450453

451-
dtype_before = X[c].dtype
452-
453454
X[c] = pd.Series(
454455
data=np.where(
455456
X[c].isin(self.non_rare_levels[c]),
456457
X[c],
457458
self.rare_level_name,
458459
),
459460
index=X.index,
460-
).astype(dtype_before)
461+
)
462+
463+
remaining_categories = [
464+
category
465+
for category in categories_before
466+
if category in self.non_rare_levels[c]
467+
]
468+
469+
X[c] = pd.Categorical(
470+
X[c],
471+
categories=remaining_categories + [self.rare_level_name],
472+
)
461473

462474
else:
463475
# using np.where converts np.NaN to str value if only one row of data frame is passed

0 commit comments

Comments
 (0)