Skip to content

Commit b85d9c9

Browse files
authored
[BUG] changed assertion statements in deconcatenate_columns to error statements (#525)
* [BUG] changed assertion statements in deconcatenate_columns to error statements
* black
* EOD commit
* black
* fixed pycodestyle issues
* fixed test
* fixed bug
1 parent 5746622 commit b85d9c9

File tree

2 files changed

+58
-12
lines changed

2 files changed

+58
-12
lines changed

janitor/functions.py

+38-12
Original file line numberDiff line numberDiff line change
@@ -948,20 +948,23 @@ def concatenate_columns(
948948
def deconcatenate_column(
949949
df: pd.DataFrame,
950950
column_name,
951-
new_column_names: Union[List[str], Tuple[str]],
952951
sep: str,
952+
new_column_names: Union[List[str], Tuple[str]] = None,
953+
autoname: str = None,
953954
preserve_position: bool = False,
954955
) -> pd.DataFrame:
955956
"""
956957
De-concatenates a single column into multiple columns.
957958
958-
This is the inverse of the `concatenate_columns` function.
959+
This is the inverse of the ``concatenate_columns`` function.
959960
960961
Used to quickly split columns out of a single column.
961962
962-
The keyword argument `preserve_position` takes `True` or `False` boolean
963-
that controls whether the `new_column_names` will take the original
964-
position of the to-be-deconcatenated `column_name`:
963+
The keyword argument ``preserve_position``
964+
takes ``True`` or ``False`` boolean
965+
that controls whether the ``new_column_names``
966+
will take the original position
967+
of the to-be-deconcatenated ``column_name``:
965968
966969
- When `preserve_position=False` (default), `df.columns` change from
967970
`[..., column_name, ...]` to `[..., column_name, ..., new_column_names]`.
@@ -973,6 +976,17 @@ def deconcatenate_column(
973976
`column_name` at its original position, and `column_name` itself
974977
is dropped.
975978
979+
The keyword argument ``autoname`` accepts a base string
980+
and then automatically creates numbered column names
981+
based off the base string.
982+
For example, if ``col`` is passed in
983+
as the argument to ``autoname``,
984+
and 4 columns are created,
985+
then the resulting columns will be named
986+
``col1, col2, col3, col4``.
987+
Numbering is always 1-indexed, not 0-indexed,
988+
in order to make the column names human-friendly.
989+
976990
This method does not mutate the original DataFrame.
977991
978992
Functional usage example:
@@ -996,30 +1010,42 @@ def deconcatenate_column(
9961010
9971011
:param df: A pandas DataFrame.
9981012
:param column_name: The column to split.
999-
:param new_column_names: A list of new column names post-splitting.
10001013
:param sep: The separator delimiting the column's data.
1014+
:param new_column_names: A list of new column names post-splitting.
1015+
:param autoname: A base name for automatically naming the new columns.
1016+
Takes precedence over ``new_column_names`` if both are provided.
10011017
:param preserve_position: Boolean for whether or not to preserve original
10021018
position of the column upon de-concatenation, default to False
10031019
:returns: A pandas DataFrame with a deconcatenated column.
10041020
"""
1005-
assert (
1006-
column_name in df.columns
1007-
), f"column name {column_name} not present in dataframe" # noqa: E501
1021+
if column_name not in df.columns:
1022+
raise ValueError(f"column name {column_name} not present in dataframe")
10081023
deconcat = df[column_name].str.split(sep, expand=True)
10091024
if preserve_position:
10101025
# Keep a copy of the original dataframe
10111026
df_original = df.copy()
1012-
assert (
1013-
len(new_column_names) == deconcat.shape[1]
1014-
), "number of new column names not correct."
1027+
if autoname:
1028+
new_column_names = [
1029+
f"{autoname}{i}" for i in range(1, deconcat.shape[1] + 1)
1030+
]
1031+
if not len(new_column_names) == deconcat.shape[1]:
1032+
raise JanitorError(
1033+
f"you need to provide {len(new_column_names)} names"
1034+
"to new_column_names"
1035+
)
1036+
10151037
deconcat.columns = new_column_names
10161038
df = pd.concat([df, deconcat], axis=1)
1039+
10171040
if preserve_position:
10181041
cols = list(df_original.columns)
10191042
index_original = cols.index(column_name)
10201043
for i, col_new in enumerate(new_column_names):
10211044
cols.insert(index_original + i, col_new)
10221045
df = df[cols].drop(columns=column_name)
1046+
1047+
# TODO: I suspect this should become a test
1048+
# instead of a defensive check?
10231049
assert (
10241050
len(df.columns)
10251051
== len(df_original.columns) + len(new_column_names) - 1

tests/functions/test_deconcatenate_column.py

+20
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,23 @@ def test_deconcatenate_column_preserve_position(dataframe):
4141
assert (
4242
list(df.columns).index("col2") == index_original + 1
4343
), "Position not preserved"
44+
45+
46+
def test_deconcatenate_column_autoname(dataframe):
47+
df_original = dataframe.concatenate_columns(
48+
column_names=["a", "decorated-elephant"],
49+
sep="-",
50+
new_column_name="index",
51+
).remove_columns(["a", "decorated-elephant"])
52+
53+
df = df_original.deconcatenate_column(
54+
"index",
55+
sep="-",
56+
new_column_names=["a", "decorated-elephant"],
57+
autoname="col",
58+
)
59+
60+
assert "col1" in df.columns
61+
assert "col2" in df.columns
62+
assert "a" not in df.columns
63+
assert "decorated-elephant" not in df.columns

0 commit comments

Comments
 (0)