@@ -948,20 +948,23 @@ def concatenate_columns(
948
948
def deconcatenate_column (
949
949
df : pd .DataFrame ,
950
950
column_name ,
951
- new_column_names : Union [List [str ], Tuple [str ]],
952
951
sep : str ,
952
+ new_column_names : Union [List [str ], Tuple [str ]] = None ,
953
+ autoname : str = None ,
953
954
preserve_position : bool = False ,
954
955
) -> pd .DataFrame :
955
956
"""
956
957
De-concatenates a single column into multiple columns.
957
958
958
- This is the inverse of the `concatenate_columns` function.
959
+ This is the inverse of the ``concatenate_columns`` function.
959
960
960
961
Used to quickly split columns out of a single column.
961
962
962
- The keyword argument `preserve_position` takes `True` or `False` boolean
963
- that controls whether the `new_column_names` will take the original
964
- position of the to-be-deconcatenated `column_name`:
963
+ The keyword argument ``preserve_position``
964
+ takes ``True`` or ``False`` boolean
965
+ that controls whether the ``new_column_names``
966
+ will take the original position
967
+ of the to-be-deconcatenated ``column_name``:
965
968
966
969
- When `preserve_position=False` (default), `df.columns` change from
967
970
`[..., column_name, ...]` to `[..., column_name, ..., new_column_names]`.
@@ -973,6 +976,17 @@ def deconcatenate_column(
973
976
`column_name` at its original position, and `column_name` itself
974
977
is dropped.
975
978
979
+ The keyword argument ``autoname`` accepts a base string
980
+ and then automatically creates numbered column names
981
+ based off the base string.
982
+ For example, if ``col`` is passed in
983
+ as the argument to ``autoname``,
984
+ and 4 columns are created,
985
+ then the resulting columns will be named
986
+ ``col1, col2, col3, col4``.
987
+ Numbering is always 1-indexed, not 0-indexed,
988
+ in order to make the column names human-friendly.
989
+
976
990
This method does not mutate the original DataFrame.
977
991
978
992
Functional usage example:
@@ -996,30 +1010,42 @@ def deconcatenate_column(
996
1010
997
1011
:param df: A pandas DataFrame.
998
1012
:param column_name: The column to split.
999
- :param new_column_names: A list of new column names post-splitting.
1000
1013
:param sep: The separator delimiting the column's data.
1014
+ :param new_column_names: A list of new column names post-splitting.
1015
+ :param autoname: A base name for automatically naming the new columns.
1016
+ Takes precedence over ``new_column_names`` if both are provided.
1001
1017
:param preserve_position: Boolean for whether or not to preserve original
1002
1018
position of the column upon de-concatenation. Defaults to False.
1003
1019
:returns: A pandas DataFrame with a deconcatenated column.
1004
1020
"""
1005
- assert (
1006
- column_name in df .columns
1007
- ), f"column name { column_name } not present in dataframe" # noqa: E501
1021
+ if column_name not in df .columns :
1022
+ raise ValueError (f"column name { column_name } not present in dataframe" )
1008
1023
deconcat = df [column_name ].str .split (sep , expand = True )
1009
1024
if preserve_position :
1010
1025
# Keep a copy of the original dataframe
1011
1026
df_original = df .copy ()
1012
- assert (
1013
- len (new_column_names ) == deconcat .shape [1 ]
1014
- ), "number of new column names not correct."
1027
+ if autoname :
1028
+ new_column_names = [
1029
+ f"{ autoname } { i } " for i in range (1 , deconcat .shape [1 ] + 1 )
1030
+ ]
1031
+ if not len (new_column_names ) == deconcat .shape [1 ]:
1032
+ raise JanitorError (
1033
+ f"you need to provide { len (new_column_names )} names"
1034
+ "to new_column_names"
1035
+ )
1036
+
1015
1037
deconcat .columns = new_column_names
1016
1038
df = pd .concat ([df , deconcat ], axis = 1 )
1039
+
1017
1040
if preserve_position :
1018
1041
cols = list (df_original .columns )
1019
1042
index_original = cols .index (column_name )
1020
1043
for i , col_new in enumerate (new_column_names ):
1021
1044
cols .insert (index_original + i , col_new )
1022
1045
df = df [cols ].drop (columns = column_name )
1046
+
1047
+ # TODO: I suspect this should become a test
1048
+ # instead of a defensive check?
1023
1049
assert (
1024
1050
len (df .columns )
1025
1051
== len (df_original .columns ) + len (new_column_names ) - 1
0 commit comments