Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store categories from cuDF. #11311

Merged
merged 12 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
enum class DataSplitMode : int { kRow = 0, kCol = 1 };

// Forward declaration of the container used by the meta info.
class CatContainer;

/**
* @brief Meta information about dataset, always sit in memory.
Expand Down
98 changes: 91 additions & 7 deletions python-package/xgboost/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
TYPE_CHECKING,
Any,
Dict,
List,
Literal,
Optional,
Protocol,
Expand All @@ -23,7 +24,7 @@
import numpy as np

from ._typing import CNumericPtr, DataType, NumpyDType, NumpyOrCupy
from .compat import import_cupy, lazy_isinstance
from .compat import import_cupy, import_pyarrow, lazy_isinstance

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -69,7 +70,11 @@ def shape(self) -> Tuple[int, int]:

def array_hasobject(data: DataType) -> bool:
    """Whether the input is a numpy array with an object dtype.

    Safe to call on arbitrary containers: inputs without a ``dtype`` attribute
    (or whose dtype lacks ``hasobject``) simply return ``False``.
    """
    return (
        hasattr(data, "dtype")
        and hasattr(data.dtype, "hasobject")
        and data.dtype.hasobject
    )


def cuda_array_interface_dict(data: _CudaArrayLikeArg) -> ArrayInf:
Expand Down Expand Up @@ -180,7 +185,7 @@ def is_arrow_dict(data: Any) -> TypeGuard["pa.DictionaryArray"]:
return lazy_isinstance(data, "pyarrow.lib", "DictionaryArray")


class PdCatAccessor(Protocol):
class DfCatAccessor(Protocol):
"""Protocol for pandas cat accessor."""

@property
Expand All @@ -202,7 +207,7 @@ def to_arrow( # pylint: disable=missing-function-docstring
def __cuda_array_interface__(self) -> ArrayInf: ...


def _is_pd_cat(data: Any) -> TypeGuard[PdCatAccessor]:
def _is_df_cat(data: Any) -> TypeGuard[DfCatAccessor]:
# Test pd.Series.cat, not pd.Series
return hasattr(data, "categories") and hasattr(data, "codes")

Expand Down Expand Up @@ -234,6 +239,67 @@ def npstr_to_arrow_strarr(strarr: np.ndarray) -> Tuple[np.ndarray, str]:
return offsets.astype(np.int32), values


def _arrow_cat_inf(  # pylint: disable=too-many-locals
    cats: "pa.StringArray",
    codes: Union[_ArrayLikeArg, _CudaArrayLikeArg, "pa.IntegerArray"],
) -> Tuple[StringArray, ArrayInf, Tuple]:
    """Build array-interface dicts for arrow string categories and their codes.

    Returns the string-array interface for the category names, the array
    interface for the codes, and a tuple of temporary buffers that the caller
    must keep alive while the interfaces are in use.
    """
    if not TYPE_CHECKING:
        pa = import_pyarrow()

    # FIXME(jiamingy): Account for offset, need to find an implementation that returns
    # offset > 0
    assert cats.offset == 0
    buffers: List[pa.Buffer] = cats.buffers()
    mask, offset, data = buffers
    assert offset.is_cpu

    def buf_inf(buf: pa.Buffer, typestr: str, length: int) -> ArrayInf:
        # Describe a raw arrow buffer via the numpy array interface protocol.
        return {
            "data": (buf.address, True),
            "typestr": typestr,
            "version": 3,
            "strides": None,
            "shape": (length,),
            "mask": None,
        }

    # One more offset than there are strings.
    off_len = len(cats) + 1
    if offset.size != off_len * (np.iinfo(np.int32).bits / 8):
        raise TypeError("Arrow dictionary type offsets is required to be 32 bit.")

    joffset = buf_inf(offset, "<i4", off_len)
    jdata = buf_inf(data, "<i1", data.size)
    # Categories should not have missing values.
    assert mask is None

    jnames: StringArray = {"offsets": joffset, "values": jdata}

    def make_array_inf(
        array: Any,
    ) -> Tuple[ArrayInf, Optional[Tuple[pa.Buffer, pa.Buffer]]]:
        """Helper for handling categorical codes."""
        # Handle cuDF data
        if hasattr(array, "__cuda_array_interface__"):
            return cuda_array_interface_dict(array), None

        # Other types (like arrow itself) are not yet supported.
        raise TypeError("Invalid input type.")

    jcodes, codes_tmp = make_array_inf(codes)

    return jnames, jcodes, ((mask, offset, data), codes_tmp)


def _ensure_np_dtype(
data: DataType, dtype: Optional[NumpyDType]
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
Expand All @@ -252,7 +318,7 @@ def array_interface_dict(data: np.ndarray) -> ArrayInf: ...

@overload
def array_interface_dict(
data: PdCatAccessor,
data: DfCatAccessor,
) -> Tuple[StringArray, ArrayInf, Tuple]: ...


Expand All @@ -263,11 +329,11 @@ def array_interface_dict(


def array_interface_dict( # pylint: disable=too-many-locals
data: Union[np.ndarray, PdCatAccessor],
data: Union[np.ndarray, DfCatAccessor],
) -> Union[ArrayInf, Tuple[StringArray, ArrayInf, Optional[Tuple]]]:
"""Returns an array interface from the input."""
# Handle categorical values
if _is_pd_cat(data):
if _is_df_cat(data):
cats = data.categories
# pandas uses -1 to represent missing values for categorical features
codes = data.codes.replace(-1, np.nan)
Expand All @@ -287,6 +353,7 @@ def array_interface_dict( # pylint: disable=too-many-locals
name_offsets, _ = _ensure_np_dtype(name_offsets, np.int32)
joffsets = array_interface_dict(name_offsets)
bvalues = name_values.encode("utf-8")

ptr = ctypes.c_void_p.from_buffer(ctypes.c_char_p(bvalues)).value
assert ptr is not None

Expand Down Expand Up @@ -335,3 +402,20 @@ def check_cudf_meta(data: _CudaArrayLikeArg, field: str) -> None:
and data.__cuda_array_interface__["mask"] is not None
):
raise ValueError(f"Missing value is not allowed for: {field}")


def cudf_cat_inf(
    cats: DfCatAccessor, codes: "pd.Series"
) -> Tuple[Union[ArrayInf, StringArray], ArrayInf, Tuple]:
    """Obtain the cuda array interface for cuDF categories."""
    cp = import_cupy()
    dtype = cats.dtype
    # Numeric category index: expose the categories directly through the CUDA
    # array interface, no arrow round trip needed.
    if cp.issubdtype(dtype, cp.floating) or cp.issubdtype(dtype, cp.integer):
        return (
            cats.__cuda_array_interface__,
            cuda_array_interface_dict(codes),
            (cats, codes),
        )

    # Non-numeric (string) categories go through arrow.
    return _arrow_cat_inf(cats.to_arrow(), codes)
4 changes: 1 addition & 3 deletions python-package/xgboost/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,7 @@
# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check.
# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it
# being freed.
TransformedData = Tuple[
Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes]
]
TransformedData = Tuple[Any, Optional[FeatureNames], Optional[FeatureTypes]]

# template parameter
_T = TypeVar("_T")
Expand Down
26 changes: 11 additions & 15 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,17 +626,17 @@ def input_data(
and ref is not None
and ref is self._data_ref
):
new, cat_codes, feature_names, feature_types = self._temporary_data
new, feature_names, feature_types = self._temporary_data
else:
new, cat_codes, feature_names, feature_types = _proxy_transform(
new, feature_names, feature_types = _proxy_transform(
data,
feature_names,
feature_types,
self._enable_categorical,
)
# Stage the data, meta info are copied inside C++ MetaInfo.
self._temporary_data = (new, cat_codes, feature_names, feature_types)
dispatch_proxy_set_data(self.proxy, new, cat_codes)
self._temporary_data = (new, feature_names, feature_types)
dispatch_proxy_set_data(self.proxy, new)
self.proxy.set_info(
feature_names=feature_names,
feature_types=feature_types,
Expand Down Expand Up @@ -1525,12 +1525,11 @@ def _ref_data_from_cuda_interface(self, data: DataType) -> None:
arrinf = cuda_array_interface(data)
_check_call(_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, arrinf))

def _ref_data_from_cuda_columnar(self, data: TransformedDf) -> None:
    """Reference data from CUDA columnar format.

    The transformed dataframe provides its own array-interface string, so no
    separate categorical codes argument is needed.
    """
    _check_call(
        _LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, data.array_interface())
    )

def _ref_data_from_array(self, data: np.ndarray) -> None:
"""Reference data from numpy array."""
Expand Down Expand Up @@ -2822,18 +2821,15 @@ def inplace_predict(
)
return _prediction_output(shape, dims, preds, True)
if _is_cudf_df(data):
from .data import _cudf_array_interfaces, _transform_cudf_df
from .data import _transform_cudf_df

data, cat_codes, fns, _ = _transform_cudf_df(
data, None, None, enable_categorical
)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
df, fns, _ = _transform_cudf_df(data, None, None, enable_categorical)
if validate_features:
self._validate_features(fns)
_check_call(
_LIB.XGBoosterPredictFromCudaColumnar(
self.handle,
interfaces_str,
df.array_interface(),
args,
p_handle,
ctypes.byref(shape),
Expand Down
Loading
Loading