Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store categories from cuDF. #11311

Merged
merged 12 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
enum class DataSplitMode : int { kRow = 0, kCol = 1 };

// Forward declaration of the container used by the meta info.
class CatContainer;

/**
* @brief Meta information about dataset, always sit in memory.
Expand Down
98 changes: 91 additions & 7 deletions python-package/xgboost/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
TYPE_CHECKING,
Any,
Dict,
List,
Literal,
Optional,
Protocol,
Expand All @@ -23,7 +24,7 @@
import numpy as np

from ._typing import CNumericPtr, DataType, NumpyDType, NumpyOrCupy
from .compat import import_cupy, lazy_isinstance
from .compat import import_cupy, import_pyarrow, lazy_isinstance

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -69,7 +70,11 @@ def shape(self) -> Tuple[int, int]:

def array_hasobject(data: DataType) -> bool:
    """Whether the input is a numpy array with an object dtype.

    Safe to call on arbitrary containers: inputs without a ``dtype`` attribute
    (or whose dtype lacks ``hasobject``) simply return ``False``.
    """
    return (
        hasattr(data, "dtype")
        and hasattr(data.dtype, "hasobject")
        and data.dtype.hasobject
    )


def cuda_array_interface_dict(data: _CudaArrayLikeArg) -> ArrayInf:
Expand Down Expand Up @@ -180,7 +185,7 @@ def is_arrow_dict(data: Any) -> TypeGuard["pa.DictionaryArray"]:
return lazy_isinstance(data, "pyarrow.lib", "DictionaryArray")


class PdCatAccessor(Protocol):
class DfCatAccessor(Protocol):
"""Protocol for pandas cat accessor."""

@property
Expand All @@ -202,7 +207,7 @@ def to_arrow( # pylint: disable=missing-function-docstring
def __cuda_array_interface__(self) -> ArrayInf: ...


def _is_pd_cat(data: Any) -> TypeGuard[PdCatAccessor]:
def _is_df_cat(data: Any) -> TypeGuard[DfCatAccessor]:
# Test pd.Series.cat, not pd.Series
return hasattr(data, "categories") and hasattr(data, "codes")

Expand Down Expand Up @@ -234,6 +239,67 @@ def npstr_to_arrow_strarr(strarr: np.ndarray) -> Tuple[np.ndarray, str]:
return offsets.astype(np.int32), values


def _arrow_cat_inf(  # pylint: disable=too-many-locals
    cats: "pa.StringArray",
    codes: Union[_ArrayLikeArg, _CudaArrayLikeArg, "pa.IntegerArray"],
) -> Tuple[StringArray, ArrayInf, Tuple]:
    """Build array-interface dicts for arrow string categories and their codes.

    Returns the string-array interface for the category names, the array
    interface for the codes, and a tuple of temporary buffers that the caller
    must keep alive while the interfaces are in use.
    """
    if not TYPE_CHECKING:
        pa = import_pyarrow()

    # FIXME(jiamingy): Account for offset, need to find an implementation that returns
    # offset > 0
    assert cats.offset == 0
    buffers: List[pa.Buffer] = cats.buffers()
    mask, offset, data = buffers
    assert offset.is_cpu

    def buf_inf(buf: pa.Buffer, typestr: str, length: int) -> ArrayInf:
        # Describe a raw arrow buffer via the numpy array interface protocol.
        return {
            "data": (buf.address, True),
            "typestr": typestr,
            "version": 3,
            "strides": None,
            "shape": (length,),
            "mask": None,
        }

    # One more offset than there are strings.
    off_len = len(cats) + 1
    if offset.size != off_len * (np.iinfo(np.int32).bits / 8):
        raise TypeError("Arrow dictionary type offsets is required to be 32 bit.")

    joffset = buf_inf(offset, "<i4", off_len)
    jdata = buf_inf(data, "<i1", data.size)
    # Categories should not have missing values.
    assert mask is None

    jnames: StringArray = {"offsets": joffset, "values": jdata}

    def make_array_inf(
        array: Any,
    ) -> Tuple[ArrayInf, Optional[Tuple[pa.Buffer, pa.Buffer]]]:
        """Helper for handling categorical codes."""
        # Handle cuDF data
        if hasattr(array, "__cuda_array_interface__"):
            return cuda_array_interface_dict(array), None

        # Other types (like arrow itself) are not yet supported.
        raise TypeError("Invalid input type.")

    jcodes, codes_tmp = make_array_inf(codes)

    return jnames, jcodes, ((mask, offset, data), codes_tmp)


def _ensure_np_dtype(
data: DataType, dtype: Optional[NumpyDType]
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
Expand All @@ -252,7 +318,7 @@ def array_interface_dict(data: np.ndarray) -> ArrayInf: ...

@overload
def array_interface_dict(
data: PdCatAccessor,
data: DfCatAccessor,
) -> Tuple[StringArray, ArrayInf, Tuple]: ...


Expand All @@ -263,11 +329,11 @@ def array_interface_dict(


def array_interface_dict( # pylint: disable=too-many-locals
data: Union[np.ndarray, PdCatAccessor],
data: Union[np.ndarray, DfCatAccessor],
) -> Union[ArrayInf, Tuple[StringArray, ArrayInf, Optional[Tuple]]]:
"""Returns an array interface from the input."""
# Handle categorical values
if _is_pd_cat(data):
if _is_df_cat(data):
cats = data.categories
# pandas uses -1 to represent missing values for categorical features
codes = data.codes.replace(-1, np.nan)
Expand All @@ -287,6 +353,7 @@ def array_interface_dict( # pylint: disable=too-many-locals
name_offsets, _ = _ensure_np_dtype(name_offsets, np.int32)
joffsets = array_interface_dict(name_offsets)
bvalues = name_values.encode("utf-8")

ptr = ctypes.c_void_p.from_buffer(ctypes.c_char_p(bvalues)).value
assert ptr is not None

Expand Down Expand Up @@ -335,3 +402,20 @@ def check_cudf_meta(data: _CudaArrayLikeArg, field: str) -> None:
and data.__cuda_array_interface__["mask"] is not None
):
raise ValueError(f"Missing value is not allowed for: {field}")


def cudf_cat_inf(
    cats: DfCatAccessor, codes: "pd.Series"
) -> Tuple[Union[ArrayInf, StringArray], ArrayInf, Tuple]:
    """Obtain the cuda array interface for cuDF categories."""
    cp = import_cupy()
    dtype = cats.dtype
    # Numeric category index: expose the categories directly through the CUDA
    # array interface, no arrow round trip needed.
    if cp.issubdtype(dtype, cp.floating) or cp.issubdtype(dtype, cp.integer):
        return (
            cats.__cuda_array_interface__,
            cuda_array_interface_dict(codes),
            (cats, codes),
        )

    # Non-numeric (string) categories go through arrow.
    return _arrow_cat_inf(cats.to_arrow(), codes)
4 changes: 1 addition & 3 deletions python-package/xgboost/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,7 @@
# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check.
# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it
# being freed.
TransformedData = Tuple[
Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes]
]
TransformedData = Tuple[Any, Optional[FeatureNames], Optional[FeatureTypes]]

# template parameter
_T = TypeVar("_T")
Expand Down
26 changes: 11 additions & 15 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,17 +626,17 @@ def input_data(
and ref is not None
and ref is self._data_ref
):
new, cat_codes, feature_names, feature_types = self._temporary_data
new, feature_names, feature_types = self._temporary_data
else:
new, cat_codes, feature_names, feature_types = _proxy_transform(
new, feature_names, feature_types = _proxy_transform(
data,
feature_names,
feature_types,
self._enable_categorical,
)
# Stage the data, meta info are copied inside C++ MetaInfo.
self._temporary_data = (new, cat_codes, feature_names, feature_types)
dispatch_proxy_set_data(self.proxy, new, cat_codes)
self._temporary_data = (new, feature_names, feature_types)
dispatch_proxy_set_data(self.proxy, new)
self.proxy.set_info(
feature_names=feature_names,
feature_types=feature_types,
Expand Down Expand Up @@ -1525,12 +1525,11 @@ def _ref_data_from_cuda_interface(self, data: DataType) -> None:
arrinf = cuda_array_interface(data)
_check_call(_LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, arrinf))

def _ref_data_from_cuda_columnar(self, data: TransformedDf) -> None:
    """Reference data from CUDA columnar format.

    The transformed dataframe provides its own array-interface string, so no
    separate categorical codes argument is needed.
    """
    _check_call(
        _LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, data.array_interface())
    )

def _ref_data_from_array(self, data: np.ndarray) -> None:
"""Reference data from numpy array."""
Expand Down Expand Up @@ -2822,18 +2821,15 @@ def inplace_predict(
)
return _prediction_output(shape, dims, preds, True)
if _is_cudf_df(data):
from .data import _cudf_array_interfaces, _transform_cudf_df
from .data import _transform_cudf_df

data, cat_codes, fns, _ = _transform_cudf_df(
data, None, None, enable_categorical
)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
df, fns, _ = _transform_cudf_df(data, None, None, enable_categorical)
if validate_features:
self._validate_features(fns)
_check_call(
_LIB.XGBoosterPredictFromCudaColumnar(
self.handle,
interfaces_str,
df.array_interface(),
args,
p_handle,
ctypes.byref(shape),
Expand Down
Loading
Loading