
Commit 29871eb

Improve accuracy_score compatibility with sklearn. (#6406)
This adds support for the `sample_weight` and `normalize` arguments to `accuracy_score`. It also adds support for non-numeric dtypes (like strings), which are supported by sklearn in this API.

As with the recent changes to `r2_score`, this is done by moving the implementation to use `cupy` instead of calling a C++ function. `accuracy_score` is not a performance-critical algorithm, and the `cupy` implementation is both easier to maintain and should be good enough.

Also adds support for `sample_weight` in `ClassifierMixin.score`, fixing a sklearn compatibility bug that was affecting the 0cc layer.

Authors:
- Jim Crist-Harif (https://github.com/jcrist)

Approvers:
- Simon Adorf (https://github.com/csadorf)
- Bradley Dice (https://github.com/bdice)

URL: #6406
1 parent c6a03b7 commit 29871eb
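
For orientation, a quick usage sketch of the API after this change. The values below are illustrative and not taken from the PR; only the signature (keyword-only `sample_weight` and `normalize`, string label support, and the `cuml.metrics` import path) comes from the diff that follows.

import numpy as np
from cuml.metrics import accuracy_score  # new canonical import location

y_true = ["cat", "dog", "dog", "cat"]   # non-numeric labels are now accepted
y_pred = ["cat", "dog", "cat", "cat"]
w = np.array([1.0, 1.0, 0.5, 1.0])

accuracy_score(y_true, y_pred)                   # 0.75
accuracy_score(y_true, y_pred, normalize=False)  # 3.0, count of correct samples
accuracy_score(y_true, y_pred, sample_weight=w)  # 3.0 / 3.5 ~= 0.857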

File tree

11 files changed: +222 / -162 lines


docs/source/api.rst

+1 -2

@@ -246,8 +246,7 @@ Metrics (regression, classification, and distance)
 .. automodule:: cuml.metrics.regression
    :members:

-.. automodule:: cuml.metrics.accuracy
-   :members:
+.. autofunction:: cuml.metrics.accuracy_score

 .. autofunction:: cuml.metrics.confusion_matrix

python/cuml/cuml/__init__.py

+1 -3

@@ -62,9 +62,7 @@
 from cuml.linear_model.mbsgd_regressor import MBSGDRegressor

 from cuml.manifold.t_sne import TSNE
-from cuml.metrics.accuracy import accuracy_score
-from cuml.metrics.cluster.adjusted_rand_index import adjusted_rand_score
-from cuml.metrics.regression import r2_score
+from cuml.metrics import accuracy_score, r2_score, adjusted_rand_score
 from cuml.model_selection import train_test_split

 from cuml.naive_bayes.naive_bayes import MultinomialNB

python/cuml/cuml/experimental/hyperparams/HPO_demo.ipynb

+4 -4

@@ -87,7 +87,7 @@
 "\n",
 "from cuml.neighbors import KNeighborsClassifier\n",
 "from cuml.preprocessing.model_selection import train_test_split\n",
-"from cuml.metrics.accuracy import accuracy_score\n",
+"from cuml.metrics import accuracy_score\n",
 "\n",
 "import os\n",
 "from urllib.request import urlretrieve\n",
@@ -375,10 +375,10 @@
 " - y_hat: The predictions made by the model\n",
 " \"\"\"\n",
 " y = y.astype(\"float32\") # cuML RandomForest needs the y labels to be float32\n",
-" return accuracy_score(y, y_hat, convert_dtype=True)\n",
+" return accuracy_score(y, y_hat)\n",
 "\n",
 "accuracy_wrapper_scorer = make_scorer(accuracy_score_wrapper)\n",
-"cuml_accuracy_scorer = make_scorer(accuracy_score, convert_dtype=True)"
+"cuml_accuracy_scorer = make_scorer(accuracy_score)"
 ]
 },
 {
@@ -447,7 +447,7 @@
 " mode_str: User specifies what model it is to print the value\n",
 " \"\"\"\n",
 " y_pred = model.fit(X_train, y_train).predict(X_test)\n",
-" score = accuracy_score(y_pred, y_test.astype('float32'), convert_dtype=True)\n",
+" score = accuracy_score(y_pred, y_test.astype('float32'))\n",
 " \n",
 " print(\"{} model accuracy: {}\".format(mode_str, score))\n",
 " "

python/cuml/cuml/internals/mixins.py

+3 -8

@@ -236,20 +236,15 @@ class ClassifierMixin:
     )
     @api_base_return_any_skipall
     @enable_device_interop
-    def score(self, X, y, **kwargs):
+    def score(self, X, y, sample_weight=None, **kwargs):
         """
         Scoring function for classifier estimators based on mean accuracy.

         """
-        from cuml.metrics.accuracy import accuracy_score
-
-        if hasattr(self, "handle"):
-            handle = self.handle
-        else:
-            handle = None
+        from cuml.metrics import accuracy_score

         preds = self.predict(X, **kwargs)
-        return accuracy_score(y, preds, handle=handle)
+        return accuracy_score(y, preds, sample_weight=sample_weight)

     @staticmethod
     def _more_static_tags():
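
As a hedged illustration of the `ClassifierMixin.score` change above: the sketch below assumes a cuML classifier that uses the mixin (KNeighborsClassifier is one such estimator, as exercised in the tests further down) and uses synthetic data.

import numpy as np
from cuml.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4)).astype("float32")
y = (X[:, 0] > 0).astype("int32")
w = rng.uniform(0.5, 1.0, size=len(y))

clf = KNeighborsClassifier(n_neighbors=5).fit(X, y)
clf.score(X, y, sample_weight=w)  # sample_weight is now forwarded to accuracy_score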

python/cuml/cuml/metrics/CMakeLists.txt

-1

@@ -15,7 +15,6 @@


 set(cython_sources "")
-add_module_gpu_default("accuracy.pyx" ${accuracy_algo} ${metrics_algo})
 add_module_gpu_default("hinge_loss.pyx" ${hinge_loss_algo} ${metrics_algo})
 add_module_gpu_default("kl_divergence.pyx" ${kl_divergence_algo} ${metrics_algo})
 add_module_gpu_default("pairwise_distances.pyx" ${pairwise_distances_algo} ${metrics_algo})

python/cuml/cuml/metrics/__init__.py

+2 -3

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,11 +19,10 @@
 from cuml.metrics.regression import mean_squared_error
 from cuml.metrics.regression import mean_squared_log_error
 from cuml.metrics.regression import mean_absolute_error
-from cuml.metrics.accuracy import accuracy_score
 from cuml.metrics.cluster.adjusted_rand_index import adjusted_rand_score
 from cuml.metrics._ranking import roc_auc_score
 from cuml.metrics._ranking import precision_recall_curve
-from cuml.metrics._classification import log_loss
+from cuml.metrics._classification import log_loss, accuracy_score
 from cuml.metrics.cluster.homogeneity_score import (
     cython_homogeneity_score as homogeneity_score,
 )

python/cuml/cuml/metrics/_classification.py

+107 -2

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,16 +13,121 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import warnings

-from cuml.internals.input_utils import input_to_cupy_array
 import cuml.internals
+from cuml.internals.input_utils import input_to_cupy_array
 from cuml.internals.safe_imports import cpu_only_import
 from cuml.internals.safe_imports import gpu_only_import

 cp = gpu_only_import("cupy")
+cudf = gpu_only_import("cudf")
 np = cpu_only_import("numpy")


+def _input_to_cupy_or_cudf_series(x, check_rows=None):
+    """Coerce the input to a 1D cupy array or cudf Series.
+
+    For classification problems we need to support the full range
+    of supported input dtypes. cupy cannot support string labels,
+    and cudf cannot support float16. To handle this, we prefer cudf
+    if the input is cudf, otherwise try to coerce to cupy, falling
+    back to cudf if the dtype isn't supported.
+    """
+    if isinstance(x, cudf.Series):
+        # Drop the index so comparisons don't try to align on index
+        out = x.reset_index(drop=True)
+        n_cols = 1
+    else:
+        try:
+            out, _, n_cols, _ = input_to_cupy_array(x)
+            out = out.squeeze()  # ensure 1D
+        except ValueError:
+            # Unsupported dtype, use cudf instead
+            # Drop the index so comparisons don't try to align on index
+            out = cudf.Series(x, nan_as_null=False, copy=False).reset_index(
+                drop=True
+            )
+            n_cols = 1
+
+    n_rows = len(out)
+
+    if n_cols > 1:
+        raise ValueError(f"Expected 1 column but got {n_cols} columns.")
+    if check_rows is not None and n_rows != check_rows:
+        raise ValueError(f"Expected {check_rows} rows but got {n_rows} rows.")
+
+    return out
+
+
+@cuml.internals.api_return_any()
+def accuracy_score(
+    y_true, y_pred, *, sample_weight=None, normalize=True, **kwargs
+):
+    """
+    Accuracy classification score.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        Ground truth (correct) labels.
+    y_pred : array-like of shape (n_samples,)
+        Predicted labels.
+    sample_weight : array-like of shape (n_samples,)
+        Sample weights.
+    normalize : bool
+        If ``False``, return the number of correctly classified samples.
+        Otherwise, return the fraction of correctly classified samples.
+
+    Returns
+    -------
+    score : float
+        The fraction of correctly classified samples, or the number of correctly
+        classified samples if ``normalize == False``.
+    """
+
+    if kwargs:
+        warnings.warn(
+            "`convert_dtype` and `handle` were deprecated from `accuracy_score` "
+            "in version 25.04 and will be removed in 25.06.",
+            FutureWarning,
+        )
+
+    y_true = _input_to_cupy_or_cudf_series(y_true)
+    y_pred = _input_to_cupy_or_cudf_series(y_pred, check_rows=len(y_true))
+
+    # Categorical dtypes in cudf currently don't coerce nicely on equality,
+    # we need to manually cast to cudf.Series and align dtypes.
+    # This whole code block can be removed once
+    # https://github.com/rapidsai/cudf/issues/18196 is resolved.
+    if y_true.dtype == "category":
+        if y_pred.dtype != y_true.dtype:
+            y_pred = cudf.Series(y_pred, copy=False, nan_as_null=False).astype(
+                y_true.dtype
+            )
+    elif y_pred.dtype == "category":
+        y_true = cudf.Series(y_true, copy=False, nan_as_null=False).astype(
+            y_pred.dtype
+        )
+
+    if sample_weight is not None:
+        sample_weight = input_to_cupy_array(
+            sample_weight,
+            check_dtype=[np.float32, np.float64, np.int32, np.int64],
+            check_cols=1,
+            check_rows=len(y_true),
+        ).array.squeeze()  # ensure 1D
+
+    correct = y_true == y_pred
+
+    if normalize:
+        return float(cp.average(correct, weights=sample_weight))
+    elif sample_weight is not None:
+        return float(cp.dot(correct, sample_weight))
+    else:
+        return float(cp.count_nonzero(correct))
+
+
 @cuml.internals.api_return_any()
 def log_loss(
     y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None
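
To make the `normalize`/`sample_weight` branches at the end of the new implementation concrete, here is a small worked check of the same cupy expressions (requires a GPU with cupy available; the arrays are made up for illustration):

import cupy as cp

correct = cp.asarray([True, True, False, True])
w = cp.asarray([1.0, 1.0, 0.5, 1.0])

float(cp.average(correct, weights=w))  # normalize=True              -> 3.0 / 3.5 ~= 0.857
float(cp.dot(correct, w))              # normalize=False, weighted   -> 3.0
float(cp.count_nonzero(correct))       # normalize=False, unweighted -> 3.0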

python/cuml/cuml/metrics/accuracy.pyx

-83
This file was deleted.

python/cuml/cuml/tests/test_kneighbors_classifier.py

+11 -2

@@ -99,7 +99,8 @@ def test_neighborhood_predictions(
 @pytest.mark.parametrize("ncols", [50, 100])
 @pytest.mark.parametrize("n_neighbors", [2, 5, 10])
 @pytest.mark.parametrize("n_clusters", [2, 5, 10])
-def test_score(nrows, ncols, n_neighbors, n_clusters, datatype):
+@pytest.mark.parametrize("weighted", [False, True])
+def test_score(nrows, ncols, n_neighbors, n_clusters, datatype, weighted):

     X, y = make_blobs(
         n_samples=nrows,
@@ -112,10 +113,18 @@
     X = X.astype(np.float32)
     X_train, X_test, y_train, y_test = _build_train_test_data(X, y, datatype)

+    if weighted:
+        sample_weight = np.random.default_rng(42).uniform(
+            0.5, 1, size=len(X_test)
+        )
+    else:
+        sample_weight = None
+
     knn_cu = cuKNN(n_neighbors=n_neighbors)
     knn_cu.fit(X_train, y_train)

-    assert knn_cu.score(X_test, y_test) >= (1.0 - 0.004)
+    score = knn_cu.score(X_test, y_test, sample_weight=sample_weight)
+    assert score >= (1.0 - 0.004)


 @pytest.mark.parametrize("datatype", ["dataframe", "numpy"])

python/cuml/cuml/tests/test_meta_estimators.py

+2 -2

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,7 +35,7 @@ def test_pipeline():
     pipe = Pipeline(steps=[("scaler", StandardScaler()), ("svc", SVC())])
     pipe.fit(X_train, y_train)
     score = pipe.score(X_test, y_test)
-    assert score > 0.8
+    assert score > 0.75


 def test_gridsearchCV():
