From ac745c09962cba79f28552b229b5c3a602149fc1 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 20 Sep 2022 13:09:25 +0200 Subject: [PATCH 01/16] Bump version --- autosklearn/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/__version__.py b/autosklearn/__version__.py index a6b8488c18..810f877681 100644 --- a/autosklearn/__version__.py +++ b/autosklearn/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.15.0" +__version__ = "0.16.0dev" From f160d7d82b402c0297cf9d1c6fd22ca47ff117be Mon Sep 17 00:00:00 2001 From: Shantam Gilra <64306405+shantam-8@users.noreply.github.com> Date: Mon, 10 Oct 2022 11:03:36 +0100 Subject: [PATCH 02/16] Proposed changes for ``test_metrics.py`` (#1577) * Trial pytest changes * Updated tests * Fixing errors and repetition * Updating tests * Proposed updates * Removing TestMetrics class * Update test_metrics.py --- test/test_metric/test_metrics.py | 859 ++++++++++++++----------------- 1 file changed, 380 insertions(+), 479 deletions(-) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 4443024c4b..2edc7c066c 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -2,7 +2,6 @@ import numpy as np import sklearn.metrics -from smac.utils.constants import MAXINT import autosklearn.metrics from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION @@ -45,425 +44,340 @@ def dummy_metric(y_true, y_pred, X_data=None, **kwargs): ) scorer_nox(y_true, y_pred, X_data=np.array([32])) - def test_predict_scorer_binary(self): - y_true = np.array([0, 0, 1, 1]) - y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) - - scorer = autosklearn.metrics._PredictScorer( - "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0) - - y_pred = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.5) - - y_pred = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.5) - - scorer = autosklearn.metrics._PredictScorer( - "bac", sklearn.metrics.balanced_accuracy_score, 1, 0, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.5) - - scorer = autosklearn.metrics._PredictScorer( - name="accuracy", - score_func=sklearn.metrics.accuracy_score, - optimum=1, - worst_possible_result=0, - sign=-1, - kwargs={}, - ) - - y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -1.0) - - def test_predict_scorer_multiclass(self): - y_true = np.array([0, 1, 2]) - y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) - - scorer = autosklearn.metrics._PredictScorer( - "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0) - - y_pred = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.333333333) - - y_pred = np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.333333333) - - scorer = autosklearn.metrics._PredictScorer( - "bac", sklearn.metrics.balanced_accuracy_score, 1, 0, 1, {} - ) - - score = scorer(y_true, 
y_pred) - self.assertAlmostEqual(score, 0.333333333) - - scorer = autosklearn.metrics._PredictScorer( - "accuracy", sklearn.metrics.accuracy_score, 1, 0, -1, {} - ) - - y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -1.0) - - def test_predict_scorer_multilabel(self): - y_true = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) - y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) - - scorer = autosklearn.metrics._PredictScorer( - "accuracy", sklearn.metrics.accuracy_score, 1, 0, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0) - - y_pred = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.25) - - y_pred = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.25) - - scorer = autosklearn.metrics._PredictScorer( - "accuracy", sklearn.metrics.accuracy_score, 1, 0, -1, {} - ) - - y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -1.0) - - def test_predict_scorer_regression(self): - y_true = np.arange(0, 1.01, 0.1) - y_pred = y_true.copy() - - scorer = autosklearn.metrics._PredictScorer( - "r2", sklearn.metrics.r2_score, 1, 0, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0) - - y_pred = np.ones(y_true.shape) * np.mean(y_true) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.0) - - def test_proba_scorer_binary(self): - y_true = [0, 0, 1, 1] - y_pred = [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]] - - scorer = autosklearn.metrics._ProbaScorer( - "log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.0) - - y_pred = [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]] - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.69314718055994529) - - y_pred = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]] - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.69314718055994529) - - scorer = autosklearn.metrics._ProbaScorer( - "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {} - ) - - y_pred = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]] - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -0.69314718055994529) - - def test_proba_scorer_multiclass(self): - y_true = [0, 1, 2] - y_pred = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] - - scorer = autosklearn.metrics._ProbaScorer( - "log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.0) - - y_pred = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0986122886681098) - - y_pred = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]] - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0986122886681096) - - scorer = autosklearn.metrics._ProbaScorer( - "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {} - ) - - y_pred = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]] - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -1.0986122886681096) - - def test_proba_scorer_multilabel(self): - y_true = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) - y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) - - scorer = autosklearn.metrics._ProbaScorer( - 
"log_loss", sklearn.metrics.log_loss, 0, MAXINT, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.34657359027997314) - - y_pred = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.69314718055994529) - - y_pred = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.69314718055994529) - - scorer = autosklearn.metrics._ProbaScorer( - "log_loss", sklearn.metrics.log_loss, 0, MAXINT, -1, {} - ) - - y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -0.34657359027997314) - def test_threshold_scorer_binary(self): - y_true = [0, 0, 1, 1] - y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) - - scorer = autosklearn.metrics._ThresholdScorer( - "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0) - - y_pred = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.5) - - y_pred = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.5) - - scorer = autosklearn.metrics._ThresholdScorer( - "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, -1, {} - ) - - y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -1.0) - - def test_threshold_scorer_multilabel(self): - y_true = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) - y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) - - scorer = autosklearn.metrics._ThresholdScorer( - "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, 1, {} - ) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0) - - y_pred = np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.5) - - y_pred = np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 0.5) - - scorer = autosklearn.metrics._ThresholdScorer( - "roc_auc", sklearn.metrics.roc_auc_score, 1, 0, -1, {} - ) - - y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -1.0) - - def test_sign_flip(self): - y_true = np.arange(0, 1.01, 0.1) - y_pred = y_true.copy() - - scorer = autosklearn.metrics.make_scorer( - "r2", sklearn.metrics.r2_score, greater_is_better=True - ) - - score = scorer(y_true, y_pred + 1.0) - self.assertAlmostEqual(score, -9.0) - - score = scorer(y_true, y_pred + 0.5) - self.assertAlmostEqual(score, -1.5) - - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, 1.0) - - scorer = autosklearn.metrics.make_scorer( - "r2", sklearn.metrics.r2_score, greater_is_better=False - ) - - score = scorer(y_true, y_pred + 1.0) - self.assertAlmostEqual(score, 9.0) - - score = scorer(y_true, y_pred + 0.5) - self.assertAlmostEqual(score, 1.5) +@pytest.mark.parametrize( + "y_pred, y_true, scorer, expected_score", + [ + ( + np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]), + np.array([0, 0, 1, 1]), + autosklearn.metrics.accuracy, + 1.0, + ), + ( + np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]), + np.array([0, 0, 1, 1]), + autosklearn.metrics.accuracy, + 0.5, + ), + ( + np.array([[1.0, 1.0], [1.0, 
1.0], [1.0, 1.0]]),
+            np.array([0, 0, 1, 1]),
+            autosklearn.metrics.balanced_accuracy,
+            0.5,
+        ),
+        (
+            np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]),
+            np.array([0, 1, 2]),
+            autosklearn.metrics.accuracy,
+            1.0,
+        ),
+        (
+            np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]),
+            np.array([0, 1, 2]),
+            autosklearn.metrics.accuracy,
+            0.333333333,
+        ),
+        (
+            np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]),
+            np.array([0, 1, 2]),
+            autosklearn.metrics.accuracy,
+            0.333333333,
+        ),
+        (
+            np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]),
+            np.array([0, 1, 2]),
+            autosklearn.metrics.balanced_accuracy,
+            0.333333333,
+        ),
+        (
+            np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]),
+            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]),
+            autosklearn.metrics.accuracy,
+            1.0,
+        ),
+        (
+            np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
+            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]),
+            autosklearn.metrics.accuracy,
+            0.25,
+        ),
+        (
+            np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]),
+            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]),
+            autosklearn.metrics.accuracy,
+            0.25,
+        ),
+        (
+            np.arange(0, 1.01, 0.1),
+            np.arange(0, 1.01, 0.1),
+            autosklearn.metrics.r2,
+            1.0,
+        ),
+        (
+            np.ones(np.arange(0, 1.01, 0.1).shape) * np.mean(np.arange(0, 1.01, 0.1)),
+            np.arange(0, 1.01, 0.1),
+            autosklearn.metrics.r2,
+            0.0,
+        ),
+        (
+            np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]),
+            np.array([0, 0, 1, 1]),
+            autosklearn.metrics.log_loss,
+            0.0,
+        ),
+        (
+            np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]),
+            np.array([0, 1, 2]),
+            autosklearn.metrics.log_loss,
+            0.0,
+        ),
+        (
+            np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]),
+            np.array([0, 0, 1, 1]),
+            autosklearn.metrics.roc_auc,
+            1.0,
+        ),
+        (
+            np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
+            np.array([0, 0, 1, 1]),
+            autosklearn.metrics.roc_auc,
+            0.5,
+        ),
+        (
+            np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]),
+            np.array([0, 0, 1, 1]),
+            autosklearn.metrics.roc_auc,
+            0.5,
+        ),
+        (
+            np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]),
+            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]),
+            autosklearn.metrics.roc_auc,
+            1.0,
+        ),
+        (
+            np.array([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]),
+            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]),
+            autosklearn.metrics.roc_auc,
+            0.5,
+        ),
+        (
+            np.array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]),
+            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]),
+            autosklearn.metrics.roc_auc,
+            0.5,
+        ),
+    ],
+)
+def test_scorer(
+    y_pred: np.ndarray,
+    y_true: np.ndarray,
+    scorer: autosklearn.metrics.Scorer,
+    expected_score: float,
+) -> None:
+    """
+    Expects
+    -------
+    * The expected scores match those produced by the corresponding scorers.
+    """
+    result_score = scorer(y_true, y_pred)
+    assert expected_score == pytest.approx(result_score)
+
+
+@pytest.mark.parametrize(
+    "y_pred, y_true, expected_score",
+    [
+        (
+            np.arange(0, 1.01, 0.1) + 1.0,
+            np.arange(0, 1.01, 0.1),
+            -9.0,
+        ),
+        (
+            np.arange(0, 1.01, 0.1) + 0.5,
+            np.arange(0, 1.01, 0.1),
+            -1.5,
+        ),
+        (
+            np.arange(0, 1.01, 0.1),
+            np.arange(0, 1.01, 0.1),
+            1.0,
+        ),
+    ],
+)
+def test_sign_flip(
+    y_pred: np.ndarray,
+    y_true: np.ndarray,
+    expected_score: float,
+) -> None:
+    """
+    Expects
+    -------
+    * Flipping greater_is_better for r2_score results in flipped signs of its output.
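+    * I.e. greater_false_scorer(y_true, y_pred) == -greater_true_scorer(y_true, y_pred).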
+ """ + greater_true_scorer = autosklearn.metrics.make_scorer( + "r2", sklearn.metrics.r2_score, greater_is_better=True + ) + greater_true_score = greater_true_scorer(y_true, y_pred) + assert expected_score == pytest.approx(greater_true_score) - score = scorer(y_true, y_pred) - self.assertAlmostEqual(score, -1.0) + greater_false_scorer = autosklearn.metrics.make_scorer( + "r2", sklearn.metrics.r2_score, greater_is_better=False + ) + greater_false_score = greater_false_scorer(y_true, y_pred) + assert (expected_score * -1.0) == pytest.approx(greater_false_score) + + +def test_regression_metrics(): + """ + Expects + ------- + * Test metrics do not change output for autosklearn.metrics.REGRESSION_METRICS. + """ + for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): + y_true = np.random.random(100).reshape((-1, 1)) + y_pred = y_true.copy() + np.random.randn(100, 1) * 0.1 + + if metric == "mean_squared_log_error": + y_true = np.abs(y_true) + y_pred = np.abs(y_pred) + + y_true_2 = y_true.copy() + y_pred_2 = y_pred.copy() + assert np.isfinite(scorer(y_true_2, y_pred_2)) + np.testing.assert_array_almost_equal(y_true, y_true_2, err_msg=metric) + np.testing.assert_array_almost_equal(y_pred, y_pred_2, err_msg=metric) + + +def test_classification_metrics(): + """ + Expects + ------- + * Test metrics do not change output for autosklearn.metrics.CLASSIFICATION_METRICS. + """ + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + y_true = np.random.randint(0, 2, size=(100, 1)) + y_pred = np.random.random(200).reshape((-1, 2)) + y_pred = np.array([y_pred[i] / np.sum(y_pred[i]) for i in range(100)]) + + y_true_2 = y_true.copy() + y_pred_2 = y_pred.copy() + try: + assert np.isfinite(scorer(y_true_2, y_pred_2)) + np.testing.assert_array_almost_equal(y_true, y_true_2, err_msg=metric) + np.testing.assert_array_almost_equal(y_pred, y_pred_2, err_msg=metric) + except ValueError as e: + if ( + e.args[0] == "Samplewise metrics are not available outside" + " of multilabel classification." + ): + pass + else: + raise e + + +def test_regression_all(): + """ + Expects + ------- + * Correct scores from REGRESSION_METRICS. + """ + for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): + if scorer.name == "mean_squared_log_error": + continue + y_true = np.array([1, 2, 3, 4]) -class TestMetricsDoNotAlterInput(unittest.TestCase): - def test_regression_metrics(self): - for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): - y_true = np.random.random(100).reshape((-1, 1)) - y_pred = y_true.copy() + np.random.randn(100, 1) * 0.1 + y_pred_list = [ + np.array([1, 2, 3, 4]), + np.array([3, 4, 5, 6]), + np.array([-1, 0, -1, 0]), + np.array([-5, 10, 7, -3]), + ] - if metric == "mean_squared_log_error": - y_true = np.abs(y_true) - y_pred = np.abs(y_pred) + score_list = [scorer(y_true, y_pred) for y_pred in y_pred_list] + + assert scorer._optimum == pytest.approx(score_list[0]) + assert score_list == sorted(score_list, reverse=True) + + +def test_classification_binary(): + """ + Expects + ------- + * Correct scores from CLASSIFICATION_METRICS for binary classification. + """ + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for binary classification. + # TODO: Average precision should work for binary classification, + # TODO: but its behavior is not right. When y_pred is completely + # TODO: wrong, it does return 0.5, but when it is not completely + # TODO: wrong, it returns value smaller than 0.5. 
+ if metric in [ + "average_precision", + "precision_samples", + "recall_samples", + "f1_samples", + ]: + continue - y_true_2 = y_true.copy() - y_pred_2 = y_pred.copy() - self.assertTrue(np.isfinite(scorer(y_true_2, y_pred_2))) - np.testing.assert_array_almost_equal(y_true, y_true_2, err_msg=metric) - np.testing.assert_array_almost_equal(y_pred, y_pred_2, err_msg=metric) + y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - def test_classification_metrics(self): - for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): - y_true = np.random.randint(0, 2, size=(100, 1)) - y_pred = np.random.random(200).reshape((-1, 2)) - y_pred = np.array([y_pred[i] / np.sum(y_pred[i]) for i in range(100)]) - - y_true_2 = y_true.copy() - y_pred_2 = y_pred.copy() - try: - self.assertTrue(np.isfinite(scorer(y_true_2, y_pred_2))) - np.testing.assert_array_almost_equal(y_true, y_true_2, err_msg=metric) - np.testing.assert_array_almost_equal(y_pred, y_pred_2, err_msg=metric) - except ValueError as e: - if ( - e.args[0] == "Samplewise metrics are not available outside" - " of multilabel classification." - ): - pass - else: - raise e - - -class TestMetric(unittest.TestCase): - def test_regression_all(self): - - for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): - y_true = np.array([1, 2, 3, 4]) - y_pred = y_true.copy() - previous_score = scorer._optimum - current_score = scorer(y_true, y_pred) - self.assertAlmostEqual(current_score, previous_score) - - y_pred = np.array([3, 4, 5, 6]) - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - if scorer.name == "mean_squared_log_error": - continue - - y_pred = np.array([-1, 0, -1, 0]) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - y_pred = np.array([-5, 10, 7, -3]) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - def test_classification_binary(self): - - for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): - # Skip functions not applicable for binary classification. - # TODO: Average precision should work for binary classification, - # TODO: but its behavior is not right. When y_pred is completely - # TODO: wrong, it does return 0.5, but when it is not completely - # TODO: wrong, it returns value smaller than 0.5. 
- if metric in [ - "average_precision", - "precision_samples", - "recall_samples", - "f1_samples", - ]: - continue - - y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - y_pred = np.array( + y_pred_list = [ + np.array( [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] - ) - previous_score = scorer._optimum - current_score = scorer(y_true, y_pred) - self.assertAlmostEqual(current_score, previous_score) - - y_pred = np.array( + ), + np.array( [[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]] - ) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - y_pred = np.array( + ), + np.array( [[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]] - ) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - y_pred = np.array( + ), + np.array( [[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]] - ) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - def test_classification_multiclass(self): - # The last check in this test has a mismatch between the number of - # labels predicted in y_pred and the number of labels in y_true. - # This triggers several warnings but we are aware. - # - # TODO convert to pytest with fixture - # - # This test should be parameterized so we can identify which metrics - # cause which warning specifically and rectify if needed. - ignored_warnings = [(UserWarning, "y_pred contains classes not in y_true")] - - for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): - # Skip functions not applicable for multiclass classification. - if metric in [ - "roc_auc", - "average_precision", - "precision", - "recall", - "f1", - "precision_samples", - "recall_samples", - "f1_samples", - ]: - continue - - y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) - - y_pred = np.array( + ), + ] + + score_list = [scorer(y_true, y_pred) for y_pred in y_pred_list] + + assert scorer._optimum == pytest.approx(score_list[0]) + assert score_list == sorted(score_list, reverse=True) + + +def test_classification_multiclass(): + """ + Expects + ------- + * Correct scores from CLASSIFICATION_METRICS for multiclass classification. + """ + # The last check in this test has a mismatch between the number of + # labels predicted in y_pred and the number of labels in y_true. + # This triggers several warnings but we are aware. + # + # TODO convert to pytest with fixture + # + # This test should be parameterized so we can identify which metrics + # cause which warning specifically and rectify if needed. + ignored_warnings = [(UserWarning, "y_pred contains classes not in y_true")] + + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for multiclass classification. 
+ if metric in [ + "roc_auc", + "average_precision", + "precision", + "recall", + "f1", + "precision_samples", + "recall_samples", + "f1_samples", + ]: + continue + + y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) + + y_pred_list = [ + np.array( [ [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], @@ -471,12 +385,8 @@ def test_classification_multiclass(self): [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], ] - ) - previous_score = scorer._optimum - current_score = scorer(y_true, y_pred) - self.assertAlmostEqual(current_score, previous_score) - - y_pred = np.array( + ), + np.array( [ [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], @@ -484,12 +394,8 @@ def test_classification_multiclass(self): [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], ] - ) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - y_pred = np.array( + ), + np.array( [ [0.0, 0.0, 1.0], [0.0, 1.0, 0.0], @@ -497,12 +403,8 @@ def test_classification_multiclass(self): [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], ] - ) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - y_pred = np.array( + ), + np.array( [ [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], @@ -510,59 +412,58 @@ def test_classification_multiclass(self): [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], ] - ) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - # less labels in the targets than in the predictions - y_true = np.array([0.0, 0.0, 1.0, 1.0]) - y_pred = np.array( - [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] - ) + ), + ] + + score_list = [scorer(y_true, y_pred) for y_pred in y_pred_list] + + assert scorer._optimum == pytest.approx(score_list[0]) + assert score_list == sorted(score_list, reverse=True) + + # less labels in the targets than in the predictions + y_true = np.array([0.0, 0.0, 1.0, 1.0]) + y_pred = np.array( + [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] + ) + + with warnings.catch_warnings(): + for category, message in ignored_warnings: + warnings.filterwarnings("ignore", category=category, message=message) + + score = scorer(y_true, y_pred) + assert np.isfinite(score) + + +def test_classification_multilabel(): + """ + Expects + ------- + * Correct scores from CLASSIFICATION_METRICS for multi-label classification. + """ + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for multi-label classification. + if metric in [ + "roc_auc", + "log_loss", + "precision", + "recall", + "f1", + "balanced_accuracy", + ]: + continue + y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) + + y_pred_list = [ + np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]), + np.array([[1, 0, 0], [0, 0, 1], [0, 1, 1], [1, 1, 1]]), + np.array([[1, 0, 0], [0, 0, 1], [1, 0, 1], [1, 1, 0]]), + np.array([[0, 1, 1], [0, 0, 1], [1, 0, 0], [0, 0, 0]]), + ] + + score_list = [scorer(y_true, y_pred) for y_pred in y_pred_list] - with warnings.catch_warnings(): - for category, message in ignored_warnings: - warnings.filterwarnings( - "ignore", category=category, message=message - ) - - score = scorer(y_true, y_pred) - self.assertTrue(np.isfinite(score)) - - def test_classification_multilabel(self): - - for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): - # Skip functions not applicable for multi-label classification. 
- if metric in [ - "roc_auc", - "log_loss", - "precision", - "recall", - "f1", - "balanced_accuracy", - ]: - continue - y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) - y_pred = y_true.copy() - previous_score = scorer._optimum - current_score = scorer(y_true, y_pred) - self.assertAlmostEqual(current_score, previous_score) - - y_pred = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 1], [1, 1, 1]]) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - y_pred = np.array([[1, 0, 0], [0, 0, 1], [1, 0, 1], [1, 1, 0]]) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) - - y_pred = np.array([[0, 1, 1], [0, 0, 1], [1, 0, 0], [0, 0, 0]]) - previous_score = current_score - current_score = scorer(y_true, y_pred) - self.assertLess(current_score, previous_score) + assert scorer._optimum == pytest.approx(score_list[0]) + assert score_list == sorted(score_list, reverse=True) class TestCalculateScore(unittest.TestCase): From 313f5fbc131b888070404e9e05c105eb0d2a6c6a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Oct 2022 12:05:08 +0200 Subject: [PATCH 03/16] Bump actions/stale from 5 to 6 (#1588) Bumps [actions/stale](https://github.com/actions/stale) from 5 to 6. - [Release notes](https://github.com/actions/stale/releases) - [Changelog](https://github.com/actions/stale/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/stale/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/stale dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/stale.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index 5d24ae0627..f5232d347e 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -9,7 +9,7 @@ jobs: stale: runs-on: ubuntu-latest steps: - - uses: actions/stale@v5 + - uses: actions/stale@v6 with: days-before-stale: 60 days-before-close: 7 From cc047d651373cc4c3c34afa555f242274565e33b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Oct 2022 12:05:21 +0200 Subject: [PATCH 04/16] Bump actions/checkout from 2 to 3.1.0 (#1592) Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 3.1.0. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v3.1.0) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/citation_cff.yml | 2 +- .github/workflows/dist.yml | 2 +- .github/workflows/docker-publish.yml | 2 +- .github/workflows/docs.yml | 2 +- .github/workflows/generate-baselines.yml | 2 +- .github/workflows/pre-commit-update.yml | 2 +- .github/workflows/pre-commit.yaml | 2 +- .github/workflows/pytest.yml | 2 +- .github/workflows/regressions.yml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/citation_cff.yml b/.github/workflows/citation_cff.yml index 6851c52d38..d3a5659aa8 100644 --- a/.github/workflows/citation_cff.yml +++ b/.github/workflows/citation_cff.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out a copy of the repository - uses: actions/checkout@v3 + uses: actions/checkout@v3.1.0 - name: Check whether the citation metadata from CITATION.cff is valid uses: citation-file-format/cffconvert-github-action@2.0.0 diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 07ad9366a2..1053f3dda9 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -22,7 +22,7 @@ jobs: steps: - name: Check out the repo - uses: actions/checkout@v3 + uses: actions/checkout@v3.1.0 with: submodules: recursive diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index a884e5f613..1c849db2a1 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -22,7 +22,7 @@ jobs: steps: - name: Check out the repo - uses: actions/checkout@v3 + uses: actions/checkout@v3.1.0 with: submodules: recursive diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 83510c5483..c87f900302 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -23,7 +23,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v3.1.0 with: submodules: recursive diff --git a/.github/workflows/generate-baselines.yml b/.github/workflows/generate-baselines.yml index 5149dd57d8..fd6f45b381 100644 --- a/.github/workflows/generate-baselines.yml +++ b/.github/workflows/generate-baselines.yml @@ -64,7 +64,7 @@ jobs: python-version: ${{ steps.python-version.outputs.value }} - name: Checkout Automlbenchmark - uses: actions/checkout@v2 + uses: actions/checkout@v3.1.0 with: repository: ${{ env.AUTOMLBENCHMARK_REPO }} ref: ${{ env.AUTOMLBENCHMARK_REF }} diff --git a/.github/workflows/pre-commit-update.yml b/.github/workflows/pre-commit-update.yml index 3bfede916f..09db790a7f 100644 --- a/.github/workflows/pre-commit-update.yml +++ b/.github/workflows/pre-commit-update.yml @@ -11,7 +11,7 @@ jobs: auto-update: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3.1.0 - uses: actions/setup-python@v2 diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index c7e5b94438..9964a287e7 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -20,7 +20,7 @@ jobs: run-all-files: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3.1.0 with: submodules: recursive diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 64e1f26b5b..76c8aac3ed 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -74,7 +74,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: 
actions/checkout@v3.1.0 with: submodules: recursive diff --git a/.github/workflows/regressions.yml b/.github/workflows/regressions.yml index 8bb0addcf4..942b0253d2 100644 --- a/.github/workflows/regressions.yml +++ b/.github/workflows/regressions.yml @@ -82,7 +82,7 @@ jobs: # branch: the branch name - name: Checkout Automlbenchmark - uses: actions/checkout@v3 + uses: actions/checkout@v3.1.0 with: repository: ${{ env.AUTOMLBENCHMARK_REPO }} ref: ${{ env.AUTOMLBENCHMARK_REF }} From 5c69ddf4584c5c7c4977203a1a579d042c6e3716 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 10 Oct 2022 12:06:13 +0200 Subject: [PATCH 05/16] chore: update pre-commit hooks (#1580) Co-authored-by: eddiebergman --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b296a920cb..baf26a9ee3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: files: test/.* - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 22.10.0 hooks: - id: black name: black formatter autosklearn @@ -39,7 +39,7 @@ repos: additional_dependencies: ["toml"] # Needed to parse pyproject.toml - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.971 + rev: v0.982 hooks: - id: mypy name: mypy auto-sklearn From 305a3ab152241c1f97b7ac239ad09b66cfe81c57 Mon Sep 17 00:00:00 2001 From: Aron Bahram Date: Mon, 14 Nov 2022 09:26:21 +0100 Subject: [PATCH 06/16] Fix link checker make command in CONTRIBUTE.md (#1608) --- CONTRIBUTING.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 73ce781618..6408e56628 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -252,10 +252,11 @@ Lastly, if the feature really is a game changer or you're very proud of it, cons make doc ``` * If you're unfamiliar with sphinx, it's a documentation generator which can read comments and docstrings from within the code and generate html documentation. - * If you've added documentation, we also has a command `linkcheck` for making sure all the links correctly go to some destination. + * If you've added documentation, we also have a command `links` for making sure + all the links correctly go to some destination. This helps tests for dead links or accidental typos. ```bash - make linkcheck + make links ``` * We also use sphinx-gallery which can take python files (such as those in the `examples` folder) and run them, creating html which shows the code and the output it generates. ```bash @@ -396,7 +397,7 @@ Lastly, if the feature really is a game changer or you're very proud of it, cons # If you changed documentation: # This will generate all documentation and check links make doc - make linkcheck + make links make examples # mainly needed if you modified some examples # ... 
fix any issues

From 6a97f729b971df0db3c1d590b94985c7349a1c5e Mon Sep 17 00:00:00 2001
From: Aron Bahram
Date: Mon, 14 Nov 2022 09:30:36 +0100
Subject: [PATCH 07/16] Show progress bar while fitting to training data
 (#1606)

* Show progress bar while fitting to training data
* Minor fixes for progress bar
* Revert accidental changes to requirements.txt
* Document changes
* Skip type checks for tqdm
* Make progress bar more flexible with kwargs
* Fix link checker make command in CONTRIBUTE.md
* Update doc link to be sphinx compatible
* Switch to pytest-forked from pytest-xdist

Co-authored-by: Eddie Bergman
---
 CONTRIBUTING.md                   |  4 +-
 autosklearn/automl.py             | 10 +++++
 autosklearn/estimators.py         |  7 ++++
 autosklearn/experimental/askl2.py |  6 +++
 autosklearn/util/progress_bar.py  | 68 +++++++++++++++++++++++++++++++
 pyproject.toml                    |  3 +-
 requirements.txt                  |  3 +-
 setup.py                          |  2 +-
 8 files changed, 98 insertions(+), 5 deletions(-)
 create mode 100644 autosklearn/util/progress_bar.py

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6408e56628..dfffc2fcf1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -252,8 +252,8 @@ Lastly, if the feature really is a game changer or you're very proud of it, cons
   make doc
   ```
 * If you're unfamiliar with sphinx, it's a documentation generator which can read comments and docstrings from within the code and generate html documentation.
-  * If you've added documentation, we also have a command `links` for making sure
-    all the links correctly go to some destination.
+  * If you've added documentation, we also have a command `links` for making
+    sure all the links correctly go to some destination.
     This helps tests for dead links or accidental typos.
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
index e242fbbc08..93fde84330 100644
--- a/autosklearn/automl.py
+++ b/autosklearn/automl.py
@@ -120,6 +120,7 @@
     warnings_to,
 )
 from autosklearn.util.parallel import preload_modules
+from autosklearn.util.progress_bar import ProgressBar
 from autosklearn.util.smac_wrap import SMACCallback, SmacRunCallback
 from autosklearn.util.stopwatch import StopWatch
@@ -239,6 +240,7 @@ def __init__(
         get_trials_callback: SMACCallback | None = None,
         dataset_compression: bool | Mapping[str, Any] = True,
         allow_string_features: bool = True,
+        disable_progress_bar: bool = False,
     ):
         super().__init__()
@@ -295,6 +297,7 @@
         self.logging_config = logging_config
         self.precision = precision
         self.allow_string_features = allow_string_features
+        self.disable_progress_bar = disable_progress_bar
         self._initial_configurations_via_metalearning = (
             initial_configurations_via_metalearning
         )
@@ -626,6 +629,12 @@
         # By default try to use the TCP logging port or get a new port
         self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT
+        progress_bar = ProgressBar(
+            total=self._time_for_task,
+            disable=self.disable_progress_bar,
+            desc="Fitting to the training data",
+            colour="green",
+        )
         # Once we start the logging server, it starts in a new process
         # If an error occurs then we want to make sure that we exit cleanly
         # and shut it down, else it might hang
@@ -961,6 +970,7 @@
             self._logger.exception(e)
             raise e
         finally:
+            progress_bar.stop()
             self._fit_cleanup()

         self.fitted = True
diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py
index 1a094d2582..577265239e 100644
--- a/autosklearn/estimators.py
+++ b/autosklearn/estimators.py
@@ -76,6 +76,7 @@ def __init__(
         get_trials_callback: SMACCallback | None = None,
         dataset_compression: Union[bool, 
Mapping[str, Any]] = True, allow_string_features: bool = True, + disable_progress_bar: bool = False, ): """ Parameters @@ -381,6 +382,10 @@ def __init__( Whether autosklearn should process string features. By default the textpreprocessing is enabled. + disable_progress_bar: bool = False + Whether to disable the progress bar that is displayed in the console + while fitting to the training data. + Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays @@ -475,6 +480,7 @@ def __init__( self.get_trials_callback = get_trials_callback self.dataset_compression = dataset_compression self.allow_string_features = allow_string_features + self.disable_progress_bar = disable_progress_bar self.automl_ = None # type: Optional[AutoML] @@ -525,6 +531,7 @@ def build_automl(self): get_trials_callback=self.get_trials_callback, dataset_compression=self.dataset_compression, allow_string_features=self.allow_string_features, + disable_progress_bar=self.disable_progress_bar, ) return automl diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py index 317f0be5b1..b712ba484e 100644 --- a/autosklearn/experimental/askl2.py +++ b/autosklearn/experimental/askl2.py @@ -166,6 +166,7 @@ def __init__( load_models: bool = True, dataset_compression: Union[bool, Mapping[str, Any]] = True, allow_string_features: bool = True, + disable_progress_bar: bool = False, ): """ @@ -284,6 +285,10 @@ def __init__( load_models : bool, optional (True) Whether to load the models after fitting Auto-sklearn. + disable_progress_bar: bool = False + Whether to disable the progress bar that is displayed in the console + while fitting to the training data. + Attributes ---------- @@ -337,6 +342,7 @@ def __init__( scoring_functions=scoring_functions, load_models=load_models, allow_string_features=allow_string_features, + disable_progress_bar=disable_progress_bar, ) def train_selectors(self, selected_metric=None): diff --git a/autosklearn/util/progress_bar.py b/autosklearn/util/progress_bar.py new file mode 100644 index 0000000000..7ccd3bc153 --- /dev/null +++ b/autosklearn/util/progress_bar.py @@ -0,0 +1,68 @@ +from typing import Any + +import datetime +import time +from threading import Thread + +from tqdm import trange + + +class ProgressBar(Thread): + """A Thread that displays a tqdm progress bar in the console. + + It is specialized to display information relevant to fitting to the training data + with auto-sklearn. + + Parameters + ---------- + total : int + The total amount that should be reached by the progress bar once it finishes + update_interval : float + Specifies how frequently the progress bar is updated (in seconds) + disable : bool + Turns on or off the progress bar. If True, this thread won't be started or + initialized. + kwargs : Any + Keyword arguments that are passed into tqdm's constructor. Refer to: + `tqdm `_. Note that postfix can not be + specified in the kwargs since it is already passed into tqdm by this class. + """ + + def __init__( + self, + total: int, + update_interval: float = 1.0, + disable: bool = False, + **kwargs: Any, + ): + self.disable = disable + if not disable: + super().__init__(name="_progressbar_") + self.total = total + self.update_interval = update_interval + self.terminated: bool = False + self.kwargs = kwargs + # start this thread + self.start() + + def run(self) -> None: + """Display a tqdm progress bar in the console. + + Additionally, it shows useful information related to the task. This method + overrides the run method of Thread. 
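+
+        The bar advances one step per ``update_interval`` seconds until ``total``
+        steps are reached; once ``terminated`` is set, the remaining steps are
+        consumed without sleeping, which maxes the bar out quickly.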
+ """ + if not self.disable: + for _ in trange( + self.total, + postfix=f"The total time budget for this task is " + f"{datetime.timedelta(seconds=self.total)}", + **self.kwargs, + ): + if not self.terminated: + time.sleep(self.update_interval) + + def stop(self) -> None: + """Terminates the thread.""" + if not self.disable: + self.terminated = True + super().join() diff --git a/pyproject.toml b/pyproject.toml index 40ea854030..a696c0fb46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -155,7 +155,8 @@ module = [ "setuptools.*", "pkg_resources.*", "yaml.*", - "psutil.*" + "psutil.*", + "tqdm.*", ] ignore_missing_imports = true diff --git a/requirements.txt b/requirements.txt index 76af7f4a06..d47fb91474 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,8 +14,9 @@ pyyaml pandas>=1.0 liac-arff threadpoolctl +tqdm ConfigSpace>=0.4.21,<0.5 pynisher>=0.6.3,<0.7 pyrfr>=0.8.1,<0.9 -smac>=1.2,<1.3 +smac>=1.2,<1.3 \ No newline at end of file diff --git a/setup.py b/setup.py index aa6e42669e..6e37e0e711 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ "test": [ "pytest>=4.6", "pytest-cov", - "pytest-xdist", + "pytest-forked", "pytest-timeout", "pytest-cases>=3.6.11", "mypy", From 5a90a19bf8f9342ce9cad7d28ce230bdbb33ead1 Mon Sep 17 00:00:00 2001 From: Eddie Bergman Date: Tue, 15 Nov 2022 14:53:25 +0100 Subject: [PATCH 08/16] fix(multiprocessing): Use list instead of key-view (#1609) --- autosklearn/util/parallel.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/autosklearn/util/parallel.py b/autosklearn/util/parallel.py index 0804588a61..2bbe474abb 100644 --- a/autosklearn/util/parallel.py +++ b/autosklearn/util/parallel.py @@ -3,7 +3,16 @@ def preload_modules(context: multiprocessing.context.BaseContext) -> None: - all_loaded_modules = sys.modules.keys() + """Attempt to preload modules when using forkserver""" + # NOTE: preloading and docstring + # + # This is just a best guess at why this is used, coming from this blogpost + # https://bnikolic.co.uk/blog/python/parallelism/2019/11/13/python-forkserver-preload.html + # Ideally we should identify subprocesses that get run with this and try limit the + # necessity to use all of these modules + # + # @eddiebergman + all_loaded_modules = list(sys.modules.keys()) preload = [ loaded_module for loaded_module in all_loaded_modules From 40f1111f7e733833ce59e038695e0b90452cebe9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 15 Nov 2022 16:50:25 +0100 Subject: [PATCH 09/16] chore: update pre-commit hooks (#1605) Co-authored-by: eddiebergman --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index baf26a9ee3..95b8c00f51 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,7 @@ repos: additional_dependencies: ["toml"] # Needed to parse pyproject.toml - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.982 + rev: v0.990 hooks: - id: mypy name: mypy auto-sklearn From 1abd1f95eeb68d695cc66c60b2ccfa9a0b24b1ae Mon Sep 17 00:00:00 2001 From: Eddie Bergman Date: Tue, 15 Nov 2022 17:04:44 +0100 Subject: [PATCH 10/16] doc(smac): Update link for `get_smac_object_callback` (#1610) * doc(smac): Update link for `get_smac_object_callback` * doc(links): Update more smac links --- autosklearn/estimators.py | 4 ++-- autosklearn/experimental/askl2.py | 2 +- examples/40_advanced/example_multi_objective.py | 2 +- 3 files changed, 4 
insertions(+), 4 deletions(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 577265239e..68300b4a29 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -276,12 +276,12 @@ def __init__( smac_scenario_args : dict, optional (None) Additional arguments inserted into the scenario of SMAC. See the - `SMAC documentation `_ + `SMAC documentation `_ for a list of available arguments. get_smac_object_callback : callable Callback function to create an object of class - `smac.optimizer.smbo.SMBO `_. + `smac.facade.AbstractFacade `_. The function must accept the arguments ``scenario_dict``, ``instances``, ``num_params``, ``runhistory``, ``seed`` and ``ta``. This is an advanced feature. Use only if you are familiar with diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py index b712ba484e..abe43ff254 100644 --- a/autosklearn/experimental/askl2.py +++ b/autosklearn/experimental/askl2.py @@ -264,7 +264,7 @@ def __init__( smac_scenario_args : dict, optional (None) Additional arguments inserted into the scenario of SMAC. See the - `SMAC documentation `_ + `SMAC documentation `_ for a list of available arguments. logging_config : dict, optional (None) diff --git a/examples/40_advanced/example_multi_objective.py b/examples/40_advanced/example_multi_objective.py index 2e4ceb1c7e..d61ce8b17a 100644 --- a/examples/40_advanced/example_multi_objective.py +++ b/examples/40_advanced/example_multi_objective.py @@ -8,7 +8,7 @@ competing metrics: `precision` and `recall` (read more on this tradeoff in the `scikit-learn docs `_. -Auto-sklearn uses `SMAC3's implementation of ParEGO `_. +Auto-sklearn uses `SMAC3's implementation of ParEGO `_. Multi-objective ensembling and proper access to the full Pareto set will be added in the near future. """ From 59ea4b0a8010e7a621503b4e1be2ca7c9d34fc03 Mon Sep 17 00:00:00 2001 From: Aron Bahram Date: Thu, 24 Nov 2022 13:12:14 +0100 Subject: [PATCH 11/16] refactor: use progress_bar more explicitly as a thread (#1622) --- autosklearn/automl.py | 3 +- autosklearn/util/progress_bar.py | 72 +++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 93fde84330..1d37cb2321 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -652,6 +652,7 @@ def fit( # space self._backend.save_start_time(self._seed) + progress_bar.start() self._stopwatch = StopWatch() # Make sure that input is valid @@ -970,7 +971,7 @@ def fit( self._logger.exception(e) raise e finally: - progress_bar.stop() + progress_bar.join() self._fit_cleanup() self.fitted = True diff --git a/autosklearn/util/progress_bar.py b/autosklearn/util/progress_bar.py index 7ccd3bc153..c1eb3139f8 100644 --- a/autosklearn/util/progress_bar.py +++ b/autosklearn/util/progress_bar.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Any import datetime @@ -10,22 +12,45 @@ class ProgressBar(Thread): """A Thread that displays a tqdm progress bar in the console. - It is specialized to display information relevant to fitting to the training data - with auto-sklearn. + Treat this class as an ordinary thread. So to display a progress bar, + call start() on an instance of this class. To wait for the thread to + terminate call join(), which will max out the progress bar, + therefore terminate this thread immediately. 
Parameters ---------- total : int - The total amount that should be reached by the progress bar once it finishes - update_interval : float - Specifies how frequently the progress bar is updated (in seconds) - disable : bool - Turns on or off the progress bar. If True, this thread won't be started or - initialized. - kwargs : Any + The total amount that should be reached by the progress bar once it finishes. + update_interval : float, default=1.0 + Specifies how frequently the progress bar is updated (in seconds). + disable : bool, default=False + Turns on or off the progress bar. If True, this thread does not get + initialized and won't be started if start() is called. + tqdm_kwargs : Any, optional Keyword arguments that are passed into tqdm's constructor. Refer to: - `tqdm `_. Note that postfix can not be - specified in the kwargs since it is already passed into tqdm by this class. + `tqdm `_ for a list of parameters that + tqdm accepts. Note that 'postfix' cannot be specified in the kwargs since it is + already passed into tqdm by this class. + + Examples + -------- + + .. code:: python + + progress_bar = ProgressBar( + total=10, + desc="Executing code that runs for 10 seconds", + colour="green", + ) + # colour is a tqdm parameter passed as a tqdm_kwargs + try: + progress_bar.start() + # some code that runs for 10 seconds + except SomeException: + # something went wrong + finally: + progress_bar.join() + # perform some cleanup """ def __init__( @@ -33,7 +58,7 @@ def __init__( total: int, update_interval: float = 1.0, disable: bool = False, - **kwargs: Any, + **tqdm_kwargs: Any, ): self.disable = disable if not disable: @@ -41,28 +66,27 @@ def __init__( self.total = total self.update_interval = update_interval self.terminated: bool = False - self.kwargs = kwargs - # start this thread - self.start() + self.tqdm_kwargs = tqdm_kwargs - def run(self) -> None: - """Display a tqdm progress bar in the console. + def start(self) -> None: + """Start a new thread that calls the run() method.""" + if not self.disable: + super().start() - Additionally, it shows useful information related to the task. This method - overrides the run method of Thread. 
- """ + def run(self) -> None: + """Display a tqdm progress bar in the console.""" if not self.disable: for _ in trange( self.total, postfix=f"The total time budget for this task is " f"{datetime.timedelta(seconds=self.total)}", - **self.kwargs, + **self.tqdm_kwargs, ): if not self.terminated: time.sleep(self.update_interval) - def stop(self) -> None: - """Terminates the thread.""" + def join(self, timeout: float | None = None) -> None: + """Maxes out the progress bar and thereby terminating this thread.""" if not self.disable: self.terminated = True - super().join() + super().join(timeout) From a978478f6053f2e966955347e32b20d6d35c0a61 Mon Sep 17 00:00:00 2001 From: Aron Bahram Date: Thu, 24 Nov 2022 13:14:44 +0100 Subject: [PATCH 12/16] fix: modify show_models() to display same ranks as leaderboard (#1621) --- autosklearn/automl.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 1d37cb2321..ffcf1fb033 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -2185,21 +2185,20 @@ def has_key(rv, key): table = pd.DataFrame.from_dict(table_dict, orient="index") table.sort_values(by="cost", inplace=True) + table["rank"] = np.arange(1, len(table.index) + 1) # Check which resampling strategy is chosen and selecting the appropriate models is_cv = self._resampling_strategy == "cv" models = self.cv_models_ if is_cv else self.models_ - rank = 1 # Initializing rank for the first model for (_, model_id, _), model in models.items(): model_dict = {} # Declaring model dictionary # Inserting model_id, rank, cost and ensemble weight model_dict["model_id"] = table.loc[model_id]["model_id"].astype(int) - model_dict["rank"] = rank + model_dict["rank"] = table.loc[model_id]["rank"].astype(int) model_dict["cost"] = table.loc[model_id]["cost"] model_dict["ensemble_weight"] = table.loc[model_id]["ensemble_weight"] - rank += 1 # Incrementing rank by 1 for the next model # The steps in the models pipeline are as follows: # 'data_preprocessor': DataPreprocessor, From 63bfbebbd288c8669d6bce7f44f8c9a3a82facd5 Mon Sep 17 00:00:00 2001 From: Aron Bahram Date: Wed, 7 Dec 2022 09:59:21 +0100 Subject: [PATCH 13/16] refactor: track model_ids in cv_results (#1628) --- autosklearn/automl.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index ffcf1fb033..1b2b08f74f 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1921,15 +1921,17 @@ def cv_results_(self): metric_dict[metric.name] = [] metric_mask[metric.name] = [] + model_ids = [] mean_fit_time = [] params = [] status = [] budgets = [] - for run_key in self.runhistory_.data: - run_value = self.runhistory_.data[run_key] + for run_key, run_value in self.runhistory_.data.items(): config_id = run_key.config_id config = self.runhistory_.ids_config[config_id] + if run_value.additional_info and "num_run" in run_value.additional_info: + model_ids.append(run_value.additional_info["num_run"]) s = run_value.status if s == StatusType.SUCCESS: @@ -1990,6 +1992,8 @@ def cv_results_(self): metric_dict[metric.name].append(metric_value) metric_mask[metric.name].append(mask_value) + results["model_ids"] = model_ids + if len(self._metrics) == 1: results["mean_test_score"] = np.array(metric_dict[self._metrics[0].name]) rank_order = -1 * self._metrics[0]._sign * results["mean_test_score"] @@ -2165,14 +2169,11 @@ def show_models(self) -> dict[int, Any]: warnings.warn("No ensemble found. 
Returning empty dictionary.") return ensemble_dict - def has_key(rv, key): - return rv.additional_info and key in rv.additional_info - table_dict = {} - for run_key, run_val in self.runhistory_.data.items(): - if has_key(run_val, "num_run"): - model_id = run_val.additional_info["num_run"] - table_dict[model_id] = {"model_id": model_id, "cost": run_val.cost} + for run_key, run_value in self.runhistory_.data.items(): + if run_value.additional_info and "num_run" in run_value.additional_info: + model_id = run_value.additional_info["num_run"] + table_dict[model_id] = {"model_id": model_id, "cost": run_value.cost} # Checking if the dictionary is empty if not table_dict: From 673211252ca508b6f5bb92cf5fa87c6455bbad99 Mon Sep 17 00:00:00 2001 From: Aron Bahram Date: Tue, 18 Apr 2023 13:08:13 +0200 Subject: [PATCH 14/16] fix(regressor): correctly cap the labels in predict (#1662) updates pre-commit --- .pre-commit-config.yaml | 8 ++++---- autosklearn/pipeline/regression.py | 25 ++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 95b8c00f51..af0ec72b29 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 5.11.5 hooks: - id: isort name: isort imports autosklearn @@ -15,7 +15,7 @@ repos: files: test/.* - repo: https://github.com/psf/black - rev: 22.10.0 + rev: 23.3.0 hooks: - id: black name: black formatter autosklearn @@ -31,7 +31,7 @@ repos: # This is disabled as most modules fail this - repo: https://github.com/pycqa/pydocstyle - rev: 6.1.1 + rev: 6.3.0 hooks: - id: pydocstyle files: DISABLED # autosklearn/.* @@ -39,7 +39,7 @@ repos: additional_dependencies: ["toml"] # Needed to parse pyproject.toml - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.990 + rev: v1.2.0 hooks: - id: mypy name: mypy auto-sklearn diff --git a/autosklearn/pipeline/regression.py b/autosklearn/pipeline/regression.py index dcc2fa3fcf..85f5ed70ab 100644 --- a/autosklearn/pipeline/regression.py +++ b/autosklearn/pipeline/regression.py @@ -106,12 +106,35 @@ def iterative_fit(self, X, y, n_iter=1, **fit_params): ) def predict(self, X, batch_size=None): + """Predict the classes using the selected model. + + Predicted values are capped to approximately the maximum and minimum labels + seen during training. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + + batch_size: int or None, defaults to None + batch_size controls whether the pipeline will be + called on small chunks of the data. Useful when calling the + predict method on the whole array X results in a MemoryError. 
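+
+        For example, with training labels spanning [0, 10], a raw model output
+        of 25 is returned as 20 (i.e. ``2 * self.y_max_``, since ``y_max_`` is
+        positive here).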
---
 .pre-commit-config.yaml            |  8 ++++----
 autosklearn/pipeline/regression.py | 25 ++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 95b8c00f51..af0ec72b29 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,7 +4,7 @@ repos:
 
   - repo: https://github.com/pycqa/isort
-    rev: 5.10.1
+    rev: 5.11.5
     hooks:
       - id: isort
         name: isort imports autosklearn
@@ -15,7 +15,7 @@
         files: test/.*
 
   - repo: https://github.com/psf/black
-    rev: 22.10.0
+    rev: 23.3.0
     hooks:
       - id: black
         name: black formatter autosklearn
@@ -31,7 +31,7 @@
 
   # This is disabled as most modules fail this
   - repo: https://github.com/pycqa/pydocstyle
-    rev: 6.1.1
+    rev: 6.3.0
     hooks:
       - id: pydocstyle
         files: DISABLED # autosklearn/.*
         additional_dependencies: ["toml"]  # Needed to parse pyproject.toml
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.990
+    rev: v1.2.0
     hooks:
       - id: mypy
         name: mypy auto-sklearn

diff --git a/autosklearn/pipeline/regression.py b/autosklearn/pipeline/regression.py
index dcc2fa3fcf..85f5ed70ab 100644
--- a/autosklearn/pipeline/regression.py
+++ b/autosklearn/pipeline/regression.py
@@ -106,12 +106,35 @@ def iterative_fit(self, X, y, n_iter=1, **fit_params):
         )
 
     def predict(self, X, batch_size=None):
+        """Predict the targets using the selected model.
+
+        Predicted values are capped to approximately the maximum and minimum labels
+        seen during training.
+
+        Parameters
+        ----------
+        X : array-like, shape = (n_samples, n_features)
+
+        batch_size: int or None, defaults to None
+            batch_size controls whether the pipeline will be
+            called on small chunks of the data. Useful when calling the
+            predict method on the whole array X results in a MemoryError.
+
+        Returns
+        -------
+        array, shape = (n_samples,) or (n_samples, n_targets)
+            Returns the predicted values."""
         y = super().predict(X, batch_size=batch_size)
-        y[y > (2 * self.y_max_)] = 2 * self.y_max_
+
+        if self.y_max_ > 0:
+            y[y > (2 * self.y_max_)] = 2 * self.y_max_
+        elif self.y_max_ < 0:
+            y[y > (0.5 * self.y_max_)] = 0.5 * self.y_max_
         if self.y_min_ < 0:
             y[y < (2 * self.y_min_)] = 2 * self.y_min_
         elif self.y_min_ > 0:
             y[y < (0.5 * self.y_min_)] = 0.5 * self.y_min_
+
         return y
 
     def _get_hyperparameter_search_space(

From 87a10eec249d61912a4ecd742329925c7a36633d Mon Sep 17 00:00:00 2001
From: agentmarketbot
Date: Thu, 9 Jan 2025 22:56:30 +0000
Subject: [PATCH 15/16] agent bot commit
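
Document how to disable the data preprocessing step: the NoPreprocessing
component has to be registered before it can be selected via ``include``.
Put together, the documented steps read as follows (a sketch; the time
budget is an arbitrary placeholder):

    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.pipeline.components.data_preprocessing import add_preprocessor
    from autosklearn.pipeline.components.data_preprocessing.NoPreprocessing import NoPreprocessing

    add_preprocessor(NoPreprocessing)  # step 1: register the component
    automl = AutoSklearnClassifier(    # step 2: include it in the search space
        time_left_for_this_task=60,
        include={"data_preprocessor": ["NoPreprocessing"]},
    )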
""" from typing import Optional from pprint import pprint From 31dc88a62fff22959a4b9e53c40f9a8fa0b29538 Mon Sep 17 00:00:00 2001 From: agentmarketbot Date: Thu, 9 Jan 2025 23:01:20 +0000 Subject: [PATCH 16/16] agent bot commit --- .../data_preprocessing/NoPreprocessing.py | 83 +++++++++++++++++++ doc/manual.rst | 3 + .../example_extending_data_preprocessor.py | 3 + .../test_NoPreprocessing.py | 47 +++++++++++ 4 files changed, 136 insertions(+) create mode 100644 autosklearn/pipeline/components/data_preprocessing/NoPreprocessing.py create mode 100644 test/test_pipeline/components/data_preprocessing/test_NoPreprocessing.py diff --git a/autosklearn/pipeline/components/data_preprocessing/NoPreprocessing.py b/autosklearn/pipeline/components/data_preprocessing/NoPreprocessing.py new file mode 100644 index 0000000000..61eb3d60e1 --- /dev/null +++ b/autosklearn/pipeline/components/data_preprocessing/NoPreprocessing.py @@ -0,0 +1,83 @@ +from typing import Optional, Dict, Any, Union, Tuple + +from ConfigSpace.configuration_space import ConfigurationSpace +import numpy as np + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT + + +class NoPreprocessing(AutoSklearnPreprocessingAlgorithm): + def __init__( + self, + random_state: Optional[Union[int, np.random.RandomState]] = None, + ) -> None: + """A component that does no preprocessing, passing the data through unchanged.""" + self.random_state = random_state + + def fit(self, X: DENSE, y: Optional[DENSE] = None) -> "NoPreprocessing": + """Fit the NoPreprocessing component. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = (n_samples, n_features) + Training data + y : array-like, shape = (n_samples,), optional + Targets for supervised learning + + Returns + ------- + self : NoPreprocessing + This estimator + """ + self.fitted_ = True + return self + + def transform(self, X: DENSE) -> DENSE: + """Transform the data by doing nothing. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = (n_samples, n_features) + Data to transform + + Returns + ------- + X : {array-like, sparse matrix}, shape = (n_samples, n_features) + Transformed data (identical to input) + """ + if self.fitted_ is False: + raise NotImplementedError() + return X + + @staticmethod + def get_properties( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return { + 'shortname': 'NoPreprocessing', + 'name': 'No Preprocessing', + 'handles_regression': True, + 'handles_classification': True, + 'handles_multiclass': True, + 'handles_multilabel': True, + 'handles_multioutput': True, + 'is_deterministic': True, + 'input': (DENSE, SPARSE, UNSIGNED_DATA), + 'output': (INPUT,) + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: + """Return the configuration space for this component. + + Returns + ------- + cs : ConfigurationSpace + The configuration space describing all hyperparameters of this component. 
---
 .../data_preprocessing/NoPreprocessing.py    | 85 ++++++++++++++++++++
 doc/manual.rst                               |  3 +
 .../example_extending_data_preprocessor.py   |  3 +
 .../test_NoPreprocessing.py                  | 47 +++++++++++
 4 files changed, 138 insertions(+)
 create mode 100644 autosklearn/pipeline/components/data_preprocessing/NoPreprocessing.py
 create mode 100644 test/test_pipeline/components/data_preprocessing/test_NoPreprocessing.py

diff --git a/autosklearn/pipeline/components/data_preprocessing/NoPreprocessing.py b/autosklearn/pipeline/components/data_preprocessing/NoPreprocessing.py
new file mode 100644
index 0000000000..61eb3d60e1
--- /dev/null
+++ b/autosklearn/pipeline/components/data_preprocessing/NoPreprocessing.py
@@ -0,0 +1,85 @@
+from typing import Dict, Optional, Tuple, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+import numpy as np
+from scipy import sparse
+
+from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE
+from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
+from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT
+
+
+class NoPreprocessing(AutoSklearnPreprocessingAlgorithm):
+    def __init__(
+        self,
+        random_state: Optional[Union[int, np.random.RandomState]] = None,
+    ) -> None:
+        """A component that does no preprocessing, passing the data through unchanged."""
+        self.random_state = random_state
+        self.fitted_ = False  # so transform() can detect calls before fit()
+
+    def fit(self, X: Union[np.ndarray, sparse.spmatrix], y: Optional[np.ndarray] = None) -> "NoPreprocessing":
+        """Fit the NoPreprocessing component.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
+            Training data
+        y : array-like, shape = (n_samples,), optional
+            Targets for supervised learning
+
+        Returns
+        -------
+        self : NoPreprocessing
+            This estimator
+        """
+        self.fitted_ = True
+        return self
+
+    def transform(self, X: Union[np.ndarray, sparse.spmatrix]) -> Union[np.ndarray, sparse.spmatrix]:
+        """Transform the data by doing nothing.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
+            Data to transform
+
+        Returns
+        -------
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
+            Transformed data (identical to input)
+        """
+        if not self.fitted_:
+            raise NotImplementedError()
+        return X
+
+    @staticmethod
+    def get_properties(
+        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
+    ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
+        return {
+            'shortname': 'NoPreprocessing',
+            'name': 'No Preprocessing',
+            'handles_regression': True,
+            'handles_classification': True,
+            'handles_multiclass': True,
+            'handles_multilabel': True,
+            'handles_multioutput': True,
+            'is_deterministic': True,
+            'input': (DENSE, SPARSE, UNSIGNED_DATA),
+            'output': (INPUT,)
+        }
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
+    ) -> ConfigurationSpace:
+        """Return the configuration space for this component.
+
+        Returns
+        -------
+        cs : ConfigurationSpace
+            The configuration space describing all hyperparameters of this component.
+        """
+        cs = ConfigurationSpace()
+        return cs
\ No newline at end of file
diff --git a/doc/manual.rst b/doc/manual.rst
index c5be838e94..a6483698a6 100644
--- a/doc/manual.rst
+++ b/doc/manual.rst
@@ -174,6 +174,9 @@ to restrict the searchspace:
             include={"data_preprocessor": ["NoPreprocessing"]},
         )
 
+    Note: Register the NoPreprocessing component BEFORE creating the AutoSklearnClassifier;
+    otherwise auto-sklearn raises a ValueError stating that the component is not valid.
+
     For a complete example, refer to :ref:`example `.
 
 .. collapse:: Turn off feature preprocessing

diff --git a/examples/80_extending/example_extending_data_preprocessor.py b/examples/80_extending/example_extending_data_preprocessor.py
index 095ad803be..4a8af872fd 100644
--- a/examples/80_extending/example_extending_data_preprocessor.py
+++ b/examples/80_extending/example_extending_data_preprocessor.py
@@ -10,6 +10,9 @@
 3. Ensure the data remains unchanged before reaching the models
 
 Note: You must register the NoPreprocessing component before using it in include={}.
+The full registration workflow is demonstrated in the code below.
+
+Fixes #1745
 """
 from typing import Optional
 from pprint import pprint

diff --git a/test/test_pipeline/components/data_preprocessing/test_NoPreprocessing.py b/test/test_pipeline/components/data_preprocessing/test_NoPreprocessing.py
new file mode 100644
index 0000000000..fcd876f702
--- /dev/null
+++ b/test/test_pipeline/components/data_preprocessing/test_NoPreprocessing.py
@@ -0,0 +1,47 @@
+import unittest
+
+import numpy as np
+from scipy import sparse
+
+from autosklearn.pipeline.components.data_preprocessing.NoPreprocessing import NoPreprocessing
+
+
+class NoPreprocessingTest(unittest.TestCase):
+    def test_preprocessing_dtype_transform(self):
+        # Dense
+        X = np.random.rand(3, 2)
+        Y = np.random.randint(0, 2, (3,))
+        no_preprocessing = NoPreprocessing()
+        no_preprocessing.fit(X, Y)
+        X_transformed = no_preprocessing.transform(X)
+        self.assertIsInstance(X_transformed, np.ndarray)
+        np.testing.assert_array_equal(X_transformed, X)
+
+        # Sparse
+        X = sparse.csr_matrix(X)
+        Y = np.random.randint(0, 2, (3,))
+        no_preprocessing = NoPreprocessing()
+        no_preprocessing.fit(X, Y)
+        X_transformed = no_preprocessing.transform(X)
+        self.assertIsInstance(X_transformed, sparse.csr_matrix)
+        np.testing.assert_array_equal(X_transformed.toarray(), X.toarray())
+
+    def test_preprocessing_dtype_transform_no_fit(self):
+        X = np.random.rand(3, 2)
+        no_preprocessing = NoPreprocessing()
+        with self.assertRaises(NotImplementedError):
+            no_preprocessing.transform(X)
+
+    def test_preprocessing_properties(self):
+        props = NoPreprocessing.get_properties()
+        self.assertEqual(props['shortname'], 'NoPreprocessing')
+        self.assertTrue(props['handles_regression'])
+        self.assertTrue(props['handles_classification'])
+        self.assertTrue(props['handles_multiclass'])
+        self.assertTrue(props['handles_multilabel'])
+        self.assertTrue(props['handles_multioutput'])
+        self.assertTrue(props['is_deterministic'])
+
+    def test_hyperparameter_search_space(self):
+        cs = NoPreprocessing.get_hyperparameter_search_space()
+        self.assertEqual(len(cs.get_hyperparameters()), 0)
\ No newline at end of file