GH-16033: Add GBLinear (#16034)
* Add GBLinear

* Add example with normal automl run + gblinear

* Fix java tests
tomasfryda authored Jan 30, 2024
1 parent 5620485 commit e69e2ce
Showing 4 changed files with 254 additions and 5 deletions.
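The new gblinear grid is not part of the default AutoML plan; as the tests below verify, it only runs when requested explicitly through modeling_plan. A minimal opt-in sketch in Python (h2o.init(), the dataset path, and the "response" target column are placeholders, not part of this commit):

import h2o
from h2o.automl import H2OAutoML

h2o.init()
train = h2o.import_file("train.csv")  # placeholder dataset path

aml = H2OAutoML(
    max_models=6,
    seed=1,
    # Schedule the new gblinear grid alongside the default XGBoost grid.
    modeling_plan=[dict(name="XGBoost",
                        steps=[dict(id="grid_gblinear"), dict(id="grid_1")])],
)
aml.train(y="response", training_frame=train)  # placeholder target column
print(aml.leaderboard)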
44 changes: 44 additions & 0 deletions h2o-automl/src/main/java/ai/h2o/automl/modeling/XGBoostSteps.java
@@ -15,6 +15,7 @@
import water.Key;

import java.util.*;
import java.util.stream.IntStream;

public class XGBoostSteps extends ModelingSteps {

@@ -215,8 +216,51 @@ public Map<String, Object[]> prepareSearchParameters() {

}

static class XGBoostGBLinearGridStep extends XGBoostGridStep {

public XGBoostGBLinearGridStep(String id, AutoML autoML) {
super(id, autoML, false);
}
@Override
public XGBoostParameters prepareModelParameters() {
return XGBoostSteps.prepareModelParameters(aml(), false);
}

@Override
public Map<String, Object[]> prepareSearchParameters() {
Map<String, Object[]> searchParams = new HashMap<>();

/*
// not supported/exposed in our xgboost yet
if (aml().getBuildSpec().build_control.isReproducible()) {
searchParams.put("_updater", new String[] {"coord_descent"});
searchParams.put("_feature_selector", new String[] {"cyclic", "greedy"}); // TODO: check if others are deterministic
} else {
searchParams.put("_updater", new String[] {"shotgun", "coord_descent"});
searchParams.put("_feature_selector", new String[] {"cyclic", "shuffle", "random", "greedy", "thrifty"});
}
int ncols = aml().getTrainingFrame().numCols() - (aml().getBuildSpec().getNonPredictors().length +
(aml().getBuildSpec().input_spec.ignored_columns != null ? aml().getBuildSpec().input_spec.ignored_columns.length : 0));
searchParams.put("_top_k", IntStream.range(0, ncols-1).boxed().toArray(Integer[]::new));
*/

searchParams.put("_booster", new XGBoostParameters.Booster[]{ XGBoostParameters.Booster.gblinear });

searchParams.put("_reg_lambda", new Float[]{0.001f, 0.01f, 0.1f, 1f, 10f, 100f});
searchParams.put("_reg_alpha", new Float[]{0.001f, 0.01f, 0.1f, 0.5f, 1f});

return searchParams;
}

}

private final ModelingStep[] grids = new XGBoostGridStep[] {
new DefaultXGBoostGridStep("grid_1", aml()),
new XGBoostGBLinearGridStep("grid_gblinear", aml()),

/*
new DefaultXGBoostGridStep("grid_1_resume", aml()) {
@Override
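prepareSearchParameters above pins the booster to gblinear and sweeps only the two regularization strengths (the updater and feature-selector dimensions stay commented out until they are exposed in h2o's XGBoost). A rough standalone equivalent using H2O's Python grid search, assuming an initialized cluster plus a placeholder frame train with target "response":

from h2o.estimators.xgboost import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch

# Mirror the AutoML step's search space: the _reg_lambda and _reg_alpha
# grids, with the booster fixed to gblinear.
hyper_params = {
    "reg_lambda": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    "reg_alpha": [0.001, 0.01, 0.1, 0.5, 1.0],
}
grid = H2OGridSearch(model=H2OXGBoostEstimator(booster="gblinear", seed=1),
                     hyper_params=hyper_params)
grid.train(y="response", training_frame=train)
print(grid.get_grid(sort_by="logloss", decreasing=False))  # logloss assumes classification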
@@ -81,7 +81,7 @@ public void test_all_registered_steps() {
.collect(Collectors.toList());
ModelingStep[] modelingSteps = registry.getOrderedSteps(allSteps.toArray(new StepDefinition[0]), aml);
// 2 groups by default (1 for models, 1 for grids), hence the 2*2 SEs + 10 optional SEs
-assertEquals((1/*completion*/)+(1+3/*DL*/) + (2/*DRF*/) + (5+1+1/*GBM*/) + (1/*GLM*/) + (2*2+10/*SE*/) + (3+1+2/*XGB*/),
+assertEquals((1/*completion*/)+(1+3/*DL*/) + (2/*DRF*/) + (5+1+1/*GBM*/) + (1/*GLM*/) + (2*2+10/*SE*/) + (3+1+2/*XGB*/+1/*gblinear*/),
modelingSteps.length);
assertEquals(1, Stream.of(modelingSteps).filter(s -> "completion".equals(s.getProvider())).filter(ModelingStep.DynamicStep.class::isInstance).count());
assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.DeepLearning.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
@@ -93,7 +93,7 @@ public void test_all_registered_steps() {
assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.GLM.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
assertEquals(14, Stream.of(modelingSteps).filter(s -> Algo.StackedEnsemble.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
assertEquals(3, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
-assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.GridStep.class::isInstance).count());
+assertEquals(2, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.GridStep.class::isInstance).count());
assertEquals(2, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.SelectionStep.class::isInstance).count());

List<String> orderedStepIds = Arrays.stream(modelingSteps).flatMap(s -> Stream.of(s._provider, s._id)).collect(Collectors.toList());
@@ -112,7 +112,7 @@ public void test_all_registered_steps() {
Algo.DeepLearning.name(), "grid_1", Algo.DeepLearning.name(), "grid_2", Algo.DeepLearning.name(), "grid_3",
Algo.GBM.name(), "grid_1",
Algo.StackedEnsemble.name(), "best_of_family_2", Algo.StackedEnsemble.name(), "all_2",
Algo.XGBoost.name(), "grid_1",
Algo.XGBoost.name(), "grid_1", Algo.XGBoost.name(), "grid_gblinear",
Algo.GBM.name(), "lr_annealing",
Algo.StackedEnsemble.name(), "monotonic",
Algo.StackedEnsemble.name(), "best_of_family", Algo.StackedEnsemble.name(), "all",
@@ -158,7 +158,7 @@ public void test_all_grids() {
.toArray(StepDefinition[]::new);
ModelingStepsRegistry registry = new ModelingStepsRegistry();
ModelingStep[] modelingSteps = registry.getOrderedSteps(allGridSteps, aml);
-assertEquals((3/*DL*/) + (1/*GBM*/) + (1/*XGB*/),
+assertEquals((3/*DL*/) + (1/*GBM*/) + (1/*XGB*/+1/*gblinear*/),
modelingSteps.length);
}

@@ -173,7 +173,7 @@ public void test_all_defaults_plus_grids() {
ModelingStepsRegistry registry = new ModelingStepsRegistry();
ModelingStep[] modelingSteps = registry.getOrderedSteps(allGridSteps, aml);
// by default, 1 group for default models, 1 group for grids, hence the 2*2 SEs
-assertEquals((1+3/*DL*/) + (2/*DRF*/) + (5+1/*GBM*/) + (1/*GLM*/) + (2*2/*SE*/) + (3+1/*XGB*/),
+assertEquals((1+3/*DL*/) + (2/*DRF*/) + (5+1/*GBM*/) + (1/*GLM*/) + (2*2/*SE*/) + (3+1/*XGB*/+1/*gblinear*/),
modelingSteps.length);
}

@@ -0,0 +1,77 @@
import sys, os

sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
import h2o.exceptions
from h2o.automl import H2OAutoML
from tests import pyunit_utils as pu

from _automl_utils import import_dataset, get_partitioned_model_names

max_models = 5


def _is_gblinear(model_id):
    # A model came from the gblinear grid iff its booster resolved to
    # "gblinear"; .get() avoids a KeyError for models without that parameter.
    model = h2o.get_model(model_id)
    return model.actual_params.get("booster") == "gblinear"


def test_automl_doesnt_contain_gblinear_by_default():
ds = import_dataset()
aml = H2OAutoML(max_models=20,
seed=1, include_algos=["xgboost"])
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert not _is_gblinear(m[0])

print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

aml = H2OAutoML(max_runtime_secs=120,
seed=1, include_algos=["xgboost"])
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert not _is_gblinear(m[0])

print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))


def test_automl_contains_gblinear_when_using_modeling_plan():
ds = import_dataset()
aml = H2OAutoML(max_models=6,
modeling_plan=[dict(name="XGBoost", steps=[dict(id="grid_gblinear"), dict(id="grid_1")])],
seed=1)
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

aml = H2OAutoML(max_models=6,
modeling_plan=[("XGBoost", "grids")],
seed=1)
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

aml = H2OAutoML(max_runtime_secs=60,
modeling_plan=[
("XGBoost",)
],
seed=1)
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))


pu.run_tests([
    test_automl_doesnt_contain_gblinear_by_default,
    test_automl_contains_gblinear_when_using_modeling_plan,
])
@@ -0,0 +1,128 @@
import sys, os

sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
import h2o.exceptions
from h2o.automl import H2OAutoML
from tests import pyunit_utils as pu

from _automl_utils import import_dataset

MAX_MODELS = 14  # Minimal number of models needed for the leaderboard to include one from the gblinear grid


def _is_gblinear(model_id):
model = h2o.get_model(model_id)
return model.actual_params.get("booster") == "gblinear"


def models_have_same_hyperparams(m1, m2):
    # Compare user-specified hyperparameters, skipping keys that legitimately
    # differ between otherwise identical models.
    for k, v in m1.params.items():
        if k in ["model_id", "training_frame", "validation_frame", "base_models"]:
            continue
        if k not in m2.params or v["input"] != m2.params[k]["input"]:
            return False
    return True


def model_is_in_automl(model, automl):
    for m in automl.leaderboard.as_data_frame(use_pandas=False)[1:]:
        mod = h2o.get_model(m[0])
        if models_have_same_hyperparams(model, mod):
            return True
    print(model.model_id)  # log the unmatched model to ease debugging
    return False


def test_automl_XGBoost_gblinear_reproducible_modeling_plan():
ds = import_dataset()
aml = H2OAutoML(max_models=MAX_MODELS, seed=1)
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert not _is_gblinear(m[0])

aml2 = H2OAutoML(max_models=MAX_MODELS, seed=1, modeling_plan=[
dict(name="XGBoost", steps=[
dict(id="def_2", group=1, weight=10),
dict(id="def_1", group=2, weight=10),
dict(id="def_3", group=3, weight=10),
dict(id="grid_1", group=4, weight=90),
dict(id="lr_search", group=7, weight=30),
]), dict(name="GLM", steps=[
dict(id="def_1", group=1, weight=10),
]), dict(name="DRF", steps=[
dict(id="def_1", group=2, weight=10),
dict(id="XRT", group=3, weight=10),
]), dict(name="GBM", steps=[
dict(id="def_5", group=1, weight=10),
dict(id="def_2", group=2, weight=10),
dict(id="def_3", group=2, weight=10),
dict(id="def_4", group=2, weight=10),
dict(id="def_1", group=3, weight=10),
dict(id="grid_1", group=4, weight=60),
dict(id="lr_annealing", group=7, weight=10),
]), dict(name="DeepLearning", steps=[
dict(id="def_1", group=3, weight=10),
dict(id="grid_1", group=4, weight=30),
dict(id="grid_2", group=5, weight=30),
dict(id="grid_3", group=5, weight=30),
]), dict(name="completion", steps=[
dict(id="resume_best_grids", group=6, weight=60),
]), dict(name="StackedEnsemble", steps=[
dict(id="monotonic", group=9, weight=10),
dict(id="best_of_family_xglm", group=10, weight=10),
dict(id="all_xglm", group=10, weight=10),
])])
aml2.train(y=ds.target, training_frame=ds.train)
print(aml2.leaderboard)
for m in aml2.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert model_is_in_automl(h2o.get_model(m[0]), aml)

aml_with_gblinear = H2OAutoML(max_models=MAX_MODELS, seed=1, modeling_plan=[
dict(name="XGBoost", steps=[
dict(id="def_2", group=1, weight=10),
dict(id="def_1", group=2, weight=10),
dict(id="def_3", group=3, weight=10),
dict(id="grid_1", group=4, weight=90),
dict(id="grid_gblinear", group=4, weight=90), # << XGBoost GBLinear booster grid
dict(id="lr_search", group=7, weight=30),
]), dict(name="GLM", steps=[
dict(id="def_1", group=1, weight=10),
]), dict(name="DRF", steps=[
dict(id="def_1", group=2, weight=10),
dict(id="XRT", group=3, weight=10),
]), dict(name="GBM", steps=[
dict(id="def_5", group=1, weight=10),
dict(id="def_2", group=2, weight=10),
dict(id="def_3", group=2, weight=10),
dict(id="def_4", group=2, weight=10),
dict(id="def_1", group=3, weight=10),
dict(id="grid_1", group=4, weight=60),
dict(id="lr_annealing", group=7, weight=10),
]), dict(name="DeepLearning", steps=[
dict(id="def_1", group=3, weight=10),
dict(id="grid_1", group=4, weight=30),
dict(id="grid_2", group=5, weight=30),
dict(id="grid_3", group=5, weight=30),
]), dict(name="completion", steps=[
dict(id="resume_best_grids", group=6, weight=60),
]), dict(name="StackedEnsemble", steps=[
dict(id="monotonic", group=9, weight=10),
dict(id="best_of_family_xglm", group=10, weight=10),
dict(id="all_xglm", group=10, weight=10),
])])
aml_with_gblinear.train(y=ds.target, training_frame=ds.train)
print(aml_with_gblinear.leaderboard)
for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert model_is_in_automl(h2o.get_model(m[0]), aml) or _is_gblinear(m[0]), m[0]

print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]))))

assert any((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]))


pu.run_tests([
test_automl_XGBoost_gblinear_reproducible_modeling_plan,
])
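Both test files repeat the same leaderboard-scan idiom to count gblinear models. A small helper like the following (hypothetical, not part of this commit) captures it; the [1:] slice skips the header row that as_data_frame(use_pandas=False) returns first:

import h2o

def count_gblinear(aml):
    # Count leaderboard models whose booster resolved to "gblinear".
    return sum(
        h2o.get_model(row[0]).actual_params.get("booster") == "gblinear"
        for row in aml.leaderboard.as_data_frame(use_pandas=False)[1:]
    )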
