diff --git a/h2o-automl/src/main/java/ai/h2o/automl/modeling/XGBoostSteps.java b/h2o-automl/src/main/java/ai/h2o/automl/modeling/XGBoostSteps.java index a2127f47edb0..603aa906577b 100644 --- a/h2o-automl/src/main/java/ai/h2o/automl/modeling/XGBoostSteps.java +++ b/h2o-automl/src/main/java/ai/h2o/automl/modeling/XGBoostSteps.java @@ -15,6 +15,7 @@ import water.Key; import java.util.*; +import java.util.stream.IntStream; public class XGBoostSteps extends ModelingSteps { @@ -215,8 +216,51 @@ public Map prepareSearchParameters() { } + static class XGBoostGBLinearGridStep extends XGBoostGridStep { + + public XGBoostGBLinearGridStep(String id, AutoML autoML) { + super(id, autoML, false); + } + + + @Override + public XGBoostParameters prepareModelParameters() { + return XGBoostSteps.prepareModelParameters(aml(), false); + } + + @Override + public Map prepareSearchParameters() { + Map searchParams = new HashMap<>(); + + /* + // not supported/exposed in our xgboost yet + if (aml().getBuildSpec().build_control.isReproducible()) { + searchParams.put("_updater", new String[] {"coord_descent"}); + searchParams.put("_feature_selector", new String[] {"cyclic", "greedy"}); // TODO: check if others are deterministic + } else { + searchParams.put("_updater", new String[] {"shotgun", "coord_descent"}); + searchParams.put("_feature_selector", new String[] {"cyclic", "shuffle", "random", "greedy", "thrifty"}); + } + int ncols = aml().getTrainingFrame().numCols() - (aml().getBuildSpec().getNonPredictors().length + + (aml().getBuildSpec().input_spec.ignored_columns != null ? 
aml().getBuildSpec().input_spec.ignored_columns.length : 0)); + + searchParams.put("_top_k", IntStream.range(0, ncols-1).boxed().toArray(Integer[]::new)); + */ + + searchParams.put("_booster", new XGBoostParameters.Booster[]{ XGBoostParameters.Booster.gblinear }); + + searchParams.put("_reg_lambda", new Float[]{0.001f, 0.01f, 0.1f, 1f, 10f, 100f}); + searchParams.put("_reg_alpha", new Float[]{0.001f, 0.01f, 0.1f, 0.5f, 1f}); + + return searchParams; + } + + } + private final ModelingStep[] grids = new XGBoostGridStep[] { new DefaultXGBoostGridStep("grid_1", aml()), + new XGBoostGBLinearGridStep("grid_gblinear", aml()), + /* new DefaultXGBoostGridStep("grid_1_resume", aml()) { @Override diff --git a/h2o-automl/src/test/java/ai/h2o/automl/ModelingStepRegistryTest.java b/h2o-automl/src/test/java/ai/h2o/automl/ModelingStepRegistryTest.java index a8d4da388a7e..8e1a373b3c9d 100644 --- a/h2o-automl/src/test/java/ai/h2o/automl/ModelingStepRegistryTest.java +++ b/h2o-automl/src/test/java/ai/h2o/automl/ModelingStepRegistryTest.java @@ -81,7 +81,7 @@ public void test_all_registered_steps() { .collect(Collectors.toList()); ModelingStep[] modelingSteps = registry.getOrderedSteps(allSteps.toArray(new StepDefinition[0]), aml); // 2 groups by default (1 for models, 1 for grids), hence the 2*2 SEs + 10 optional SEs - assertEquals((1/*completion*/)+(1+3/*DL*/) + (2/*DRF*/) + (5+1+1/*GBM*/) + (1/*GLM*/) + (2*2+10/*SE*/) + (3+1+2/*XGB*/), + assertEquals((1/*completion*/)+(1+3/*DL*/) + (2/*DRF*/) + (5+1+1/*GBM*/) + (1/*GLM*/) + (2*2+10/*SE*/) + (3+1+2/*XGB*/+1/*gblinear*/), modelingSteps.length); assertEquals(1, Stream.of(modelingSteps).filter(s -> "completion".equals(s.getProvider())).filter(ModelingStep.DynamicStep.class::isInstance).count()); assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.DeepLearning.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count()); @@ -93,7 +93,7 @@ public void test_all_registered_steps() { assertEquals(1, 
Stream.of(modelingSteps).filter(s -> Algo.GLM.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count()); assertEquals(14, Stream.of(modelingSteps).filter(s -> Algo.StackedEnsemble.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count()); assertEquals(3, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count()); - assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.GridStep.class::isInstance).count()); + assertEquals(2, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.GridStep.class::isInstance).count()); assertEquals(2, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.SelectionStep.class::isInstance).count()); List orderedStepIds = Arrays.stream(modelingSteps).flatMap(s -> Stream.of(s._provider, s._id)).collect(Collectors.toList()); @@ -112,7 +112,7 @@ public void test_all_registered_steps() { Algo.DeepLearning.name(), "grid_1", Algo.DeepLearning.name(), "grid_2", Algo.DeepLearning.name(), "grid_3", Algo.GBM.name(), "grid_1", Algo.StackedEnsemble.name(), "best_of_family_2", Algo.StackedEnsemble.name(), "all_2", - Algo.XGBoost.name(), "grid_1", + Algo.XGBoost.name(), "grid_1", Algo.XGBoost.name(), "grid_gblinear", Algo.GBM.name(), "lr_annealing", Algo.StackedEnsemble.name(), "monotonic", Algo.StackedEnsemble.name(), "best_of_family", Algo.StackedEnsemble.name(), "all", @@ -158,7 +158,7 @@ public void test_all_grids() { .toArray(StepDefinition[]::new); ModelingStepsRegistry registry = new ModelingStepsRegistry(); ModelingStep[] modelingSteps = registry.getOrderedSteps(allGridSteps, aml); - assertEquals((3/*DL*/) + (1/*GBM*/) + (1/*XGB*/), + assertEquals((3/*DL*/) + (1/*GBM*/) + (1/*XGB*/+1/*gblinear*/), modelingSteps.length); } @@ -173,7 +173,7 @@ public 
void test_all_defaults_plus_grids() { ModelingStepsRegistry registry = new ModelingStepsRegistry(); ModelingStep[] modelingSteps = registry.getOrderedSteps(allGridSteps, aml); // by default, 1 group for default models, 1 group for grids, hence the 2*2 SEs - assertEquals((1+3/*DL*/) + (2/*DRF*/) + (5+1/*GBM*/) + (1/*GLM*/) + (2*2/*SE*/) + (3+1/*XGB*/), + assertEquals((1+3/*DL*/) + (2/*DRF*/) + (5+1/*GBM*/) + (1/*GLM*/) + (2*2/*SE*/) + (3+1/*XGB*/+1/*gblinear*/), modelingSteps.length); } diff --git a/h2o-py/tests/testdir_algos/automl/pyunit_automl_xgboost_gblinear.py b/h2o-py/tests/testdir_algos/automl/pyunit_automl_xgboost_gblinear.py new file mode 100644 index 000000000000..22486a494582 --- /dev/null +++ b/h2o-py/tests/testdir_algos/automl/pyunit_automl_xgboost_gblinear.py @@ -0,0 +1,77 @@ +import sys, os + +sys.path.insert(1, os.path.join("..", "..", "..")) +import h2o +import h2o.exceptions +from h2o.automl import H2OAutoML +from tests import pyunit_utils as pu + +from _automl_utils import import_dataset, get_partitioned_model_names + +max_models = 5 + + +def _is_gblinear(model_id): + model = h2o.get_model(model_id) + return model.actual_params["booster"] == "gblinear" + + +def test_automl_doesnt_containt_gblinear_by_default(): + ds = import_dataset() + aml = H2OAutoML(max_models=20, + seed=1, include_algos=["xgboost"]) + aml.train(y=ds.target, training_frame=ds.train) + print(aml.leaderboard) + for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]: + assert not _is_gblinear(m[0]) + + print("GBLinear model count: {}".format( + sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])))) + + aml = H2OAutoML(max_runtime_secs=120, + seed=1, include_algos=["xgboost"]) + aml.train(y=ds.target, training_frame=ds.train) + print(aml.leaderboard) + for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]: + assert not _is_gblinear(m[0]) + + print("GBLinear model count: {}".format( + sum((_is_gblinear(m[0]) for m in 
aml.leaderboard.as_data_frame(use_pandas=False)[1:])))) + + +def test_automl_containt_gblinear_when_used_modeling_plan(): + ds = import_dataset() + aml = H2OAutoML(max_models=6, + modeling_plan=[dict(name="XGBoost", steps=[dict(id="grid_gblinear"), dict(id="grid_1")])], + seed=1) + aml.train(y=ds.target, training_frame=ds.train) + print(aml.leaderboard) + assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]) + print("GBLinear model count: {}".format( + sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])))) + + aml = H2OAutoML(max_models=6, + modeling_plan=[("XGBoost", "grids")], + seed=1) + aml.train(y=ds.target, training_frame=ds.train) + print(aml.leaderboard) + assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]) + print("GBLinear model count: {}".format( + sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])))) + + aml = H2OAutoML(max_runtime_secs=60, + modeling_plan=[ + ("XGBoost",) + ], + seed=1) + aml.train(y=ds.target, training_frame=ds.train) + print(aml.leaderboard) + assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]) + print("GBLinear model count: {}".format( + sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])))) + + +pu.run_tests([ + test_automl_doesnt_containt_gblinear_by_default, + test_automl_containt_gblinear_when_used_modeling_plan, +]) diff --git a/h2o-py/tests/testdir_algos/automl/pyunit_automl_xgboost_gblinear_large.py b/h2o-py/tests/testdir_algos/automl/pyunit_automl_xgboost_gblinear_large.py new file mode 100644 index 000000000000..ca2c9570234f --- /dev/null +++ b/h2o-py/tests/testdir_algos/automl/pyunit_automl_xgboost_gblinear_large.py @@ -0,0 +1,128 @@ +import sys, os + +sys.path.insert(1, os.path.join("..", "..", "..")) +import h2o +import h2o.exceptions +from h2o.automl import H2OAutoML +from tests import pyunit_utils as 
pu + +from _automl_utils import import_dataset + +MAX_MODELS = 14 # Minimal number of models needed to contain a model from the gblinear grid + + +def _is_gblinear(model_id): + model = h2o.get_model(model_id) + return model.actual_params.get("booster") == "gblinear" + + +def models_has_same_hyperparams(m1, m2): + for k, v in m1.params.items(): + if k in ["model_id", "training_frame", "validation_frame", "base_models"]: + continue + if k not in m2.params.keys() or v["input"] != m2.params[k]["input"]: + return False + return True + + +def model_is_in_automl(model, automl): + for m in automl.leaderboard.as_data_frame(use_pandas=False)[1:]: + mod = h2o.get_model(m[0]) + if models_has_same_hyperparams(model, mod): + return True + print(model.model_id) + return False + + +def test_automl_XGBoost_gblinear_reproducible_modeling_plan(): + ds = import_dataset() + aml = H2OAutoML(max_models=MAX_MODELS, seed=1) + aml.train(y=ds.target, training_frame=ds.train) + print(aml.leaderboard) + for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]: + assert not _is_gblinear(m[0]) + + aml2 = H2OAutoML(max_models=MAX_MODELS, seed=1, modeling_plan=[ + dict(name="XGBoost", steps=[ + dict(id="def_2", group=1, weight=10), + dict(id="def_1", group=2, weight=10), + dict(id="def_3", group=3, weight=10), + dict(id="grid_1", group=4, weight=90), + dict(id="lr_search", group=7, weight=30), + ]), dict(name="GLM", steps=[ + dict(id="def_1", group=1, weight=10), + ]), dict(name="DRF", steps=[ + dict(id="def_1", group=2, weight=10), + dict(id="XRT", group=3, weight=10), + ]), dict(name="GBM", steps=[ + dict(id="def_5", group=1, weight=10), + dict(id="def_2", group=2, weight=10), + dict(id="def_3", group=2, weight=10), + dict(id="def_4", group=2, weight=10), + dict(id="def_1", group=3, weight=10), + dict(id="grid_1", group=4, weight=60), + dict(id="lr_annealing", group=7, weight=10), + ]), dict(name="DeepLearning", steps=[ + dict(id="def_1", group=3, weight=10), + dict(id="grid_1", group=4, 
weight=30), + dict(id="grid_2", group=5, weight=30), + dict(id="grid_3", group=5, weight=30), + ]), dict(name="completion", steps=[ + dict(id="resume_best_grids", group=6, weight=60), + ]), dict(name="StackedEnsemble", steps=[ + dict(id="monotonic", group=9, weight=10), + dict(id="best_of_family_xglm", group=10, weight=10), + dict(id="all_xglm", group=10, weight=10), + ])]) + aml2.train(y=ds.target, training_frame=ds.train) + print(aml2.leaderboard) + for m in aml2.leaderboard.as_data_frame(use_pandas=False)[1:]: + assert model_is_in_automl(h2o.get_model(m[0]), aml) + + aml_with_gblinear = H2OAutoML(max_models=MAX_MODELS, seed=1, modeling_plan=[ + dict(name="XGBoost", steps=[ + dict(id="def_2", group=1, weight=10), + dict(id="def_1", group=2, weight=10), + dict(id="def_3", group=3, weight=10), + dict(id="grid_1", group=4, weight=90), + dict(id="grid_gblinear", group=4, weight=90), # << XGBoost GBLinear booster grid + dict(id="lr_search", group=7, weight=30), + ]), dict(name="GLM", steps=[ + dict(id="def_1", group=1, weight=10), + ]), dict(name="DRF", steps=[ + dict(id="def_1", group=2, weight=10), + dict(id="XRT", group=3, weight=10), + ]), dict(name="GBM", steps=[ + dict(id="def_5", group=1, weight=10), + dict(id="def_2", group=2, weight=10), + dict(id="def_3", group=2, weight=10), + dict(id="def_4", group=2, weight=10), + dict(id="def_1", group=3, weight=10), + dict(id="grid_1", group=4, weight=60), + dict(id="lr_annealing", group=7, weight=10), + ]), dict(name="DeepLearning", steps=[ + dict(id="def_1", group=3, weight=10), + dict(id="grid_1", group=4, weight=30), + dict(id="grid_2", group=5, weight=30), + dict(id="grid_3", group=5, weight=30), + ]), dict(name="completion", steps=[ + dict(id="resume_best_grids", group=6, weight=60), + ]), dict(name="StackedEnsemble", steps=[ + dict(id="monotonic", group=9, weight=10), + dict(id="best_of_family_xglm", group=10, weight=10), + dict(id="all_xglm", group=10, weight=10), + ])]) + aml_with_gblinear.train(y=ds.target, 
training_frame=ds.train) + print(aml_with_gblinear.leaderboard) + for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]: + assert model_is_in_automl(h2o.get_model(m[0]), aml) or _is_gblinear(m[0]), m[0] + + print("GBLinear model count: {}".format( + sum((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:])))) + + assert any((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:])) + + +pu.run_tests([ + test_automl_XGBoost_gblinear_reproducible_modeling_plan, +])