GH-16033: Add GBLinear (#16034)
* Add GBLinear

* Add example with normal automl run + gblinear

* Fix java tests
tomasfryda authored Jan 30, 2024
1 parent 5620485 commit e69e2ce
Showing 4 changed files with 254 additions and 5 deletions.
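The new gblinear grid is not part of the default AutoML plan; as the tests below verify, it only runs when requested explicitly through modeling_plan. A minimal opt-in sketch in Python (h2o.init(), the dataset path, and the "response" target column are placeholders, not part of this commit):

import h2o
from h2o.automl import H2OAutoML

h2o.init()
train = h2o.import_file("train.csv")  # placeholder dataset path

aml = H2OAutoML(
    max_models=6,
    seed=1,
    # Schedule the new gblinear grid alongside the default XGBoost grid.
    modeling_plan=[dict(name="XGBoost",
                        steps=[dict(id="grid_gblinear"), dict(id="grid_1")])],
)
aml.train(y="response", training_frame=train)  # placeholder target column
print(aml.leaderboard)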
44 changes: 44 additions & 0 deletions h2o-automl/src/main/java/ai/h2o/automl/modeling/XGBoostSteps.java
@@ -15,6 +15,7 @@
import water.Key;

import java.util.*;
import java.util.stream.IntStream;

public class XGBoostSteps extends ModelingSteps {

@@ -215,8 +216,51 @@ public Map<String, Object[]> prepareSearchParameters() {

}

static class XGBoostGBLinearGridStep extends XGBoostGridStep {

public XGBoostGBLinearGridStep(String id, AutoML autoML) {
super(id, autoML, false);
}
@Override
public XGBoostParameters prepareModelParameters() {
return XGBoostSteps.prepareModelParameters(aml(), false);
}

@Override
public Map<String, Object[]> prepareSearchParameters() {
Map<String, Object[]> searchParams = new HashMap<>();

/*
// not supported/exposed in our xgboost yet
if (aml().getBuildSpec().build_control.isReproducible()) {
searchParams.put("_updater", new String[] {"coord_descent"});
searchParams.put("_feature_selector", new String[] {"cyclic", "greedy"}); // TODO: check if others are deterministic
} else {
searchParams.put("_updater", new String[] {"shotgun", "coord_descent"});
searchParams.put("_feature_selector", new String[] {"cyclic", "shuffle", "random", "greedy", "thrifty"});
}
int ncols = aml().getTrainingFrame().numCols() - (aml().getBuildSpec().getNonPredictors().length +
(aml().getBuildSpec().input_spec.ignored_columns != null ? aml().getBuildSpec().input_spec.ignored_columns.length : 0));
searchParams.put("_top_k", IntStream.range(0, ncols-1).boxed().toArray(Integer[]::new));
*/

searchParams.put("_booster", new XGBoostParameters.Booster[]{ XGBoostParameters.Booster.gblinear });

searchParams.put("_reg_lambda", new Float[]{0.001f, 0.01f, 0.1f, 1f, 10f, 100f});
searchParams.put("_reg_alpha", new Float[]{0.001f, 0.01f, 0.1f, 0.5f, 1f});

return searchParams;
}

}

private final ModelingStep[] grids = new XGBoostGridStep[] {
new DefaultXGBoostGridStep("grid_1", aml()),
new XGBoostGBLinearGridStep("grid_gblinear", aml()),

/*
new DefaultXGBoostGridStep("grid_1_resume", aml()) {
@Override
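prepareSearchParameters above pins the booster to gblinear and sweeps only the two regularization strengths (the updater and feature-selector dimensions stay commented out until they are exposed in h2o's XGBoost). A rough standalone equivalent using H2O's Python grid search, assuming an initialized cluster plus a placeholder frame train with target "response":

from h2o.estimators.xgboost import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch

# Mirror the AutoML step's search space: the _reg_lambda and _reg_alpha
# grids, with the booster fixed to gblinear.
hyper_params = {
    "reg_lambda": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    "reg_alpha": [0.001, 0.01, 0.1, 0.5, 1.0],
}
grid = H2OGridSearch(model=H2OXGBoostEstimator(booster="gblinear", seed=1),
                     hyper_params=hyper_params)
grid.train(y="response", training_frame=train)
print(grid.get_grid(sort_by="logloss", decreasing=False))  # logloss assumes classification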
@@ -81,7 +81,7 @@ public void test_all_registered_steps() {
.collect(Collectors.toList());
ModelingStep[] modelingSteps = registry.getOrderedSteps(allSteps.toArray(new StepDefinition[0]), aml);
// 2 groups by default (1 for models, 1 for grids), hence the 2*2 SEs + 10 optional SEs
-assertEquals((1/*completion*/)+(1+3/*DL*/) + (2/*DRF*/) + (5+1+1/*GBM*/) + (1/*GLM*/) + (2*2+10/*SE*/) + (3+1+2/*XGB*/),
+assertEquals((1/*completion*/)+(1+3/*DL*/) + (2/*DRF*/) + (5+1+1/*GBM*/) + (1/*GLM*/) + (2*2+10/*SE*/) + (3+1+2/*XGB*/+1/*gblinear*/),
modelingSteps.length);
assertEquals(1, Stream.of(modelingSteps).filter(s -> "completion".equals(s.getProvider())).filter(ModelingStep.DynamicStep.class::isInstance).count());
assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.DeepLearning.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
@@ -93,7 +93,7 @@ public void test_all_registered_steps() {
assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.GLM.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
assertEquals(14, Stream.of(modelingSteps).filter(s -> Algo.StackedEnsemble.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
assertEquals(3, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
-assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.GridStep.class::isInstance).count());
+assertEquals(2, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.GridStep.class::isInstance).count());
assertEquals(2, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.SelectionStep.class::isInstance).count());

List<String> orderedStepIds = Arrays.stream(modelingSteps).flatMap(s -> Stream.of(s._provider, s._id)).collect(Collectors.toList());
@@ -112,7 +112,7 @@ public void test_all_registered_steps() {
Algo.DeepLearning.name(), "grid_1", Algo.DeepLearning.name(), "grid_2", Algo.DeepLearning.name(), "grid_3",
Algo.GBM.name(), "grid_1",
Algo.StackedEnsemble.name(), "best_of_family_2", Algo.StackedEnsemble.name(), "all_2",
Algo.XGBoost.name(), "grid_1",
Algo.XGBoost.name(), "grid_1", Algo.XGBoost.name(), "grid_gblinear",
Algo.GBM.name(), "lr_annealing",
Algo.StackedEnsemble.name(), "monotonic",
Algo.StackedEnsemble.name(), "best_of_family", Algo.StackedEnsemble.name(), "all",
@@ -158,7 +158,7 @@ public void test_all_grids() {
.toArray(StepDefinition[]::new);
ModelingStepsRegistry registry = new ModelingStepsRegistry();
ModelingStep[] modelingSteps = registry.getOrderedSteps(allGridSteps, aml);
-assertEquals((3/*DL*/) + (1/*GBM*/) + (1/*XGB*/),
+assertEquals((3/*DL*/) + (1/*GBM*/) + (1/*XGB*/+1/*gblinear*/),
modelingSteps.length);
}

@@ -173,7 +173,7 @@ public void test_all_defaults_plus_grids() {
ModelingStepsRegistry registry = new ModelingStepsRegistry();
ModelingStep[] modelingSteps = registry.getOrderedSteps(allGridSteps, aml);
// by default, 1 group for default models, 1 group for grids, hence the 2*2 SEs
-assertEquals((1+3/*DL*/) + (2/*DRF*/) + (5+1/*GBM*/) + (1/*GLM*/) + (2*2/*SE*/) + (3+1/*XGB*/),
+assertEquals((1+3/*DL*/) + (2/*DRF*/) + (5+1/*GBM*/) + (1/*GLM*/) + (2*2/*SE*/) + (3+1/*XGB*/+1/*gblinear*/),
modelingSteps.length);
}

@@ -0,0 +1,77 @@
import sys, os

sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
import h2o.exceptions
from h2o.automl import H2OAutoML
from tests import pyunit_utils as pu

from _automl_utils import import_dataset, get_partitioned_model_names

max_models = 5


def _is_gblinear(model_id):
    # A model came from the gblinear grid iff its booster resolved to
    # "gblinear"; .get() avoids a KeyError for models without that parameter.
    model = h2o.get_model(model_id)
    return model.actual_params.get("booster") == "gblinear"


def test_automl_doesnt_contain_gblinear_by_default():
ds = import_dataset()
aml = H2OAutoML(max_models=20,
seed=1, include_algos=["xgboost"])
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert not _is_gblinear(m[0])

print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

aml = H2OAutoML(max_runtime_secs=120,
seed=1, include_algos=["xgboost"])
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert not _is_gblinear(m[0])

print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))


def test_automl_contains_gblinear_when_using_modeling_plan():
ds = import_dataset()
aml = H2OAutoML(max_models=6,
modeling_plan=[dict(name="XGBoost", steps=[dict(id="grid_gblinear"), dict(id="grid_1")])],
seed=1)
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

aml = H2OAutoML(max_models=6,
modeling_plan=[("XGBoost", "grids")],
seed=1)
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

aml = H2OAutoML(max_runtime_secs=60,
modeling_plan=[
("XGBoost",)
],
seed=1)
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))


pu.run_tests([
    test_automl_doesnt_contain_gblinear_by_default,
    test_automl_contains_gblinear_when_using_modeling_plan,
])
@@ -0,0 +1,128 @@
import sys, os

sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
import h2o.exceptions
from h2o.automl import H2OAutoML
from tests import pyunit_utils as pu

from _automl_utils import import_dataset

MAX_MODELS = 14  # Minimal number of models needed for the leaderboard to include one from the gblinear grid


def _is_gblinear(model_id):
model = h2o.get_model(model_id)
return model.actual_params.get("booster") == "gblinear"


def models_have_same_hyperparams(m1, m2):
    # Compare user-specified hyperparameters, skipping keys that legitimately
    # differ between otherwise identical models.
    for k, v in m1.params.items():
        if k in ["model_id", "training_frame", "validation_frame", "base_models"]:
            continue
        if k not in m2.params or v["input"] != m2.params[k]["input"]:
            return False
    return True


def model_is_in_automl(model, automl):
    for m in automl.leaderboard.as_data_frame(use_pandas=False)[1:]:
        mod = h2o.get_model(m[0])
        if models_have_same_hyperparams(model, mod):
            return True
    print(model.model_id)  # log the unmatched model to ease debugging
    return False


def test_automl_XGBoost_gblinear_reproducible_modeling_plan():
ds = import_dataset()
aml = H2OAutoML(max_models=MAX_MODELS, seed=1)
aml.train(y=ds.target, training_frame=ds.train)
print(aml.leaderboard)
for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert not _is_gblinear(m[0])

aml2 = H2OAutoML(max_models=MAX_MODELS, seed=1, modeling_plan=[
dict(name="XGBoost", steps=[
dict(id="def_2", group=1, weight=10),
dict(id="def_1", group=2, weight=10),
dict(id="def_3", group=3, weight=10),
dict(id="grid_1", group=4, weight=90),
dict(id="lr_search", group=7, weight=30),
]), dict(name="GLM", steps=[
dict(id="def_1", group=1, weight=10),
]), dict(name="DRF", steps=[
dict(id="def_1", group=2, weight=10),
dict(id="XRT", group=3, weight=10),
]), dict(name="GBM", steps=[
dict(id="def_5", group=1, weight=10),
dict(id="def_2", group=2, weight=10),
dict(id="def_3", group=2, weight=10),
dict(id="def_4", group=2, weight=10),
dict(id="def_1", group=3, weight=10),
dict(id="grid_1", group=4, weight=60),
dict(id="lr_annealing", group=7, weight=10),
]), dict(name="DeepLearning", steps=[
dict(id="def_1", group=3, weight=10),
dict(id="grid_1", group=4, weight=30),
dict(id="grid_2", group=5, weight=30),
dict(id="grid_3", group=5, weight=30),
]), dict(name="completion", steps=[
dict(id="resume_best_grids", group=6, weight=60),
]), dict(name="StackedEnsemble", steps=[
dict(id="monotonic", group=9, weight=10),
dict(id="best_of_family_xglm", group=10, weight=10),
dict(id="all_xglm", group=10, weight=10),
])])
aml2.train(y=ds.target, training_frame=ds.train)
print(aml2.leaderboard)
for m in aml2.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert model_is_in_automl(h2o.get_model(m[0]), aml)

aml_with_gblinear = H2OAutoML(max_models=MAX_MODELS, seed=1, modeling_plan=[
dict(name="XGBoost", steps=[
dict(id="def_2", group=1, weight=10),
dict(id="def_1", group=2, weight=10),
dict(id="def_3", group=3, weight=10),
dict(id="grid_1", group=4, weight=90),
dict(id="grid_gblinear", group=4, weight=90), # << XGBoost GBLinear booster grid
dict(id="lr_search", group=7, weight=30),
]), dict(name="GLM", steps=[
dict(id="def_1", group=1, weight=10),
]), dict(name="DRF", steps=[
dict(id="def_1", group=2, weight=10),
dict(id="XRT", group=3, weight=10),
]), dict(name="GBM", steps=[
dict(id="def_5", group=1, weight=10),
dict(id="def_2", group=2, weight=10),
dict(id="def_3", group=2, weight=10),
dict(id="def_4", group=2, weight=10),
dict(id="def_1", group=3, weight=10),
dict(id="grid_1", group=4, weight=60),
dict(id="lr_annealing", group=7, weight=10),
]), dict(name="DeepLearning", steps=[
dict(id="def_1", group=3, weight=10),
dict(id="grid_1", group=4, weight=30),
dict(id="grid_2", group=5, weight=30),
dict(id="grid_3", group=5, weight=30),
]), dict(name="completion", steps=[
dict(id="resume_best_grids", group=6, weight=60),
]), dict(name="StackedEnsemble", steps=[
dict(id="monotonic", group=9, weight=10),
dict(id="best_of_family_xglm", group=10, weight=10),
dict(id="all_xglm", group=10, weight=10),
])])
aml_with_gblinear.train(y=ds.target, training_frame=ds.train)
print(aml_with_gblinear.leaderboard)
for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]:
assert model_is_in_automl(h2o.get_model(m[0]), aml) or _is_gblinear(m[0]), m[0]

print("GBLinear model count: {}".format(
sum((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]))))

assert any((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]))


pu.run_tests([
test_automl_XGBoost_gblinear_reproducible_modeling_plan,
])
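Both test files repeat the same leaderboard-scan idiom to count gblinear models. A small helper like the following (hypothetical, not part of this commit) captures it; the [1:] slice skips the header row that as_data_frame(use_pandas=False) returns first:

import h2o

def count_gblinear(aml):
    # Count leaderboard models whose booster resolved to "gblinear".
    return sum(
        h2o.get_model(row[0]).actual_params.get("booster") == "gblinear"
        for row in aml.leaderboard.as_data_frame(use_pandas=False)[1:]
    )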
