Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-16033: Add GBLinear #16034

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions h2o-automl/src/main/java/ai/h2o/automl/modeling/XGBoostSteps.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import water.Key;

import java.util.*;
import java.util.stream.IntStream;

public class XGBoostSteps extends ModelingSteps {

Expand Down Expand Up @@ -215,8 +216,51 @@ public Map<String, Object[]> prepareSearchParameters() {

}

/**
 * Grid step exploring the XGBoost linear booster ({@code gblinear}).
 * <p>
 * This step is not part of the default AutoML plan; it only runs when requested
 * explicitly via a modeling plan (step id {@code "grid_gblinear"}).
 * It searches over the L2 ({@code reg_lambda}) and L1 ({@code reg_alpha})
 * regularization strengths while pinning the booster to {@code gblinear}.
 */
static class XGBoostGBLinearGridStep extends XGBoostGridStep {

    public XGBoostGBLinearGridStep(String id, AutoML autoML) {
        // `false`: see XGBoostGridStep — this grid does not use the emphasis/default variant.
        super(id, autoML, false);
    }


    @Override
    public XGBoostParameters prepareModelParameters() {
        return XGBoostSteps.prepareModelParameters(aml(), false);
    }

    /**
     * Builds the hyperparameter search space for the gblinear grid.
     *
     * @return map of XGBoost parameter names to the candidate values to search over
     */
    @Override
    public Map<String, Object[]> prepareSearchParameters() {
        Map<String, Object[]> searchParams = new HashMap<>();

        /*
        // not supported/exposed in our xgboost yet — uncomment once these parameters are exposed
        if (aml().getBuildSpec().build_control.isReproducible()) {
            searchParams.put("_updater", new String[] {"coord_descent"});
            searchParams.put("_feature_selector", new String[] {"cyclic", "greedy"}); // TODO: check if others are deterministic
        } else {
            searchParams.put("_updater", new String[] {"shotgun", "coord_descent"});
            searchParams.put("_feature_selector", new String[] {"cyclic", "shuffle", "random", "greedy", "thrifty"});
        }
        // number of predictor columns = all columns minus non-predictors and ignored columns
        int ncols = aml().getTrainingFrame().numCols() - (aml().getBuildSpec().getNonPredictors().length +
                (aml().getBuildSpec().input_spec.ignored_columns != null ? aml().getBuildSpec().input_spec.ignored_columns.length : 0));

        searchParams.put("_top_k", IntStream.range(0, ncols-1).boxed().toArray(Integer[]::new));
        */

        // Pin the booster to gblinear; only the regularization strengths are searched.
        searchParams.put("_booster", new XGBoostParameters.Booster[]{ XGBoostParameters.Booster.gblinear });

        searchParams.put("_reg_lambda", new Float[]{0.001f, 0.01f, 0.1f, 1f, 10f, 100f});
        searchParams.put("_reg_alpha", new Float[]{0.001f, 0.01f, 0.1f, 0.5f, 1f});

        return searchParams;
    }

}

private final ModelingStep[] grids = new XGBoostGridStep[] {
new DefaultXGBoostGridStep("grid_1", aml()),
new XGBoostGBLinearGridStep("grid_gblinear", aml()),

/*
new DefaultXGBoostGridStep("grid_1_resume", aml()) {
@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public void test_all_registered_steps() {
.collect(Collectors.toList());
ModelingStep[] modelingSteps = registry.getOrderedSteps(allSteps.toArray(new StepDefinition[0]), aml);
// 2 groups by default (1 for models, 1 for grids), hence the 2*2 SEs + 10 optional SEs
assertEquals((1/*completion*/)+(1+3/*DL*/) + (2/*DRF*/) + (5+1+1/*GBM*/) + (1/*GLM*/) + (2*2+10/*SE*/) + (3+1+2/*XGB*/),
assertEquals((1/*completion*/)+(1+3/*DL*/) + (2/*DRF*/) + (5+1+1/*GBM*/) + (1/*GLM*/) + (2*2+10/*SE*/) + (3+1+2/*XGB*/+1/*gblinear*/),
modelingSteps.length);
assertEquals(1, Stream.of(modelingSteps).filter(s -> "completion".equals(s.getProvider())).filter(ModelingStep.DynamicStep.class::isInstance).count());
assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.DeepLearning.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
Expand All @@ -93,7 +93,7 @@ public void test_all_registered_steps() {
assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.GLM.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
assertEquals(14, Stream.of(modelingSteps).filter(s -> Algo.StackedEnsemble.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
assertEquals(3, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.ModelStep.class::isInstance).count());
assertEquals(1, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.GridStep.class::isInstance).count());
assertEquals(2, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.GridStep.class::isInstance).count());
assertEquals(2, Stream.of(modelingSteps).filter(s -> Algo.XGBoost.name().equals(s.getProvider())).filter(ModelingStep.SelectionStep.class::isInstance).count());

List<String> orderedStepIds = Arrays.stream(modelingSteps).flatMap(s -> Stream.of(s._provider, s._id)).collect(Collectors.toList());
Expand All @@ -112,7 +112,7 @@ public void test_all_registered_steps() {
Algo.DeepLearning.name(), "grid_1", Algo.DeepLearning.name(), "grid_2", Algo.DeepLearning.name(), "grid_3",
Algo.GBM.name(), "grid_1",
Algo.StackedEnsemble.name(), "best_of_family_2", Algo.StackedEnsemble.name(), "all_2",
Algo.XGBoost.name(), "grid_1",
Algo.XGBoost.name(), "grid_1", Algo.XGBoost.name(), "grid_gblinear",
Algo.GBM.name(), "lr_annealing",
Algo.StackedEnsemble.name(), "monotonic",
Algo.StackedEnsemble.name(), "best_of_family", Algo.StackedEnsemble.name(), "all",
Expand Down Expand Up @@ -158,7 +158,7 @@ public void test_all_grids() {
.toArray(StepDefinition[]::new);
ModelingStepsRegistry registry = new ModelingStepsRegistry();
ModelingStep[] modelingSteps = registry.getOrderedSteps(allGridSteps, aml);
assertEquals((3/*DL*/) + (1/*GBM*/) + (1/*XGB*/),
assertEquals((3/*DL*/) + (1/*GBM*/) + (1/*XGB*/+1/*gblinear*/),
modelingSteps.length);
}

Expand All @@ -173,7 +173,7 @@ public void test_all_defaults_plus_grids() {
ModelingStepsRegistry registry = new ModelingStepsRegistry();
ModelingStep[] modelingSteps = registry.getOrderedSteps(allGridSteps, aml);
// by default, 1 group for default models, 1 group for grids, hence the 2*2 SEs
assertEquals((1+3/*DL*/) + (2/*DRF*/) + (5+1/*GBM*/) + (1/*GLM*/) + (2*2/*SE*/) + (3+1/*XGB*/),
assertEquals((1+3/*DL*/) + (2/*DRF*/) + (5+1/*GBM*/) + (1/*GLM*/) + (2*2/*SE*/) + (3+1/*XGB*/+1/*gblinear*/),
modelingSteps.length);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import sys, os

sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
import h2o.exceptions
from h2o.automl import H2OAutoML
from tests import pyunit_utils as pu

from _automl_utils import import_dataset, get_partitioned_model_names

max_models = 5


def _is_gblinear(model_id):
    """Return True if the model with the given id uses the gblinear booster.

    Uses ``dict.get`` (consistent with the sibling reproducibility test) so a
    model exposing no "booster" parameter is treated as non-gblinear instead of
    raising ``KeyError``.
    """
    model = h2o.get_model(model_id)
    return model.actual_params.get("booster") == "gblinear"


def test_automl_doesnt_containt_gblinear_by_default():
    """A default XGBoost-only AutoML run must never produce gblinear models.

    Checked both for a model-count-bounded run and a runtime-bounded run.
    """
    ds = import_dataset()

    for run_limit in (dict(max_models=20), dict(max_runtime_secs=120)):
        aml = H2OAutoML(seed=1, include_algos=["xgboost"], **run_limit)
        aml.train(y=ds.target, training_frame=ds.train)
        print(aml.leaderboard)
        rows = aml.leaderboard.as_data_frame(use_pandas=False)[1:]
        for row in rows:
            assert not _is_gblinear(row[0])

        print("GBLinear model count: {}".format(
            sum((_is_gblinear(row[0]) for row in rows))))


def test_automl_containt_gblinear_when_used_modeling_plan():
    """AutoML must produce gblinear models whenever a modeling plan enables them.

    Covers three ways of requesting the gblinear grid: an explicit step id,
    the "grids" alias, and the bare provider name (all steps).
    """
    ds = import_dataset()

    configurations = (
        dict(max_models=6,
             modeling_plan=[dict(name="XGBoost", steps=[dict(id="grid_gblinear"), dict(id="grid_1")])]),
        dict(max_models=6,
             modeling_plan=[("XGBoost", "grids")]),
        dict(max_runtime_secs=60,
             modeling_plan=[
                 ("XGBoost",)
             ]),
    )
    for config in configurations:
        aml = H2OAutoML(seed=1, **config)
        aml.train(y=ds.target, training_frame=ds.train)
        print(aml.leaderboard)
        rows = aml.leaderboard.as_data_frame(use_pandas=False)[1:]
        assert any(_is_gblinear(row[0]) for row in rows)
        print("GBLinear model count: {}".format(
            sum((_is_gblinear(row[0]) for row in rows))))


# Register the checks with the pyunit runner (also executes them when run standalone).
pu.run_tests([
    test_automl_doesnt_containt_gblinear_by_default,
    test_automl_containt_gblinear_when_used_modeling_plan,
])
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import sys, os

sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
import h2o.exceptions
from h2o.automl import H2OAutoML
from tests import pyunit_utils as pu

from _automl_utils import import_dataset

MAX_MODELS = 14 # Minimal amount of models to contain a model from the gblinear grid


def _is_gblinear(model_id):
    """Return True if the model identified by ``model_id`` uses the gblinear booster."""
    booster = h2o.get_model(model_id).actual_params.get("booster")
    return booster == "gblinear"


def models_has_same_hyperparams(m1, m2):
    """Return True when every hyperparameter of ``m1`` matches ``m2``.

    Run-specific keys (model id, frames, SE base models) are skipped; for the
    rest, the user-specified ("input") values are compared.

    Note: the check is one-directional — keys present only in ``m2`` are
    ignored, which is sufficient for the reproducibility comparison below.
    """
    run_specific = ("model_id", "training_frame", "validation_frame", "base_models")
    for k, v in m1.params.items():
        if k in run_specific:
            continue  # differs between runs by construction
        if k not in m2.params or v["input"] != m2.params[k]["input"]:
            return False
    return True


def model_is_in_automl(model, automl):
    """Return True if ``automl``'s leaderboard contains a model with the same hyperparameters as ``model``."""
    rows = automl.leaderboard.as_data_frame(use_pandas=False)[1:]
    for row in rows:
        candidate = h2o.get_model(row[0])
        if models_has_same_hyperparams(model, candidate):
            return True
    print(model.model_id)  # log the model that had no counterpart
    return False


def _reference_modeling_plan(include_gblinear):
    """Build a modeling plan pinning the default AutoML step ordering and weights.

    The two plans used by the test below differed only by the gblinear grid
    step, so they are generated here from a single source of truth.

    :param include_gblinear: when True, the XGBoost gblinear grid step is added;
        everything else stays identical so runs remain comparable.
    :return: a list of step-definition dicts usable as ``modeling_plan``.
    """
    xgboost_steps = [
        dict(id="def_2", group=1, weight=10),
        dict(id="def_1", group=2, weight=10),
        dict(id="def_3", group=3, weight=10),
        dict(id="grid_1", group=4, weight=90),
    ]
    if include_gblinear:
        xgboost_steps.append(dict(id="grid_gblinear", group=4, weight=90))  # << XGBoost GBLinear booster grid
    xgboost_steps.append(dict(id="lr_search", group=7, weight=30))
    return [
        dict(name="XGBoost", steps=xgboost_steps),
        dict(name="GLM", steps=[
            dict(id="def_1", group=1, weight=10),
        ]),
        dict(name="DRF", steps=[
            dict(id="def_1", group=2, weight=10),
            dict(id="XRT", group=3, weight=10),
        ]),
        dict(name="GBM", steps=[
            dict(id="def_5", group=1, weight=10),
            dict(id="def_2", group=2, weight=10),
            dict(id="def_3", group=2, weight=10),
            dict(id="def_4", group=2, weight=10),
            dict(id="def_1", group=3, weight=10),
            dict(id="grid_1", group=4, weight=60),
            dict(id="lr_annealing", group=7, weight=10),
        ]),
        dict(name="DeepLearning", steps=[
            dict(id="def_1", group=3, weight=10),
            dict(id="grid_1", group=4, weight=30),
            dict(id="grid_2", group=5, weight=30),
            dict(id="grid_3", group=5, weight=30),
        ]),
        dict(name="completion", steps=[
            dict(id="resume_best_grids", group=6, weight=60),
        ]),
        dict(name="StackedEnsemble", steps=[
            dict(id="monotonic", group=9, weight=10),
            dict(id="best_of_family_xglm", group=10, weight=10),
            dict(id="all_xglm", group=10, weight=10),
        ]),
    ]


def test_automl_XGBoost_gblinear_reproducible_modeling_plan():
    """Adding the gblinear grid step must not change the other models AutoML builds.

    Three runs are compared: a default run, the same run pinned by an explicit
    modeling plan, and the pinned plan with the gblinear grid added.
    """
    ds = import_dataset()

    # Baseline: the default plan never produces gblinear models.
    aml = H2OAutoML(max_models=MAX_MODELS, seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
        assert not _is_gblinear(m[0])

    # The explicit plan reproduces the default run model-for-model.
    aml2 = H2OAutoML(max_models=MAX_MODELS, seed=1,
                     modeling_plan=_reference_modeling_plan(include_gblinear=False))
    aml2.train(y=ds.target, training_frame=ds.train)
    print(aml2.leaderboard)
    for m in aml2.leaderboard.as_data_frame(use_pandas=False)[1:]:
        assert model_is_in_automl(h2o.get_model(m[0]), aml)

    # Same plan + gblinear grid: every model is either reproduced from the
    # baseline or comes from the gblinear grid.
    aml_with_gblinear = H2OAutoML(max_models=MAX_MODELS, seed=1,
                                  modeling_plan=_reference_modeling_plan(include_gblinear=True))
    aml_with_gblinear.train(y=ds.target, training_frame=ds.train)
    print(aml_with_gblinear.leaderboard)
    for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]:
        assert model_is_in_automl(h2o.get_model(m[0]), aml) or _is_gblinear(m[0]), m[0]

    print("GBLinear model count: {}".format(
        sum((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]))))

    assert any((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]))


# Register the check with the pyunit runner (also executes it when run standalone).
pu.run_tests([
    test_automl_XGBoost_gblinear_reproducible_modeling_plan,
])
Loading