GH-16033: Add GBLinear #16034
@@ -0,0 +1,77 @@
import sys, os

sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
import h2o.exceptions
from h2o.automl import H2OAutoML
from tests import pyunit_utils as pu

from _automl_utils import import_dataset, get_partitioned_model_names

max_models = 5


def _is_gblinear(model_id):
    model = h2o.get_model(model_id)
    return model.actual_params["booster"] == "gblinear"


def test_automl_doesnt_containt_gblinear_by_default():
    ds = import_dataset()
    aml = H2OAutoML(max_models=20,
                    seed=1, include_algos=["xgboost"])
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
        assert not _is_gblinear(m[0])

    print("GBLinear model count: {}".format(
        sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

    aml = H2OAutoML(max_runtime_secs=120,
                    seed=1, include_algos=["xgboost"])
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
        assert not _is_gblinear(m[0])

    print("GBLinear model count: {}".format(
        sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))


def test_automl_containt_gblinear_when_used_modeling_plan():
    ds = import_dataset()
    aml = H2OAutoML(max_models=6,
                    modeling_plan=[dict(name="XGBoost", steps=[dict(id="grid_gblinear"), dict(id="grid_1")])],
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
    print("GBLinear model count: {}".format(
        sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

    aml = H2OAutoML(max_models=6,
                    modeling_plan=[("XGBoost", "grids")],
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
    print("GBLinear model count: {}".format(
        sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))

    aml = H2OAutoML(max_runtime_secs=60,
                    modeling_plan=[
                        ("XGBoost",)
                    ],
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    assert any(_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:])
    print("GBLinear model count: {}".format(
        sum((_is_gblinear(m[0]) for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]))))


pu.run_tests([
    test_automl_doesnt_containt_gblinear_by_default,
    test_automl_containt_gblinear_when_used_modeling_plan,
])
@@ -0,0 +1,128 @@
import sys, os

sys.path.insert(1, os.path.join("..", "..", ".."))
import h2o
import h2o.exceptions
from h2o.automl import H2OAutoML
from tests import pyunit_utils as pu

from _automl_utils import import_dataset

MAX_MODELS = 14  # minimum number of models needed for the leaderboard to contain a model from the gblinear grid


def _is_gblinear(model_id):
    model = h2o.get_model(model_id)
    return model.actual_params.get("booster") == "gblinear"


def models_has_same_hyperparams(m1, m2):
    for k, v in m1.params.items():
        if k in ["model_id", "training_frame", "validation_frame", "base_models"]:
            continue
        if k not in m2.params.keys() or v["input"] != m2.params[k]["input"]:
            return False
    return True


def model_is_in_automl(model, automl):
    for m in automl.leaderboard.as_data_frame(use_pandas=False)[1:]:
        mod = h2o.get_model(m[0])
        if models_has_same_hyperparams(model, mod):
            return True
    print(model.model_id)
    return False


def test_automl_XGBoost_gblinear_reproducible_modeling_plan():
    ds = import_dataset()
    aml = H2OAutoML(max_models=MAX_MODELS, seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    for m in aml.leaderboard.as_data_frame(use_pandas=False)[1:]:
        assert not _is_gblinear(m[0])

    aml2 = H2OAutoML(max_models=MAX_MODELS, seed=1, modeling_plan=[
        dict(name="XGBoost", steps=[
            dict(id="def_2", group=1, weight=10),
            dict(id="def_1", group=2, weight=10),
            dict(id="def_3", group=3, weight=10),
            dict(id="grid_1", group=4, weight=90),
            dict(id="lr_search", group=7, weight=30),
        ]), dict(name="GLM", steps=[
            dict(id="def_1", group=1, weight=10),
        ]), dict(name="DRF", steps=[
            dict(id="def_1", group=2, weight=10),
            dict(id="XRT", group=3, weight=10),
        ]), dict(name="GBM", steps=[
            dict(id="def_5", group=1, weight=10),
            dict(id="def_2", group=2, weight=10),
            dict(id="def_3", group=2, weight=10),
            dict(id="def_4", group=2, weight=10),
            dict(id="def_1", group=3, weight=10),
            dict(id="grid_1", group=4, weight=60),
            dict(id="lr_annealing", group=7, weight=10),
        ]), dict(name="DeepLearning", steps=[
            dict(id="def_1", group=3, weight=10),
            dict(id="grid_1", group=4, weight=30),
            dict(id="grid_2", group=5, weight=30),
            dict(id="grid_3", group=5, weight=30),
        ]), dict(name="completion", steps=[
            dict(id="resume_best_grids", group=6, weight=60),
        ]), dict(name="StackedEnsemble", steps=[
            dict(id="monotonic", group=9, weight=10),
            dict(id="best_of_family_xglm", group=10, weight=10),
            dict(id="all_xglm", group=10, weight=10),
        ])])
    aml2.train(y=ds.target, training_frame=ds.train)
    print(aml2.leaderboard)
    for m in aml2.leaderboard.as_data_frame(use_pandas=False)[1:]:
        assert model_is_in_automl(h2o.get_model(m[0]), aml)

    aml_with_gblinear = H2OAutoML(max_models=MAX_MODELS, seed=1, modeling_plan=[
        dict(name="XGBoost", steps=[
            dict(id="def_2", group=1, weight=10),
            dict(id="def_1", group=2, weight=10),
            dict(id="def_3", group=3, weight=10),
            dict(id="grid_1", group=4, weight=90),
            dict(id="grid_gblinear", group=4, weight=90),  # << XGBoost GBLinear booster grid
            dict(id="lr_search", group=7, weight=30),
        ]), dict(name="GLM", steps=[
            dict(id="def_1", group=1, weight=10),
        ]), dict(name="DRF", steps=[
            dict(id="def_1", group=2, weight=10),
            dict(id="XRT", group=3, weight=10),
        ]), dict(name="GBM", steps=[
            dict(id="def_5", group=1, weight=10),
            dict(id="def_2", group=2, weight=10),
            dict(id="def_3", group=2, weight=10),
            dict(id="def_4", group=2, weight=10),
            dict(id="def_1", group=3, weight=10),
            dict(id="grid_1", group=4, weight=60),
            dict(id="lr_annealing", group=7, weight=10),
        ]), dict(name="DeepLearning", steps=[
            dict(id="def_1", group=3, weight=10),
            dict(id="grid_1", group=4, weight=30),
            dict(id="grid_2", group=5, weight=30),
            dict(id="grid_3", group=5, weight=30),
        ]), dict(name="completion", steps=[
            dict(id="resume_best_grids", group=6, weight=60),
        ]), dict(name="StackedEnsemble", steps=[
            dict(id="monotonic", group=9, weight=10),
            dict(id="best_of_family_xglm", group=10, weight=10),
            dict(id="all_xglm", group=10, weight=10),
        ])])
    aml_with_gblinear.train(y=ds.target, training_frame=ds.train)
Comment on lines +82 to +115:
Example of how to run normal automl with gblinear grid.
Is this what we want to give to the customer when they want to try gblinear?
AFAIK it's just one customer and from a limited benchmark.
    print(aml_with_gblinear.leaderboard)
    for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]:
        assert model_is_in_automl(h2o.get_model(m[0]), aml) or _is_gblinear(m[0]), m[0]

    print("GBLinear model count: {}".format(
        sum((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]))))

    assert any((_is_gblinear(m[0]) for m in aml_with_gblinear.leaderboard.as_data_frame(use_pandas=False)[1:]))


pu.run_tests([
    test_automl_XGBoost_gblinear_reproducible_modeling_plan,
])
Review comment:
Nice to add this in now. You can uncomment it once Adam exposes those parameters.
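The review thread above asks what a customer would actually write to try the gblinear booster. Below is a minimal sketch of that usage, assuming only the grid_gblinear XGBoost step id exercised by the tests in this PR; the dataset URL and target column are placeholders for illustration, not part of the PR:

import h2o
from h2o.automl import H2OAutoML

h2o.init()

# Placeholder dataset; substitute your own frame and target column.
train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
train["CAPSULE"] = train["CAPSULE"].asfactor()

# Ask AutoML to run the default XGBoost grid (grid_1) plus the new
# GBLinear booster grid (grid_gblinear), mirroring the first test above.
aml = H2OAutoML(max_models=6,
                seed=1,
                modeling_plan=[dict(name="XGBoost",
                                    steps=[dict(id="grid_gblinear"), dict(id="grid_1")])])
aml.train(y="CAPSULE", training_frame=train)
print(aml.leaderboard)

If modeling_plan overrides the default plan (which is what the tests above rely on), this run is restricted to the listed XGBoost steps rather than the full default AutoML pipeline.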