Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GH-15971] Introduce Java Property Disabling POJO Import #16021

Merged
merged 4 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions h2o-algos/src/main/java/hex/generic/Generic.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,13 @@ public void computeImpl() {
if (ZipUtil.isCompressed(modelBytes)) {
genericModel = importMojo(modelBytes, dataKey);
} else {
warn("_path", "Trying to import a POJO model - this is currently an experimental feature.");
genericModel = importPojo(modelBytes, dataKey, _result.toString());
if (H2O.getSysBoolProperty("pojo.import.enabled", false)) {
warn("_path", "Trying to import a POJO model - this is currently an experimental feature.");
genericModel = importPojo(modelBytes, dataKey, _result.toString());
} else {
throw new SecurityException("POJO import is disabled since it brings a security risk. " +
"To enable the feature, set the java property `sys.ai.h2o.pojo.import.enabled` to true.");
}
}
genericModel.write_lock(_job);
genericModel.unlock(_job);
Expand Down
2 changes: 2 additions & 0 deletions h2o-algos/src/test/java/hex/generic/GenericModelTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1324,6 +1324,7 @@ public void rulefitMojoTestRegression() throws IOException {
public void testJavaScoring_gbm_binomial_pojo() throws Exception {
try {
Scope.enter();
System.setProperty("sys.ai.h2o.pojo.import.enabled", "true");
// Create new GBM model
final Frame trainingFrame = parseTestFile("./smalldata/testng/airlines_train.csv");
Scope.track(trainingFrame);
Expand Down Expand Up @@ -1356,6 +1357,7 @@ public void testJavaScoring_gbm_binomial_pojo() throws Exception {

assertFrameEquals(scoredOriginal, scoredGeneric, 0);
} finally {
System.clearProperty("sys.ai.h2o.pojo.import.enabled");
Scope.exit();
}
}
Expand Down
68 changes: 36 additions & 32 deletions h2o-py/tests/testdir_algos/gbm/pyunit_gbm_pojo_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,44 @@

sys.path.insert(1, "../../../")
import h2o
import unittest
from tests import pyunit_utils
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from pandas.util.testing import assert_frame_equal


def prostate_pojo_import():
    """Train a GBM on the prostate data, export it as a POJO and import it back.

    The imported model has to reproduce both the predictions and the
    partial dependence table (for ``AGE``) of the original model.
    """
    data_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=data_path).drop("ID")
    prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()

    model = H2OGradientBoostingEstimator()
    model.train(y="CAPSULE", training_frame=prostate)

    # Write the POJO into the results directory and load it through the
    # same entry point that is used for MOJOs.
    results_dir = pyunit_utils.locate("results")
    pojo_path = h2o.download_pojo(model, path=results_dir)
    model_imported = h2o.import_mojo(pojo_path)
    print(model_imported)

    # 1. check scoring
    original_scores = model.predict(prostate)
    imported_scores = model_imported.predict(prostate)
    assert_frame_equal(original_scores.as_data_frame(), imported_scores.as_data_frame())

    # 2. check we can get PDPs
    pdp_kwargs = dict(frame=prostate, cols=['AGE'], server=True, plot=False)
    original_pdp = model.partial_plot(**pdp_kwargs)
    imported_pdp = model_imported.partial_plot(**pdp_kwargs)
    assert_frame_equal(original_pdp[0].as_data_frame(), imported_pdp[0].as_data_frame())


# Entry point: hand the test function to the pyunit harness when the file is
# executed directly, otherwise call it immediately when the module is imported.
if __name__ == "__main__":
    pyunit_utils.standalone_test(prostate_pojo_import)
else:
    prostate_pojo_import()
class TestGBMPojoImport(unittest.TestCase):
    """Round-trips a GBM model through POJO download and import."""

    def test(self):
        """Compare predictions and PDPs of a GBM and its re-imported POJO."""
        # JVM flag passed to h2o.init so that POJO import is enabled.
        jvm_args = ["-Dsys.ai.h2o.pojo.import.enabled=true", ]
        try:
            h2o.init(strict_version_check=False, jvm_custom_args=jvm_args)

            data_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
            prostate = h2o.import_file(path=data_path).drop("ID")
            prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()

            model = H2OGradientBoostingEstimator()
            model.train(y="CAPSULE", training_frame=prostate)

            # Write the POJO into the results directory and load it back.
            results_dir = pyunit_utils.locate("results")
            pojo_path = h2o.download_pojo(model, path=results_dir)
            model_imported = h2o.import_mojo(pojo_path)
            print(model_imported)

            # 1. check scoring
            original_scores = model.predict(prostate)
            imported_scores = model_imported.predict(prostate)
            assert_frame_equal(original_scores.as_data_frame(), imported_scores.as_data_frame())

            # 2. check we can get PDPs
            pdp_kwargs = dict(frame=prostate, cols=['AGE'], server=True, plot=False)
            original_pdp = model.partial_plot(**pdp_kwargs)
            imported_pdp = model_imported.partial_plot(**pdp_kwargs)
            assert_frame_equal(original_pdp[0].as_data_frame(), imported_pdp[0].as_data_frame())
        finally:
            # Always stop the cluster this test connected to.
            h2o.cluster().shutdown()


# Run the test case explicitly. TextTestRunner.run() only reports failures and
# returns a TestResult, so the outcome is turned into a non-zero exit status
# here; otherwise a failing test would leave the script looking successful.
suite = unittest.TestLoader().loadTestsFromTestCase(TestGBMPojoImport)
result = unittest.TextTestRunner().run(suite)
if not result.wasSuccessful():
    raise SystemExit(1)
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from h2o.estimators import H2OGradientBoostingEstimator, H2OGeneralizedLinearEstimator
from tests import pyunit_utils
import os
import sys
import unittest
from pandas.testing import assert_frame_equal


Expand Down Expand Up @@ -282,66 +282,65 @@ def make_pojo_embeddable(pojo_path):
return "".join(pojo_lines)


def generate_and_import_combined_pojo():
    """Build a POJO combining a GLM and a GBM, upload it and verify its scoring.

    Predictions of the uploaded model are compared with those of the two
    source models for the different values of the ``Bias`` column and for
    the individual ``ChangeWindDirect`` segments.
    """
    if sys.version_info[0] < 3:  # Python 2
        print("This example needs Python 3.x+")
        return

    weather_orig = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/weather.csv"))
    weather = weather_orig  # working copy

    response = "RISK_MM"
    features = sorted(set(weather.names) - {"Date", "RainTomorrow", "Sunshine"})

    glm_model = H2OGeneralizedLinearEstimator()
    glm_model.train(x=features, y=response, training_frame=weather)
    glm_preds = glm_model.predict(weather)

    gbm_model = H2OGradientBoostingEstimator(ntrees=5)
    gbm_model.train(x=features, y=response, training_frame=weather)
    gbm_preds = gbm_model.predict(weather)

    # Drop columns that we will calculate in POJO manually (we will recreate them in POJO to be the exact same)
    weather = weather.drop("ChangeTemp").drop("ChangeTempDir")

    combined_pojo_name, combined_pojo_path = generate_combined_pojo(glm_model, gbm_model)
    print("Combined POJO was stored in: " + combined_pojo_path)

    # Note: when using upload_mojo - always specify model_id=<POJO class name>
    pojo_model = h2o.upload_mojo(combined_pojo_path, model_id=combined_pojo_name)

    # Testing begins

    # Sanity test - test parameterization that delegates to GLM
    weather["Bias"] = 1  # behave like GLM
    pojo_glm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_glm_preds.as_data_frame(), glm_preds.as_data_frame())

    # Sanity test - test parameterization that delegates to GBM
    weather["Bias"] = 0  # behave like GBM
    pojo_gbm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_gbm_preds.as_data_frame(), gbm_preds.as_data_frame())

    # Test per-segment specific behavior, segments are defined by ChangeWindDirect
    weather["Bias"] = float("NaN")
    for change_wind_dir in weather["ChangeWindDirect"].levels()[0]:
        segment = weather[weather["ChangeWindDirect"] == change_wind_dir]
        segment_orig = weather_orig[weather_orig["ChangeWindDirect"] == change_wind_dir]
        segment_preds = pojo_model.predict(segment)
        if change_wind_dir in ("c", "l"):
            expected = glm_model.predict(segment_orig) * 2
        elif change_wind_dir == "n":
            expected = (glm_model.predict(segment_orig) + gbm_model.predict(segment_orig)) / 2
        elif change_wind_dir == "s":
            expected = gbm_model.predict(segment_orig)
        else:
            # Any other level has no expectation attached to it.
            continue
        assert_frame_equal(segment_preds.as_data_frame(), expected.as_data_frame())


# Entry point: hand the test function to the pyunit harness when the file is
# executed directly, otherwise call it immediately when the module is imported.
if __name__ == "__main__":
    pyunit_utils.standalone_test(generate_and_import_combined_pojo)
else:
    generate_and_import_combined_pojo()
class TestCombinedPojoImport(unittest.TestCase):
    """Uploads a POJO combining a GLM and a GBM and verifies its scoring."""

    def test(self):
        """Compare the combined POJO's predictions with the source models."""
        # JVM flag passed to h2o.init so that POJO import is enabled.
        jvm_args = ["-Dsys.ai.h2o.pojo.import.enabled=true", ]
        try:
            h2o.init(strict_version_check=False, jvm_custom_args=jvm_args)
            weather_orig = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/weather.csv"))
            weather = weather_orig  # working copy

            response = "RISK_MM"
            features = sorted(set(weather.names) - {"Date", "RainTomorrow", "Sunshine"})

            glm_model = H2OGeneralizedLinearEstimator()
            glm_model.train(x=features, y=response, training_frame=weather)
            glm_preds = glm_model.predict(weather)

            gbm_model = H2OGradientBoostingEstimator(ntrees=5)
            gbm_model.train(x=features, y=response, training_frame=weather)
            gbm_preds = gbm_model.predict(weather)

            # Drop columns that we will calculate in POJO manually (we will recreate them in POJO to be the exact same)
            weather = weather.drop("ChangeTemp").drop("ChangeTempDir")

            combined_pojo_name, combined_pojo_path = generate_combined_pojo(glm_model, gbm_model)
            print("Combined POJO was stored in: " + combined_pojo_path)

            # Note: when using upload_mojo - always specify model_id=<POJO class name>
            pojo_model = h2o.upload_mojo(combined_pojo_path, model_id=combined_pojo_name)

            # Testing begins

            # Sanity test - test parameterization that delegates to GLM
            weather["Bias"] = 1  # behave like GLM
            pojo_glm_preds = pojo_model.predict(weather)
            assert_frame_equal(pojo_glm_preds.as_data_frame(), glm_preds.as_data_frame())

            # Sanity test - test parameterization that delegates to GBM
            weather["Bias"] = 0  # behave like GBM
            pojo_gbm_preds = pojo_model.predict(weather)
            assert_frame_equal(pojo_gbm_preds.as_data_frame(), gbm_preds.as_data_frame())

            # Test per-segment specific behavior, segments are defined by ChangeWindDirect
            weather["Bias"] = float("NaN")
            for change_wind_dir in weather["ChangeWindDirect"].levels()[0]:
                segment = weather[weather["ChangeWindDirect"] == change_wind_dir]
                segment_orig = weather_orig[weather_orig["ChangeWindDirect"] == change_wind_dir]
                segment_preds = pojo_model.predict(segment)
                if change_wind_dir in ("c", "l"):
                    expected = glm_model.predict(segment_orig) * 2
                elif change_wind_dir == "n":
                    expected = (glm_model.predict(segment_orig) + gbm_model.predict(segment_orig)) / 2
                elif change_wind_dir == "s":
                    expected = gbm_model.predict(segment_orig)
                else:
                    # Any other level has no expectation attached to it.
                    continue
                assert_frame_equal(segment_preds.as_data_frame(), expected.as_data_frame())
        finally:
            # Always stop the cluster this test connected to.
            h2o.cluster().shutdown()


# Run the test case explicitly. TextTestRunner.run() only reports failures and
# returns a TestResult, so the outcome is turned into a non-zero exit status
# here; otherwise a failing test would leave the script looking successful.
suite = unittest.TestLoader().loadTestsFromTestCase(TestCombinedPojoImport)
result = unittest.TextTestRunner().run(suite)
if not result.wasSuccessful():
    raise SystemExit(1)
114 changes: 59 additions & 55 deletions h2o-py/tests/testdir_generic_model/pyunit_mojo_import.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,69 @@
import h2o
import unittest
import tempfile
from h2o.estimators import H2OGradientBoostingEstimator, H2OGenericEstimator
from tests import pyunit_utils
import os
from pandas.testing import assert_frame_equal


# Test of MOJO convenience methods
def mojo_convenience():
# Train a model
airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
model = H2OGradientBoostingEstimator(ntrees = 1)
model.train(x = ["Origin", "Dest"], y = "IsDepDelayed", training_frame=airlines)

#Save the previously created model into a temporary file
original_model_filename = tempfile.mkdtemp()
original_model_filename = model.save_mojo(original_model_filename)

# Load the model from the temporary file
mojo_model = h2o.import_mojo(original_model_filename)
assert isinstance(mojo_model, H2OGenericEstimator)

# Test scoring is available on the model
predictions = mojo_model.predict(airlines)
assert predictions is not None
assert predictions.nrows == 24421
class TestMojoImport(unittest.TestCase):
    """Exercises MOJO import/upload and POJO re-import of a trained GBM."""

    # Test of MOJO convenience methods
    # NOTE: the method name must start with "test". unittest's TestLoader only
    # collects methods with that prefix, so under the previous name
    # `mojo_convenience` the loaded suite was empty and nothing was executed.
    def test_mojo_convenience(self):
        """Import, upload and POJO-convert a GBM and check its predictions."""
        try:
            h2o.init(strict_version_check=False, jvm_custom_args=["-Dsys.ai.h2o.pojo.import.enabled=true", ])
            # Train a model
            airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
            model = H2OGradientBoostingEstimator(ntrees=1)
            model.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

            # Save the previously created model into a temporary file
            original_model_filename = tempfile.mkdtemp()
            original_model_filename = model.save_mojo(original_model_filename)

            # Load the model from the temporary file
            mojo_model = h2o.import_mojo(original_model_filename)
            assert isinstance(mojo_model, H2OGenericEstimator)

            # Test scoring is available on the model
            predictions = mojo_model.predict(airlines)
            assert predictions is not None
            assert predictions.nrows == 24421

            #####
            # MOJO UPLOAD TEST
            #####

            try:
                pyunit_utils.set_forbidden_paths([original_model_filename])
                # Download the MOJO
                original_model_filename = model.download_mojo(original_model_filename)
                # Load the model from the temporary file
                mojo_model = h2o.upload_mojo(original_model_filename)
                assert isinstance(mojo_model, H2OGenericEstimator)

                # Test scoring is available on the model
                predictions = mojo_model.predict(airlines)
                assert predictions is not None
                assert predictions.nrows == 24421
            finally:
                # Lift the path restriction even when the upload check fails.
                pyunit_utils.clear_forbidden_paths()

            #####
            # MOJO to POJO Conversion test with POJO re-import
            #####

            pojo_directory = os.path.join(pyunit_utils.locate("results"), model.model_id + ".java")
            pojo_path = model.download_pojo(path=pojo_directory)
            mojo2_model = h2o.import_mojo(pojo_path)

            predictions2 = mojo2_model.predict(airlines)
            assert predictions2 is not None
            assert predictions2.nrows == 24421
            assert_frame_equal(predictions.as_data_frame(), predictions2.as_data_frame())
        finally:
            # Always stop the cluster this test connected to.
            h2o.cluster().shutdown()

#####
# MOJO UPLOAD TEST
#####

try:
pyunit_utils.set_forbidden_paths([original_model_filename])
# Download the MOJO
original_model_filename = model.download_mojo(original_model_filename)
# Load the model from the temporary file
mojo_model = h2o.upload_mojo(original_model_filename)
assert isinstance(mojo_model, H2OGenericEstimator)

# Test scoring is available on the model
predictions = mojo_model.predict(airlines)
assert predictions is not None
assert predictions.nrows == 24421
finally:
pyunit_utils.clear_forbidden_paths()

#####
# MOJO to POJO Conversion test with POJO re-import
#####

pojo_directory = os.path.join(pyunit_utils.locate("results"), model.model_id + ".java")
pojo_path = model.download_pojo(path = pojo_directory)
mojo2_model = h2o.import_mojo(pojo_path)

predictions2 = mojo2_model.predict(airlines)
assert predictions2 is not None
assert predictions2.nrows == 24421
assert_frame_equal(predictions.as_data_frame(), predictions2.as_data_frame())


# Entry point: hand the test function to the pyunit harness when the file is
# executed directly, otherwise call it immediately when the module is imported.
if __name__ == "__main__":
    pyunit_utils.standalone_test(mojo_convenience)
else:
    mojo_convenience()
# Run the test case explicitly. TextTestRunner.run() only reports failures and
# returns a TestResult, so the outcome is turned into a non-zero exit status
# here; otherwise a failing test would leave the script looking successful.
suite = unittest.TestLoader().loadTestsFromTestCase(TestMojoImport)
result = unittest.TextTestRunner().run(suite)
if not result.wasSuccessful():
    raise SystemExit(1)
Loading
Loading