Skip to content

Commit

Permalink
GH-15994 : check failed dataset conversion to pandas using datatable (#…
Browse files Browse the repository at this point in the history
…15995)

GH-15994: add test to make sure dataset conversion works with customer example.
GH-15994: install datatable if it is not available but the python version is correct.
GH-15994: Only skip tests if datatable or polars/pyarrow cannot be installed.
GH-15994:  Changed old tests that use as_data_frame to use single thread operation as before.
co-authored with tomasfryda
  • Loading branch information
wendycwong authored Jan 11, 2024
1 parent 92aff12 commit 1565818
Show file tree
Hide file tree
Showing 9 changed files with 187 additions and 97 deletions.
6 changes: 6 additions & 0 deletions h2o-py/h2o/utils/shared_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ def can_use_pandas():
def can_use_datatable():
    """
    Tell whether the ``datatable`` module can be used by this interpreter.

    :returns: truthy only when ``is_module_available('datatable')`` holds and the
        interpreter is Python 3 with a minor version of at most 9.
    """
    # The module check stays first so it short-circuits exactly as before.
    return is_module_available('datatable') and (3, 0) <= sys.version_info[:2] <= (3, 9)

def can_install_datatable():
    """
    Tell whether the running interpreter is in the version range accepted for ``datatable``.

    :returns: True for Python 3 with a minor version of at most 9, False otherwise.
    """
    major, minor = sys.version_info[:2]
    return major == 3 and minor <= 9

def can_install_polars():
    """
    Tell whether the running interpreter is in the version range accepted for ``polars``.

    :returns: True for Python 3 with a minor version above 9, False otherwise.
    """
    major, minor = sys.version_info[:2]
    return major == 3 and minor > 9

def can_use_polars():
    """
    Tell whether the ``polars`` module can be used by this interpreter.

    :returns: truthy only when ``is_module_available('polars')`` holds and the
        interpreter is Python 3 with a minor version above 9.
    """
    # The module check stays first so it short-circuits exactly as before.
    return is_module_available('polars') and (3, 9) < sys.version_info[:2] < (4, 0)

Expand Down
3 changes: 3 additions & 0 deletions h2o-py/tests/pyunit_utils/utilsPY.py
Original file line number Diff line number Diff line change
Expand Up @@ -4739,3 +4739,6 @@ def prepare_data():
y = 'Y'

return df, x, y

def install(package):
    """
    Install ``package`` with pip, using the interpreter that is running the tests.

    Runs ``<sys.executable> -m pip install <package>``.

    :param package: name of the package, passed to pip unchanged.
    :raises subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)
18 changes: 10 additions & 8 deletions h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd
from tests import pyunit_utils
from h2o.estimators import *
from h2o.utils.threading import local_context

eps = 1e-10

Expand All @@ -22,18 +23,19 @@ def fast_estimator(estimator, **kwargs):


def ks_score(mod, data, y):
from scipy.stats import ks_2samp
with local_context(datatable_disabled=True, polars_disabled=True): # conversion h2o frame to pandas using single thread as before
from scipy.stats import ks_2samp

df = pd.DataFrame()
df["label"] = data[y].as_data_frame().iloc[:, 0]
df["probs"] = mod.predict(data)["p1"].as_data_frame().iloc[:, 0]
df = pd.DataFrame()
df["label"] = data[y].as_data_frame().iloc[:, 0]
df["probs"] = mod.predict(data)["p1"].as_data_frame().iloc[:, 0]

label_0 = df[df["label"] == 0]
label_1 = df[df["label"] == 1]
label_0 = df[df["label"] == 0]
label_1 = df[df["label"] == 1]

ks = ks_2samp(label_0["probs"], label_1["probs"])
ks = ks_2samp(label_0["probs"], label_1["probs"])

return ks.statistic
return ks.statistic


def get_ks(model, data):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow)
from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow, can_install_datatable,
can_install_polars)
import time
import pandas as pd
from h2o.utils.threading import local_context
Expand Down Expand Up @@ -47,32 +48,41 @@ def singl_thread_pandas_conversion(dataset):
print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(newTime, dataset))
return h2oframe_panda

def test_polars_datatable():
def test_polars_datatable():
file1 = "smalldata/titanic/titanic_expanded.csv"
file2 = "smalldata/glm_test/multinomial_3Class_10KRow.csv"
file3 = "smalldata/timeSeries/CreditCard-ts_train.csv"

original_converted_frame1 = singl_thread_pandas_conversion(file1)
original_converted_frame2 = singl_thread_pandas_conversion(file2)
original_converted_frame3 = singl_thread_pandas_conversion(file3)

with local_context(polars_disabled=True): # run with datatable
if can_use_datatable():

if not(can_install_datatable()):
print("datatable is not available. Skipping tests using datatable.")
else:
if not(can_use_datatable()):
pyunit_utils.install("datatable")

with local_context(polars_disabled=True): # run with datatable
print("test data frame conversion using datatable.")
test_frame_conversion(file1, original_converted_frame1, "datatable")
test_frame_conversion(file2, original_converted_frame2, "datatable")
test_frame_conversion(file3, original_converted_frame3, "datatable")
else:
print("datatable is not available. Skipping tests using datatable.")

with local_context(datatable_disabled=True):
if can_use_polars() and can_use_pyarrow():
print("test data frame conversion using polars and pyarrow.")
test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow")
test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow")
else:
print("polars, pyarrow are not available. Skipping tests using polars and pyarrow")
test_frame_conversion(file3, original_converted_frame3, "datatable")

if not(can_install_polars()):
print("polars, pyarrow are not available. Skipping tests using polars and pyarrow")
else:
if not(can_use_polars()):
pyunit_utils.install("polars")
if not(can_use_pyarrow()):
pyunit_utils.install("pyarrow")

with local_context(datatable_disabled=True):
if can_use_polars() and can_use_pyarrow():
print("test data frame conversion using polars and pyarrow.")
test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow")
test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow")

if __name__ == "__main__":
pyunit_utils.standalone_test(test_polars_datatable)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow)
from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow, can_install_datatable,
can_install_polars)
import time
from h2o.utils.threading import local_context

Expand Down Expand Up @@ -50,18 +51,25 @@ def test_polars_datatable_2_pandas():
file1 = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv"
original_converted_frame1 = single_thread_pandas_conversion(file1) # need to run conversion in single thread

with local_context(polars_disabled=True): # run with datatable
if can_use_datatable():
if can_install_datatable():
if not(can_use_datatable()):
pyunit_utils.install("datatable")
with local_context(polars_disabled=True): # run with datatable
print("test data frame conversion using datatable.")
test_frame_conversion(file1, original_converted_frame1, "datatable")
else:
print("datatable is not available. Skipping tests using datatable.")

with local_context(datatable_disabled=True):
if can_use_polars() and can_use_pyarrow():
else:
print("datatable is not available. Skipping tests using datatable.")

if can_install_polars():
if not(can_use_polars()):
pyunit_utils.install("polars")
if not(can_use_pyarrow()):
pyunit_utils.install("pyarrow")
with local_context(datatable_disabled=True):
print("test data frame conversion using polars and pyarrow.")
test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
else:
else:
print("polars, pyarrow are not available. Skipping tests using polars and pyarrow")

if __name__ == "__main__":
Expand Down
13 changes: 9 additions & 4 deletions h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
from h2o.utils.shared_utils import (can_use_polars, can_use_pyarrow)
from h2o.utils.shared_utils import (can_use_polars, can_use_pyarrow, can_install_polars)
import pandas as pd
from h2o.utils.threading import local_context

Expand Down Expand Up @@ -32,15 +32,20 @@ def test_frame_conversion(h2oFrame, original_pandas_frame):
assert diff.max() < 1e-10

def test_polars_pyarrow():
if can_use_polars() and can_use_pyarrow():
if not(can_install_polars()):
print("polars and pyarrow are not available to test. Skipping tests using polars and pyarrow.")
else:
if not(can_use_polars()):
pyunit_utils.install("polars")
if not(can_use_pyarrow()):
pyunit_utils.install("pyarrow")

with local_context(datatable_disabled=True, polars_disabled=True):
h2oframe = genFrame()
print("converting h2o frame to pandas frame using single thread:")
original_pandas = h2oframe.as_data_frame()
with local_context(datatable_disabled=True):
test_frame_conversion(h2oframe, original_pandas)
else:
print("polars and pyarrow are not available to test. Skipping tests using polars and pyarrow.")

def genFrame():
python_lists = [["ls 1029551"], ["no 983196"], ["true 689851"], ["437594"], ["no,ls 113569"], ["no,true 70607"]]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import sys
sys.path.insert(1,"../../")
import h2o
from tests import pyunit_utils
from h2o.utils.shared_utils import (can_use_datatable, can_install_datatable)
import pandas as pd
from h2o.utils.threading import local_context


# datatable previously had problems converting this dataset. Check here to make sure the conversion works.
def test_frame_conversion(h2oFrame, original_pandas_frame):
    """
    Check that converting ``h2oFrame`` to pandas reproduces ``original_pandas_frame``.

    The frame is converted with ``h2oFrame.as_data_frame()`` and its first
    ``h2oFrame.ncol`` columns are compared with the reference frame:

    * the column dtypes must be equal;
    * missing values must sit at the same positions;
    * ``object`` columns must hold equal values wherever a value is present;
    * every other column must agree to within 1e-10.

    :param h2oFrame: object exposing ``as_data_frame()`` and ``ncol``.
    :param original_pandas_frame: pandas DataFrame holding the expected content.
    :raises AssertionError: on the first column that differs.
    """
    print("h2o frame to pandas frame conversion using datatable")
    new_pandas_frame = h2oFrame.as_data_frame()
    new_types = new_pandas_frame.dtypes
    old_types = original_pandas_frame.dtypes
    col_names = new_pandas_frame.columns

    for ind in range(h2oFrame.ncol):
        col_name = col_names[ind]
        new_type = new_types[col_name]
        old_type = old_types[col_name]
        assert new_type == old_type, "Expected column types: {0}, actual column types: " \
                                     "{1}".format(old_type, new_type)

        new_col = new_pandas_frame[col_name]
        old_col = original_pandas_frame[col_name]

        # NaN never compares equal to NaN and is skipped by max(), so missing
        # values are checked on their own, by position, for every column type.
        new_na = new_col.isna()
        old_na = old_col.isna()
        assert (new_na == old_na).all(), \
            "Column {0}: missing values are at different positions.".format(col_name)

        if new_type == "object":
            # Matching NA positions are not enough: the values that are present
            # have to be equal as well.
            present = ~new_na
            assert (new_col[present] == old_col[present]).all(), \
                "Column {0}: values differ after conversion.".format(col_name)
        else:
            # Drop the (already verified) missing positions so that an all-NA
            # column does not fail on a NaN maximum.
            diff = (new_col - old_col).abs().dropna()
            assert diff.empty or diff.max() < 1e-10, \
                "Column {0}: values differ by more than 1e-10.".format(col_name)


# This is a helper that needs arguments, not a test of its own: keep pytest-style
# collectors from picking it up because of its test_ prefix.
test_frame_conversion.__test__ = False


def test_datatable():
    """
    Convert a small enum frame to pandas with datatable and compare it with the
    single-thread conversion.

    When ``can_install_datatable()`` is false the test only prints a skip
    message. Otherwise datatable is installed if ``can_use_datatable()`` reports
    it as unusable, the reference pandas frame is produced with both datatable
    and polars disabled, and the comparison runs with only polars disabled.
    """
    if not can_install_datatable():
        print("datatable are not available to test. Skipping tests using datatable.")
        return

    if not can_use_datatable():
        pyunit_utils.install("datatable")

    # Reference conversion: both accelerated paths are switched off.
    with local_context(datatable_disabled=True, polars_disabled=True):
        h2oframe = genFrame()
        print("converting h2o frame to pandas frame using single thread:")
        original_pandas = h2oframe.as_data_frame()

    # Conversion under test: only polars is switched off.
    with local_context(polars_disabled=True):
        test_frame_conversion(h2oframe, original_pandas)


def genFrame():
    """
    Build the H2OFrame used by this test.

    The frame is created from six single-element lists of strings, with one
    column named ``X`` that is requested as type ``enum``.
    """
    values = ["ls 1029551", "no 983196", "true 689851", "437594", "no,ls 113569", "no,true 70607"]
    rows = [[value] for value in values]
    return h2o.H2OFrame(python_obj=rows, column_names=["X"], column_types=['enum'])


# When the module is loaded by something else, run the test right away;
# when it is executed as a script, hand it to pyunit_utils.standalone_test.
if __name__ != "__main__":
    test_datatable()
else:
    pyunit_utils.standalone_test(test_datatable)
76 changes: 35 additions & 41 deletions h2o-py/tests/testdir_misc/pyunit_model_calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,50 +5,44 @@
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.isotonicregression import H2OIsotonicRegressionEstimator
from pandas.testing import assert_frame_equal
from h2o.utils.threading import local_context


def test_calibrate_existing_model():
df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
df["Angaus"] = df["Angaus"].asfactor()

train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)

model_int_calib = H2OGradientBoostingEstimator(
ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, seed=42,
calibrate_model=True, calibration_frame=calib, calibration_method="IsotonicRegression"
)
model_int_calib.train(
x=list(range(2, train.ncol)),
y="Angaus", training_frame=train
)
preds_int_calib = model_int_calib.predict(train)

isotonic_train = calib[["Angaus"]]
isotonic_train = isotonic_train.cbind(model_int_calib.predict(calib)["p1"])
h2o_iso_reg = H2OIsotonicRegressionEstimator(out_of_bounds="clip")
h2o_iso_reg.train(training_frame=isotonic_train, x="p1", y="Angaus")

model_man_calib = H2OGradientBoostingEstimator(
ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, seed=42
)
model_man_calib.train(
x=list(range(2, train.ncol)),
y="Angaus", training_frame=train
)
preds_no_calib = model_man_calib.predict(train)
assert preds_no_calib.col_names == ["predict", "p0", "p1"]

model_man_calib.calibrate(h2o_iso_reg)

preds_man_calib = model_man_calib.predict(train)
assert preds_man_calib.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"]

assert_frame_equal(preds_int_calib.as_data_frame(), preds_man_calib.as_data_frame())

# test MOJO
mojo = pyunit_utils.download_mojo(model_man_calib)
mojo_prediction = h2o.mojo_predict_pandas(dataframe=train.as_data_frame(), predict_calibrated=True, **mojo)
assert_frame_equal(preds_int_calib.as_data_frame(), mojo_prediction)
with local_context(datatable_disabled=True, polars_disabled=True): # conversion h2o frame to pandas using single thread as before
df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
df["Angaus"] = df["Angaus"].asfactor()

train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)

model_int_calib = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5,
seed=42, calibrate_model=True, calibration_frame=calib,
calibration_method="IsotonicRegression")
model_int_calib.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)
preds_int_calib = model_int_calib.predict(train)

isotonic_train = calib[["Angaus"]]
isotonic_train = isotonic_train.cbind(model_int_calib.predict(calib)["p1"])
h2o_iso_reg = H2OIsotonicRegressionEstimator(out_of_bounds="clip")
h2o_iso_reg.train(training_frame=isotonic_train, x="p1", y="Angaus")

model_man_calib = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5,
seed=42)
model_man_calib.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)
preds_no_calib = model_man_calib.predict(train)
assert preds_no_calib.col_names == ["predict", "p0", "p1"]

model_man_calib.calibrate(h2o_iso_reg)

preds_man_calib = model_man_calib.predict(train)
assert preds_man_calib.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"]

assert_frame_equal(preds_int_calib.as_data_frame(), preds_man_calib.as_data_frame())

# test MOJO
mojo = pyunit_utils.download_mojo(model_man_calib)
mojo_prediction = h2o.mojo_predict_pandas(dataframe=train.as_data_frame(), predict_calibrated=True, **mojo)
assert_frame_equal(preds_int_calib.as_data_frame(), mojo_prediction)


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 1565818

Please sign in to comment.