From 1565818b35b190ee71381689f03ad37dbaec7d7f Mon Sep 17 00:00:00 2001 From: wendycwong Date: Thu, 11 Jan 2024 13:19:04 -0800 Subject: [PATCH] GH-15994 : check failed dataset conversion to pandas using datatable (#15995) GH-15994: add test to make sure dataset conversion works with customer example. GH-15994: install datatable if it is not available but the python version is correct. GH-15994: Only skip tests if datatable or polars/pyarrow cannot be installed. GH-15994: Changed old tests that use as_data_frame to use single thread operation as before. co-authored with tomasfryda --- h2o-py/h2o/utils/shared_utils.py | 6 ++ h2o-py/tests/pyunit_utils/utilsPY.py | 3 + .../testdir_misc/pyunit_gainslift_bins.py | 18 +++-- ...h_15729_15936_datatable_polars_2_pandas.py | 46 ++++++----- ...9_15936_datatable_polars_2_pandas_large.py | 24 ++++-- .../pyunit_gh_15936_polars2pandas_error.py | 13 +++- .../pyunit_gh_15994_datatable2pandas_error.py | 61 +++++++++++++++ .../testdir_misc/pyunit_model_calibrate.py | 76 +++++++++---------- .../pyunit_stratified_kfold_medium.py | 37 ++++----- 9 files changed, 187 insertions(+), 97 deletions(-) create mode 100644 h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py diff --git a/h2o-py/h2o/utils/shared_utils.py b/h2o-py/h2o/utils/shared_utils.py index 168bbd8e086a..15cd23656609 100644 --- a/h2o-py/h2o/utils/shared_utils.py +++ b/h2o-py/h2o/utils/shared_utils.py @@ -135,6 +135,12 @@ def can_use_pandas(): def can_use_datatable(): return is_module_available('datatable') and sys.version_info.major == 3 and sys.version_info.minor <= 9 +def can_install_datatable(): + return sys.version_info.major == 3 and sys.version_info.minor <= 9 + +def can_install_polars(): + return sys.version_info.major == 3 and sys.version_info.minor > 9 + def can_use_polars(): return is_module_available('polars') and sys.version_info.major == 3 and sys.version_info.minor > 9 diff --git a/h2o-py/tests/pyunit_utils/utilsPY.py b/h2o-py/tests/pyunit_utils/utilsPY.py index 6632b26ab202..335fe8834b3f 100644 --- a/h2o-py/tests/pyunit_utils/utilsPY.py +++ b/h2o-py/tests/pyunit_utils/utilsPY.py @@ -4739,3 +4739,6 @@ def prepare_data(): y = 'Y' return df, x, y + +def install(package): + subprocess.check_call([sys.executable, "-m", "pip", "install", package]) diff --git a/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py b/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py index 02d3a57a0670..ad7167ef9520 100644 --- a/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py +++ b/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py @@ -5,6 +5,7 @@ import pandas as pd from tests import pyunit_utils from h2o.estimators import * +from h2o.utils.threading import local_context eps = 1e-10 @@ -22,18 +23,19 @@ def fast_estimator(estimator, **kwargs): def ks_score(mod, data, y): - from scipy.stats import ks_2samp + with local_context(datatable_disabled=True, polars_disabled=True): # conversion h2o frame to pandas using single thread as before + from scipy.stats import ks_2samp - df = pd.DataFrame() - df["label"] = data[y].as_data_frame().iloc[:, 0] - df["probs"] = mod.predict(data)["p1"].as_data_frame().iloc[:, 0] + df = pd.DataFrame() + df["label"] = data[y].as_data_frame().iloc[:, 0] + df["probs"] = mod.predict(data)["p1"].as_data_frame().iloc[:, 0] - label_0 = df[df["label"] == 0] - label_1 = df[df["label"] == 1] + label_0 = df[df["label"] == 0] + label_1 = df[df["label"] == 1] - ks = ks_2samp(label_0["probs"], label_1["probs"]) + ks = ks_2samp(label_0["probs"], label_1["probs"]) - return ks.statistic + return ks.statistic def get_ks(model, data): diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py index 295540a942c3..c233515bce6a 100644 --- a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py +++ b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py @@ -2,7 +2,8 @@ sys.path.insert(1,"../../") import h2o from tests import pyunit_utils -from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow) +from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow, can_install_datatable, + can_install_polars) import time import pandas as pd from h2o.utils.threading import local_context @@ -47,32 +48,41 @@ def singl_thread_pandas_conversion(dataset): print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(newTime, dataset)) return h2oframe_panda -def test_polars_datatable(): +def test_polars_datatable(): file1 = "smalldata/titanic/titanic_expanded.csv" file2 = "smalldata/glm_test/multinomial_3Class_10KRow.csv" file3 = "smalldata/timeSeries/CreditCard-ts_train.csv" - + original_converted_frame1 = singl_thread_pandas_conversion(file1) original_converted_frame2 = singl_thread_pandas_conversion(file2) original_converted_frame3 = singl_thread_pandas_conversion(file3) - - with local_context(polars_disabled=True): # run with datatable - if can_use_datatable(): + + if not(can_install_datatable()): + print("datatable is not available. Skipping tests using datatable.") + else: + if not(can_use_datatable()): + pyunit_utils.install("datatable") + + with local_context(polars_disabled=True): # run with datatable print("test data frame conversion using datatable.") test_frame_conversion(file1, original_converted_frame1, "datatable") test_frame_conversion(file2, original_converted_frame2, "datatable") - test_frame_conversion(file3, original_converted_frame3, "datatable") - else: - print("datatable is not available. Skipping tests using datatable.") - - with local_context(datatable_disabled=True): - if can_use_polars() and can_use_pyarrow(): - print("test data frame conversion using polars and pyarrow.") - test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow") - test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow") - test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow") - else: - print("polars, pyarrow are not available. Skipping tests using polars and pyarrow") + test_frame_conversion(file3, original_converted_frame3, "datatable") + + if not(can_install_polars()): + print("polars, pyarrow are not available. Skipping tests using polars and pyarrow") + else: + if not(can_use_polars()): + pyunit_utils.install("polars") + if not(can_use_pyarrow()): + pyunit_utils.install("pyarrow") + + with local_context(datatable_disabled=True): + if can_use_polars() and can_use_pyarrow(): + print("test data frame conversion using polars and pyarrow.") + test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow") + test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow") + test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow") if __name__ == "__main__": pyunit_utils.standalone_test(test_polars_datatable) diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py index 45e6b65c5aed..f0acf2df1b7e 100644 --- a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py +++ b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py @@ -2,7 +2,8 @@ sys.path.insert(1,"../../") import h2o from tests import pyunit_utils -from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow) +from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow, can_install_datatable, + can_install_polars) import time from h2o.utils.threading import local_context @@ -50,18 +51,25 @@ def test_polars_datatable_2_pandas(): file1 = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv" original_converted_frame1 = single_thread_pandas_conversion(file1) # need to run conversion in single thread - with local_context(polars_disabled=True): # run with datatable - if can_use_datatable(): + if can_install_datatable(): + if not(can_use_datatable()): + pyunit_utils.install("datatable") + with local_context(polars_disabled=True): # run with datatable print("test data frame conversion using datatable.") test_frame_conversion(file1, original_converted_frame1, "datatable") - else: - print("datatable is not available. Skipping tests using datatable.") - with local_context(datatable_disabled=True): - if can_use_polars() and can_use_pyarrow(): + else: + print("datatable is not available. Skipping tests using datatable.") + + if can_install_polars(): + if not(can_use_polars()): + pyunit_utils.install("polars") + if not(can_use_pyarrow()): + pyunit_utils.install("pyarrow") + with local_context(datatable_disabled=True): print("test data frame conversion using polars and pyarrow.") test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow") - else: + else: print("polars, pyarrow are not available. Skipping tests using polars and pyarrow") if __name__ == "__main__": diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py b/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py index de19ed3cd879..455369c28d98 100644 --- a/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py +++ b/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py @@ -2,7 +2,7 @@ sys.path.insert(1,"../../") import h2o from tests import pyunit_utils -from h2o.utils.shared_utils import (can_use_polars, can_use_pyarrow) +from h2o.utils.shared_utils import (can_use_polars, can_use_pyarrow, can_install_polars) import pandas as pd from h2o.utils.threading import local_context @@ -32,15 +32,20 @@ def test_frame_conversion(h2oFrame, original_pandas_frame): assert diff.max() < 1e-10 def test_polars_pyarrow(): - if can_use_polars() and can_use_pyarrow(): + if not(can_install_polars()): + print("polars and pyarrow are not available to test. Skipping tests using polars and pyarrow.") + else: + if not(can_use_polars()): + pyunit_utils.install("polars") + if not(can_use_pyarrow()): + pyunit_utils.install("pyarrow") + with local_context(datatable_disabled=True, polars_disabled=True): h2oframe = genFrame() print("converting h2o frame to pandas frame using single thread:") original_pandas = h2oframe.as_data_frame() with local_context(datatable_disabled=True): test_frame_conversion(h2oframe, original_pandas) - else: - print("polars and pyarrow are not available to test. Skipping tests using polars and pyarrow.") def genFrame(): python_lists = [["ls 1029551"], ["no 983196"], ["true 689851"], ["437594"], ["no,ls 113569"], ["no,true 70607"]] diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py b/h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py new file mode 100644 index 000000000000..6fe48b0b4aad --- /dev/null +++ b/h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py @@ -0,0 +1,61 @@ +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils +from h2o.utils.shared_utils import (can_use_datatable, can_install_datatable) +import pandas as pd +from h2o.utils.threading import local_context + + +# datatable have problems before with this dataset. Checking here to make sure it works. +def test_frame_conversion(h2oFrame, original_pandas_frame): + print("h2o frame to pandas frame conversion using datatable") + new_pandas_frame = h2oFrame.as_data_frame() + # compare two frames column types + new_types = new_pandas_frame.dtypes + old_types = original_pandas_frame.dtypes + ncol = h2oFrame.ncol + colNames = new_pandas_frame.columns + + for ind in list(range(ncol)): + assert new_types[colNames[ind]] == old_types[colNames[ind]], "Expected column types: {0}, actual column types: " \ + "{1}".format(old_types[colNames[ind]], new_types[colNames[ind]]) + if new_types[colNames[ind]] == "object": + diff = new_pandas_frame[colNames[ind]] == original_pandas_frame[colNames[ind]] + if not diff.all(): # difference caused by the presence of NAs + newSeries = pd.Series(new_pandas_frame[colNames[ind]]) + newNA = newSeries.isna() + oldSeries = pd.Series(original_pandas_frame[colNames[ind]]) + oldNA = oldSeries.isna() + assert (newNA==oldNA).all() + else: + diff = (new_pandas_frame[colNames[ind]] - original_pandas_frame[colNames[ind]]).abs() + assert diff.max() < 1e-10 + + +def test_datatable(): + if can_install_datatable(): + if not(can_use_datatable()): + pyunit_utils.install("datatable") + + with local_context(datatable_disabled=True, polars_disabled=True): + h2oframe = genFrame() + print("converting h2o frame to pandas frame using single thread:") + original_pandas = h2oframe.as_data_frame() + with local_context(polars_disabled=True): + test_frame_conversion(h2oframe, original_pandas) + else: + print("datatable are not available to test. Skipping tests using datatable.") + + +def genFrame(): + python_lists = [["ls 1029551"], ["no 983196"], ["true 689851"], ["437594"], ["no,ls 113569"], ["no,true 70607"]] + col_names=["X"] + col_types=['enum'] + return h2o.H2OFrame(python_obj=python_lists, column_names=col_names, column_types=col_types) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_datatable) +else: + test_datatable() diff --git a/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py b/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py index 0602ab2a591a..eeeba46bcec2 100644 --- a/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py +++ b/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py @@ -5,50 +5,44 @@ from h2o.estimators.gbm import H2OGradientBoostingEstimator from h2o.estimators.isotonicregression import H2OIsotonicRegressionEstimator from pandas.testing import assert_frame_equal +from h2o.utils.threading import local_context def test_calibrate_existing_model(): - df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv")) - df["Angaus"] = df["Angaus"].asfactor() - - train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42) - - model_int_calib = H2OGradientBoostingEstimator( - ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, seed=42, - calibrate_model=True, calibration_frame=calib, calibration_method="IsotonicRegression" - ) - model_int_calib.train( - x=list(range(2, train.ncol)), - y="Angaus", training_frame=train - ) - preds_int_calib = model_int_calib.predict(train) - - isotonic_train = calib[["Angaus"]] - isotonic_train = isotonic_train.cbind(model_int_calib.predict(calib)["p1"]) - h2o_iso_reg = H2OIsotonicRegressionEstimator(out_of_bounds="clip") - h2o_iso_reg.train(training_frame=isotonic_train, x="p1", y="Angaus") - - model_man_calib = H2OGradientBoostingEstimator( - ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, seed=42 - ) - model_man_calib.train( - x=list(range(2, train.ncol)), - y="Angaus", training_frame=train - ) - preds_no_calib = model_man_calib.predict(train) - assert preds_no_calib.col_names == ["predict", "p0", "p1"] - - model_man_calib.calibrate(h2o_iso_reg) - - preds_man_calib = model_man_calib.predict(train) - assert preds_man_calib.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"] - - assert_frame_equal(preds_int_calib.as_data_frame(), preds_man_calib.as_data_frame()) - - # test MOJO - mojo = pyunit_utils.download_mojo(model_man_calib) - mojo_prediction = h2o.mojo_predict_pandas(dataframe=train.as_data_frame(), predict_calibrated=True, **mojo) - assert_frame_equal(preds_int_calib.as_data_frame(), mojo_prediction) + with local_context(datatable_disabled=True, polars_disabled=True): # conversion h2o frame to pandas using single thread as before + df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv")) + df["Angaus"] = df["Angaus"].asfactor() + + train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42) + + model_int_calib = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, + seed=42, calibrate_model=True, calibration_frame=calib, + calibration_method="IsotonicRegression") + model_int_calib.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train) + preds_int_calib = model_int_calib.predict(train) + + isotonic_train = calib[["Angaus"]] + isotonic_train = isotonic_train.cbind(model_int_calib.predict(calib)["p1"]) + h2o_iso_reg = H2OIsotonicRegressionEstimator(out_of_bounds="clip") + h2o_iso_reg.train(training_frame=isotonic_train, x="p1", y="Angaus") + + model_man_calib = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, + seed=42) + model_man_calib.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train) + preds_no_calib = model_man_calib.predict(train) + assert preds_no_calib.col_names == ["predict", "p0", "p1"] + + model_man_calib.calibrate(h2o_iso_reg) + + preds_man_calib = model_man_calib.predict(train) + assert preds_man_calib.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"] + + assert_frame_equal(preds_int_calib.as_data_frame(), preds_man_calib.as_data_frame()) + + # test MOJO + mojo = pyunit_utils.download_mojo(model_man_calib) + mojo_prediction = h2o.mojo_predict_pandas(dataframe=train.as_data_frame(), predict_calibrated=True, **mojo) + assert_frame_equal(preds_int_calib.as_data_frame(), mojo_prediction) if __name__ == "__main__": diff --git a/h2o-py/tests/testdir_misc/pyunit_stratified_kfold_medium.py b/h2o-py/tests/testdir_misc/pyunit_stratified_kfold_medium.py index 84d9de8b0761..a31b6e2a2d99 100644 --- a/h2o-py/tests/testdir_misc/pyunit_stratified_kfold_medium.py +++ b/h2o-py/tests/testdir_misc/pyunit_stratified_kfold_medium.py @@ -5,36 +5,37 @@ sys.path.insert(1,"../../") import h2o from tests import pyunit_utils +from h2o.utils.threading import local_context def stratified_kfold(): + with local_context(datatable_disabled=True, polars_disabled=True): # convert h2o frame to pandas frame in single-thread as before + NFOLDS=5 - NFOLDS=5 + fr = h2o.import_file(pyunit_utils.locate("bigdata/laptop/covtype/covtype.data")) - fr = h2o.import_file(pyunit_utils.locate("bigdata/laptop/covtype/covtype.data")) + stratified = fr[54].stratified_kfold_column(n_folds=NFOLDS) + stratified.show() - stratified = fr[54].stratified_kfold_column(n_folds=NFOLDS) - stratified.show() + dist = (old_div(fr[54].table()["Count"], fr[54].table()["Count"].sum())).as_data_frame(True).to_dict("list")["Count"] # get a raw list of means - dist = (old_div(fr[54].table()["Count"], fr[54].table()["Count"].sum())).as_data_frame(True).to_dict("list")["Count"] # get a raw list of means + overall_result = reduce(lambda x,y: x.cbind(y), [old_div(fr[stratified==i,54].table()["Count"],fr[stratified==i,54].table()["Count"].sum()) for i in range(NFOLDS)]) + overall_result.show() + df = overall_result.as_data_frame(True) # get the overall result here - overall_result = reduce(lambda x,y: x.cbind(y), [old_div(fr[stratified==i,54].table()["Count"],fr[stratified==i,54].table()["Count"].sum()) for i in range(NFOLDS)]) - overall_result.show() - df = overall_result.as_data_frame(True) # get the overall result here + # show that folds are consistent + print() + print("Show that all folds are consistent with one another: ") + print(df.mean(axis=1)) # print the average + print(df.var(axis=1)) # print the standard deviation + print() - # show that folds are consistent - print() - print("Show that all folds are consistent with one another: ") - print(df.mean(axis=1)) # print the average - print(df.var(axis=1)) # print the standard deviation - print() - - # now show that folds are consistent with the original distribution of classes - for i in range(len(dist)): - print("Stratification variance for class #%s: %s" %(i, old_div((df.loc[i].sub(dist[i]).pow(2).sum()), (df.shape[0] - 1)))) + # now show that folds are consistent with the original distribution of classes + for i in range(len(dist)): + print("Stratification variance for class #%s: %s" %(i, old_div((df.loc[i].sub(dist[i]).pow(2).sum()), (df.shape[0] - 1))))