GH-15994: check failed dataset conversion to pandas using datatable (#15995)

GH-15994: Add a test to make sure dataset conversion works with the customer example. GH-15994: Install datatable if it is not available but the Python version is correct. GH-15994: Only skip tests if datatable or polars/pyarrow cannot be installed. GH-15994: Changed old tests that use as_data_frame to use the single-thread operation as before. Co-authored with tomasfryda.
h2oai · Jan 11, 2024 · 1565818 · 1565818
1 parent 92aff12
commit 1565818
Show file tree

Hide file tree

Showing 9 changed files with 187 additions and 97 deletions.
diff --git a/h2o-py/h2o/utils/shared_utils.py b/h2o-py/h2o/utils/shared_utils.py
@@ -135,6 +135,22 @@ def can_use_pandas():
 def can_use_datatable():
     return is_module_available('datatable') and sys.version_info.major == 3 and sys.version_info.minor <= 9
 
+def can_install_datatable():
+    """Return True when the running interpreter is Python 3.0 through 3.9.
+
+    This is the version test ``can_use_datatable`` applies, without its
+    ``is_module_available('datatable')`` check.
+    """
+    return (3, 0) <= sys.version_info[:2] <= (3, 9)
+
+def can_install_polars():
+    """Return True when the running interpreter is Python 3.10 or a later 3.x.
+
+    This is the version test ``can_use_polars`` applies, without its
+    ``is_module_available('polars')`` check.
+    """
+    return (3, 10) <= sys.version_info[:2] < (4, 0)
+
 def can_use_polars():
     return is_module_available('polars') and sys.version_info.major == 3 and sys.version_info.minor > 9
 

diff --git a/h2o-py/tests/pyunit_utils/utilsPY.py b/h2o-py/tests/pyunit_utils/utilsPY.py
@@ -4739,3 +4739,12 @@ def prepare_data():
     y = 'Y'
 
     return df, x, y
+
+def install(package):
+    """Install ``package`` by running ``pip install`` under the current interpreter.
+
+    :param package: passed to ``pip install`` as a single argument.
+    :raises subprocess.CalledProcessError: if pip exits with a non-zero status.
+    """
+    pip_install = [sys.executable, "-m", "pip", "install", package]
+    subprocess.check_call(pip_install)
diff --git a/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py b/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from tests import pyunit_utils
 from h2o.estimators import *
+from h2o.utils.threading import local_context
 
 eps = 1e-10
 
@@ -22,18 +23,19 @@ def fast_estimator(estimator, **kwargs):
 
 
 def ks_score(mod, data, y):
-    from scipy.stats import ks_2samp
+    with local_context(datatable_disabled=True, polars_disabled=True): # conversion h2o frame to pandas using single thread as before
+      from scipy.stats import ks_2samp
 
-    df = pd.DataFrame()
-    df["label"] = data[y].as_data_frame().iloc[:, 0]
-    df["probs"] = mod.predict(data)["p1"].as_data_frame().iloc[:, 0]
+      df = pd.DataFrame()
+      df["label"] = data[y].as_data_frame().iloc[:, 0]
+      df["probs"] = mod.predict(data)["p1"].as_data_frame().iloc[:, 0]
 
-    label_0 = df[df["label"] == 0]
-    label_1 = df[df["label"] == 1]
+      label_0 = df[df["label"] == 0]
+      label_1 = df[df["label"] == 1]
 
-    ks = ks_2samp(label_0["probs"], label_1["probs"])
+      ks = ks_2samp(label_0["probs"], label_1["probs"])
 
-    return ks.statistic
+      return ks.statistic
 
 
 def get_ks(model, data):

diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py
@@ -2,7 +2,8 @@
 sys.path.insert(1,"../../")
 import h2o
 from tests import pyunit_utils
-from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow)
+from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow, can_install_datatable, 
+                                    can_install_polars)
 import time
 import pandas as pd
 from h2o.utils.threading import local_context
@@ -47,32 +48,41 @@ def singl_thread_pandas_conversion(dataset):
         print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(newTime, dataset))
         return h2oframe_panda
 
-def test_polars_datatable(): 
+def test_polars_datatable():
     file1 = "smalldata/titanic/titanic_expanded.csv"
     file2 = "smalldata/glm_test/multinomial_3Class_10KRow.csv"
     file3 = "smalldata/timeSeries/CreditCard-ts_train.csv"
-    
+
     original_converted_frame1 = singl_thread_pandas_conversion(file1)
     original_converted_frame2 = singl_thread_pandas_conversion(file2)
     original_converted_frame3 = singl_thread_pandas_conversion(file3)
-
-    with local_context(polars_disabled=True):   # run with datatable
-        if can_use_datatable():
+
+    if not(can_install_datatable()):
+        print("datatable is not available.  Skipping tests using datatable.")
+    else:
+        if not(can_use_datatable()):
+            pyunit_utils.install("datatable")
+
+        with local_context(polars_disabled=True):   # run with datatable
             print("test data frame conversion using datatable.")
             test_frame_conversion(file1, original_converted_frame1, "datatable")
             test_frame_conversion(file2, original_converted_frame2, "datatable")
-            test_frame_conversion(file3, original_converted_frame3, "datatable")
-        else:
-            print("datatable is not available.  Skipping tests using datatable.")
-
-    with local_context(datatable_disabled=True):
-        if can_use_polars() and can_use_pyarrow():
-            print("test data frame conversion using polars and pyarrow.")
-            test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
-            test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow")
-            test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow")    
-        else:
-            print("polars, pyarrow are not available.  Skipping tests using polars and pyarrow")             
+            test_frame_conversion(file3, original_converted_frame3, "datatable")    
+
+    if not(can_install_polars()):
+        print("polars, pyarrow are not available.  Skipping tests using polars and pyarrow")
+    else:
+        if not(can_use_polars()):
+            pyunit_utils.install("polars")
+        if not(can_use_pyarrow()):
+            pyunit_utils.install("pyarrow")
+
+        with local_context(datatable_disabled=True):
+            if can_use_polars() and can_use_pyarrow():
+                print("test data frame conversion using polars and pyarrow.")
+                test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
+                test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow")
+                test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow")    
 
 if __name__ == "__main__":
     pyunit_utils.standalone_test(test_polars_datatable)

diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py
@@ -2,7 +2,8 @@
 sys.path.insert(1,"../../")
 import h2o
 from tests import pyunit_utils
-from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow)
+from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow, can_install_datatable, 
+                                    can_install_polars)
 import time
 from h2o.utils.threading import local_context
 
@@ -50,18 +51,25 @@ def test_polars_datatable_2_pandas():
     file1 = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv"
     original_converted_frame1 = single_thread_pandas_conversion(file1)  # need to run conversion in single thread
 
-    with local_context(polars_disabled=True):   # run with datatable
-        if can_use_datatable():
+    if can_install_datatable():
+        if not(can_use_datatable()):
+            pyunit_utils.install("datatable")
+        with local_context(polars_disabled=True):   # run with datatable
             print("test data frame conversion using datatable.")
             test_frame_conversion(file1, original_converted_frame1, "datatable")
-        else:
-            print("datatable is not available.  Skipping tests using datatable.")
 
-    with local_context(datatable_disabled=True):
-        if can_use_polars() and can_use_pyarrow():
+    else:
+        print("datatable is not available.  Skipping tests using datatable.")
+
+    if can_install_polars():
+        if not(can_use_polars()):
+            pyunit_utils.install("polars")
+        if not(can_use_pyarrow()):
+            pyunit_utils.install("pyarrow")
+        with local_context(datatable_disabled=True):
             print("test data frame conversion using polars and pyarrow.")
             test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
-        else:
+    else:
             print("polars, pyarrow are not available.  Skipping tests using polars and pyarrow")
 
 if __name__ == "__main__":

diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py b/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py
@@ -2,7 +2,7 @@
 sys.path.insert(1,"../../")
 import h2o
 from tests import pyunit_utils
-from h2o.utils.shared_utils import (can_use_polars, can_use_pyarrow)
+from h2o.utils.shared_utils import (can_use_polars, can_use_pyarrow, can_install_polars)
 import pandas as pd
 from h2o.utils.threading import local_context
 
@@ -32,15 +32,20 @@ def test_frame_conversion(h2oFrame, original_pandas_frame):
             assert diff.max() < 1e-10
 
 def test_polars_pyarrow():
-    if can_use_polars() and can_use_pyarrow():
+    if not(can_install_polars()):
+        print("polars and pyarrow are not available to test.  Skipping tests using polars and pyarrow.")
+    else:
+        if not(can_use_polars()):
+            pyunit_utils.install("polars")
+        if not(can_use_pyarrow()):
+            pyunit_utils.install("pyarrow")
+
         with local_context(datatable_disabled=True, polars_disabled=True):
             h2oframe = genFrame()
             print("converting h2o frame to pandas frame using single thread:")
             original_pandas = h2oframe.as_data_frame()
         with local_context(datatable_disabled=True):
             test_frame_conversion(h2oframe, original_pandas)  
-    else:
-        print("polars and pyarrow are not available to test.  Skipping tests using polars and pyarrow.")
 
 def genFrame():
     python_lists = [["ls 1029551"], ["no 983196"], ["true 689851"], ["437594"], ["no,ls 113569"], ["no,true 70607"]]

diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py b/h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py
@@ -0,0 +1,80 @@
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+from h2o.utils.shared_utils import (can_use_datatable, can_install_datatable)
+import pandas as pd
+from h2o.utils.threading import local_context
+
+
+# datatable had problems with this dataset before.  Checking here to make sure it works.
+def test_frame_conversion(h2oFrame, original_pandas_frame):
+    """Convert ``h2oFrame`` to pandas and compare it with ``original_pandas_frame``.
+
+    The caller chooses the conversion path through ``local_context``.
+
+    :param h2oFrame: frame converted here with ``as_data_frame()``.
+    :param original_pandas_frame: reference pandas frame to compare against.
+    :raises AssertionError: if a column dtype differs, if an ``object`` column
+        differs anywhere except where both frames hold NA, or if the largest
+        absolute difference in any other column is not below 1e-10.
+    """
+    print("h2o frame to pandas frame conversion using datatable")
+    new_pandas_frame = h2oFrame.as_data_frame()
+    # compare two frames column types
+    new_types = new_pandas_frame.dtypes
+    old_types = original_pandas_frame.dtypes
+    colNames = new_pandas_frame.columns
+
+    for ind in range(h2oFrame.ncol):
+        name = colNames[ind]
+        assert new_types[name] == old_types[name], "Expected column types: {0}, actual column types: " \
+                                                   "{1}".format(old_types[name], new_types[name])
+        new_col = new_pandas_frame[name]
+        old_col = original_pandas_frame[name]
+        if new_types[name] == "object":
+            same = new_col == old_col
+            # NA never compares equal to NA, so a mismatch is tolerated only where both
+            # frames hold NA; comparing the NA masks alone lets differing values through
+            both_na = pd.isna(new_col) & pd.isna(old_col)
+            assert (same | both_na).all(), "Column {0} differs between the two conversions".format(name)
+        else:
+            diff = (new_col - old_col).abs()
+            assert diff.max() < 1e-10
+
+
+def test_datatable():
+    """Compare two pandas conversions of ``genFrame()`` made under different ``local_context`` settings.
+
+    Prints a message and does nothing else when ``can_install_datatable()`` is
+    false; calls ``pyunit_utils.install("datatable")`` first when
+    ``can_use_datatable()`` is false.
+    """
+    if not can_install_datatable():
+        print("datatable are not available to test.  Skipping tests using datatable.")
+        return
+    if not can_use_datatable():
+        pyunit_utils.install("datatable")
+
+    # reference conversion: datatable and polars both disabled
+    with local_context(datatable_disabled=True, polars_disabled=True):
+        h2oframe = genFrame()
+        print("converting h2o frame to pandas frame using single thread:")
+        original_pandas = h2oframe.as_data_frame()
+    # conversion under test: only polars disabled
+    with local_context(polars_disabled=True):
+        test_frame_conversion(h2oframe, original_pandas)
+
+
+def genFrame():
+    """Build a one-column ``enum`` H2OFrame named ``X``; some of its values contain commas."""
+    python_lists = [["ls 1029551"], ["no 983196"], ["true 689851"], ["437594"], ["no,ls 113569"], ["no,true 70607"]]
+    col_names=["X"]
+    col_types=['enum']
+    return h2o.H2OFrame(python_obj=python_lists, column_names=col_names, column_types=col_types)
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_datatable)
+else:
+    test_datatable()
diff --git a/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py b/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py
@@ -5,50 +5,44 @@
 from h2o.estimators.gbm import H2OGradientBoostingEstimator
 from h2o.estimators.isotonicregression import H2OIsotonicRegressionEstimator
 from pandas.testing import assert_frame_equal
+from h2o.utils.threading import local_context
 
 
 def test_calibrate_existing_model():
-    df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
-    df["Angaus"] = df["Angaus"].asfactor()
-
-    train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)
-
-    model_int_calib = H2OGradientBoostingEstimator(
-        ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, seed=42,
-        calibrate_model=True, calibration_frame=calib, calibration_method="IsotonicRegression"
-    )
-    model_int_calib.train(
-        x=list(range(2, train.ncol)),
-        y="Angaus", training_frame=train
-    )
-    preds_int_calib = model_int_calib.predict(train)
-
-    isotonic_train = calib[["Angaus"]]
-    isotonic_train = isotonic_train.cbind(model_int_calib.predict(calib)["p1"])
-    h2o_iso_reg = H2OIsotonicRegressionEstimator(out_of_bounds="clip")
-    h2o_iso_reg.train(training_frame=isotonic_train, x="p1", y="Angaus")
-
-    model_man_calib = H2OGradientBoostingEstimator(
-        ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, seed=42
-    )
-    model_man_calib.train(
-        x=list(range(2, train.ncol)),
-        y="Angaus", training_frame=train
-    )
-    preds_no_calib = model_man_calib.predict(train)
-    assert preds_no_calib.col_names == ["predict", "p0", "p1"]
-
-    model_man_calib.calibrate(h2o_iso_reg)
-
-    preds_man_calib = model_man_calib.predict(train)
-    assert preds_man_calib.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"]
-
-    assert_frame_equal(preds_int_calib.as_data_frame(), preds_man_calib.as_data_frame())
-
-    # test MOJO
-    mojo = pyunit_utils.download_mojo(model_man_calib)
-    mojo_prediction = h2o.mojo_predict_pandas(dataframe=train.as_data_frame(), predict_calibrated=True, **mojo)
-    assert_frame_equal(preds_int_calib.as_data_frame(), mojo_prediction)
+    with local_context(datatable_disabled=True, polars_disabled=True): # conversion h2o frame to pandas using single thread as before
+      df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
+      df["Angaus"] = df["Angaus"].asfactor()
+
+      train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)
+
+      model_int_calib = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5,
+                                                     seed=42, calibrate_model=True, calibration_frame=calib, 
+                                                     calibration_method="IsotonicRegression")
+      model_int_calib.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)
+      preds_int_calib = model_int_calib.predict(train)
+
+      isotonic_train = calib[["Angaus"]]
+      isotonic_train = isotonic_train.cbind(model_int_calib.predict(calib)["p1"])
+      h2o_iso_reg = H2OIsotonicRegressionEstimator(out_of_bounds="clip")
+      h2o_iso_reg.train(training_frame=isotonic_train, x="p1", y="Angaus")
+
+      model_man_calib = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5,
+                                                     seed=42)
+      model_man_calib.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)
+      preds_no_calib = model_man_calib.predict(train)
+      assert preds_no_calib.col_names == ["predict", "p0", "p1"]
+
+      model_man_calib.calibrate(h2o_iso_reg)
+
+      preds_man_calib = model_man_calib.predict(train)
+      assert preds_man_calib.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"]
+
+      assert_frame_equal(preds_int_calib.as_data_frame(), preds_man_calib.as_data_frame())
+
+      # test MOJO
+      mojo = pyunit_utils.download_mojo(model_man_calib)
+      mojo_prediction = h2o.mojo_predict_pandas(dataframe=train.as_data_frame(), predict_calibrated=True, **mojo)
+      assert_frame_equal(preds_int_calib.as_data_frame(), mojo_prediction)
 
 
 if __name__ == "__main__":