From 1565818b35b190ee71381689f03ad37dbaec7d7f Mon Sep 17 00:00:00 2001
From: wendycwong <wendy@h2o.ai>
Date: Thu, 11 Jan 2024 13:19:04 -0800
Subject: [PATCH] GH-15994 : check failed dataset conversion to pandas using
 datatable (#15995)

GH-15994: Add a test to make sure dataset conversion works with the customer example.
GH-15994: Install datatable if it is not available but the Python version supports it.
GH-15994: Only skip tests if datatable or polars/pyarrow cannot be installed.
GH-15994: Changed old tests that use as_data_frame to convert in a single thread, as before.
Co-authored with tomasfryda
---
 h2o-py/h2o/utils/shared_utils.py              |  6 ++
 h2o-py/tests/pyunit_utils/utilsPY.py          |  3 +
 .../testdir_misc/pyunit_gainslift_bins.py     | 18 +++--
 ...h_15729_15936_datatable_polars_2_pandas.py | 46 ++++++-----
 ...9_15936_datatable_polars_2_pandas_large.py | 24 ++++--
 .../pyunit_gh_15936_polars2pandas_error.py    | 13 +++-
 .../pyunit_gh_15994_datatable2pandas_error.py | 61 +++++++++++++++
 .../testdir_misc/pyunit_model_calibrate.py    | 76 +++++++++----------
 .../pyunit_stratified_kfold_medium.py         | 37 ++++-----
 9 files changed, 187 insertions(+), 97 deletions(-)
 create mode 100644 h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py

diff --git a/h2o-py/h2o/utils/shared_utils.py b/h2o-py/h2o/utils/shared_utils.py
index 168bbd8e086a..15cd23656609 100644
--- a/h2o-py/h2o/utils/shared_utils.py
+++ b/h2o-py/h2o/utils/shared_utils.py
@@ -135,6 +135,12 @@ def can_use_pandas():
 def can_use_datatable():
     return is_module_available('datatable') and sys.version_info.major == 3 and sys.version_info.minor <= 9
 
+def can_install_datatable():
+    return sys.version_info.major == 3 and sys.version_info.minor <= 9
+
+def can_install_polars():
+    return sys.version_info.major == 3 and sys.version_info.minor > 9
+
 def can_use_polars():
     return is_module_available('polars') and sys.version_info.major == 3 and sys.version_info.minor > 9
 
diff --git a/h2o-py/tests/pyunit_utils/utilsPY.py b/h2o-py/tests/pyunit_utils/utilsPY.py
index 6632b26ab202..335fe8834b3f 100644
--- a/h2o-py/tests/pyunit_utils/utilsPY.py
+++ b/h2o-py/tests/pyunit_utils/utilsPY.py
@@ -4739,3 +4739,6 @@ def prepare_data():
     y = 'Y'
 
     return df, x, y
+
+def install(package):
+    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
diff --git a/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py b/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py
index 02d3a57a0670..ad7167ef9520 100644
--- a/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py
+++ b/h2o-py/tests/testdir_misc/pyunit_gainslift_bins.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from tests import pyunit_utils
 from h2o.estimators import *
+from h2o.utils.threading import local_context
 
 eps = 1e-10
 
@@ -22,18 +23,19 @@ def fast_estimator(estimator, **kwargs):
 
 
 def ks_score(mod, data, y):
-    from scipy.stats import ks_2samp
+    with local_context(datatable_disabled=True, polars_disabled=True): # conversion h2o frame to pandas using single thread as before
+      from scipy.stats import ks_2samp
 
-    df = pd.DataFrame()
-    df["label"] = data[y].as_data_frame().iloc[:, 0]
-    df["probs"] = mod.predict(data)["p1"].as_data_frame().iloc[:, 0]
+      df = pd.DataFrame()
+      df["label"] = data[y].as_data_frame().iloc[:, 0]
+      df["probs"] = mod.predict(data)["p1"].as_data_frame().iloc[:, 0]
 
-    label_0 = df[df["label"] == 0]
-    label_1 = df[df["label"] == 1]
+      label_0 = df[df["label"] == 0]
+      label_1 = df[df["label"] == 1]
 
-    ks = ks_2samp(label_0["probs"], label_1["probs"])
+      ks = ks_2samp(label_0["probs"], label_1["probs"])
 
-    return ks.statistic
+      return ks.statistic
 
 
 def get_ks(model, data):
diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py
index 295540a942c3..c233515bce6a 100644
--- a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py
+++ b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py
@@ -2,7 +2,8 @@
 sys.path.insert(1,"../../")
 import h2o
 from tests import pyunit_utils
-from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow)
+from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow, can_install_datatable, 
+                                    can_install_polars)
 import time
 import pandas as pd
 from h2o.utils.threading import local_context
@@ -47,32 +48,41 @@ def singl_thread_pandas_conversion(dataset):
         print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(newTime, dataset))
         return h2oframe_panda
     
-def test_polars_datatable(): 
+def test_polars_datatable():
     file1 = "smalldata/titanic/titanic_expanded.csv"
     file2 = "smalldata/glm_test/multinomial_3Class_10KRow.csv"
     file3 = "smalldata/timeSeries/CreditCard-ts_train.csv"
-    
+
     original_converted_frame1 = singl_thread_pandas_conversion(file1)
     original_converted_frame2 = singl_thread_pandas_conversion(file2)
     original_converted_frame3 = singl_thread_pandas_conversion(file3)
-        
-    with local_context(polars_disabled=True):   # run with datatable
-        if can_use_datatable():
+
+    if not(can_install_datatable()):
+        print("datatable is not available.  Skipping tests using datatable.")
+    else:
+        if not(can_use_datatable()):
+            pyunit_utils.install("datatable")
+
+        with local_context(polars_disabled=True):   # run with datatable
             print("test data frame conversion using datatable.")
             test_frame_conversion(file1, original_converted_frame1, "datatable")
             test_frame_conversion(file2, original_converted_frame2, "datatable")
-            test_frame_conversion(file3, original_converted_frame3, "datatable")
-        else:
-            print("datatable is not available.  Skipping tests using datatable.")
-        
-    with local_context(datatable_disabled=True):
-        if can_use_polars() and can_use_pyarrow():
-            print("test data frame conversion using polars and pyarrow.")
-            test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
-            test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow")
-            test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow")    
-        else:
-            print("polars, pyarrow are not available.  Skipping tests using polars and pyarrow")             
+            test_frame_conversion(file3, original_converted_frame3, "datatable")    
+            
+    if not(can_install_polars()):
+        print("polars, pyarrow are not available.  Skipping tests using polars and pyarrow")
+    else:
+        if not(can_use_polars()):
+            pyunit_utils.install("polars")
+        if not(can_use_pyarrow()):
+            pyunit_utils.install("pyarrow")
+                    
+        with local_context(datatable_disabled=True):
+            if can_use_polars() and can_use_pyarrow():
+                print("test data frame conversion using polars and pyarrow.")
+                test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
+                test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow")
+                test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow")    
 
 if __name__ == "__main__":
     pyunit_utils.standalone_test(test_polars_datatable)
diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py
index 45e6b65c5aed..f0acf2df1b7e 100644
--- a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py
+++ b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py
@@ -2,7 +2,8 @@
 sys.path.insert(1,"../../")
 import h2o
 from tests import pyunit_utils
-from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow)
+from h2o.utils.shared_utils import (can_use_datatable, can_use_polars, can_use_pyarrow, can_install_datatable, 
+                                    can_install_polars)
 import time
 from h2o.utils.threading import local_context
 
@@ -50,18 +51,25 @@ def test_polars_datatable_2_pandas():
     file1 = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv"
     original_converted_frame1 = single_thread_pandas_conversion(file1)  # need to run conversion in single thread
 
-    with local_context(polars_disabled=True):   # run with datatable
-        if can_use_datatable():
+    if can_install_datatable():
+        if not(can_use_datatable()):
+            pyunit_utils.install("datatable")
+        with local_context(polars_disabled=True):   # run with datatable
             print("test data frame conversion using datatable.")
             test_frame_conversion(file1, original_converted_frame1, "datatable")
-        else:
-            print("datatable is not available.  Skipping tests using datatable.")
 
-    with local_context(datatable_disabled=True):
-        if can_use_polars() and can_use_pyarrow():
+    else:
+        print("datatable is not available.  Skipping tests using datatable.")
+
+    if can_install_polars():
+        if not(can_use_polars()):
+            pyunit_utils.install("polars")
+        if not(can_use_pyarrow()):
+            pyunit_utils.install("pyarrow")
+        with local_context(datatable_disabled=True):
             print("test data frame conversion using polars and pyarrow.")
             test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
-        else:
+    else:
             print("polars, pyarrow are not available.  Skipping tests using polars and pyarrow")
 
 if __name__ == "__main__":
diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py b/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py
index de19ed3cd879..455369c28d98 100644
--- a/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py
+++ b/h2o-py/tests/testdir_misc/pyunit_gh_15936_polars2pandas_error.py
@@ -2,7 +2,7 @@
 sys.path.insert(1,"../../")
 import h2o
 from tests import pyunit_utils
-from h2o.utils.shared_utils import (can_use_polars, can_use_pyarrow)
+from h2o.utils.shared_utils import (can_use_polars, can_use_pyarrow, can_install_polars)
 import pandas as pd
 from h2o.utils.threading import local_context
 
@@ -32,15 +32,20 @@ def test_frame_conversion(h2oFrame, original_pandas_frame):
             assert diff.max() < 1e-10
             
 def test_polars_pyarrow():
-    if can_use_polars() and can_use_pyarrow():
+    if not(can_install_polars()):
+        print("polars and pyarrow are not available to test.  Skipping tests using polars and pyarrow.")
+    else:
+        if not(can_use_polars()):
+            pyunit_utils.install("polars")
+        if not(can_use_pyarrow()):
+            pyunit_utils.install("pyarrow")
+
         with local_context(datatable_disabled=True, polars_disabled=True):
             h2oframe = genFrame()
             print("converting h2o frame to pandas frame using single thread:")
             original_pandas = h2oframe.as_data_frame()
         with local_context(datatable_disabled=True):
             test_frame_conversion(h2oframe, original_pandas)  
-    else:
-        print("polars and pyarrow are not available to test.  Skipping tests using polars and pyarrow.")
 
 def genFrame():
     python_lists = [["ls 1029551"], ["no 983196"], ["true 689851"], ["437594"], ["no,ls 113569"], ["no,true 70607"]]
diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py b/h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py
new file mode 100644
index 000000000000..6fe48b0b4aad
--- /dev/null
+++ b/h2o-py/tests/testdir_misc/pyunit_gh_15994_datatable2pandas_error.py
@@ -0,0 +1,61 @@
+import sys
+sys.path.insert(1,"../../")
+import h2o
+from tests import pyunit_utils
+from h2o.utils.shared_utils import (can_use_datatable, can_install_datatable)
+import pandas as pd
+from h2o.utils.threading import local_context
+
+
+# datatable had problems with this dataset before.  Check here to make sure the conversion works.
+def test_frame_conversion(h2oFrame, original_pandas_frame):
+    print("h2o frame to pandas frame conversion using datatable")
+    new_pandas_frame = h2oFrame.as_data_frame()
+    # compare the column types of the two frames
+    new_types = new_pandas_frame.dtypes
+    old_types = original_pandas_frame.dtypes
+    ncol = h2oFrame.ncol
+    colNames = new_pandas_frame.columns
+    
+    for ind in list(range(ncol)):
+        assert new_types[colNames[ind]] == old_types[colNames[ind]], "Expected column types: {0}, actual column types: " \
+                                                 "{1}".format(old_types[colNames[ind]], new_types[colNames[ind]])
+        if new_types[colNames[ind]] == "object":
+            diff = new_pandas_frame[colNames[ind]] == original_pandas_frame[colNames[ind]]
+            if not diff.all(): # difference caused by the presence of NAs
+                newSeries = pd.Series(new_pandas_frame[colNames[ind]])
+                newNA = newSeries.isna()
+                oldSeries = pd.Series(original_pandas_frame[colNames[ind]])
+                oldNA = oldSeries.isna()
+                assert (newNA==oldNA).all()       
+        else:
+            diff = (new_pandas_frame[colNames[ind]] - original_pandas_frame[colNames[ind]]).abs()
+            assert diff.max() < 1e-10
+            
+            
+def test_datatable():
+    if can_install_datatable():
+        if not(can_use_datatable()):
+            pyunit_utils.install("datatable")
+        
+        with local_context(datatable_disabled=True, polars_disabled=True):
+            h2oframe = genFrame()
+            print("converting h2o frame to pandas frame using single thread:")
+            original_pandas = h2oframe.as_data_frame()
+        with local_context(polars_disabled=True):
+            test_frame_conversion(h2oframe, original_pandas)  
+    else:
+        print("datatable are not available to test.  Skipping tests using datatable.")
+
+
+def genFrame():
+    python_lists = [["ls 1029551"], ["no 983196"], ["true 689851"], ["437594"], ["no,ls 113569"], ["no,true 70607"]]
+    col_names=["X"]
+    col_types=['enum']
+    return h2o.H2OFrame(python_obj=python_lists, column_names=col_names, column_types=col_types)
+
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_datatable)
+else:
+    test_datatable()
diff --git a/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py b/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py
index 0602ab2a591a..eeeba46bcec2 100644
--- a/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py
+++ b/h2o-py/tests/testdir_misc/pyunit_model_calibrate.py
@@ -5,50 +5,44 @@
 from h2o.estimators.gbm import H2OGradientBoostingEstimator
 from h2o.estimators.isotonicregression import H2OIsotonicRegressionEstimator
 from pandas.testing import assert_frame_equal
+from h2o.utils.threading import local_context
 
 
 def test_calibrate_existing_model():
-    df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
-    df["Angaus"] = df["Angaus"].asfactor()
-
-    train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)
-
-    model_int_calib = H2OGradientBoostingEstimator(
-        ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, seed=42,
-        calibrate_model=True, calibration_frame=calib, calibration_method="IsotonicRegression"
-    )
-    model_int_calib.train(
-        x=list(range(2, train.ncol)),
-        y="Angaus", training_frame=train
-    )
-    preds_int_calib = model_int_calib.predict(train)
-
-    isotonic_train = calib[["Angaus"]]
-    isotonic_train = isotonic_train.cbind(model_int_calib.predict(calib)["p1"])
-    h2o_iso_reg = H2OIsotonicRegressionEstimator(out_of_bounds="clip")
-    h2o_iso_reg.train(training_frame=isotonic_train, x="p1", y="Angaus")
-
-    model_man_calib = H2OGradientBoostingEstimator(
-        ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5, seed=42
-    )
-    model_man_calib.train(
-        x=list(range(2, train.ncol)),
-        y="Angaus", training_frame=train
-    )
-    preds_no_calib = model_man_calib.predict(train)
-    assert preds_no_calib.col_names == ["predict", "p0", "p1"]
-
-    model_man_calib.calibrate(h2o_iso_reg)
-
-    preds_man_calib = model_man_calib.predict(train)
-    assert preds_man_calib.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"]
-
-    assert_frame_equal(preds_int_calib.as_data_frame(), preds_man_calib.as_data_frame())
-
-    # test MOJO
-    mojo = pyunit_utils.download_mojo(model_man_calib)
-    mojo_prediction = h2o.mojo_predict_pandas(dataframe=train.as_data_frame(), predict_calibrated=True, **mojo)
-    assert_frame_equal(preds_int_calib.as_data_frame(), mojo_prediction)
+    with local_context(datatable_disabled=True, polars_disabled=True): # conversion h2o frame to pandas using single thread as before
+      df = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
+      df["Angaus"] = df["Angaus"].asfactor()
+
+      train, calib = df.split_frame(ratios=[.8], destination_frames=["eco_train", "eco_calib"], seed=42)
+
+      model_int_calib = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5,
+                                                     seed=42, calibrate_model=True, calibration_frame=calib, 
+                                                     calibration_method="IsotonicRegression")
+      model_int_calib.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)
+      preds_int_calib = model_int_calib.predict(train)
+
+      isotonic_train = calib[["Angaus"]]
+      isotonic_train = isotonic_train.cbind(model_int_calib.predict(calib)["p1"])
+      h2o_iso_reg = H2OIsotonicRegressionEstimator(out_of_bounds="clip")
+      h2o_iso_reg.train(training_frame=isotonic_train, x="p1", y="Angaus")
+
+      model_man_calib = H2OGradientBoostingEstimator(ntrees=100, distribution="bernoulli", min_rows=10, max_depth=5,
+                                                     seed=42)
+      model_man_calib.train(x=list(range(2, train.ncol)), y="Angaus", training_frame=train)
+      preds_no_calib = model_man_calib.predict(train)
+      assert preds_no_calib.col_names == ["predict", "p0", "p1"]
+
+      model_man_calib.calibrate(h2o_iso_reg)
+
+      preds_man_calib = model_man_calib.predict(train)
+      assert preds_man_calib.col_names == ["predict", "p0", "p1", "cal_p0", "cal_p1"]
+
+      assert_frame_equal(preds_int_calib.as_data_frame(), preds_man_calib.as_data_frame())
+
+      # test MOJO
+      mojo = pyunit_utils.download_mojo(model_man_calib)
+      mojo_prediction = h2o.mojo_predict_pandas(dataframe=train.as_data_frame(), predict_calibrated=True, **mojo)
+      assert_frame_equal(preds_int_calib.as_data_frame(), mojo_prediction)
 
 
 if __name__ == "__main__":
diff --git a/h2o-py/tests/testdir_misc/pyunit_stratified_kfold_medium.py b/h2o-py/tests/testdir_misc/pyunit_stratified_kfold_medium.py
index 84d9de8b0761..a31b6e2a2d99 100644
--- a/h2o-py/tests/testdir_misc/pyunit_stratified_kfold_medium.py
+++ b/h2o-py/tests/testdir_misc/pyunit_stratified_kfold_medium.py
@@ -5,36 +5,37 @@
 sys.path.insert(1,"../../")
 import h2o
 from tests import pyunit_utils
+from h2o.utils.threading import local_context
 
 
 
 
 def stratified_kfold():
+  with local_context(datatable_disabled=True, polars_disabled=True): # convert h2o frame to pandas frame in single-thread as before
+    NFOLDS=5
 
-  NFOLDS=5
+    fr = h2o.import_file(pyunit_utils.locate("bigdata/laptop/covtype/covtype.data"))
 
-  fr = h2o.import_file(pyunit_utils.locate("bigdata/laptop/covtype/covtype.data"))
+    stratified = fr[54].stratified_kfold_column(n_folds=NFOLDS)
+    stratified.show()
 
-  stratified = fr[54].stratified_kfold_column(n_folds=NFOLDS)
-  stratified.show()
+    dist = (old_div(fr[54].table()["Count"], fr[54].table()["Count"].sum())).as_data_frame(True).to_dict("list")["Count"]  # get a raw list of means
 
-  dist = (old_div(fr[54].table()["Count"], fr[54].table()["Count"].sum())).as_data_frame(True).to_dict("list")["Count"]  # get a raw list of means
+    overall_result = reduce(lambda x,y: x.cbind(y), [old_div(fr[stratified==i,54].table()["Count"],fr[stratified==i,54].table()["Count"].sum()) for i in range(NFOLDS)])
+    overall_result.show()
+    df = overall_result.as_data_frame(True)  # get the overall result here
 
-  overall_result = reduce(lambda x,y: x.cbind(y), [old_div(fr[stratified==i,54].table()["Count"],fr[stratified==i,54].table()["Count"].sum()) for i in range(NFOLDS)])
-  overall_result.show()
-  df = overall_result.as_data_frame(True)  # get the overall result here
 
+    # show that folds are consistent
+    print()
+    print("Show that all folds are consistent with one another: ")
+    print(df.mean(axis=1))  # print the average
+    print(df.var(axis=1))   # print the standard deviation
+    print()
 
-  # show that folds are consistent
-  print()
-  print("Show that all folds are consistent with one another: ")
-  print(df.mean(axis=1))  # print the average
-  print(df.var(axis=1))   # print the standard deviation
-  print()
-
-  # now show that folds are consistent with the original distribution of classes
-  for i in range(len(dist)):
-    print("Stratification variance for class #%s: %s" %(i, old_div((df.loc[i].sub(dist[i]).pow(2).sum()), (df.shape[0] - 1))))
+    # now show that folds are consistent with the original distribution of classes
+    for i in range(len(dist)):
+      print("Stratification variance for class #%s: %s" %(i, old_div((df.loc[i].sub(dist[i]).pow(2).sum()), (df.shape[0] - 1))))