Make the use of datatable and polars opt-in, not opt-out

h2oai · Jan 18, 2024 · ef9ecdd
1 parent 108905e
commit ef9ecdd
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 45 deletions.
diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
@@ -1966,7 +1966,7 @@ def as_data_frame(self, use_pandas=True, header=True):
         >>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
         >>> df = airlines.as_data_frame()
         >>> df
-        """ 
+        """
         if can_use_pandas() and use_pandas:
             import pandas
             if (can_use_datatable()) or (can_use_polars() and can_use_pyarrow()): # can use multi-thread
@@ -1987,7 +1987,8 @@ def as_data_frame(self, use_pandas=True, header=True):
             frame.pop(0)
         return frame
 
-    def convert_with_polars(selfself, fileName):
+
+    def convert_with_polars(self, fileName):
         import polars as pl
         dt_frame = pl.read_csv(fileName, null_values = "")
         return dt_frame.to_pandas()

diff --git a/h2o-py/h2o/utils/shared_utils.py b/h2o-py/h2o/utils/shared_utils.py
@@ -121,28 +121,38 @@ def temp_ctr():
 
 
 def is_module_available(mod):
-    if local_env(mod+"_disabled"): # fast track if module is explicitly disabled
+    if local_env(mod+"_disabled"):  # fast track if module is explicitly disabled
         return False
     if mod in sys.modules and sys.modules[mod] is not None:  # fast track + safer in unusual environments 
         return True
 
     import importlib.util
     return importlib.util.find_spec(mod) is not None
 
+
+def is_module_enabled(mod):
+    return local_env(mod+"_enabled") and is_module_available(mod)
+
+
 def can_use_pandas():
     return is_module_available('pandas')
 
+
 def can_use_datatable():
-    return is_module_available('datatable') and sys.version_info.major == 3 and sys.version_info.minor <= 9
+    return is_module_enabled('datatable') and sys.version_info.major == 3 and sys.version_info.minor <= 9
+
 
 def can_install_datatable():
     return sys.version_info.major == 3 and sys.version_info.minor <= 9
 
+
 def can_install_polars():
     return sys.version_info.major == 3 and sys.version_info.minor > 9
 
+
 def can_use_polars():
-    return is_module_available('polars') and sys.version_info.major == 3 and sys.version_info.minor > 9
+    return is_module_enabled('polars') and sys.version_info.major == 3 and sys.version_info.minor > 9
+
 
 def can_use_pyarrow():
     if can_use_pandas() and sys.version_info.minor > 9:
@@ -152,9 +162,11 @@ def can_use_pyarrow():
     else:
         return False
 
+
 def can_use_numpy():
     return is_module_available('numpy')
 
+
 _url_safe_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~"
 _url_chars_map = [chr(i) if chr(i) in _url_safe_chars else "%%%02X" % i for i in range(256)]
 
@@ -376,7 +388,7 @@ def slice_is_normalized(s):
 h2o_predictor_class = "hex.genmodel.tools.PredictCsv"
 
 
-def mojo_predict_pandas(dataframe, mojo_zip_path, genmodel_jar_path=None, classpath=None, java_options=None, 
+def   mojo_predict_pandas(dataframe, mojo_zip_path, genmodel_jar_path=None, classpath=None, java_options=None, 
                         verbose=False, setInvNumNA=False, predict_contributions=False, predict_calibrated=False):
     """
     MOJO scoring function to take a Pandas frame and use MOJO model as zip file to score.

diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from h2o.utils.threading import local_context
 
+
 # if datatable or polars/pyarrow is installed, this test will show that using datatable to convert h2o frame to pandas
 # frame is much faster for large datasets.
 def test_frame_conversion(dataset, original_pandas_frame, module):
@@ -38,7 +39,8 @@ def test_frame_conversion(dataset, original_pandas_frame, module):
             diff = (new_pandas_frame[colNames[ind]] - original_pandas_frame[colNames[ind]]).abs()
             assert diff.max() < 1e-10
 
-def singl_thread_pandas_conversion(dataset):
+
+def single_thread_pandas_conversion(dataset):
     with local_context(datatable_disabled=True, polars_disabled=True):
         print("converting h2o frame to pandas frame using single thread")
         h2oFrame = h2o.import_file(pyunit_utils.locate(dataset))
@@ -48,41 +50,32 @@ def singl_thread_pandas_conversion(dataset):
         print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(newTime, dataset))
         return h2oframe_panda
 
+
 def test_polars_datatable():
     file1 = "smalldata/titanic/titanic_expanded.csv"
     file2 = "smalldata/glm_test/multinomial_3Class_10KRow.csv"
     file3 = "smalldata/timeSeries/CreditCard-ts_train.csv"
 
-    original_converted_frame1 = singl_thread_pandas_conversion(file1)
-    original_converted_frame2 = singl_thread_pandas_conversion(file2)
-    original_converted_frame3 = singl_thread_pandas_conversion(file3)
+    original_converted_frame1 = single_thread_pandas_conversion(file1)
+    original_converted_frame2 = single_thread_pandas_conversion(file2)
+    original_converted_frame3 = single_thread_pandas_conversion(file3)
 
-    if not(can_install_datatable()):
-        print("datatable is not available.  Skipping tests using datatable.")
-    else:
-        if not(can_use_datatable()):
-            pyunit_utils.install("datatable")
-
-        with local_context(polars_disabled=True):   # run with datatable
+    if can_install_datatable():
+        with local_context(polars_disabled=True, datatable_enabled=True):   # run with datatable
+            assert can_use_datatable(), "Can't use datatable"   
             print("test data frame conversion using datatable.")
             test_frame_conversion(file1, original_converted_frame1, "datatable")
             test_frame_conversion(file2, original_converted_frame2, "datatable")
             test_frame_conversion(file3, original_converted_frame3, "datatable")    
-
-    if not(can_install_polars()):
-        print("polars, pyarrow are not available.  Skipping tests using polars and pyarrow")
-    else:
-        if not(can_use_polars()):
-            pyunit_utils.install("polars")
-        if not(can_use_pyarrow()):
-            pyunit_utils.install("pyarrow")
-
-        with local_context(datatable_disabled=True):
-            if can_use_polars() and can_use_pyarrow():
-                print("test data frame conversion using polars and pyarrow.")
-                test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
-                test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow")
-                test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow")    
+
+    if can_install_polars():                        
+        with local_context(datatable_disabled=True, polars_enabled=True):
+            assert can_use_polars() and can_use_pyarrow(), "Can't use polars"
+            print("test data frame conversion using polars and pyarrow.")
+            test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
+            test_frame_conversion(file2, original_converted_frame2, "polars and pyarrow")
+            test_frame_conversion(file3, original_converted_frame3, "polars and pyarrow")    
+
 
 if __name__ == "__main__":
     pyunit_utils.standalone_test(test_polars_datatable)

diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_15936_datatable_polars_2_pandas_large.py
@@ -7,6 +7,7 @@
 import time
 from h2o.utils.threading import local_context
 
+
 def test_frame_conversion(dataset, original_pandas_frame, module):
     # convert frame using datatable or polar
     h2oFrame = h2o.import_file(pyunit_utils.locate(dataset))
@@ -35,6 +36,7 @@ def test_frame_conversion(dataset, original_pandas_frame, module):
             diff = (new_pandas_frame[colNames[ind]] - original_pandas_frame[colNames[ind]]).abs()
             assert diff.max() < 1e-10
 
+
 def single_thread_pandas_conversion(dataset):
     with local_context(datatable_disabled=True, polars_disabled=True):
         print("converting h2o frame to pandas frame using single thread")
@@ -45,32 +47,25 @@ def single_thread_pandas_conversion(dataset):
         print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(newTime, dataset))
         return h2oframe_panda
 
+
 # if datatable or polars/pyarrow is installed, this test will show that using datatable to convert h2o frame to pandas
 # frame is much faster for large datasets.
 def test_polars_datatable_2_pandas():
     file1 = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv"
     original_converted_frame1 = single_thread_pandas_conversion(file1)  # need to run conversion in single thread
 
     if can_install_datatable():
-        if not(can_use_datatable()):
-            pyunit_utils.install("datatable")
-        with local_context(polars_disabled=True):   # run with datatable
+        with local_context(polars_disabled=True, datatable_enabled=True):   # run with datatable
+            assert can_use_datatable(), "Can't use datatable"
             print("test data frame conversion using datatable.")
             test_frame_conversion(file1, original_converted_frame1, "datatable")
 
-    else:
-        print("datatable is not available.  Skipping tests using datatable.")
-
     if can_install_polars():
-        if not(can_use_polars()):
-            pyunit_utils.install("polars")
-        if not(can_use_pyarrow()):
-            pyunit_utils.install("pyarrow")
-        with local_context(datatable_disabled=True):
+        with local_context(datatable_disabled=True, polars_enabled=True):
+            assert can_use_polars() and can_use_pyarrow(), "Can't use polars"
             print("test data frame conversion using polars and pyarrow.")
             test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
-    else:
-            print("polars, pyarrow are not available.  Skipping tests using polars and pyarrow")
+
 
 if __name__ == "__main__":
     pyunit_utils.standalone_test(test_polars_datatable_2_pandas)