Skip to content

Commit

Permalink
Make use of datatable and polars opt-in not opt-out
Browse files Browse the repository at this point in the history
  • Loading branch information
tomasfryda committed Jan 18, 2024
1 parent 108905e commit ef9ecdd
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 45 deletions.
5 changes: 3 additions & 2 deletions h2o-py/h2o/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1966,7 +1966,7 @@ def as_data_frame(self, use_pandas=True, header=True):
>>> airlines['FlightNum'] = airlines['FlightNum'].asfactor()
>>> df = airlines.as_data_frame()
>>> df
"""
"""
if can_use_pandas() and use_pandas:
import pandas
if (can_use_datatable()) or (can_use_polars() and can_use_pyarrow()): # can use multi-thread
Expand All @@ -1987,7 +1987,8 @@ def as_data_frame(self, use_pandas=True, header=True):
frame.pop(0)
return frame

def convert_with_polars(self, fileName):
    """
    Read a CSV file with polars and return its content as a pandas DataFrame.

    :param fileName: path of the CSV file to load.
    :returns: the pandas object produced by calling ``to_pandas()`` on the polars frame.
    """
    import polars as pl  # imported lazily: polars is an optional dependency
    # empty fields are read as nulls rather than as empty strings
    polars_frame = pl.read_csv(fileName, null_values="")
    return polars_frame.to_pandas()
Expand Down
20 changes: 16 additions & 4 deletions h2o-py/h2o/utils/shared_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,28 +121,38 @@ def temp_ctr():


def is_module_available(mod):
    """
    Tell whether module ``mod`` can be used in the current context.

    :param mod: name of the module to look for, e.g. ``"pandas"``.
    :returns: ``False`` when ``local_env("<mod>_disabled")`` is truthy; otherwise ``True`` if the module is
        already loaded or an import spec can be located for it, ``False`` if not.
    """
    if local_env(mod + "_disabled"):  # fast track if module is explicitly disabled
        return False
    if sys.modules.get(mod) is not None:  # fast track + safer in unusual environments
        return True

    import importlib.util
    try:
        return importlib.util.find_spec(mod) is not None
    except ImportError:
        # find_spec imports the parent package of a dotted name; a missing or broken parent
        # means the module cannot be used, so report it as unavailable instead of raising.
        return False


def is_module_enabled(mod):
    """
    Tell whether module ``mod`` was explicitly switched on and is available.

    The ``<mod>_enabled`` value from ``local_env`` is consulted first; when it is falsy that value is
    returned unchanged and availability is not checked at all.

    :param mod: name of the module, e.g. ``"polars"``.
    :returns: the falsy opt-in value, or the result of ``is_module_available(mod)``.
    """
    opted_in = local_env(mod+"_enabled")
    if not opted_in:
        return opted_in
    return is_module_available(mod)


def can_use_pandas():
    """Return the result of ``is_module_available`` for the ``pandas`` module."""
    pandas_usable = is_module_available('pandas')
    return pandas_usable


def can_use_datatable():
    """
    Tell whether datatable may be used for multi-threaded frame conversion.

    datatable is opt-in: it must be enabled (see ``is_module_enabled``) and the interpreter must satisfy
    ``can_install_datatable()`` (Python 3 with minor version <= 9).

    :returns: a truthy value only when both conditions hold.
    """
    return is_module_enabled('datatable') and can_install_datatable()


def can_install_datatable():
    """Return ``True`` when the interpreter is Python 3 with a minor version of at most 9."""
    interpreter = sys.version_info
    if interpreter.major != 3:
        return False
    return interpreter.minor <= 9


def can_install_polars():
    """Return ``True`` when the interpreter is Python 3 with a minor version above 9."""
    interpreter = sys.version_info
    if interpreter.major != 3:
        return False
    return interpreter.minor > 9


def can_use_polars():
    """
    Tell whether polars may be used for multi-threaded frame conversion.

    polars is opt-in: it must be enabled (see ``is_module_enabled``) and the interpreter must satisfy
    ``can_install_polars()`` (Python 3 with minor version > 9).

    :returns: a truthy value only when both conditions hold.
    """
    return is_module_enabled('polars') and can_install_polars()


def can_use_pyarrow():
if can_use_pandas() and sys.version_info.minor > 9:
Expand All @@ -152,9 +162,11 @@ def can_use_pyarrow():
else:
return False


def can_use_numpy():
    """Return the result of ``is_module_available`` for the ``numpy`` module."""
    numpy_usable = is_module_available('numpy')
    return numpy_usable


# Characters copied into the map unescaped: ASCII letters, digits and "-", "_", ".", "~".
_url_safe_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~"
# Table indexed by code point 0..255: the character itself when it is safe, otherwise its "%XX" escape.
_url_chars_map = [
    char if char in _url_safe_chars else "%%%02X" % code
    for code, char in enumerate(map(chr, range(256)))
]

Expand Down Expand Up @@ -376,7 +388,7 @@ def slice_is_normalized(s):
h2o_predictor_class = "hex.genmodel.tools.PredictCsv"


def mojo_predict_pandas(dataframe, mojo_zip_path, genmodel_jar_path=None, classpath=None, java_options=None,
def mojo_predict_pandas(dataframe, mojo_zip_path, genmodel_jar_path=None, classpath=None, java_options=None,
verbose=False, setInvNumNA=False, predict_contributions=False, predict_calibrated=False):
"""
MOJO scoring function to take a Pandas frame and use MOJO model as zip file to score.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
from h2o.utils.threading import local_context


# If datatable or polars/pyarrow is installed, this test shows that using it to convert an H2O frame to a pandas
# frame is much faster for large datasets.
def test_frame_conversion(dataset, original_pandas_frame, module):
Expand Down Expand Up @@ -38,7 +39,8 @@ def test_frame_conversion(dataset, original_pandas_frame, module):
diff = (new_pandas_frame[colNames[ind]] - original_pandas_frame[colNames[ind]]).abs()
assert diff.max() < 1e-10

def singl_thread_pandas_conversion(dataset):

def single_thread_pandas_conversion(dataset):
with local_context(datatable_disabled=True, polars_disabled=True):
print("converting h2o frame to pandas frame using single thread")
h2oFrame = h2o.import_file(pyunit_utils.locate(dataset))
Expand All @@ -48,41 +50,32 @@ def singl_thread_pandas_conversion(dataset):
print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(newTime, dataset))
return h2oframe_panda


def test_polars_datatable():
    """
    Check H2O-frame-to-pandas conversion through datatable and through polars/pyarrow on three small datasets.

    Each dataset is first converted with ``single_thread_pandas_conversion`` to obtain a reference frame.
    Every multi-threaded backend is then switched on explicitly through ``local_context`` (the backends are
    opt-in) and its output is handed to ``test_frame_conversion`` together with the reference frame.
    A backend is skipped, with a message, when the interpreter version does not allow installing it.
    """
    datasets = [
        "smalldata/titanic/titanic_expanded.csv",
        "smalldata/glm_test/multinomial_3Class_10KRow.csv",
        "smalldata/timeSeries/CreditCard-ts_train.csv",
    ]
    # reference frames, paired with the dataset they were produced from
    references = [(path, single_thread_pandas_conversion(path)) for path in datasets]

    if can_install_datatable():
        with local_context(polars_disabled=True, datatable_enabled=True):  # run with datatable
            assert can_use_datatable(), "Can't use datatable"
            print("test data frame conversion using datatable.")
            for path, reference in references:
                test_frame_conversion(path, reference, "datatable")
    else:
        print("datatable is not available. Skipping tests using datatable.")

    if can_install_polars():
        with local_context(datatable_disabled=True, polars_enabled=True):  # run with polars + pyarrow
            assert can_use_polars() and can_use_pyarrow(), "Can't use polars"
            print("test data frame conversion using polars and pyarrow.")
            for path, reference in references:
                test_frame_conversion(path, reference, "polars and pyarrow")
    else:
        print("polars, pyarrow are not available. Skipping tests using polars and pyarrow")


if __name__ == "__main__":
pyunit_utils.standalone_test(test_polars_datatable)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import time
from h2o.utils.threading import local_context


def test_frame_conversion(dataset, original_pandas_frame, module):
# convert frame using datatable or polars
h2oFrame = h2o.import_file(pyunit_utils.locate(dataset))
Expand Down Expand Up @@ -35,6 +36,7 @@ def test_frame_conversion(dataset, original_pandas_frame, module):
diff = (new_pandas_frame[colNames[ind]] - original_pandas_frame[colNames[ind]]).abs()
assert diff.max() < 1e-10


def single_thread_pandas_conversion(dataset):
with local_context(datatable_disabled=True, polars_disabled=True):
print("converting h2o frame to pandas frame using single thread")
Expand All @@ -45,32 +47,25 @@ def single_thread_pandas_conversion(dataset):
print("H2O frame to Pandas frame conversion time with single thread for dataset {1}: {0}".format(newTime, dataset))
return h2oframe_panda


# If datatable or polars/pyarrow is installed, this test shows that using it to convert an H2O frame to a pandas
# frame is much faster for large datasets.
def test_polars_datatable_2_pandas():
    """
    Check H2O-frame-to-pandas conversion through datatable and through polars/pyarrow on one large dataset.

    The dataset is first converted with ``single_thread_pandas_conversion`` to obtain a reference frame.
    Every multi-threaded backend is then switched on explicitly through ``local_context`` (the backends are
    opt-in) and its output is handed to ``test_frame_conversion`` together with the reference frame.
    A backend is skipped, with a message, when the interpreter version does not allow installing it.
    """
    file1 = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/PUBDEV_5266_f1.csv"
    original_converted_frame1 = single_thread_pandas_conversion(file1)  # need to run conversion in single thread

    if can_install_datatable():
        with local_context(polars_disabled=True, datatable_enabled=True):  # run with datatable
            assert can_use_datatable(), "Can't use datatable"
            print("test data frame conversion using datatable.")
            test_frame_conversion(file1, original_converted_frame1, "datatable")
    else:
        print("datatable is not available. Skipping tests using datatable.")

    if can_install_polars():
        with local_context(datatable_disabled=True, polars_enabled=True):  # run with polars + pyarrow
            assert can_use_polars() and can_use_pyarrow(), "Can't use polars"
            print("test data frame conversion using polars and pyarrow.")
            test_frame_conversion(file1, original_converted_frame1, "polars and pyarrow")
    else:
        print("polars, pyarrow are not available. Skipping tests using polars and pyarrow")


if __name__ == "__main__":
pyunit_utils.standalone_test(test_polars_datatable_2_pandas)
Expand Down

0 comments on commit ef9ecdd

Please sign in to comment.