Merge pull request #86 from franzhuettinger/master

Updated dependency blosc to blosc2
AnotherSamWilson · Apr 27, 2024 · f6fb6ad
2 parents 41187f6 + 68be789
commit f6fb6ad
Show file tree

Hide file tree

Showing 7 changed files with 45 additions and 27 deletions.
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9"]
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v3
@@ -27,17 +27,17 @@ jobs:
           pip install mypy
           pip install codecov
           pip install pytest-cov
-          pip install blosc
+          pip install blosc2
           pip install dill
           pip install pandas
           pip install seaborn
           pip install matplotlib
-          pip install scipy==1.7.3
+          pip install scipy
           pip install scikit-learn
           pip install lightgbm
           pip install pyarrow
       - name: Test with pytest
         run: |
           mypy miceforest
           pytest --cov-config .coveragerc --cov-report html --cov=miceforest
-          codecov
+          codecov
diff --git a/README.Rmd b/README.Rmd
@@ -303,7 +303,7 @@ data to be imputed in a more similar fashion to the original mice procedure.
 
 ### Saving and Loading Kernels  
 
-Kernels can be saved using the `.save_kernel()` method, and then loaded again using the `utils.load_kernel()` function. Internally, this procedure uses `blosc` and `dill` packages to do the following:  
+Kernels can be saved using the `.save_kernel()` method, and then loaded again using the `utils.load_kernel()` function. Internally, this procedure uses the `blosc2` and `dill` packages to do the following:  
 
 1. Convert working data to parquet bytes (if it is a pandas dataframe)
 2. Serialize the kernel  

diff --git a/README.md b/README.md
@@ -411,7 +411,7 @@ to the original mice procedure.
 
 Kernels can be saved using the `.save_kernel()` method, and then loaded
 again using the `utils.load_kernel()` function. Internally, this
-procedure uses `blosc` and `dill` packages to do the following:
+procedure uses the `blosc2` and `dill` packages to do the following:
 
 1.  Convert working data to parquet bytes (if it is a pandas dataframe)
 2.  Serialize the kernel  

diff --git a/miceforest/ImputationKernel.py b/miceforest/ImputationKernel.py
@@ -25,7 +25,7 @@
 from lightgbm import train, Dataset, cv, log_evaluation, early_stopping, Booster
 from lightgbm.basic import _ConfigAliases
 from io import BytesIO
-import blosc
+import blosc2
 import dill
 from copy import copy
 from typing import Union, List, Dict, Any, Optional
@@ -294,7 +294,7 @@ def __init__(
             for ds in range(datasets)
         }
         self.optimal_parameter_losses: Dict[Any, Any] = {
-            ds: {var: np.Inf for var in self.variable_training_order}
+            ds: {var: np.inf for var in self.variable_training_order}
             for ds in range(datasets)
         }
 
@@ -1449,7 +1449,7 @@ def tune_parameters(
                                 categorical_feature=feature_cat_index,
                             )
                         except:
-                            loss, best_iteration = np.Inf, 0
+                            loss, best_iteration = np.inf, 0
 
                         if best_iteration > 1:
                             break
@@ -1814,11 +1814,11 @@ def save_kernel(
             The file to save to.
 
         clevel: int
-            The compression level, sent to clevel argument in blosc.compress()
+            The compression level, sent to clevel argument in blosc2.compress()
 
         cname: str
             The compression algorithm used.
-            Sent to cname argument in blosc.compress.
+            Sent to cname argument in blosc2.compress.
             If None is specified, the default is lz4hc.
 
         n_threads: int
@@ -1833,7 +1833,27 @@ def save_kernel(
 
         clevel = 9 if clevel is None else clevel
         cname = "lz4hc" if cname is None else cname
-        n_threads = blosc.detect_number_of_cores() if n_threads is None else n_threads
+
+        # make interface compatible
+        codec_mapping = {
+            "blosclz": blosc2.Codec.BLOSCLZ,
+            "lz4":blosc2.Codec.LZ4,
+            "lz4hc":blosc2.Codec.LZ4HC,
+            "zlib":blosc2.Codec.ZLIB,
+            "zstd":blosc2.Codec.ZSTD,
+            "ndlz":blosc2.Codec.NDLZ,
+            "zfp_acc":blosc2.Codec.ZFP_ACC,
+            "zfp_prec":blosc2.Codec.ZFP_PREC,
+            "zfp_rate":blosc2.Codec.ZFP_RATE,
+            "openhtj2k":blosc2.Codec.OPENHTJ2K,
+            "grok":blosc2.Codec.GROK
+        }
+        if cname in codec_mapping.keys():
+            codec=codec_mapping[cname]
+        else:
+            codec=blosc2.Codec.LZ4HC
+
+        n_threads = blosc2.detect_number_of_cores() if n_threads is None else n_threads
 
         if copy_while_saving:
             kernel = copy(self)
@@ -1846,16 +1866,15 @@ def save_kernel(
             kernel.working_data.to_parquet(working_data_bytes)
             kernel.working_data = working_data_bytes
 
-        blosc.set_nthreads(n_threads)
-
+        blosc2.set_nthreads(n_threads)
+        
         with open(filepath, "wb") as f:
             dill.dump(
-                blosc.compress(
+                blosc2.compress(
                     dill.dumps(kernel),
                     clevel=clevel,
-                    typesize=8,
-                    shuffle=blosc.NOSHUFFLE,
-                    cname=cname,
+                    filter=blosc2.Filter.NOFILTER,
+                    codec=codec,
                 ),
                 f,
             )

diff --git a/miceforest/ImputedData.py b/miceforest/ImputedData.py
@@ -354,7 +354,7 @@ def _ampute_original_data(self):
                 dat=self.working_data,
                 row_ind=self.na_where[c],
                 col_ind=c,
-                val=np.array([np.NaN]),
+                val=np.array([np.nan]),
             )
 
     def _get_num_vars(self, subset: Optional[List] = None):

diff --git a/miceforest/utils.py b/miceforest/utils.py
@@ -1,7 +1,7 @@
 from .compat import pd_DataFrame, pd_Series, pd_read_parquet
 import numpy as np
 from numpy.random import RandomState
-import blosc
+import blosc2
 import dill
 from typing import Union, List, Dict, Optional
 
@@ -98,10 +98,10 @@ def load_kernel(filepath: str, n_threads: Optional[int] = None):
     -------
     ImputationKernel
     """
-    n_threads = blosc.detect_number_of_cores() if n_threads is None else n_threads
-    blosc.set_nthreads(n_threads)
+    n_threads = blosc2.detect_number_of_cores() if n_threads is None else n_threads
+    blosc2.set_nthreads(n_threads)
     with open(filepath, "rb") as f:
-        kernel = dill.loads(blosc.decompress(dill.load(f)))
+        kernel = dill.loads(blosc2.decompress(dill.load(f)))
 
     if kernel.original_data_class == "pd_DataFrame":
         kernel.working_data = pd_read_parquet(kernel.working_data)

diff --git a/setup.py b/setup.py
@@ -17,7 +17,7 @@
     install_requires=[
         'lightgbm >= 3.3.1',
         'numpy',
-        "blosc",
+        "blosc2",
         "dill"
         ],
     extras_require={
@@ -30,7 +30,8 @@
         ],
         "Testing": [
             "pandas",
-            "sklearn"
+            "sklearn",
+            "pyarrow"
         ],
     },
     url="https://github.com/AnotherSamWilson/miceforest",
@@ -39,8 +40,6 @@
         'Natural Language :: English',
         'Operating System :: MacOS',
         'Operating System :: Microsoft :: Windows',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',