Skip to content

Commit

Permalink
Merge pull request #86 from franzhuettinger/master
Browse files Browse the repository at this point in the history
Updated dependency blosc to blosc2
  • Loading branch information
AnotherSamWilson authored Apr 27, 2024
2 parents 41187f6 + 68be789 commit f6fb6ad
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 27 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
Expand All @@ -27,17 +27,17 @@ jobs:
pip install mypy
pip install codecov
pip install pytest-cov
pip install blosc
pip install blosc2
pip install dill
pip install pandas
pip install seaborn
pip install matplotlib
pip install scipy==1.7.3
pip install scipy
pip install scikit-learn
pip install lightgbm
pip install pyarrow
- name: Test with pytest
run: |
mypy miceforest
pytest --cov-config .coveragerc --cov-report html --cov=miceforest
codecov
codecov
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ data to be imputed in a more similar fashion to the original mice procedure.

### Saving and Loading Kernels

Kernels can be saved using the `.save_kernel()` method, and then loaded again using the `utils.load_kernel()` function. Internally, this procedure uses `blosc` and `dill` packages to do the following:
Kernels can be saved using the `.save_kernel()` method, and then loaded again using the `utils.load_kernel()` function. Internally, this procedure uses the `blosc2` and `dill` packages to do the following:

1. Convert working data to parquet bytes (if it is a pandas dataframe)
2. Serialize the kernel
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ to the original mice procedure.

Kernels can be saved using the `.save_kernel()` method, and then loaded
again using the `utils.load_kernel()` function. Internally, this
procedure uses `blosc` and `dill` packages to do the following:
procedure uses the `blosc2` and `dill` packages to do the following:

1. Convert working data to parquet bytes (if it is a pandas dataframe)
2. Serialize the kernel
Expand Down
43 changes: 31 additions & 12 deletions miceforest/ImputationKernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from lightgbm import train, Dataset, cv, log_evaluation, early_stopping, Booster
from lightgbm.basic import _ConfigAliases
from io import BytesIO
import blosc
import blosc2
import dill
from copy import copy
from typing import Union, List, Dict, Any, Optional
Expand Down Expand Up @@ -294,7 +294,7 @@ def __init__(
for ds in range(datasets)
}
self.optimal_parameter_losses: Dict[Any, Any] = {
ds: {var: np.Inf for var in self.variable_training_order}
ds: {var: np.inf for var in self.variable_training_order}
for ds in range(datasets)
}

Expand Down Expand Up @@ -1449,7 +1449,7 @@ def tune_parameters(
categorical_feature=feature_cat_index,
)
except:
loss, best_iteration = np.Inf, 0
loss, best_iteration = np.inf, 0

if best_iteration > 1:
break
Expand Down Expand Up @@ -1814,11 +1814,11 @@ def save_kernel(
The file to save to.
clevel: int
The compression level, sent to clevel argument in blosc.compress()
The compression level, sent to clevel argument in blosc2.compress()
cname: str
The compression algorithm used.
Sent to cname argument in blosc.compress.
Mapped to the matching blosc2.Codec member, which is sent to the codec argument in blosc2.compress.
If None is specified, or the name is not a recognized codec, the default is lz4hc.
n_threads: int
Expand All @@ -1833,7 +1833,27 @@ def save_kernel(

clevel = 9 if clevel is None else clevel
cname = "lz4hc" if cname is None else cname
n_threads = blosc.detect_number_of_cores() if n_threads is None else n_threads

# make interface compatible
codec_mapping = {
"blosclz": blosc2.Codec.BLOSCLZ,
"lz4":blosc2.Codec.LZ4,
"lz4hc":blosc2.Codec.LZ4HC,
"zlib":blosc2.Codec.ZLIB,
"zstd":blosc2.Codec.ZSTD,
"ndlz":blosc2.Codec.NDLZ,
"zfp_acc":blosc2.Codec.ZFP_ACC,
"zfp_prec":blosc2.Codec.ZFP_PREC,
"zfp_rate":blosc2.Codec.ZFP_RATE,
"openhtj2k":blosc2.Codec.OPENHTJ2K,
"grok":blosc2.Codec.GROK
}
if cname in codec_mapping.keys():
codec=codec_mapping[cname]
else:
codec=blosc2.Codec.LZ4HC

n_threads = blosc2.detect_number_of_cores() if n_threads is None else n_threads

if copy_while_saving:
kernel = copy(self)
Expand All @@ -1846,16 +1866,15 @@ def save_kernel(
kernel.working_data.to_parquet(working_data_bytes)
kernel.working_data = working_data_bytes

blosc.set_nthreads(n_threads)

blosc2.set_nthreads(n_threads)
with open(filepath, "wb") as f:
dill.dump(
blosc.compress(
blosc2.compress(
dill.dumps(kernel),
clevel=clevel,
typesize=8,
shuffle=blosc.NOSHUFFLE,
cname=cname,
filter=blosc2.Filter.NOFILTER,
codec=codec,
),
f,
)
Expand Down
2 changes: 1 addition & 1 deletion miceforest/ImputedData.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ def _ampute_original_data(self):
dat=self.working_data,
row_ind=self.na_where[c],
col_ind=c,
val=np.array([np.NaN]),
val=np.array([np.nan]),
)

def _get_num_vars(self, subset: Optional[List] = None):
Expand Down
8 changes: 4 additions & 4 deletions miceforest/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .compat import pd_DataFrame, pd_Series, pd_read_parquet
import numpy as np
from numpy.random import RandomState
import blosc
import blosc2
import dill
from typing import Union, List, Dict, Optional

Expand Down Expand Up @@ -98,10 +98,10 @@ def load_kernel(filepath: str, n_threads: Optional[int] = None):
-------
ImputationKernel
"""
n_threads = blosc.detect_number_of_cores() if n_threads is None else n_threads
blosc.set_nthreads(n_threads)
n_threads = blosc2.detect_number_of_cores() if n_threads is None else n_threads
blosc2.set_nthreads(n_threads)
with open(filepath, "rb") as f:
kernel = dill.loads(blosc.decompress(dill.load(f)))
kernel = dill.loads(blosc2.decompress(dill.load(f)))

if kernel.original_data_class == "pd_DataFrame":
kernel.working_data = pd_read_parquet(kernel.working_data)
Expand Down
7 changes: 3 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
install_requires=[
'lightgbm >= 3.3.1',
'numpy',
"blosc",
"blosc2",
"dill"
],
extras_require={
Expand All @@ -30,7 +30,8 @@
],
"Testing": [
"pandas",
"sklearn"
"sklearn",
"pyarrow"
],
},
url="https://github.com/AnotherSamWilson/miceforest",
Expand All @@ -39,8 +40,6 @@
'Natural Language :: English',
'Operating System :: MacOS',
'Operating System :: Microsoft :: Windows',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
Expand Down

0 comments on commit f6fb6ad

Please sign in to comment.