Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New features: chunking and compression #93

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ EXTRA_DIST = COPYRIGHT \
README \
RELEASE_NOTES \
m4/foreach.m4 \
m4/foreach_idx.m4 \
m4/list_len.m4 \
m4/utils.m4

# Below is a trick to build all test executables, without running them
Expand Down
127 changes: 127 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ dnl AH_TEMPLATE([ENABLE_IN_PLACE_SWAP], [Define if to enable in-place byte swap]
dnl AH_TEMPLATE([DISABLE_IN_PLACE_SWAP],[Define if to disable in-place byte swap])
AH_TEMPLATE([ENABLE_SUBFILING], [Define if to enable subfiling feature])
AH_TEMPLATE([ENABLE_NETCDF4], [Define if to enable NetCDF-4 support])
AH_TEMPLATE([ENABLE_CHUNKING], [Define if to enable chunked storage layout and chunking feature])
AH_TEMPLATE([ENABLE_ZLIB], [Define if to enable zlib chunking method])
AH_TEMPLATE([ENABLE_SZ], [Define if to enable sz chunking method])
AH_TEMPLATE([ENABLE_ADIOS], [Define if to enable ADIOS BP read feature])
AH_TEMPLATE([HDF5_VER_GE_1_10_4], [Define if HDF5 version is at least 1.10.4])
AH_TEMPLATE([NETCDF_GE_4_5_0], [Define if NetCDF version is at least 4.5.0])
Expand Down Expand Up @@ -2222,6 +2225,129 @@ fi
AC_SUBST(ENABLE_BURST_BUFFER)
AM_CONDITIONAL(ENABLE_BURST_BUFFER, [test x$enable_bbdriver = xyes])

dnl --enable-chunking: build the chunking driver, which stores variables in a
dnl chunked storage layout. Disabled unless the user passes --enable-chunking.
dnl Results:
dnl   * C macro ENABLE_CHUNKING is defined when enabled (its description comes
dnl     from the matching AH_TEMPLATE entry),
dnl   * output variable ENABLE_CHUNKING is substituted as 1 or 0,
dnl   * Automake conditional ENABLE_CHUNKING is true when enabled.
AC_ARG_ENABLE([chunking],
   [AS_HELP_STRING([--enable-chunking],
                   [Enable variable chunking driver support. @<:@default: disabled@:>@])],
   [enable_chunking=${enableval}], [enable_chunking=no]
)

ENABLE_CHUNKING=0
if test "x$enable_chunking" = "xyes" ; then
   AC_DEFINE(ENABLE_CHUNKING)
   ENABLE_CHUNKING=1
fi
AC_SUBST(ENABLE_CHUNKING)
dnl The value is quoted so that an unexpected user-supplied argument cannot be
dnl split into several words by the shell.
AM_CONDITIONAL(ENABLE_CHUNKING, [test "x$enable_chunking" = "xyes"])

dnl --enable-zlib: build the zlib (deflate) chunking method. Disabled unless
dnl the user passes --enable-zlib.
dnl Results:
dnl   * C macro ENABLE_ZLIB is defined when enabled,
dnl   * output variable ENABLE_ZLIB is substituted as 1 or 0,
dnl   * Automake conditional ENABLE_ZLIB is true when enabled.
AC_ARG_ENABLE([zlib],
   [AS_HELP_STRING([--enable-zlib],
                   [Enable zlib chunking method support. @<:@default: disabled@:>@])],
   [enable_zlib=${enableval}], [enable_zlib=no]
)

ENABLE_ZLIB=0
if test "x$enable_zlib" = "xyes" ; then
   AC_DEFINE(ENABLE_ZLIB)
   ENABLE_ZLIB=1
fi
AC_SUBST(ENABLE_ZLIB)
AM_CONDITIONAL(ENABLE_ZLIB, [test "x$enable_zlib" = "xyes"])

dnl When zlib support is requested, locate the library and its header and
dnl abort configuration if either one is missing.
if test "x$enable_zlib" = "xyes" ; then
   ZLIB_INSTALL=""
   dnl --with-zlib=DIR gives the installation prefix. A bare --with-zlib has
   dnl no path and is rejected; --without-zlib is treated as no prefix given.
   AC_ARG_WITH([zlib],
      [AS_HELP_STRING([--with-zlib=/path/to/implementation],
                      [installation prefix for zlib implementation])],
      [if test "x${withval}" = xyes; then
          AC_MSG_ERROR([--with-zlib is set but the value is NULL])
       elif test "x${withval}" != xno; then
          ZLIB_INSTALL=${withval}
       fi]
   )

   dnl Plain VAR="$VAR ..." is used instead of VAR+=... because the append
   dnl operator is a bash extension and configure must run under any POSIX sh.
   if test "x${ZLIB_INSTALL}" != x ; then
      CPPFLAGS="${CPPFLAGS} -I${ZLIB_INSTALL}/include"
      LDFLAGS="${LDFLAGS} -L${ZLIB_INSTALL}/lib"
      LIBS="${LIBS} -lz"
   fi

   LIBS="${LIBS} -lm -ldl"

   dnl The search macro prints its own checking/result line, so no separate
   dnl checking message is emitted here. On success it prepends -lz to LIBS
   dnl unless deflate already links with the current LIBS.
   have_zlib=no
   AC_SEARCH_LIBS([deflate], [z], [have_zlib=yes], [have_zlib=no])
   if test "x${have_zlib}" = xyes; then
      AC_CHECK_HEADERS([zlib.h], [], [have_zlib=no])
   fi

   if test "x${have_zlib}" = xno; then
      AC_MSG_ERROR([
      ------------------------------------------------------------
      The ZLIB library and header file are required to build
      PnetCDF with ZLIB chunking support. Use option
      --with-zlib=/path/to/implementation
      to specify the location of ZLIB build.
      Stopping ...
      Check 'config.log' for more information.
      ------------------------------------------------------------])
   fi
fi

dnl --enable-sz: build the SZ chunking method. Disabled unless the user
dnl passes --enable-sz.
dnl Results:
dnl   * C macro ENABLE_SZ is defined when enabled,
dnl   * output variable ENABLE_SZ is substituted as 1 or 0,
dnl   * Automake conditional ENABLE_SZ is true when enabled.
AC_ARG_ENABLE([sz],
   [AS_HELP_STRING([--enable-sz],
                   [Enable sz chunking method support. @<:@default: disabled@:>@])],
   [enable_sz=${enableval}], [enable_sz=no]
)

ENABLE_SZ=0
if test "x$enable_sz" = "xyes" ; then
   AC_DEFINE(ENABLE_SZ)
   ENABLE_SZ=1
fi
AC_SUBST(ENABLE_SZ)
AM_CONDITIONAL(ENABLE_SZ, [test "x$enable_sz" = "xyes"])

dnl When SZ support is requested, locate the library and its header and
dnl abort configuration if either one is missing.
if test "x$enable_sz" = "xyes" ; then
   SZ_INSTALL=""
   dnl --with-sz=DIR gives the installation prefix. A bare --with-sz has no
   dnl path and is rejected; --without-sz is treated as no prefix given.
   AC_ARG_WITH([sz],
      [AS_HELP_STRING([--with-sz=/path/to/implementation],
                      [installation prefix for sz implementation])],
      [if test "x${withval}" = xyes; then
          AC_MSG_ERROR([--with-sz is set but the value is NULL])
       elif test "x${withval}" != xno; then
          SZ_INSTALL=${withval}
       fi]
   )

   dnl Plain VAR="$VAR ..." is used instead of VAR+=... because the append
   dnl operator is a bash extension and configure must run under any POSIX sh.
   if test "x${SZ_INSTALL}" != x ; then
      CPPFLAGS="${CPPFLAGS} -I${SZ_INSTALL}/include"
      LDFLAGS="${LDFLAGS} -L${SZ_INSTALL}/lib"
      LIBS="${LIBS} -lSZ -lzstd"
   fi

   LIBS="${LIBS} -lm -ldl"

   dnl Probe the SZ library itself. The previous check looked for deflate in
   dnl libz, which succeeds on any system with zlib and says nothing about SZ.
   dnl NOTE(review): SZ_Init is assumed to be exported by libSZ (SZ 2.x API);
   dnl confirm against the SZ release this driver targets.
   have_sz=no
   AC_SEARCH_LIBS([SZ_Init], [SZ], [have_sz=yes], [have_sz=no])
   if test "x${have_sz}" = xyes; then
      AC_CHECK_HEADERS([sz.h], [], [have_sz=no])
   fi

   if test "x${have_sz}" = xno; then
      AC_MSG_ERROR([
      ------------------------------------------------------------
      The SZ library and header file are required to build
      PnetCDF with SZ chunking support. Use option
      --with-sz=/path/to/implementation
      to specify the location of SZ build.
      Stopping ...
      Check 'config.log' for more information.
      ------------------------------------------------------------])
   fi
fi

ADIOS_INSTALL=""
AC_ARG_WITH(adios,
[AS_HELP_STRING([--with-adios@<:@=DIR@:>@],
Expand Down Expand Up @@ -2532,6 +2658,7 @@ AC_CONFIG_FILES(Makefile \
src/drivers/nc4io/Makefile \
src/drivers/ncadios/Makefile \
src/drivers/ncbbio/Makefile \
src/drivers/ncchunkio/Makefile \
src/drivers/ncfoo/Makefile \
src/binding/Makefile \
src/binding/cxx/Makefile \
Expand Down
119 changes: 119 additions & 0 deletions doc/README.Chunk.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Support variable chunking and compression

PnetCDF contains an experimental variable chunking and compression feature
for classic NetCDF files.

For details about its design and implementation, please refer to:
Hou, Kaiyuan, et al. "Supporting Data Compression in PnetCDF."
2021 IEEE International Conference on Big Data (Big Data). IEEE, 2021.

## Enable variable chunking support

* To build PnetCDF with variable chunking support
+ Add `--enable-chunking` option at the configure command line. For example,
```
./configure --prefix=/PnetCDF/install/path --enable-chunking
```
* To build deflate filter support for chunked variable
+ Add `--enable-zlib` option at the configure command line. Option
`--with-zlib` can also be used to specify the installation path of
zlib if it is not in the standard locations. For example,
```
./configure --prefix=/PnetCDF/install/path --enable-chunking --enable-zlib \
--with-zlib=/zlib/install/path
```
* To build sz filter support for chunked variable
+ Add `--enable-sz` option at the configure command line. Option
`--with-sz` can also be used to specify the installation path of
sz if it is not in the standard locations. For example,
```
./configure --prefix=/PnetCDF/install/path --enable-chunking --enable-sz \
--with-sz=/sz/install/path
```

## Enable variable chunking

To enable chunked storage layout for variables, set the file info "nc_chunking"
to "enable". The chunking feature requires 64-bit NetCDF format (CDF5).
For example,
```
MPI_Info_create(&info);
MPI_Info_set(info, "nc_chunking", "enable");
ncmpi_create(MPI_COMM_WORLD, fname, NC_64BIT_DATA, info, &ncid);
```
Alternatively, the file info can be set through the environment variable
"PNETCDF_HINTS".
```
export PNETCDF_HINTS="nc_chunking=enable"
```
When chunking is enabled, all non-scalar variables will be stored in a chunked
storage layout. Scalar variables are not chunked.

Users can also set the default filter for chunked variables. For example,
```
MPI_Info_set(info, "nc_chunk_default_filter", "zlib");
```
or
```
export PNETCDF_HINTS="nc_chunking=enable;nc_chunk_default_filter=zlib"
```
The available filter options are none (default), zlib (deflate), and sz.

## Define chunk dimension of variables

Applications can use the following APIs to set and get the chunk dimension of
a variable.
```
int ncmpi_var_set_chunk (int ncid, int varid, int *chunk_dim);
int ncmpi_var_get_chunk (int ncid, int varid, int *chunk_dim);
```
For example:
```
int dim[2] = {100, 100};
int chunk_dim[2] = {10, 10};
ncmpi_def_var (ncid, name, type, 2, dim, &varid);
ncmpi_var_set_chunk (ncid, varid, chunk_dim);
```
For record variables, the chunk dimension along the record dimension is always
1.
The default chunk dimension is the dimension of the variable except for the
record dimension. By default, PnetCDF will create one chunk per record or
variable.

## Define filter for chunked variables

Applications can use the following APIs to set and get the filter of
a variable.
```
#define NC_FILTER_NONE 0
#define NC_FILTER_DEFLATE 2
#define NC_FILTER_SZ 3
int ncmpi_var_set_filter (int ncid, int varid, int filter);
int ncmpi_var_get_filter (int ncid, int varid, int *filter);
```
For example:
```
ncmpi_var_set_filter (ncid, varid, NC_FILTER_DEFLATE);
```
Valid filter values are NC_FILTER_NONE (none), NC_FILTER_DEFLATE (zlib), and
NC_FILTER_SZ (sz).


## Known problems

There are some limitations of the experimental variable chunking feature.

* Only one filter can be applied to a chunked variable. Unlike HDF5 which allows
the stacking of multiple filters on chunked datasets, the current
implementation in PnetCDF only allows a single filter to be applied to a
variable.
* No per-variable option for variable chunking. If chunking is enabled, all
non-scalar variables will be chunked even if the chunk dimension is not
defined.
* Independent variable I/O is not supported. Variable read/write (get/put)
must be collective in order to maintain data consistency of filtered chunks.
Non-blocking APIs can be used to mitigate the impact of this limitation.

Copyright (C) 2022, Northwestern University and Argonne National Laboratory

See the COPYRIGHT notice in the top-level directory.

7 changes: 7 additions & 0 deletions m4/foreach_idx.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
divert(`-1')
# foreach_idx(x, idx, (item_1, item_2, ..., item_n), stmt)
# parenthesized list, simple version
#
# Expands stmt once for every item of the parenthesized list. During each
# expansion, macro x holds the current item and macro idx holds its position
# in the list, counted from 0.
#   x, idx : names of the loop macros. Both are saved with pushdef before the
#            loop and restored with popdef afterwards, so earlier definitions
#            of the same names survive the loop.
#   stmt   : text expanded once per item.
define(`foreach_idx', `pushdef(`$1')pushdef(`$2')_foreach_idx($@,0)popdef(`$2')popdef(`$1')')
# _arg1 expands to its first argument. Placed directly in front of a
# parenthesized list it yields the head item of that list.
define(`_arg1', `$1')
# _foreach_idx(x, idx, list, stmt, n) is the recursive worker. It stops when
# the list is (). Otherwise it defines x as the head item and idx as n,
# expands stmt, and recurses on the rest of the list with n incremented.
define(`_foreach_idx', `ifelse(`$3', `()', `',`define(`$1', _arg1$3)define(`$2', `$5')$4`'$0(`$1', `$2', (shift$3), `$4',incr($5))')')
divert`'dnl
6 changes: 6 additions & 0 deletions m4/list_len.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
divert(`-1')
# list_len((item_1, item_2, ..., item_n))
# parenthesized list, simple version
#
# Expands to the number of items in the parenthesized list. The empty
# list () gives 0.
define(`list_len', `_list_len($@, 0)')`'dnl
# _list_len(list, n) is the recursive worker. It expands to n once the list
# is (). Otherwise it drops the head item and recurses with n incremented.
define(`_list_len',`ifelse(`$1', `()', `$2', `$0((shift$1), incr(`$2'))')')`'dnl
divert`'dnl
40 changes: 40 additions & 0 deletions src/dispatchers/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,9 @@ ncmpi_create(MPI_Comm comm,
#ifdef ENABLE_BURST_BUFFER
int enable_bb_driver=0;
#endif
#ifdef ENABLE_CHUNKING
int enable_chk_driver=0;
#endif

MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &nprocs);
Expand Down Expand Up @@ -382,6 +385,18 @@ ncmpi_create(MPI_Comm comm,
enable_bb_driver = 1;
}
#endif
#ifdef ENABLE_CHUNKING
if (combined_info != MPI_INFO_NULL) {
char value[MPI_MAX_INFO_VAL];
int flag;

/* check if nc_chunking is enabled */
MPI_Info_get(combined_info, "nc_chunking", MPI_MAX_INFO_VAL-1,
value, &flag);
if (flag && strcasecmp(value, "enable") == 0)
enable_chk_driver = 1;
}
#endif

/* Use environment variable and cmode to tell the file format
* which is later used to select the right driver.
Expand Down Expand Up @@ -462,6 +477,11 @@ ncmpi_create(MPI_Comm comm,
if (enable_bb_driver)
driver = ncbbio_inq_driver();
else
#endif
#ifdef ENABLE_CHUNKING
if (enable_chk_driver)
driver = ncchkio_inq_driver();
else
#endif
/* default is the driver built on top of MPI-IO */
driver = ncmpio_inq_driver();
Expand Down Expand Up @@ -558,6 +578,9 @@ ncmpi_open(MPI_Comm comm,
#ifdef ENABLE_BURST_BUFFER
int enable_bb_driver=0;
#endif
#ifdef ENABLE_CHUNKING
int enable_chk_driver=0;
#endif

MPI_Comm_rank(comm, &rank);
MPI_Comm_size(comm, &nprocs);
Expand Down Expand Up @@ -688,6 +711,18 @@ ncmpi_open(MPI_Comm comm,
enable_bb_driver = 1;
}
#endif
#ifdef ENABLE_CHUNKING
if (combined_info != MPI_INFO_NULL) {
char value[MPI_MAX_INFO_VAL];
int flag;

/* check if nc_chunking is enabled */
MPI_Info_get(combined_info, "nc_chunking", MPI_MAX_INFO_VAL-1,
value, &flag);
if (flag && strcasecmp(value, "enable") == 0)
enable_chk_driver = 1;
}
#endif

#ifdef ENABLE_NETCDF4
if (format == NC_FORMAT_NETCDF4_CLASSIC || format == NC_FORMAT_NETCDF4) {
Expand Down Expand Up @@ -716,6 +751,11 @@ ncmpi_open(MPI_Comm comm,
if (enable_bb_driver)
driver = ncbbio_inq_driver();
else
#endif
#ifdef ENABLE_CHUNKING
if (enable_chk_driver)
driver = ncchkio_inq_driver();
else
#endif
{
/* ncmpio driver */
Expand Down
Loading