Skip to content

Commit 15c2916

Browse files
authored
Merge pull request #4180 from facebook/split_param
Block splitter control parameter
2 parents 5bae43b + bbaba45 commit 15c2916

File tree

6 files changed

+178
-46
lines changed

6 files changed

+178
-46
lines changed

lib/compress/zstd_compress.c

+59-27
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
323323
assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog);
324324
assert(cctxParams.ldmParams.hashRateLog < 32);
325325
}
326-
cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
326+
cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams);
327327
cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
328328
cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
329329
cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
@@ -391,13 +391,13 @@ ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
391391
*/
392392
cctxParams->compressionLevel = compressionLevel;
393393
cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
394-
cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
394+
cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, &params->cParams);
395395
cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
396396
cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
397397
cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
398398
cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
399399
DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
400-
cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
400+
cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm);
401401
}
402402

403403
size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
@@ -598,11 +598,16 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
598598
bounds.upperBound = 1;
599599
return bounds;
600600

601-
case ZSTD_c_useBlockSplitter:
601+
case ZSTD_c_splitAfterSequences:
602602
bounds.lowerBound = (int)ZSTD_ps_auto;
603603
bounds.upperBound = (int)ZSTD_ps_disable;
604604
return bounds;
605605

606+
case ZSTD_c_blockSplitterLevel:
607+
bounds.lowerBound = 0;
608+
bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX;
609+
return bounds;
610+
606611
case ZSTD_c_useRowMatchFinder:
607612
bounds.lowerBound = (int)ZSTD_ps_auto;
608613
bounds.upperBound = (int)ZSTD_ps_disable;
@@ -669,6 +674,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
669674
case ZSTD_c_minMatch:
670675
case ZSTD_c_targetLength:
671676
case ZSTD_c_strategy:
677+
case ZSTD_c_blockSplitterLevel:
672678
return 1;
673679

674680
case ZSTD_c_format:
@@ -695,7 +701,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
695701
case ZSTD_c_stableOutBuffer:
696702
case ZSTD_c_blockDelimiters:
697703
case ZSTD_c_validateSequences:
698-
case ZSTD_c_useBlockSplitter:
704+
case ZSTD_c_splitAfterSequences:
699705
case ZSTD_c_useRowMatchFinder:
700706
case ZSTD_c_deterministicRefPrefix:
701707
case ZSTD_c_prefetchCDictTables:
@@ -754,7 +760,8 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
754760
case ZSTD_c_stableOutBuffer:
755761
case ZSTD_c_blockDelimiters:
756762
case ZSTD_c_validateSequences:
757-
case ZSTD_c_useBlockSplitter:
763+
case ZSTD_c_splitAfterSequences:
764+
case ZSTD_c_blockSplitterLevel:
758765
case ZSTD_c_useRowMatchFinder:
759766
case ZSTD_c_deterministicRefPrefix:
760767
case ZSTD_c_prefetchCDictTables:
@@ -975,10 +982,15 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
975982
CCtxParams->validateSequences = value;
976983
return (size_t)CCtxParams->validateSequences;
977984

978-
case ZSTD_c_useBlockSplitter:
979-
BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
980-
CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value;
981-
return CCtxParams->useBlockSplitter;
985+
case ZSTD_c_splitAfterSequences:
986+
BOUNDCHECK(ZSTD_c_splitAfterSequences, value);
987+
CCtxParams->postBlockSplitter = (ZSTD_paramSwitch_e)value;
988+
return CCtxParams->postBlockSplitter;
989+
990+
case ZSTD_c_blockSplitterLevel:
991+
BOUNDCHECK(ZSTD_c_blockSplitterLevel, value);
992+
CCtxParams->preBlockSplitter_level = value;
993+
return (size_t)CCtxParams->preBlockSplitter_level;
982994

983995
case ZSTD_c_useRowMatchFinder:
984996
BOUNDCHECK(ZSTD_c_useRowMatchFinder, value);
@@ -1135,8 +1147,11 @@ size_t ZSTD_CCtxParams_getParameter(
11351147
case ZSTD_c_validateSequences :
11361148
*value = (int)CCtxParams->validateSequences;
11371149
break;
1138-
case ZSTD_c_useBlockSplitter :
1139-
*value = (int)CCtxParams->useBlockSplitter;
1150+
case ZSTD_c_splitAfterSequences :
1151+
*value = (int)CCtxParams->postBlockSplitter;
1152+
break;
1153+
case ZSTD_c_blockSplitterLevel :
1154+
*value = CCtxParams->preBlockSplitter_level;
11401155
break;
11411156
case ZSTD_c_useRowMatchFinder :
11421157
*value = (int)CCtxParams->useRowMatchFinder;
@@ -2099,7 +2114,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
20992114
{
21002115
ZSTD_cwksp* const ws = &zc->workspace;
21012116
DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
2102-
(U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
2117+
(U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter);
21032118
assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
21042119

21052120
zc->isFirstBlock = 1;
@@ -2111,7 +2126,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
21112126
params = &zc->appliedParams;
21122127

21132128
assert(params->useRowMatchFinder != ZSTD_ps_auto);
2114-
assert(params->useBlockSplitter != ZSTD_ps_auto);
2129+
assert(params->postBlockSplitter != ZSTD_ps_auto);
21152130
assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
21162131
assert(params->maxBlockSize != 0);
21172132
if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
@@ -2517,10 +2532,10 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
25172532
/* Copy only compression parameters related to tables. */
25182533
params.cParams = srcCCtx->appliedParams.cParams;
25192534
assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto);
2520-
assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto);
2535+
assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto);
25212536
assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto);
25222537
params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder;
2523-
params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
2538+
params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter;
25242539
params.ldmParams = srcCCtx->appliedParams.ldmParams;
25252540
params.fParams = fParams;
25262541
params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
@@ -2728,9 +2743,9 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
27282743
* Returns 1 if true, 0 otherwise. */
27292744
static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams)
27302745
{
2731-
DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter);
2732-
assert(cctxParams->useBlockSplitter != ZSTD_ps_auto);
2733-
return (cctxParams->useBlockSplitter == ZSTD_ps_enable);
2746+
DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter);
2747+
assert(cctxParams->postBlockSplitter != ZSTD_ps_auto);
2748+
return (cctxParams->postBlockSplitter == ZSTD_ps_enable);
27342749
}
27352750

27362751
/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types
@@ -4300,7 +4315,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
43004315
U32 nbSeq;
43014316
size_t cSize;
43024317
DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
4303-
assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable);
4318+
assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable);
43044319

43054320
{ const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
43064321
FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
@@ -4491,7 +4506,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
44914506

44924507
#include "zstd_preSplit.h"
44934508

4494-
static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, ZSTD_strategy strat, S64 savings)
4509+
static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings)
44954510
{
44964511
/* split level based on compression strategy, from `fast` to `btultra2` */
44974512
static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 };
@@ -4505,10 +4520,22 @@ static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t src
45054520
* require verified savings to allow pre-splitting.
45064521
* Note: as a consequence, the first full block is not split.
45074522
*/
4508-
if (savings < 3) return 128 KB;
4509-
/* dynamic splitting has a cpu cost for analysis,
4510-
* select a variant among multiple gradual speed/accuracy tradeoffs */
4511-
return ZSTD_splitBlock(src, blockSizeMax, splitLevels[strat], cctx->tmpWorkspace, cctx->tmpWkspSize);
4523+
if (savings < 3) {
4524+
DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings);
4525+
return 128 KB;
4526+
}
4527+
/* apply @splitLevel, or use default value (which depends on @strat).
4528+
* note that splitting heuristic is still conditioned by @savings >= 3,
4529+
* so the first block will not reach this code path */
4530+
if (splitLevel == 1) return 128 KB;
4531+
if (splitLevel == 0) {
4532+
assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2);
4533+
splitLevel = splitLevels[strat];
4534+
} else {
4535+
assert(2 <= splitLevel && splitLevel <= 6);
4536+
splitLevel -= 2;
4537+
}
4538+
return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize);
45124539
}
45134540

45144541
/*! ZSTD_compress_frameChunk() :
@@ -4539,7 +4566,12 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
45394566

45404567
while (remaining) {
45414568
ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
4542-
size_t const blockSize = ZSTD_optimalBlockSize(cctx, ip, remaining, blockSizeMax, cctx->appliedParams.cParams.strategy, savings);
4569+
size_t const blockSize = ZSTD_optimalBlockSize(cctx,
4570+
ip, remaining,
4571+
blockSizeMax,
4572+
cctx->appliedParams.preBlockSplitter_level,
4573+
cctx->appliedParams.cParams.strategy,
4574+
savings);
45434575
U32 const lastBlock = lastFrameChunk & (blockSize == remaining);
45444576
assert(blockSize <= remaining);
45454577

@@ -6286,7 +6318,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
62866318
dictSize, mode);
62876319
}
62886320

6289-
params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
6321+
params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, &params.cParams);
62906322
params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
62916323
params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
62926324
params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);

lib/compress/zstd_compress_internal.h

+15-5
Original file line numberDiff line numberDiff line change
@@ -343,8 +343,21 @@ struct ZSTD_CCtx_params_s {
343343
ZSTD_sequenceFormat_e blockDelimiters;
344344
int validateSequences;
345345

346-
/* Block splitting */
347-
ZSTD_paramSwitch_e useBlockSplitter;
346+
/* Block splitting
347+
* @postBlockSplitter executes split analysis after sequences are produced,
348+
* it's more accurate but consumes more resources.
349+
* @preBlockSplitter_level splits before knowing sequences,
350+
* it's more approximate but also cheaper.
351+
* Valid @preBlockSplitter_level values range from 0 to 6 (included).
352+
* 0 means auto, 1 means do not split,
353+
* then levels are sorted by increasing cpu budget, from 2 (fastest) to 6 (slowest).
354+
* Highest @preBlockSplitter_level combines well with @postBlockSplitter.
355+
*/
356+
ZSTD_paramSwitch_e postBlockSplitter;
357+
int preBlockSplitter_level;
358+
359+
/* Adjust the max block size */
360+
size_t maxBlockSize;
348361

349362
/* Param for deciding whether to use row-based matchfinder */
350363
ZSTD_paramSwitch_e useRowMatchFinder;
@@ -368,9 +381,6 @@ struct ZSTD_CCtx_params_s {
368381
void* extSeqProdState;
369382
ZSTD_sequenceProducer_F extSeqProdFunc;
370383

371-
/* Adjust the max block size*/
372-
size_t maxBlockSize;
373-
374384
/* Controls repcode search in external sequence parsing */
375385
ZSTD_paramSwitch_e searchForExternalRepcodes;
376386
}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */

lib/compress/zstd_preSplit.c

+1
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize,
229229
int level,
230230
void* workspace, size_t wkspSize)
231231
{
232+
DEBUGLOG(6, "ZSTD_splitBlock (level=%i)", level);
232233
assert(0<=level && level<=4);
233234
if (level == 0)
234235
return ZSTD_splitBlock_fromBorders(blockStart, blockSize, workspace, wkspSize);

lib/zstd.h

+32-6
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,8 @@ typedef enum {
491491
* ZSTD_c_stableOutBuffer
492492
* ZSTD_c_blockDelimiters
493493
* ZSTD_c_validateSequences
494-
* ZSTD_c_useBlockSplitter
494+
* ZSTD_c_blockSplitterLevel
495+
* ZSTD_c_splitAfterSequences
495496
* ZSTD_c_useRowMatchFinder
496497
* ZSTD_c_prefetchCDictTables
497498
* ZSTD_c_enableSeqProducerFallback
@@ -518,7 +519,8 @@ typedef enum {
518519
ZSTD_c_experimentalParam16=1013,
519520
ZSTD_c_experimentalParam17=1014,
520521
ZSTD_c_experimentalParam18=1015,
521-
ZSTD_c_experimentalParam19=1016
522+
ZSTD_c_experimentalParam19=1016,
523+
ZSTD_c_experimentalParam20=1017
522524
} ZSTD_cParameter;
523525

524526
typedef struct {
@@ -2148,16 +2150,40 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
21482150
*/
21492151
#define ZSTD_c_validateSequences ZSTD_c_experimentalParam12
21502152

2151-
/* ZSTD_c_useBlockSplitter
2152-
* Controlled with ZSTD_paramSwitch_e enum.
2153+
/* ZSTD_c_blockSplitterLevel
2154+
* note: this parameter only influences the first splitter stage,
2155+
* which is active before producing the sequences.
2156+
* ZSTD_c_splitAfterSequences controls the next splitter stage,
2157+
* which is active after sequence production.
2158+
* Note that both can be combined.
2159+
* Allowed values are between 0 and ZSTD_BLOCKSPLITTER_LEVEL_MAX included.
2160+
* 0 means "auto", which will select a value depending on current ZSTD_c_strategy.
2161+
* 1 means no splitting.
2162+
* Then, values from 2 to 6 are sorted in increasing cpu load order.
2163+
*
2164+
* Note that currently the first block is never split,
2165+
* to ensure expansion guarantees in presence of incompressible data.
2166+
*/
2167+
#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6
2168+
#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20
2169+
2170+
/* ZSTD_c_splitAfterSequences
2171+
* This is a stronger splitter algorithm,
2172+
* based on actual sequences previously produced by the selected parser.
2173+
* It's also slower, and as a consequence, mostly used for high compression levels.
2174+
* While the post-splitter does overlap with the pre-splitter,
2175+
* both can nonetheless be combined,
2176+
* notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX,
2177+
* resulting in higher compression ratio than just one of them.
2178+
*
21532179
* Default is ZSTD_ps_auto.
21542180
* Set to ZSTD_ps_disable to never use block splitter.
21552181
* Set to ZSTD_ps_enable to always use block splitter.
21562182
*
21572183
* By default, in ZSTD_ps_auto, the library will decide at runtime whether to use
21582184
* block splitting based on the compression parameters.
21592185
*/
2160-
#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13
2186+
#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13
21612187

21622188
/* ZSTD_c_useRowMatchFinder
21632189
* Controlled with ZSTD_paramSwitch_e enum.
@@ -2236,7 +2262,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
22362262
* that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
22372263
* bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
22382264
* compressBound() inaccurate). Only currently meant to be used for testing.
2239-
*
22402265
*/
22412266
#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18
22422267

@@ -2264,6 +2289,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
22642289
*/
22652290
#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19
22662291

2292+
22672293
/*! ZSTD_CCtx_getParameter() :
22682294
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
22692295
* and store it into int* value.

tests/fuzz/zstd_helpers.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,8 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, FUZZ_dataProducer
140140
setRand(cctx, ZSTD_c_forceMaxWindow, 0, 1, producer);
141141
setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, producer);
142142
setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, producer);
143-
setRand(cctx, ZSTD_c_useBlockSplitter, 0, 2, producer);
143+
setRand(cctx, ZSTD_c_blockSplitterLevel, 0, ZSTD_BLOCKSPLITTER_LEVEL_MAX, producer);
144+
setRand(cctx, ZSTD_c_splitAfterSequences, 0, 2, producer);
144145
setRand(cctx, ZSTD_c_deterministicRefPrefix, 0, 1, producer);
145146
setRand(cctx, ZSTD_c_prefetchCDictTables, 0, 2, producer);
146147
setRand(cctx, ZSTD_c_maxBlockSize, ZSTD_BLOCKSIZE_MAX_MIN, ZSTD_BLOCKSIZE_MAX, producer);

0 commit comments

Comments
 (0)