Skip to content

Commit

Permalink
finalized work
Browse files Browse the repository at this point in the history
  • Loading branch information
mcroomp committed Sep 14, 2024
1 parent 929d5cd commit 2dbec98
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 76 deletions.
62 changes: 55 additions & 7 deletions src/add_policy_estimator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@ pub enum DictionaryAddPolicy {
/// This policy is used by MiniZ in fastest mode. It adds all substrings of a match to the dictionary except
/// literals that are 4 bytes away from the end of the block.
AddFirstExcept4kBoundary,

/// This policy is used by fast mode in zlibng; it is the same
/// as AddFirst(0) but it also adds the last character of the
/// last match in the 32k window.
///
/// This is due to the fact that
/// each time the dictionary is reset, it explicitly adds the
/// last character to the dictionary, which ends up being the
/// last character of the previous match.
AddFirstWith32KBoundary,
}

impl DictionaryAddPolicy {
Expand Down Expand Up @@ -59,11 +69,29 @@ impl DictionaryAddPolicy {
update_fn(input, pos, 1);
}
}
DictionaryAddPolicy::AddFirstWith32KBoundary => {
update_fn(input, pos, 1);
if is_at_32k_boundary(length, pos) {
update_fn(&input[length as usize - 1..], pos + length - 1, 1);
}
}
}
}
}
}

/// Checks whether a match of `length` starting at `pos` spans the
/// 32k-window boundary offset (32768 - 0x106), as happens in zlibng.
fn is_at_32k_boundary(length: u32, pos: u32) -> bool {
    // Offset within the 32k window at which zlibng's boundary falls.
    const BOUNDARY: u32 = 32768 - 0x106;

    // Positions reduced modulo the 32k window size.
    let start = pos & 0x7fff;
    let end = (pos + length) & 0x7fff;

    // Single-byte matches never count; otherwise the match must start at
    // or before the boundary and end at or after it.
    length > 1 && start <= BOUNDARY && end >= BOUNDARY
}

/// When adding matches to the dictionary, some of the fast variants
/// only add smaller strings in their entirety (i.e. a substring starting
/// at each position). This function is designed to measure this
/// and determine the policy that should be used.
pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAddPolicy {
const WINDOW_MASK: usize = 0x7fff;

Expand All @@ -78,9 +106,16 @@ pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAdd

// tracks the maximum length that we've seen that was added to the dictionary if the last match was also added
let mut max_length_last_add = 0;

// same as previous, but tracks if we are inside the 32k boundary
let mut last_outside_32k_seen = false;

let mut current_offset: u32 = 0;

const LAST_ADDED: u16 = 0x8000;
const LAST_32K: u16 = 0x4000;

const MASK: u16 = 0x0fff;

let mut min_len = u32::MAX;

Expand Down Expand Up @@ -113,20 +148,31 @@ pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAdd
let previous_match =
current_window[(current_offset - r.dist()) as usize & WINDOW_MASK];

let match_length = u32::from(previous_match & !LAST_ADDED);
let match_length = u32::from(previous_match & MASK);

max_length = std::cmp::max(max_length, match_length);
if (previous_match & LAST_ADDED) == 0 {
max_length_last_add =
std::cmp::max(max_length_last_add, match_length);
}

if match_length != 0 && (previous_match & LAST_32K) == 0 {
last_outside_32k_seen = true;
}

let last = LAST_ADDED
| if is_at_32k_boundary(r.len(), current_offset) {
LAST_32K
} else {
0
};

current_window[current_offset as usize & WINDOW_MASK] = 0;
current_offset += 1;

for i in 1..r.len() {
current_window[current_offset as usize & WINDOW_MASK] =
r.len() as u16 | if i == r.len() - 1 { LAST_ADDED } else { 0 };
r.len() as u16 | if i == r.len() - 1 { last } else { 0 };
current_offset += 1;
}
}
Expand All @@ -138,6 +184,8 @@ pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAdd

if max_length == 0 && block_4k {
DictionaryAddPolicy::AddFirstExcept4kBoundary
} else if !last_outside_32k_seen {
DictionaryAddPolicy::AddFirstWith32KBoundary
} else if max_length_last_add < max_length {
DictionaryAddPolicy::AddFirstAndLast(max_length_last_add as u16)
} else if max_length < 258 {
Expand Down Expand Up @@ -180,10 +228,10 @@ fn verify_zlib_level_recognition() {
#[test]
fn verify_zlibng_level_recognition() {
let levels = [
DictionaryAddPolicy::AddFirstAndLast(0), // 1 quick
DictionaryAddPolicy::AddFirstAndLast(4), // 2 fast
DictionaryAddPolicy::AddFirstAndLast(96), // 3 medium
DictionaryAddPolicy::AddFirstAndLast(191), // 4 medium
DictionaryAddPolicy::AddFirstWith32KBoundary, // 1 quick
DictionaryAddPolicy::AddFirstAndLast(4), // 2 fast
DictionaryAddPolicy::AddFirstAndLast(96), // 3 medium
DictionaryAddPolicy::AddFirstAndLast(191), // 4 medium
];

for i in 1..=4 {
Expand All @@ -198,7 +246,7 @@ fn verify_zlibng_level_recognition() {

/// libdeflate always adds all matches to the dictionary
#[test]
fn verify_libflate_level_recognition() {
fn verify_libdeflate_level_recognition() {
for i in 1..=9 {
let v = crate::process::read_file(&format!("compressed_libdeflate_level{}.deflate", i));

Expand Down
1 change: 1 addition & 0 deletions src/complevel_estimator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ impl<'a> CompLevelEstimatorState<'a> {
match self.add_policy {
DictionaryAddPolicy::AddFirst(_)
| DictionaryAddPolicy::AddFirstAndLast(_)
| DictionaryAddPolicy::AddFirstWith32KBoundary
| DictionaryAddPolicy::AddFirstExcept4kBoundary => {
for config in &ZLIB_PREFLATE_PARSER_SETTINGS {
if candidate.max_chain_found() < config.max_chain {
Expand Down
37 changes: 29 additions & 8 deletions src/depth_estimator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,23 @@ pub struct HashTableDepthEstimatorImpl<H: HashImplementation> {

/// hash function used to calculate the hash
hash: H,

/// the dictionary add policy used to update the hash
add_policy: DictionaryAddPolicy,
}

impl<H: HashImplementation> HashTableDepthEstimatorImpl<H> {
/// depth is the number of matches we need to walk to reach the match_pos. This
/// is only valid if this was part of the same hash chain
#[inline]
fn get_node_depth(&self, node: u16, expected_hash: u16) -> i32 {
    // In debug builds, verify the node was actually recorded under the
    // expected hash chain; report the hash algorithm and add policy to
    // help diagnose a broken chain. (Fixes typo: "imcomplete".)
    debug_assert_eq!(
        self.chain_depth_hash_verify[node as usize],
        expected_hash,
        "hash chain incomplete {:?} {:?}",
        self.hash.algorithm(),
        self.add_policy
    );
    self.chain_depth[node as usize]
}

Expand Down Expand Up @@ -80,6 +89,7 @@ impl<H: HashImplementation> HashTableDepthEstimatorImpl<H> {

impl<H: HashImplementation> HashTableDepthEstimator for HashTableDepthEstimatorImpl<H> {
fn update_hash(&mut self, add_policy: DictionaryAddPolicy, input: &PreflateInput, length: u32) {
self.add_policy = add_policy;
add_policy.update_hash(
input.cur_chars(0),
input.pos(),
Expand Down Expand Up @@ -113,17 +123,17 @@ impl<H: HashImplementation> HashTableDepthEstimator for HashTableDepthEstimatorI
}
}

/// this algorithm is wierd because it uses the first candidate of the 3 byte match,
/// but then continues with the next 4 bytes. This is used by libflate.
/// Libdeflate is a bit special because it uses the first candidate of the 3 byte match,
/// but then continues with the next 4 bytes.
#[derive(DefaultBoxed)]
struct HashTableDepthEstimatorLibflate {
struct HashTableDepthEstimatorLibdeflate {
length4: HashTableDepthEstimatorImpl<LibdeflateHash4>,
head3: [u32; 65536],
}

const LIB_DEFLATE3_HASH: LibdeflateHash3Secondary = LibdeflateHash3Secondary {};

impl HashTableDepthEstimatorLibflate {
impl HashTableDepthEstimatorLibdeflate {
fn internal_update_hash3(&mut self, chars: &[u8], pos: u32, length: u32) {
debug_assert!(length as usize <= chars.len());
if length as usize + 3 - 1 >= chars.len() {
Expand All @@ -139,7 +149,7 @@ impl HashTableDepthEstimatorLibflate {
}
}

impl HashTableDepthEstimator for HashTableDepthEstimatorLibflate {
impl HashTableDepthEstimator for HashTableDepthEstimatorLibdeflate {
fn update_hash(&mut self, add_policy: DictionaryAddPolicy, input: &PreflateInput, length: u32) {
add_policy.update_hash(
input.cur_chars(0),
Expand Down Expand Up @@ -184,7 +194,7 @@ pub fn new_depth_estimator(hash_algorithm: HashAlgorithm) -> Box<dyn HashTableDe
hash_shift,
}),
HashAlgorithm::MiniZFast => HashTableDepthEstimatorImpl::box_new(MiniZHash {}),
HashAlgorithm::Libdeflate4 => HashTableDepthEstimatorLibflate::default_boxed(),
HashAlgorithm::Libdeflate4 => HashTableDepthEstimatorLibdeflate::default_boxed(),
HashAlgorithm::Libdeflate4Fast => HashTableDepthEstimatorImpl::box_new(LibdeflateHash4 {}),

HashAlgorithm::ZlibNG => HashTableDepthEstimatorImpl::box_new(ZlibNGHash {}),
Expand All @@ -207,7 +217,10 @@ fn verify_max_chain_length() {

#[rustfmt::skip]
let levels = [
("compressed_zlibng_level1.deflate", HashAlgorithm::ZlibNG, DictionaryAddPolicy::AddFirstAndLast(0), 23),
("compressed_zlibng_level1.deflate", HashAlgorithm::Crc32cHash, DictionaryAddPolicy::AddFirstWith32KBoundary, 0),
("compressed_zlibng_level2.deflate", HashAlgorithm::Crc32cHash, DictionaryAddPolicy::AddFirstAndLast(4), 3),
("compressed_zlibng_level3.deflate", HashAlgorithm::Crc32cHash, DictionaryAddPolicy::AddFirstAndLast(96), 5),
("compressed_zlibng_level4.deflate", HashAlgorithm::Crc32cHash, DictionaryAddPolicy::AddFirstAndLast(191), 23),
("compressed_libdeflate_level1.deflate", HashAlgorithm::Libdeflate4Fast, DictionaryAddPolicy::AddAll, 1),
("compressed_libdeflate_level2.deflate", HashAlgorithm::Libdeflate4, DictionaryAddPolicy::AddAll, 6),
("compressed_libdeflate_level3.deflate", HashAlgorithm::Libdeflate4, DictionaryAddPolicy::AddAll, 12),
Expand Down Expand Up @@ -235,6 +248,14 @@ fn verify_max_chain_length() {

let parsed = parse_deflate(&compressed_data, 0).unwrap();

let add_policy_estimator = crate::add_policy_estimator::estimate_add_policy(&parsed.blocks);

assert_eq!(
add_policy_estimator, level.2,
"add policy for file {} is incorrect (should be {:?})",
level.0, level.2
);

let mut estimator = new_depth_estimator(level.1);

let mut input = PreflateInput::new(&parsed.plain_text);
Expand Down
Loading

0 comments on commit 2dbec98

Please sign in to comment.