Fix Crc32 and zlibng support (#17)
* add new crc code

* finalized work
mcroomp authored Sep 14, 2024
1 parent e431f9e commit 8c9e17d
Showing 6 changed files with 182 additions and 25 deletions.
62 changes: 55 additions & 7 deletions src/add_policy_estimator.rs
@@ -23,6 +23,16 @@ pub enum DictionaryAddPolicy {
/// This policy is used by MiniZ in fastest mode. It adds all substrings of a match to the dictionary except
/// literals that are 4 bytes away from the end of the block.
AddFirstExcept4kBoundary,

/// This policy is used by the fast mode in zlibng. It is the same
/// as AddFirst(0), except that it also adds the last character of the
/// last match in the 32k window.
///
/// This is because each time the dictionary is reset, zlibng
/// explicitly adds the last character to the dictionary, which ends
/// up being the last character of the previous match.
AddFirstWith32KBoundary,
}
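In practice the new variant behaves like AddFirst(0) plus one extra dictionary add at the end of the 32k window. A rough sketch of that behavior (the record callback is hypothetical and stands in for the real update_fn; is_at_32k_boundary is the helper added further down in this file):

// Sketch only: what AddFirstWith32KBoundary records for a match of `length`
// bytes starting at `pos`.
fn sketch_add_first_with_32k_boundary(
    input: &[u8],
    pos: u32,
    length: u32,
    record: &mut impl FnMut(&[u8], u32, u32),
) {
    // like AddFirst(0): only the first byte of the match enters the dictionary
    record(input, pos, 1);

    // plus the last byte of a match that ends near the 32k reset point
    if is_at_32k_boundary(length, pos) {
        record(&input[length as usize - 1..], pos + length - 1, 1);
    }
}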

impl DictionaryAddPolicy {
@@ -59,11 +69,29 @@ impl DictionaryAddPolicy {
update_fn(input, pos, 1);
}
}
DictionaryAddPolicy::AddFirstWith32KBoundary => {
update_fn(input, pos, 1);
if is_at_32k_boundary(length, pos) {
update_fn(&input[length as usize - 1..], pos + length - 1, 1);
}
}
}
}
}
}

/// Check whether the match crosses the 32k window boundary, which is
/// where zlibng resets its dictionary.
fn is_at_32k_boundary(length: u32, pos: u32) -> bool {
length > 1
&& (((pos) & 0x7fff) <= (32768 - 0x106))
&& (((pos + length) & 0x7fff) >= (32768 - 0x106))
}
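A few spot checks of the predicate (a sketch, not part of this commit; note that 32768 - 0x106 = 32506):

// Sketch only: when the extra dictionary add is triggered.
#[test]
fn sketch_is_at_32k_boundary_examples() {
    // starts at or before offset 32506 in the window and ends at or after it
    assert!(is_at_32k_boundary(10, 32500));
    // a match well inside the window does not trigger the extra add
    assert!(!is_at_32k_boundary(10, 100));
    // single-byte matches never qualify
    assert!(!is_at_32k_boundary(1, 32506));
}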

/// When adding matches to the dictionary, some of the fast variants
/// only add shorter matches in their entirety (i.e. a substring starting
/// at each position). This function measures this behavior and
/// determines the policy that should be used.
pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAddPolicy {
const WINDOW_MASK: usize = 0x7fff;

@@ -78,9 +106,16 @@ pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAdd

// tracks the maximum length that we've seen that was added to the dictionary if the last match was also added
let mut max_length_last_add = 0;

// similar to the above, but tracks whether we saw a match reference a position
// that the 32k boundary rule would not have added to the dictionary
let mut last_outside_32k_seen = false;

let mut current_offset: u32 = 0;

const LAST_ADDED: u16 = 0x8000;
const LAST_32K: u16 = 0x4000;

const MASK: u16 = 0x0fff;

let mut min_len = u32::MAX;

@@ -113,20 +148,31 @@ pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAdd
let previous_match =
current_window[(current_offset - r.dist()) as usize & WINDOW_MASK];

let match_length = u32::from(previous_match & !LAST_ADDED);
let match_length = u32::from(previous_match & MASK);

max_length = std::cmp::max(max_length, match_length);
if (previous_match & LAST_ADDED) == 0 {
max_length_last_add =
std::cmp::max(max_length_last_add, match_length);
}

if match_length != 0 && (previous_match & LAST_32K) == 0 {
last_outside_32k_seen = true;
}

let last = LAST_ADDED
| if is_at_32k_boundary(r.len(), current_offset) {
LAST_32K
} else {
0
};

current_window[current_offset as usize & WINDOW_MASK] = 0;
current_offset += 1;

for i in 1..r.len() {
current_window[current_offset as usize & WINDOW_MASK] =
r.len() as u16 | if i == r.len() - 1 { LAST_ADDED } else { 0 };
r.len() as u16 | if i == r.len() - 1 { last } else { 0 };
current_offset += 1;
}
}
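For reference, each 16-bit entry in current_window packs the covering match's length into the low 12 bits (MASK) and the two flags into the high bits. A self-contained sketch (the constants are repeated here because the real ones are local to the function; in the code above LAST_32K is only ever set together with LAST_ADDED on the match's last character):

// Sketch only: packing/unpacking a window slot.
fn pack_slot(match_len: u16, is_last_char: bool, at_32k_boundary: bool) -> u16 {
    const LAST_ADDED: u16 = 0x8000;
    const LAST_32K: u16 = 0x4000;
    const MASK: u16 = 0x0fff;
    (match_len & MASK)
        | if is_last_char { LAST_ADDED } else { 0 }
        | if at_32k_boundary { LAST_32K } else { 0 }
}

fn slot_match_len(slot: u16) -> u32 {
    u32::from(slot & 0x0fff)
}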
@@ -138,6 +184,8 @@ pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAdd

if max_length == 0 && block_4k {
DictionaryAddPolicy::AddFirstExcept4kBoundary
} else if !last_outside_32k_seen {
DictionaryAddPolicy::AddFirstWith32KBoundary
} else if max_length_last_add < max_length {
DictionaryAddPolicy::AddFirstAndLast(max_length_last_add as u16)
} else if max_length < 258 {
@@ -180,10 +228,10 @@ fn verify_zlib_level_recognition() {
#[test]
fn verify_zlibng_level_recognition() {
let levels = [
DictionaryAddPolicy::AddFirstAndLast(0), // 1 quick
DictionaryAddPolicy::AddFirstAndLast(4), // 2 fast
DictionaryAddPolicy::AddFirstAndLast(96), // 3 medium
DictionaryAddPolicy::AddFirstAndLast(191), // 4 medium
DictionaryAddPolicy::AddFirstWith32KBoundary, // 1 quick
DictionaryAddPolicy::AddFirstAndLast(4), // 2 fast
DictionaryAddPolicy::AddFirstAndLast(96), // 3 medium
DictionaryAddPolicy::AddFirstAndLast(191), // 4 medium
];

for i in 1..=4 {
@@ -198,7 +246,7 @@ fn verify_zlibng_level_recognition() {

/// libflate always adds all matches to the dictionary
#[test]
fn verify_libflate_level_recognition() {
fn verify_libdeflate_level_recognition() {
for i in 1..=9 {
let v = crate::process::read_file(&format!("compressed_libdeflate_level{}.deflate", i));

1 change: 1 addition & 0 deletions src/complevel_estimator.rs
@@ -262,6 +262,7 @@ impl<'a> CompLevelEstimatorState<'a> {
match self.add_policy {
DictionaryAddPolicy::AddFirst(_)
| DictionaryAddPolicy::AddFirstAndLast(_)
| DictionaryAddPolicy::AddFirstWith32KBoundary
| DictionaryAddPolicy::AddFirstExcept4kBoundary => {
for config in &ZLIB_PREFLATE_PARSER_SETTINGS {
if candidate.max_chain_found() < config.max_chain {
37 changes: 29 additions & 8 deletions src/depth_estimator.rs
@@ -38,14 +38,23 @@ pub struct HashTableDepthEstimatorImpl<H: HashImplementation> {

/// hash function used to calculate the hash
hash: H,

/// the dictionary add policy used to update the hash
add_policy: DictionaryAddPolicy,
}

impl<H: HashImplementation> HashTableDepthEstimatorImpl<H> {
/// depth is the number of matches we need to walk to reach the match_pos. This
/// is only valid if this was part of the same hash chain
#[inline]
fn get_node_depth(&self, node: u16, expected_hash: u16) -> i32 {
debug_assert_eq!(self.chain_depth_hash_verify[node as usize], expected_hash);
debug_assert_eq!(
self.chain_depth_hash_verify[node as usize],
expected_hash,
"hash chain imcomplete {:?} {:?}",
self.hash.algorithm(),
self.add_policy
);
self.chain_depth[node as usize]
}

@@ -80,6 +89,7 @@ impl<H: HashImplementation> HashTableDepthEstimatorImpl<H> {

impl<H: HashImplementation> HashTableDepthEstimator for HashTableDepthEstimatorImpl<H> {
fn update_hash(&mut self, add_policy: DictionaryAddPolicy, input: &PreflateInput, length: u32) {
self.add_policy = add_policy;
add_policy.update_hash(
input.cur_chars(0),
input.pos(),
@@ -113,17 +123,17 @@ impl<H: HashImplementation> HashTableDepthEstimator for HashTableDepthEstimatorI
}
}

/// this algorithm is wierd because it uses the first candidate of the 3 byte match,
/// but then continues with the next 4 bytes. This is used by libflate.
/// Libdeflate is a bit special because it uses the first candidate of the 3 byte match,
/// but then continues with the next 4 bytes.
#[derive(DefaultBoxed)]
struct HashTableDepthEstimatorLibflate {
struct HashTableDepthEstimatorLibdeflate {
length4: HashTableDepthEstimatorImpl<LibdeflateHash4>,
head3: [u32; 65536],
}

const LIB_DEFLATE3_HASH: LibdeflateHash3Secondary = LibdeflateHash3Secondary {};

impl HashTableDepthEstimatorLibflate {
impl HashTableDepthEstimatorLibdeflate {
fn internal_update_hash3(&mut self, chars: &[u8], pos: u32, length: u32) {
debug_assert!(length as usize <= chars.len());
if length as usize + 3 - 1 >= chars.len() {
@@ -139,7 +149,7 @@ impl HashTableDepthEstimatorLibflate {
}
}

impl HashTableDepthEstimator for HashTableDepthEstimatorLibflate {
impl HashTableDepthEstimator for HashTableDepthEstimatorLibdeflate {
fn update_hash(&mut self, add_policy: DictionaryAddPolicy, input: &PreflateInput, length: u32) {
add_policy.update_hash(
input.cur_chars(0),
@@ -184,7 +194,7 @@ pub fn new_depth_estimator(hash_algorithm: HashAlgorithm) -> Box<dyn HashTableDe
hash_shift,
}),
HashAlgorithm::MiniZFast => HashTableDepthEstimatorImpl::box_new(MiniZHash {}),
HashAlgorithm::Libdeflate4 => HashTableDepthEstimatorLibflate::default_boxed(),
HashAlgorithm::Libdeflate4 => HashTableDepthEstimatorLibdeflate::default_boxed(),
HashAlgorithm::Libdeflate4Fast => HashTableDepthEstimatorImpl::box_new(LibdeflateHash4 {}),

HashAlgorithm::ZlibNG => HashTableDepthEstimatorImpl::box_new(ZlibNGHash {}),
@@ -207,7 +217,10 @@ fn verify_max_chain_length() {

#[rustfmt::skip]
let levels = [
("compressed_zlibng_level1.deflate", HashAlgorithm::ZlibNG, DictionaryAddPolicy::AddFirstAndLast(0), 23),
("compressed_zlibng_level1.deflate", HashAlgorithm::Crc32cHash, DictionaryAddPolicy::AddFirstWith32KBoundary, 0),
("compressed_zlibng_level2.deflate", HashAlgorithm::Crc32cHash, DictionaryAddPolicy::AddFirstAndLast(4), 3),
("compressed_zlibng_level3.deflate", HashAlgorithm::Crc32cHash, DictionaryAddPolicy::AddFirstAndLast(96), 5),
("compressed_zlibng_level4.deflate", HashAlgorithm::Crc32cHash, DictionaryAddPolicy::AddFirstAndLast(191), 23),
("compressed_libdeflate_level1.deflate", HashAlgorithm::Libdeflate4Fast, DictionaryAddPolicy::AddAll, 1),
("compressed_libdeflate_level2.deflate", HashAlgorithm::Libdeflate4, DictionaryAddPolicy::AddAll, 6),
("compressed_libdeflate_level3.deflate", HashAlgorithm::Libdeflate4, DictionaryAddPolicy::AddAll, 12),
@@ -235,6 +248,14 @@ fn verify_max_chain_length() {

let parsed = parse_deflate(&compressed_data, 0).unwrap();

let add_policy_estimator = crate::add_policy_estimator::estimate_add_policy(&parsed.blocks);

assert_eq!(
add_policy_estimator, level.2,
"add policy for file {} is incorrect (should be {:?})",
level.0, level.2
);

let mut estimator = new_depth_estimator(level.1);

let mut input = PreflateInput::new(&parsed.plain_text);
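Pieced together, each entry in the table above is now exercised roughly as follows (a sketch for one entry; the trailing depth verification is elided):

// Sketch only: per-entry flow of verify_max_chain_length after this change.
fn sketch_verify_one_entry() {
    let compressed_data = crate::process::read_file("compressed_zlibng_level1.deflate");
    let parsed = parse_deflate(&compressed_data, 0).unwrap();

    // the dictionary add policy is estimated from the token blocks first ...
    let add_policy = crate::add_policy_estimator::estimate_add_policy(&parsed.blocks);
    assert_eq!(add_policy, DictionaryAddPolicy::AddFirstWith32KBoundary);

    // ... and only then is the hash chain depth measured with the matching hash
    let mut estimator = new_depth_estimator(HashAlgorithm::Crc32cHash);
    let mut input = PreflateInput::new(&parsed.plain_text);
    let _ = (&mut estimator, &mut input); // block walk elided
}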
82 changes: 80 additions & 2 deletions src/hash_algorithm.rs
@@ -25,6 +25,7 @@ pub trait HashImplementation: Default + Copy + Clone {
fn get_hash(&self, b: &[u8]) -> u16;
fn num_hash_bytes() -> usize;
fn new_hash_chain(self) -> Self::HashChainType;
fn algorithm(&self) -> HashAlgorithm;
}

#[derive(Default, Debug, Copy, Clone, Eq, PartialEq)]
@@ -50,6 +51,13 @@ impl HashImplementation for ZlibRotatingHash {
fn new_hash_chain(self) -> Self::HashChainType {
HashChainNormalize::<ZlibRotatingHash>::new(self)
}

fn algorithm(&self) -> HashAlgorithm {
HashAlgorithm::Zlib {
hash_mask: self.hash_mask,
hash_shift: self.hash_shift,
}
}
}

#[derive(Default, Copy, Clone)]
@@ -74,6 +82,10 @@ impl HashImplementation for MiniZHash {
fn new_hash_chain(self) -> Self::HashChainType {
crate::hash_chain::HashChainNormalize::<MiniZHash>::new(self)
}

fn algorithm(&self) -> HashAlgorithm {
HashAlgorithm::MiniZFast
}
}

/// Fast version of Libflate hash that doesn't use a secondary 3
@@ -97,6 +109,10 @@ impl HashImplementation for LibdeflateHash4Fast {
fn new_hash_chain(self) -> Self::HashChainType {
HashChainNormalize::<LibdeflateHash4Fast>::new(self)
}

fn algorithm(&self) -> HashAlgorithm {
HashAlgorithm::Libdeflate4Fast
}
}

#[derive(Default, Copy, Clone)]
@@ -118,6 +134,10 @@ impl HashImplementation for LibdeflateHash4 {
fn new_hash_chain(self) -> Self::HashChainType {
crate::hash_chain::HashChainNormalizeLibflate4::new()
}

fn algorithm(&self) -> HashAlgorithm {
HashAlgorithm::Libdeflate4
}
}

/// This is the 3 byte version of the libdeflate hash algorithm, which is used
@@ -142,6 +162,10 @@ impl HashImplementation for LibdeflateHash3Secondary {
fn new_hash_chain(self) -> Self::HashChainType {
unimplemented!();
}

fn algorithm(&self) -> HashAlgorithm {
unimplemented!("shoudln't get called on secondary hash");
}
}

#[derive(Default, Copy, Clone)]
@@ -163,6 +187,10 @@ impl HashImplementation for ZlibNGHash {
fn new_hash_chain(self) -> Self::HashChainType {
crate::hash_chain::HashChainNormalize::<ZlibNGHash>::new(self)
}

fn algorithm(&self) -> HashAlgorithm {
HashAlgorithm::ZlibNG
}
}

#[derive(Default, Copy, Clone)]
@@ -172,7 +200,14 @@ impl HashImplementation for Crc32cHash {
type HashChainType = HashChainNormalize<Crc32cHash>;

fn get_hash(&self, b: &[u8]) -> u16 {
crc32fast::hash(&b[0..4]) as u16
assert!(b.len() >= 4);

let mut crc = CRC32C_TABLE[b[0] as usize];
crc = (crc >> 8) ^ CRC32C_TABLE[((crc ^ u32::from(b[1])) & 0xFF) as usize];
crc = (crc >> 8) ^ CRC32C_TABLE[((crc ^ u32::from(b[2])) & 0xFF) as usize];
crc = (crc >> 8) ^ CRC32C_TABLE[((crc ^ u32::from(b[3])) & 0xFF) as usize];

crc as u16
}

fn num_hash_bytes() -> usize {
@@ -182,13 +217,52 @@ impl HashImplementation for Crc32cHash {
fn new_hash_chain(self) -> Self::HashChainType {
crate::hash_chain::HashChainNormalize::<Crc32cHash>::new(self)
}

fn algorithm(&self) -> HashAlgorithm {
HashAlgorithm::Crc32cHash
}
}

static CRC32C_TABLE: [u32; 256] = [
0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351,
];

/// This vector uses a lookup into a table for random values
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)]
pub struct RandomVectorHash {}

const RANDOM_VECTOR: [u16; 768] = [
static RANDOM_VECTOR: [u16; 768] = [
0x499d, 0x3dc2, 0x2d07, 0x705b, 0x7a76, 0x3469, 0x59db, 0x0c58, 0x2b72, 0x412d, 0x1246, 0x2095,
0x1c1c, 0x4726, 0x5f45, 0x2c4e, 0x7b1b, 0x1e70, 0x2743, 0x554f, 0x1334, 0x5328, 0x78c1, 0x41cc,
0x4b2c, 0x62a5, 0x1d93, 0x4aa4, 0x64c8, 0x65f0, 0x194d, 0x1ac0, 0x3f96, 0x41df, 0x4389, 0x065b,
@@ -271,4 +345,8 @@ impl HashImplementation for RandomVectorHash {
fn new_hash_chain(self) -> Self::HashChainType {
Self::HashChainType::new(self)
}

fn algorithm(&self) -> HashAlgorithm {
HashAlgorithm::RandomVector
}
}
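The CRC32C_TABLE above is the standard byte-at-a-time table for the reflected CRC-32C (Castagnoli) polynomial 0x82F63B78, i.e. a different polynomial from the IEEE CRC-32 that crc32fast computes. A sanity check (a sketch, not part of this commit):

// Sketch only: verify CRC32C_TABLE against a bitwise CRC-32C reference.
#[test]
fn sketch_crc32c_table_matches_bitwise_reference() {
    for i in 0..256u32 {
        let mut crc = i;
        for _ in 0..8 {
            crc = if crc & 1 != 0 { (crc >> 1) ^ 0x82F63B78 } else { crc >> 1 };
        }
        assert_eq!(crc, CRC32C_TABLE[i as usize]);
    }
}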