update file format and general cleanup (#15)
* work

* work

* work

* work

* work

* work

* work

* sample files
mcroomp authored Sep 9, 2024
1 parent 8c5aed6 commit ef28638
Showing 24 changed files with 1,134 additions and 445 deletions.
Binary file added samples/compressed_zlibng_level1.deflate
Binary file added samples/compressed_zlibng_level2.deflate
Binary file added samples/compressed_zlibng_level3.deflate
Binary file added samples/compressed_zlibng_level4.deflate
Binary file added samples/file-sample_1MB.docx
Binary file added samples/sample1.bin.gz
Binary file added samples/samplepptx.pptx
Binary file added samples/starcontrol.samplesave
Binary file added samples/zlibng.deflate
56 changes: 31 additions & 25 deletions src/complevel_estimator.rs
@@ -14,9 +14,11 @@ use crate::{
preflate_constants,
preflate_input::PreflateInput,
preflate_parameter_estimator::PreflateStrategy,
preflate_parse_config::{FAST_PREFLATE_PARSER_SETTINGS, SLOW_PREFLATE_PARSER_SETTINGS},
preflate_parse_config::{
MatchingType, SLOW_PREFLATE_PARSER_SETTINGS, ZLIB_PREFLATE_PARSER_SETTINGS,
},
preflate_token::{BlockType, PreflateToken, PreflateTokenBlock, PreflateTokenReference},
skip_length_estimator::estimate_skip_length,
skip_length_estimator::estimate_add_policy,
token_predictor::TokenPredictorParameters,
};

@@ -31,8 +33,7 @@ pub struct CompLevelInfo {
pub min_len: u32,
pub add_policy: DictionaryAddPolicy,
pub hash_algorithm: HashAlgorithm,
pub good_length: u32,
pub max_lazy: u32,
pub match_type: MatchingType,
pub nice_length: u32,
pub max_chain: u32,
}
@@ -64,8 +65,7 @@ impl CandidateInfo {
max_token_count: 0,
zlib_compatible: false,
max_dist_3_matches: 0,
good_length: 0,
max_lazy: 0,
matching_type: MatchingType::Greedy,
max_chain: 0,
min_len: 0,
};
@@ -157,7 +157,7 @@ impl<'a> CompLevelEstimatorState<'a> {
plain_text: &'a [u8],
blocks: &'a Vec<PreflateTokenBlock>,
) -> Self {
let add_policy = estimate_skip_length(blocks);
let add_policy = estimate_add_policy(blocks);

let hash_bits = mem_level + 7;
let mem_hash_shift = (hash_bits + 2) / 3;
@@ -177,7 +177,7 @@ impl<'a> CompLevelEstimatorState<'a> {
let mut candidates: Vec<Box<CandidateInfo>> = Vec::new();

candidates.push(Box::new(CandidateInfo::new(
add_policy,
DictionaryAddPolicy::AddFirst(0),
HashAlgorithm::MiniZFast,
wbits,
)));
Expand Down Expand Up @@ -214,6 +214,13 @@ impl<'a> CompLevelEstimatorState<'a> {
wbits,
)));

// Crc32c candidate
candidates.push(Box::new(CandidateInfo::new(
add_policy,
HashAlgorithm::Crc32cHash,
wbits,
)));

CompLevelEstimatorState {
input,
candidates,
@@ -227,11 +234,10 @@ impl<'a> CompLevelEstimatorState<'a> {
}
}

fn update_hash(&mut self, length: u32, override_add_policy: bool) {
/// updates all the active candidates with the current hash and advance it
fn update_candidate_hashes(&mut self, length: u32) {
for i in &mut self.candidates {
let mut inputc = self.input.clone();
i.hash_chain
.update_hash_with_depth(length, &mut inputc, override_add_policy);
i.hash_chain.update_hash_with_depth(length, &self.input);
}

self.input.advance(length);
@@ -264,17 +270,19 @@ impl<'a> CompLevelEstimatorState<'a> {
fn check_dump(&mut self) {
for (_i, b) in self.blocks.iter().enumerate() {
if b.block_type == BlockType::Stored {
self.update_hash(b.uncompressed_len, true);
for _i in 0..b.uncompressed_len {
self.update_candidate_hashes(1);
}
continue;
}
for (_j, t) in b.tokens.iter().enumerate() {
match t {
PreflateToken::Literal => {
self.update_hash(1, true);
self.update_candidate_hashes(1);
}
&PreflateToken::Reference(r) => {
self.check_match(r);
self.update_hash(r.len(), false);
self.update_candidate_hashes(r.len());
}
}
}
@@ -292,8 +300,7 @@ impl<'a> CompLevelEstimatorState<'a> {
.min_by(|&a, &b| a.max_chain_found().cmp(&b.max_chain_found()))
.unwrap();

let mut good_length = 32;
let mut max_lazy = 258;
let mut match_type = MatchingType::Greedy;
let mut nice_length = 258;

let add_policy = candidate.add_policy;
@@ -303,21 +310,21 @@
let longest_dist_at_hop_1_plus = candidate.longest_dist_at_hop_1_plus;

match candidate.add_policy {
DictionaryAddPolicy::AddFirst(_) | DictionaryAddPolicy::AddFirstAndLast(_) => {
for config in &FAST_PREFLATE_PARSER_SETTINGS {
DictionaryAddPolicy::AddFirst(_)
| DictionaryAddPolicy::AddFirstAndLast(_)
| DictionaryAddPolicy::AddFirstExcept4kBoundary => {
for config in &ZLIB_PREFLATE_PARSER_SETTINGS {
if candidate.max_chain_found() < config.max_chain {
good_length = config.good_length;
match_type = config.match_type;
nice_length = config.nice_length;
max_lazy = 0;
break;
}
}
}
DictionaryAddPolicy::AddAll => {
for config in &SLOW_PREFLATE_PARSER_SETTINGS {
if candidate.max_chain_found() < config.max_chain {
good_length = config.good_length;
max_lazy = config.max_lazy;
match_type = config.match_type;
nice_length = config.nice_length;
break;
}
@@ -343,8 +350,7 @@
very_far_matches_detected: very_far_matches,
max_dist_3_matches: self.longest_len_3_dist as u16,
add_policy,
good_length,
max_lazy,
match_type,
nice_length,
max_chain,
min_len: self.min_len,
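To make the selection logic at the end of this file's diff easier to follow: the estimator scans a parser-settings table in order and takes the first entry whose `max_chain` still exceeds the deepest chain walk it observed, which now yields a `match_type` plus `nice_length` instead of the old `good_length`/`max_lazy` pair. Below is a standalone sketch of that scan (not part of the diff), using a hypothetical settings table and a locally defined `MatchingType` stand-in; the real `ZLIB_PREFLATE_PARSER_SETTINGS`/`SLOW_PREFLATE_PARSER_SETTINGS` and `MatchingType` live in `preflate_parse_config`.

```rust
// Illustrative only: hypothetical table values and a local stand-in type,
// mirroring the shape of the selection loop in the diff above.
#[derive(Copy, Clone, Debug)]
enum MatchingType {
    Greedy,
    Lazy { good_length: u32, max_lazy: u32 },
}

#[derive(Copy, Clone)]
struct ParserConfig {
    match_type: MatchingType,
    nice_length: u32,
    max_chain: u32,
}

// Hypothetical entries, roughly shaped like a zlib level table.
const EXAMPLE_SETTINGS: [ParserConfig; 3] = [
    ParserConfig { match_type: MatchingType::Lazy { good_length: 8, max_lazy: 16 }, nice_length: 128, max_chain: 128 },
    ParserConfig { match_type: MatchingType::Lazy { good_length: 32, max_lazy: 128 }, nice_length: 258, max_chain: 1024 },
    ParserConfig { match_type: MatchingType::Lazy { good_length: 32, max_lazy: 258 }, nice_length: 258, max_chain: 4096 },
];

/// Defaults stay in place unless some entry's max_chain exceeds the deepest
/// chain walk observed while replaying the token stream.
fn pick_config(max_chain_found: u32) -> (MatchingType, u32) {
    let mut match_type = MatchingType::Greedy;
    let mut nice_length = 258;
    for config in &EXAMPLE_SETTINGS {
        if max_chain_found < config.max_chain {
            match_type = config.match_type;
            nice_length = config.nice_length;
            break;
        }
    }
    (match_type, nice_length)
}

fn main() {
    let (match_type, nice_length) = pick_config(600);
    println!("{:?}, nice_length = {}", match_type, nice_length);
}
```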
20 changes: 20 additions & 0 deletions src/hash_algorithm.rs
@@ -11,6 +11,7 @@ pub enum HashAlgorithm {
Libdeflate4,
ZlibNG,
RandomVector,
Crc32cHash,
}
pub trait HashImplementation: Default + Copy + Clone {
type HashChainType: HashChain;
@@ -135,6 +136,25 @@ impl HashImplementation for ZlibNGHash {
}
}

#[derive(Default, Copy, Clone)]
pub struct Crc32cHash {}

impl HashImplementation for Crc32cHash {
type HashChainType = HashChainNormalize<Crc32cHash>;

fn get_hash(&self, b: &[u8]) -> u16 {
crc32fast::hash(&b[0..4]) as u16
}

fn num_hash_bytes() -> usize {
4
}

fn new_hash_chain(self) -> Self::HashChainType {
crate::hash_chain::HashChainNormalize::<Crc32cHash>::new(self)
}
}

/// This vector uses a lookup into a table for random values
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)]
pub struct RandomVectorHash {}
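For orientation, the new candidate added above hashes the next four input bytes and keeps the low 16 bits as the hash-chain bucket. A minimal standalone sketch of that bucketing (not part of the diff), assuming the `crc32fast` crate already used by `get_hash`:

```rust
// Standalone sketch of the bucketing used by the new Crc32cHash candidate:
// hash the 4 bytes at the current position and truncate to 16 bits.
fn hash_bucket(window: &[u8], pos: usize) -> u16 {
    // Mirrors get_hash(): num_hash_bytes() == 4, so exactly 4 bytes feed the hash.
    crc32fast::hash(&window[pos..pos + 4]) as u16
}

fn main() {
    let data = b"preflate preflate";
    // Identical 4-byte windows always land in the same bucket, which is what
    // the hash chain relies on when walking candidate match positions.
    assert_eq!(hash_bucket(data, 0), hash_bucket(data, 9));
    println!("bucket = {}", hash_bucket(data, 0));
}
```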
35 changes: 23 additions & 12 deletions src/hash_chain.rs
@@ -31,6 +31,10 @@ pub enum DictionaryAddPolicy {
AddFirst(u16),
/// Add only the first and last substring of a match to the dictionary that are larger than the limit
AddFirstAndLast(u16),

/// This policy is used by MiniZ in fastest mode. It adds all substrings of a match to the dictionary except
/// literals that are 4 bytes away from the end of the block.
AddFirstExcept4kBoundary,
}

trait InternalPosition: Copy + Clone + Eq + PartialEq + Default + std::fmt::Debug {
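As a rough illustration of the limit-based policies documented above (this is one reading of the doc comments, not the crate's actual dictionary-update code; `AddFirstExcept4kBoundary` is left out because its boundary rule is only sketched in the comment, and the enum below is a pared-down local copy):

```rust
// Illustrative interpretation only: which positions of a match would be
// inserted into the hash dictionary under each limit-based policy.
#[derive(Copy, Clone)]
enum DictionaryAddPolicy {
    AddAll,
    AddFirst(u16),
    AddFirstAndLast(u16),
}

/// Positions of a match (start `pos`, length `len`) inserted into the table.
fn positions_to_add(policy: DictionaryAddPolicy, pos: u32, len: u32) -> Vec<u32> {
    match policy {
        DictionaryAddPolicy::AddAll => (pos..pos + len).collect(),
        DictionaryAddPolicy::AddFirst(limit) if len > u32::from(limit) => vec![pos],
        DictionaryAddPolicy::AddFirstAndLast(limit) if len > u32::from(limit) => {
            vec![pos, pos + len - 1]
        }
        // Matches at or below the limit fall through to "add every position".
        _ => (pos..pos + len).collect(),
    }
}

fn main() {
    // A 6-byte match at position 100 with a limit of 4:
    assert_eq!(positions_to_add(DictionaryAddPolicy::AddFirst(4), 100, 6), vec![100]);
    assert_eq!(
        positions_to_add(DictionaryAddPolicy::AddFirstAndLast(4), 100, 6),
        vec![100, 105]
    );
}
```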
@@ -159,10 +163,12 @@ impl<H: HashImplementation, I: InternalPosition> HashTable<H, I> {
/// depth is the number of matches we need to walk to reach the match_pos. This
/// is only valid if this was part of the same hash chain
#[inline]
fn get_node_depth(&self, node: I, expected_hash: u16) -> i32 {
assert_eq!(self.chain_depth_hash_verify[node.to_index()], expected_hash);

self.chain_depth[node.to_index()]
fn get_node_depth(&self, node: I, expected_hash: u16) -> Option<i32> {
if self.chain_depth_hash_verify[node.to_index()] == expected_hash {
Some(self.chain_depth[node.to_index()])
} else {
None
}
}

#[inline]
@@ -237,14 +243,19 @@ impl<H: HashImplementation, I: InternalPosition> HashTable<H, I> {

let match_depth = self.get_node_depth(match_pos, h);

// if we have a match, then we can calculate the depth
debug_assert!(
cur_depth >= match_depth,
"current match should be >= to previous c: {} m: {}",
cur_depth,
match_depth
);
(cur_depth - match_depth) as u32
if let (Some(cur_depth), Some(match_depth)) = (cur_depth, match_depth) {
// if we have a match, then we can calculate the depth
debug_assert!(
cur_depth >= match_depth,
"current match should be >= to previous c: {} m: {}",
cur_depth,
match_depth
);

(cur_depth - match_depth) as u32
} else {
BAD_DEPTH
}
}
}

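The net effect of the `get_node_depth` change above is that a node whose stored verification hash does not match is treated as "depth unknown" instead of tripping an assert, and the depth calculation falls back to a sentinel. A simplified standalone sketch of that pattern (the `BAD_DEPTH` value and the struct below are hypothetical stand-ins for the types in hash_chain.rs):

```rust
// Hypothetical sentinel; the real BAD_DEPTH constant is defined in hash_chain.rs.
const BAD_DEPTH: u32 = u32::MAX;

struct DepthTable {
    chain_depth: Vec<i32>,
    chain_depth_hash_verify: Vec<u16>,
}

impl DepthTable {
    /// Depth of a node, but only if it was last inserted under `expected_hash`;
    /// otherwise the node belongs to a different chain and the depth is meaningless.
    fn get_node_depth(&self, node: usize, expected_hash: u16) -> Option<i32> {
        if self.chain_depth_hash_verify[node] == expected_hash {
            Some(self.chain_depth[node])
        } else {
            None
        }
    }

    /// Number of hops between the current head and a previous match position,
    /// or BAD_DEPTH when either node fails the hash verification.
    fn match_depth(&self, head: usize, match_pos: usize, hash: u16) -> u32 {
        match (
            self.get_node_depth(head, hash),
            self.get_node_depth(match_pos, hash),
        ) {
            (Some(cur), Some(prev)) if cur >= prev => (cur - prev) as u32,
            _ => BAD_DEPTH,
        }
    }
}

fn main() {
    let t = DepthTable {
        chain_depth: vec![3, 7],
        chain_depth_hash_verify: vec![0x1234, 0x1234],
    };
    assert_eq!(t.match_depth(1, 0, 0x1234), 4);
    assert_eq!(t.match_depth(1, 0, 0xBEEF), BAD_DEPTH); // hash mismatch
}
```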
