diff --git a/src/bit_reader.rs b/src/deflate/bit_reader.rs similarity index 97% rename from src/bit_reader.rs rename to src/deflate/bit_reader.rs index beb6f65..0aa7f6d 100644 --- a/src/bit_reader.rs +++ b/src/deflate/bit_reader.rs @@ -12,6 +12,7 @@ pub trait ReadBits { fn get(&mut self, cbit: u32) -> Result; } +/// BitReader reads a variable number of bits from a byte stream. pub struct BitReader { binary_reader: R, bits_read: u32, diff --git a/src/bit_writer.rs b/src/deflate/bit_writer.rs similarity index 96% rename from src/bit_writer.rs rename to src/deflate/bit_writer.rs index 94b9134..5c6aa64 100644 --- a/src/bit_writer.rs +++ b/src/deflate/bit_writer.rs @@ -4,6 +4,7 @@ * This software incorporates material from third parties. See NOTICE.txt for details. *--------------------------------------------------------------------------------------------*/ +/// Used to write a variable number of bits to a byte buffer. #[derive(Default)] pub struct BitWriter { pub bit_buffer: u32, @@ -61,7 +62,7 @@ fn write_simple() { /// write various bit patterns and see if the result matches the input #[test] fn write_roundtrip() { - use crate::bit_reader::BitReader; + use super::bit_reader::BitReader; let mut b = BitWriter::default(); let mut data_buffer = Vec::new(); diff --git a/src/preflate_constants.rs b/src/deflate/deflate_constants.rs similarity index 100% rename from src/preflate_constants.rs rename to src/deflate/deflate_constants.rs diff --git a/src/deflate_reader.rs b/src/deflate/deflate_reader.rs similarity index 76% rename from src/deflate_reader.rs rename to src/deflate/deflate_reader.rs index e1319a7..d7596b5 100644 --- a/src/deflate_reader.rs +++ b/src/deflate/deflate_reader.rs @@ -5,20 +5,20 @@ *--------------------------------------------------------------------------------------------*/ use crate::{ + deflate::deflate_token::{DeflateHuffmanType, DeflateToken, DeflateTokenReference}, preflate_error::{err_exit_code, AddContext, ExitCode, Result}, - 
preflate_token::{PreflateHuffmanType, PreflateToken, PreflateTokenReference}, }; use std::io::Read; -use crate::{ +use super::{deflate_constants, deflate_token::DeflateTokenBlock}; + +use super::{ bit_reader::BitReader, huffman_encoding::{HuffmanOriginalEncoding, HuffmanReader}, - preflate_constants, - preflate_token::PreflateTokenBlock, }; -/// Used to read binary data in deflate format and convert it to plaintext and a list of tokenized blocks +/// Used to read binary data in DEFLATE format and convert it to plaintext and a list of tokenized blocks /// containing the literals and distance codes that were used to compress the file pub struct DeflateReader { input: BitReader, @@ -64,7 +64,7 @@ impl DeflateReader { } } - pub fn read_block(&mut self, last: &mut bool) -> Result { + pub fn read_block(&mut self, last: &mut bool) -> Result { *last = self.read_bit()?; let mode = self.read_bits(2)?; @@ -89,7 +89,7 @@ impl DeflateReader { self.write_literal(b); } - Ok(PreflateTokenBlock::Stored { + Ok(DeflateTokenBlock::Stored { uncompressed, padding_bits, }) @@ -102,17 +102,17 @@ impl DeflateReader { let decoder = HuffmanReader::create_fixed()?; if let Err(e) = self.decode_block(&decoder, &mut tokens) { if e.exit_code() == ExitCode::ShortRead { - Ok(PreflateTokenBlock::Huffman { + Ok(DeflateTokenBlock::Huffman { tokens, - huffman_type: PreflateHuffmanType::Static { incomplete: true }, + huffman_type: DeflateHuffmanType::Static { incomplete: true }, }) } else { Err(e) } } else { - Ok(PreflateTokenBlock::Huffman { + Ok(DeflateTokenBlock::Huffman { tokens, - huffman_type: PreflateHuffmanType::Static { incomplete: false }, + huffman_type: DeflateHuffmanType::Static { incomplete: false }, }) } } @@ -125,9 +125,9 @@ impl DeflateReader { let mut tokens = Vec::new(); self.decode_block(&decoder, &mut tokens).context()?; - Ok(PreflateTokenBlock::Huffman { + Ok(DeflateTokenBlock::Huffman { tokens, - huffman_type: PreflateHuffmanType::Dynamic { huffman_encoding }, + huffman_type: 
DeflateHuffmanType::Dynamic { huffman_encoding }, }) } @@ -138,7 +138,7 @@ impl DeflateReader { fn decode_block( &mut self, decoder: &HuffmanReader, - tokens: &mut Vec, + tokens: &mut Vec, ) -> Result<()> { let mut earliest_reference = i32::MAX; let mut cur_pos = 0; @@ -147,40 +147,39 @@ impl DeflateReader { let lit_len: u32 = decoder.fetch_next_literal_code(&mut self.input)?.into(); if lit_len < 256 { self.write_literal(lit_len as u8); - tokens.push(PreflateToken::Literal(lit_len as u8)); + tokens.push(DeflateToken::Literal(lit_len as u8)); cur_pos += 1; } else if lit_len == 256 { return Ok(()); } else { - let lcode: u32 = lit_len - preflate_constants::NONLEN_CODE_COUNT as u32; - if lcode >= preflate_constants::LEN_CODE_COUNT as u32 { + let lcode: u32 = lit_len - deflate_constants::NONLEN_CODE_COUNT as u32; + if lcode >= deflate_constants::LEN_CODE_COUNT as u32 { return err_exit_code(ExitCode::InvalidDeflate, "Invalid length code"); } - let len: u32 = preflate_constants::MIN_MATCH - + preflate_constants::LENGTH_BASE_TABLE[lcode as usize] as u32 + let len: u32 = deflate_constants::MIN_MATCH + + deflate_constants::LENGTH_BASE_TABLE[lcode as usize] as u32 + self - .read_bits(preflate_constants::LENGTH_EXTRA_TABLE[lcode as usize].into())?; + .read_bits(deflate_constants::LENGTH_EXTRA_TABLE[lcode as usize].into())?; // length of 258 can be encoded two ways: 284 with 5 one bits (non-standard) or as 285 with 0 extra bits (standard) let irregular258 = - len == 258 && lcode != preflate_constants::LEN_CODE_COUNT as u32 - 1; + len == 258 && lcode != deflate_constants::LEN_CODE_COUNT as u32 - 1; let dcode = decoder.fetch_next_distance_char(&mut self.input)? 
as u32; - if dcode >= preflate_constants::DIST_CODE_COUNT as u32 { + if dcode >= deflate_constants::DIST_CODE_COUNT as u32 { return err_exit_code(ExitCode::InvalidDeflate, "Invalid distance code"); } let dist = 1 - + preflate_constants::DIST_BASE_TABLE[dcode as usize] as u32 - + self - .read_bits(preflate_constants::DIST_EXTRA_TABLE[dcode as usize].into())?; + + deflate_constants::DIST_BASE_TABLE[dcode as usize] as u32 + + self.read_bits(deflate_constants::DIST_EXTRA_TABLE[dcode as usize].into())?; if dist as usize > self.plain_text.len() { return err_exit_code(ExitCode::InvalidDeflate, "Invalid distance"); } self.write_reference(dist, len); - tokens.push(PreflateToken::Reference(PreflateTokenReference::new( + tokens.push(DeflateToken::Reference(DeflateTokenReference::new( len, dist, irregular258, diff --git a/src/preflate_token.rs b/src/deflate/deflate_token.rs similarity index 62% rename from src/preflate_token.rs rename to src/deflate/deflate_token.rs index f6af128..ec7a990 100644 --- a/src/preflate_token.rs +++ b/src/deflate/deflate_token.rs @@ -4,30 +4,35 @@ * This software incorporates material from third parties. See NOTICE.txt for details. *--------------------------------------------------------------------------------------------*/ -use crate::{ - huffman_encoding::HuffmanOriginalEncoding, - preflate_constants::{ - quantize_distance, quantize_length, DIST_CODE_COUNT, LITLENDIST_CODE_COUNT, - NONLEN_CODE_COUNT, - }, +use crate::deflate::huffman_encoding::HuffmanOriginalEncoding; + +use super::deflate_constants::{ + quantize_distance, quantize_length, DIST_CODE_COUNT, LITLENDIST_CODE_COUNT, NONLEN_CODE_COUNT, }; +/// In a DEFLATE stream, tokens are either literals (bytes) or references to previous bytes +/// with a distance and length. 
#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub struct PreflateTokenReference { - len: u8, - dist: u16, - irregular258: bool, +pub enum DeflateToken { + Literal(u8), + Reference(DeflateTokenReference), } +/// In the case of a distance and length, the length is the number of bytes to copy from the +/// previous bytes, and the distance is the number of bytes back to start copying from. +/// +/// the irregular258 field is used to indicate that the 258 length code was used but in a +/// suboptimal way (the RFC allows for two different ways to encode 258) #[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub enum PreflateToken { - Literal(u8), - Reference(PreflateTokenReference), +pub struct DeflateTokenReference { + len: u8, + dist: u16, + irregular258: bool, } -impl PreflateTokenReference { - pub fn new(len: u32, dist: u32, irregular258: bool) -> PreflateTokenReference { - PreflateTokenReference { +impl DeflateTokenReference { + pub fn new(len: u32, dist: u32, irregular258: bool) -> DeflateTokenReference { + DeflateTokenReference { len: (len - 3) as u8, dist: dist as u16, irregular258, @@ -56,7 +61,7 @@ pub const BT_DYNAMICHUFF: u32 = 0; pub const BT_STATICHUFF: u32 = 2; #[derive(Debug, PartialEq)] -pub enum PreflateHuffmanType { +pub enum DeflateHuffmanType { Dynamic { huffman_encoding: HuffmanOriginalEncoding, }, @@ -66,10 +71,10 @@ pub enum PreflateHuffmanType { } #[derive(Debug)] -pub enum PreflateTokenBlock { +pub enum DeflateTokenBlock { Huffman { - tokens: Vec, - huffman_type: PreflateHuffmanType, + tokens: Vec, + huffman_type: DeflateHuffmanType, }, Stored { uncompressed: Vec, @@ -77,6 +82,8 @@ pub enum PreflateTokenBlock { }, } +/// Used to track the frequency of tokens in the DEFLATE stream +/// which are later used to build the huffman encoding. 
#[derive(Debug)] pub struct TokenFrequency { pub literal_codes: [u16; LITLENDIST_CODE_COUNT], @@ -98,12 +105,12 @@ impl Default for TokenFrequency { } impl TokenFrequency { - pub fn commit_token(&mut self, token: &PreflateToken) { + pub fn commit_token(&mut self, token: &DeflateToken) { match token { - PreflateToken::Literal(lit) => { + DeflateToken::Literal(lit) => { self.literal_codes[*lit as usize] += 1; } - PreflateToken::Reference(t) => { + DeflateToken::Reference(t) => { self.literal_codes[NONLEN_CODE_COUNT + quantize_length(t.len())] += 1; self.distance_codes[quantize_distance(t.dist())] += 1; } diff --git a/src/deflate_writer.rs b/src/deflate/deflate_writer.rs similarity index 87% rename from src/deflate_writer.rs rename to src/deflate/deflate_writer.rs index 8015e5d..3712127 100644 --- a/src/deflate_writer.rs +++ b/src/deflate/deflate_writer.rs @@ -6,17 +6,19 @@ use crate::preflate_error::Result; -use crate::preflate_token::PreflateHuffmanType; -use crate::{ - bit_writer::BitWriter, - huffman_encoding::HuffmanWriter, - preflate_constants::{ +use super::deflate_token::DeflateHuffmanType; +use super::{ + deflate_constants::{ quantize_distance, quantize_length, DIST_BASE_TABLE, DIST_EXTRA_TABLE, LENGTH_BASE_TABLE, LENGTH_EXTRA_TABLE, LITLEN_CODE_COUNT, MIN_MATCH, NONLEN_CODE_COUNT, }, - preflate_token::{PreflateToken, PreflateTokenBlock}, + deflate_token::{DeflateToken, DeflateTokenBlock}, }; +use super::bit_writer::BitWriter; +use super::huffman_encoding::HuffmanWriter; + +/// Takes a tokenized block and writes it to the original compressed output. 
pub struct DeflateWriter { /// bit writer to write partial bits to output bitwriter: BitWriter, @@ -39,10 +41,10 @@ impl DeflateWriter { o } - pub fn encode_block(&mut self, block: &PreflateTokenBlock, last: bool) -> Result<()> { + pub fn encode_block(&mut self, block: &DeflateTokenBlock, last: bool) -> Result<()> { self.bitwriter.write(last as u32, 1, &mut self.output); match block { - PreflateTokenBlock::Stored { + DeflateTokenBlock::Stored { uncompressed, padding_bits, } => { @@ -57,16 +59,16 @@ impl DeflateWriter { self.output.extend_from_slice(&uncompressed); } - PreflateTokenBlock::Huffman { + DeflateTokenBlock::Huffman { tokens, huffman_type, } => match huffman_type { - PreflateHuffmanType::Static { .. } => { + DeflateHuffmanType::Static { .. } => { self.bitwriter.write(1, 2, &mut self.output); let huffman_writer = HuffmanWriter::start_fixed_huffman_table(); self.encode_block_with_decoder(tokens, &huffman_writer); } - PreflateHuffmanType::Dynamic { + DeflateHuffmanType::Dynamic { huffman_encoding, .. } => { let huffman_writer = HuffmanWriter::start_dynamic_huffman_table( @@ -90,19 +92,19 @@ impl DeflateWriter { fn encode_block_with_decoder( &mut self, - tokens: &Vec, + tokens: &Vec, huffman_writer: &HuffmanWriter, ) { for token in tokens { match token { - PreflateToken::Literal(lit) => { + DeflateToken::Literal(lit) => { huffman_writer.write_literal( &mut self.bitwriter, &mut self.output, u16::from(*lit), ); } - PreflateToken::Reference(reference) => { + DeflateToken::Reference(reference) => { if reference.get_irregular258() { huffman_writer.write_literal( &mut self.bitwriter, diff --git a/src/huffman_calc.rs b/src/deflate/huffman_calc.rs similarity index 97% rename from src/huffman_calc.rs rename to src/deflate/huffman_calc.rs index 0940257..bbd96ff 100644 --- a/src/huffman_calc.rs +++ b/src/deflate/huffman_calc.rs @@ -11,6 +11,13 @@ pub enum HufftreeBitCalc { Miniz, } +/// Calculates the Huffman bit lengths for a given distribution of symbols. 
+/// +/// There is no one-size-fits-all solution for calculating Huffman bit lengths and +/// each library has its own. +/// +/// If we can get the right algorithm, it will minimize the amount of corrections +/// that we later need to write when encoding the data back to the compressed format. pub fn calc_bit_lengths( bit_calc: HufftreeBitCalc, sym_count: &[u16], diff --git a/src/huffman_encoding.rs b/src/deflate/huffman_encoding.rs similarity index 98% rename from src/huffman_encoding.rs rename to src/deflate/huffman_encoding.rs index a00198a..fee1d60 100644 --- a/src/huffman_encoding.rs +++ b/src/deflate/huffman_encoding.rs @@ -6,11 +6,11 @@ use crate::preflate_error::{err_exit_code, ExitCode, Result}; -use crate::{ +use crate::deflate::{ bit_reader::ReadBits, bit_writer::BitWriter, + deflate_constants::TREE_CODE_ORDER_TABLE, huffman_helper::{calc_huffman_codes, calculate_huffman_code_tree, decode_symbol}, - preflate_constants::TREE_CODE_ORDER_TABLE, }; #[derive(PartialEq, Eq, Clone, Copy, Debug)] @@ -25,6 +25,7 @@ pub enum TreeCodeType { ZeroLong = 18, } +/// Represents the original encoding of the huffman table as it was read from the file #[derive(Debug, Clone, Eq, PartialEq, Default)] pub struct HuffmanOriginalEncoding { /// Huffman literal/distance lengths as RLE encoded in the file @@ -230,12 +231,12 @@ impl HuffmanOriginalEncoding { } } -pub struct HuffmanReader { +pub(super) struct HuffmanReader { lit_huff_code_tree: Vec, dist_huff_code_tree: Vec, } -pub struct HuffmanWriter { +pub(super) struct HuffmanWriter { lit_code_lengths: Vec, lit_huffman_codes: Vec, dist_code_lengths: Vec, @@ -353,7 +354,7 @@ impl HuffmanWriter { #[test] fn roundtrip_huffman_bitreadwrite() { - use crate::bit_reader::BitReader; + use crate::deflate::bit_reader::BitReader; use std::io::Cursor; let code_lengths = [1, 0, 3, 3, 4, 4, 3, 0]; @@ -454,7 +455,7 @@ fn roundtrip_huffman_table() { #[cfg(test)] fn rountrip_test(encoding: HuffmanOriginalEncoding) { - use 
crate::bit_reader::BitReader; + use super::bit_reader::BitReader; use std::io::Cursor; let mut output_buffer = Vec::new(); diff --git a/src/huffman_helper.rs b/src/deflate/huffman_helper.rs similarity index 96% rename from src/huffman_helper.rs rename to src/deflate/huffman_helper.rs index 4f9f741..ccc07ba 100644 --- a/src/huffman_helper.rs +++ b/src/deflate/huffman_helper.rs @@ -4,10 +4,11 @@ * This software incorporates material from third parties. See NOTICE.txt for details. *--------------------------------------------------------------------------------------------*/ -use crate::bit_reader::ReadBits; use crate::preflate_error::{err_exit_code, ExitCode, Result}; use std::vec; +use super::bit_reader::ReadBits; + /// Calculates Huffman code array given an array of Huffman Code Lengths using the RFC 1951 algorithm pub fn calc_huffman_codes(code_lengths: &[u8]) -> Result> { let mut result: Vec = vec![0; code_lengths.len()]; @@ -184,8 +185,8 @@ impl ReadBits for SingleCode { } #[cfg(test)] -fn roundtrip(frequencies: &[u16], huffcalc: crate::huffman_calc::HufftreeBitCalc) { - use crate::huffman_calc::calc_bit_lengths; +fn roundtrip(frequencies: &[u16], huffcalc: super::huffman_calc::HufftreeBitCalc) { + use super::huffman_calc::calc_bit_lengths; let code_lengths = calc_bit_lengths(huffcalc, frequencies, 7); @@ -212,15 +213,15 @@ fn roundtrip(frequencies: &[u16], huffcalc: crate::huffman_calc::HufftreeBitCalc fn roundtrip_huffman_code() { roundtrip( &[1, 0, 2, 3, 5, 8, 13, 0], - crate::huffman_calc::HufftreeBitCalc::Miniz, + super::huffman_calc::HufftreeBitCalc::Miniz, ); roundtrip( &[1, 0, 2, 3, 5, 8, 13, 0], - crate::huffman_calc::HufftreeBitCalc::Zlib, + super::huffman_calc::HufftreeBitCalc::Zlib, ); roundtrip( &[1, 0, 2, 3, 5, 1008, 113, 1, 1, 1, 100, 10000], - crate::huffman_calc::HufftreeBitCalc::Zlib, + super::huffman_calc::HufftreeBitCalc::Zlib, ); } diff --git a/src/deflate/mod.rs b/src/deflate/mod.rs new file mode 100644 index 0000000..6c9b941 --- /dev/null 
+++ b/src/deflate/mod.rs @@ -0,0 +1,14 @@ +//! Module for reading and writing DEFLATE streams. Streams are read in as a vector of blocks containing tokens +//! which can be written back out as an identical DEFLATE stream. + +mod bit_reader; +mod bit_writer; +mod huffman_helper; + +pub mod deflate_constants; + +pub mod deflate_reader; +pub mod deflate_token; +pub mod deflate_writer; +pub mod huffman_calc; +pub mod huffman_encoding; diff --git a/src/add_policy_estimator.rs b/src/estimator/add_policy_estimator.rs similarity index 95% rename from src/add_policy_estimator.rs rename to src/estimator/add_policy_estimator.rs index b95c5c1..31115c8 100644 --- a/src/add_policy_estimator.rs +++ b/src/estimator/add_policy_estimator.rs @@ -8,7 +8,7 @@ /// /// This will be the limit that we use when we decide whether to /// use skip_hash or update_hash. -use crate::preflate_token::{PreflateToken, PreflateTokenBlock}; +use crate::deflate::deflate_token::{DeflateToken, DeflateTokenBlock}; #[derive(Default, Eq, PartialEq, Debug, Clone, Copy)] pub enum DictionaryAddPolicy { @@ -92,7 +92,7 @@ fn is_at_32k_boundary(length: u32, pos: u32) -> bool { /// only add smaller strings in their entirety (ie a substring starting /// at each position). This function is designed to measure this /// and determine the policy that should be used. -pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAddPolicy { +pub(super) fn estimate_add_policy(token_blocks: &[DeflateTokenBlock]) -> DictionaryAddPolicy { const WINDOW_MASK: usize = 0x7fff; // used to see if we have the special case of not adding matches on the edge @@ -123,21 +123,21 @@ pub fn estimate_add_policy(token_blocks: &[PreflateTokenBlock]) -> DictionaryAdd let token_block = &token_blocks[i]; match token_block { - PreflateTokenBlock::Stored { uncompressed, .. } => { + DeflateTokenBlock::Stored { uncompressed, .. 
} => { // we assume for stored blocks everything was added to the dictionary for _i in 0..uncompressed.len() { current_window[current_offset as usize & WINDOW_MASK] = 0; current_offset += 1; } } - PreflateTokenBlock::Huffman { tokens, .. } => { + DeflateTokenBlock::Huffman { tokens, .. } => { for token in tokens.iter() { match token { - PreflateToken::Literal(_) => { + DeflateToken::Literal(_) => { current_window[current_offset as usize & WINDOW_MASK] = 0; current_offset += 1; } - PreflateToken::Reference(r) => { + DeflateToken::Reference(r) => { // track if we saw something on the of the 4k boundary if (current_offset & 4095) >= 4093 { block_4k = false; diff --git a/src/complevel_estimator.rs b/src/estimator/complevel_estimator.rs similarity index 93% rename from src/complevel_estimator.rs rename to src/estimator/complevel_estimator.rs index 9a88a88..792fdc5 100644 --- a/src/complevel_estimator.rs +++ b/src/estimator/complevel_estimator.rs @@ -8,16 +8,19 @@ /// Getting the parameters correct means that the resulting diff between the deflate stream /// and the predicted deflate stream will be as small as possible. 
use crate::{ - add_policy_estimator::DictionaryAddPolicy, - depth_estimator::{new_depth_estimator, HashTableDepthEstimator}, + deflate::deflate_constants, + deflate::deflate_token::{DeflateToken, DeflateTokenBlock, DeflateTokenReference}, hash_algorithm::HashAlgorithm, - preflate_constants, preflate_error::{err_exit_code, ExitCode, Result}, preflate_input::PreflateInput, +}; + +use super::{ + add_policy_estimator::DictionaryAddPolicy, + depth_estimator::{new_depth_estimator, HashTableDepthEstimator}, preflate_parse_config::{ MatchingType, SLOW_PREFLATE_PARSER_SETTINGS, ZLIB_PREFLATE_PARSER_SETTINGS, }, - preflate_token::{PreflateToken, PreflateTokenBlock, PreflateTokenReference}, }; #[derive(Default)] @@ -56,7 +59,7 @@ impl CandidateInfo { } } - fn match_depth(&mut self, token: PreflateTokenReference, input: &PreflateInput) -> bool { + fn match_depth(&mut self, token: DeflateTokenReference, input: &PreflateInput) -> bool { let mdepth = self.depth_estimator.match_depth(token, input); // remove element if the match was impossible due to matching the @@ -113,7 +116,7 @@ struct CompLevelEstimatorState<'a> { add_policy: DictionaryAddPolicy, - blocks: &'a Vec, + blocks: &'a Vec, wsize: u16, reference_count: u32, unfound_references: u32, @@ -130,7 +133,7 @@ impl<'a> CompLevelEstimatorState<'a> { plain_text: &'a [u8], add_policy: DictionaryAddPolicy, min_len: u32, - blocks: &'a Vec, + blocks: &'a Vec, ) -> Self { let hash_bits = mem_level + 7; let mem_hash_shift = (hash_bits + 2) / 3; @@ -199,7 +202,7 @@ impl<'a> CompLevelEstimatorState<'a> { self.input.advance(length); } - fn check_match(&mut self, token: PreflateTokenReference) { + fn check_match(&mut self, token: DeflateTokenReference) { self.reference_count += 1; if self.input.pos() < token.dist() || self.candidates.is_empty() { @@ -222,18 +225,18 @@ impl<'a> CompLevelEstimatorState<'a> { fn check_dump(&mut self) { for (_i, b) in self.blocks.iter().enumerate() { match b { - PreflateTokenBlock::Stored { uncompressed, .. 
} => { + DeflateTokenBlock::Stored { uncompressed, .. } => { for _i in 0..uncompressed.len() { self.update_candidate_hashes(1); } } - PreflateTokenBlock::Huffman { tokens, .. } => { + DeflateTokenBlock::Huffman { tokens, .. } => { for (_j, t) in tokens.iter().enumerate() { match t { - PreflateToken::Literal(_) => { + DeflateToken::Literal(_) => { self.update_candidate_hashes(1); } - &PreflateToken::Reference(r) => { + &DeflateToken::Reference(r) => { self.check_match(r); self.update_candidate_hashes(r.len()); } @@ -295,8 +298,8 @@ impl<'a> CompLevelEstimatorState<'a> { } let very_far_matches = longest_dist_at_hop_0 - > self.window_size() - preflate_constants::MIN_LOOKAHEAD - || longest_dist_at_hop_1_plus >= self.window_size() - preflate_constants::MIN_LOOKAHEAD; + > self.window_size() - deflate_constants::MIN_LOOKAHEAD + || longest_dist_at_hop_1_plus >= self.window_size() - deflate_constants::MIN_LOOKAHEAD; Ok(CompLevelInfo { reference_count: self.reference_count, @@ -328,7 +331,7 @@ pub fn estimate_preflate_comp_level( min_len: u32, plain_text: &[u8], add_policy: DictionaryAddPolicy, - blocks: &Vec, + blocks: &Vec, ) -> Result { let mut state = CompLevelEstimatorState::new(wbits, mem_level, plain_text, add_policy, min_len, blocks); diff --git a/src/depth_estimator.rs b/src/estimator/depth_estimator.rs similarity index 93% rename from src/depth_estimator.rs rename to src/estimator/depth_estimator.rs index 1e2f7c2..2fd4155 100644 --- a/src/depth_estimator.rs +++ b/src/estimator/depth_estimator.rs @@ -1,17 +1,18 @@ use default_boxed::DefaultBoxed; use crate::{ - add_policy_estimator::DictionaryAddPolicy, hash_algorithm::*, preflate_input::PreflateInput, - preflate_token::PreflateTokenReference, + deflate::deflate_token::DeflateTokenReference, hash_algorithm::*, preflate_input::PreflateInput, }; +use super::add_policy_estimator::DictionaryAddPolicy; + pub trait HashTableDepthEstimator { fn update_hash(&mut self, add_policy: DictionaryAddPolicy, input: &PreflateInput, 
length: u32); /// sees how many matches we need to walk to reach match_pos, which we /// do by subtracting the depth of the current node from the depth of the /// match node. - fn match_depth(&self, token: PreflateTokenReference, input: &PreflateInput) -> u32; + fn match_depth(&self, token: DeflateTokenReference, input: &PreflateInput) -> u32; } #[derive(DefaultBoxed)] @@ -101,7 +102,7 @@ impl HashTableDepthEstimator for HashTableDepthEstimatorI /// sees how many matches we need to walk to reach match_pos, which we /// do by subtracting the depth of the current node from the depth of the /// match node. - fn match_depth(&self, token: PreflateTokenReference, input: &PreflateInput) -> u32 { + fn match_depth(&self, token: DeflateTokenReference, input: &PreflateInput) -> u32 { let match_pos = (input.pos() - token.dist()) as u16; let h = self.hash.get_hash(input.cur_chars(0)); @@ -165,7 +166,7 @@ impl HashTableDepthEstimator for HashTableDepthEstimatorLibdeflate { /// sees how many matches we need to walk to reach match_pos, which we /// do by subtracting the depth of the current node from the depth of the /// match node. - fn match_depth(&self, token: PreflateTokenReference, input: &PreflateInput) -> u32 { + fn match_depth(&self, token: DeflateTokenReference, input: &PreflateInput) -> u32 { let length3hash = LIB_DEFLATE3_HASH.get_hash(input.cur_chars(0)); let distance3 = input.pos() - self.head3[usize::from(length3hash)]; @@ -206,7 +207,7 @@ pub fn new_depth_estimator(hash_algorithm: HashAlgorithm) -> Box { + DeflateTokenBlock::Stored { uncompressed, .. } => { estimator.update_hash( DictionaryAddPolicy::AddAll, &input, uncompressed.len() as u32, ); } - PreflateTokenBlock::Huffman { tokens, .. } => { + DeflateTokenBlock::Huffman { tokens, .. 
} => { for token in tokens { let len = match token { - PreflateToken::Literal(_) => 1, - PreflateToken::Reference(r) => { + DeflateToken::Literal(_) => 1, + DeflateToken::Reference(r) => { max_depth = max_depth.max(estimator.match_depth(r, &input)); assert!(max_depth <= 4096, "max depth {} too high", max_depth); r.len() diff --git a/src/estimator/mod.rs b/src/estimator/mod.rs new file mode 100644 index 0000000..9bf293e --- /dev/null +++ b/src/estimator/mod.rs @@ -0,0 +1,11 @@ +//! This module is used to estimate the parameters used to compress this DEFLATE stream. If we get these +//! parameters right, it will minimize or even eliminate the need to encode any corrections when we +//! recompress the stream. + +mod complevel_estimator; +mod depth_estimator; +mod preflate_stream_info; + +pub mod add_policy_estimator; +pub mod preflate_parameter_estimator; +pub mod preflate_parse_config; diff --git a/src/preflate_parameter_estimator.rs b/src/estimator/preflate_parameter_estimator.rs similarity index 75% rename from src/preflate_parameter_estimator.rs rename to src/estimator/preflate_parameter_estimator.rs index 426c916..f403c90 100644 --- a/src/preflate_parameter_estimator.rs +++ b/src/estimator/preflate_parameter_estimator.rs @@ -5,20 +5,22 @@ *--------------------------------------------------------------------------------------------*/ use crate::{ - add_policy_estimator::{estimate_add_policy, DictionaryAddPolicy}, bit_helper::bit_length, - complevel_estimator::estimate_preflate_comp_level, + deflate::{deflate_constants, deflate_token::DeflateTokenBlock}, + estimator::{add_policy_estimator::DictionaryAddPolicy, preflate_parse_config::MatchingType}, hash_algorithm::HashAlgorithm, - preflate_constants::{self}, preflate_error::{ExitCode, Result}, - preflate_parse_config::MatchingType, - preflate_stream_info::{extract_preflate_info, PreflateStreamInfo}, - preflate_token::PreflateTokenBlock, statistical_codec::{PredictionDecoder, PredictionEncoder}, 
token_predictor::TokenPredictorParameters, PreflateError, }; +use super::{ + add_policy_estimator::estimate_add_policy, + complevel_estimator::estimate_preflate_comp_level, + preflate_stream_info::{extract_preflate_info, PreflateStreamInfo}, +}; + #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum PreflateStrategy { Default, @@ -195,6 +197,78 @@ impl PreflateParameters { } } } + + /// From the plain text and the preflate blocks, estimate the preflate parameters + pub fn estimate_preflate_parameters( + plain_text: &[u8], + blocks: &Vec, + ) -> Result { + let info = extract_preflate_info(blocks); + + let preflate_strategy = estimate_preflate_strategy(&info); + let huff_strategy = estimate_preflate_huff_strategy(&info); + + if preflate_strategy == PreflateStrategy::Store + || preflate_strategy == PreflateStrategy::HuffOnly + { + // No dictionary used + return Ok(PreflateParameters { + predictor: TokenPredictorParameters { + window_bits: 0, + very_far_matches_detected: false, + matches_to_start_detected: false, + strategy: preflate_strategy, + nice_length: 0, + add_policy: DictionaryAddPolicy::AddAll, + max_token_count: 16386, + zlib_compatible: true, + max_dist_3_matches: 0, + matching_type: MatchingType::Greedy, + max_chain: 0, + min_len: 0, + hash_algorithm: HashAlgorithm::None, + }, + huff_strategy, + }); + } + + let window_bits = estimate_preflate_window_bits(info.max_dist); + let mem_level = estimate_preflate_mem_level(info.max_tokens_per_block); + let add_policy = estimate_add_policy(blocks); + + //let hash_shift = 5; + //let hash_mask = 32767; + + let max_token_count = (1 << (6 + mem_level)) - 1; + + let cl = estimate_preflate_comp_level( + window_bits, + mem_level, + info.min_len, + plain_text, + add_policy, + blocks, + )?; + + Ok(PreflateParameters { + predictor: TokenPredictorParameters { + window_bits, + very_far_matches_detected: cl.very_far_matches_detected, + matches_to_start_detected: cl.matches_to_start_detected, + strategy: 
estimate_preflate_strategy(&info), + nice_length: cl.nice_length, + add_policy: cl.add_policy, + max_token_count, + zlib_compatible: cl.zlib_compatible, + max_dist_3_matches: cl.max_dist_3_matches, + matching_type: cl.match_type, + max_chain: cl.max_chain, + min_len: cl.min_len, + hash_algorithm: cl.hash_algorithm, + }, + huff_strategy: estimate_preflate_huff_strategy(&info), + }) + } } fn estimate_preflate_mem_level(max_block_size_: u32) -> u32 { @@ -208,14 +282,14 @@ fn estimate_preflate_mem_level(max_block_size_: u32) -> u32 { mbits - 6 } -pub fn estimate_preflate_window_bits(max_dist_: u32) -> u32 { +fn estimate_preflate_window_bits(max_dist_: u32) -> u32 { let mut max_dist = max_dist_; - max_dist += preflate_constants::MIN_LOOKAHEAD; + max_dist += deflate_constants::MIN_LOOKAHEAD; let wbits = bit_length(max_dist - 1); std::cmp::min(std::cmp::max(wbits, 9), 15) } -pub fn estimate_preflate_strategy(info: &PreflateStreamInfo) -> PreflateStrategy { +fn estimate_preflate_strategy(info: &PreflateStreamInfo) -> PreflateStrategy { if info.count_stored_blocks == info.count_blocks { return PreflateStrategy::Store; } @@ -228,7 +302,7 @@ pub fn estimate_preflate_strategy(info: &PreflateStreamInfo) -> PreflateStrategy PreflateStrategy::Default } -pub fn estimate_preflate_huff_strategy(info: &PreflateStreamInfo) -> PreflateHuffStrategy { +fn estimate_preflate_huff_strategy(info: &PreflateStreamInfo) -> PreflateHuffStrategy { if info.count_static_huff_tree_blocks == info.count_blocks { return PreflateHuffStrategy::Static; } @@ -238,81 +312,12 @@ pub fn estimate_preflate_huff_strategy(info: &PreflateStreamInfo) -> PreflateHuf PreflateHuffStrategy::Mixed } -pub fn estimate_preflate_parameters( - unpacked_output: &[u8], - blocks: &Vec, -) -> Result { - let info = extract_preflate_info(blocks); - - let preflate_strategy = estimate_preflate_strategy(&info); - let huff_strategy = estimate_preflate_huff_strategy(&info); - - if preflate_strategy == PreflateStrategy::Store - || 
preflate_strategy == PreflateStrategy::HuffOnly - { - // No dictionary used - return Ok(PreflateParameters { - predictor: TokenPredictorParameters { - window_bits: 0, - very_far_matches_detected: false, - matches_to_start_detected: false, - strategy: preflate_strategy, - nice_length: 0, - add_policy: DictionaryAddPolicy::AddAll, - max_token_count: 16386, - zlib_compatible: true, - max_dist_3_matches: 0, - matching_type: MatchingType::Greedy, - max_chain: 0, - min_len: 0, - hash_algorithm: HashAlgorithm::None, - }, - huff_strategy, - }); - } - - let window_bits = estimate_preflate_window_bits(info.max_dist); - let mem_level = estimate_preflate_mem_level(info.max_tokens_per_block); - let add_policy = estimate_add_policy(blocks); - - //let hash_shift = 5; - //let hash_mask = 32767; - - let max_token_count = (1 << (6 + mem_level)) - 1; - - let cl = estimate_preflate_comp_level( - window_bits, - mem_level, - info.min_len, - unpacked_output, - add_policy, - blocks, - )?; - - Ok(PreflateParameters { - predictor: TokenPredictorParameters { - window_bits, - very_far_matches_detected: cl.very_far_matches_detected, - matches_to_start_detected: cl.matches_to_start_detected, - strategy: estimate_preflate_strategy(&info), - nice_length: cl.nice_length, - add_policy: cl.add_policy, - max_token_count, - zlib_compatible: cl.zlib_compatible, - max_dist_3_matches: cl.max_dist_3_matches, - matching_type: cl.match_type, - max_chain: cl.max_chain, - min_len: cl.min_len, - hash_algorithm: cl.hash_algorithm, - }, - huff_strategy: estimate_preflate_huff_strategy(&info), - }) -} - #[test] fn verify_zlib_recognition() { use crate::{ - preflate_parse_config::{SLOW_PREFLATE_PARSER_SETTINGS, ZLIB_PREFLATE_PARSER_SETTINGS}, + estimator::preflate_parse_config::{ + SLOW_PREFLATE_PARSER_SETTINGS, ZLIB_PREFLATE_PARSER_SETTINGS, + }, process::{parse_deflate, read_file}, }; @@ -320,7 +325,11 @@ fn verify_zlib_recognition() { let v = read_file(&format!("compressed_zlib_level{}.deflate", i)); let 
contents = parse_deflate(&v, 1).unwrap(); - let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks).unwrap(); + let params = PreflateParameters::estimate_preflate_parameters( + &contents.plain_text, + &contents.blocks, + ) + .unwrap(); assert_eq!(params.predictor.zlib_compatible, true); if i == 0 { @@ -361,7 +370,11 @@ fn verify_miniz_recognition() { let v = read_file(&format!("compressed_flate2_level{}.deflate", i)); let contents = parse_deflate(&v, 1).unwrap(); - let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks).unwrap(); + let params = PreflateParameters::estimate_preflate_parameters( + &contents.plain_text, + &contents.blocks, + ) + .unwrap(); if i == 0 { assert_eq!(params.predictor.strategy, PreflateStrategy::Store); @@ -381,7 +394,11 @@ fn verify_zlibng_recognition() { let v = read_file(&format!("compressed_zlibng_level{}.deflate", i)); let contents = parse_deflate(&v, 1).unwrap(); - let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks).unwrap(); + let params = PreflateParameters::estimate_preflate_parameters( + &contents.plain_text, + &contents.blocks, + ) + .unwrap(); if i == 0 { assert_eq!(params.predictor.strategy, PreflateStrategy::Store); diff --git a/src/preflate_parse_config.rs b/src/estimator/preflate_parse_config.rs similarity index 98% rename from src/preflate_parse_config.rs rename to src/estimator/preflate_parse_config.rs index a12c695..680e2c5 100644 --- a/src/preflate_parse_config.rs +++ b/src/estimator/preflate_parse_config.rs @@ -4,7 +4,7 @@ * This software incorporates material from third parties. See NOTICE.txt for details. 
*--------------------------------------------------------------------------------------------*/ -use crate::add_policy_estimator::DictionaryAddPolicy; +use crate::estimator::add_policy_estimator::DictionaryAddPolicy; #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] pub enum MatchingType { diff --git a/src/preflate_stream_info.rs b/src/estimator/preflate_stream_info.rs similarity index 82% rename from src/preflate_stream_info.rs rename to src/estimator/preflate_stream_info.rs index 51d48e1..cf85a3c 100644 --- a/src/preflate_stream_info.rs +++ b/src/estimator/preflate_stream_info.rs @@ -4,7 +4,7 @@ * This software incorporates material from third parties. See NOTICE.txt for details. *--------------------------------------------------------------------------------------------*/ -use crate::preflate_token::{PreflateHuffmanType, PreflateToken, PreflateTokenBlock}; +use crate::deflate::deflate_token::{DeflateHuffmanType, DeflateToken, DeflateTokenBlock}; pub struct PreflateStreamInfo { pub token_count: u32, @@ -20,17 +20,17 @@ pub struct PreflateStreamInfo { pub count_static_huff_tree_blocks: u32, } -fn process_tokens(tokens: &[PreflateToken], result: &mut PreflateStreamInfo) { +fn process_tokens(tokens: &[DeflateToken], result: &mut PreflateStreamInfo) { result.token_count += tokens.len() as u32; result.max_tokens_per_block = std::cmp::max(result.max_tokens_per_block, tokens.len() as u32); let mut block_max_dist = 0; let mut block_min_len = u32::MAX; for j in 0..tokens.len() { match &tokens[j] { - PreflateToken::Literal(_) => { + DeflateToken::Literal(_) => { result.literal_count += 1; } - PreflateToken::Reference(t) => { + DeflateToken::Reference(t) => { result.reference_count += 1; block_max_dist = std::cmp::max(block_max_dist, t.dist()); block_min_len = std::cmp::min(block_min_len, t.len()); @@ -47,7 +47,7 @@ fn process_tokens(tokens: &[PreflateToken], result: &mut PreflateStreamInfo) { } } -pub fn extract_preflate_info(blocks: &[PreflateTokenBlock]) -> 
PreflateStreamInfo { +pub(crate) fn extract_preflate_info(blocks: &[DeflateTokenBlock]) -> PreflateStreamInfo { let mut result: PreflateStreamInfo = PreflateStreamInfo { count_blocks: blocks.len() as u32, count_stored_blocks: 0, @@ -64,14 +64,14 @@ pub fn extract_preflate_info(blocks: &[PreflateTokenBlock]) -> PreflateStreamInf for i in 0..blocks.len() { match &blocks[i] { - PreflateTokenBlock::Stored { .. } => { + DeflateTokenBlock::Stored { .. } => { result.count_stored_blocks += 1; } - PreflateTokenBlock::Huffman { + DeflateTokenBlock::Huffman { tokens, huffman_type, } => { - if let PreflateHuffmanType::Static { .. } = huffman_type { + if let DeflateHuffmanType::Static { .. } = huffman_type { result.count_static_huff_tree_blocks += 1; } process_tokens(&tokens, &mut result); diff --git a/src/hash_chain_holder.rs b/src/hash_chain_holder.rs index 6ee56e7..d64e6b2 100644 --- a/src/hash_chain_holder.rs +++ b/src/hash_chain_holder.rs @@ -5,23 +5,23 @@ *--------------------------------------------------------------------------------------------*/ use crate::bit_helper::DebugHash; +use crate::deflate::deflate_constants::{MAX_MATCH, MIN_LOOKAHEAD, MIN_MATCH}; +use crate::deflate::deflate_token::DeflateTokenReference; +use crate::estimator::preflate_parameter_estimator::PreflateStrategy; use crate::hash_algorithm::{ Crc32cHash, HashAlgorithm, HashImplementation, LibdeflateHash4, LibdeflateHash4Fast, MiniZHash, RandomVectorHash, ZlibNGHash, ZlibRotatingHash, }; use crate::hash_chain::{HashChain, MAX_UPDATE_HASH_BATCH}; -use crate::preflate_constants::{MAX_MATCH, MIN_LOOKAHEAD, MIN_MATCH}; use crate::preflate_error::{err_exit_code, ExitCode, Result}; use crate::preflate_input::PreflateInput; -use crate::preflate_parameter_estimator::PreflateStrategy; -use crate::preflate_token::PreflateTokenReference; use crate::token_predictor::TokenPredictorParameters; use std::cmp; #[derive(Debug, Copy, Clone)] pub enum MatchResult { - Success(PreflateTokenReference), + 
Success(DeflateTokenReference), DistanceLargerThanHop0(u32, u32), NoInput, NoMoreMatchesFound, @@ -85,7 +85,7 @@ pub trait HashChainHolder { /// or none if it wasn't found fn calculate_hops( &self, - target_reference: &PreflateTokenReference, + target_reference: &DeflateTokenReference, input: &PreflateInput, ) -> Result; @@ -95,7 +95,7 @@ pub trait HashChainHolder { /// debugging function to verify that the hash chain is correct #[allow(dead_code)] - fn verify_hash(&self, _dist: Option); + fn verify_hash(&self, _dist: Option); fn checksum(&self, checksum: &mut DebugHash); } @@ -125,7 +125,7 @@ impl HashChainHolder for () { fn calculate_hops( &self, - _target_reference: &PreflateTokenReference, + _target_reference: &DeflateTokenReference, _input: &PreflateInput, ) -> Result { unimplemented!() @@ -135,7 +135,7 @@ impl HashChainHolder for () { unimplemented!() } - fn verify_hash(&self, _dist: Option) {} + fn verify_hash(&self, _dist: Option) {} fn checksum(&self, _checksum: &mut DebugHash) {} } @@ -172,7 +172,7 @@ impl HashChainHolder for HashChainHolderImpl { /// or none if it wasn't found fn calculate_hops( &self, - target_reference: &PreflateTokenReference, + target_reference: &DeflateTokenReference, input: &PreflateInput, ) -> Result { let max_len = std::cmp::min(input.remaining(), MAX_MATCH); @@ -255,7 +255,7 @@ impl HashChainHolder for HashChainHolderImpl { /// debugging function to verify that the hash chain is correct #[allow(dead_code)] - fn verify_hash(&self, _dist: Option) { + fn verify_hash(&self, _dist: Option) { //self.hash.verify_hash(dist, &self.input); } @@ -327,7 +327,7 @@ impl HashChainHolderImpl { let input_chars = input.cur_chars(OFFSET as i32); let mut best_len = prev_len; - let mut best_match: Option = None; + let mut best_match: Option = None; let mut first = true; for dist in self.hash.iterate(input, OFFSET) { @@ -346,7 +346,7 @@ impl HashChainHolderImpl { let match_length = prefix_compare(match_start, input_chars, best_len, max_len); if 
match_length > best_len { - let r = PreflateTokenReference::new(match_length, dist, false); + let r = DeflateTokenReference::new(match_length, dist, false); if match_length >= nice_length && (match_length > 3 || dist <= max_dist_3_matches) { return MatchResult::Success(r); diff --git a/src/idat_parse.rs b/src/idat_parse.rs index 854a24b..0ce4f22 100644 --- a/src/idat_parse.rs +++ b/src/idat_parse.rs @@ -7,6 +7,9 @@ use crate::{ preflate_error::{err_exit_code, ExitCode}, }; +/// The contents of a PNG IDat stream. These are treated specially since they +/// contain a Zlib stream that is split into multiple chunks and would be +/// treated as corrupt if we just tried to parse it without removing the boundary headers. #[derive(Debug, PartialEq)] pub struct IdatContents { /// the sizes of the IDAT chunks diff --git a/src/lib.rs b/src/lib.rs index 27fc351..7c5b9fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,30 +4,17 @@ * This software incorporates material from third parties. See NOTICE.txt for details. 
*--------------------------------------------------------------------------------------------*/ -mod add_policy_estimator; mod bit_helper; -mod bit_reader; -mod bit_writer; mod cabac_codec; -mod complevel_estimator; -mod deflate_reader; -mod deflate_writer; -mod depth_estimator; +mod deflate; +mod estimator; mod hash_algorithm; mod hash_chain; mod hash_chain_holder; -mod huffman_calc; -mod huffman_encoding; -mod huffman_helper; mod idat_parse; -mod preflate_constants; mod preflate_container; mod preflate_error; mod preflate_input; -mod preflate_parameter_estimator; -mod preflate_parse_config; -mod preflate_stream_info; -mod preflate_token; mod process; mod scan_deflate; mod statistical_codec; diff --git a/src/preflate_container.rs b/src/preflate_container.rs index f843175..70410f8 100644 --- a/src/preflate_container.rs +++ b/src/preflate_container.rs @@ -4,10 +4,10 @@ use std::io::{Cursor, Read, Write}; use crate::{ cabac_codec::{PredictionDecoderCabac, PredictionEncoderCabac}, + estimator::preflate_parameter_estimator::PreflateParameters, idat_parse::{recreate_idat, IdatContents}, preflate_error::{AddContext, ExitCode, PreflateError}, preflate_input::PreflateInput, - preflate_parameter_estimator::{estimate_preflate_parameters, PreflateParameters}, process::{decode_mispredictions, encode_mispredictions, parse_deflate}, scan_deflate::{split_into_deflate_streams, BlockChunk}, statistical_codec::PredictionEncoder, @@ -375,7 +375,9 @@ pub fn decompress_deflate_stream( //process::write_file("c:\\temp\\lastop.deflate", compressed_data); //process::write_file("c:\\temp\\lastop.bin", contents.plain_text.as_slice()); - let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks).context()?; + let params = + PreflateParameters::estimate_preflate_parameters(&contents.plain_text, &contents.blocks) + .context()?; if loglevel > 0 { println!("params: {:?}", params); @@ -451,7 +453,9 @@ pub fn decompress_deflate_stream_assert( let contents = 
parse_deflate(compressed_data, 0)?; - let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks).context()?; + let params = + PreflateParameters::estimate_preflate_parameters(&contents.plain_text, &contents.blocks) + .context()?; params.write(&mut cabac_encoder); encode_mispredictions(&contents, ¶ms, &mut cabac_encoder)?; diff --git a/src/preflate_info.rs b/src/preflate_info.rs deleted file mode 100644 index 6bdeb22..0000000 --- a/src/preflate_info.rs +++ /dev/null @@ -1,50 +0,0 @@ -/*--------------------------------------------------------------------------------------------- - * Copyright (c) Microsoft Corporation. All rights reserved. - * Licensed under the Apache License, Version 2.0. See LICENSE.txt in the project root for license information. - * This software incorporates material from third parties. See NOTICE.txt for details. - *--------------------------------------------------------------------------------------------*/ - - struct PreflateStreamInfo { - token_count: usize, - literal_count: usize, - reference_count: usize, - max_dist: usize, - max_tokens_per_block: usize, - count_blocks: usize, - count_stored_blocks: usize, - count_huff_blocks: usize, - count_rle_blocks: usize, - count_static_huff_tree_blocks: usize, -} - -fn extract_preflate_info(blocks: &Vec) -> PreflateStreamInfo { - let mut result = PreflateStreamInfo::default(); - result.count_blocks = blocks.len() as u32; - for (i, b) in blocks.iter().enumerate() { - if b.block_type == TokenBlockType::Stored { - result.count_stored_blocks += 1; - continue; - } - if b.block_type == TokenBlockType::StaticHuff { - result.count_static_huff_tree_blocks += 1; - } - result.token_count += b.tokens.len() as u32; - result.max_tokens_per_block = cmp::max(result.max_tokens_per_block, b.tokens.len() as u32); - let mut block_max_dist = 0; - for (j, t) in b.tokens.iter().enumerate() { - if t.len == 1 { - result.literal_count += 1; - } else { - result.reference_count += 1; - block_max_dist = 
cmp::max(block_max_dist, t.dist as u32); - } - } - result.max_dist = cmp::max(result.max_dist, block_max_dist); - if block_max_dist == 0 { - result.count_huff_blocks += 1; - } else if block_max_dist == 1 { - result.count_rle_blocks += 1; - } - } - return result; -} \ No newline at end of file diff --git a/src/process.rs b/src/process.rs index 42982da..1cfc564 100644 --- a/src/process.rs +++ b/src/process.rs @@ -7,12 +7,11 @@ use std::io::Cursor; use crate::{ - deflate_reader::DeflateReader, - deflate_writer::DeflateWriter, + deflate::deflate_token::DeflateTokenBlock, + deflate::{deflate_reader::DeflateReader, deflate_writer::DeflateWriter}, + estimator::preflate_parameter_estimator::PreflateParameters, preflate_error::PreflateError, preflate_input::PreflateInput, - preflate_parameter_estimator::PreflateParameters, - preflate_token::PreflateTokenBlock, statistical_codec::{ CodecCorrection, CodecMisprediction, PredictionDecoder, PredictionEncoder, }, @@ -42,7 +41,7 @@ pub fn encode_mispredictions( pub struct DeflateContents { pub compressed_size: usize, pub plain_text: Vec, - pub blocks: Vec, + pub blocks: Vec, pub eof_padding: u8, } @@ -60,7 +59,7 @@ pub fn parse_deflate( if deflate_info_dump_level > 0 { // Log information about this deflate compressed block match &block { - PreflateTokenBlock::Stored { + DeflateTokenBlock::Stored { uncompressed, padding_bits, } => { @@ -70,7 +69,7 @@ pub fn parse_deflate( padding_bits ); } - PreflateTokenBlock::Huffman { tokens, .. } => { + DeflateTokenBlock::Huffman { tokens, .. 
} => { println!("Block: tokens={}", tokens.len()); } } @@ -96,7 +95,7 @@ pub fn parse_deflate( } fn predict_blocks( - blocks: &[PreflateTokenBlock], + blocks: &[DeflateTokenBlock], mut token_predictor_in: TokenPredictor, encoder: &mut impl PredictionEncoder, ) -> Result<(), PreflateError> { @@ -115,7 +114,7 @@ pub fn decode_mispredictions( params: &PreflateParameters, plain_text: PreflateInput, decoder: &mut impl PredictionDecoder, -) -> Result<(Vec, Vec), PreflateError> { +) -> Result<(Vec, Vec), PreflateError> { let mut deflate_writer: DeflateWriter = DeflateWriter::new(); let output_blocks = recreate_blocks( @@ -137,7 +136,7 @@ fn recreate_blocks( mut token_predictor: TokenPredictor, decoder: &mut D, deflate_writer: &mut DeflateWriter, -) -> Result, PreflateError> { +) -> Result, PreflateError> { let mut output_blocks = Vec::new(); let mut is_eof = token_predictor.input_eof() && !decoder.decode_misprediction(CodecMisprediction::EOFMisprediction); @@ -185,7 +184,6 @@ fn analyze_compressed_data_fast( uncompressed_size: &mut u64, ) { use crate::cabac_codec::{PredictionDecoderCabac, PredictionEncoderCabac}; - use crate::preflate_parameter_estimator::estimate_preflate_parameters; use cabac::vp8::{VP8Reader, VP8Writer}; @@ -195,7 +193,9 @@ fn analyze_compressed_data_fast( let contents = parse_deflate(compressed_data, 1).unwrap(); - let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks).unwrap(); + let params = + PreflateParameters::estimate_preflate_parameters(&contents.plain_text, &contents.blocks) + .unwrap(); println!("params: {:?}", params); @@ -239,7 +239,6 @@ fn analyze_compressed_data_verify( _deflate_info_dump_level: i32, uncompressed_size: &mut u64, ) { - use crate::preflate_parameter_estimator::estimate_preflate_parameters; use crate::{ cabac_codec::{PredictionDecoderCabac, PredictionEncoderCabac}, statistical_codec::{VerifyPredictionDecoder, VerifyPredictionEncoder}, @@ -267,7 +266,9 @@ fn analyze_compressed_data_verify( let 
contents = parse_deflate(compressed_data, 1).unwrap(); - let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks).unwrap(); + let params = + PreflateParameters::estimate_preflate_parameters(&contents.plain_text, &contents.blocks) + .unwrap(); println!("params: {:?}", params); @@ -308,11 +309,11 @@ fn analyze_compressed_data_verify( .enumerate() .for_each(|(index, (a, b))| match (a, &b) { ( - PreflateTokenBlock::Stored { + DeflateTokenBlock::Stored { uncompressed: a, padding_bits: b, }, - PreflateTokenBlock::Stored { + DeflateTokenBlock::Stored { uncompressed: c, padding_bits: d, }, @@ -321,11 +322,11 @@ fn analyze_compressed_data_verify( assert_eq!(b, d, "padding bits differ {index}"); } ( - PreflateTokenBlock::Huffman { + DeflateTokenBlock::Huffman { tokens: t1, huffman_type: h1, }, - PreflateTokenBlock::Huffman { + DeflateTokenBlock::Huffman { tokens: t2, huffman_type: h2, }, @@ -377,11 +378,14 @@ fn verify_zlib_perfect_compression() { &read_file(format!("compressed_zlib_level{i}.deflate").as_str()); let compressed_data = compressed_data; - use crate::preflate_parameter_estimator::estimate_preflate_parameters; let contents = parse_deflate(compressed_data, 1).unwrap(); - let params = estimate_preflate_parameters(&contents.plain_text, &contents.blocks).unwrap(); + let params = PreflateParameters::estimate_preflate_parameters( + &contents.plain_text, + &contents.blocks, + ) + .unwrap(); println!("params: {:?}", params); diff --git a/src/token_predictor.rs b/src/token_predictor.rs index b656a32..f22aeda 100644 --- a/src/token_predictor.rs +++ b/src/token_predictor.rs @@ -5,21 +5,22 @@ *--------------------------------------------------------------------------------------------*/ use crate::{ - add_policy_estimator::DictionaryAddPolicy, bit_helper::DebugHash, cabac_codec::{decode_difference, encode_difference}, + deflate::deflate_constants::MIN_MATCH, + deflate::deflate_token::{ + DeflateHuffmanType, DeflateToken, DeflateTokenBlock, 
DeflateTokenReference, TokenFrequency, + BT_DYNAMICHUFF, BT_STATICHUFF, BT_STORED, + }, + deflate::huffman_calc::HufftreeBitCalc, + estimator::{ + add_policy_estimator::DictionaryAddPolicy, preflate_parameter_estimator::PreflateStrategy, + preflate_parse_config::MatchingType, + }, hash_algorithm::HashAlgorithm, hash_chain_holder::{new_hash_chain_holder, HashChainHolder, MatchResult}, - huffman_calc::HufftreeBitCalc, - preflate_constants::MIN_MATCH, preflate_error::{err_exit_code, AddContext, ExitCode, Result}, preflate_input::PreflateInput, - preflate_parameter_estimator::PreflateStrategy, - preflate_parse_config::MatchingType, - preflate_token::{ - PreflateHuffmanType, PreflateToken, PreflateTokenBlock, PreflateTokenReference, - TokenFrequency, BT_DYNAMICHUFF, BT_STATICHUFF, BT_STORED, - }, statistical_codec::{ CodecCorrection, CodecMisprediction, PredictionDecoder, PredictionEncoder, }, @@ -31,7 +32,7 @@ const VERIFY: bool = false; pub struct TokenPredictor<'a> { state: Box, params: TokenPredictorParameters, - pending_reference: Option, + pending_reference: Option, current_token_count: u32, max_token_count: u32, input: PreflateInput<'a>, @@ -93,7 +94,7 @@ impl<'a> TokenPredictor<'a> { pub fn predict_block( &mut self, - block: &PreflateTokenBlock, + block: &DeflateTokenBlock, codec: &mut D, last_block: bool, ) -> Result<()> { @@ -106,7 +107,7 @@ impl<'a> TokenPredictor<'a> { let huffman_encoding; match block { - PreflateTokenBlock::Stored { + DeflateTokenBlock::Stored { uncompressed, padding_bits, } => { @@ -125,19 +126,19 @@ impl<'a> TokenPredictor<'a> { } return Ok(()); } - PreflateTokenBlock::Huffman { + DeflateTokenBlock::Huffman { tokens: t, huffman_type, } => { match huffman_type { - PreflateHuffmanType::Static { .. } => { + DeflateHuffmanType::Static { .. 
} => { codec.encode_correction( CodecCorrection::BlockTypeCorrection, encode_difference(BT_DYNAMICHUFF, BT_STATICHUFF), ); huffman_encoding = None; } - PreflateHuffmanType::Dynamic { + DeflateHuffmanType::Dynamic { huffman_encoding: h, .. } => { @@ -210,15 +211,15 @@ impl<'a> TokenPredictor<'a> { // println!("B{}T{}: TGT({},{}) -> PRD({},{})", blockno, i, target_token.len, target_token.dist, predicted_token.len, predicted_token.dist); match target_token { - PreflateToken::Literal(_) => { + DeflateToken::Literal(_) => { match predicted_token { - PreflateToken::Literal(_) => { + DeflateToken::Literal(_) => { codec.encode_misprediction( CodecMisprediction::LiteralPredictionWrong, false, ); } - PreflateToken::Reference(..) => { + DeflateToken::Reference(..) => { // target had a literal, so we were wrong if we predicted a reference codec.encode_misprediction( CodecMisprediction::ReferencePredictionWrong, @@ -227,9 +228,9 @@ impl<'a> TokenPredictor<'a> { } } } - PreflateToken::Reference(target_ref) => { + DeflateToken::Reference(target_ref) => { let predicted_ref = match predicted_token { - PreflateToken::Literal(_) => { + DeflateToken::Literal(_) => { // target had a reference, so we were wrong if we predicted a literal codec.encode_misprediction( CodecMisprediction::LiteralPredictionWrong, @@ -243,7 +244,7 @@ impl<'a> TokenPredictor<'a> { ) })? 
} - PreflateToken::Reference(r) => { + DeflateToken::Reference(r) => { // we predicted a reference correctly, so verify that the length/dist was correct codec.encode_misprediction( CodecMisprediction::ReferencePredictionWrong, @@ -303,7 +304,7 @@ impl<'a> TokenPredictor<'a> { pub fn recreate_block( &mut self, codec: &mut D, - ) -> Result { + ) -> Result { self.current_token_count = 0; self.pending_reference = None; @@ -325,7 +326,7 @@ impl<'a> TokenPredictor<'a> { self.input.advance(1); } - return Ok(PreflateTokenBlock::Stored { + return Ok(DeflateTokenBlock::Stored { uncompressed, padding_bits, }); @@ -360,16 +361,16 @@ impl<'a> TokenPredictor<'a> { }, ); - let mut predicted_ref: PreflateTokenReference; + let mut predicted_ref: DeflateTokenReference; match self.predict_token() { - PreflateToken::Literal(l) => { + DeflateToken::Literal(l) => { let not_ok = codec.decode_misprediction(CodecMisprediction::LiteralPredictionWrong); if !not_ok { - self.commit_token(&PreflateToken::Literal(l)); - freq.commit_token(&PreflateToken::Literal(l)); + self.commit_token(&DeflateToken::Literal(l)); + freq.commit_token(&DeflateToken::Literal(l)); - tokens.push(PreflateToken::Literal(l)); + tokens.push(DeflateToken::Literal(l)); continue; } @@ -380,15 +381,15 @@ impl<'a> TokenPredictor<'a> { ) })?; } - PreflateToken::Reference(r) => { + DeflateToken::Reference(r) => { let not_ok = codec.decode_misprediction(CodecMisprediction::ReferencePredictionWrong); if not_ok { let c = self.input.cur_char(0); - self.commit_token(&PreflateToken::Literal(c)); - freq.commit_token(&PreflateToken::Literal(c)); + self.commit_token(&DeflateToken::Literal(c)); + freq.commit_token(&DeflateToken::Literal(c)); - tokens.push(PreflateToken::Literal(c)); + tokens.push(DeflateToken::Literal(c)); continue; } @@ -404,7 +405,7 @@ impl<'a> TokenPredictor<'a> { if new_len != predicted_ref.len() { let hops = codec.decode_correction(CodecCorrection::DistAfterLenCorrection); - predicted_ref = 
PreflateTokenReference::new( + predicted_ref = DeflateTokenReference::new( new_len, self.state .hop_match(new_len, hops, &self.input) @@ -420,7 +421,7 @@ impl<'a> TokenPredictor<'a> { .with_context(|| { format!("recalculate_distance token {}", self.current_token_count) })?; - predicted_ref = PreflateTokenReference::new(new_len, new_dist, false); + predicted_ref = DeflateTokenReference::new(new_len, new_dist, false); } } @@ -430,17 +431,17 @@ impl<'a> TokenPredictor<'a> { predicted_ref.set_irregular258(true); } - self.commit_token(&PreflateToken::Reference(predicted_ref)); - freq.commit_token(&PreflateToken::Reference(predicted_ref)); - tokens.push(PreflateToken::Reference(predicted_ref)); + self.commit_token(&DeflateToken::Reference(predicted_ref)); + freq.commit_token(&DeflateToken::Reference(predicted_ref)); + tokens.push(DeflateToken::Reference(predicted_ref)); } - let b = PreflateTokenBlock::Huffman { + let b = DeflateTokenBlock::Huffman { tokens, huffman_type: if bt == BT_STATICHUFF { - PreflateHuffmanType::Static { incomplete: false } + DeflateHuffmanType::Static { incomplete: false } } else { - PreflateHuffmanType::Dynamic { + DeflateHuffmanType::Dynamic { huffman_encoding: recreate_tree_for_block(&freq, codec, HufftreeBitCalc::Zlib)?, } }, @@ -456,9 +457,9 @@ impl<'a> TokenPredictor<'a> { self.input.remaining() == 0 } - fn predict_token(&mut self) -> PreflateToken { + fn predict_token(&mut self) -> DeflateToken { if self.input.pos() == 0 || self.input.remaining() < MIN_MATCH { - return PreflateToken::Literal(self.input.cur_char(0)); + return DeflateToken::Literal(self.input.cur_char(0)); } let m = if let Some(pending) = self.pending_reference { @@ -472,13 +473,13 @@ impl<'a> TokenPredictor<'a> { if let MatchResult::Success(match_token) = m { if match_token.len() < MIN_MATCH { - return PreflateToken::Literal(self.input.cur_char(0)); + return DeflateToken::Literal(self.input.cur_char(0)); } // match is too small and far way to be worth encoding as a 
distance/length pair. if match_token.len() == 3 && match_token.dist() > self.params.max_dist_3_matches.into() { - return PreflateToken::Literal(self.input.cur_char(0)); + return DeflateToken::Literal(self.input.cur_char(0)); } // Check for a longer match that starts at the next byte, in which case we should @@ -509,15 +510,15 @@ impl<'a> TokenPredictor<'a> { if !self.params.zlib_compatible { self.pending_reference = None; } - return PreflateToken::Literal(self.input.cur_char(0)); + return DeflateToken::Literal(self.input.cur_char(0)); } } } } - PreflateToken::Reference(match_token) + DeflateToken::Reference(match_token) } else { - PreflateToken::Literal(self.input.cur_char(0)) + DeflateToken::Literal(self.input.cur_char(0)) } } @@ -525,8 +526,8 @@ impl<'a> TokenPredictor<'a> { /// to find a match for the reference. fn repredict_reference( &mut self, - _dist_match: Option, - ) -> Result { + _dist_match: Option, + ) -> Result { if self.input.pos() == 0 || self.input.remaining() < MIN_MATCH { return err_exit_code( ExitCode::RecompressFailed, @@ -560,13 +561,13 @@ impl<'a> TokenPredictor<'a> { ) } - fn commit_token(&mut self, token: &PreflateToken) { + fn commit_token(&mut self, token: &DeflateToken) { match token { - PreflateToken::Literal(_) => { + DeflateToken::Literal(_) => { self.state.update_hash(1, &self.input); self.input.advance(1); } - PreflateToken::Reference(t) => { + DeflateToken::Reference(t) => { self.state.update_hash(t.len(), &self.input); self.input.advance(t.len()); } diff --git a/src/tree_predictor.rs b/src/tree_predictor.rs index bad0fdc..13a9732 100644 --- a/src/tree_predictor.rs +++ b/src/tree_predictor.rs @@ -6,11 +6,13 @@ use crate::{ cabac_codec::{decode_difference, encode_difference}, - huffman_calc::{calc_bit_lengths, HufftreeBitCalc}, - huffman_encoding::{HuffmanOriginalEncoding, TreeCodeType}, - preflate_constants::{CODETREE_CODE_COUNT, NONLEN_CODE_COUNT, TREE_CODE_ORDER_TABLE}, + deflate::deflate_constants::{CODETREE_CODE_COUNT, 
NONLEN_CODE_COUNT, TREE_CODE_ORDER_TABLE}, + deflate::deflate_token::TokenFrequency, + deflate::{ + huffman_calc::{calc_bit_lengths, HufftreeBitCalc}, + huffman_encoding::{HuffmanOriginalEncoding, TreeCodeType}, + }, preflate_error::{err_exit_code, ExitCode, Result}, - preflate_token::TokenFrequency, statistical_codec::{ CodecCorrection, CodecMisprediction, PredictionDecoder, PredictionEncoder, },