Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve documentation and structure for code review #19

Merged
merged 4 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/bit_reader.rs → src/deflate/bit_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub trait ReadBits {
fn get(&mut self, cbit: u32) -> Result<u32>;
}

/// BitReader reads a variable number of bits from a byte stream.
pub struct BitReader<R> {
binary_reader: R,
bits_read: u32,
Expand Down
3 changes: 2 additions & 1 deletion src/bit_writer.rs → src/deflate/bit_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* This software incorporates material from third parties. See NOTICE.txt for details.
*--------------------------------------------------------------------------------------------*/

/// Used to write a variable number of bits to a byte buffer.
#[derive(Default)]
pub struct BitWriter {
pub bit_buffer: u32,
Expand Down Expand Up @@ -61,7 +62,7 @@ fn write_simple() {
/// write various bit patterns and see if the result matches the input
#[test]
fn write_roundtrip() {
use crate::bit_reader::BitReader;
use super::bit_reader::BitReader;

let mut b = BitWriter::default();
let mut data_buffer = Vec::new();
Expand Down
File renamed without changes.
51 changes: 25 additions & 26 deletions src/deflate_reader.rs → src/deflate/deflate_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@
*--------------------------------------------------------------------------------------------*/

use crate::{
deflate::deflate_token::{DeflateHuffmanType, DeflateToken, DeflateTokenReference},
preflate_error::{err_exit_code, AddContext, ExitCode, Result},
preflate_token::{PreflateHuffmanType, PreflateToken, PreflateTokenReference},
};

use std::io::Read;

use crate::{
use super::{deflate_constants, deflate_token::DeflateTokenBlock};

use super::{
bit_reader::BitReader,
huffman_encoding::{HuffmanOriginalEncoding, HuffmanReader},
preflate_constants,
preflate_token::PreflateTokenBlock,
};

/// Used to read binary data in deflate format and convert it to plaintext and a list of tokenized blocks
/// Used to read binary data in DEFLATE format and convert it to plaintext and a list of tokenized blocks
/// containing the literals and distance codes that were used to compress the file
pub struct DeflateReader<R> {
input: BitReader<R>,
Expand Down Expand Up @@ -64,7 +64,7 @@ impl<R: Read> DeflateReader<R> {
}
}

pub fn read_block(&mut self, last: &mut bool) -> Result<PreflateTokenBlock> {
pub fn read_block(&mut self, last: &mut bool) -> Result<DeflateTokenBlock> {
*last = self.read_bit()?;
let mode = self.read_bits(2)?;

Expand All @@ -89,7 +89,7 @@ impl<R: Read> DeflateReader<R> {
self.write_literal(b);
}

Ok(PreflateTokenBlock::Stored {
Ok(DeflateTokenBlock::Stored {
uncompressed,
padding_bits,
})
Expand All @@ -102,17 +102,17 @@ impl<R: Read> DeflateReader<R> {
let decoder = HuffmanReader::create_fixed()?;
if let Err(e) = self.decode_block(&decoder, &mut tokens) {
if e.exit_code() == ExitCode::ShortRead {
Ok(PreflateTokenBlock::Huffman {
Ok(DeflateTokenBlock::Huffman {
tokens,
huffman_type: PreflateHuffmanType::Static { incomplete: true },
huffman_type: DeflateHuffmanType::Static { incomplete: true },
})
} else {
Err(e)
}
} else {
Ok(PreflateTokenBlock::Huffman {
Ok(DeflateTokenBlock::Huffman {
tokens,
huffman_type: PreflateHuffmanType::Static { incomplete: false },
huffman_type: DeflateHuffmanType::Static { incomplete: false },
})
}
}
Expand All @@ -125,9 +125,9 @@ impl<R: Read> DeflateReader<R> {
let mut tokens = Vec::new();
self.decode_block(&decoder, &mut tokens).context()?;

Ok(PreflateTokenBlock::Huffman {
Ok(DeflateTokenBlock::Huffman {
tokens,
huffman_type: PreflateHuffmanType::Dynamic { huffman_encoding },
huffman_type: DeflateHuffmanType::Dynamic { huffman_encoding },
})
}

Expand All @@ -138,7 +138,7 @@ impl<R: Read> DeflateReader<R> {
fn decode_block(
&mut self,
decoder: &HuffmanReader,
tokens: &mut Vec<PreflateToken>,
tokens: &mut Vec<DeflateToken>,
) -> Result<()> {
let mut earliest_reference = i32::MAX;
let mut cur_pos = 0;
Expand All @@ -147,40 +147,39 @@ impl<R: Read> DeflateReader<R> {
let lit_len: u32 = decoder.fetch_next_literal_code(&mut self.input)?.into();
if lit_len < 256 {
self.write_literal(lit_len as u8);
tokens.push(PreflateToken::Literal(lit_len as u8));
tokens.push(DeflateToken::Literal(lit_len as u8));
cur_pos += 1;
} else if lit_len == 256 {
return Ok(());
} else {
let lcode: u32 = lit_len - preflate_constants::NONLEN_CODE_COUNT as u32;
if lcode >= preflate_constants::LEN_CODE_COUNT as u32 {
let lcode: u32 = lit_len - deflate_constants::NONLEN_CODE_COUNT as u32;
if lcode >= deflate_constants::LEN_CODE_COUNT as u32 {
return err_exit_code(ExitCode::InvalidDeflate, "Invalid length code");
}
let len: u32 = preflate_constants::MIN_MATCH
+ preflate_constants::LENGTH_BASE_TABLE[lcode as usize] as u32
let len: u32 = deflate_constants::MIN_MATCH
+ deflate_constants::LENGTH_BASE_TABLE[lcode as usize] as u32
+ self
.read_bits(preflate_constants::LENGTH_EXTRA_TABLE[lcode as usize].into())?;
.read_bits(deflate_constants::LENGTH_EXTRA_TABLE[lcode as usize].into())?;

// A length of 258 can be encoded two ways: as code 284 with all 5 extra bits set (non-standard) or as code 285 with 0 extra bits (standard)
let irregular258 =
len == 258 && lcode != preflate_constants::LEN_CODE_COUNT as u32 - 1;
len == 258 && lcode != deflate_constants::LEN_CODE_COUNT as u32 - 1;

let dcode = decoder.fetch_next_distance_char(&mut self.input)? as u32;
if dcode >= preflate_constants::DIST_CODE_COUNT as u32 {
if dcode >= deflate_constants::DIST_CODE_COUNT as u32 {
return err_exit_code(ExitCode::InvalidDeflate, "Invalid distance code");
}

let dist = 1
+ preflate_constants::DIST_BASE_TABLE[dcode as usize] as u32
+ self
.read_bits(preflate_constants::DIST_EXTRA_TABLE[dcode as usize].into())?;
+ deflate_constants::DIST_BASE_TABLE[dcode as usize] as u32
+ self.read_bits(deflate_constants::DIST_EXTRA_TABLE[dcode as usize].into())?;

if dist as usize > self.plain_text.len() {
return err_exit_code(ExitCode::InvalidDeflate, "Invalid distance");
}

self.write_reference(dist, len);
tokens.push(PreflateToken::Reference(PreflateTokenReference::new(
tokens.push(DeflateToken::Reference(DeflateTokenReference::new(
len,
dist,
irregular258,
Expand Down
53 changes: 30 additions & 23 deletions src/preflate_token.rs → src/deflate/deflate_token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,35 @@
* This software incorporates material from third parties. See NOTICE.txt for details.
*--------------------------------------------------------------------------------------------*/

use crate::{
huffman_encoding::HuffmanOriginalEncoding,
preflate_constants::{
quantize_distance, quantize_length, DIST_CODE_COUNT, LITLENDIST_CODE_COUNT,
NONLEN_CODE_COUNT,
},
use crate::deflate::huffman_encoding::HuffmanOriginalEncoding;

use super::deflate_constants::{
quantize_distance, quantize_length, DIST_CODE_COUNT, LITLENDIST_CODE_COUNT, NONLEN_CODE_COUNT,
};

/// In a DEFLATE stream, tokens are either literals (bytes) or references to previous bytes
/// with a distance and length.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct PreflateTokenReference {
len: u8,
dist: u16,
irregular258: bool,
pub enum DeflateToken {
Literal(u8),
Reference(DeflateTokenReference),
}

/// In the case of a distance and length, the length is the number of bytes to copy from the
/// previous bytes, and the distance is the number of bytes back to start copying from.
///
/// The `irregular258` field indicates that a length of 258 was encoded in the
/// non-standard way (the RFC allows for two different ways to encode 258).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum PreflateToken {
Literal(u8),
Reference(PreflateTokenReference),
pub struct DeflateTokenReference {
len: u8,
dist: u16,
irregular258: bool,
}

impl PreflateTokenReference {
pub fn new(len: u32, dist: u32, irregular258: bool) -> PreflateTokenReference {
PreflateTokenReference {
impl DeflateTokenReference {
pub fn new(len: u32, dist: u32, irregular258: bool) -> DeflateTokenReference {
DeflateTokenReference {
len: (len - 3) as u8,
dist: dist as u16,
irregular258,
Expand Down Expand Up @@ -56,7 +61,7 @@ pub const BT_DYNAMICHUFF: u32 = 0;
pub const BT_STATICHUFF: u32 = 2;

#[derive(Debug, PartialEq)]
pub enum PreflateHuffmanType {
pub enum DeflateHuffmanType {
Dynamic {
huffman_encoding: HuffmanOriginalEncoding,
},
Expand All @@ -66,17 +71,19 @@ pub enum PreflateHuffmanType {
}

#[derive(Debug)]
pub enum PreflateTokenBlock {
pub enum DeflateTokenBlock {
Huffman {
tokens: Vec<PreflateToken>,
huffman_type: PreflateHuffmanType,
tokens: Vec<DeflateToken>,
huffman_type: DeflateHuffmanType,
},
Stored {
uncompressed: Vec<u8>,
padding_bits: u8,
},
}

/// Used to track the frequency of tokens in the DEFLATE stream.
/// These frequencies are later used to build the Huffman encoding.
#[derive(Debug)]
pub struct TokenFrequency {
pub literal_codes: [u16; LITLENDIST_CODE_COUNT],
Expand All @@ -98,12 +105,12 @@ impl Default for TokenFrequency {
}

impl TokenFrequency {
pub fn commit_token(&mut self, token: &PreflateToken) {
pub fn commit_token(&mut self, token: &DeflateToken) {
match token {
PreflateToken::Literal(lit) => {
DeflateToken::Literal(lit) => {
self.literal_codes[*lit as usize] += 1;
}
PreflateToken::Reference(t) => {
DeflateToken::Reference(t) => {
self.literal_codes[NONLEN_CODE_COUNT + quantize_length(t.len())] += 1;
self.distance_codes[quantize_distance(t.dist())] += 1;
}
Expand Down
30 changes: 16 additions & 14 deletions src/deflate_writer.rs → src/deflate/deflate_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,19 @@

use crate::preflate_error::Result;

use crate::preflate_token::PreflateHuffmanType;
use crate::{
bit_writer::BitWriter,
huffman_encoding::HuffmanWriter,
preflate_constants::{
use super::deflate_token::DeflateHuffmanType;
use super::{
deflate_constants::{
quantize_distance, quantize_length, DIST_BASE_TABLE, DIST_EXTRA_TABLE, LENGTH_BASE_TABLE,
LENGTH_EXTRA_TABLE, LITLEN_CODE_COUNT, MIN_MATCH, NONLEN_CODE_COUNT,
},
preflate_token::{PreflateToken, PreflateTokenBlock},
deflate_token::{DeflateToken, DeflateTokenBlock},
};

use super::bit_writer::BitWriter;
use super::huffman_encoding::HuffmanWriter;

/// Takes a tokenized block and writes it to the original compressed output.
pub struct DeflateWriter {
/// bit writer to write partial bits to output
bitwriter: BitWriter,
Expand All @@ -39,10 +41,10 @@ impl DeflateWriter {
o
}

pub fn encode_block(&mut self, block: &PreflateTokenBlock, last: bool) -> Result<()> {
pub fn encode_block(&mut self, block: &DeflateTokenBlock, last: bool) -> Result<()> {
self.bitwriter.write(last as u32, 1, &mut self.output);
match block {
PreflateTokenBlock::Stored {
DeflateTokenBlock::Stored {
uncompressed,
padding_bits,
} => {
Expand All @@ -57,16 +59,16 @@ impl DeflateWriter {

self.output.extend_from_slice(&uncompressed);
}
PreflateTokenBlock::Huffman {
DeflateTokenBlock::Huffman {
tokens,
huffman_type,
} => match huffman_type {
PreflateHuffmanType::Static { .. } => {
DeflateHuffmanType::Static { .. } => {
self.bitwriter.write(1, 2, &mut self.output);
let huffman_writer = HuffmanWriter::start_fixed_huffman_table();
self.encode_block_with_decoder(tokens, &huffman_writer);
}
PreflateHuffmanType::Dynamic {
DeflateHuffmanType::Dynamic {
huffman_encoding, ..
} => {
let huffman_writer = HuffmanWriter::start_dynamic_huffman_table(
Expand All @@ -90,19 +92,19 @@ impl DeflateWriter {

fn encode_block_with_decoder(
&mut self,
tokens: &Vec<PreflateToken>,
tokens: &Vec<DeflateToken>,
huffman_writer: &HuffmanWriter,
) {
for token in tokens {
match token {
PreflateToken::Literal(lit) => {
DeflateToken::Literal(lit) => {
huffman_writer.write_literal(
&mut self.bitwriter,
&mut self.output,
u16::from(*lit),
);
}
PreflateToken::Reference(reference) => {
DeflateToken::Reference(reference) => {
if reference.get_irregular258() {
huffman_writer.write_literal(
&mut self.bitwriter,
Expand Down
7 changes: 7 additions & 0 deletions src/huffman_calc.rs → src/deflate/huffman_calc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@ pub enum HufftreeBitCalc {
Miniz,
}

/// Calculates the Huffman bit lengths for a given distribution of symbols.
///
/// There is no one-size-fits-all solution for calculating Huffman bit lengths and
/// each library has its own.
///
/// If we can get the right algorithm, it will minimize the number of corrections
/// that we later need to write when encoding the data back to the compressed format.
pub fn calc_bit_lengths(
bit_calc: HufftreeBitCalc,
sym_count: &[u16],
Expand Down
Loading