From f75b04cfc16a3524abd8337d3662829e7f957844 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 11 Jun 2020 13:54:20 -0700 Subject: [PATCH] Process all non-ASCII bytes with the UTF-8 parser. The UTF-8 parser knows how to handle invalid byte sequences, so don't preprocess the input in the main parser; just hand any non-ASCII byte to the UTF-8 parser to handle. This includes what were previously interpreted as 8-bit C1 control codes; they are now interpreted as UTF-8 continuation bytes. --- CHANGELOG.md | 5 +++++ src/lib.rs | 44 +++++++++++++++++++++++++++++++++++++++++++- src/table.rs | 12 +++--------- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02a1e0a..8c5ef64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ CHANGELOG ========= +## 0.9.0 + +- Remove 8-bit C1 support. 8-bit C1 codes are now interpreted as UTF-8 + continuation bytes. + ## 0.8.0 - Remove C1 ST support in OSCs, fixing OSCs with ST in the payload diff --git a/src/lib.rs b/src/lib.rs index 6234901..9734ce0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -348,7 +348,7 @@ pub trait Perform { /// Draw a character to the screen and update states. fn print(&mut self, _: char); - /// Execute a C0 or C1 control function. + /// Execute a C0 control function. fn execute(&mut self, byte: u8); /// Invoked when a final character arrives in first part of device control string. 
@@ -846,6 +846,48 @@ mod tests { #[cfg(feature = "no_std")] assert_eq!(dispatcher.params[1].len(), MAX_OSC_RAW - dispatcher.params[0].len()); } + + #[derive(Default)] + struct InvalidUtf8ByteDispatcher { + num_invalid: u8, + } + + impl Perform for InvalidUtf8ByteDispatcher { + fn print(&mut self, c: char) { + assert_eq!(c, '�'); + self.num_invalid += 1; + } + + fn execute(&mut self, _: u8) {} + + fn hook(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {} + + fn put(&mut self, _: u8) {} + + fn unhook(&mut self) {} + + fn osc_dispatch(&mut self, _: &[&[u8]], _: bool) {} + + fn csi_dispatch(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {} + + fn esc_dispatch(&mut self, _: &[u8], _: bool, _: u8) {} + } + + #[test] + fn parse_invalid_utf8_byte() { + let mut dispatcher = InvalidUtf8ByteDispatcher::default(); + let mut parser = Parser::new(); + + for byte in 0x80..0xc2 { + parser.advance(&mut dispatcher, byte); + } + for byte in 0xf5..=0xff { + parser.advance(&mut dispatcher, byte); + } + + // Continuation bytes, overlong bytes, invalid code points, invalid code units. + assert_eq!(dispatcher.num_invalid, 64 + 2 + 9 + 2); + } } #[cfg(all(feature = "nightly", test))] diff --git a/src/table.rs b/src/table.rs index c19e4ab..a67ca6f 100644 --- a/src/table.rs +++ b/src/table.rs @@ -18,15 +18,9 @@ generate_state_changes!(state_changes, { 0x19 => (Anywhere, Execute), 0x1c..=0x1f => (Anywhere, Execute), 0x20..=0x7f => (Anywhere, Print), - 0x80..=0x8f => (Anywhere, Execute), - 0x91..=0x9a => (Anywhere, Execute), - 0x9c => (Anywhere, Execute), - // Beginning of UTF-8 2 byte sequence - 0xc2..=0xdf => (Utf8, BeginUtf8), - // Beginning of UTF-8 3 byte sequence - 0xe0..=0xef => (Utf8, BeginUtf8), - // Beginning of UTF-8 4 byte sequence - 0xf0..=0xf4 => (Utf8, BeginUtf8), + // Hand all non-ASCII bytes to the UTF-8 parser to figure out. This + // includes 8-bit C1 codes, since we don't recognize them as such. + 0x80..=0xff => (Utf8, BeginUtf8), }, Escape {