Process all non-ASCII bytes with the UTF-8 parser.

The UTF-8 parser knows how to handle invalid byte sequences, so don't preprocess the input in the main parser; just hand any non-ASCII byte to the UTF-8 parser to handle. This includes what were previously interpreted as 8-bit C1 control codes; they are now interpreted as UTF-8 continuation bytes.
alacritty · Jun 15, 2020 · f75b04c · f75b04c
1 parent 582731c
commit f75b04c
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,11 @@
 CHANGELOG
 =========
 
+## 0.9.0
+
+- Remove 8-bit C1 support. 8-bit C1 codes are now interpreted as UTF-8
+  continuation bytes.
+
 ## 0.8.0
 
 - Remove C1 ST support in OSCs, fixing OSCs with ST in the payload

diff --git a/src/lib.rs b/src/lib.rs
@@ -348,7 +348,7 @@ pub trait Perform {
     /// Draw a character to the screen and update states.
     fn print(&mut self, _: char);
 
-    /// Execute a C0 or C1 control function.
+    /// Execute a C0 control function.
     fn execute(&mut self, byte: u8);
 
     /// Invoked when a final character arrives in first part of device control string.
@@ -846,6 +846,48 @@ mod tests {
         #[cfg(feature = "no_std")]
         assert_eq!(dispatcher.params[1].len(), MAX_OSC_RAW - dispatcher.params[0].len());
     }
+
+    #[derive(Default)]
+    struct InvalidUtf8ByteDispatcher {
+        num_invalid: u8,
+    }
+
+    impl Perform for InvalidUtf8ByteDispatcher {
+        fn print(&mut self, c: char) {
+            assert_eq!(c, '�');
+            self.num_invalid += 1;
+        }
+
+        fn execute(&mut self, _: u8) {}
+
+        fn hook(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {}
+
+        fn put(&mut self, _: u8) {}
+
+        fn unhook(&mut self) {}
+
+        fn osc_dispatch(&mut self, _: &[&[u8]], _: bool) {}
+
+        fn csi_dispatch(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {}
+
+        fn esc_dispatch(&mut self, _: &[u8], _: bool, _: u8) {}
+    }
+
+    #[test]
+    fn parse_invalid_utf8_byte() {
+        let mut dispatcher = InvalidUtf8ByteDispatcher::default();
+        let mut parser = Parser::new();
+
+        for byte in 0x80..0xc2 {
+            parser.advance(&mut dispatcher, byte);
+        }
+        for byte in 0xf5..=0xff {
+            parser.advance(&mut dispatcher, byte);
+        }
+
+        // Continuation bytes, overlong bytes, invalid code points, invalid code units.
+        assert_eq!(dispatcher.num_invalid, 64 + 2 + 9 + 2);
+    }
 }
 
 #[cfg(all(feature = "nightly", test))]

diff --git a/src/table.rs b/src/table.rs
@@ -18,15 +18,9 @@ generate_state_changes!(state_changes, {
         0x19        => (Anywhere, Execute),
         0x1c..=0x1f => (Anywhere, Execute),
         0x20..=0x7f => (Anywhere, Print),
-        0x80..=0x8f => (Anywhere, Execute),
-        0x91..=0x9a => (Anywhere, Execute),
-        0x9c        => (Anywhere, Execute),
-        // Beginning of UTF-8 2 byte sequence
-        0xc2..=0xdf => (Utf8, BeginUtf8),
-        // Beginning of UTF-8 3 byte sequence
-        0xe0..=0xef => (Utf8, BeginUtf8),
-        // Beginning of UTF-8 4 byte sequence
-        0xf0..=0xf4 => (Utf8, BeginUtf8),
+        // Hand all non-ASCII bytes to the UTF-8 parser to figure out. This
+        // includes 8-bit C1 codes, since we don't recognize them as such.
+        0x80..=0xff => (Utf8, BeginUtf8),
     },
 
     Escape {