From f75b04cfc16a3524abd8337d3662829e7f957844 Mon Sep 17 00:00:00 2001
From: Dan Gohman <sunfish@mozilla.com>
Date: Thu, 11 Jun 2020 13:54:20 -0700
Subject: [PATCH] Process all non-ASCII bytes with the UTF-8 parser.

The UTF-8 parser knows how to handle invalid byte sequences, so don't
preprocess the input in the main parser; just hand any non-ASCII byte to
the UTF-8 parser to handle.

This includes what were previously interpreted as 8-bit C1 control codes;
they are now interpreted as UTF-8 continuation bytes.
---
 CHANGELOG.md |  5 +++++
 src/lib.rs   | 44 +++++++++++++++++++++++++++++++++++++++++++-
 src/table.rs | 12 +++---------
 3 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 02a1e0a..8c5ef64 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,11 @@
 CHANGELOG
 =========
 
+## 0.9.0
+
+- Remove 8-bit C1 support. 8-bit C1 codes are now interpreted as UTF-8
+  continuation bytes.
+
 ## 0.8.0
 
 - Remove C1 ST support in OSCs, fixing OSCs with ST in the payload
diff --git a/src/lib.rs b/src/lib.rs
index 6234901..9734ce0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -348,7 +348,7 @@ pub trait Perform {
     /// Draw a character to the screen and update states.
     fn print(&mut self, _: char);
 
-    /// Execute a C0 or C1 control function.
+    /// Execute a C0 control function.
     fn execute(&mut self, byte: u8);
 
     /// Invoked when a final character arrives in first part of device control string.
@@ -846,6 +846,48 @@ mod tests {
         #[cfg(feature = "no_std")]
         assert_eq!(dispatcher.params[1].len(), MAX_OSC_RAW - dispatcher.params[0].len());
     }
+
+    #[derive(Default)]
+    struct InvalidUtf8ByteDispatcher {
+        num_invalid: u8,
+    }
+
+    impl Perform for InvalidUtf8ByteDispatcher {
+        fn print(&mut self, c: char) {
+            assert_eq!(c, '�');
+            self.num_invalid += 1;
+        }
+
+        fn execute(&mut self, _: u8) {}
+
+        fn hook(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {}
+
+        fn put(&mut self, _: u8) {}
+
+        fn unhook(&mut self) {}
+
+        fn osc_dispatch(&mut self, _: &[&[u8]], _: bool) {}
+
+        fn csi_dispatch(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {}
+
+        fn esc_dispatch(&mut self, _: &[u8], _: bool, _: u8) {}
+    }
+
+    #[test]
+    fn parse_invalid_utf8_byte() {
+        let mut dispatcher = InvalidUtf8ByteDispatcher::default();
+        let mut parser = Parser::new();
+
+        for byte in 0x80..0xc2 {
+            parser.advance(&mut dispatcher, byte);
+        }
+        for byte in 0xf5..=0xff {
+            parser.advance(&mut dispatcher, byte);
+        }
+
+        // 0x80-0xbf continuation, 0xc0-0xc1 overlong, 0xf5-0xfd out of range, 0xfe-0xff invalid.
+        assert_eq!(dispatcher.num_invalid, 64 + 2 + 9 + 2);
+    }
 }
 
 #[cfg(all(feature = "nightly", test))]
diff --git a/src/table.rs b/src/table.rs
index c19e4ab..a67ca6f 100644
--- a/src/table.rs
+++ b/src/table.rs
@@ -18,15 +18,9 @@ generate_state_changes!(state_changes, {
         0x19        => (Anywhere, Execute),
         0x1c..=0x1f => (Anywhere, Execute),
         0x20..=0x7f => (Anywhere, Print),
-        0x80..=0x8f => (Anywhere, Execute),
-        0x91..=0x9a => (Anywhere, Execute),
-        0x9c        => (Anywhere, Execute),
-        // Beginning of UTF-8 2 byte sequence
-        0xc2..=0xdf => (Utf8, BeginUtf8),
-        // Beginning of UTF-8 3 byte sequence
-        0xe0..=0xef => (Utf8, BeginUtf8),
-        // Beginning of UTF-8 4 byte sequence
-        0xf0..=0xf4 => (Utf8, BeginUtf8),
+        // Hand all non-ASCII bytes to the UTF-8 parser to figure out. This
+        // includes 8-bit C1 codes, since we don't recognize them as such.
+        0x80..=0xff => (Utf8, BeginUtf8),
     },
 
     Escape {