Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Process all non-ASCII bytes with the UTF-8 parser. #58

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
CHANGELOG
=========

## 0.9.0

- Remove 8-bit C1 support. 8-bit C1 codes are now interpreted as UTF-8
continuation bytes.

## 0.8.0

- Remove C1 ST support in OSCs, fixing OSCs with ST in the payload
Expand Down
44 changes: 43 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ pub trait Perform {
/// Draw a character to the screen and update states.
fn print(&mut self, _: char);

/// Execute a C0 or C1 control function.
/// Execute a C0 control function.
fn execute(&mut self, byte: u8);

/// Invoked when a final character arrives in first part of device control string.
Expand Down Expand Up @@ -846,6 +846,48 @@ mod tests {
#[cfg(feature = "no_std")]
assert_eq!(dispatcher.params[1].len(), MAX_OSC_RAW - dispatcher.params[0].len());
}

/// Test-only `Perform` implementation that counts the characters handed to `print`.
#[derive(Default)]
struct InvalidUtf8ByteDispatcher {
/// Number of characters received through `print` so far.
num_invalid: u8,
}

impl Perform for InvalidUtf8ByteDispatcher {
    // The only callback this dispatcher cares about: each printed character
    // must equal the literal below, and each one is tallied in `num_invalid`.
    fn print(&mut self, printed: char) {
        assert_eq!(printed, '�');
        self.num_invalid += 1;
    }

    // Every remaining callback is intentionally a no-op.

    // Control functions.
    fn execute(&mut self, _: u8) {}

    // Device control string callbacks.
    fn hook(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {}

    fn put(&mut self, _: u8) {}

    fn unhook(&mut self) {}

    // Sequence dispatch callbacks.
    fn csi_dispatch(&mut self, _: &[i64], _: &[u8], _: bool, _: char) {}

    fn esc_dispatch(&mut self, _: &[u8], _: bool, _: u8) {}

    fn osc_dispatch(&mut self, _: &[&[u8]], _: bool) {}
}

/// Feeds the parser a set of single bytes from the ground state and checks
/// that the dispatcher's `print` callback fires once per byte.
#[test]
fn parse_invalid_utf8_byte() {
    let mut parser = Parser::new();
    let mut dispatcher = InvalidUtf8ByteDispatcher::default();

    // Two byte ranges, visited in ascending order:
    //   0x80..0xc2   -> 66 bytes (64 + 2)
    //   0xf5..=0xff  -> 11 bytes (9 + 2)
    let invalid_bytes = (0x80..0xc2).chain(0xf5..=0xff);
    for byte in invalid_bytes {
        parser.advance(&mut dispatcher, byte);
    }

    // Continuation bytes, overlong bytes, invalid code points, invalid code units.
    assert_eq!(dispatcher.num_invalid, 64 + 2 + 9 + 2);
}
}

#[cfg(all(feature = "nightly", test))]
Expand Down
12 changes: 3 additions & 9 deletions src/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,9 @@ generate_state_changes!(state_changes, {
0x19 => (Anywhere, Execute),
0x1c..=0x1f => (Anywhere, Execute),
0x20..=0x7f => (Anywhere, Print),
0x80..=0x8f => (Anywhere, Execute),
0x91..=0x9a => (Anywhere, Execute),
0x9c => (Anywhere, Execute),
// Beginning of UTF-8 2 byte sequence
0xc2..=0xdf => (Utf8, BeginUtf8),
// Beginning of UTF-8 3 byte sequence
0xe0..=0xef => (Utf8, BeginUtf8),
// Beginning of UTF-8 4 byte sequence
0xf0..=0xf4 => (Utf8, BeginUtf8),
// Hand all non-ASCII bytes to the UTF-8 parser to figure out. This
// includes 8-bit C1 codes, since we don't recognize them as such.
0x80..=0xff => (Utf8, BeginUtf8),
},

Escape {
Expand Down