diff --git a/src/parser.rs b/src/parser.rs
index b1357f26..23199d3c 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -16,8 +16,8 @@ use std::ops::Range;
 /// Should only be used with the `Parser` instance it came from.
 #[derive(Debug, Clone)]
 pub struct ParserState {
-    pub(crate) position: usize,
-    pub(crate) current_line_start_position: usize,
+    pub(crate) position: SourcePosition,
+    pub(crate) current_line_start_position: SourcePosition,
     pub(crate) current_line_number: u32,
     pub(crate) at_start_of: Option<BlockType>,
 }
@@ -26,7 +26,7 @@ impl ParserState {
     /// The position from the start of the input, counted in UTF-8 bytes.
     #[inline]
     pub fn position(&self) -> SourcePosition {
-        SourcePosition(self.position)
+        self.position
     }

     /// The line number and column number
@@ -34,7 +34,7 @@
     pub fn source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (self.position.0 - self.current_line_start_position.0 + 1) as u32,
         }
     }
 }
diff --git a/src/size_of_tests.rs b/src/size_of_tests.rs
index edd2b439..6c486dc8 100644
--- a/src/size_of_tests.rs
+++ b/src/size_of_tests.rs
@@ -42,8 +42,8 @@
 size_of_test!(token, Token, 32);
 size_of_test!(std_cow_str, std::borrow::Cow<'static, str>, 24, 32);
 size_of_test!(cow_rc_str, CowRcStr, 16);
-size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 72);
-size_of_test!(parser_input, crate::parser::ParserInput, 136);
+size_of_test!(tokenizer, crate::tokenizer::Tokenizer, 80);
+size_of_test!(parser_input, crate::parser::ParserInput, 144);
 size_of_test!(parser, crate::parser::Parser, 16);
 size_of_test!(source_position, crate::SourcePosition, 8);
 size_of_test!(parser_state, crate::ParserState, 24);
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index f1716c71..4075ca7e 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -209,11 +209,11 @@ impl<'a> Token<'a> {
 pub struct Tokenizer<'a> {
     input: &'a str,
     /// Counted in bytes, not code points. From 0.
-    position: usize,
+    current_position: std::slice::Iter<'a, u8>,
     /// The position at the start of the current line; but adjusted to
     /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
-    current_line_start_position: usize,
+    current_line_start_position: SourcePosition,
     current_line_number: u32,
     var_or_env_functions: SeenStatus,
     source_map_url: Option<&'a str>,
@@ -232,8 +232,8 @@ impl<'a> Tokenizer<'a> {
     pub fn new(input: &str) -> Tokenizer {
         Tokenizer {
             input,
-            position: 0,
-            current_line_start_position: 0,
+            current_position: input.as_bytes().iter(),
+            current_line_start_position: SourcePosition(0),
             current_line_number: 0,
             var_or_env_functions: SeenStatus::DontCare,
             source_map_url: None,
@@ -269,15 +269,19 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     pub fn position(&self) -> SourcePosition {
-        debug_assert!(self.input.is_char_boundary(self.position));
-        SourcePosition(self.position)
+        let offset = unsafe {
+            self.current_position.as_slice().as_ptr().offset_from(self.input.as_ptr())
+        };
+        debug_assert!(offset >= 0, "current_position should always point to input: {offset}");
+        debug_assert!(self.input.is_char_boundary(offset as usize));
+        SourcePosition(offset as usize)
     }

     #[inline]
     pub fn current_source_location(&self) -> SourceLocation {
         SourceLocation {
             line: self.current_line_number,
-            column: (self.position - self.current_line_start_position + 1) as u32,
+            column: (self.position().0 - self.current_line_start_position.0 + 1) as u32,
         }
     }

@@ -294,7 +298,7 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     pub fn state(&self) -> ParserState {
         ParserState {
-            position: self.position,
+            position: self.position(),
             current_line_start_position: self.current_line_start_position,
             current_line_number: self.current_line_number,
             at_start_of: None,
@@ -303,7 +307,7 @@
     #[inline]
     pub fn reset(&mut self, state: &ParserState) {
-        self.position = state.position;
+        self.current_position = self.input.as_bytes()[state.position.0..].iter();
         self.current_line_start_position = state.current_line_start_position;
         self.current_line_number = state.current_line_number;
     }
@@ -338,7 +342,7 @@
         if self.is_eof() {
             None
         } else {
-            Some(self.input.as_bytes()[self.position])
+            Some(self.byte_at(0))
         }
     }

@@ -352,25 +356,48 @@
     // That is, `tokenizer.char_at(n)` will not panic.
     #[inline]
     fn has_at_least(&self, n: usize) -> bool {
-        self.position + n < self.input.len()
+        self.current_position.len() > n
+    }
+
+    #[cfg(debug_assertions)]
+    fn check_before_advance(&self, n: usize) {
+        // Each byte must either be an ASCII byte or a sequence leader, but not a 4-byte
+        // leader; also newlines are rejected.
+        for i in 0..n {
+            let b = self.byte_at(i);
+            debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
+            debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
+        }
     }

-    // Advance over N bytes in the input. This function can advance
-    // over ASCII bytes (excluding newlines), or UTF-8 sequence
-    // leaders (excluding leaders for 4-byte sequences).
+    #[inline]
+    pub fn advance_one(&mut self) {
+        #[cfg(debug_assertions)]
+        self.check_before_advance(1);
+        self.advance_one_unchecked();
+    }
+
+    // Advance over N bytes in the input. This function can advance over ASCII bytes (excluding
+    // newlines), or UTF-8 sequence leaders (excluding leaders for 4-byte sequences).
     #[inline]
     pub fn advance(&mut self, n: usize) {
-        if cfg!(debug_assertions) {
-            // Each byte must either be an ASCII byte or a sequence
-            // leader, but not a 4-byte leader; also newlines are
-            // rejected.
-            for i in 0..n {
-                let b = self.byte_at(i);
-                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
-                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
-            }
-        }
-        self.position += n
+        #[cfg(debug_assertions)]
+        self.check_before_advance(n);
+        self.advance_unchecked(n);
+    }
+
+    // Version of advance() that doesn't assert about newlines and co. Used for very specific cases
+    // that deal with them properly on their own.
+    #[inline]
+    pub(crate) fn advance_unchecked(&mut self, n: usize) {
+        self.current_position = self.current_position.as_slice()[n..].iter();
+    }
+
+    // As above, but for advance_one().
+    #[inline]
+    pub(crate) fn advance_one_unchecked(&mut self) {
+        let byte = self.current_position.next();
+        debug_assert_ne!(byte, None, "Advanced past EOF");
     }

     // Assumes non-EOF
@@ -381,46 +408,41 @@
     #[inline]
     fn byte_at(&self, offset: usize) -> u8 {
-        self.input.as_bytes()[self.position + offset]
+        self.current_position.as_slice()[offset]
     }

-    // Advance over a single byte; the byte must be a UTF-8 sequence
-    // leader for a 4-byte sequence.
+    // Advance over a single byte; the byte must be a UTF-8 sequence leader for a 4-byte sequence.
     #[inline]
     fn consume_4byte_intro(&mut self) {
         debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
-        // This takes two UTF-16 characters to represent, so we
-        // actually have an undercount.
-        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
-        self.position += 1;
+        // This takes two UTF-16 characters to represent, so we actually have an undercount.
+        self.current_line_start_position.0 = self.current_line_start_position.0.wrapping_sub(1);
+        self.advance_one_unchecked();
     }

-    // Advance over a single byte; the byte must be a UTF-8
-    // continuation byte.
+    // Advance over a single byte; the byte must be a UTF-8 continuation byte.
     #[inline]
     fn consume_continuation_byte(&mut self) {
         debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
-        // Continuation bytes contribute to column overcount. Note
-        // that due to the special case for the 4-byte sequence intro,
-        // we must use wrapping add here.
-        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
-        self.position += 1;
+        // Continuation bytes contribute to column overcount. Note that due to the special case
+        // for the 4-byte sequence intro, we must use wrapping add here.
+        self.current_line_start_position.0 = self.current_line_start_position.0.wrapping_add(1);
+        self.advance_one_unchecked();
     }

     // Advance over any kind of byte, excluding newlines.
     #[inline(never)]
     fn consume_known_byte(&mut self, byte: u8) {
         debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
-        self.position += 1;
+        self.advance_one_unchecked();
         // Continuation bytes contribute to column overcount.
         if byte & 0xF0 == 0xF0 {
-            // This takes two UTF-16 characters to represent, so we
-            // actually have an undercount.
-            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
+            // This takes two UTF-16 characters to represent, so we actually have an undercount.
+            self.current_line_start_position.0 = self.current_line_start_position.0.wrapping_sub(1);
         } else if byte & 0xC0 == 0x80 {
-            // Note that due to the special case for the 4-byte
-            // sequence intro, we must use wrapping add here.
-            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
+            // Note that due to the special case for the 4-byte sequence intro, we must use
+            // wrapping add here.
+            self.current_line_start_position.0 = self.current_line_start_position.0.wrapping_add(1);
         }
     }

@@ -438,43 +460,42 @@
     fn consume_newline(&mut self) {
         let byte = self.next_byte_unchecked();
         debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
-        self.position += 1;
+        self.advance_one_unchecked();
         if byte == b'\r' && self.next_byte() == Some(b'\n') {
-            self.position += 1;
+            self.advance_one_unchecked();
         }
-        self.current_line_start_position = self.position;
+        self.current_line_start_position = self.position();
         self.current_line_number += 1;
     }

     #[inline]
     fn has_newline_at(&self, offset: usize) -> bool {
-        self.position + offset < self.input.len()
-            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
+        self.has_at_least(offset) && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
     }

     #[inline]
     fn consume_char(&mut self) -> char {
         let c = self.next_char();
         let len_utf8 = c.len_utf8();
-        self.position += len_utf8;
+        self.advance_unchecked(len_utf8);
         // Note that due to the special case for the 4-byte sequence
         // intro, we must use wrapping add here.
-        self.current_line_start_position = self
-            .current_line_start_position
+        self.current_line_start_position.0 = self
+            .current_line_start_position.0
             .wrapping_add(len_utf8 - c.len_utf16());
         c
     }

     #[inline]
     fn starts_with(&self, needle: &[u8]) -> bool {
-        self.input.as_bytes()[self.position..].starts_with(needle)
+        self.current_position.as_slice().starts_with(needle)
     }

     pub fn skip_whitespace(&mut self) {
         while !self.is_eof() {
             match_byte! { self.next_byte_unchecked(),
                 b' ' | b'\t' => {
-                    self.advance(1)
+                    self.advance_one()
                 },
                 b'\n' | b'\x0C' | b'\r' => {
                     self.consume_newline();
@@ -495,7 +516,7 @@
         while !self.is_eof() {
             match_byte! { self.next_byte_unchecked(),
                 b' ' | b'\t' => {
-                    self.advance(1)
+                    self.advance_one()
                 },
                 b'\n' | b'\x0C' | b'\r' => {
                     self.consume_newline();
@@ -564,7 +585,7 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         b'\n' | b'\x0C' | b'\r' => consume_whitespace(tokenizer, true),
         b'"' => consume_string(tokenizer, false),
         b'#' => {
-            tokenizer.advance(1);
+            tokenizer.advance_one();
             if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
             else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
                // Any other valid case here already resulted in IDHash.
@@ -575,14 +596,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         },
         b'$' => {
             if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
-            else { tokenizer.advance(1); Delim('$') }
+            else { tokenizer.advance_one(); Delim('$') }
         },
         b'\'' => consume_string(tokenizer, true),
-        b'(' => { tokenizer.advance(1); ParenthesisBlock },
-        b')' => { tokenizer.advance(1); CloseParenthesis },
+        b'(' => { tokenizer.advance_one(); ParenthesisBlock },
+        b')' => { tokenizer.advance_one(); CloseParenthesis },
         b'*' => {
             if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
-            else { tokenizer.advance(1); Delim('*') }
+            else { tokenizer.advance_one(); Delim('*') }
         },
         b'+' => {
             if (
@@ -595,11 +616,11 @@
             ) {
                 consume_numeric(tokenizer)
             } else {
-                tokenizer.advance(1);
+                tokenizer.advance_one();
                 Delim('+')
             }
         },
-        b',' => { tokenizer.advance(1); Comma },
+        b',' => { tokenizer.advance_one(); Comma },
         b'-' => {
             if (
                 tokenizer.has_at_least(1)
@@ -616,7 +637,7 @@
             } else if is_ident_start(tokenizer) {
                 consume_ident_like(tokenizer)
             } else {
-                tokenizer.advance(1);
+                tokenizer.advance_one();
                 Delim('-')
             }
         },
@@ -625,7 +646,7 @@
             && tokenizer.byte_at(1).is_ascii_digit() {
                 consume_numeric(tokenizer)
             } else {
-                tokenizer.advance(1);
+                tokenizer.advance_one();
                 Delim('.')
             }
         }
@@ -633,53 +654,53 @@
             if tokenizer.starts_with(b"/*") { Comment(consume_comment(tokenizer)) }
             else {
-                tokenizer.advance(1);
+                tokenizer.advance_one();
                 Delim('/')
             }
         }
         b'0'..=b'9' => consume_numeric(tokenizer),
-        b':' => { tokenizer.advance(1); Colon },
-        b';' => { tokenizer.advance(1); Semicolon },
+        b':' => { tokenizer.advance_one(); Colon },
+        b';' => { tokenizer.advance_one(); Semicolon },
         b'<' => {
             if tokenizer.starts_with(b"
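
Note on the technique: the heart of this patch is replacing the tokenizer's
`position: usize` index with a `std::slice::Iter<'a, u8>` cursor, recovering the
byte offset on demand via `offset_from`, and using the iterator's remaining
length for EOF checks. A minimal standalone sketch of that idea, using only the
standard library (an illustrative example, not code from this repository):

    fn main() {
        let input = "héllo";
        // The iterator's remaining slice always begins at the current position.
        let mut cursor = input.as_bytes().iter();
        cursor.next(); // 'h' (1 byte)
        cursor.next(); // first byte of 'é' (2-byte sequence leader)
        cursor.next(); // continuation byte of 'é'

        // Offset = distance from the start of the input to the remaining slice;
        // this is what the patched Tokenizer::position() computes.
        let offset =
            unsafe { cursor.as_slice().as_ptr().offset_from(input.as_ptr()) } as usize;
        assert_eq!(offset, 3);
        assert!(input.is_char_boundary(offset));
        // The remaining length is the number of bytes left, which is what the
        // new has_at_least() relies on.
        assert_eq!(cursor.as_slice().len(), input.len() - offset);
    }

Since both pointers derive from the same `&str` allocation, the `offset_from`
call is sound, and the patch keeps equivalent `debug_assert!`s (offset is
non-negative and on a char boundary) in debug builds.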