Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create new types for Tokenizer and TreeBuilder which are Send #339

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions html5ever/src/lib.rs
Original file line number Diff line number Diff line change
@@ -29,6 +29,12 @@ mod util {
pub mod str;
}

/// Conversion between a type and a representation of it that is `Send`.
///
/// An implementor names a companion type, `SendableSelf`, which the trait
/// bound requires to be `Send`, and provides a conversion in each direction.
pub trait Sendable {
/// The companion type produced by `get_sendable`; it must be `Send`.
type SendableSelf: Send;
/// Builds the `Send` representation from a shared borrow of `self`.
/// `self` is not consumed.
fn get_sendable(&self) -> Self::SendableSelf;
/// Rebuilds a value of the implementing type, taking ownership of the
/// `Send` representation.
fn get_self_from_sendable(sendable: Self::SendableSelf) -> Self;
}

pub mod serialize;
pub mod tokenizer;
pub mod tree_builder;
58 changes: 55 additions & 3 deletions html5ever/src/tokenizer/char_ref/mod.rs
Original file line number Diff line number Diff line change
@@ -7,10 +7,11 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::{Tokenizer, TokenSink};
use super::{Tokenizer, TokenSink, Sendable};
use buffer_queue::BufferQueue;
use data;
use tendril::StrTendril;
use tendril::{SendTendril, StrTendril};
use tendril::fmt::UTF8;
use util::str::{is_ascii_alnum};

use std::char::from_u32;
@@ -20,6 +21,7 @@ pub use self::Status::*;
use self::State::*;

//§ tokenizing-character-references
#[derive(Clone, Copy)]
pub struct CharRef {
/// The resulting character(s)
pub chars: [char; 2],
@@ -34,7 +36,7 @@ pub enum Status {
Done,
}

#[derive(Debug)]
#[derive(Clone, Copy, Debug)]
enum State {
Begin,
Octothorpe,
@@ -44,6 +46,22 @@ enum State {
BogusName,
}

/// `Send` counterpart of `CharRefTokenizer`.
///
/// The `Sendable` impl for `CharRefTokenizer` copies every field across under
/// the same name; only `name_buf_opt` changes type, being held here as a
/// `SendTendril<UTF8>` instead of a `StrTendril`.
pub struct SendableCharRefTokenizer {
// Copied verbatim from the `CharRefTokenizer` fields of the same name.
state: State,
addnl_allowed: Option<char>,
result: Option<CharRef>,

// NOTE(review): presumably bookkeeping for numeric character references;
// the code that reads these fields is not visible here -- confirm.
num: u32,
num_too_big: bool,
seen_digit: bool,
hex_marker: Option<char>,

// The only field that needs conversion: a `SendTendril` in place of the
// tokenizer's `StrTendril` name buffer.
name_buf_opt: Option<SendTendril<UTF8>>,
name_match: Option<(u32, u32)>,
name_len: usize,
}

#[derive(Clone)]
pub struct CharRefTokenizer {
state: State,
addnl_allowed: Option<char>,
@@ -110,6 +128,40 @@ impl CharRefTokenizer {
}
}

/// Converts a `CharRefTokenizer` to and from `SendableCharRefTokenizer`.
///
/// Every field is carried across unchanged except `name_buf_opt`, which is
/// converted between `StrTendril` and `SendTendril`.
impl Sendable for CharRefTokenizer {
    type SendableSelf = SendableCharRefTokenizer;

    /// Snapshots this tokenizer into its `Send` representation.
    ///
    /// `self` is only borrowed, so the name buffer (if any) is cloned before
    /// being converted into a `SendTendril`.
    fn get_sendable(&self) -> Self::SendableSelf {
        SendableCharRefTokenizer {
            state: self.state,
            addnl_allowed: self.addnl_allowed,
            result: self.result,
            num: self.num,
            num_too_big: self.num_too_big,
            seen_digit: self.seen_digit,
            hex_marker: self.hex_marker,
            name_buf_opt: self.name_buf_opt.clone().map(SendTendril::from),
            name_match: self.name_match,
            name_len: self.name_len,
        }
    }

    /// Rebuilds a `CharRefTokenizer` from its `Send` representation.
    ///
    /// `sendable_self` is owned, so the name buffer is moved out and converted
    /// directly rather than being cloned first.
    fn get_self_from_sendable(sendable_self: Self::SendableSelf) -> Self {
        CharRefTokenizer {
            state: sendable_self.state,
            addnl_allowed: sendable_self.addnl_allowed,
            result: sendable_self.result,
            num: sendable_self.num,
            num_too_big: sendable_self.num_too_big,
            seen_digit: sendable_self.seen_digit,
            hex_marker: sendable_self.hex_marker,
            name_buf_opt: sendable_self.name_buf_opt.map(StrTendril::from),
            name_match: sendable_self.name_match,
            name_len: sendable_self.name_len,
        }
    }
}

impl CharRefTokenizer {
pub fn step<Sink: TokenSink>(
&mut self,
130 changes: 128 additions & 2 deletions html5ever/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
@@ -19,7 +19,7 @@ use self::states::{Escaped, DoubleEscaped};
use self::states::{Unquoted, SingleQuoted, DoubleQuoted};
use self::states::{DoctypeIdKind, Public, System};

use self::char_ref::{CharRef, CharRefTokenizer};
use self::char_ref::{CharRef, CharRefTokenizer, SendableCharRefTokenizer};

use util::str::lower_ascii_letter;

@@ -30,9 +30,12 @@ use std::borrow::Cow::{self, Borrowed};
use std::collections::BTreeMap;

use {LocalName, QualName, Attribute, SmallCharSet};
use tendril::StrTendril;
use tendril::{SendTendril, StrTendril};
use tendril::fmt::UTF8;
pub use buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet};

use super::Sendable;

pub mod states;
mod interface;
mod char_ref;
@@ -95,6 +98,38 @@ impl Default for TokenizerOpts {
}
}

/// `Send` counterpart of `Tokenizer`: the same fields, with every
/// `StrTendril` held as a `SendTendril<UTF8>` instead.
///
/// The `Sendable` impl for `Tokenizer<Sink>` builds this with `Sink` set to
/// the sink's own `SendableSelf`, and converts back field by field.
pub struct SendableTokenizer<Sink> {
opts: TokenizerOpts,
sink: Sink,
state: states::State,
at_eof: bool,
/// Sendable form of the tokenizer's optional `CharRefTokenizer`.
char_ref_tokenizer: Option<SendableCharRefTokenizer>,
current_char: char,
reconsume: bool,
ignore_lf: bool,
discard_bom: bool,
current_tag_kind: TagKind,
current_tag_name: SendTendril<UTF8>,
current_tag_self_closing: bool,
/// `(name, value)` pairs standing in for the tokenizer's `Attribute`s.
current_tag_attrs: Vec<(QualName, SendTendril<UTF8>)>,
current_attr_name: SendTendril<UTF8>,
current_attr_value: SendTendril<UTF8>,
current_comment: SendTendril<UTF8>,

/// The tokenizer's `current_doctype`, flattened into one field per
/// `Doctype` member (`name`, `public_id`, `system_id`, `force_quirks`).
curr_doctype_name: Option<SendTendril<UTF8>>,
curr_doctype_public_id: Option<SendTendril<UTF8>>,
curr_doctype_system_id: Option<SendTendril<UTF8>>,
curr_doctype_force_quirks: bool,

last_start_tag_name: Option<LocalName>,
temp_buf: SendTendril<UTF8>,
state_profile: BTreeMap<states::State, u64>,
time_in_sink: u64,
current_line: u64,
}

/// The HTML tokenizer.
pub struct Tokenizer<Sink> {
/// Options controlling the behavior of the tokenizer.
@@ -559,6 +594,97 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.process_token_and_continue(ParseError(error));
}
}

impl<Sink: TokenSink + Sendable> Sendable for Tokenizer<Sink>
{
type SendableSelf = SendableTokenizer<<Sink as Sendable>::SendableSelf>;

/// Returns an instance containing the necessary information required to
/// create a Tokenizer with the exact same state. Instances of this
/// type can be sent between threads.
fn get_sendable(&self) -> Self::SendableSelf {
let mut sendable_current_tag_attrs = vec!();
let mut current_tag_attrs = self.current_tag_attrs.iter();
while let Some(attr) = current_tag_attrs.next() {
sendable_current_tag_attrs.push((attr.name.clone(), SendTendril::from(attr.value.clone())));
}

SendableTokenizer {
opts: self.opts.clone(),
sink: self.sink.get_sendable(),
state: self.state,
char_ref_tokenizer: self.char_ref_tokenizer.clone().map(|tok| tok.get_sendable()),
at_eof: self.at_eof,
current_char: self.current_char,
reconsume: self.reconsume,
ignore_lf: self.ignore_lf,
discard_bom: self.discard_bom,
current_tag_kind: self.current_tag_kind,
current_tag_name: SendTendril::from(self.current_tag_name.clone()),
current_tag_self_closing: self.current_tag_self_closing,
current_tag_attrs: sendable_current_tag_attrs,
current_attr_name: SendTendril::from(self.current_attr_name.clone()),
current_attr_value: SendTendril::from(self.current_attr_value.clone()),
current_comment: SendTendril::from(self.current_comment.clone()),

curr_doctype_name: self.current_doctype.name.clone().map(|s| SendTendril::from(s)),
curr_doctype_public_id: self.current_doctype.public_id.clone().map(|s| SendTendril::from(s)),
curr_doctype_system_id: self.current_doctype.system_id.clone().map(|s| SendTendril::from(s)),
curr_doctype_force_quirks: self.current_doctype.force_quirks,

last_start_tag_name: self.last_start_tag_name.clone(),
temp_buf: SendTendril::from(self.temp_buf.clone()),
state_profile: self.state_profile.clone(),
time_in_sink: self.time_in_sink,
current_line: self.current_line
}
}

fn get_self_from_sendable(sendable_self: Self::SendableSelf) -> Self {
let mut current_tag_attrs = vec!();
let mut sendable_current_tag_attrs = sendable_self.current_tag_attrs.iter();
while let Some(attr) = sendable_current_tag_attrs.next() {
let (name, value) = attr.clone();
current_tag_attrs.push(Attribute {
name: name,
value: StrTendril::from(value),
});
}

Tokenizer {
opts: sendable_self.opts,
sink: Sink::get_self_from_sendable(sendable_self.sink),
state: sendable_self.state,
char_ref_tokenizer: sendable_self.char_ref_tokenizer
.map(|tok| Box::new(CharRefTokenizer::get_self_from_sendable(tok))),
at_eof: sendable_self.at_eof,
current_char: sendable_self.current_char,
reconsume: sendable_self.reconsume,
ignore_lf: sendable_self.ignore_lf,
discard_bom: sendable_self.discard_bom,
current_tag_kind: sendable_self.current_tag_kind,
current_tag_name: StrTendril::from(sendable_self.current_tag_name),
current_tag_self_closing: sendable_self.current_tag_self_closing,
current_tag_attrs: current_tag_attrs,
current_attr_name: StrTendril::from(sendable_self.current_attr_name),
current_attr_value: StrTendril::from(sendable_self.current_attr_value),
current_comment: StrTendril::from(sendable_self.current_comment),

current_doctype: Doctype {
name: sendable_self.curr_doctype_name.map(|s| StrTendril::from(s)),
public_id: sendable_self.curr_doctype_public_id.map(|s| StrTendril::from(s)),
system_id: sendable_self.curr_doctype_system_id.map(|s| StrTendril::from(s)),
force_quirks: sendable_self.curr_doctype_force_quirks,
},

last_start_tag_name: sendable_self.last_start_tag_name,
temp_buf: StrTendril::from(sendable_self.temp_buf),
state_profile: sendable_self.state_profile,
time_in_sink: sendable_self.time_in_sink,
current_line: sendable_self.current_line
}
}
}
//§ END

// Shorthand for common state machine behaviors.
Loading