Skip to content

Commit 029f7d3

Browse files
committed Oct 6, 2017
Add unic-segment component
The algorithm implementation is based on [`unicode-segmentation`](https://github.com/unicode-rs/unicode-segmentation). The API is kept minimal at the moment, with only segment boundary objects and their indices objects. As we expand our string-level APIs, we can add on what's needed.
1 parent 673b5d7 commit 029f7d3

30 files changed

+5556
-4
lines changed
 

‎AUTHORS

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Manish Goregaokar <manishsmail@gmail.com>
1212
Matt Brubeck <mbrubeck@limpet.net>
1313
Michael Howell <michael@notriddle.com>
1414
Pyfisch <pyfisch@gmail.com>
15+
Raph Levien <raph@google.com>
1516
Riad S. Wahby <kwantam@gmail.com>
1617
Simon Sapin <simon.sapin@exyr.org>
1718
Valentin Gosu <valentin.gosu@gmail.com>

‎gen/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@ clap = "2.25"
2020
lazy_static = "0.2"
2121

2222
# Parsing
23+
itertools = "0.6"
2324
matches = "0.1"
2425
regex = "0.2"

‎gen/src/main.rs

+8-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ extern crate lazy_static;
2121
#[macro_use]
2222
extern crate matches;
2323

24+
extern crate itertools;
2425
extern crate regex;
2526

2627

@@ -30,7 +31,10 @@ mod writer;
3031

3132
/// Validate component target names passed in
3233
fn validate_component_name(name: String) -> Result<(), String> {
33-
if matches!(name.as_str(), "idna" | "ucd" | "normal" | "emoji") {
34+
if matches!(
35+
name.as_str(),
36+
"ucd" | "normal" | "segment" | "idna" | "emoji"
37+
) {
3438
Ok(())
3539
} else {
3640
Err(format!("Invalid component: `{}`", name))
@@ -56,6 +60,9 @@ fn main() {
5660
if components.is_empty() || components.contains(&"normal") {
5761
writer::normal::generate();
5862
}
63+
if components.is_empty() || components.contains(&"segment") {
64+
writer::segment::generate();
65+
}
5966
if components.is_empty() || components.contains(&"idna") {
6067
writer::idna::generate();
6168
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
use std::char;
13+
use std::str::FromStr;
14+
15+
use itertools::Itertools;
16+
17+
use source::utils::read;
18+
19+
use regex::Regex;
20+
21+
22+
lazy_static! {
23+
pub static ref GRAPHEME_BREAK_TESTS: GraphemeBreakTests = {
24+
read("data/ucd/test/GraphemeBreakTest.txt").parse().unwrap()
25+
};
26+
}
27+
28+
29+
pub struct GraphemeBreakTests {
30+
pub entries: Vec<GraphemeBreakTest>,
31+
}
32+
33+
34+
/// Represents a Test Case, containing a sequence of characters and GCB for each character, and for
35+
/// each pair of adjacent chars, whether they can break or not, and which rule matches at the position.
36+
///
37+
/// Invariants:
38+
/// ```
39+
/// char_gcbs.len() == chars.len()
40+
/// breaks.len() == chars.len() - 1
41+
/// rules.len() == chars.len() - 1
42+
/// ```
43+
#[derive(Debug)]
44+
pub struct GraphemeBreakTest {
45+
pub chars: Vec<char>,
46+
pub char_gcbs: Vec<String>,
47+
pub breaks: Vec<bool>,
48+
pub rules: Vec<String>,
49+
}
50+
51+
52+
impl FromStr for GraphemeBreakTests {
53+
type Err = ();
54+
fn from_str(string: &str) -> Result<GraphemeBreakTests, ()> {
55+
lazy_static! {
56+
static ref LINE_RE: Regex = Regex::new(
57+
r"(?xm)^\s*
58+
÷ \s+ # source begin
59+
(\w.*\w) # source captured
60+
\s+ ÷ \s* # source end
61+
\# # delimiter
62+
\s* ÷ \s+ \[0\.2\] # comment begin
63+
(.*) # comment captured
64+
\s*$"
65+
).unwrap();
66+
67+
static ref COMMENT_RE: Regex = Regex::new(
68+
r"(?x)
69+
\s+ [ \w\s<>\(\) -]+ # char name
70+
\s+ \( ( \w+ ) \) # char gcb
71+
\s+ ( [÷×] ) # break opportunity or not
72+
\s+ \[ ( [^\]]+ ) \] # rule id
73+
"
74+
).unwrap();
75+
}
76+
77+
let entries = LINE_RE
78+
.captures_iter(string)
79+
.filter_map(|line| {
80+
let source_items: Vec<&str> =
81+
line[1].split_whitespace().map(|s| s.trim()).collect();
82+
83+
let codepoints: Vec<u32> = source_items
84+
.iter()
85+
.step(2)
86+
.map(|&s| u32::from_str_radix(s, 16).expect("Bad number"))
87+
.collect();
88+
let chars: Vec<char> = codepoints
89+
.iter()
90+
.filter_map(|&u| char::from_u32(u))
91+
.collect();
92+
// Skip if any surrogate or invalid codepoints are present
93+
if codepoints.len() != chars.len() {
94+
return None;
95+
}
96+
assert_eq!(chars.len() * 2, source_items.len() + 1);
97+
98+
let breaks: Vec<bool> = source_items
99+
.iter()
100+
.dropping(1)
101+
.step(2)
102+
.map(|s| match *s {
103+
"÷" => true,
104+
"×" => false,
105+
t => panic!("Invalid token: {:?}", t),
106+
})
107+
.collect();
108+
assert_eq!(breaks.len(), chars.len() - 1);
109+
110+
let comment_items_captured = COMMENT_RE.captures_iter(&line[2]).collect::<Vec<_>>();
111+
let comment_items_mapped = comment_items_captured
112+
.iter()
113+
.map(|ref c| [&c[1], &c[2], &c[3]])
114+
.collect::<Vec<_>>();
115+
let comment_items_flattened = comment_items_mapped
116+
.iter()
117+
.flat_map(|x| x.iter())
118+
.collect::<Vec<_>>();
119+
let comment_items = &comment_items_flattened[..comment_items_flattened.len() - 2];
120+
assert_eq!(comment_items.len(), chars.len() * 3 - 2);
121+
122+
let char_gcbs: Vec<String> = comment_items
123+
.iter()
124+
.step(3)
125+
.map(|&s| s.to_string())
126+
.collect();
127+
assert_eq!(char_gcbs.len(), chars.len());
128+
129+
let rules: Vec<String> = comment_items
130+
.iter()
131+
.dropping(2)
132+
.step(3)
133+
.map(|&s| s.to_string())
134+
.collect();
135+
assert_eq!(rules.len(), chars.len() - 1);
136+
137+
Some(GraphemeBreakTest {
138+
chars,
139+
char_gcbs,
140+
breaks,
141+
rules,
142+
})
143+
})
144+
.collect();
145+
146+
Ok(GraphemeBreakTests { entries })
147+
}
148+
}

‎gen/src/source/ucd/test/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,5 @@
1010

1111

1212
pub mod normalization_test;
13+
pub mod grapheme_break_test;
14+
pub mod word_break_test;
+148
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
use std::char;
13+
use std::str::FromStr;
14+
15+
use itertools::Itertools;
16+
17+
use source::utils::read;
18+
19+
use regex::Regex;
20+
21+
22+
lazy_static! {
23+
pub static ref WORD_BREAK_TESTS: WordBreakTests = {
24+
read("data/ucd/test/WordBreakTest.txt").parse().unwrap()
25+
};
26+
}
27+
28+
29+
pub struct WordBreakTests {
30+
pub entries: Vec<WordBreakTest>,
31+
}
32+
33+
34+
/// Represents a Test Case, containing a sequence of characters and GCB for each character, and for
35+
/// each pair of adjacent chars, if they can break or not, and which rule matching the position.
36+
///
37+
/// Invariants:
38+
/// ```
39+
/// char_gcbs.len() == chars.len()
40+
/// breaks.len() == chars.len() - 1
41+
/// rules.len() == chars.len() - 1
42+
/// ```
43+
#[derive(Debug)]
44+
pub struct WordBreakTest {
45+
pub chars: Vec<char>,
46+
pub char_gcbs: Vec<String>,
47+
pub breaks: Vec<bool>,
48+
pub rules: Vec<String>,
49+
}
50+
51+
52+
impl FromStr for WordBreakTests {
53+
type Err = ();
54+
fn from_str(string: &str) -> Result<WordBreakTests, ()> {
55+
lazy_static! {
56+
static ref LINE_RE: Regex = Regex::new(
57+
r"(?xm)^\s*
58+
÷ \s+ # source begin
59+
(\w.*\w) # source captured
60+
\s+ ÷ \s* # source end
61+
\# # delimiter
62+
\s* ÷ \s+ \[0\.2\] # comment begin
63+
(.*) # comment captured
64+
\s*$"
65+
).unwrap();
66+
67+
static ref COMMENT_RE: Regex = Regex::new(
68+
r"(?x)
69+
\s+ [ \w\s<>\(\) -]+ # char name
70+
\s+ \( ( \w+ ) \) # char gcb
71+
\s+ ( [÷×] ) # break opportunity or not
72+
\s+ \[ ( [^\]]+ ) \] # rule id
73+
"
74+
).unwrap();
75+
}
76+
77+
let entries = LINE_RE
78+
.captures_iter(string)
79+
.filter_map(|line| {
80+
let source_items: Vec<&str> =
81+
line[1].split_whitespace().map(|s| s.trim()).collect();
82+
83+
let codepoints: Vec<u32> = source_items
84+
.iter()
85+
.step(2)
86+
.map(|&s| u32::from_str_radix(s, 16).expect("Bad number"))
87+
.collect();
88+
let chars: Vec<char> = codepoints
89+
.iter()
90+
.filter_map(|&u| char::from_u32(u))
91+
.collect();
92+
// Skip if any surrogate or invalid codepoints are present
93+
if codepoints.len() != chars.len() {
94+
return None;
95+
}
96+
assert_eq!(chars.len() * 2, source_items.len() + 1);
97+
98+
let breaks: Vec<bool> = source_items
99+
.iter()
100+
.dropping(1)
101+
.step(2)
102+
.map(|s| match *s {
103+
"÷" => true,
104+
"×" => false,
105+
t => panic!("Invalid token: {:?}", t),
106+
})
107+
.collect();
108+
assert_eq!(breaks.len(), chars.len() - 1);
109+
110+
let comment_items_captured = COMMENT_RE.captures_iter(&line[2]).collect::<Vec<_>>();
111+
let comment_items_mapped = comment_items_captured
112+
.iter()
113+
.map(|ref c| [&c[1], &c[2], &c[3]])
114+
.collect::<Vec<_>>();
115+
let comment_items_flattened = comment_items_mapped
116+
.iter()
117+
.flat_map(|x| x.iter())
118+
.collect::<Vec<_>>();
119+
let comment_items = &comment_items_flattened[..comment_items_flattened.len() - 2];
120+
assert_eq!(comment_items.len(), chars.len() * 3 - 2);
121+
122+
let char_gcbs: Vec<String> = comment_items
123+
.iter()
124+
.step(3)
125+
.map(|&s| s.to_string())
126+
.collect();
127+
assert_eq!(char_gcbs.len(), chars.len());
128+
129+
let rules: Vec<String> = comment_items
130+
.iter()
131+
.dropping(2)
132+
.step(3)
133+
.map(|&s| s.to_string())
134+
.collect();
135+
assert_eq!(rules.len(), chars.len() - 1);
136+
137+
Some(WordBreakTest {
138+
chars,
139+
char_gcbs,
140+
breaks,
141+
rules,
142+
})
143+
})
144+
.collect();
145+
146+
Ok(WordBreakTests { entries })
147+
}
148+
}

‎gen/src/writer/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
pub mod emoji;
1313
pub mod idna;
1414
pub mod normal;
15+
pub mod segment;
1516
pub mod ucd;
1617

1718
mod common;

‎gen/src/writer/segment/mod.rs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
mod segment_tests;
13+
14+
use writer::utils::clean_dir;
15+
16+
17+
pub fn generate() {
18+
segment_tests::generate(&clean_dir("unic/segment/tests/tables"));
19+
}
+135
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
use std::path::Path;
13+
14+
use source::ucd::test::grapheme_break_test::{GraphemeBreakTest, GRAPHEME_BREAK_TESTS};
15+
use source::ucd::test::word_break_test::{WordBreakTest, WORD_BREAK_TESTS};
16+
17+
use writer::utils::write;
18+
19+
20+
pub fn generate(dir: &Path) {
21+
emit_grapheme_cluster_break_test_data(dir);
22+
emit_word_break_test_data(dir);
23+
}
24+
25+
26+
fn str_escape(s: &str) -> String {
27+
format!(
28+
"\"{}\"",
29+
s.chars()
30+
.map(|c| c.escape_unicode().collect::<String>())
31+
.collect::<String>()
32+
)
33+
}
34+
35+
fn chars_escape(cs: &[char]) -> String {
36+
str_escape(&cs.iter().collect::<String>())
37+
}
38+
39+
40+
fn legacy_should_break(rule: &String) -> bool {
41+
const EXTENDED_ONLY_RULES: &[&str] = &["9.1", "9.2"];
42+
EXTENDED_ONLY_RULES.contains(&rule.as_str())
43+
}
44+
45+
46+
fn emit_grapheme_cluster_break_test_data(dir: &Path) {
47+
let mut contents = "&[\n".to_owned();
48+
49+
for case in GRAPHEME_BREAK_TESTS.entries.iter() {
50+
let GraphemeBreakTest {
51+
ref chars,
52+
ref breaks,
53+
ref rules,
54+
..
55+
} = *case;
56+
57+
contents.push_str(" (");
58+
59+
// Source
60+
contents.push_str(&format!("{}, ", chars_escape(&chars)));
61+
62+
// Extended Grapheme Clusters
63+
{
64+
contents.push_str("&[");
65+
let mut cluster: Vec<char> = vec![chars[0]];
66+
for (i, &brk) in breaks.iter().enumerate() {
67+
if brk {
68+
contents.push_str(&format!("{}, ", chars_escape(&cluster)));
69+
cluster.truncate(0);
70+
}
71+
cluster.push(chars[i + 1]);
72+
}
73+
contents.push_str(&format!("{}], ", chars_escape(&cluster)));
74+
}
75+
76+
// Legacy Grapheme Clusters
77+
if rules.iter().any(legacy_should_break) {
78+
contents.push_str("Some(&[");
79+
let mut cluster: Vec<char> = vec![chars[0]];
80+
for (i, &brk) in breaks.iter().enumerate() {
81+
if brk || legacy_should_break(&rules[i]) {
82+
contents.push_str(&format!("{}, ", chars_escape(&cluster)));
83+
cluster.truncate(0);
84+
}
85+
cluster.push(chars[i + 1]);
86+
}
87+
contents.push_str(&format!("{}])", chars_escape(&cluster)));
88+
} else {
89+
contents.push_str("None");
90+
}
91+
92+
contents.push_str("),\n");
93+
}
94+
95+
contents.push_str("]");
96+
97+
write(dir, "grapheme_cluster_break_test_data.rsv", &contents);
98+
}
99+
100+
fn emit_word_break_test_data(dir: &Path) {
101+
let mut contents = "&[\n".to_owned();
102+
103+
for case in WORD_BREAK_TESTS.entries.iter() {
104+
let WordBreakTest {
105+
ref chars,
106+
ref breaks,
107+
..
108+
} = *case;
109+
110+
contents.push_str(" (");
111+
112+
// Source
113+
contents.push_str(&format!("{}, ", chars_escape(&chars)));
114+
115+
// Words
116+
{
117+
contents.push_str("&[");
118+
let mut cluster: Vec<char> = vec![chars[0]];
119+
for (i, &brk) in breaks.iter().enumerate() {
120+
if brk {
121+
contents.push_str(&format!("{}, ", chars_escape(&cluster)));
122+
cluster.truncate(0);
123+
}
124+
cluster.push(chars[i + 1]);
125+
}
126+
contents.push_str(&format!("{}]", chars_escape(&cluster)));
127+
}
128+
129+
contents.push_str("),\n");
130+
}
131+
132+
contents.push_str("]");
133+
134+
write(dir, "word_break_test_data.rsv", &contents);
135+
}

‎gen/src/writer/ucd/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ mod ident;
1919
mod name;
2020
mod normal;
2121
mod segment;
22+
mod segment_tests;
2223

2324

2425
use writer::utils::clean_dir;
@@ -35,4 +36,5 @@ pub fn generate() {
3536
name::generate(&clean_dir("unic/ucd/name/tables"));
3637
normal::generate(&clean_dir("unic/ucd/normal/tables"));
3738
segment::generate(&clean_dir("unic/ucd/segment/tables"));
39+
segment_tests::generate(&clean_dir("unic/ucd/segment/tests/tables"));
3840
}

‎gen/src/writer/ucd/segment_tests.rs

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
use std::path::Path;
13+
use std::collections::BTreeMap;
14+
15+
use source::ucd::test::grapheme_break_test::{GraphemeBreakTest, GRAPHEME_BREAK_TESTS};
16+
use source::ucd::test::word_break_test::{WordBreakTest, WORD_BREAK_TESTS};
17+
18+
use writer::utils::tables::ToDirectCharTable;
19+
use writer::utils::write;
20+
21+
22+
pub fn generate(dir: &Path) {
23+
emit_grapheme_cluster_break_test_data(dir);
24+
emit_word_break_test_data(dir);
25+
}
26+
27+
28+
fn emit_grapheme_cluster_break_test_data(dir: &Path) {
29+
let mut map = BTreeMap::default();
30+
31+
for case in GRAPHEME_BREAK_TESTS.entries.iter() {
32+
let GraphemeBreakTest {
33+
ref chars,
34+
ref char_gcbs,
35+
..
36+
} = *case;
37+
38+
for (i, ch) in chars.iter().enumerate() {
39+
let ref gcb = char_gcbs[i];
40+
if map.contains_key(ch) {
41+
assert_eq!(map[ch], *gcb);
42+
} else {
43+
map.insert(*ch, gcb.clone());
44+
}
45+
}
46+
}
47+
48+
write(
49+
dir,
50+
"grapheme_cluster_break_test_data.rsv",
51+
&map.to_direct_char_table(|val, f| write!(f, "{}", val)),
52+
);
53+
}
54+
55+
fn emit_word_break_test_data(dir: &Path) {
56+
let mut map = BTreeMap::default();
57+
58+
for case in WORD_BREAK_TESTS.entries.iter() {
59+
let WordBreakTest {
60+
ref chars,
61+
ref char_gcbs,
62+
..
63+
} = *case;
64+
65+
for (i, ch) in chars.iter().enumerate() {
66+
let ref gcb = char_gcbs[i];
67+
if map.contains_key(ch) {
68+
assert_eq!(map[ch], *gcb);
69+
} else {
70+
map.insert(*ch, gcb.clone());
71+
}
72+
}
73+
}
74+
75+
write(
76+
dir,
77+
"word_break_test_data.rsv",
78+
&map.to_direct_char_table(|val, f| write!(f, "{}", val)),
79+
);
80+
}
81+
82+
/* TODO
83+
fn emit_sentence_break_test_data(dir: &Path) {
84+
let mut map = BTreeMap::default();
85+
86+
for case in SENTENCE_BREAK_TESTS.entries.iter() {
87+
let SentenceBreakTest {
88+
ref chars,
89+
ref char_gcbs,
90+
..
91+
} = *case;
92+
93+
for (i, ch) in chars.iter().enumerate() {
94+
let ref gcb = char_gcbs[i];
95+
if map.contains_key(ch) {
96+
assert_eq!(map[ch], *gcb);
97+
} else {
98+
map.insert(*ch, gcb.clone());
99+
}
100+
}
101+
}
102+
103+
write(
104+
dir,
105+
"sentence_break_test_data.rsv",
106+
&map.to_direct_char_table(|val, f| write!(f, "{}", val)),
107+
);
108+
}
109+
*/

‎unic/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ unic-bidi = { path = "bidi/", version = "0.6.0" }
2323
unic-char = { path = "char/", version = "0.6.0" }
2424
unic-idna = { path = "idna/", version = "0.6.0" }
2525
unic-normal = { path = "normal/", version = "0.6.0" }
26+
unic-segment = { path = "segment/", version = "0.6.0" }
2627
unic-ucd = { path = "ucd/", version = "0.6.0" }
2728
unic-utils = { path = "utils/", version = "0.6.0" }
2829

‎unic/segment/Cargo.toml

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[package]
2+
name = "unic-segment"
3+
version = "0.6.0"
4+
authors = ["The UNIC Project Developers"]
5+
repository = "https://github.com/behnam/rust-unic/"
6+
license = "MIT/Apache-2.0"
7+
description = "UNIC - Unicode Text Segmentation Algorithms"
8+
keywords = ["text", "unicode", "grapheme", "word", "boundary"]
9+
categories = ["internationalization", "text-processing", "parsing", "rendering"]
10+
readme = "README.md"
11+
12+
# No tests/benches that depends on /data/
13+
exclude = []
14+
15+
[badges]
16+
travis-ci = { repository = "behnam/rust-unic", branch = "master" }
17+
18+
[dependencies]
19+
unic-ucd-segment = { path = "../ucd/segment/", version = "0.6.0" }
20+
21+
[dev-dependencies]
22+
quickcheck = "0.4"
23+
unic-ucd-common = { path = "../ucd/common/", version = "0.6.0" }

‎unic/segment/README.md

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# UNIC — Unicode Text Segmentation Algorithms
2+
3+
[![Crates.io](https://img.shields.io/crates/v/unic-segment.svg)](https://crates.io/crates/unic-segment)
4+
[![Documentation](https://docs.rs/unic-segment/badge.svg)](https://docs.rs/unic-segment/)
5+
6+
This UNIC component implements algorithms from [Unicode® Standard Annex #29 -
7+
Unicode Text Segmentation](http://unicode.org/reports/tr29/), used for detecting
8+
boundaries of text elements, such as user-perceived characters (a.k.a.
9+
*Grapheme Clusters*), *Words*, and *Sentences*.
10+
11+
## Notes
12+
13+
Initial code for this component is based on
14+
[`unicode-segmentation`](https://github.com/unicode-rs/unicode-segmentation).

‎unic/segment/src/grapheme.rs

+836
Large diffs are not rendered by default.

‎unic/segment/src/lib.rs

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
//! # UNIC — Unicode Text Segmentation Algorithms
14+
//!
15+
//! A component of [`unic`: Unicode and Internationalization Crates for Rust](/unic/).
16+
//!
17+
//! This UNIC component implements algorithms from [Unicode® Standard Annex #29 -
18+
//! Unicode Text Segmentation](http://unicode.org/reports/tr29/), used for detecting
19+
//! boundaries of text elements, such as user-perceived characters (a.k.a.
20+
//! *Grapheme Clusters*), *Words*, and *Sentences* (last one not implemented yet).
21+
//!
22+
//! # Examples
23+
//!
24+
//! ```rust
25+
//! # use unic_segment::{GraphemeIndices, Graphemes, WordBoundIndices, WordBounds, Words};
26+
//! assert_eq!(
27+
//! Graphemes::new("a\u{310}e\u{301}o\u{308}\u{332}").collect::<Vec<&str>>(),
28+
//! &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]
29+
//! );
30+
//!
31+
//! assert_eq!(
32+
//! Graphemes::new("a\r\nb🇷🇺🇸🇹").collect::<Vec<&str>>(),
33+
//! &["a", "\r\n", "b", "🇷🇺", "🇸🇹"]
34+
//! );
35+
//!
36+
//! assert_eq!(
37+
//! GraphemeIndices::new("a̐éö̲\r\n").collect::<Vec<(usize, &str)>>(),
38+
//! &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]
39+
//! );
40+
//!
41+
//! fn has_alphanumeric(s: &&str) -> bool {
42+
//! s.chars().any(|ch| ch.is_alphanumeric())
43+
//! }
44+
//!
45+
//! assert_eq!(
46+
//! Words::new(
47+
//! "The quick (\"brown\") fox can't jump 32.3 feet, right?",
48+
//! has_alphanumeric,
49+
//! ).collect::<Vec<&str>>(),
50+
//! &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]
51+
//! );
52+
//!
53+
//! assert_eq!(
54+
//! WordBounds::new("The quick (\"brown\") fox").collect::<Vec<&str>>(),
55+
//! &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]
56+
//! );
57+
//!
58+
//! assert_eq!(
59+
//! WordBoundIndices::new("Brr, it's 29.3°F!").collect::<Vec<(usize, &str)>>(),
60+
//! &[
61+
//! (0, "Brr"),
62+
//! (3, ","),
63+
//! (4, " "),
64+
//! (5, "it's"),
65+
//! (9, " "),
66+
//! (10, "29.3"),
67+
//! (14, "°"),
68+
//! (16, "F"),
69+
//! (17, "!")
70+
//! ]
71+
//! );
72+
//! ```
73+
74+
#![forbid(unsafe_code, missing_docs)]
75+
76+
77+
extern crate unic_ucd_segment;
78+
79+
#[cfg(test)]
80+
extern crate unic_ucd_common;
81+
82+
83+
mod grapheme;
84+
mod word;
85+
86+
87+
pub use unic_ucd_segment::UNICODE_VERSION;
88+
89+
pub use grapheme::{GraphemeCursor, GraphemeIncomplete, GraphemeIndices, Graphemes};
90+
pub use word::{WordBoundIndices, WordBounds, Words};
91+
92+
93+
/// UNIC component version.
94+
pub const PKG_VERSION: &'static str = env!("CARGO_PKG_VERSION");
95+
96+
/// UNIC component name.
97+
pub const PKG_NAME: &'static str = env!("CARGO_PKG_NAME");
98+
99+
/// UNIC component description.
100+
pub const PKG_DESCRIPTION: &'static str = env!("CARGO_PKG_DESCRIPTION");

‎unic/segment/src/word.rs

+710
Large diffs are not rendered by default.

‎unic/segment/tests/basic_example.rs

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
extern crate unic_segment;
14+
extern crate unic_ucd_common;
15+
16+
17+
use unic_segment::{GraphemeIndices, Graphemes, WordBoundIndices, WordBounds, Words};
18+
use unic_ucd_common::is_alphanumeric;
19+
20+
21+
// Rust 1.17 won't accept closure where fn is expected
22+
fn has_alphanumeric(s: &&str) -> bool {
23+
s.chars().any(is_alphanumeric)
24+
}
25+
26+
27+
#[test]
28+
fn test_all() {
29+
assert_eq!(
30+
Graphemes::new("a\u{310}e\u{301}o\u{308}\u{332}").collect::<Vec<&str>>(),
31+
&["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]
32+
);
33+
34+
assert_eq!(
35+
Graphemes::new("a\r\nb🇷🇺🇸🇹").collect::<Vec<&str>>(),
36+
&["a", "\r\n", "b", "🇷🇺", "🇸🇹"]
37+
);
38+
39+
assert_eq!(
40+
GraphemeIndices::new("a̐éö̲\r\n").collect::<Vec<(usize, &str)>>(),
41+
&[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]
42+
);
43+
44+
assert_eq!(
45+
Words::new(
46+
"The quick (\"brown\") fox can't jump 32.3 feet, right?",
47+
has_alphanumeric,
48+
).collect::<Vec<&str>>(),
49+
&["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]
50+
);
51+
52+
assert_eq!(
53+
WordBounds::new("The quick (\"brown\") fox").collect::<Vec<&str>>(),
54+
&["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]
55+
);
56+
57+
assert_eq!(
58+
WordBoundIndices::new("Brr, it's 29.3°F!").collect::<Vec<(usize, &str)>>(),
59+
&[
60+
(0, "Brr"),
61+
(3, ","),
62+
(4, " "),
63+
(5, "it's"),
64+
(9, " "),
65+
(10, "29.3"),
66+
(14, "°"),
67+
(16, "F"),
68+
(17, "!")
69+
]
70+
);
71+
}

‎unic/segment/tests/extra_grapheme_cluster_break_test_data.rsv

+41
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎unic/segment/tests/extra_word_break_test_data.rsv

+50
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
extern crate unic_segment;
14+
15+
16+
use unic_segment::Graphemes;
17+
18+
19+
type TestData = &'static [(
20+
&'static str,
21+
&'static [&'static str],
22+
Option<&'static [&'static str]>,
23+
)];
24+
25+
26+
const TEST_DATA: TestData = include!("tables/grapheme_cluster_break_test_data.rsv");
27+
28+
/// Extra cases that the official test suite doesn't cover.
29+
const EXTRA_TEST_DATA: TestData = include!("extra_grapheme_cluster_break_test_data.rsv");
30+
31+
32+
#[test]
33+
fn test_graphemes_conformance() {
34+
let tests = TEST_DATA.iter().chain(EXTRA_TEST_DATA);
35+
for &(input, graphemes, legacy_graphemes) in tests {
36+
let legacy_graphemes = match legacy_graphemes {
37+
Some(s) => s,
38+
None => graphemes,
39+
};
40+
41+
// test forward iterator
42+
assert!(Graphemes::new(input).eq(graphemes.iter().cloned()));
43+
assert!(Graphemes::new_legacy(input).eq(legacy_graphemes.iter().cloned()));
44+
45+
// test reverse iterator
46+
assert!(
47+
Graphemes::new(input)
48+
.rev()
49+
.eq(graphemes.iter().rev().cloned())
50+
);
51+
assert!(
52+
Graphemes::new_legacy(input)
53+
.rev()
54+
.eq(legacy_graphemes.iter().rev().cloned())
55+
);
56+
}
57+
}
+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
#[macro_use]
14+
extern crate quickcheck;
15+
16+
extern crate unic_segment;
17+
18+
19+
use unic_segment::{Graphemes, WordBounds};
20+
21+
22+
// QuickCheck Graphemes
23+
quickcheck! {
24+
fn quickcheck_graphemes_new_join_vs_input(input: String) -> bool {
25+
let graphemes = Graphemes::new(&input).collect::<String>();
26+
graphemes == input
27+
}
28+
29+
fn quickcheck_graphemes_new_forward_vs_reverse(input: String) -> bool {
30+
let graphemes1 = Graphemes::new(&input).collect::<Vec<_>>();
31+
let mut graphemes2 = Graphemes::new(&input).rev().collect::<Vec<_>>();
32+
graphemes2.reverse();
33+
graphemes1 == graphemes2
34+
}
35+
36+
fn quickcheck_graphemes_new_legacy_join_vs_input(input: String) -> bool {
37+
let graphemes = Graphemes::new_legacy(&input).collect::<String>();
38+
graphemes == input
39+
}
40+
41+
fn quickcheck_graphemes_new_legacy_forward_vs_reverse(input: String) -> bool {
42+
let graphemes1 = Graphemes::new_legacy(&input).collect::<Vec<_>>();
43+
let mut graphemes2 = Graphemes::new_legacy(&input).rev().collect::<Vec<_>>();
44+
graphemes2.reverse();
45+
graphemes1 == graphemes2
46+
}
47+
}
48+
49+
// QuickCheck Words
50+
quickcheck! {
51+
fn quickcheck_words_new_join_vs_input(input: String) -> bool {
52+
let words = WordBounds::new(&input).collect::<String>();
53+
words == input
54+
}
55+
56+
fn quickcheck_words_new_forward_vs_reverse(input: String) -> bool {
57+
let words1 = WordBounds::new(&input).collect::<Vec<_>>();
58+
let mut words2 = WordBounds::new(&input).rev().collect::<Vec<_>>();
59+
words2.reverse();
60+
words1 == words2
61+
}
62+
}

‎unic/segment/tests/tables/grapheme_cluster_break_test_data.rsv

+748
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎unic/segment/tests/tables/word_break_test_data.rsv

+2,061
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
extern crate unic_segment;
14+
15+
16+
use unic_segment::{WordBoundIndices, WordBounds};
17+
18+
19+
type TestData = &'static [(&'static str, &'static [&'static str])];
20+
21+
22+
const TEST_DATA: TestData = include!("tables/word_break_test_data.rsv");
23+
24+
/// Extra cases that the official test suite doesn't cover.
25+
const EXTRA_TEST_DATA: TestData = include!("extra_word_break_test_data.rsv");
26+
27+
28+
#[test]
fn test_words_conformance() {
    for &(input, words) in TEST_DATA.iter().chain(EXTRA_TEST_DATA) {
        // Collect both sides into vectors so a failing assertion prints the
        // full sequences, which makes diagnosing a mismatch much easier.
        macro_rules! check {
            ($actual:expr, $expected:expr, $name:expr) => {
                assert_eq!(
                    $actual.collect::<Vec<_>>(),
                    $expected.collect::<Vec<_>>(),
                    "{} test for testcase ({:?}, {:?}) failed.",
                    $name,
                    input,
                    words
                )
            }
        }

        // Word boundaries, walked forwards and backwards.
        check!(
            WordBounds::new(input),
            words.iter().cloned(),
            "Forward word boundaries"
        );
        check!(
            WordBounds::new(input).rev(),
            words.iter().rev().cloned(),
            "Reverse word boundaries"
        );

        // Byte offset of each word's start, derived from the cumulative
        // lengths of the expected segments (empty input yields no offsets).
        let mut offset = 0;
        let indices: Vec<usize> = words
            .iter()
            .map(|word| {
                let start = offset;
                offset += word.len();
                start
            })
            .collect();

        // Boundary-index iterator, walked forwards and backwards.
        check!(
            WordBoundIndices::new(input).map(|(pos, _)| pos),
            indices.iter().cloned(),
            "Forward word indices"
        );
        check!(
            WordBoundIndices::new(input).rev().map(|(pos, _)| pos),
            indices.iter().rev().cloned(),
            "Reverse word indices"
        );
    }
}

‎unic/src/lib.rs

+9-2
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,18 @@
1818
//! instead of importing components one-by-one, and ensuring all components
1919
//! imported are compatible in algorithms and consistent data-wise.
2020
//!
21-
//! ## Components
21+
//! ## Major Components
2222
//!
23-
//! - [`ucd`](/unic-ucd): Unicode Character Database.
23+
//! - [`char`](/unic-char): Unicode Character utilities.
24+
//!
25+
//! - [`ucd`](/unic-ucd): Unicode Character Database. (UAX\#44).
2426
//!
2527
//! - [`bidi`](/unic-bidi): Unicode Bidirectional Algorithm (UAX\#9).
2628
//!
2729
//! - [`normal`](/unic-normal): Unicode Normalization Forms (UAX\#15).
2830
//!
31+
//! - [`segment`](/unic-segment): Unicode Text Segmentation (UAX\#29).
32+
//!
2933
//! - [`idna`](/unic-idna): Unicode IDNA Compatibility Processing (UTS\#46).
3034
//!
3135
//!
@@ -116,11 +120,14 @@
116120
//! }
117121
//! ```
118122
123+
119124
pub extern crate unic_bidi as bidi;
120125
pub extern crate unic_char as char;
121126
pub extern crate unic_idna as idna;
122127
pub extern crate unic_normal as normal;
128+
pub extern crate unic_segment as segment;
123129
pub extern crate unic_ucd as ucd;
124130

131+
125132
/// The [Unicode version](http://www.unicode.org/versions/) of data
126133
pub use ucd::UNICODE_VERSION;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
extern crate unic_ucd_segment;
13+
extern crate unic_utils;
14+
15+
16+
use unic_utils::CharDataTable;
17+
18+
use unic_ucd_segment::grapheme_cluster_break::{self, GraphemeClusterBreak};
19+
use unic_ucd_segment::word_break::{self, WordBreak};
20+
21+
22+
#[test]
fn test_grapheme_cluster_break_conformance() {
    // The generated .rsv table refers to property values by the names these
    // glob imports bring into scope.
    use grapheme_cluster_break::abbr_names::*;
    use grapheme_cluster_break::long_names::*;

    const TEST_DATA: CharDataTable<GraphemeClusterBreak> =
        include!("tables/grapheme_cluster_break_test_data.rsv");

    // Property lookup must agree with the expected value for every range
    // in the table.
    for (range, expected) in TEST_DATA.iter() {
        assert_eq!(GraphemeClusterBreak::of(range.low), expected);
    }
}
34+
35+
#[test]
fn test_word_break_conformance() {
    // The generated .rsv table refers to property values by the names these
    // glob imports bring into scope.
    use word_break::abbr_names::*;
    use word_break::long_names::*;
    // The test data file uses some unexpected names for some values
    use word_break::long_names::{Extend as Extend_FE, Format as Format_FE, ZWJ as ZWJ_FE};

    const TEST_DATA: CharDataTable<WordBreak> = include!("tables/word_break_test_data.rsv");

    // Property lookup must agree with the expected value for every range in
    // the table. (Loop binding renamed from `gcb` — a copy-paste leftover
    // from the grapheme-cluster-break test — to `wb` for Word_Break.)
    for (ch, wb) in TEST_DATA.iter() {
        assert_eq!(WordBreak::of(ch.low), wb);
    }
}

‎unic/ucd/segment/tests/tables/grapheme_cluster_break_test_data.rsv

+30
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎unic/ucd/segment/tests/tables/word_break_test_data.rsv

+36
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎unic/ucd/segment/tests/unicode_version_tests.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
// except according to those terms.
1010

1111

12-
extern crate unic_ucd_segment;
1312
extern crate unic_ucd_core;
13+
extern crate unic_ucd_segment;
1414

1515

1616
#[test]

0 commit comments

Comments
 (0)
Please sign in to comment.