Skip to content

Commit 029f7d3

Browse files
committed Oct 6, 2017
Add unic-segment component
The algorithm implementation is based on [`unicode-segmentation`](https://github.com/unicode-rs/unicode-segmentation). The API is kept minimal at the moment, with only segment boundary objects and their indices objects. As we expand our string-level APIs, we can add on what's needed.
1 parent 673b5d7 commit 029f7d3

30 files changed

+5556
-4
lines changed
 

‎AUTHORS

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Manish Goregaokar <manishsmail@gmail.com>
1212
Matt Brubeck <mbrubeck@limpet.net>
1313
Michael Howell <michael@notriddle.com>
1414
Pyfisch <pyfisch@gmail.com>
15+
Raph Levien <raph@google.com>
1516
Riad S. Wahby <kwantam@gmail.com>
1617
Simon Sapin <simon.sapin@exyr.org>
1718
Valentin Gosu <valentin.gosu@gmail.com>

‎gen/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@ clap = "2.25"
2020
lazy_static = "0.2"
2121

2222
# Parsing
23+
itertools = "0.6"
2324
matches = "0.1"
2425
regex = "0.2"

‎gen/src/main.rs

+8-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ extern crate lazy_static;
2121
#[macro_use]
2222
extern crate matches;
2323

24+
extern crate itertools;
2425
extern crate regex;
2526

2627

@@ -30,7 +31,10 @@ mod writer;
3031

3132
/// Validate component target names passed in
3233
fn validate_component_name(name: String) -> Result<(), String> {
33-
if matches!(name.as_str(), "idna" | "ucd" | "normal" | "emoji") {
34+
if matches!(
35+
name.as_str(),
36+
"ucd" | "normal" | "segment" | "idna" | "emoji"
37+
) {
3438
Ok(())
3539
} else {
3640
Err(format!("Invalid component: `{}`", name))
@@ -56,6 +60,9 @@ fn main() {
5660
if components.is_empty() || components.contains(&"normal") {
5761
writer::normal::generate();
5862
}
63+
if components.is_empty() || components.contains(&"segment") {
64+
writer::segment::generate();
65+
}
5966
if components.is_empty() || components.contains(&"idna") {
6067
writer::idna::generate();
6168
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
use std::char;
13+
use std::str::FromStr;
14+
15+
use itertools::Itertools;
16+
17+
use source::utils::read;
18+
19+
use regex::Regex;
20+
21+
22+
lazy_static! {
23+
pub static ref GRAPHEME_BREAK_TESTS: GraphemeBreakTests = {
24+
read("data/ucd/test/GraphemeBreakTest.txt").parse().unwrap()
25+
};
26+
}
27+
28+
29+
pub struct GraphemeBreakTests {
30+
pub entries: Vec<GraphemeBreakTest>,
31+
}
32+
33+
34+
/// Represents a Test Case, containing a sequence of characters and GCB for each character, and for
35+
/// each pair of adjacent chars, whether they can break or not, and which rule matches at the position.
36+
///
37+
/// Invariants:
38+
/// ```
39+
/// char_gcbs.len() == chars.len()
40+
/// breaks.len() == chars.len() - 1
41+
/// rules.len() == chars.len() - 1
42+
/// ```
43+
#[derive(Debug)]
44+
pub struct GraphemeBreakTest {
45+
pub chars: Vec<char>,
46+
pub char_gcbs: Vec<String>,
47+
pub breaks: Vec<bool>,
48+
pub rules: Vec<String>,
49+
}
50+
51+
52+
impl FromStr for GraphemeBreakTests {
53+
type Err = ();
54+
fn from_str(string: &str) -> Result<GraphemeBreakTests, ()> {
55+
lazy_static! {
56+
static ref LINE_RE: Regex = Regex::new(
57+
r"(?xm)^\s*
58+
÷ \s+ # source begin
59+
(\w.*\w) # source captured
60+
\s+ ÷ \s* # source end
61+
\# # delimiter
62+
\s* ÷ \s+ \[0\.2\] # comment begin
63+
(.*) # comment captured
64+
\s*$"
65+
).unwrap();
66+
67+
static ref COMMENT_RE: Regex = Regex::new(
68+
r"(?x)
69+
\s+ [ \w\s<>\(\) -]+ # char name
70+
\s+ \( ( \w+ ) \) # char gcb
71+
\s+ ( [÷×] ) # break opportunity or not
72+
\s+ \[ ( [^\]]+ ) \] # rule id
73+
"
74+
).unwrap();
75+
}
76+
77+
let entries = LINE_RE
78+
.captures_iter(string)
79+
.filter_map(|line| {
80+
let source_items: Vec<&str> =
81+
line[1].split_whitespace().map(|s| s.trim()).collect();
82+
83+
let codepoints: Vec<u32> = source_items
84+
.iter()
85+
.step(2)
86+
.map(|&s| u32::from_str_radix(s, 16).expect("Bad number"))
87+
.collect();
88+
let chars: Vec<char> = codepoints
89+
.iter()
90+
.filter_map(|&u| char::from_u32(u))
91+
.collect();
92+
// Skip if any surrogate or invalid codepoints are present
93+
if codepoints.len() != chars.len() {
94+
return None;
95+
}
96+
assert_eq!(chars.len() * 2, source_items.len() + 1);
97+
98+
let breaks: Vec<bool> = source_items
99+
.iter()
100+
.dropping(1)
101+
.step(2)
102+
.map(|s| match *s {
103+
"÷" => true,
104+
"×" => false,
105+
t => panic!("Invalid token: {:?}", t),
106+
})
107+
.collect();
108+
assert_eq!(breaks.len(), chars.len() - 1);
109+
110+
let comment_items_captured = COMMENT_RE.captures_iter(&line[2]).collect::<Vec<_>>();
111+
let comment_items_mapped = comment_items_captured
112+
.iter()
113+
.map(|ref c| [&c[1], &c[2], &c[3]])
114+
.collect::<Vec<_>>();
115+
let comment_items_flattened = comment_items_mapped
116+
.iter()
117+
.flat_map(|x| x.iter())
118+
.collect::<Vec<_>>();
119+
let comment_items = &comment_items_flattened[..comment_items_flattened.len() - 2];
120+
assert_eq!(comment_items.len(), chars.len() * 3 - 2);
121+
122+
let char_gcbs: Vec<String> = comment_items
123+
.iter()
124+
.step(3)
125+
.map(|&s| s.to_string())
126+
.collect();
127+
assert_eq!(char_gcbs.len(), chars.len());
128+
129+
let rules: Vec<String> = comment_items
130+
.iter()
131+
.dropping(2)
132+
.step(3)
133+
.map(|&s| s.to_string())
134+
.collect();
135+
assert_eq!(rules.len(), chars.len() - 1);
136+
137+
Some(GraphemeBreakTest {
138+
chars,
139+
char_gcbs,
140+
breaks,
141+
rules,
142+
})
143+
})
144+
.collect();
145+
146+
Ok(GraphemeBreakTests { entries })
147+
}
148+
}

‎gen/src/source/ucd/test/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,5 @@
1010

1111

1212
pub mod normalization_test;
13+
pub mod grapheme_break_test;
14+
pub mod word_break_test;
+148
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
use std::char;
13+
use std::str::FromStr;
14+
15+
use itertools::Itertools;
16+
17+
use source::utils::read;
18+
19+
use regex::Regex;
20+
21+
22+
lazy_static! {
23+
pub static ref WORD_BREAK_TESTS: WordBreakTests = {
24+
read("data/ucd/test/WordBreakTest.txt").parse().unwrap()
25+
};
26+
}
27+
28+
29+
pub struct WordBreakTests {
30+
pub entries: Vec<WordBreakTest>,
31+
}
32+
33+
34+
/// Represents a Test Case, containing a sequence of characters and GCB for each character, and for
35+
/// each pair of adjacent chars, if they can break or not, and which rule matching the position.
36+
///
37+
/// Invariants:
38+
/// ```
39+
/// char_gcbs.len() == chars.len()
40+
/// breaks.len() == chars.len() - 1
41+
/// rules.len() == chars.len() - 1
42+
/// ```
43+
#[derive(Debug)]
44+
pub struct WordBreakTest {
45+
pub chars: Vec<char>,
46+
pub char_gcbs: Vec<String>,
47+
pub breaks: Vec<bool>,
48+
pub rules: Vec<String>,
49+
}
50+
51+
52+
impl FromStr for WordBreakTests {
53+
type Err = ();
54+
fn from_str(string: &str) -> Result<WordBreakTests, ()> {
55+
lazy_static! {
56+
static ref LINE_RE: Regex = Regex::new(
57+
r"(?xm)^\s*
58+
÷ \s+ # source begin
59+
(\w.*\w) # source captured
60+
\s+ ÷ \s* # source end
61+
\# # delimiter
62+
\s* ÷ \s+ \[0\.2\] # comment begin
63+
(.*) # comment captured
64+
\s*$"
65+
).unwrap();
66+
67+
static ref COMMENT_RE: Regex = Regex::new(
68+
r"(?x)
69+
\s+ [ \w\s<>\(\) -]+ # char name
70+
\s+ \( ( \w+ ) \) # char gcb
71+
\s+ ( [÷×] ) # break opportunity or not
72+
\s+ \[ ( [^\]]+ ) \] # rule id
73+
"
74+
).unwrap();
75+
}
76+
77+
let entries = LINE_RE
78+
.captures_iter(string)
79+
.filter_map(|line| {
80+
let source_items: Vec<&str> =
81+
line[1].split_whitespace().map(|s| s.trim()).collect();
82+
83+
let codepoints: Vec<u32> = source_items
84+
.iter()
85+
.step(2)
86+
.map(|&s| u32::from_str_radix(s, 16).expect("Bad number"))
87+
.collect();
88+
let chars: Vec<char> = codepoints
89+
.iter()
90+
.filter_map(|&u| char::from_u32(u))
91+
.collect();
92+
// Skip if any surrogate or invalid codepoints are present
93+
if codepoints.len() != chars.len() {
94+
return None;
95+
}
96+
assert_eq!(chars.len() * 2, source_items.len() + 1);
97+
98+
let breaks: Vec<bool> = source_items
99+
.iter()
100+
.dropping(1)
101+
.step(2)
102+
.map(|s| match *s {
103+
"÷" => true,
104+
"×" => false,
105+
t => panic!("Invalid token: {:?}", t),
106+
})
107+
.collect();
108+
assert_eq!(breaks.len(), chars.len() - 1);
109+
110+
let comment_items_captured = COMMENT_RE.captures_iter(&line[2]).collect::<Vec<_>>();
111+
let comment_items_mapped = comment_items_captured
112+
.iter()
113+
.map(|ref c| [&c[1], &c[2], &c[3]])
114+
.collect::<Vec<_>>();
115+
let comment_items_flattened = comment_items_mapped
116+
.iter()
117+
.flat_map(|x| x.iter())
118+
.collect::<Vec<_>>();
119+
let comment_items = &comment_items_flattened[..comment_items_flattened.len() - 2];
120+
assert_eq!(comment_items.len(), chars.len() * 3 - 2);
121+
122+
let char_gcbs: Vec<String> = comment_items
123+
.iter()
124+
.step(3)
125+
.map(|&s| s.to_string())
126+
.collect();
127+
assert_eq!(char_gcbs.len(), chars.len());
128+
129+
let rules: Vec<String> = comment_items
130+
.iter()
131+
.dropping(2)
132+
.step(3)
133+
.map(|&s| s.to_string())
134+
.collect();
135+
assert_eq!(rules.len(), chars.len() - 1);
136+
137+
Some(WordBreakTest {
138+
chars,
139+
char_gcbs,
140+
breaks,
141+
rules,
142+
})
143+
})
144+
.collect();
145+
146+
Ok(WordBreakTests { entries })
147+
}
148+
}

‎gen/src/writer/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
pub mod emoji;
1313
pub mod idna;
1414
pub mod normal;
15+
pub mod segment;
1516
pub mod ucd;
1617

1718
mod common;

‎gen/src/writer/segment/mod.rs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
mod segment_tests;
13+
14+
use writer::utils::clean_dir;
15+
16+
17+
pub fn generate() {
18+
segment_tests::generate(&clean_dir("unic/segment/tests/tables"));
19+
}
+135
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
use std::path::Path;
13+
14+
use source::ucd::test::grapheme_break_test::{GraphemeBreakTest, GRAPHEME_BREAK_TESTS};
15+
use source::ucd::test::word_break_test::{WordBreakTest, WORD_BREAK_TESTS};
16+
17+
use writer::utils::write;
18+
19+
20+
pub fn generate(dir: &Path) {
21+
emit_grapheme_cluster_break_test_data(dir);
22+
emit_word_break_test_data(dir);
23+
}
24+
25+
26+
fn str_escape(s: &str) -> String {
27+
format!(
28+
"\"{}\"",
29+
s.chars()
30+
.map(|c| c.escape_unicode().collect::<String>())
31+
.collect::<String>()
32+
)
33+
}
34+
35+
fn chars_escape(cs: &[char]) -> String {
36+
str_escape(&cs.iter().collect::<String>())
37+
}
38+
39+
40+
fn legacy_should_break(rule: &String) -> bool {
41+
const EXTENDED_ONLY_RULES: &[&str] = &["9.1", "9.2"];
42+
EXTENDED_ONLY_RULES.contains(&rule.as_str())
43+
}
44+
45+
46+
fn emit_grapheme_cluster_break_test_data(dir: &Path) {
47+
let mut contents = "&[\n".to_owned();
48+
49+
for case in GRAPHEME_BREAK_TESTS.entries.iter() {
50+
let GraphemeBreakTest {
51+
ref chars,
52+
ref breaks,
53+
ref rules,
54+
..
55+
} = *case;
56+
57+
contents.push_str(" (");
58+
59+
// Source
60+
contents.push_str(&format!("{}, ", chars_escape(&chars)));
61+
62+
// Extended Grapheme Clusters
63+
{
64+
contents.push_str("&[");
65+
let mut cluster: Vec<char> = vec![chars[0]];
66+
for (i, &brk) in breaks.iter().enumerate() {
67+
if brk {
68+
contents.push_str(&format!("{}, ", chars_escape(&cluster)));
69+
cluster.truncate(0);
70+
}
71+
cluster.push(chars[i + 1]);
72+
}
73+
contents.push_str(&format!("{}], ", chars_escape(&cluster)));
74+
}
75+
76+
// Legacy Grapheme Clusters
77+
if rules.iter().any(legacy_should_break) {
78+
contents.push_str("Some(&[");
79+
let mut cluster: Vec<char> = vec![chars[0]];
80+
for (i, &brk) in breaks.iter().enumerate() {
81+
if brk || legacy_should_break(&rules[i]) {
82+
contents.push_str(&format!("{}, ", chars_escape(&cluster)));
83+
cluster.truncate(0);
84+
}
85+
cluster.push(chars[i + 1]);
86+
}
87+
contents.push_str(&format!("{}])", chars_escape(&cluster)));
88+
} else {
89+
contents.push_str("None");
90+
}
91+
92+
contents.push_str("),\n");
93+
}
94+
95+
contents.push_str("]");
96+
97+
write(dir, "grapheme_cluster_break_test_data.rsv", &contents);
98+
}
99+
100+
fn emit_word_break_test_data(dir: &Path) {
101+
let mut contents = "&[\n".to_owned();
102+
103+
for case in WORD_BREAK_TESTS.entries.iter() {
104+
let WordBreakTest {
105+
ref chars,
106+
ref breaks,
107+
..
108+
} = *case;
109+
110+
contents.push_str(" (");
111+
112+
// Source
113+
contents.push_str(&format!("{}, ", chars_escape(&chars)));
114+
115+
// Words
116+
{
117+
contents.push_str("&[");
118+
let mut cluster: Vec<char> = vec![chars[0]];
119+
for (i, &brk) in breaks.iter().enumerate() {
120+
if brk {
121+
contents.push_str(&format!("{}, ", chars_escape(&cluster)));
122+
cluster.truncate(0);
123+
}
124+
cluster.push(chars[i + 1]);
125+
}
126+
contents.push_str(&format!("{}]", chars_escape(&cluster)));
127+
}
128+
129+
contents.push_str("),\n");
130+
}
131+
132+
contents.push_str("]");
133+
134+
write(dir, "word_break_test_data.rsv", &contents);
135+
}

‎gen/src/writer/ucd/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ mod ident;
1919
mod name;
2020
mod normal;
2121
mod segment;
22+
mod segment_tests;
2223

2324

2425
use writer::utils::clean_dir;
@@ -35,4 +36,5 @@ pub fn generate() {
3536
name::generate(&clean_dir("unic/ucd/name/tables"));
3637
normal::generate(&clean_dir("unic/ucd/normal/tables"));
3738
segment::generate(&clean_dir("unic/ucd/segment/tables"));
39+
segment_tests::generate(&clean_dir("unic/ucd/segment/tests/tables"));
3840
}

‎gen/src/writer/ucd/segment_tests.rs

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
use std::path::Path;
13+
use std::collections::BTreeMap;
14+
15+
use source::ucd::test::grapheme_break_test::{GraphemeBreakTest, GRAPHEME_BREAK_TESTS};
16+
use source::ucd::test::word_break_test::{WordBreakTest, WORD_BREAK_TESTS};
17+
18+
use writer::utils::tables::ToDirectCharTable;
19+
use writer::utils::write;
20+
21+
22+
pub fn generate(dir: &Path) {
23+
emit_grapheme_cluster_break_test_data(dir);
24+
emit_word_break_test_data(dir);
25+
}
26+
27+
28+
fn emit_grapheme_cluster_break_test_data(dir: &Path) {
29+
let mut map = BTreeMap::default();
30+
31+
for case in GRAPHEME_BREAK_TESTS.entries.iter() {
32+
let GraphemeBreakTest {
33+
ref chars,
34+
ref char_gcbs,
35+
..
36+
} = *case;
37+
38+
for (i, ch) in chars.iter().enumerate() {
39+
let ref gcb = char_gcbs[i];
40+
if map.contains_key(ch) {
41+
assert_eq!(map[ch], *gcb);
42+
} else {
43+
map.insert(*ch, gcb.clone());
44+
}
45+
}
46+
}
47+
48+
write(
49+
dir,
50+
"grapheme_cluster_break_test_data.rsv",
51+
&map.to_direct_char_table(|val, f| write!(f, "{}", val)),
52+
);
53+
}
54+
55+
fn emit_word_break_test_data(dir: &Path) {
56+
let mut map = BTreeMap::default();
57+
58+
for case in WORD_BREAK_TESTS.entries.iter() {
59+
let WordBreakTest {
60+
ref chars,
61+
ref char_gcbs,
62+
..
63+
} = *case;
64+
65+
for (i, ch) in chars.iter().enumerate() {
66+
let ref gcb = char_gcbs[i];
67+
if map.contains_key(ch) {
68+
assert_eq!(map[ch], *gcb);
69+
} else {
70+
map.insert(*ch, gcb.clone());
71+
}
72+
}
73+
}
74+
75+
write(
76+
dir,
77+
"word_break_test_data.rsv",
78+
&map.to_direct_char_table(|val, f| write!(f, "{}", val)),
79+
);
80+
}
81+
82+
/* TODO
83+
fn emit_sentence_break_test_data(dir: &Path) {
84+
let mut map = BTreeMap::default();
85+
86+
for case in SENTENCE_BREAK_TESTS.entries.iter() {
87+
let SentenceBreakTest {
88+
ref chars,
89+
ref char_gcbs,
90+
..
91+
} = *case;
92+
93+
for (i, ch) in chars.iter().enumerate() {
94+
let ref gcb = char_gcbs[i];
95+
if map.contains_key(ch) {
96+
assert_eq!(map[ch], *gcb);
97+
} else {
98+
map.insert(*ch, gcb.clone());
99+
}
100+
}
101+
}
102+
103+
write(
104+
dir,
105+
"sentence_break_test_data.rsv",
106+
&map.to_direct_char_table(|val, f| write!(f, "{}", val)),
107+
);
108+
}
109+
*/

‎unic/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ unic-bidi = { path = "bidi/", version = "0.6.0" }
2323
unic-char = { path = "char/", version = "0.6.0" }
2424
unic-idna = { path = "idna/", version = "0.6.0" }
2525
unic-normal = { path = "normal/", version = "0.6.0" }
26+
unic-segment = { path = "segment/", version = "0.6.0" }
2627
unic-ucd = { path = "ucd/", version = "0.6.0" }
2728
unic-utils = { path = "utils/", version = "0.6.0" }
2829

‎unic/segment/Cargo.toml

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[package]
2+
name = "unic-segment"
3+
version = "0.6.0"
4+
authors = ["The UNIC Project Developers"]
5+
repository = "https://github.com/behnam/rust-unic/"
6+
license = "MIT/Apache-2.0"
7+
description = "UNIC - Unicode Text Segmentation Algorithms"
8+
keywords = ["text", "unicode", "grapheme", "word", "boundary"]
9+
categories = ["internationalization", "text-processing", "parsing", "rendering"]
10+
readme = "README.md"
11+
12+
# No tests/benches that depends on /data/
13+
exclude = []
14+
15+
[badges]
16+
travis-ci = { repository = "behnam/rust-unic", branch = "master" }
17+
18+
[dependencies]
19+
unic-ucd-segment = { path = "../ucd/segment/", version = "0.6.0" }
20+
21+
[dev-dependencies]
22+
quickcheck = "0.4"
23+
unic-ucd-common = { path = "../ucd/common/", version = "0.6.0" }

‎unic/segment/README.md

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# UNIC — Unicode Text Segmentation Algorithms
2+
3+
[![Crates.io](https://img.shields.io/crates/v/unic-segment.svg)](https://crates.io/crates/unic-segment)
4+
[![Documentation](https://docs.rs/unic-segment/badge.svg)](https://docs.rs/unic-segment/)
5+
6+
This UNIC component implements algorithms from [Unicode® Standard Annex #29 -
7+
Unicode Text Segmentation](http://unicode.org/reports/tr29/), used for detecting
8+
boundaries of text elements, such as user-perceived characters (a.k.a.
9+
*Grapheme Clusters*), *Words*, and *Sentences*.
10+
11+
## Notes
12+
13+
Initial code for this component is based on
14+
[`unicode-segmentation`](https://github.com/unicode-rs/unicode-segmentation).

‎unic/segment/src/grapheme.rs

+836
Large diffs are not rendered by default.

‎unic/segment/src/lib.rs

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
//! # UNIC — Unicode Text Segmentation Algorithms
14+
//!
15+
//! A component of [`unic`: Unicode and Internationalization Crates for Rust](/unic/).
16+
//!
17+
//! This UNIC component implements algorithms from [Unicode® Standard Annex #29 -
18+
//! Unicode Text Segmentation](http://unicode.org/reports/tr29/), used for detecting
19+
//! boundaries of text elements, such as user-perceived characters (a.k.a.
20+
//! *Grapheme Clusters*), *Words*, and *Sentences* (last one not implemented yet).
21+
//!
22+
//! # Examples
23+
//!
24+
//! ```rust
25+
//! # use unic_segment::{GraphemeIndices, Graphemes, WordBoundIndices, WordBounds, Words};
26+
//! assert_eq!(
27+
//! Graphemes::new("a\u{310}e\u{301}o\u{308}\u{332}").collect::<Vec<&str>>(),
28+
//! &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]
29+
//! );
30+
//!
31+
//! assert_eq!(
32+
//! Graphemes::new("a\r\nb🇷🇺🇸🇹").collect::<Vec<&str>>(),
33+
//! &["a", "\r\n", "b", "🇷🇺", "🇸🇹"]
34+
//! );
35+
//!
36+
//! assert_eq!(
37+
//! GraphemeIndices::new("a̐éö̲\r\n").collect::<Vec<(usize, &str)>>(),
38+
//! &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]
39+
//! );
40+
//!
41+
//! fn has_alphanumeric(s: &&str) -> bool {
42+
//! s.chars().any(|ch| ch.is_alphanumeric())
43+
//! }
44+
//!
45+
//! assert_eq!(
46+
//! Words::new(
47+
//! "The quick (\"brown\") fox can't jump 32.3 feet, right?",
48+
//! has_alphanumeric,
49+
//! ).collect::<Vec<&str>>(),
50+
//! &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]
51+
//! );
52+
//!
53+
//! assert_eq!(
54+
//! WordBounds::new("The quick (\"brown\") fox").collect::<Vec<&str>>(),
55+
//! &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]
56+
//! );
57+
//!
58+
//! assert_eq!(
59+
//! WordBoundIndices::new("Brr, it's 29.3°F!").collect::<Vec<(usize, &str)>>(),
60+
//! &[
61+
//! (0, "Brr"),
62+
//! (3, ","),
63+
//! (4, " "),
64+
//! (5, "it's"),
65+
//! (9, " "),
66+
//! (10, "29.3"),
67+
//! (14, "°"),
68+
//! (16, "F"),
69+
//! (17, "!")
70+
//! ]
71+
//! );
72+
//! ```
73+
74+
#![forbid(unsafe_code, missing_docs)]
75+
76+
77+
extern crate unic_ucd_segment;
78+
79+
#[cfg(test)]
80+
extern crate unic_ucd_common;
81+
82+
83+
mod grapheme;
84+
mod word;
85+
86+
87+
pub use unic_ucd_segment::UNICODE_VERSION;
88+
89+
pub use grapheme::{GraphemeCursor, GraphemeIncomplete, GraphemeIndices, Graphemes};
90+
pub use word::{WordBoundIndices, WordBounds, Words};
91+
92+
93+
/// UNIC component version.
94+
pub const PKG_VERSION: &'static str = env!("CARGO_PKG_VERSION");
95+
96+
/// UNIC component name.
97+
pub const PKG_NAME: &'static str = env!("CARGO_PKG_NAME");
98+
99+
/// UNIC component description.
100+
pub const PKG_DESCRIPTION: &'static str = env!("CARGO_PKG_DESCRIPTION");

‎unic/segment/src/word.rs

+710
Large diffs are not rendered by default.

‎unic/segment/tests/basic_example.rs

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
extern crate unic_segment;
14+
extern crate unic_ucd_common;
15+
16+
17+
use unic_segment::{GraphemeIndices, Graphemes, WordBoundIndices, WordBounds, Words};
18+
use unic_ucd_common::is_alphanumeric;
19+
20+
21+
// Rust 1.17 won't accept closure where fn is expected
22+
fn has_alphanumeric(s: &&str) -> bool {
23+
s.chars().any(is_alphanumeric)
24+
}
25+
26+
27+
#[test]
28+
fn test_all() {
29+
assert_eq!(
30+
Graphemes::new("a\u{310}e\u{301}o\u{308}\u{332}").collect::<Vec<&str>>(),
31+
&["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]
32+
);
33+
34+
assert_eq!(
35+
Graphemes::new("a\r\nb🇷🇺🇸🇹").collect::<Vec<&str>>(),
36+
&["a", "\r\n", "b", "🇷🇺", "🇸🇹"]
37+
);
38+
39+
assert_eq!(
40+
GraphemeIndices::new("a̐éö̲\r\n").collect::<Vec<(usize, &str)>>(),
41+
&[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]
42+
);
43+
44+
assert_eq!(
45+
Words::new(
46+
"The quick (\"brown\") fox can't jump 32.3 feet, right?",
47+
has_alphanumeric,
48+
).collect::<Vec<&str>>(),
49+
&["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]
50+
);
51+
52+
assert_eq!(
53+
WordBounds::new("The quick (\"brown\") fox").collect::<Vec<&str>>(),
54+
&["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]
55+
);
56+
57+
assert_eq!(
58+
WordBoundIndices::new("Brr, it's 29.3°F!").collect::<Vec<(usize, &str)>>(),
59+
&[
60+
(0, "Brr"),
61+
(3, ","),
62+
(4, " "),
63+
(5, "it's"),
64+
(9, " "),
65+
(10, "29.3"),
66+
(14, "°"),
67+
(16, "F"),
68+
(17, "!")
69+
]
70+
);
71+
}

‎unic/segment/tests/extra_grapheme_cluster_break_test_data.rsv

+41
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎unic/segment/tests/extra_word_break_test_data.rsv

+50
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
extern crate unic_segment;
14+
15+
16+
use unic_segment::Graphemes;
17+
18+
19+
type TestData = &'static [(
20+
&'static str,
21+
&'static [&'static str],
22+
Option<&'static [&'static str]>,
23+
)];
24+
25+
26+
const TEST_DATA: TestData = include!("tables/grapheme_cluster_break_test_data.rsv");
27+
28+
/// Extra cases that the official test suite doesn't cover.
29+
const EXTRA_TEST_DATA: TestData = include!("extra_grapheme_cluster_break_test_data.rsv");
30+
31+
32+
#[test]
33+
fn test_graphemes_conformance() {
34+
let tests = TEST_DATA.iter().chain(EXTRA_TEST_DATA);
35+
for &(input, graphemes, legacy_graphemes) in tests {
36+
let legacy_graphemes = match legacy_graphemes {
37+
Some(s) => s,
38+
None => graphemes,
39+
};
40+
41+
// test forward iterator
42+
assert!(Graphemes::new(input).eq(graphemes.iter().cloned()));
43+
assert!(Graphemes::new_legacy(input).eq(legacy_graphemes.iter().cloned()));
44+
45+
// test reverse iterator
46+
assert!(
47+
Graphemes::new(input)
48+
.rev()
49+
.eq(graphemes.iter().rev().cloned())
50+
);
51+
assert!(
52+
Graphemes::new_legacy(input)
53+
.rev()
54+
.eq(legacy_graphemes.iter().rev().cloned())
55+
);
56+
}
57+
}
+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
#[macro_use]
14+
extern crate quickcheck;
15+
16+
extern crate unic_segment;
17+
18+
19+
use unic_segment::{Graphemes, WordBounds};
20+
21+
22+
// QuickCheck Graphemes
23+
quickcheck! {
24+
fn quickcheck_graphemes_new_join_vs_input(input: String) -> bool {
25+
let graphemes = Graphemes::new(&input).collect::<String>();
26+
graphemes == input
27+
}
28+
29+
fn quickcheck_graphemes_new_forward_vs_reverse(input: String) -> bool {
30+
let graphemes1 = Graphemes::new(&input).collect::<Vec<_>>();
31+
let mut graphemes2 = Graphemes::new(&input).rev().collect::<Vec<_>>();
32+
graphemes2.reverse();
33+
graphemes1 == graphemes2
34+
}
35+
36+
fn quickcheck_graphemes_new_legacy_join_vs_input(input: String) -> bool {
37+
let graphemes = Graphemes::new_legacy(&input).collect::<String>();
38+
graphemes == input
39+
}
40+
41+
fn quickcheck_graphemes_new_legacy_forward_vs_reverse(input: String) -> bool {
42+
let graphemes1 = Graphemes::new_legacy(&input).collect::<Vec<_>>();
43+
let mut graphemes2 = Graphemes::new_legacy(&input).rev().collect::<Vec<_>>();
44+
graphemes2.reverse();
45+
graphemes1 == graphemes2
46+
}
47+
}
48+
49+
// QuickCheck Words
50+
quickcheck! {
51+
fn quickcheck_words_new_join_vs_input(input: String) -> bool {
52+
let words = WordBounds::new(&input).collect::<String>();
53+
words == input
54+
}
55+
56+
fn quickcheck_words_new_forward_vs_reverse(input: String) -> bool {
57+
let words1 = WordBounds::new(&input).collect::<Vec<_>>();
58+
let mut words2 = WordBounds::new(&input).rev().collect::<Vec<_>>();
59+
words2.reverse();
60+
words1 == words2
61+
}
62+
}

‎unic/segment/tests/tables/grapheme_cluster_break_test_data.rsv

+748
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎unic/segment/tests/tables/word_break_test_data.rsv

+2,061
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// Copyright 2012-2015 The Rust Project Developers.
2+
// Copyright 2017 The UNIC Project Developers.
3+
//
4+
// See the COPYRIGHT file at the top-level directory of this distribution.
5+
//
6+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9+
// option. This file may not be copied, modified, or distributed
10+
// except according to those terms.
11+
12+
13+
extern crate unic_segment;
14+
15+
16+
use unic_segment::{WordBoundIndices, WordBounds};
17+
18+
19+
type TestData = &'static [(&'static str, &'static [&'static str])];
20+
21+
22+
const TEST_DATA: TestData = include!("tables/word_break_test_data.rsv");
23+
24+
/// Extra cases that the official test suite doesn't cover.
25+
const EXTRA_TEST_DATA: TestData = include!("extra_word_break_test_data.rsv");
26+
27+
28+
#[test]
fn test_words_conformance() {
    for &(input, words) in TEST_DATA.iter().chain(EXTRA_TEST_DATA) {
        // Collect both sides into vectors so a failing assertion prints the
        // full sequences, which makes diagnosing a mismatch much easier.
        macro_rules! check {
            ($actual:expr, $expected:expr, $name:expr) => {
                assert_eq!(
                    $actual.collect::<Vec<_>>(),
                    $expected.collect::<Vec<_>>(),
                    "{} test for testcase ({:?}, {:?}) failed.",
                    $name,
                    input,
                    words
                )
            }
        }

        // Word boundaries, walked forwards and backwards.
        check!(
            WordBounds::new(input),
            words.iter().cloned(),
            "Forward word boundaries"
        );
        check!(
            WordBounds::new(input).rev(),
            words.iter().rev().cloned(),
            "Reverse word boundaries"
        );

        // Byte offset of each word's start, derived from the cumulative
        // lengths of the expected segments (empty input yields no offsets).
        let mut offset = 0;
        let indices: Vec<usize> = words
            .iter()
            .map(|word| {
                let start = offset;
                offset += word.len();
                start
            })
            .collect();

        // Boundary-index iterator, walked forwards and backwards.
        check!(
            WordBoundIndices::new(input).map(|(pos, _)| pos),
            indices.iter().cloned(),
            "Forward word indices"
        );
        check!(
            WordBoundIndices::new(input).rev().map(|(pos, _)| pos),
            indices.iter().rev().cloned(),
            "Reverse word indices"
        );
    }
}

‎unic/src/lib.rs

+9-2
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,18 @@
1818
//! instead of importing components one-by-one, and ensuring all components
1919
//! imported are compatible in algorithms and consistent data-wise.
2020
//!
21-
//! ## Components
21+
//! ## Major Components
2222
//!
23-
//! - [`ucd`](/unic-ucd): Unicode Character Database.
23+
//! - [`char`](/unic-char): Unicode Character utilities.
24+
//!
25+
//! - [`ucd`](/unic-ucd): Unicode Character Database. (UAX\#44).
2426
//!
2527
//! - [`bidi`](/unic-bidi): Unicode Bidirectional Algorithm (UAX\#9).
2628
//!
2729
//! - [`normal`](/unic-normal): Unicode Normalization Forms (UAX\#15).
2830
//!
31+
//! - [`segment`](/unic-segment): Unicode Text Segmentation (UAX\#29).
32+
//!
2933
//! - [`idna`](/unic-idna): Unicode IDNA Compatibility Processing (UTS\#46).
3034
//!
3135
//!
@@ -116,11 +120,14 @@
116120
//! }
117121
//! ```
118122
123+
119124
pub extern crate unic_bidi as bidi;
120125
pub extern crate unic_char as char;
121126
pub extern crate unic_idna as idna;
122127
pub extern crate unic_normal as normal;
128+
pub extern crate unic_segment as segment;
123129
pub extern crate unic_ucd as ucd;
124130

131+
125132
/// The [Unicode version](http://www.unicode.org/versions/) of data
126133
pub use ucd::UNICODE_VERSION;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// Copyright 2017 The UNIC Project Developers.
2+
//
3+
// See the COPYRIGHT file at the top-level directory of this distribution.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
12+
extern crate unic_ucd_segment;
13+
extern crate unic_utils;
14+
15+
16+
use unic_utils::CharDataTable;
17+
18+
use unic_ucd_segment::grapheme_cluster_break::{self, GraphemeClusterBreak};
19+
use unic_ucd_segment::word_break::{self, WordBreak};
20+
21+
22+
#[test]
fn test_grapheme_cluster_break_conformance() {
    // The generated .rsv table refers to property values by the names these
    // glob imports bring into scope.
    use grapheme_cluster_break::abbr_names::*;
    use grapheme_cluster_break::long_names::*;

    const TEST_DATA: CharDataTable<GraphemeClusterBreak> =
        include!("tables/grapheme_cluster_break_test_data.rsv");

    // Property lookup must agree with the expected value for every range
    // in the table.
    for (range, expected) in TEST_DATA.iter() {
        assert_eq!(GraphemeClusterBreak::of(range.low), expected);
    }
}
34+
35+
#[test]
fn test_word_break_conformance() {
    // The generated .rsv table refers to property values by the names these
    // glob imports bring into scope.
    use word_break::abbr_names::*;
    use word_break::long_names::*;
    // The test data file uses some unexpected names for some values
    use word_break::long_names::{Extend as Extend_FE, Format as Format_FE, ZWJ as ZWJ_FE};

    const TEST_DATA: CharDataTable<WordBreak> = include!("tables/word_break_test_data.rsv");

    // Property lookup must agree with the expected value for every range in
    // the table. (Loop binding renamed from `gcb` — a copy-paste leftover
    // from the grapheme-cluster-break test — to `wb` for Word_Break.)
    for (ch, wb) in TEST_DATA.iter() {
        assert_eq!(WordBreak::of(ch.low), wb);
    }
}

‎unic/ucd/segment/tests/tables/grapheme_cluster_break_test_data.rsv

+30
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎unic/ucd/segment/tests/tables/word_break_test_data.rsv

+36
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎unic/ucd/segment/tests/unicode_version_tests.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
// except according to those terms.
1010

1111

12-
extern crate unic_ucd_segment;
1312
extern crate unic_ucd_core;
13+
extern crate unic_ucd_segment;
1414

1515

1616
#[test]

0 commit comments

Comments
 (0)
Please sign in to comment.