Skip to content

Commit ae83044

Browse files
nicholasbishop authored and phip1611 committed
Add ucs2_cstr macro
1 parent 7b1f04e commit ae83044

File tree

4 files changed

+144
-1
lines changed

4 files changed

+144
-1
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
* Impl `Display`, `Eq`, `PartialEq`, `Ord`, `PartialOrd`, and `Hash` for
44
the `Error` type.
55
* Switch to the 2021 edition.
6+
* Add `ucs2_cstr!` macro.

src/lib.rs

+7
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@
44
#![deny(missing_docs)]
55
#![deny(clippy::all)]
66

7+
mod macros;
8+
9+
/// These need to be public for the `ucs2_cstr!` macro, but are not
10+
/// intended to be called directly.
11+
#[doc(hidden)]
12+
pub use macros::{str_num_ucs2_chars, str_to_ucs2};
13+
714
use bit_field::BitField;
815
use core::fmt::{self, Display, Formatter};
916

src/macros.rs

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
use crate::{ucs2_from_utf8_at_offset, Error};
2+
3+
/// Count the number of UCS-2 characters in a string. Return an error if
4+
/// the string cannot be encoded in UCS-2.
5+
pub const fn str_num_ucs2_chars(s: &str) -> Result<usize, Error> {
6+
let bytes = s.as_bytes();
7+
let len = bytes.len();
8+
9+
let mut offset = 0;
10+
let mut num_ucs2_chars = 0;
11+
12+
while offset < len {
13+
// SAFETY: `bytes` is valid UTF-8.
14+
match unsafe { ucs2_from_utf8_at_offset(bytes, offset) } {
15+
Ok(ch) => {
16+
offset += ch.num_bytes as usize;
17+
num_ucs2_chars += 1;
18+
}
19+
Err(err) => {
20+
return Err(err);
21+
}
22+
}
23+
}
24+
25+
Ok(num_ucs2_chars)
26+
}
27+
28+
/// Convert a `str` into a null-terminated UCS-2 character array.
29+
pub const fn str_to_ucs2<const N: usize>(s: &str) -> Result<[u16; N], Error> {
30+
let bytes = s.as_bytes();
31+
let len = bytes.len();
32+
33+
let mut output = [0; N];
34+
35+
let mut output_offset = 0;
36+
let mut input_offset = 0;
37+
while input_offset < len {
38+
// SAFETY: `bytes` is valid UTF-8.
39+
match unsafe { ucs2_from_utf8_at_offset(bytes, input_offset) } {
40+
Ok(ch) => {
41+
if ch.val == 0 {
42+
panic!("interior null character");
43+
} else {
44+
output[output_offset] = ch.val;
45+
output_offset += 1;
46+
input_offset += ch.num_bytes as usize;
47+
}
48+
}
49+
Err(err) => {
50+
return Err(err);
51+
}
52+
}
53+
}
54+
55+
// The output array must be one bigger than the converted string,
56+
// to leave room for the trailing null character.
57+
if output_offset + 1 != N {
58+
panic!("incorrect array length");
59+
}
60+
61+
Ok(output)
62+
}
63+
64+
/// Build a null-terminated UCS-2 array from a string literal.
///
/// All of the work happens during compilation, which means the macro
/// can initialize a `const` item. The macro evaluates to a `[u16; N]`
/// array. If you would rather not spell out `N` in the type of a
/// `const` item, borrow the result and declare the item as `&[u16]`.
///
/// Compilation fails if the literal contains a character that cannot
/// be represented in UCS-2.
///
/// # Example
///
/// ```
/// use ucs2::ucs2_cstr;
///
/// const S: &[u16] = &ucs2_cstr!("abc");
/// assert_eq!(S, [97, 98, 99, 0]);
/// ```
#[macro_export]
macro_rules! ucs2_cstr {
    ($s:literal) => {{
        // Both steps are `const` items so that any failure is reported
        // while compiling rather than when the program runs.

        // Array length: one slot per character plus one for the null.
        const LEN_WITH_NULL: usize = match $crate::str_num_ucs2_chars($s) {
            Ok(num_chars) => num_chars + 1,
            Err(_) => panic!("input contains a character which cannot be represented in UCS-2"),
        };

        // `str_num_ucs2_chars` has already accepted the string, so the
        // conversion cannot report an error here.
        const ENCODED: [u16; LEN_WITH_NULL] = match $crate::str_to_ucs2($s) {
            Ok(encoded) => encoded,
            Err(_) => unreachable!(),
        };

        ENCODED
    }};
}
102+
103+
#[cfg(test)]
mod tests {
    use super::*;

    /// Character counting for inputs of each UTF-8 encoded width.
    #[test]
    fn test_str_num_chars() {
        // Some of the strings here are from https://www.kermitproject.org/utf8.html.
        let cases: [(&str, Result<usize, Error>); 4] = [
            // One-byte chars.
            ("abc", Ok(3)),
            // Two-byte chars.
            ("Τη γλώσσα μου έδωσαν ελληνική", Ok(29)),
            // Three-byte chars.
            ("ვეპხის ტყაოსანი შოთა რუსთაველი", Ok(30)),
            // Four-byte chars.
            ("😎🔥", Err(Error::MultiByte)),
        ];

        for (input, expected) in cases {
            assert_eq!(str_num_ucs2_chars(input), expected);
        }
    }

    /// The macro yields the encoded characters followed by a null.
    #[test]
    fn test_ucs2_cstr() {
        let encoded = ucs2_cstr!("abc");
        assert_eq!(encoded, [97, 98, 99, 0]);
    }
}

tests/tests.rs

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use ucs2::{decode, decode_with, encode, Error};
1+
use ucs2::{decode, decode_with, encode, ucs2_cstr, Error};
22

33
#[test]
44
fn encoding() {
@@ -64,3 +64,12 @@ fn decoding_with() {
6464
assert_eq!(result.unwrap(), 9);
6565
assert_eq!(core::str::from_utf8(&u8_buffer[0..9]), Ok("$¢ह한"));
6666
}
67+
68+
#[test]
fn test_macro() {
    // Array form: the length, including the trailing null, is written
    // out in the type of the `const` item.
    const FIXED: [u16; 4] = ucs2_cstr!("abc");
    assert_eq!(FIXED, [97, 98, 99, 0]);

    // Slice form: borrowing the result avoids naming the length.
    const SLICE: &[u16] = &ucs2_cstr!("$¢ह한");
    assert_eq!(SLICE, [36, 162, 2361, 54620, 0]);
}

0 commit comments

Comments
 (0)