@@ -29,6 +29,66 @@ impl Display for Error {
29
29
30
30
type Result < T > = core:: result:: Result < T , Error > ;
31
31
32
/// Outcome of decoding a single character with `ucs2_from_utf8_at_offset`.
struct Ucs2CharFromUtf8 {
    /// The decoded character as one UCS-2 code unit.
    val: u16,

    /// How many bytes the character occupies in its UTF-8 encoding.
    num_bytes: u8,
}
39
+
40
+ /// Get a UCS-2 character from a UTF-8 byte slice at the given offset.
41
+ ///
42
+ /// # Safety
43
+ ///
44
+ /// The input `bytes` must be valid UTF-8.
45
+ const unsafe fn ucs2_from_utf8_at_offset ( bytes : & [ u8 ] , offset : usize ) -> Result < Ucs2CharFromUtf8 > {
46
+ let len = bytes. len ( ) ;
47
+ let ch;
48
+ let ch_len;
49
+
50
+ if bytes[ offset] & 0b1000_0000 == 0b0000_0000 {
51
+ ch = bytes[ offset] as u16 ;
52
+ ch_len = 1 ;
53
+ } else if bytes[ offset] & 0b1110_0000 == 0b1100_0000 {
54
+ // 2 byte codepoint
55
+ if offset + 1 >= len {
56
+ // safe: len is the length of bytes,
57
+ // and bytes is a direct view into the
58
+ // buffer of input, which in order to be a valid
59
+ // utf-8 string _must_ contain `i + 1`.
60
+ unsafe { core:: hint:: unreachable_unchecked ( ) }
61
+ }
62
+
63
+ let a = ( bytes[ offset] & 0b0001_1111 ) as u16 ;
64
+ let b = ( bytes[ offset + 1 ] & 0b0011_1111 ) as u16 ;
65
+ ch = a << 6 | b;
66
+ ch_len = 2 ;
67
+ } else if bytes[ offset] & 0b1111_0000 == 0b1110_0000 {
68
+ // 3 byte codepoint
69
+ if offset + 2 >= len || offset + 1 >= len {
70
+ // safe: impossible utf-8 string.
71
+ unsafe { core:: hint:: unreachable_unchecked ( ) }
72
+ }
73
+
74
+ let a = ( bytes[ offset] & 0b0000_1111 ) as u16 ;
75
+ let b = ( bytes[ offset + 1 ] & 0b0011_1111 ) as u16 ;
76
+ let c = ( bytes[ offset + 2 ] & 0b0011_1111 ) as u16 ;
77
+ ch = a << 12 | b << 6 | c;
78
+ ch_len = 3 ;
79
+ } else if bytes[ offset] & 0b1111_0000 == 0b1111_0000 {
80
+ return Err ( Error :: MultiByte ) ; // UTF-16
81
+ } else {
82
+ // safe: impossible utf-8 string.
83
+ unsafe { core:: hint:: unreachable_unchecked ( ) }
84
+ }
85
+
86
+ Ok ( Ucs2CharFromUtf8 {
87
+ val : ch,
88
+ num_bytes : ch_len,
89
+ } )
90
+ }
91
+
32
92
/// Encodes an input UTF-8 string into a UCS-2 string.
33
93
///
34
94
/// The returned `usize` represents the length of the returned buffer,
@@ -62,44 +122,10 @@ where
62
122
let mut i = 0 ;
63
123
64
124
while i < len {
65
- let ch;
66
-
67
- if bytes[ i] & 0b1000_0000 == 0b0000_0000 {
68
- ch = u16:: from ( bytes[ i] ) ;
69
- i += 1 ;
70
- } else if bytes[ i] & 0b1110_0000 == 0b1100_0000 {
71
- // 2 byte codepoint
72
- if i + 1 >= len {
73
- // safe: len is the length of bytes,
74
- // and bytes is a direct view into the
75
- // buffer of input, which in order to be a valid
76
- // utf-8 string _must_ contain `i + 1`.
77
- unsafe { core:: hint:: unreachable_unchecked ( ) }
78
- }
79
-
80
- let a = u16:: from ( bytes[ i] & 0b0001_1111 ) ;
81
- let b = u16:: from ( bytes[ i + 1 ] & 0b0011_1111 ) ;
82
- ch = a << 6 | b;
83
- i += 2 ;
84
- } else if bytes[ i] & 0b1111_0000 == 0b1110_0000 {
85
- // 3 byte codepoint
86
- if i + 2 >= len || i + 1 >= len {
87
- // safe: impossible utf-8 string.
88
- unsafe { core:: hint:: unreachable_unchecked ( ) }
89
- }
90
-
91
- let a = u16:: from ( bytes[ i] & 0b0000_1111 ) ;
92
- let b = u16:: from ( bytes[ i + 1 ] & 0b0011_1111 ) ;
93
- let c = u16:: from ( bytes[ i + 2 ] & 0b0011_1111 ) ;
94
- ch = a << 12 | b << 6 | c;
95
- i += 3 ;
96
- } else if bytes[ i] & 0b1111_0000 == 0b1111_0000 {
97
- return Err ( Error :: MultiByte ) ; // UTF-16
98
- } else {
99
- // safe: impossible utf-8 string.
100
- unsafe { core:: hint:: unreachable_unchecked ( ) }
101
- }
102
- output ( ch) ?;
125
+ // SAFETY: `bytes` is valid UTF-8.
126
+ let ch = unsafe { ucs2_from_utf8_at_offset ( bytes, i) } ?;
127
+ i += usize:: from ( ch. num_bytes ) ;
128
+ output ( ch. val ) ?;
103
129
}
104
130
Ok ( ( ) )
105
131
}
0 commit comments