Skip to content

Commit 0035f14

Browse files
committed
Improving the C target's UTF-8 conversion.
1 parent 939c3cf commit 0035f14

File tree

3 files changed

+90
-90
lines changed

3 files changed

+90
-90
lines changed

src/parse.c

+30-30
Original file line numberDiff line numberDiff line change
@@ -16568,48 +16568,48 @@ UNICC_STATIC int _alloc_stack( _pcb* pcb )
1656816568
#if UNICC_UTF8
1656916569
UNICC_STATIC UNICC_CHAR _get_char( _pcb* pcb )
1657016570
{
16571-
unsigned char byte[ 4 ];
16571+
unsigned char first = UNICC_GETCHAR( pcb );
1657216572

16573-
// Read the first byte
16574-
byte[0] = UNICC_GETCHAR( pcb );
16575-
16576-
if ((byte[0] & 0x80) == 0)
16573+
if ((first & 0x80) == 0)
1657716574
{
1657816575
// Single-byte ASCII character
16579-
return byte[0];
16576+
return first;
1658016577
}
16581-
else if ((byte[0] & 0xE0) == 0xC0)
16578+
else if ((first & 0xE0) == 0xC0)
1658216579
{
1658316580
// Two-byte sequence (110xxxxx 10xxxxxx)
16584-
byte[1] = UNICC_GETCHAR( pcb );
16585-
return (
16586-
(byte[0] & 0x1F) << 6)
16587-
| (byte[1] & 0x3F
16588-
);
16581+
unsigned char second = UNICC_GETCHAR( pcb );
16582+
return ((first & 0x1F) << 6) | (second & 0x3F);
1658916583
}
16590-
else if ((byte[0] & 0xF0) == 0xE0)
16584+
else if ((first & 0xF0) == 0xE0)
1659116585
{
1659216586
// Three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
16593-
byte[1] = UNICC_GETCHAR( pcb );
16594-
byte[2] = UNICC_GETCHAR( pcb );
16595-
return (
16596-
(byte[0] & 0x0F) << 12)
16597-
| ((byte[1] & 0x3F) << 6)
16598-
| (byte[3] & 0x3F
16599-
);
16587+
unsigned char bytes[2];
16588+
16589+
bytes[0] = UNICC_GETCHAR( pcb );
16590+
bytes[1] = UNICC_GETCHAR( pcb );
16591+
16592+
return
16593+
((first & 0x0F) << 12)
16594+
| ((bytes[0] & 0x3F) << 6)
16595+
| (bytes[1] & 0x3F)
16596+
;
1660016597
}
16601-
else if ((byte[0] & 0xF8) == 0xF0)
16598+
else if ((first & 0xF8) == 0xF0)
1660216599
{
1660316600
// Four-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
16604-
byte[1] = UNICC_GETCHAR( pcb );
16605-
byte[2] = UNICC_GETCHAR( pcb );
16606-
byte[3] = UNICC_GETCHAR( pcb );
16607-
return (
16608-
(byte[0] & 0x07) << 18)
16609-
| ((byte[1] & 0x3F) << 12)
16610-
| ((byte[2] & 0x3F) << 6)
16611-
| (byte[3] & 0x3F
16612-
);
16601+
unsigned char bytes[3];
16602+
16603+
bytes[0] = UNICC_GETCHAR( pcb );
16604+
bytes[1] = UNICC_GETCHAR( pcb );
16605+
bytes[2] = UNICC_GETCHAR( pcb );
16606+
16607+
return
16608+
((first & 0x07) << 18)
16609+
| ((bytes[0] & 0x3F) << 12)
16610+
| ((bytes[1] & 0x3F) << 6)
16611+
| (bytes[2] & 0x3F)
16612+
;
1661316613
}
1661416614

1661516615
return -1; // Invalid UTF-8 sequence

targets/C.source/fn.getinput.c

+30-30
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,48 @@
11
#if UNICC_UTF8
22
UNICC_STATIC UNICC_CHAR _get_char( _pcb* pcb )
33
{
4-
unsigned char byte[ 4 ];
4+
unsigned char first = UNICC_GETCHAR( pcb );
55

6-
// Read the first byte
7-
byte[0] = UNICC_GETCHAR( pcb );
8-
9-
if ((byte[0] & 0x80) == 0)
6+
if ((first & 0x80) == 0)
107
{
118
// Single-byte ASCII character
12-
return byte[0];
9+
return first;
1310
}
14-
else if ((byte[0] & 0xE0) == 0xC0)
11+
else if ((first & 0xE0) == 0xC0)
1512
{
1613
// Two-byte sequence (110xxxxx 10xxxxxx)
17-
byte[1] = UNICC_GETCHAR( pcb );
18-
return (
19-
(byte[0] & 0x1F) << 6)
20-
| (byte[1] & 0x3F
21-
);
14+
unsigned char second = UNICC_GETCHAR( pcb );
15+
return ((first & 0x1F) << 6) | (second & 0x3F);
2216
}
23-
else if ((byte[0] & 0xF0) == 0xE0)
17+
else if ((first & 0xF0) == 0xE0)
2418
{
2519
// Three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
26-
byte[1] = UNICC_GETCHAR( pcb );
27-
byte[2] = UNICC_GETCHAR( pcb );
28-
return (
29-
(byte[0] & 0x0F) << 12)
30-
| ((byte[1] & 0x3F) << 6)
31-
| (byte[3] & 0x3F
32-
);
20+
unsigned char bytes[2];
21+
22+
bytes[0] = UNICC_GETCHAR( pcb );
23+
bytes[1] = UNICC_GETCHAR( pcb );
24+
25+
return
26+
((first & 0x0F) << 12)
27+
| ((bytes[0] & 0x3F) << 6)
28+
| (bytes[1] & 0x3F)
29+
;
3330
}
34-
else if ((byte[0] & 0xF8) == 0xF0)
31+
else if ((first & 0xF8) == 0xF0)
3532
{
3633
// Four-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
37-
byte[1] = UNICC_GETCHAR( pcb );
38-
byte[2] = UNICC_GETCHAR( pcb );
39-
byte[3] = UNICC_GETCHAR( pcb );
40-
return (
41-
(byte[0] & 0x07) << 18)
42-
| ((byte[1] & 0x3F) << 12)
43-
| ((byte[2] & 0x3F) << 6)
44-
| (byte[3] & 0x3F
45-
);
34+
unsigned char bytes[3];
35+
36+
bytes[0] = UNICC_GETCHAR( pcb );
37+
bytes[1] = UNICC_GETCHAR( pcb );
38+
bytes[2] = UNICC_GETCHAR( pcb );
39+
40+
return
41+
((first & 0x07) << 18)
42+
| ((bytes[0] & 0x3F) << 12)
43+
| ((bytes[1] & 0x3F) << 6)
44+
| (bytes[2] & 0x3F)
45+
;
4646
}
4747

4848
return -1; // Invalid UTF-8 sequence

targets/c.tlt

+30-30
Original file line numberDiff line numberDiff line change
@@ -1026,48 +1026,48 @@ UNICC_STATIC int @@prefix_alloc_stack( @@prefix_pcb* pcb )
10261026
#if UNICC_UTF8
10271027
UNICC_STATIC UNICC_CHAR _get_char( _pcb* pcb )
10281028
{
1029-
unsigned char byte[ 4 ];
1029+
unsigned char first = UNICC_GETCHAR( pcb );
10301030

1031-
// Read the first byte
1032-
byte[0] = UNICC_GETCHAR( pcb );
1033-
1034-
if ((byte[0] &amp; 0x80) == 0)
1031+
if ((first &amp; 0x80) == 0)
10351032
{
10361033
// Single-byte ASCII character
1037-
return byte[0];
1034+
return first;
10381035
}
1039-
else if ((byte[0] &amp; 0xE0) == 0xC0)
1036+
else if ((first &amp; 0xE0) == 0xC0)
10401037
{
10411038
// Two-byte sequence (110xxxxx 10xxxxxx)
1042-
byte[1] = UNICC_GETCHAR( pcb );
1043-
return (
1044-
(byte[0] &amp; 0x1F) &lt;&lt; 6)
1045-
| (byte[1] &amp; 0x3F
1046-
);
1039+
unsigned char second = UNICC_GETCHAR( pcb );
1040+
return ((first &amp; 0x1F) &lt;&lt; 6) | (second &amp; 0x3F);
10471041
}
1048-
else if ((byte[0] &amp; 0xF0) == 0xE0)
1042+
else if ((first &amp; 0xF0) == 0xE0)
10491043
{
10501044
// Three-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
1051-
byte[1] = UNICC_GETCHAR( pcb );
1052-
byte[2] = UNICC_GETCHAR( pcb );
1053-
return (
1054-
(byte[0] &amp; 0x0F) &lt;&lt; 12)
1055-
| ((byte[1] &amp; 0x3F) &lt;&lt; 6)
1056-
| (byte[3] &amp; 0x3F
1057-
);
1045+
unsigned char bytes[2];
1046+
1047+
bytes[0] = UNICC_GETCHAR( pcb );
1048+
bytes[1] = UNICC_GETCHAR( pcb );
1049+
1050+
return
1051+
((first &amp; 0x0F) &lt;&lt; 12)
1052+
| ((bytes[0] &amp; 0x3F) &lt;&lt; 6)
1053+
| (bytes[1] &amp; 0x3F)
1054+
;
10581055
}
1059-
else if ((byte[0] &amp; 0xF8) == 0xF0)
1056+
else if ((first &amp; 0xF8) == 0xF0)
10601057
{
10611058
// Four-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
1062-
byte[1] = UNICC_GETCHAR( pcb );
1063-
byte[2] = UNICC_GETCHAR( pcb );
1064-
byte[3] = UNICC_GETCHAR( pcb );
1065-
return (
1066-
(byte[0] &amp; 0x07) &lt;&lt; 18)
1067-
| ((byte[1] &amp; 0x3F) &lt;&lt; 12)
1068-
| ((byte[2] &amp; 0x3F) &lt;&lt; 6)
1069-
| (byte[3] &amp; 0x3F
1070-
);
1059+
unsigned char bytes[3];
1060+
1061+
bytes[0] = UNICC_GETCHAR( pcb );
1062+
bytes[1] = UNICC_GETCHAR( pcb );
1063+
bytes[2] = UNICC_GETCHAR( pcb );
1064+
1065+
return
1066+
((first &amp; 0x07) &lt;&lt; 18)
1067+
| ((bytes[0] &amp; 0x3F) &lt;&lt; 12)
1068+
| ((bytes[1] &amp; 0x3F) &lt;&lt; 6)
1069+
| (bytes[2] &amp; 0x3F)
1070+
;
10711071
}
10721072

10731073
return -1; // Invalid UTF-8 sequence

0 commit comments

Comments
 (0)