Skip to content

Commit 2d78a4d

Browse files
authored
Merge pull request #549 from WordPress/feature/idnaencoder-improve-tests
2 parents ac4b336 + 4f76f5b commit 2d78a4d

File tree

2 files changed

+174
-61
lines changed

2 files changed

+174
-61
lines changed

src/IdnaEncoder.php

+6
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,17 @@ public static function to_ascii($text) {
9292
// Step 4: Check if it's ASCII now
9393
if (self::is_ascii($text)) {
9494
// Skip to step 7
95+
/*
96+
* As the `nameprep()` method returns the original string, this code will never be reached until
97+
* that method is properly implemented.
98+
*/
99+
// @codeCoverageIgnoreStart
95100
if (strlen($text) < self::MAX_LENGTH) {
96101
return $text;
97102
}
98103

99104
throw new Exception('Prepared string is too long', 'idna.prepared_too_long', $text);
105+
// @codeCoverageIgnoreEnd
100106
}
101107

102108
// Step 5: Check ACE prefix

tests/IdnaEncoderTest.php

+168-61
Original file line numberDiff line numberDiff line change
@@ -4,97 +4,204 @@
44

55
use WpOrg\Requests\Exception;
66
use WpOrg\Requests\IdnaEncoder;
7+
use WpOrg\Requests\Tests\Fixtures\StringableObject;
78
use WpOrg\Requests\Tests\TestCase;
89

10+
/**
11+
* @covers \WpOrg\Requests\IdnaEncoder
12+
*/
913
final class IdnaEncoderTest extends TestCase {
10-
public static function specExamples() {
14+
15+
/**
16+
* Tests encoding a hostname using Punycode.
17+
*
18+
* @dataProvider dataEncoding
19+
*
20+
* @param string $data Data to encode.
21+
* @param string $expected Expected function output.
22+
*
23+
* @return void
24+
*/
25+
public function testEncoding($data, $expected) {
26+
$result = IdnaEncoder::encode($data);
27+
$this->assertSame($expected, $result);
28+
}
29+
30+
/**
31+
* Data Provider.
32+
*
33+
* @return array
34+
*/
35+
public function dataEncoding() {
1136
return array(
12-
array(
13-
"\xe4\xbb\x96\xe4\xbb\xac\xe4\xb8\xba\xe4\xbb\x80\xe4\xb9\x88\xe4\xb8\x8d\xe8\xaf\xb4\xe4\xb8\xad\xe6\x96\x87",
14-
'xn--ihqwcrb4cv8a8dqg056pqjye',
37+
'empty string' => array(
38+
'data' => '',
39+
'expected' => '',
40+
),
41+
'ascii character' => array(
42+
'data' => 'a',
43+
'expected' => 'a',
44+
),
45+
'two-byte character' => array(
46+
'data' => "\xc2\xb6", // Pilcrow character
47+
'expected' => 'xn--tba',
48+
),
49+
'three-byte character' => array(
50+
'data' => "\xe2\x82\xac", // Euro symbol
51+
'expected' => 'xn--lzg',
52+
),
53+
'four-byte character' => array(
54+
'data' => "\xf0\xa4\xad\xa2", // U+24B62, a CJK ideograph (CJK Unified Ideographs Extension B)
55+
'expected' => 'xn--ww6j',
56+
),
57+
58+
'stringable object' => array(
59+
'data' => new StringableObject("\xc2\xb6"),
60+
'expected' => 'xn--tba',
61+
),
62+
63+
/*
64+
* Examples taken from RFC: https://datatracker.ietf.org/doc/html/rfc3492#section-7
65+
*
66+
* The test data was obtained by converting each example to hex using https://r12a.github.io/uniview/
67+
* - Paste the unicode sequence.
68+
* - Use the "Remove all spaces" option.
69+
* - Use the "Send to Unicode Converter Tool" option.
70+
* - In the tool, copy the UTF-8 byte sequence, lowercase it, and prefix each byte (pair of hex digits) with `\x`.
71+
*/
72+
'example from specs: RFC3492, section 7.1-A: Arabic' => array(
73+
'data' => "\xd9\x84\xd9\x8a\xd9\x87\xd9\x85\xd8\xa7\xd8\xa8\xd8\xaa\xd9\x83\xd9\x84\xd9\x85\xd9\x88\xd8\xb4\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a\xd8\x9f",
74+
'expected' => 'xn--egbpdaj6bu4bxfgehfvwxn',
75+
),
76+
'example from specs: RFC3492, section 7.1-B: Simplified Chinese' => array(
77+
'data' => "\xe4\xbb\x96\xe4\xbb\xac\xe4\xb8\xba\xe4\xbb\x80\xe4\xb9\x88\xe4\xb8\x8d\xe8\xaf\xb4\xe4\xb8\xad\xe6\x96\x87",
78+
'expected' => 'xn--ihqwcrb4cv8a8dqg056pqjye',
79+
),
80+
'example from specs: RFC3492, section 7.1-C: Traditional Chinese' => array(
81+
'data' => "\xe4\xbb\x96\xe5\x80\x91\xe7\x88\xb2\xe4\xbb\x80\xe9\xba\xbd\xe4\xb8\x8d\xe8\xaa\xaa\xe4\xb8\xad\xe6\x96\x87",
82+
'expected' => 'xn--ihqwctvzc91f659drss3x8bo0yb',
1583
),
16-
array(
17-
"\x33\xe5\xb9\xb4\x42\xe7\xb5\x84\xe9\x87\x91\xe5\x85\xab\xe5\x85\x88\xe7\x94\x9f",
18-
'xn--3B-ww4c5e180e575a65lsy2b',
84+
'example from specs: RFC3492, section 7.1-D: Czech' => array(
85+
'data' => "\x50\x72\x6f\xc4\x8d\x70\x72\x6f\x73\x74\xc4\x9b\x6e\x65\x6d\x6c\x75\x76\xc3\xad\xc4\x8d\x65\x73\x6b\x79",
86+
'expected' => 'xn--Proprostnemluvesky-uyb24dma41a',
87+
),
88+
'example from specs: RFC3492, section 7.1-E: Hebrew' => array(
89+
'data' => "\xd7\x9c\xd7\x9e\xd7\x94\xd7\x94\xd7\x9d\xd7\xa4\xd7\xa9\xd7\x95\xd7\x98\xd7\x9c\xd7\x90\xd7\x9e\xd7\x93\xd7\x91\xd7\xa8\xd7\x99\xd7\x9d\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa",
90+
'expected' => 'xn--4dbcagdahymbxekheh6e0a7fei0b',
91+
),
92+
'example from specs: RFC3492, section 7.1-F: Hindi (Devanagari)' => array(
93+
'data' => "\xe0\xa4\xaf\xe0\xa4\xb9\xe0\xa4\xb2\xe0\xa5\x8b\xe0\xa4\x97\xe0\xa4\xb9\xe0\xa4\xbf\xe0\xa4\xa8\xe0\xa5\x8d\xe0\xa4\xa6\xe0\xa5\x80\xe0\xa4\x95\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa5\x8b\xe0\xa4\x82\xe0\xa4\xa8\xe0\xa4\xb9\xe0\xa5\x80\xe0\xa4\x82\xe0\xa4\xac\xe0\xa5\x8b\xe0\xa4\xb2\xe0\xa4\xb8\xe0\xa4\x95\xe0\xa4\xa4\xe0\xa5\x87\xe0\xa4\xb9\xe0\xa5\x88\xe0\xa4\x82",
94+
'expected' => 'xn--i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd',
95+
),
96+
'example from specs: RFC3492, section 7.1-G: Japanese (Kanji and hiragana)' => array(
97+
'data' => "\xe3\x81\xaa\xe3\x81\x9c\xe3\x81\xbf\xe3\x82\x93\xe3\x81\xaa\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x82\x92\xe8\xa9\xb1\xe3\x81\x97\xe3\x81\xa6\xe3\x81\x8f\xe3\x82\x8c\xe3\x81\xaa\xe3\x81\x84\xe3\x81\xae\xe3\x81\x8b",
98+
'expected' => 'xn--n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa',
99+
),
100+
/* Does not validate - output too long.
101+
'example from specs: RFC3492, section 7.1-H: Korean (Hangul)' => array(
102+
'data' => "\xec\x84\xb8\xea\xb3\x84\xec\x9d\x98\xeb\xaa\xa8\xeb\x93\xa0\xec\x82\xac\xeb\x9e\x8c\xeb\x93\xa4\xec\x9d\xb4\xed\x95\x9c\xea\xb5\xad\xec\x96\xb4\xeb\xa5\xbc\xec\x9d\xb4\xed\x95\xb4\xed\x95\x9c\xeb\x8b\xa4\xeb\xa9\xb4\xec\x96\xbc\xeb\xa7\x88\xeb\x82\x98\xec\xa2\x8b\xec\x9d\x84\xea\xb9\x8c",
103+
'expected' => 'xn--989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c',
104+
),
105+
*/
106+
'example from specs: RFC3492, section 7.1-I: Russian (Cyrillic)' => array(
107+
'data' => "\xd0\xbf\xd0\xbe\xd1\x87\xd0\xb5\xd0\xbc\xd1\x83\xd0\xb6\xd0\xb5\xd0\xbe\xd0\xbd\xd0\xb8\xd0\xbd\xd0\xb5\xd0\xb3\xd0\xbe\xd0\xb2\xd0\xbe\xd1\x80\xd1\x8f\xd1\x82\xd0\xbf\xd0\xbe\xd1\x80\xd1\x83\xd1\x81\xd1\x81\xd0\xba\xd0\xb8",
108+
// According to the RFC, the `d` in `dot` should be uppercase. Needs double-checking: this is either a typo in the RFC or a bug in the encoder.
109+
'expected' => 'xn--b1abfaaepdrnnbgefbadotcwatmq2g4l',
110+
),
111+
'example from specs: RFC3492, section 7.1-J: Spanish' => array(
112+
'data' => "\x50\x6f\x72\x71\x75\xc3\xa9\x6e\x6f\x70\x75\x65\x64\x65\x6e\x73\x69\x6d\x70\x6c\x65\x6d\x65\x6e\x74\x65\x68\x61\x62\x6c\x61\x72\x65\x6e\x45\x73\x70\x61\xc3\xb1\x6f\x6c",
113+
'expected' => 'xn--PorqunopuedensimplementehablarenEspaol-fmd56a',
114+
),
115+
'example from specs: RFC3492, section 7.1-K: Vietnamese' => array(
116+
'data' => "\x54\xe1\xba\xa1\x69\x73\x61\x6f\x68\xe1\xbb\x8d\x6b\x68\xc3\xb4\x6e\x67\x74\x68\xe1\xbb\x83\x63\x68\xe1\xbb\x89\x6e\xc3\xb3\x69\x74\x69\xe1\xba\xbf\x6e\x67\x56\x69\xe1\xbb\x87\x74",
117+
'expected' => 'xn--TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g',
118+
),
119+
'example from specs: RFC3492, section 7.1-L: Japanese artist' => array(
120+
'data' => "\x33\xe5\xb9\xb4\x42\xe7\xb5\x84\xe9\x87\x91\xe5\x85\xab\xe5\x85\x88\xe7\x94\x9f",
121+
'expected' => 'xn--3B-ww4c5e180e575a65lsy2b',
122+
),
123+
'example from specs: RFC3492, section 7.1-M: Japanese artist' => array(
124+
'data' => "\xe5\xae\x89\xe5\xae\xa4\xe5\xa5\x88\xe7\xbe\x8e\xe6\x81\xb5\x2d\x77\x69\x74\x68\x2d\x53\x55\x50\x45\x52\x2d\x4d\x4f\x4e\x4b\x45\x59\x53",
125+
'expected' => 'xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n',
126+
),
127+
'example from specs: RFC3492, section 7.1-N: Japanese artist' => array(
128+
'data' => "\x48\x65\x6c\x6c\x6f\x2d\x41\x6e\x6f\x74\x68\x65\x72\x2d\x57\x61\x79\x2d\xe3\x81\x9d\xe3\x82\x8c\xe3\x81\x9e\xe3\x82\x8c\xe3\x81\xae\xe5\xa0\xb4\xe6\x89\x80",
129+
'expected' => 'xn--Hello-Another-Way--fc4qua05auwb3674vfr0b',
130+
),
131+
'example from specs: RFC3492, section 7.1-O: Japanese artist' => array(
132+
'data' => "\xe3\x81\xb2\xe3\x81\xa8\xe3\x81\xa4\xe5\xb1\x8b\xe6\xa0\xb9\xe3\x81\xae\xe4\xb8\x8b\x32",
133+
'expected' => 'xn--2-u9tlzr9756bt3uc0v',
134+
),
135+
'example from specs: RFC3492, section 7.1-P: Japanese artist' => array(
136+
'data' => "\x4d\x61\x6a\x69\xe3\x81\xa7\x4b\x6f\x69\xe3\x81\x99\xe3\x82\x8b\x35\xe7\xa7\x92\xe5\x89\x8d",
137+
'expected' => 'xn--MajiKoi5-783gue6qz075azm5e',
138+
),
139+
'example from specs: RFC3492, section 7.1-Q: Japanese artist' => array(
140+
'data' => "\xe3\x83\x91\xe3\x83\x95\xe3\x82\xa3\xe3\x83\xbc\x64\x65\xe3\x83\xab\xe3\x83\xb3\xe3\x83\x90",
141+
'expected' => 'xn--de-jg4avhby1noc0d',
142+
),
143+
'example from specs: RFC3492, section 7.1-R: Japanese artist' => array(
144+
'data' => "\xe3\x81\x9d\xe3\x81\xae\xe3\x82\xb9\xe3\x83\x94\xe3\x83\xbc\xe3\x83\x89\xe3\x81\xa7",
145+
'expected' => 'xn--d9juau41awczczp',
146+
),
147+
'example from specs: RFC3492, section 7.1-S: ASCII string which breaks the rules' => array(
148+
'data' => "\x2d\x3e\x20\x24\x31\x2e\x30\x30\x20\x3c\x2d",
149+
'expected' => '-> $1.00 <-',
19150
),
20151
);
21152
}
22153

23154
/**
24-
* @dataProvider specExamples
155+
* Tests receiving an exception when trying to encode a hostname containing invalid unicode.
156+
*
157+
* @dataProvider dataInvalidUnicode
158+
*
159+
* @param string $data Data to encode.
160+
*
161+
* @return void
25162
*/
26-
public function testEncoding($data, $expected) {
27-
$result = IdnaEncoder::encode($data);
28-
$this->assertSame($expected, $result);
163+
public function testInvalidUnicode($data) {
164+
$this->expectException(Exception::class);
165+
$this->expectExceptionMessage('Invalid Unicode codepoint');
166+
167+
IdnaEncoder::encode($data);
168+
}
169+
170+
/**
171+
* Data Provider.
172+
*
173+
* @return array
174+
*/
175+
public function dataInvalidUnicode() {
176+
return array(
177+
'Five-byte character' => array("\xfb\xb6\xb6\xb6\xb6"),
178+
'Six-byte character' => array("\xfd\xb6\xb6\xb6\xb6\xb6"),
179+
'Invalid ASCII character with multibyte' => array("\0\xc2\xb6"),
180+
'Unfinished multibyte' => array("\xc2"),
181+
'Partial multibyte' => array("\xc2\xc2\xb6"),
182+
);
29183
}
30184

31185
public function testASCIITooLong() {
32186
$this->expectException(Exception::class);
33187
$this->expectExceptionMessage('Provided string is too long');
188+
34189
$data = str_repeat('abcd', 20);
35190
IdnaEncoder::encode($data);
36191
}
37192

38193
public function testEncodedTooLong() {
39194
$this->expectException(Exception::class);
40195
$this->expectExceptionMessage('Encoded string is too long');
196+
41197
$data = str_repeat("\xe4\xbb\x96", 60);
42198
IdnaEncoder::encode($data);
43199
}
44200

45201
public function testAlreadyPrefixed() {
46202
$this->expectException(Exception::class);
47203
$this->expectExceptionMessage('Provided string begins with ACE prefix');
48-
IdnaEncoder::encode("xn--\xe4\xbb\x96");
49-
}
50-
51-
public function testASCIICharacter() {
52-
$result = IdnaEncoder::encode('a');
53-
$this->assertSame('a', $result);
54-
}
55-
56-
public function testTwoByteCharacter() {
57-
$result = IdnaEncoder::encode("\xc2\xb6"); // Pilcrow character
58-
$this->assertSame('xn--tba', $result);
59-
}
60-
61-
public function testThreeByteCharacter() {
62-
$result = IdnaEncoder::encode("\xe2\x82\xac"); // Euro symbol
63-
$this->assertSame('xn--lzg', $result);
64-
}
65-
66-
public function testFourByteCharacter() {
67-
$result = IdnaEncoder::encode("\xf0\xa4\xad\xa2"); // Chinese symbol?
68-
$this->assertSame('xn--ww6j', $result);
69-
}
70204

71-
public function testFiveByteCharacter() {
72-
$this->expectException(Exception::class);
73-
$this->expectExceptionMessage('Invalid Unicode codepoint');
74-
IdnaEncoder::encode("\xfb\xb6\xb6\xb6\xb6");
75-
}
76-
77-
public function testSixByteCharacter() {
78-
$this->expectException(Exception::class);
79-
$this->expectExceptionMessage('Invalid Unicode codepoint');
80-
IdnaEncoder::encode("\xfd\xb6\xb6\xb6\xb6\xb6");
81-
}
82-
83-
public function testInvalidASCIICharacterWithMultibyte() {
84-
$this->expectException(Exception::class);
85-
$this->expectExceptionMessage('Invalid Unicode codepoint');
86-
IdnaEncoder::encode("\0\xc2\xb6");
87-
}
88-
89-
public function testUnfinishedMultibyte() {
90-
$this->expectException(Exception::class);
91-
$this->expectExceptionMessage('Invalid Unicode codepoint');
92-
IdnaEncoder::encode("\xc2");
93-
}
94-
95-
public function testPartialMultibyte() {
96-
$this->expectException(Exception::class);
97-
$this->expectExceptionMessage('Invalid Unicode codepoint');
98-
IdnaEncoder::encode("\xc2\xc2\xb6");
205+
IdnaEncoder::encode("xn--\xe4\xbb\x96");
99206
}
100207
}

0 commit comments

Comments
 (0)