Skip to content

Commit 4461548

Browse files
authored
Unescape constant strings
1 parent a6c17e2 commit 4461548

File tree

2 files changed

+148
-12
lines changed

2 files changed

+148
-12
lines changed

Diff for: src/Parser/ConstExprParser.php

+101-11
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,35 @@
44

55
use PHPStan\PhpDocParser\Ast;
66
use PHPStan\PhpDocParser\Lexer\Lexer;
7+
use function chr;
8+
use function hexdec;
9+
use function octdec;
10+
use function preg_replace_callback;
11+
use function str_replace;
712
use function strtolower;
8-
use function trim;
13+
use function substr;
914

1015
class ConstExprParser
1116
{
1217

18+
/**
 * Single-character escape codes mapped to the character each one stands for.
 *
 * parseEscapeSequences() looks up the character that follows a backslash in
 * this table before trying the hexadecimal, Unicode and octal forms.
 */
private const REPLACEMENTS = [
19+
'\\' => '\\',
20+
'n' => "\n",
21+
'r' => "\r",
22+
't' => "\t",
23+
'f' => "\f",
24+
'v' => "\v",
25+
'e' => "\x1B",
26+
];
27+
28+
/** @var bool */
29+
private $unescapeStrings;
30+
31+
/**
 * @param bool $unescapeStrings when true, parse() resolves escape sequences in the string
 *                              tokens it trims; when false, it only strips the surrounding quotes
 */
public function __construct(bool $unescapeStrings = false)
32+
{
33+
$this->unescapeStrings = $unescapeStrings;
34+
}
35+
1336
public function parse(TokenIterator $tokens, bool $trimStrings = false): Ast\ConstExpr\ConstExprNode
1437
{
1538
if ($tokens->isCurrentTokenType(Lexer::TOKEN_FLOAT)) {
@@ -24,18 +47,14 @@ public function parse(TokenIterator $tokens, bool $trimStrings = false): Ast\Con
2447
return new Ast\ConstExpr\ConstExprIntegerNode($value);
2548
}
2649

27-
if ($tokens->isCurrentTokenType(Lexer::TOKEN_SINGLE_QUOTED_STRING)) {
28-
$value = $tokens->currentTokenValue();
29-
if ($trimStrings) {
30-
$value = trim($tokens->currentTokenValue(), "'");
31-
}
32-
$tokens->next();
33-
return new Ast\ConstExpr\ConstExprStringNode($value);
34-
35-
} elseif ($tokens->isCurrentTokenType(Lexer::TOKEN_DOUBLE_QUOTED_STRING)) {
50+
if ($tokens->isCurrentTokenType(Lexer::TOKEN_SINGLE_QUOTED_STRING, Lexer::TOKEN_DOUBLE_QUOTED_STRING)) {
3651
$value = $tokens->currentTokenValue();
3752
if ($trimStrings) {
38-
$value = trim($tokens->currentTokenValue(), '"');
53+
if ($this->unescapeStrings) {
54+
$value = self::unescapeString($value);
55+
} else {
56+
$value = substr($value, 1, -1);
57+
}
3958
}
4059
$tokens->next();
4160
return new Ast\ConstExpr\ConstExprStringNode($value);
@@ -137,4 +156,75 @@ private function parseArrayItem(TokenIterator $tokens): Ast\ConstExpr\ConstExprA
137156
return new Ast\ConstExpr\ConstExprArrayItemNode($key, $value);
138157
}
139158

159+
/**
 * Strips the surrounding quotes from a quoted string token and resolves its escapes.
 *
 * A token that starts with a single quote only has \\ and \' unescaped. Any other
 * token is handed to parseEscapeSequences() with '"' as the quote character.
 *
 * @param string $string the token value including its opening and closing quote
 */
private static function unescapeString(string $string): string
{
    $isSingleQuoted = $string[0] === '\'';
    $inner = substr($string, 1, -1);

    if (!$isSingleQuoted) {
        return self::parseEscapeSequences($inner, '"');
    }

    return str_replace(['\\\\', '\\\''], ['\\', '\''], $inner);
}
173+
174+
/**
 * Replaces the backslash escape sequences in the body of a quoted string with the bytes they denote.
 *
 * Handled forms: the escaped quote character, the single-character escapes listed in
 * self::REPLACEMENTS, hexadecimal (\xH, \xHH), octal (\O, \OO, \OOO) and Unicode (\u{H...}).
 * Unicode escapes above U+10FFFF yield the replacement character U+FFFD.
 *
 * Implementation based on https://github.com/nikic/PHP-Parser/blob/b0edd4c41111042d43bb45c6c657b2e0db367d9e/lib/PhpParser/Node/Scalar/String_.php#L90-L130
 *
 * @param string $str   string contents without the surrounding quotes
 * @param string $quote quote character whose escaped form (backslash + quote) is unescaped
 */
private static function parseEscapeSequences(string $str, string $quote): string
{
    $str = str_replace('\\' . $quote, $quote, $str);

    return preg_replace_callback(
        '~\\\\([\\\\nrtfve]|[xX][0-9a-fA-F]{1,2}|[0-7]{1,3}|u\{([0-9a-fA-F]+)\})~',
        static function ($matches) {
            $str = $matches[1];

            if (isset(self::REPLACEMENTS[$str])) {
                return self::REPLACEMENTS[$str];
            }
            if ($str[0] === 'x' || $str[0] === 'X') {
                return chr(hexdec(substr($str, 1)));
            }
            if ($str[0] === 'u') {
                $codePoint = hexdec($matches[2]);

                // hexdec() returns a float once the digits overflow an int. Such a value
                // (and anything else above the Unicode maximum) must not reach the
                // int-typed codePointToUtf8(), which would raise a TypeError.
                if ($codePoint > 0x10FFFF) {
                    return "\xef\xbf\xbd";
                }

                return self::codePointToUtf8((int) $codePoint);
            }

            // Octal escapes above \377 wrap around to a single byte, as in PHP itself
            // ("\400" === "\000"); mask explicitly instead of relying on chr() to wrap.
            return chr(octdec($str) & 0xFF);
        },
        $str
    );
}
201+
202+
/**
 * Encodes a Unicode code point as a UTF-8 byte sequence.
 *
 * Code points up to U+10FFFF (the Unicode maximum) are encoded in one to four bytes.
 * Anything larger is not a valid code point and yields the replacement character U+FFFD.
 *
 * Implementation based on https://github.com/nikic/PHP-Parser/blob/b0edd4c41111042d43bb45c6c657b2e0db367d9e/lib/PhpParser/Node/Scalar/String_.php#L132-L154
 *
 * @param int $num the code point to encode
 */
private static function codePointToUtf8(int $num): string
{
    if ($num <= 0x7F) {
        return chr($num);
    }
    if ($num <= 0x7FF) {
        return chr(($num >> 6) + 0xC0)
            . chr(($num & 0x3F) + 0x80);
    }
    if ($num <= 0xFFFF) {
        return chr(($num >> 12) + 0xE0)
            . chr((($num >> 6) & 0x3F) + 0x80)
            . chr(($num & 0x3F) + 0x80);
    }
    // UTF-8 stops at U+10FFFF; a 0x1FFFFF bound would emit invalid 4-byte sequences.
    if ($num <= 0x10FFFF) {
        return chr(($num >> 18) + 0xF0)
            . chr((($num >> 12) & 0x3F) + 0x80)
            . chr((($num >> 6) & 0x3F) + 0x80)
            . chr(($num & 0x3F) + 0x80);
    }

    // Invalid UTF-8 codepoint escape sequence: Codepoint too large
    return "\xef\xbf\xbd";
}
229+
140230
}

Diff for: tests/PHPStan/Parser/ConstExprParserTest.php

+47-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ protected function setUp(): void
2929
{
3030
parent::setUp();
3131
$this->lexer = new Lexer();
32-
$this->constExprParser = new ConstExprParser();
32+
$this->constExprParser = new ConstExprParser(true);
3333
}
3434

3535

@@ -358,4 +358,50 @@ public function provideFetchNodeParseData(): Iterator
358358
];
359359
}
360360

361+
/**
 * Parses the input with string trimming enabled and checks the resulting node,
 * its string form, and the token type the iterator is left on.
 *
 * @dataProvider provideWithTrimStringsStringNodeParseData
 */
public function testParseWithTrimStrings(string $input, ConstExprNode $expectedExpr, int $nextTokenType = Lexer::TOKEN_END): void
{
    $iterator = new TokenIterator($this->lexer->tokenize($input));
    $actualExpr = $this->constExprParser->parse($iterator, true);

    $this->assertSame((string) $expectedExpr, (string) $actualExpr);
    $this->assertEquals($expectedExpr, $actualExpr);
    $this->assertSame($nextTokenType, $iterator->currentTokenType());
}
373+
374+
/**
 * Supplies [input, expected node] pairs for quoted strings parsed with trimming enabled:
 * plain strings, escaped quotes and control characters, and Unicode escapes
 * (including one whose code point is too large).
 */
public function provideWithTrimStringsStringNodeParseData(): Iterator
{
    yield from [
        [
            '"foo"',
            new ConstExprStringNode('foo'),
        ],
        [
            '"Foo \\n\\"\\r Bar"',
            new ConstExprStringNode("Foo \n\"\r Bar"),
        ],
        [
            '\'bar\'',
            new ConstExprStringNode('bar'),
        ],
        [
            '\'Foo \\\' Bar\'',
            new ConstExprStringNode('Foo \' Bar'),
        ],
        [
            '"\u{1f601}"',
            new ConstExprStringNode("\u{1f601}"),
        ],
        [
            '"\u{ffffffff}"',
            new ConstExprStringNode("\u{fffd}"),
        ],
    ];
}
406+
361407
}

0 commit comments

Comments
 (0)