also let the dfas classify the keywords
cfbolz committed Nov 21, 2024
1 parent 31a9a6c commit bdddcdc
Showing 7 changed files with 121 additions and 120 deletions.
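The diffs below all serve one idea: the tokenizer's generated DFA should not just find the end of the next pseudo-token, it should also report which kind of token it matched. Accepting states are mapped to token numbers through a `state_to_token` table, and tokenizer-internal cases (comments, line continuations, string continuations, triple-quote openers) each get their own negative sentinel number; the keyword half of the commit title appears to live in the three changed files GitHub did not render here. As a minimal sketch of this kind of table-driven scanner, with invented names (the real `automata.DFA` in pypy/interpreter/pyparser has more machinery, e.g. a DEFAULT transition):

    # sketch only, not PyPy's automata module
    class SketchDFA(object):
        def __init__(self, states, accepts, state_to_token):
            self.states = states                  # one {char: next_state} dict per state
            self.accepts = accepts                # accepts[i]: is state i accepting?
            self.state_to_token = state_to_token  # token number per state

        def recognize(self, text, pos=0):
            state = 0
            end, token = -1, None
            for i in range(pos, len(text)):
                state = self.states[state].get(text[i], -1)
                if state == -1:
                    break
                if self.accepts[state]:
                    end, token = i + 1, self.state_to_token[state]
            return end, token   # end of longest accepted prefix, and its kind

    # toy machine: a run of binary digits is token 13
    toy = SketchDFA([{'0': 1, '1': 1}, {'0': 1, '1': 1}],
                    [False, True], [-1, 13])
    assert toy.recognize("101+") == (3, 13)

With one number per accepting state, classifying the match costs a single array lookup after the scan.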
94 changes: 50 additions & 44 deletions pypy/interpreter/pyparser/dfa_generated.py
@@ -8,19 +8,19 @@
            True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, False,
-           True, True, False, False, True, False, False,
+           True, True, False, False, False, True, False,
            False, True, True, False, False, True, False,
            True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True,
-           True, True, True, True, False, False, False, True,
+           True, True, True, False, True, False, False, True,
            False, False, True, True, True, True, True, False,
-           False, True, True, True, True, True, True, True,
-           True, False, True, False, False, True]
+           False, True, True, True, True, True, True, False,
+           True, True, True, True, False, True, True, False,
+           True]
 states = [
     # 0 (accepts)
-    {'\t': 0, '\n': 32, '\x0c': 0,
-     '\r': 33, ' ': 0, '!': 7, '"': 34,
-     '#': 36, '$': 31, '%': 8, '&': 9,
+    {'\n': 32, '\r': 33, '!': 7, '"': 34,
+     '#': 37, '$': 31, '%': 8, '&': 9,
      "'": 35, '(': 10, ')': 11, '*': 12,
      '+': 13, ',': 14, '-': 15, '.': 6,
      '/': 16, '0': 4, '1': 5, '2': 5,
@@ -34,7 +34,7 @@
      'P': 1, 'Q': 1, 'R': 3, 'S': 1,
      'T': 1, 'U': 2, 'V': 1, 'W': 1,
      'X': 1, 'Y': 1, 'Z': 1, '[': 23,
-     '\\': 37, ']': 24, '^': 25, '_': 1,
+     '\\': 36, ']': 24, '^': 25, '_': 1,
      '`': 26, 'a': 1, 'b': 2, 'c': 1,
      'd': 1, 'e': 1, 'f': 1, 'g': 1,
      'h': 1, 'i': 1, 'j': 1, 'k': 1,
@@ -170,15 +170,15 @@
     # 33 (accepts)
     {'\n': 32},
     # 34
-    {automata.DEFAULT: 68, '\n': 67,
-     '\r': 67, '"': 65, '\\': 66},
+    {automata.DEFAULT: 68, '\n': 65,
+     '\r': 65, '"': 66, '\\': 67},
     # 35
-    {automata.DEFAULT: 71, '\n': 67,
-     '\r': 67, "'": 69, '\\': 70},
-    # 36 (accepts)
-    {automata.DEFAULT: 36, '\n': 67, '\r': 67},
-    # 37
+    {automata.DEFAULT: 71, '\n': 65,
+     '\r': 65, "'": 69, '\\': 70},
+    # 36
     {'\n': 72, '\r': 73},
+    # 37 (accepts)
+    {automata.DEFAULT: 37, '\n': 65, '\r': 65},
     # 38
     {'0': 74, '1': 74, '2': 74, '3': 74,
      '4': 74, '5': 74, '6': 74, '7': 74,
@@ -256,22 +256,22 @@
     {'0': 64, '1': 64, '2': 64, '3': 64,
      '4': 64, '5': 64, '6': 64, '7': 64,
      '8': 64, '9': 64},
-    # 65 (accepts)
+    # 65
+    {},
+    # 66 (accepts)
     {'"': 84},
-    # 66
-    {automata.DEFAULT: 87, '\n': 85, '\r': 86},
     # 67
-    {},
+    {automata.DEFAULT: 85, '\n': 86, '\r': 87},
     # 68
-    {automata.DEFAULT: 68, '\n': 67,
-     '\r': 67, '"': 88, '\\': 66},
+    {automata.DEFAULT: 68, '\n': 65,
+     '\r': 65, '"': 88, '\\': 67},
     # 69 (accepts)
-    {"'": 84},
+    {"'": 89},
     # 70
-    {automata.DEFAULT: 89, '\n': 85, '\r': 86},
+    {automata.DEFAULT: 90, '\n': 91, '\r': 92},
     # 71
-    {automata.DEFAULT: 71, '\n': 67,
-     '\r': 67, "'": 88, '\\': 70},
+    {automata.DEFAULT: 71, '\n': 65,
+     '\r': 65, "'": 88, '\\': 70},
     # 72 (accepts)
     {},
     # 73 (accepts)
@@ -290,9 +290,9 @@
     # 76 (accepts)
     {'0': 76, '1': 76, 'L': 41, 'l': 41},
     # 77
-    {'+': 90, '-': 90, '0': 91, '1': 91,
-     '2': 91, '3': 91, '4': 91, '5': 91,
-     '6': 91, '7': 91, '8': 91, '9': 91},
+    {'+': 93, '-': 93, '0': 94, '1': 94,
+     '2': 94, '3': 94, '4': 94, '5': 94,
+     '6': 94, '7': 94, '8': 94, '9': 94},
     # 78
     {'0': 79, '1': 79, '2': 79, '3': 79,
      '4': 79, '5': 79, '6': 79, '7': 79,
@@ -311,29 +311,35 @@
     {},
     # 84 (accepts)
     {},
-    # 85 (accepts)
-    {},
+    # 85
+    {automata.DEFAULT: 85, '\n': 65,
+     '\r': 65, '"': 88, '\\': 67},
     # 86 (accepts)
-    {'\n': 85},
-    # 87
-    {automata.DEFAULT: 87, '\n': 67,
-     '\r': 67, '"': 88, '\\': 66},
+    {},
+    # 87 (accepts)
+    {'\n': 86},
     # 88 (accepts)
     {},
-    # 89
-    {automata.DEFAULT: 89, '\n': 67,
-     '\r': 67, "'": 88, '\\': 70},
+    # 89 (accepts)
+    {},
     # 90
-    {'0': 91, '1': 91, '2': 91, '3': 91,
-     '4': 91, '5': 91, '6': 91, '7': 91,
-     '8': 91, '9': 91},
+    {automata.DEFAULT: 90, '\n': 65,
+     '\r': 65, "'": 88, '\\': 70},
     # 91 (accepts)
-    {'0': 91, '1': 91, '2': 91, '3': 91,
-     '4': 91, '5': 91, '6': 91, '7': 91,
-     '8': 91, '9': 91, 'J': 41, 'j': 41},
+    {},
+    # 92 (accepts)
+    {'\n': 91},
+    # 93
+    {'0': 94, '1': 94, '2': 94, '3': 94,
+     '4': 94, '5': 94, '6': 94, '7': 94,
+     '8': 94, '9': 94},
+    # 94 (accepts)
+    {'0': 94, '1': 94, '2': 94, '3': 94,
+     '4': 94, '5': 94, '6': 94, '7': 94,
+     '8': 94, '9': 94, 'J': 41, 'j': 41},
 ]
 pseudoDFA = automata.DFA(states, accepts)
-pseudoDFA.state_to_token = [-1, 12, 12, 12, 13, 13, 102, -2, 156, 39, 8, 52, 43, 5, 45, 6, 155, 70, 141, 79, 48, 80, 37, 9, 54, 164, 11, 10, 106, 56, 7, -2, 1, 1, -2, -2, -5, -2, -2, -2, 13, 13, -2, -2, 13, -2, 13, 84, 62, 63, 44, 60, 58, 59, 157, 61, 139, 83, 84, 81, 82, 137, 65, 64, 15, 14, -2, -2, -2, 14, -2, -2, -6, -6, 13, 13, 13, -2, -2, 13, 68, 69, 66, 67, -4, -3, -3, -2, 14, -2, -2, 13]
+pseudoDFA.state_to_token = [-1, 12, 12, 12, 13, 13, 102, -2, 156, 39, 8, 52, 43, 5, 45, 6, 155, 70, 141, 79, 48, 80, 37, 9, 54, 164, 11, 10, 106, 56, 7, -2, 1, 1, -2, -2, -2, -7, -2, -2, 13, 13, -2, -2, 13, -2, 13, 84, 62, 63, 44, 60, 58, 59, 157, 61, 139, 83, 84, 81, 82, 137, 65, 64, 15, -2, 14, -2, -2, 14, -2, -2, -8, -8, 13, 13, 13, -2, -2, 13, 68, 69, 66, 67, -6, -2, -4, -4, 14, -5, -2, -3, -3, -2, 13]
 
 accepts = [False, False, False, False, False, True]
 states = [
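Reading the regenerated tables: non-negative entries in `state_to_token` are real token numbers handed on to the parser, while negative entries are the tokenizer-internal sentinels defined in pygram.py further down (`-2` appears to mark states with no useful classification). For example, `'#'` now sends state 0 to state 37, the comment state, which loops on `automata.DEFAULT` and leaves only on a newline; `state_to_token[37]` is `-7`, the new `TOK_COMMENT` sentinel. A hand-run of that one state, using its transition dict as printed above:

    # illustration only: DEFAULT stands in for automata.DEFAULT
    DEFAULT = object()
    state_37 = {DEFAULT: 37, '\n': 65, '\r': 65}   # copied from the new tables

    def comment_end(text, i):
        # stay in state 37 while the DFA loops on non-newline characters
        while i < len(text) and state_37.get(text[i], state_37[DEFAULT]) == 37:
            i += 1
        return i

    assert comment_end("# a comment\nx = 1", 1) == len("# a comment")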
74 changes: 34 additions & 40 deletions pypy/interpreter/pyparser/gendfa.py
@@ -44,11 +44,13 @@ def makeComment ():
                      any(states, notGroupStr(states, "\r\n")))
     # ____________________________________________________________
     # Names
-    name = chain(states,
-                 groupStr(states, string.letters + "_"),
-                 any(states, groupStr(states,
-                     string.letters + string.digits + "_")))
-    label(labels, name, "NAME")
+    name = label(
+        labels,
+        chain(states,
+              groupStr(states, string.letters + "_"),
+              any(states, groupStr(states,
+                  string.letters + string.digits + "_"))),
+        "NAME")
     # ____________________________________________________________
     # Digits
     def makeDigits ():
@@ -114,8 +116,10 @@ def makeFloat ():
                           groupStr(states, "jJ")))
     # ____________________________________________________________
     # Any old number
-    number = group(states, imagNumber, makeFloat(), intNumber)
-    label(labels, number, "NUMBER")
+    number = label(
+        labels,
+        group(states, imagNumber, makeFloat(), intNumber),
+        "NUMBER")
 
     # ____________________________________________________________
     # Funny
@@ -124,15 +128,15 @@ def makeFloat ():
     for op in sorted(pygram.python_opmap):
         if op == "$NUM":
             continue
-        funny.append(chain(states, chainStr(states, op)))
-        label(labels, funny[-1], op)
-    revdb_metavar = chain(states,
-                          groupStr(states, "$"),
-                          atleastonce(states, makeDigits()))
-    label(labels, revdb_metavar, "REVDBMETAVAR")
+        funny.append(label(labels, chain(states, chainStr(states, op)), op))
+    revdb_metavar = label(
+        labels,
+        chain(states,
+              groupStr(states, "$"),
+              atleastonce(states, makeDigits())),
+        "REVDBMETAVAR")
     funny.append(revdb_metavar)
-    eol = makeEOL()
-    label(labels, eol, "NEWLINE")
+    eol = label(labels, makeEOL(), "NEWLINE")
     funny.append(eol)
     funny = group(states, *funny)
     # ____________________________________________________________
Expand All @@ -141,13 +145,7 @@ def makeStrPrefix ():
maybe(states, groupStr(states, "uUbB")),
maybe(states, groupStr(states, "rR")))
# ____________________________________________________________
def makeStr(quote):
regular_end = newArcPair(states, quote)
# add a label to the closing quote where a string is finished on one
# line
label(labels, regular_end, "STRING")
continuation_end = makeLineCont()
label(labels, continuation_end, "TOK_STRING_CONTINUATION")
def makeStr(quote, cont_label):
return chain(
states,
makeStrPrefix(),
@@ -161,30 +159,26 @@ def makeStr(quote):
                 any(states,
                     notGroupStr(states, "\r\n%s\\" % quote)))),
             group(states,
-                  regular_end,
-                  continuation_end))
+                  # add a label to the closing quote where a string is finished
+                  # on one line
+                  label(labels, newArcPair(states, quote), "STRING"),
+                  # special label for continuation end
+                  label(labels, makeLineCont(), cont_label)))
     contStr = group(states,
-                    makeStr('"'),
-                    makeStr("'"))
+                    makeStr('"', "TOK_STRING_CONTINUATION_DOUBLE"),
+                    makeStr("'", "TOK_STRING_CONTINUATION_SINGLE"))
     triple = chain(states,
                    makeStrPrefix(),
                    group(states,
-                         chainStr(states, "'''"),
-                         chainStr(states, '"""')))
-    label(labels, triple, "TOK_TRIPLE_QUOTE_START")
-    comment = makeComment()
-    label(labels, comment, "TOK_COMMENT")
-    linecont = makeLineCont()
-    label(labels, linecont, "TOK_LINECONT")
+                         label(labels, chainStr(states, "'''"), "TOK_TRIPLE_QUOTE_START_SINGLE"),
+                         label(labels, chainStr(states, '"""'), "TOK_TRIPLE_QUOTE_START_DOUBLE")))
     pseudoExtras = group(states,
-                         linecont,
-                         comment,
+                         label(labels, makeLineCont(), "TOK_LINECONT"),
+                         label(labels, makeComment(), "TOK_COMMENT"),
                          triple)
-    pseudoToken = chain(states,
-                        makeWhitespace(),
-                        group(states,
-                              newArcPair(states, EMPTY),
-                              pseudoExtras, number, funny, contStr, name))
+    pseudoToken = group(states,
+                        newArcPair(states, EMPTY),
+                        pseudoExtras, number, funny, contStr, name)
     label(labels, pseudoToken, "ACCEPT")
     dfaStates, dfaAccepts = nfaToDfa(states, pseudoToken[0], labels)
     #view(dfaStates, dfaAccepts)
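Most of the gendfa.py churn is one refactoring: `label()` evidently returns the NFA fragment it annotates, so fragments can now be labelled inline at their construction site (`name = label(labels, chain(...), "NAME")`) instead of in a separate statement, and `makeStr()` takes a `cont_label` argument so the double- and single-quoted variants register distinct continuation labels. The shape of that pattern, sketched with simplified types (the real `label` works on gendfa's NFA arc pairs):

    # sketch: 'fragment' is whatever chain()/group() return in gendfa
    def label(labels, fragment, name):
        labels.append((fragment, name))   # record the annotation ...
        return fragment                   # ... and hand the fragment back

    # which is what makes call-site composition like this possible:
    # contStr = group(states,
    #                 makeStr('"', "TOK_STRING_CONTINUATION_DOUBLE"),
    #                 makeStr("'", "TOK_STRING_CONTINUATION_SINGLE"))

Labelling at construction also removes the temporaries (`regular_end`, `continuation_end`, `comment`, `linecont`) that existed only to be passed to `label()` one line later.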
10 changes: 6 additions & 4 deletions pypy/interpreter/pyparser/pygram.py
@@ -53,10 +53,12 @@ class _Tokens(object):
 
 # a few special token numbers for the tokenizer. They never end up in a Token
 # instance
-_Tokens.TOK_STRING_CONTINUATION = -3
-_Tokens.TOK_TRIPLE_QUOTE_START = -4
-_Tokens.TOK_COMMENT = -5
-_Tokens.TOK_LINECONT = -6
+_Tokens.TOK_STRING_CONTINUATION_SINGLE = -3
+_Tokens.TOK_STRING_CONTINUATION_DOUBLE = -4
+_Tokens.TOK_TRIPLE_QUOTE_START_SINGLE = -5
+_Tokens.TOK_TRIPLE_QUOTE_START_DOUBLE = -6
+_Tokens.TOK_COMMENT = -7
+_Tokens.TOK_LINECONT = -8
 
 python_opmap = {}
 for op, idx in pytoken.python_opmap.iteritems():
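Splitting the continuation and triple-quote sentinels per quote character means the DFA's single integer verdict now carries everything the tokenizer previously had to rediscover by re-inspecting the matched text. A hypothetical consumer (the branch bodies are invented; only the constant values come from this diff):

    # stand-in for pygram's _Tokens, with the constants from the diff
    class _Tokens(object):
        TOK_STRING_CONTINUATION_SINGLE = -3
        TOK_STRING_CONTINUATION_DOUBLE = -4
        TOK_TRIPLE_QUOTE_START_SINGLE = -5
        TOK_TRIPLE_QUOTE_START_DOUBLE = -6
        TOK_COMMENT = -7
        TOK_LINECONT = -8

    def on_pseudo_token(tok):
        if tok >= 0:
            return "emit a real Token to the parser"
        if tok in (_Tokens.TOK_STRING_CONTINUATION_SINGLE,
                   _Tokens.TOK_STRING_CONTINUATION_DOUBLE):
            return "string literal continues on the next line"
        if tok in (_Tokens.TOK_TRIPLE_QUOTE_START_SINGLE,
                   _Tokens.TOK_TRIPLE_QUOTE_START_DOUBLE):
            return "keep scanning until the matching triple quote"
        if tok == _Tokens.TOK_COMMENT:
            return "discard the rest of the line"
        assert tok == _Tokens.TOK_LINECONT
        return "backslash-newline: join the two lines"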
23 changes: 0 additions & 23 deletions pypy/interpreter/pyparser/pytokenize.py
@@ -10,35 +10,12 @@
 basil.util.automata module.
 """
 # ______________________________________________________________________
 
 from pypy.interpreter.pyparser import automata
 from pypy.interpreter.pyparser.dfa_generated import *
 
 __all__ = [ "tokenize" ]
 
-endDFAs = {"'" : singleDFA,
-           '"' : doubleDFA,
-           'r' : None,
-           'R' : None,
-           'u' : None,
-           'U' : None,
-           'b' : None,
-           'B' : None}
-
-for uniPrefix in ("", "u", "U", "b", "B"):
-    for rawPrefix in ("", "r", "R"):
-        prefix = uniPrefix + rawPrefix
-        endDFAs[prefix + "'''"] = single3DFA
-        endDFAs[prefix + '"""'] = double3DFA
-
-whiteSpaceStatesAccepts = [True]
-whiteSpaceStates = [{'\t': 0, ' ': 0, '\x0c': 0}]
-whiteSpaceDFA = automata.DFA(whiteSpaceStates, whiteSpaceStatesAccepts)
-
 # ______________________________________________________________________
 # COPIED:
 
 triple_quoted = {}
 for t in ("'''", '"""',
           "r'''", 'r"""', "R'''", 'R"""',
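The deleted `endDFAs` table chose the string-end automaton by re-keying on the token's prefix and quote characters. With the pseudo DFA classifying quoting flavours itself, an equivalent choice can presumably be driven straight off the sentinel, along these lines (the dict is invented and assumes the module's `from ... dfa_generated import *`; `singleDFA`, `doubleDFA`, `single3DFA` and `double3DFA` are the generated automata the old table referenced):

    # hypothetical replacement for the endDFAs lookup
    end_dfa_for_sentinel = {
        _Tokens.TOK_STRING_CONTINUATION_SINGLE: singleDFA,
        _Tokens.TOK_STRING_CONTINUATION_DOUBLE: doubleDFA,
        _Tokens.TOK_TRIPLE_QUOTE_START_SINGLE:  single3DFA,
        _Tokens.TOK_TRIPLE_QUOTE_START_DOUBLE:  double3DFA,
    }

The one-state `whiteSpaceDFA` disappears for the same reason in reverse: whitespace skipping also left the pseudo-token DFA (see the `makeWhitespace()` removal in gendfa.py), so it is presumably handled directly by the tokenizer loop in one of the changed files not rendered on this page.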
