also let the dfas classify the keywords
cfbolz committed Nov 21, 2024
1 parent 31a9a6c commit bdddcdc
Showing 7 changed files with 121 additions and 120 deletions.
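The diffs below all serve one idea: the tokenizer's generated DFA should not just find the end of the next pseudo-token, it should also report which kind of token it matched. Accepting states are mapped to token numbers through a `state_to_token` table, and tokenizer-internal cases (comments, line continuations, string continuations, triple-quote openers) each get their own negative sentinel number; the keyword half of the commit title appears to live in the three changed files GitHub did not render here. As a minimal sketch of this kind of table-driven scanner, with invented names (the real `automata.DFA` in pypy/interpreter/pyparser has more machinery, e.g. a DEFAULT transition):

    # sketch only, not PyPy's automata module
    class SketchDFA(object):
        def __init__(self, states, accepts, state_to_token):
            self.states = states                  # one {char: next_state} dict per state
            self.accepts = accepts                # accepts[i]: is state i accepting?
            self.state_to_token = state_to_token  # token number per state

        def recognize(self, text, pos=0):
            state = 0
            end, token = -1, None
            for i in range(pos, len(text)):
                state = self.states[state].get(text[i], -1)
                if state == -1:
                    break
                if self.accepts[state]:
                    end, token = i + 1, self.state_to_token[state]
            return end, token   # end of longest accepted prefix, and its kind

    # toy machine: a run of binary digits is token 13
    toy = SketchDFA([{'0': 1, '1': 1}, {'0': 1, '1': 1}],
                    [False, True], [-1, 13])
    assert toy.recognize("101+") == (3, 13)

With one number per accepting state, classifying the match costs a single array lookup after the scan.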
94 changes: 50 additions & 44 deletions pypy/interpreter/pyparser/dfa_generated.py
@@ -8,19 +8,19 @@
            True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, False,
-           True, True, False, False, True, False, False,
+           True, True, False, False, False, True, False,
            False, True, True, False, False, True, False,
            True, True, True, True, True, True, True, True,
            True, True, True, True, True, True, True, True,
-           True, True, True, True, False, False, False, True,
+           True, True, True, False, True, False, False, True,
            False, False, True, True, True, True, True, False,
-           False, True, True, True, True, True, True, True,
-           True, False, True, False, False, True]
+           False, True, True, True, True, True, True, False,
+           True, True, True, True, False, True, True, False,
+           True]
 states = [
     # 0 (accepts)
-    {'\t': 0, '\n': 32, '\x0c': 0,
-     '\r': 33, ' ': 0, '!': 7, '"': 34,
-     '#': 36, '$': 31, '%': 8, '&': 9,
+    {'\n': 32, '\r': 33, '!': 7, '"': 34,
+     '#': 37, '$': 31, '%': 8, '&': 9,
      "'": 35, '(': 10, ')': 11, '*': 12,
      '+': 13, ',': 14, '-': 15, '.': 6,
      '/': 16, '0': 4, '1': 5, '2': 5,
@@ -34,7 +34,7 @@
      'P': 1, 'Q': 1, 'R': 3, 'S': 1,
      'T': 1, 'U': 2, 'V': 1, 'W': 1,
      'X': 1, 'Y': 1, 'Z': 1, '[': 23,
-     '\\': 37, ']': 24, '^': 25, '_': 1,
+     '\\': 36, ']': 24, '^': 25, '_': 1,
      '`': 26, 'a': 1, 'b': 2, 'c': 1,
      'd': 1, 'e': 1, 'f': 1, 'g': 1,
      'h': 1, 'i': 1, 'j': 1, 'k': 1,
@@ -170,15 +170,15 @@
     # 33 (accepts)
     {'\n': 32},
     # 34
-    {automata.DEFAULT: 68, '\n': 67,
-     '\r': 67, '"': 65, '\\': 66},
+    {automata.DEFAULT: 68, '\n': 65,
+     '\r': 65, '"': 66, '\\': 67},
     # 35
-    {automata.DEFAULT: 71, '\n': 67,
-     '\r': 67, "'": 69, '\\': 70},
-    # 36 (accepts)
-    {automata.DEFAULT: 36, '\n': 67, '\r': 67},
-    # 37
+    {automata.DEFAULT: 71, '\n': 65,
+     '\r': 65, "'": 69, '\\': 70},
+    # 36
     {'\n': 72, '\r': 73},
+    # 37 (accepts)
+    {automata.DEFAULT: 37, '\n': 65, '\r': 65},
     # 38
     {'0': 74, '1': 74, '2': 74, '3': 74,
      '4': 74, '5': 74, '6': 74, '7': 74,
@@ -256,22 +256,22 @@
     {'0': 64, '1': 64, '2': 64, '3': 64,
      '4': 64, '5': 64, '6': 64, '7': 64,
      '8': 64, '9': 64},
-    # 65 (accepts)
+    # 65
+    {},
+    # 66 (accepts)
     {'"': 84},
-    # 66
-    {automata.DEFAULT: 87, '\n': 85, '\r': 86},
     # 67
-    {},
+    {automata.DEFAULT: 85, '\n': 86, '\r': 87},
     # 68
-    {automata.DEFAULT: 68, '\n': 67,
-     '\r': 67, '"': 88, '\\': 66},
+    {automata.DEFAULT: 68, '\n': 65,
+     '\r': 65, '"': 88, '\\': 67},
     # 69 (accepts)
-    {"'": 84},
+    {"'": 89},
     # 70
-    {automata.DEFAULT: 89, '\n': 85, '\r': 86},
+    {automata.DEFAULT: 90, '\n': 91, '\r': 92},
     # 71
-    {automata.DEFAULT: 71, '\n': 67,
-     '\r': 67, "'": 88, '\\': 70},
+    {automata.DEFAULT: 71, '\n': 65,
+     '\r': 65, "'": 88, '\\': 70},
     # 72 (accepts)
     {},
     # 73 (accepts)
@@ -290,9 +290,9 @@
     # 76 (accepts)
     {'0': 76, '1': 76, 'L': 41, 'l': 41},
     # 77
-    {'+': 90, '-': 90, '0': 91, '1': 91,
-     '2': 91, '3': 91, '4': 91, '5': 91,
-     '6': 91, '7': 91, '8': 91, '9': 91},
+    {'+': 93, '-': 93, '0': 94, '1': 94,
+     '2': 94, '3': 94, '4': 94, '5': 94,
+     '6': 94, '7': 94, '8': 94, '9': 94},
     # 78
     {'0': 79, '1': 79, '2': 79, '3': 79,
      '4': 79, '5': 79, '6': 79, '7': 79,
@@ -311,29 +311,35 @@
     {},
     # 84 (accepts)
     {},
-    # 85 (accepts)
-    {},
+    # 85
+    {automata.DEFAULT: 85, '\n': 65,
+     '\r': 65, '"': 88, '\\': 67},
     # 86 (accepts)
-    {'\n': 85},
-    # 87
-    {automata.DEFAULT: 87, '\n': 67,
-     '\r': 67, '"': 88, '\\': 66},
+    {},
+    # 87 (accepts)
+    {'\n': 86},
     # 88 (accepts)
     {},
-    # 89
-    {automata.DEFAULT: 89, '\n': 67,
-     '\r': 67, "'": 88, '\\': 70},
+    # 89 (accepts)
+    {},
     # 90
-    {'0': 91, '1': 91, '2': 91, '3': 91,
-     '4': 91, '5': 91, '6': 91, '7': 91,
-     '8': 91, '9': 91},
+    {automata.DEFAULT: 90, '\n': 65,
+     '\r': 65, "'": 88, '\\': 70},
     # 91 (accepts)
-    {'0': 91, '1': 91, '2': 91, '3': 91,
-     '4': 91, '5': 91, '6': 91, '7': 91,
-     '8': 91, '9': 91, 'J': 41, 'j': 41},
+    {},
+    # 92 (accepts)
+    {'\n': 91},
+    # 93
+    {'0': 94, '1': 94, '2': 94, '3': 94,
+     '4': 94, '5': 94, '6': 94, '7': 94,
+     '8': 94, '9': 94},
+    # 94 (accepts)
+    {'0': 94, '1': 94, '2': 94, '3': 94,
+     '4': 94, '5': 94, '6': 94, '7': 94,
+     '8': 94, '9': 94, 'J': 41, 'j': 41},
 ]
 pseudoDFA = automata.DFA(states, accepts)
-pseudoDFA.state_to_token = [-1, 12, 12, 12, 13, 13, 102, -2, 156, 39, 8, 52, 43, 5, 45, 6, 155, 70, 141, 79, 48, 80, 37, 9, 54, 164, 11, 10, 106, 56, 7, -2, 1, 1, -2, -2, -5, -2, -2, -2, 13, 13, -2, -2, 13, -2, 13, 84, 62, 63, 44, 60, 58, 59, 157, 61, 139, 83, 84, 81, 82, 137, 65, 64, 15, 14, -2, -2, -2, 14, -2, -2, -6, -6, 13, 13, 13, -2, -2, 13, 68, 69, 66, 67, -4, -3, -3, -2, 14, -2, -2, 13]
+pseudoDFA.state_to_token = [-1, 12, 12, 12, 13, 13, 102, -2, 156, 39, 8, 52, 43, 5, 45, 6, 155, 70, 141, 79, 48, 80, 37, 9, 54, 164, 11, 10, 106, 56, 7, -2, 1, 1, -2, -2, -2, -7, -2, -2, 13, 13, -2, -2, 13, -2, 13, 84, 62, 63, 44, 60, 58, 59, 157, 61, 139, 83, 84, 81, 82, 137, 65, 64, 15, -2, 14, -2, -2, 14, -2, -2, -8, -8, 13, 13, 13, -2, -2, 13, 68, 69, 66, 67, -6, -2, -4, -4, 14, -5, -2, -3, -3, -2, 13]
 
 accepts = [False, False, False, False, False, True]
 states = [
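Reading the regenerated tables: non-negative entries in `state_to_token` are real token numbers handed on to the parser, while negative entries are the tokenizer-internal sentinels defined in pygram.py further down (`-2` appears to mark states with no useful classification). For example, `'#'` now sends state 0 to state 37, the comment state, which loops on `automata.DEFAULT` and leaves only on a newline; `state_to_token[37]` is `-7`, the new `TOK_COMMENT` sentinel. A hand-run of that one state, using its transition dict as printed above:

    # illustration only: DEFAULT stands in for automata.DEFAULT
    DEFAULT = object()
    state_37 = {DEFAULT: 37, '\n': 65, '\r': 65}   # copied from the new tables

    def comment_end(text, i):
        # stay in state 37 while the DFA loops on non-newline characters
        while i < len(text) and state_37.get(text[i], state_37[DEFAULT]) == 37:
            i += 1
        return i

    assert comment_end("# a comment\nx = 1", 1) == len("# a comment")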
74 changes: 34 additions & 40 deletions pypy/interpreter/pyparser/gendfa.py
@@ -44,11 +44,13 @@ def makeComment ():
                      any(states, notGroupStr(states, "\r\n")))
     # ____________________________________________________________
     # Names
-    name = chain(states,
-                 groupStr(states, string.letters + "_"),
-                 any(states, groupStr(states,
-                     string.letters + string.digits + "_")))
-    label(labels, name, "NAME")
+    name = label(
+        labels,
+        chain(states,
+              groupStr(states, string.letters + "_"),
+              any(states, groupStr(states,
+                  string.letters + string.digits + "_"))),
+        "NAME")
     # ____________________________________________________________
     # Digits
     def makeDigits ():
@@ -114,8 +116,10 @@ def makeFloat ():
                           groupStr(states, "jJ")))
     # ____________________________________________________________
     # Any old number
-    number = group(states, imagNumber, makeFloat(), intNumber)
-    label(labels, number, "NUMBER")
+    number = label(
+        labels,
+        group(states, imagNumber, makeFloat(), intNumber),
+        "NUMBER")
 
     # ____________________________________________________________
     # Funny
@@ -124,15 +128,15 @@ def makeFloat ():
     for op in sorted(pygram.python_opmap):
         if op == "$NUM":
             continue
-        funny.append(chain(states, chainStr(states, op)))
-        label(labels, funny[-1], op)
-    revdb_metavar = chain(states,
-                          groupStr(states, "$"),
-                          atleastonce(states, makeDigits()))
-    label(labels, revdb_metavar, "REVDBMETAVAR")
+        funny.append(label(labels, chain(states, chainStr(states, op)), op))
+    revdb_metavar = label(
+        labels,
+        chain(states,
+              groupStr(states, "$"),
+              atleastonce(states, makeDigits())),
+        "REVDBMETAVAR")
     funny.append(revdb_metavar)
-    eol = makeEOL()
-    label(labels, eol, "NEWLINE")
+    eol = label(labels, makeEOL(), "NEWLINE")
     funny.append(eol)
     funny = group(states, *funny)
     # ____________________________________________________________
Expand All @@ -141,13 +145,7 @@ def makeStrPrefix ():
maybe(states, groupStr(states, "uUbB")),
maybe(states, groupStr(states, "rR")))
# ____________________________________________________________
def makeStr(quote):
regular_end = newArcPair(states, quote)
# add a label to the closing quote where a string is finished on one
# line
label(labels, regular_end, "STRING")
continuation_end = makeLineCont()
label(labels, continuation_end, "TOK_STRING_CONTINUATION")
def makeStr(quote, cont_label):
return chain(
states,
makeStrPrefix(),
@@ -161,30 +159,26 @@ def makeStr(quote):
                 any(states,
                     notGroupStr(states, "\r\n%s\\" % quote)))),
             group(states,
-                  regular_end,
-                  continuation_end))
+                  # add a label to the closing quote where a string is finished
+                  # on one line
+                  label(labels, newArcPair(states, quote), "STRING"),
+                  # special label for continuation end
+                  label(labels, makeLineCont(), cont_label)))
     contStr = group(states,
-                    makeStr('"'),
-                    makeStr("'"))
+                    makeStr('"', "TOK_STRING_CONTINUATION_DOUBLE"),
+                    makeStr("'", "TOK_STRING_CONTINUATION_SINGLE"))
     triple = chain(states,
                    makeStrPrefix(),
                    group(states,
-                         chainStr(states, "'''"),
-                         chainStr(states, '"""')))
-    label(labels, triple, "TOK_TRIPLE_QUOTE_START")
-    comment = makeComment()
-    label(labels, comment, "TOK_COMMENT")
-    linecont = makeLineCont()
-    label(labels, linecont, "TOK_LINECONT")
+                         label(labels, chainStr(states, "'''"), "TOK_TRIPLE_QUOTE_START_SINGLE"),
+                         label(labels, chainStr(states, '"""'), "TOK_TRIPLE_QUOTE_START_DOUBLE")))
     pseudoExtras = group(states,
-                         linecont,
-                         comment,
+                         label(labels, makeLineCont(), "TOK_LINECONT"),
+                         label(labels, makeComment(), "TOK_COMMENT"),
                          triple)
-    pseudoToken = chain(states,
-                        makeWhitespace(),
-                        group(states,
-                              newArcPair(states, EMPTY),
-                              pseudoExtras, number, funny, contStr, name))
+    pseudoToken = group(states,
+                        newArcPair(states, EMPTY),
+                        pseudoExtras, number, funny, contStr, name)
     label(labels, pseudoToken, "ACCEPT")
     dfaStates, dfaAccepts = nfaToDfa(states, pseudoToken[0], labels)
     #view(dfaStates, dfaAccepts)
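Most of the gendfa.py churn is one refactoring: `label()` evidently returns the NFA fragment it annotates, so fragments can now be labelled inline at their construction site (`name = label(labels, chain(...), "NAME")`) instead of in a separate statement, and `makeStr()` takes a `cont_label` argument so the double- and single-quoted variants register distinct continuation labels. The shape of that pattern, sketched with simplified types (the real `label` works on gendfa's NFA arc pairs):

    # sketch: 'fragment' is whatever chain()/group() return in gendfa
    def label(labels, fragment, name):
        labels.append((fragment, name))   # record the annotation ...
        return fragment                   # ... and hand the fragment back

    # which is what makes call-site composition like this possible:
    # contStr = group(states,
    #                 makeStr('"', "TOK_STRING_CONTINUATION_DOUBLE"),
    #                 makeStr("'", "TOK_STRING_CONTINUATION_SINGLE"))

Labelling at construction also removes the temporaries (`regular_end`, `continuation_end`, `comment`, `linecont`) that existed only to be passed to `label()` one line later.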
10 changes: 6 additions & 4 deletions pypy/interpreter/pyparser/pygram.py
@@ -53,10 +53,12 @@ class _Tokens(object):
 
 # a few special token numbers for the tokenizer. They never end up in a Token
 # instance
-_Tokens.TOK_STRING_CONTINUATION = -3
-_Tokens.TOK_TRIPLE_QUOTE_START = -4
-_Tokens.TOK_COMMENT = -5
-_Tokens.TOK_LINECONT = -6
+_Tokens.TOK_STRING_CONTINUATION_SINGLE = -3
+_Tokens.TOK_STRING_CONTINUATION_DOUBLE = -4
+_Tokens.TOK_TRIPLE_QUOTE_START_SINGLE = -5
+_Tokens.TOK_TRIPLE_QUOTE_START_DOUBLE = -6
+_Tokens.TOK_COMMENT = -7
+_Tokens.TOK_LINECONT = -8
 
 python_opmap = {}
 for op, idx in pytoken.python_opmap.iteritems():
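Splitting the continuation and triple-quote sentinels per quote character means the DFA's single integer verdict now carries everything the tokenizer previously had to rediscover by re-inspecting the matched text. A hypothetical consumer (the branch bodies are invented; only the constant values come from this diff):

    # stand-in for pygram's _Tokens, with the constants from the diff
    class _Tokens(object):
        TOK_STRING_CONTINUATION_SINGLE = -3
        TOK_STRING_CONTINUATION_DOUBLE = -4
        TOK_TRIPLE_QUOTE_START_SINGLE = -5
        TOK_TRIPLE_QUOTE_START_DOUBLE = -6
        TOK_COMMENT = -7
        TOK_LINECONT = -8

    def on_pseudo_token(tok):
        if tok >= 0:
            return "emit a real Token to the parser"
        if tok in (_Tokens.TOK_STRING_CONTINUATION_SINGLE,
                   _Tokens.TOK_STRING_CONTINUATION_DOUBLE):
            return "string literal continues on the next line"
        if tok in (_Tokens.TOK_TRIPLE_QUOTE_START_SINGLE,
                   _Tokens.TOK_TRIPLE_QUOTE_START_DOUBLE):
            return "keep scanning until the matching triple quote"
        if tok == _Tokens.TOK_COMMENT:
            return "discard the rest of the line"
        assert tok == _Tokens.TOK_LINECONT
        return "backslash-newline: join the two lines"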
23 changes: 0 additions & 23 deletions pypy/interpreter/pyparser/pytokenize.py
@@ -10,35 +10,12 @@
 basil.util.automata module.
 """
 # ______________________________________________________________________
 
 from pypy.interpreter.pyparser import automata
 from pypy.interpreter.pyparser.dfa_generated import *
 
 __all__ = [ "tokenize" ]
 
-endDFAs = {"'" : singleDFA,
-           '"' : doubleDFA,
-           'r' : None,
-           'R' : None,
-           'u' : None,
-           'U' : None,
-           'b' : None,
-           'B' : None}
-
-for uniPrefix in ("", "u", "U", "b", "B"):
-    for rawPrefix in ("", "r", "R"):
-        prefix = uniPrefix + rawPrefix
-        endDFAs[prefix + "'''"] = single3DFA
-        endDFAs[prefix + '"""'] = double3DFA
-
-whiteSpaceStatesAccepts = [True]
-whiteSpaceStates = [{'\t': 0, ' ': 0, '\x0c': 0}]
-whiteSpaceDFA = automata.DFA(whiteSpaceStates, whiteSpaceStatesAccepts)
-
 # ______________________________________________________________________
 # COPIED:
 
 triple_quoted = {}
 for t in ("'''", '"""',
           "r'''", 'r"""', "R'''", 'R"""',
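The deleted `endDFAs` table chose the string-end automaton by re-keying on the token's prefix and quote characters. With the pseudo DFA classifying quoting flavours itself, an equivalent choice can presumably be driven straight off the sentinel, along these lines (the dict is invented and assumes the module's `from ... dfa_generated import *`; `singleDFA`, `doubleDFA`, `single3DFA` and `double3DFA` are the generated automata the old table referenced):

    # hypothetical replacement for the endDFAs lookup
    end_dfa_for_sentinel = {
        _Tokens.TOK_STRING_CONTINUATION_SINGLE: singleDFA,
        _Tokens.TOK_STRING_CONTINUATION_DOUBLE: doubleDFA,
        _Tokens.TOK_TRIPLE_QUOTE_START_SINGLE:  single3DFA,
        _Tokens.TOK_TRIPLE_QUOTE_START_DOUBLE:  double3DFA,
    }

The one-state `whiteSpaceDFA` disappears for the same reason in reverse: whitespace skipping also left the pseudo-token DFA (see the `makeWhitespace()` removal in gendfa.py), so it is presumably handled directly by the tokenizer loop in one of the changed files not rendered on this page.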
