"""Lexer for a small Scheme-like language: turns source text into a list of token objects."""
from env import *
from tokens import *
import re


class LexError(Exception):
    pass


class Lexer:
    def __init__(self):
        pass

    def isdelim(self, c):
        """We allow None as a delimiter because that will be the EOF delimiter."""
        return c is None or c.isspace() or c in '()[]";#\0'

    def getchar(self):
        try:
            self.char = self.instring[0]
            self.instring = self.instring[1:]
        except IndexError:
            self.char = None

    def tokenize(self, instring):
        self.instring = instring
        self.getchar()
        tokens = []
        while self.char:
            token = self.gentoken()
            if token is not None:  # trailing whitespace/comments yield no token
                tokens.append(token)
            self.getchar()
        return tokens

    def gentoken(self):
        # Skip whitespace and ;-comments, which run to the end of the line.
        incomment = False
        while self.char and (self.char.isspace() or incomment or self.char == ';'):
            if self.char == ';':
                incomment = True
            elif self.char == '\n':
                incomment = False
            self.getchar()
        if self.char == '(':
            return LParen()
        elif self.char == ')':
            return RParen()
        elif self.char == '[':
            return LBrack()
        elif self.char == ']':
            return RBrack()
        elif self.char == '\'':
            return Quote()
        elif self.char == '`':
            return BQuote()
        elif self.char == '"':
            self.getchar()
            s = '"'
            while self.char != '"':
                if self.char is None:
                    raise LexError('unterminated string literal')
                s += self.char
                self.getchar()
            s += '"'
            return String(s)
        elif self.char == '#':
            self.getchar()
            if self.char == 't' or self.char == 'T':
                return ID('#t')
            elif self.char == 'f' or self.char == 'F':
                return ID('#f')
            else:
                raise LexError('unsupported # syntax: #%s' % self.char)
        elif self.char == ',':
            n = self.instring[:1]
            if n == '@':
                self.getchar()
                return UnSyntaxSplicing()
            else:
                return Comma()
        elif self.char is not None:
            ## number or identifier
            return self.getnum()
        return None  # nothing left but whitespace/comments

    def getnum(self):
        ## can't use getchar for a bit...
        pos = 0
        hasdec = False
        hasexp = False
        ## this is ugly, and I shouldn't have to do it. make it better.
        self.instring = self.char + self.instring
        c = self.instring[pos]
        while not self.isdelim(c):
            if not c.isdigit():
                if c == '.':
                    if not hasdec:
                        hasdec = True
                        pos += 1
                        try:
                            c = self.instring[pos]
                        except IndexError:
                            c = None  # 2. is okay.
                        continue
                elif c in 'eE':
                    pos += 1
                    try:
                        c = self.instring[pos]
                    except IndexError:
                        return self.getid()  # 2e is not okay, it is an ID.
                    if c in '-+':
                        pos += 1
                        try:
                            c = self.instring[pos]
                        except IndexError:
                            return self.getid()  # 2e+ is also an ID.
                    if not c.isdigit():
                        return self.getid()  # the exponent needs at least one digit
                    hasexp = True
                    continue
                ## any other non-digit character makes this an identifier
                return self.getid()
            pos += 1
            try:
                c = self.instring[pos]
            except IndexError:
                c = None
        lexval = self.instring[:pos]
        self.instring = self.instring[pos:]
        if hasdec or hasexp:  # an exponent also makes it a float, e.g. 2e3
            return Float(lexval)
        else:
            return Int(lexval)

    def getid(self):
        pos = 0
        c = self.instring[pos]
        while not self.isdelim(c):
            pos += 1
            try:
                c = self.instring[pos]
            except IndexError:
                c = None
        lexval = self.instring[:pos]
        self.instring = self.instring[pos:]
        return ID(lexval)


if __name__ == '__main__':
    L = Lexer()
    s = """(define (fib-iter n a b)
              (if (= n 0)
                  a
                  (fib-iter (- n 1) b (+ a b))))"""
    t = L.tokenize(s)
    print(t)
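
    # Illustrative extra demo, not part of the original file: a minimal sketch
    # exercising the comment, string, and float paths as well. Like the demo
    # above, it assumes the token classes imported from `tokens` print readably.
    sample = '; doubles a float\n(define (dbl x) "doc string" (* 2.0e0 x))'
    print(L.tokenize(sample))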