Commit
lexer emits a stream of tokens matching the minimal grammar; the parser reads the token stream and marshals it into a sourceFile data structure
Showing 6 changed files with 1,007 additions and 0 deletions.
@@ -0,0 +1,30 @@
package gmars

// nodeType identifies what an expNode holds: a literal, a symbol, or an operator.
type nodeType uint8

const (
	nodeLiteral nodeType = iota
	nodeSymbol
	nodeOp // + - * / %
)

// expNode is a node in an expression tree: a literal value, a symbol
// reference, or an operator with operands a and b.
type expNode struct {
	typ    nodeType
	symbol string
	value  int
	a      *expNode
	b      *expNode
}

// expression stores the raw tokens of an expression and, once parsed,
// the root of its expNode tree.
type expression struct {
	tokens []token
	root   *expNode
}

func newExpression(t []token) *expression {
	return &expression{tokens: t}
}

// AppendToken adds a token to the expression's token list.
func (e *expression) AppendToken(t token) {
	e.tokens = append(e.tokens, t)
}
@@ -0,0 +1,208 @@
package gmars

import (
	"bufio"
	"fmt"
	"io"
	"unicode"
)

type lexer struct {
	reader   *bufio.Reader
	nextRune rune // one-rune lookahead
	atEOF    bool
	closed   bool
	tokens   chan token
}

// lexStateFn is a state of the lexer; each state returns the next state,
// or nil when lexing is finished.
type lexStateFn func(l *lexer) lexStateFn

func newLexer(r io.Reader) *lexer {
	lex := &lexer{
		reader: bufio.NewReader(r),
		tokens: make(chan token),
	}
	lex.next()
	go lex.run()
	return lex
}

// next consumes the current lookahead rune, returning it along with a
// flag reporting whether the input is exhausted.
func (l *lexer) next() (rune, bool) {
	if l.atEOF {
		return '\x00', true
	}

	r, _, err := l.reader.ReadRune()
	if err != nil {
		l.atEOF = true
		return l.nextRune, true
	}

	lastRune := l.nextRune
	l.nextRune = r
	return lastRune, false
}

// run drives the state machine until a terminal state, then closes the
// token channel.
func (l *lexer) run() {
	for state := lexInput; state != nil; {
		state = state(l)
	}
	close(l.tokens)
	l.closed = true
}

// NextToken returns the next token, or an error once the stream is closed.
func (l *lexer) NextToken() (token, error) {
	if l.closed {
		return token{}, fmt.Errorf("no more tokens")
	}
	return <-l.tokens, nil
}

// Tokens drains the lexer, returning every token up to and including the
// first tokEOF or tokError.
func (l *lexer) Tokens() ([]token, error) {
	tokens := make([]token, 0)
	for {
		token, err := l.NextToken()
		if err != nil {
			return nil, err
		}
		tokens = append(tokens, token)
		if token.typ == tokEOF || token.typ == tokError {
			break
		}
	}
	return tokens, nil
}

// emitConsume emits tok, consumes the current rune, and transitions to
// nextState, or emits tokEOF and stops if the input is exhausted.
func (l *lexer) emitConsume(tok token, nextState lexStateFn) lexStateFn {
	l.tokens <- tok
	_, eof := l.next()
	if eof {
		l.tokens <- token{tokEOF, ""}
		return nil
	}
	return nextState
}

// lexInput is the top-level state: it skips whitespace and dispatches to
// the other states based on the lookahead rune.
func lexInput(l *lexer) lexStateFn {
	// consume any space until non-space characters, emitting tokNewlines
	if unicode.IsSpace(l.nextRune) {
		for unicode.IsSpace(l.nextRune) {
			if l.nextRune == '\n' {
				l.tokens <- token{typ: tokNewline}
			}
			_, eof := l.next()
			if eof {
				l.tokens <- token{typ: tokEOF}
				return nil
			}
		}
		return lexInput
	}

	// handle alphanumeric input
	if unicode.IsLetter(l.nextRune) || l.nextRune == '_' {
		return lexText
	}

	if unicode.IsDigit(l.nextRune) {
		return lexNumber
	}

	// handle comments
	if l.nextRune == ';' {
		return lexComment
	}

	// dispatch based on next rune, or error
	switch l.nextRune {
	case '\x00':
		l.tokens <- token{tokEOF, ""}
	case ',':
		return l.emitConsume(token{tokComma, ","}, lexInput)
	case '(':
		return l.emitConsume(token{tokParenL, "("}, lexInput)
	case ')':
		return l.emitConsume(token{tokParenR, ")"}, lexInput)
	case '+':
		fallthrough
	case '-':
		fallthrough
	case '*':
		fallthrough
	case '/':
		fallthrough
	case '%':
		return l.emitConsume(token{tokExprOp, string(l.nextRune)}, lexInput)
	case '$':
		fallthrough
	case '#':
		fallthrough
	case '{':
		fallthrough
	case '}':
		fallthrough
	case '<':
		fallthrough
	case '>':
		return l.emitConsume(token{tokAddressMode, string(l.nextRune)}, lexInput)
	default:
		l.tokens <- token{tokError, fmt.Sprintf("unexpected character: '%s'", string(l.nextRune))}
	}

	return nil
}

// lexText scans an identifier or opcode made of letters, digits, '.' and '_'.
func lexText(l *lexer) lexStateFn {
	runeBuf := make([]rune, 0, 10)

	for unicode.IsLetter(l.nextRune) || unicode.IsDigit(l.nextRune) || l.nextRune == '.' || l.nextRune == '_' {
		r, eof := l.next()
		runeBuf = append(runeBuf, r)
		if eof {
			l.tokens <- token{typ: tokText, val: string(runeBuf)}
			l.tokens <- token{typ: tokEOF}
			return nil
		}
	}

	if len(runeBuf) > 0 {
		l.tokens <- token{typ: tokText, val: string(runeBuf)}
	}

	return lexInput
}

// lexNumber scans a run of decimal digits.
func lexNumber(l *lexer) lexStateFn {
	numberBuf := make([]rune, 0, 10)
	for unicode.IsDigit(l.nextRune) {
		r, eof := l.next()
		numberBuf = append(numberBuf, r)
		if eof {
			l.tokens <- token{tokNumber, string(numberBuf)}
			l.tokens <- token{typ: tokEOF}
			return nil
		}
	}

	if len(numberBuf) > 0 {
		l.tokens <- token{tokNumber, string(numberBuf)}
	}

	return lexInput
}

// lexComment scans a comment from the leading ';' up to, but not
// including, the end of the line.
func lexComment(l *lexer) lexStateFn {
	commentBuf := make([]rune, 0, 32)

	for l.nextRune != '\n' {
		commentBuf = append(commentBuf, l.nextRune)
		_, eof := l.next()
		if eof {
			l.tokens <- token{tokComment, string(commentBuf)}
			l.tokens <- token{tokEOF, ""}
			return nil
		}
	}
	l.tokens <- token{typ: tokComment, val: string(commentBuf)}
	return lexInput
}
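
For orientation, here is a minimal sketch of driving this lexer from inside the gmars package, using only the newLexer and Tokens functions added in this commit; the helper name and the sample input line are illustrative, not part of the commit.

package gmars

import (
	"fmt"
	"strings"
)

// exampleLexRun is a hypothetical helper (not in this commit) that lexes
// one line of input and prints each emitted token's type and value.
func exampleLexRun() {
	l := newLexer(strings.NewReader("start mov #0, $1 ; example\n"))
	tokens, err := l.Tokens() // drains the channel up to tokEOF or tokError
	if err != nil {
		fmt.Println("lex error:", err)
		return
	}
	for _, tok := range tokens {
		fmt.Printf("%v %q\n", tok.typ, tok.val)
	}
}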
@@ -0,0 +1,136 @@
package gmars

import (
	"fmt"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type lexTestCase struct {
	input    string
	expected []token
}

func runLexTests(t *testing.T, setName string, testCases []lexTestCase) {
	for i, test := range testCases {
		l := newLexer(strings.NewReader(test.input))
		out, err := l.Tokens()
		require.NoError(t, err, fmt.Errorf("%s test %d: error: %s", setName, i, err))
		assert.Equal(t, test.expected, out, fmt.Sprintf("%s test %d", setName, i))
	}
}

func TestLexer(t *testing.T) {
	testCases := []lexTestCase{
		{
			input: "",
			expected: []token{
				{tokEOF, ""},
			},
		},
		{
			input: "\n",
			expected: []token{
				{typ: tokNewline},
				{typ: tokEOF},
			},
		},
		{
			input: "start mov # -1, $2 ; comment\n",
			expected: []token{
				{tokText, "start"},
				{tokText, "mov"},
				{tokAddressMode, "#"},
				{tokExprOp, "-"},
				{tokNumber, "1"},
				{tokComma, ","},
				{tokAddressMode, "$"},
				{tokNumber, "2"},
				{tokComment, "; comment"},
				{tokNewline, ""},
				{tokEOF, ""},
			},
		},
		{
			input: "step equ (1+3)-start\n",
			expected: []token{
				{tokText, "step"},
				{tokText, "equ"},
				{tokParenL, "("},
				{tokNumber, "1"},
				{tokExprOp, "+"},
				{tokNumber, "3"},
				{tokParenR, ")"},
				{tokExprOp, "-"},
				{tokText, "start"},
				{tokNewline, ""},
				{tokEOF, ""},
			},
		},
		{
			input: "111",
			expected: []token{
				{tokNumber, "111"},
				{tokEOF, ""},
			},
		},
		{
			input: "; comment",
			expected: []token{
				{tokComment, "; comment"},
				{tokEOF, ""},
			},
		},
		{
			input: "text",
			expected: []token{
				{tokText, "text"},
				{tokEOF, ""},
			},
		},
		{
			input: "#",
			expected: []token{
				{tokAddressMode, "#"},
				{tokEOF, ""},
			},
		},
		{
			input: "underscore_text",
			expected: []token{
				{tokText, "underscore_text"},
				{tokEOF, ""},
			},
		},
		{
			input: "~",
			expected: []token{
				{tokError, "unexpected character: '~'"},
			},
		},
	}

	runLexTests(t, "TestLexer", testCases)
}

func TestLexEnd(t *testing.T) {
	l := newLexer(strings.NewReader("test mov 0, 1\n"))

	_, err := l.Tokens()
	assert.NoError(t, err)

	tok, err := l.NextToken()
	assert.Error(t, err)
	assert.Equal(t, token{}, tok)

	tokens, err := l.Tokens()
	assert.Error(t, err)
	assert.Nil(t, tokens)

	r, eof := l.next()
	assert.True(t, eof)
	assert.Equal(t, r, '\x00')
}
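
If another case were wanted, a hypothetical extra test (not part of this commit) could reuse runLexTests; the expected tokens below follow the lexing rules above for a bare instruction written without address modes.

package gmars

import "testing"

// TestLexBareInstruction is a hypothetical additional test (not in this
// commit) covering an instruction with no address modes.
func TestLexBareInstruction(t *testing.T) {
	runLexTests(t, "TestLexBareInstruction", []lexTestCase{
		{
			input: "dat 0, 0\n",
			expected: []token{
				{tokText, "dat"},
				{tokNumber, "0"},
				{tokComma, ","},
				{tokNumber, "0"},
				{tokNewline, ""},
				{tokEOF, ""},
			},
		},
	})
}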