Implement Parser Frontend (#56)
lexer emits a stream of tokens matching a minimal grammar
parser reads the token stream and marshals it into a sourceFile data structure
bobertlo authored Nov 13, 2024
1 parent bc8df07 commit c6a79e6
Showing 6 changed files with 1,007 additions and 0 deletions.
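
For orientation before the diffs: the lexer added here exposes a batch interface, Tokens(), which drains the token channel until a tokEOF or tokError arrives. A minimal sketch of driving it from inside the gmars package (the example function and input line are hypothetical, not part of this commit):

package gmars

import (
	"fmt"
	"strings"
)

// exampleBatch lexes one hypothetical Redcode line and prints each
// token's type and value. Tokens() collects until tokEOF or tokError.
func exampleBatch() {
	l := newLexer(strings.NewReader("start mov #0, $1\n"))
	tokens, err := l.Tokens()
	if err != nil {
		fmt.Println("lex error:", err)
		return
	}
	for _, tok := range tokens {
		fmt.Printf("%v %q\n", tok.typ, tok.val)
	}
}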
30 changes: 30 additions & 0 deletions expr.go
@@ -0,0 +1,30 @@
package gmars

type nodeType uint8

const (
	nodeLiteral nodeType = iota
	nodeSymbol
	nodeOp // + - * / %
)

type expNode struct {
	typ    nodeType
	symbol string
	value  int
	a      *expNode
	b      *expNode
}

type expression struct {
	tokens []token
	root   *expNode
}

func newExpression(t []token) *expression {
	return &expression{tokens: t}
}

func (e *expression) AppendToken(t token) {
	e.tokens = append(e.tokens, t)
}
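
The expression type above pairs the raw token slice with a binary tree of expNode values: literals and symbols at the leaves, operators at interior nodes. As a hand-built illustration (assuming the operator character lives in the node's symbol field, which this diff does not confirm), the expression (1+3)-start could be represented as:

package gmars

// exampleTree hand-builds an expNode tree for "(1+3)-start".
// Hypothetical illustration only; how the parser populates root
// from expression.tokens is not shown in this diff.
func exampleTree() *expression {
	sum := &expNode{
		typ:    nodeOp,
		symbol: "+",
		a:      &expNode{typ: nodeLiteral, value: 1},
		b:      &expNode{typ: nodeLiteral, value: 3},
	}
	root := &expNode{
		typ:    nodeOp,
		symbol: "-",
		a:      sum,
		b:      &expNode{typ: nodeSymbol, symbol: "start"},
	}
	return &expression{root: root}
}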
208 changes: 208 additions & 0 deletions lex.go
@@ -0,0 +1,208 @@
package gmars

import (
	"bufio"
	"fmt"
	"io"
	"unicode"
)

// lexer reads runes from an input stream and emits tokens on a
// channel. It keeps one rune of lookahead in nextRune.
type lexer struct {
	reader   *bufio.Reader
	nextRune rune
	atEOF    bool
	tokens   chan token
}

// lexStateFn is one state of the lexer's state machine; it returns
// the next state, or nil to terminate lexing.
type lexStateFn func(l *lexer) lexStateFn

func newLexer(r io.Reader) *lexer {
	lex := &lexer{
		reader: bufio.NewReader(r),
		tokens: make(chan token),
	}
	lex.next() // prime the lookahead rune
	go lex.run()
	return lex
}

// next consumes the lookahead rune, loads the following rune, and
// returns the consumed rune along with an end-of-input flag.
func (l *lexer) next() (rune, bool) {
	if l.atEOF {
		return '\x00', true
	}

	r, _, err := l.reader.ReadRune()
	if err != nil {
		l.atEOF = true
		return l.nextRune, true
	}

	lastRune := l.nextRune
	l.nextRune = r
	return lastRune, false
}

// run drives the state machine until a state returns nil, then
// closes the token channel.
func (l *lexer) run() {
	for state := lexInput; state != nil; {
		state = state(l)
	}
	close(l.tokens)
}

// NextToken returns the next token, or an error once the token
// channel has been closed and drained. Checking the channel's
// closed state directly avoids a race with the run goroutine.
func (l *lexer) NextToken() (token, error) {
	tok, ok := <-l.tokens
	if !ok {
		return token{}, fmt.Errorf("no more tokens")
	}
	return tok, nil
}

// Tokens drains the lexer, returning every token up to and including
// the terminating tokEOF or tokError.
func (l *lexer) Tokens() ([]token, error) {
	tokens := make([]token, 0)
	for {
		tok, err := l.NextToken()
		if err != nil {
			return nil, err
		}
		tokens = append(tokens, tok)
		if tok.typ == tokEOF || tok.typ == tokError {
			break
		}
	}
	return tokens, nil
}

// emitConsume emits tok and consumes one rune, continuing to
// nextState, or emitting tokEOF and halting if input is exhausted.
func (l *lexer) emitConsume(tok token, nextState lexStateFn) lexStateFn {
	l.tokens <- tok
	_, eof := l.next()
	if eof {
		l.tokens <- token{tokEOF, ""}
		return nil
	}
	return nextState
}

func lexInput(l *lexer) lexStateFn {
	// consume whitespace until a non-space character, emitting a
	// tokNewline for each '\n'
	if unicode.IsSpace(l.nextRune) {
		for unicode.IsSpace(l.nextRune) {
			if l.nextRune == '\n' {
				l.tokens <- token{typ: tokNewline}
			}
			_, eof := l.next()
			if eof {
				l.tokens <- token{typ: tokEOF}
				return nil
			}
		}
		return lexInput
	}

	// handle alphanumeric input
	if unicode.IsLetter(l.nextRune) || l.nextRune == '_' {
		return lexText
	}

	if unicode.IsDigit(l.nextRune) {
		return lexNumber
	}

	// handle comments
	if l.nextRune == ';' {
		return lexComment
	}

	// dispatch based on the next rune, or error
	switch l.nextRune {
	case '\x00':
		l.tokens <- token{tokEOF, ""}
	case ',':
		return l.emitConsume(token{tokComma, ","}, lexInput)
	case '(':
		return l.emitConsume(token{tokParenL, "("}, lexInput)
	case ')':
		return l.emitConsume(token{tokParenR, ")"}, lexInput)
	case '+', '-', '*', '/', '%':
		return l.emitConsume(token{tokExprOp, string(l.nextRune)}, lexInput)
	case '$', '#', '{', '}', '<', '>':
		return l.emitConsume(token{tokAddressMode, string(l.nextRune)}, lexInput)
	default:
		l.tokens <- token{tokError, fmt.Sprintf("unexpected character: '%s'", string(l.nextRune))}
	}

	return nil
}

// lexText accumulates letters, digits, '.', and '_' into a tokText.
func lexText(l *lexer) lexStateFn {
	runeBuf := make([]rune, 0, 10)

	for unicode.IsLetter(l.nextRune) || unicode.IsDigit(l.nextRune) || l.nextRune == '.' || l.nextRune == '_' {
		r, eof := l.next()
		runeBuf = append(runeBuf, r)
		if eof {
			l.tokens <- token{typ: tokText, val: string(runeBuf)}
			l.tokens <- token{typ: tokEOF}
			return nil
		}
	}

	if len(runeBuf) > 0 {
		l.tokens <- token{typ: tokText, val: string(runeBuf)}
	}

	return lexInput
}

// lexNumber accumulates a run of digits into a tokNumber.
func lexNumber(l *lexer) lexStateFn {
	numberBuf := make([]rune, 0, 10)
	for unicode.IsDigit(l.nextRune) {
		r, eof := l.next()
		numberBuf = append(numberBuf, r)
		if eof {
			l.tokens <- token{tokNumber, string(numberBuf)}
			l.tokens <- token{typ: tokEOF}
			return nil
		}
	}

	if len(numberBuf) > 0 {
		l.tokens <- token{tokNumber, string(numberBuf)}
	}

	return lexInput
}

// lexComment accumulates everything up to a newline into a tokComment.
func lexComment(l *lexer) lexStateFn {
	commentBuf := make([]rune, 0, 32)

	for l.nextRune != '\n' {
		commentBuf = append(commentBuf, l.nextRune)
		_, eof := l.next()
		if eof {
			l.tokens <- token{tokComment, string(commentBuf)}
			l.tokens <- token{tokEOF, ""}
			return nil
		}
	}
	l.tokens <- token{typ: tokComment, val: string(commentBuf)}
	return lexInput
}
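
The lexer above follows the state-function pattern: each lexStateFn returns the next state, and run loops until a state returns nil before closing the token channel. Besides the batch Tokens() call, consumers can stream tokens one at a time. A hypothetical sketch, again assuming gmars-internal code:

package gmars

import (
	"fmt"
	"strings"
)

// exampleStream pulls tokens one at a time with NextToken until the
// terminating tokEOF or tokError arrives. Hypothetical illustration.
func exampleStream() {
	l := newLexer(strings.NewReader("step equ (1+3)-start\n"))
	for {
		tok, err := l.NextToken()
		if err != nil {
			return // token channel closed and drained
		}
		fmt.Printf("%v %q\n", tok.typ, tok.val)
		if tok.typ == tokEOF || tok.typ == tokError {
			return
		}
	}
}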
136 changes: 136 additions & 0 deletions lex_test.go
@@ -0,0 +1,136 @@
package gmars

import (
	"fmt"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type lexTestCase struct {
	input    string
	expected []token
}

func runLexTests(t *testing.T, setName string, testCases []lexTestCase) {
	for i, test := range testCases {
		l := newLexer(strings.NewReader(test.input))
		out, err := l.Tokens()
		require.NoError(t, err, fmt.Sprintf("%s test %d", setName, i))
		assert.Equal(t, test.expected, out, fmt.Sprintf("%s test %d", setName, i))
	}
}

func TestLexer(t *testing.T) {
	testCases := []lexTestCase{
		{
			input: "",
			expected: []token{
				{tokEOF, ""},
			},
		},
		{
			input: "\n",
			expected: []token{
				{typ: tokNewline},
				{typ: tokEOF},
			},
		},
		{
			input: "start mov # -1, $2 ; comment\n",
			expected: []token{
				{tokText, "start"},
				{tokText, "mov"},
				{tokAddressMode, "#"},
				{tokExprOp, "-"},
				{tokNumber, "1"},
				{tokComma, ","},
				{tokAddressMode, "$"},
				{tokNumber, "2"},
				{tokComment, "; comment"},
				{tokNewline, ""},
				{tokEOF, ""},
			},
		},
		{
			input: "step equ (1+3)-start\n",
			expected: []token{
				{tokText, "step"},
				{tokText, "equ"},
				{tokParenL, "("},
				{tokNumber, "1"},
				{tokExprOp, "+"},
				{tokNumber, "3"},
				{tokParenR, ")"},
				{tokExprOp, "-"},
				{tokText, "start"},
				{tokNewline, ""},
				{tokEOF, ""},
			},
		},
		{
			input: "111",
			expected: []token{
				{tokNumber, "111"},
				{tokEOF, ""},
			},
		},
		{
			input: "; comment",
			expected: []token{
				{tokComment, "; comment"},
				{tokEOF, ""},
			},
		},
		{
			input: "text",
			expected: []token{
				{tokText, "text"},
				{tokEOF, ""},
			},
		},
		{
			input: "#",
			expected: []token{
				{tokAddressMode, "#"},
				{tokEOF, ""},
			},
		},
		{
			input: "underscore_text",
			expected: []token{
				{tokText, "underscore_text"},
				{tokEOF, ""},
			},
		},
		{
			input: "~",
			expected: []token{
				{tokError, "unexpected character: '~'"},
			},
		},
	}

	runLexTests(t, "TestLexer", testCases)
}

func TestLexEnd(t *testing.T) {
	l := newLexer(strings.NewReader("test mov 0, 1\n"))

	_, err := l.Tokens()
	assert.NoError(t, err)

	tok, err := l.NextToken()
	assert.Error(t, err)
	assert.Equal(t, token{}, tok)

	tokens, err := l.Tokens()
	assert.Error(t, err)
	assert.Nil(t, tokens)

	r, eof := l.next()
	assert.True(t, eof)
	assert.Equal(t, '\x00', r)
}
[diffs for the remaining 3 changed files not loaded]
