From 6f9d992a73064a209cabe416cca7539323e13365 Mon Sep 17 00:00:00 2001
From: Robert Lowry
Date: Sat, 23 Nov 2024 14:40:18 -0600
Subject: [PATCH] implement symbol scanner (#87)

* symbol scanner: ignore symbols inside for loops

* lexer/scanner: pass invalid input through token stream w/o error

* lex invalid input gracefully
---
 compile.go             |  13 +++-
 lex.go                 |  39 ++--------
 lex_test.go            |   3 +-
 symbol_scanner.go      | 166 +++++++++++++++++++++++++++++++++++++++++
 symbol_scanner_test.go |  63 ++++++++++++++++
 token.go               |   1 +
 tokenbuf.go            |  34 +++++++++
 7 files changed, 285 insertions(+), 34 deletions(-)
 create mode 100644 symbol_scanner.go
 create mode 100644 symbol_scanner_test.go
 create mode 100644 tokenbuf.go

diff --git a/compile.go b/compile.go
index 8e8e598..2583b09 100644
--- a/compile.go
+++ b/compile.go
@@ -446,7 +446,18 @@ func (c *compiler) compile() (WarriorData, error) {
 
 func CompileWarrior(r io.Reader, config SimulatorConfig) (WarriorData, error) {
 	lexer := newLexer(r)
-	parser := newParser(lexer)
+	tokens, err := lexer.Tokens()
+	if err != nil {
+		return WarriorData{}, err
+	}
+
+	// scanner := newSymbolScanner(newBufTokenReader(tokens))
+	// _, err = scanner.ScanInput()
+	// if err != nil {
+	// 	return WarriorData{}, fmt.Errorf("symbol scanner: %s", err)
+	// }
+
+	parser := newParser(newBufTokenReader(tokens))
 	sourceLines, metadata, err := parser.parse()
 	if err != nil {
 		return WarriorData{}, err
diff --git a/lex.go b/lex.go
index 34e9cff..dcf9fd7 100644
--- a/lex.go
+++ b/lex.go
@@ -22,37 +22,6 @@ type lexer struct {
 	tokens chan token
 }
 
-// butTokenReader implements the same interface as a streaming parser to let
-// us cache and reuse the token stream instead of making multiple passes with
-// the lexer
-type bufTokenReader struct {
-	tokens []token
-	i      int
-}
-
-func newBufTokenReader(tokens []token) *bufTokenReader {
-	return &bufTokenReader{tokens: tokens}
-}
-
-func (r *bufTokenReader) NextToken() (token, error) {
-	if r.i >= len(r.tokens) {
-		return token{}, fmt.Errorf("no more tokens")
-	}
-	next := r.tokens[r.i]
-	r.i++
-	return next, nil
-}
-
-func (r *bufTokenReader) Tokens() ([]token, error) {
-	if r.i >= len(r.tokens) {
-		return nil, fmt.Errorf("no more tokens")
-	}
-	subslice := r.tokens[r.i:]
-	ret := make([]token, len(subslice))
-	copy(subslice, ret)
-	return ret, nil
-}
-
 type lexStateFn func(l *lexer) lexStateFn
 
 func newLexer(r io.Reader) *lexer {
@@ -207,7 +176,13 @@ func lexInput(l *lexer) lexStateFn {
 	case '\x1a':
 		return l.consume(lexInput)
 	default:
-		l.tokens <- token{tokError, fmt.Sprintf("unexpected character: '%s'", string(l.nextRune))}
+		// pass the invalid input through the stream. in a properly
+		// formatted file any invalid input comes after an 'end'
+		// pseudo-op, which makes the parser stop before it processes
+		// this token; otherwise the parser reports it as an error
+		l.tokens <- token{tokInvalid, string(l.nextRune)}
+		l.tokens <- token{typ: tokEOF}
+		return nil
 	}
 
 	return nil
diff --git a/lex_test.go b/lex_test.go
index b0532bc..c5e1a3e 100644
--- a/lex_test.go
+++ b/lex_test.go
@@ -114,7 +114,8 @@ func TestLexer(t *testing.T) {
 		{
 			input: "~",
 			expected: []token{
-				{tokError, "unexpected character: '~'"},
+				{tokInvalid, "~"},
+				{tokEOF, ""},
 			},
 		},
 		{
diff --git a/symbol_scanner.go b/symbol_scanner.go
new file mode 100644
index 0000000..89eef1b
--- /dev/null
+++ b/symbol_scanner.go
@@ -0,0 +1,166 @@
+package gmars
+
+import (
+	"fmt"
+	"strings"
+)
+
+// symbolScanner accepts a tokenReader and scans for any
+// equ symbols contained. Symbols defined inside for loops
+// are ignored, allowing us to run the same code both before
+// and after for loops have been expanded.
+type symbolScanner struct {
+	lex tokenReader
+
+	nextToken token
+	atEOF     bool
+	valBuf    []token
+	labelBuf  []string
+	forLevel  int
+	err       error
+
+	symbols map[string][]token
+}
+
+type scanStateFn func(p *symbolScanner) scanStateFn
+
+func newSymbolScanner(lex tokenReader) *symbolScanner {
+	pre := &symbolScanner{
+		lex:     lex,
+		symbols: make(map[string][]token),
+	}
+
+	pre.next()
+
+	return pre
+}
+
+func (p *symbolScanner) next() token {
+	if p.atEOF {
+		return token{typ: tokEOF}
+	}
+	tok, err := p.lex.NextToken()
+	if err != nil {
+		p.atEOF = true
+		return token{tokError, fmt.Sprintf("%s\n", err)}
+	}
+	if tok.typ == tokEOF || tok.typ == tokError {
+		p.atEOF = true
+	}
+	retTok := p.nextToken
+	p.nextToken = tok
+	return retTok
+}
+
+// run the scanner to completion and return the collected symbols
+func (p *symbolScanner) ScanInput() (map[string][]token, error) {
+	for state := scanLine; state != nil; {
+		state = state(p)
+	}
+	if p.err != nil {
+		return nil, p.err
+	}
+	return p.symbols, nil
+}
+
+func (p *symbolScanner) consume(nextState scanStateFn) scanStateFn {
+	p.next()
+	if p.nextToken.typ == tokEOF {
+		return nil
+	}
+	return nextState
+}
+
+// run at start of each line
+// on text: scanLabels
+// on other: scanConsumeLine
+func scanLine(p *symbolScanner) scanStateFn {
+	switch p.nextToken.typ {
+	case tokText:
+		p.labelBuf = make([]string, 0)
+		return scanLabels
+	default:
+		return scanConsumeLine
+	}
+}
+
+// text equ: scanEquValue
+// text op: scanConsumeLine
+// text default: scanLabels
+// anything else: scanConsumeLine
+func scanLabels(p *symbolScanner) scanStateFn {
+	switch p.nextToken.typ {
+	case tokText:
+		if p.nextToken.IsPseudoOp() {
+			opLower := strings.ToLower(p.nextToken.val)
+			switch opLower {
+			case "equ":
+				if p.forLevel == 0 {
+					p.valBuf = make([]token, 0)
+					return p.consume(scanEquValue)
+				}
+			case "for":
+				p.forLevel++
+				return scanConsumeLine
+			case "rof":
+				if p.forLevel > 0 {
+					p.forLevel--
+				}
+				return scanConsumeLine
+			case "end":
+				if p.forLevel > 1 {
+					return scanConsumeLine
+				} else {
+					return nil
+				}
+			default:
+				return scanConsumeLine
+			}
+		} else if p.nextToken.IsOp() {
+			return scanConsumeLine
+		} else if p.nextToken.typ == tokInvalid {
+			return nil
+		}
+		p.labelBuf = append(p.labelBuf, p.nextToken.val)
+		return p.consume(scanLabels)
+	case tokComment:
+		fallthrough
+	case tokNewline:
+		return p.consume(scanLabels)
+	case tokEOF:
+		return nil
+	default:
+		return scanConsumeLine
+	}
+}
+
+func scanConsumeLine(p *symbolScanner) scanStateFn {
+	switch p.nextToken.typ {
+	case tokNewline:
+		return p.consume(scanLine)
+	case tokError:
+		return nil
+	case tokEOF:
+		return nil
+	default:
+		return p.consume(scanConsumeLine)
+	}
+}
+
+func scanEquValue(p *symbolScanner) scanStateFn {
+	for p.nextToken.typ != tokNewline && p.nextToken.typ != tokEOF && p.nextToken.typ != tokError {
+		p.valBuf = append(p.valBuf, p.nextToken)
+		p.next()
+	}
+	for _, label := range p.labelBuf {
+		_, ok := p.symbols[label]
+		if ok {
+			p.err = fmt.Errorf("symbol '%s' redefined", label)
+			return nil
+		}
+		p.symbols[label] = p.valBuf
+	}
+	p.valBuf = make([]token, 0)
+	p.labelBuf = make([]string, 0)
+	return p.consume(scanLine)
+}
diff --git a/symbol_scanner_test.go b/symbol_scanner_test.go
new file mode 100644
index 0000000..8081ef4
--- /dev/null
+++ b/symbol_scanner_test.go
@@ -0,0 +1,63 @@
+package gmars
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+type symbolScannerTestCase struct {
+	input  string
+	output map[string][]token
+}
+
+func runSymbolScannerTests(t *testing.T, cases []symbolScannerTestCase) {
+	for _, test := range cases {
+		tokens, err := LexInput(strings.NewReader(test.input))
+		require.NoError(t, err)
+		require.NotNil(t, tokens)
+
+		scanner := newSymbolScanner(newBufTokenReader(tokens))
+		symbols, err := scanner.ScanInput()
+		require.NoError(t, err)
+		require.NotNil(t, symbols)
+
+		require.Equal(t, test.output, symbols)
+	}
+}
+
+func TestSymbolScanner(t *testing.T) {
+	tests := []symbolScannerTestCase{
+		{
+			input: "test equ 2\ndat 0, test\n",
+			output: map[string][]token{
+				"test": {{tokNumber, "2"}},
+			},
+		},
+		{
+			input:  "dat 0, 0",
+			output: map[string][]token{},
+		},
+		{
+			input: "test\ntest2\nequ 2",
+			output: map[string][]token{
+				"test":  {{tokNumber, "2"}},
+				"test2": {{tokNumber, "2"}},
+			},
+		},
+		{
+			// ignore symbols inside for loops because they could be redefined.
+			// will just re-scan after expanding for loops
+			input: "test equ 2\nfor 0\nq equ 1\nrof\nfor 1\nq equ 2\nrof\n",
+			output: map[string][]token{
+				"test": {{tokNumber, "2"}},
+			},
+		},
+		{
+			input:  "for 1\nend\nrof\n ~",
+			output: map[string][]token{},
+		},
+	}
+	runSymbolScannerTests(t, tests)
+}
diff --git a/token.go b/token.go
index 029733f..c248265 100644
--- a/token.go
+++ b/token.go
@@ -15,6 +15,7 @@ const (
 	tokParenR
 	tokComment // includes semi-colon, no newline char
 	tokNewline
+	tokInvalid // pass invalid runes through individually
 	tokEOF
 )
 
diff --git a/tokenbuf.go b/tokenbuf.go
new file mode 100644
index 0000000..353fc30
--- /dev/null
+++ b/tokenbuf.go
@@ -0,0 +1,34 @@
+package gmars
+
+import "fmt"
+
+// bufTokenReader implements the same interface as a streaming parser to let
+// us cache and reuse the token stream instead of making multiple passes with
+// the lexer
+type bufTokenReader struct {
+	tokens []token
+	i      int
+}
+
+func newBufTokenReader(tokens []token) *bufTokenReader {
+	return &bufTokenReader{tokens: tokens}
+}
+
+func (r *bufTokenReader) NextToken() (token, error) {
+	if r.i >= len(r.tokens) {
+		return token{}, fmt.Errorf("no more tokens")
+	}
+	next := r.tokens[r.i]
+	r.i++
+	return next, nil
+}
+
+func (r *bufTokenReader) Tokens() ([]token, error) {
+	if r.i >= len(r.tokens) {
+		return nil, fmt.Errorf("no more tokens")
+	}
+	subslice := r.tokens[r.i:]
+	ret := make([]token, len(subslice))
+	copy(ret, subslice)
+	return ret, nil
+}