From 79db90f5055e00934c1a10c290cf899d22c7ef31 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 22 Jul 2021 22:09:00 +0300
Subject: [PATCH 01/29] add unit tests for the .set directive

---
 tests/assemble.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/tests/assemble.py b/tests/assemble.py
index 3875ee0..33b41fe 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -3,6 +3,7 @@
 from esp32_ulp.nocomment import remove_comments
 
 src = """\
+        .set const, 123
 
 start:  wait 42
         ld r0, r1, 0
@@ -14,14 +15,14 @@
 
 def test_parse_line():
     a = Assembler()
-    lines = src.splitlines()
-    # note: line number = index + 1
-    assert a.parse_line(lines[0]) == None
-    assert a.parse_line(lines[1]) == ('start', 'wait', ('42', ))
-    assert a.parse_line(lines[2]) == (None, 'ld', ('r0', 'r1', '0', ))
-    assert a.parse_line(lines[3]) == (None, 'st', ('r0', 'r1', '0', ))
-    assert a.parse_line(lines[4]) == (None, 'halt', ())
-    assert a.parse_line(lines[5]) == ('end', None, ())
+    lines = iter(src.splitlines())
+    assert a.parse_line(next(lines)) == (None, '.set', ('const', '123', ))
+    assert a.parse_line(next(lines)) == None
+    assert a.parse_line(next(lines)) == ('start', 'wait', ('42', ))
+    assert a.parse_line(next(lines)) == (None, 'ld', ('r0', 'r1', '0', ))
+    assert a.parse_line(next(lines)) == (None, 'st', ('r0', 'r1', '0', ))
+    assert a.parse_line(next(lines)) == (None, 'halt', ())
+    assert a.parse_line(next(lines)) == ('end', None, ())
 
 
 def test_parse():
@@ -34,8 +35,10 @@ def test_parse():
 def test_assemble():
     a = Assembler()
     a.assemble(src)
+    assert a.symbols.has_sym('const')
     assert a.symbols.has_sym('start')
     assert a.symbols.has_sym('end')
+    assert a.symbols.get_sym('const') == (ABS, None, 123)
     assert a.symbols.get_sym('start') == (REL, TEXT, 0)
     assert a.symbols.get_sym('end') == (REL, TEXT, 4)
     assert len(b''.join(a.sections[TEXT])) == 16  # 4 instructions * 4B
@@ -50,6 +53,7 @@ def test_symbols():
         ('abs_t4', ABS, TEXT, 4),
         ('rel_d4', REL, DATA, 4),
         ('abs_d4', ABS, DATA, 4),
+        ('const', ABS, None, 123),
     ]:
         st.set_sym(*entry)
     # PASS 1 ========================================================
@@ -62,11 +66,13 @@ def test_symbols():
     assert st.resolve_absolute('abs_d4') == 4
     assert st.resolve_absolute('rel_t4') == 4
     assert st.resolve_absolute('rel_d4') == 4
+    assert st.resolve_absolute('const') == 123
     st.set_from(TEXT, 8)
     assert st.resolve_relative('abs_t4') == -4
     assert st.resolve_relative('abs_d4') == -4
     assert st.resolve_relative('rel_t4') == -4
     assert st.resolve_relative('rel_d4') == -4
+    assert st.resolve_absolute('const') == 123
     # PASS 2 ========================================================
     st.set_bases({TEXT: 100, DATA: 200})
     st.set_pass(2)
@@ -84,11 +90,13 @@ def test_symbols():
     assert st.resolve_absolute('abs_d4') == 4
     assert st.resolve_absolute('rel_t4') == 100 + 4
     assert st.resolve_absolute('rel_d4') == 200 + 4
+    assert st.resolve_absolute('const') == 123
     st.set_from(TEXT, 8)
     assert st.resolve_relative('abs_t4') == 4 - 108
     assert st.resolve_relative('abs_d4') == 4 - 108
     assert st.resolve_relative('rel_t4') == 104 - 108
     assert st.resolve_relative('rel_d4') == 204 - 108
+    assert st.resolve_absolute('const') == 123
 
 
 test_parse_line()

From 84d734ddd22f89d44932ad05153b9ff5ac3d38d5 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 22 Jul 2021 22:12:44 +0300
Subject: [PATCH 02/29] add support for left aligned assembler directives (e.g.
 .set)

Much open-source code out there has directives such as .global and .set
starting in the first column of a line. This change allows assembling such code.

Incidentally, this also fixes a bug where directives without parameters,
such as .text and .data, were silently accepted when left-aligned but were
treated as labels instead of section headers.
---
 esp32_ulp/assemble.py  | 2 +-
 tests/assemble.py      | 6 ++++++
 tests/compat/symbols.S | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index d0b1ff2..e775329 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -118,7 +118,7 @@ def parse_line(self, line):
         """
         if not line:
             return
-        has_label = line[0] not in '\t '
+        has_label = line[0] not in '\t .'
         if has_label:
             label_line = line.split(None, 1)
             if len(label_line) == 2:
diff --git a/tests/assemble.py b/tests/assemble.py
index 33b41fe..4a64b1c 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -4,12 +4,14 @@
 
 src = """\
         .set const, 123
+.set const_left, 976
 
 start:  wait 42
         ld r0, r1, 0
         st  r0,  r1,0
         halt
 end:
+.data
 """
 
 
@@ -17,12 +19,14 @@ def test_parse_line():
     a = Assembler()
     lines = iter(src.splitlines())
     assert a.parse_line(next(lines)) == (None, '.set', ('const', '123', ))
+    assert a.parse_line(next(lines)) == (None, '.set', ('const_left', '976', ))
     assert a.parse_line(next(lines)) == None
     assert a.parse_line(next(lines)) == ('start', 'wait', ('42', ))
     assert a.parse_line(next(lines)) == (None, 'ld', ('r0', 'r1', '0', ))
     assert a.parse_line(next(lines)) == (None, 'st', ('r0', 'r1', '0', ))
     assert a.parse_line(next(lines)) == (None, 'halt', ())
     assert a.parse_line(next(lines)) == ('end', None, ())
+    assert a.parse_line(next(lines)) == (None, '.data', ())  # test left-aligned directive is not treated as label
 
 
 def test_parse():
@@ -36,9 +40,11 @@ def test_assemble():
     a = Assembler()
     a.assemble(src)
     assert a.symbols.has_sym('const')
+    assert a.symbols.has_sym('const_left')
     assert a.symbols.has_sym('start')
     assert a.symbols.has_sym('end')
     assert a.symbols.get_sym('const') == (ABS, None, 123)
+    assert a.symbols.get_sym('const_left') == (ABS, None, 976)
     assert a.symbols.get_sym('start') == (REL, TEXT, 0)
     assert a.symbols.get_sym('end') == (REL, TEXT, 4)
     assert len(b''.join(a.sections[TEXT])) == 16  # 4 instructions * 4B
diff --git a/tests/compat/symbols.S b/tests/compat/symbols.S
index bf59c3b..359fa15 100644
--- a/tests/compat/symbols.S
+++ b/tests/compat/symbols.S
@@ -1,10 +1,12 @@
             .text
 
             .set constant42, 42
+.set notindented, 1
 
 start:      move r0, data0
             move r1, data1
             move r2, constant42
+            move r3, notindented
 
             # count from 0 .. 42 in stage register
             stage_rst

From ec81ecc040691076824fca2cd897b70e4d202215 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 22 Jul 2021 22:41:58 +0300
Subject: [PATCH 03/29] fix a crash bug where BSS size calculation was
 attempted on the value of a data item (bytes) instead of the size of that
 data item (int)

The size of the bss section was increased with the value of the defined symbol
rather than the size of that value (number of bytes). This change fixes that.
---
 esp32_ulp/assemble.py |  4 ++--
 tests/assemble.py     | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index e775329..764ae29 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -150,8 +150,8 @@ def append_section(self, value, expected_section=None):
         if expected_section is not None and s is not expected_section:
             raise TypeError('only allowed in %s section' % expected_section)
         if s is BSS:
-            # just increase BSS size by value
-            self.offsets[s] += value
+            # just increase BSS size by length of value
+            self.offsets[s] += len(value)
         else:
             self.sections[s].append(value)
             self.offsets[s] += len(value)
diff --git a/tests/assemble.py b/tests/assemble.py
index 4a64b1c..f23a6b6 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -14,6 +14,13 @@
 .data
 """
 
+src_bss = """\
+  .bss
+
+label:
+  .long 0
+"""
+
 
 def test_parse_line():
     a = Assembler()
@@ -52,6 +59,18 @@ def test_assemble():
     assert a.offsets[BSS] == 0
 
 
+def test_assemble_bss():
+    a = Assembler()
+    try:
+        a.assemble(src_bss)
+    except TypeError:
+        raised = True
+    else:
+        raised = False
+    assert not raised
+    assert a.offsets[BSS] == 4  # 1 word * 4B
+
+
 def test_symbols():
     st = SymbolTable({}, {})
     for entry in [
@@ -108,4 +127,5 @@ def test_symbols():
 test_parse_line()
 test_parse()
 test_assemble()
+test_assemble_bss()
 test_symbols()

From c184924f9645e2e73b466139b194ee435bfa50ab Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 29 Jul 2021 21:32:19 +0300
Subject: [PATCH 04/29] raise error when attempting to store values in .bss
 section

A simple safety-net matching the behaviour of binutils-esp32ulp
---
 esp32_ulp/assemble.py |  2 ++
 tests/assemble.py     | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 764ae29..912fa7d 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -150,6 +150,8 @@ def append_section(self, value, expected_section=None):
         if expected_section is not None and s is not expected_section:
             raise TypeError('only allowed in %s section' % expected_section)
         if s is BSS:
+            if int.from_bytes(value, 'little') != 0:
+                raise ValueError('attempt to store non-zero value in section .bss')
             # just increase BSS size by length of value
             self.offsets[s] += len(value)
         else:
diff --git a/tests/assemble.py b/tests/assemble.py
index f23a6b6..edc321e 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -71,6 +71,25 @@ def test_assemble_bss():
     assert a.offsets[BSS] == 4  # 1 word * 4B
 
 
+def test_assemble_bss_with_value():
+    lines = """\
+.bss
+    .long 3  #non-zero value not allowed in bss section
+"""
+
+    a = Assembler()
+    try:
+        a.assemble(lines)
+    except ValueError as e:
+        if str(e) != "attempt to store non-zero value in section .bss":
+            raise  # re-raise failures we didn't expect
+        raised = True
+    else:
+        raised = False
+
+    assert raised
+
+
 def test_symbols():
     st = SymbolTable({}, {})
     for entry in [
@@ -128,4 +147,5 @@ def test_symbols():
 test_parse()
 test_assemble()
 test_assemble_bss()
+test_assemble_bss_with_value()
 test_symbols()

From 25d34b0d517dd172e179a1d755cb2a92769e90fa Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 22 Jul 2021 23:13:36 +0300
Subject: [PATCH 05/29] fix reference to non-existing variable

---
 esp32_ulp/assemble.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 912fa7d..3f73a9d 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -32,7 +32,7 @@ def get_from(self):
     def set_sym(self, symbol, stype, section, value):
         entry = (stype, section, value)
         if symbol in self._symbols and entry != self._symbols[symbol]:
-            raise Exception('redefining symbol %s with different value %r -> %r.' % (label, self._symbols[symbol], entry))
+            raise Exception('redefining symbol %s with different value %r -> %r.' % (symbol, self._symbols[symbol], entry))
         self._symbols[symbol] = entry
 
     def has_sym(self, symbol):

From 76a81aca0353561213dacedbba8c769dcd11f093 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Fri, 23 Jul 2021 01:10:03 +0300
Subject: [PATCH 06/29] fix typo in comment of instruction definition

---
 esp32_ulp/opcodes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/esp32_ulp/opcodes.py b/esp32_ulp/opcodes.py
index 4e2ca04..3018b30 100644
--- a/esp32_ulp/opcodes.py
+++ b/esp32_ulp/opcodes.py
@@ -112,7 +112,7 @@ def make_ins(layout):
     unused : 8      # Unused
     low : 5         # Low bit
     high : 5        # High bit
-    opcode : 4      # Opcode (OPCODE_WR_REG)
+    opcode : 4      # Opcode (OPCODE_RD_REG)
 """)
 
 

From 56f4530ce1042a5d48397832e6b475ef2209d589 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 22 Jul 2021 23:07:06 +0300
Subject: [PATCH 07/29] add support for the .global directive. only symbols
 flagged as global will be exported

This change is mostly to support code that uses the .global directive without
having to modify it first (such as commenting out those lines).
---
 esp32_ulp/assemble.py | 18 ++++++++++++++----
 tests/assemble.py     | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 3f73a9d..25c7f23 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -12,9 +12,10 @@
 
 
 class SymbolTable:
-    def __init__(self, symbols, bases):
+    def __init__(self, symbols, bases, globals):
         self._symbols = symbols
         self._bases = bases
+        self._globals = globals
         self._pass = None
 
     def set_pass(self, _pass):
@@ -53,7 +54,9 @@ def dump(self):
             print(symbol, entry)
 
     def export(self):
-        addrs_syms = [(self.resolve_absolute(entry), symbol) for symbol, entry in self._symbols.items()]
+        addrs_syms = [(self.resolve_absolute(entry), symbol)
+                      for symbol, entry in self._symbols.items()
+                      if symbol in self._globals]
         return sorted(addrs_syms)
 
     def to_abs_addr(self, section, offset):
@@ -93,11 +96,15 @@ def resolve_relative(self, symbol):
         from_addr = self.to_abs_addr(self._from_section, self._from_offset)
         return sym_addr - from_addr
 
+    def set_global(self, symbol):
+        self._globals[symbol] = True
+        pass
+
 
 class Assembler:
 
-    def __init__(self, symbols=None, bases=None):
-        self.symbols = SymbolTable(symbols or {}, bases or {})
+    def __init__(self, symbols=None, bases=None, globls=None):
+        self.symbols = SymbolTable(symbols or {}, bases or {}, globls or {})
         opcodes.symbols = self.symbols  # XXX dirty hack
 
     def init(self, a_pass):
@@ -236,6 +243,9 @@ def d_set(self, symbol, expr):
         value = int(expr)  # TODO: support more than just integers
         self.symbols.set_sym(symbol, ABS, None, value)
 
+    def d_global(self, symbol):
+        self.symbols.set_global(symbol)
+
     def append_data(self, wordlen, args):
         data = [int(arg).to_bytes(wordlen, 'little') for arg in args]
         self.append_section(b''.join(data))
diff --git a/tests/assemble.py b/tests/assemble.py
index edc321e..2cde82f 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -22,6 +22,23 @@
 """
 
 
+src_global = """\
+
+  .global counter
+counter:
+  .long 0
+
+internal:
+  .long 0
+
+  .text
+  .global entry
+entry:
+  wait 42
+  halt
+"""
+
+
 def test_parse_line():
     a = Assembler()
     lines = iter(src.splitlines())
@@ -90,8 +107,19 @@ def test_assemble_bss_with_value():
     assert raised
 
 
+def test_assemble_global():
+    a = Assembler()
+    a.assemble(src_global)
+    assert a.symbols.has_sym('counter')
+    assert a.symbols.has_sym('internal')
+    assert a.symbols.has_sym('entry')
+
+    exported_symbols = a.symbols.export()
+    assert exported_symbols == [(0, 'counter'), (2, 'entry')]  # internal not exported
+
+
 def test_symbols():
-    st = SymbolTable({}, {})
+    st = SymbolTable({}, {}, {})
     for entry in [
         ('rel_t4', REL, TEXT, 4),
         ('abs_t4', ABS, TEXT, 4),
@@ -148,4 +176,5 @@ def test_symbols():
 test_assemble()
 test_assemble_bss()
 test_assemble_bss_with_value()
+test_assemble_global()
 test_symbols()

From 9907b107c94f0d831e0483617cd6b7bad8d7d50d Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 22 Jul 2021 23:23:12 +0300
Subject: [PATCH 08/29] let SymbolTable.export() optionally export non-global
 symbols too

This is then the same behaviour as before the .global directive was
supported.

It might be useful for debugging purposes or potentially some backward
compatibility issues (e.g. scripts that depend on the symbol printout
after assembling)
---
 esp32_ulp/assemble.py | 4 ++--
 tests/assemble.py     | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 25c7f23..c847432 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -53,10 +53,10 @@ def dump(self):
         for symbol, entry in self._symbols.items():
             print(symbol, entry)
 
-    def export(self):
+    def export(self, incl_non_globals=False):
         addrs_syms = [(self.resolve_absolute(entry), symbol)
                       for symbol, entry in self._symbols.items()
-                      if symbol in self._globals]
+                      if incl_non_globals or symbol in self._globals]
         return sorted(addrs_syms)
 
     def to_abs_addr(self, section, offset):
diff --git a/tests/assemble.py b/tests/assemble.py
index 2cde82f..3839390 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -117,6 +117,9 @@ def test_assemble_global():
     exported_symbols = a.symbols.export()
     assert exported_symbols == [(0, 'counter'), (2, 'entry')]  # internal not exported
 
+    exported_symbols = a.symbols.export(True)  # include non-global symbols
+    assert exported_symbols == [(0, 'counter'), (1, 'internal'), (2, 'entry')]
+
 
 def test_symbols():
     st = SymbolTable({}, {}, {})

From 27ab85027052efca27e52290598fd053840a5d96 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 22 Jul 2021 23:32:45 +0300
Subject: [PATCH 09/29] support ULP opcodes in upper case

Some open-source code out there uses upper case for ULP opcodes. This change
allows using such code unmodified instead of failing with
"Unknown opcode or directive".
---
 esp32_ulp/assemble.py |  4 ++--
 tests/assemble.py     | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index c847432..ef21079 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -275,12 +275,12 @@ def assembler_pass(self, lines):
                         continue
                 else:
                     # machine instruction
-                    func = getattr(opcodes, 'i_' + opcode, None)
+                    func = getattr(opcodes, 'i_' + opcode.lower(), None)
                     if func is not None:
                         instruction = func(*args)
                         self.append_section(instruction.to_bytes(4, 'little'), TEXT)
                         continue
-                raise Exception('Unknown opcode or directive: %s' % opcode)
+                raise ValueError('Unknown opcode or directive: %s' % opcode)
         self.finalize_sections()
 
     def assemble(self, text):
diff --git a/tests/assemble.py b/tests/assemble.py
index 3839390..cc59377 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -121,6 +121,20 @@ def test_assemble_global():
     assert exported_symbols == [(0, 'counter'), (1, 'internal'), (2, 'entry')]
 
 
+def test_assemble_uppercase_opcode():
+    a = Assembler()
+    try:
+        a.assemble("  WAIT 42")
+    except ValueError as e:
+        if str(e) != "Unknown opcode or directive: WAIT":
+            # re-raise failures we didn't expect
+            raise
+        raised = True
+    else:
+        raised = False
+    assert not raised
+
+
 def test_symbols():
     st = SymbolTable({}, {}, {})
     for entry in [
@@ -180,4 +194,5 @@ def test_symbols():
 test_assemble_bss()
 test_assemble_bss_with_value()
 test_assemble_global()
+test_assemble_uppercase_opcode()
 test_symbols()

From 54b117e79595ecf8fa376ed4a75149ace5f0bb4d Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 22 Jul 2021 23:47:06 +0300
Subject: [PATCH 10/29] add a compatibility test for the recent fixes and
 improvements

Just to double-check and ensure that we're still matching the
binary output of binutils-esp32ulp
---
 tests/compat/fixes.S | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 tests/compat/fixes.S

diff --git a/tests/compat/fixes.S b/tests/compat/fixes.S
new file mode 100644
index 0000000..0c84f1b
--- /dev/null
+++ b/tests/compat/fixes.S
@@ -0,0 +1,20 @@
+# This file tests various fixes to the assembler,
+# to ensure the binary output matches that of binutils.
+# a) support for left-aligned directives (e.g. .set without preceding whitespace)
+# b) a crash-fix related to data items in the .bss section
+# c) support for marking labels as global
+# d) support for upper case ULP opcode names
+#
+.set gpio, 2
+
+.bss
+
+counter:
+.long 0
+
+  .text
+  .global entry
+entry:
+  MOVE R1, gpio
+  WAIT 42
+  halt

From feb42dc9eb315c3d521027141beabf0ab43acaf1 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Fri, 23 Jul 2021 00:08:42 +0300
Subject: [PATCH 11/29] add support for evaluating expressions

This change allows immediate values to be calculated from an expression,
such as 1+1, even including symbols such as "100 << const" (where const
was defined with the .set directive). Expressions are also supported in
the .set directives.

Expressions are evaluated using the built-in eval(). To prevent misuse or
malicious code execution, expressions are validated.

At the point when eval is called, all symbols should have already been resolved
to their values. That means we only need to allow for numeric characters along
with arithmetic and bitwise operators, round brackets and whitespace. The
character 'x' and the characters 'abcdef' are also accepted to allow for hex
numbers such as 0x123abc. These are only allowed however in sequences starting
with 0x. If any other character is encountered the expression is deemed invalid
and an exception is raised.
---
 esp32_ulp/assemble.py  |  2 +-
 esp32_ulp/opcodes.py   | 23 ++++++++++++++--
 esp32_ulp/util.py      | 58 +++++++++++++++++++++++++++++++++++++++
 tests/00_unit_tests.sh |  2 +-
 tests/assemble.py      | 23 ++++++++++++++++
 tests/opcodes.py       | 54 +++++++++++++++++++++++++++++++++++-
 tests/util.py          | 62 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 219 insertions(+), 5 deletions(-)
 create mode 100644 tests/util.py

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index ef21079..2fdd154 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -240,7 +240,7 @@ def d_align(self, align=4, fill=None):
             self.fill(self.section, amount, fill)
 
     def d_set(self, symbol, expr):
-        value = int(expr)  # TODO: support more than just integers
+        value = int(opcodes.eval_arg(expr))  # TODO: support more than just integers
         self.symbols.set_sym(symbol, ABS, None, value)
 
     def d_global(self, symbol):
diff --git a/esp32_ulp/opcodes.py b/esp32_ulp/opcodes.py
index 3018b30..59006f2 100644
--- a/esp32_ulp/opcodes.py
+++ b/esp32_ulp/opcodes.py
@@ -6,6 +6,7 @@
 from uctypes import struct, addressof, LITTLE_ENDIAN, UINT32, BFUINT32, BF_POS, BF_LEN
 
 from .soc import *
+from .util import split_tokens, validate_expression
 
 # XXX dirty hack: use a global for the symbol table
 symbols = None
@@ -267,6 +268,20 @@ def make_ins(layout):
 ARG = namedtuple('ARG', ('type', 'value', 'raw'))
 
 
+def eval_arg(arg):
+    parts = []
+    for token in split_tokens(arg):
+        if symbols.has_sym(token):
+            _, _, sym_value = symbols.get_sym(token)
+            parts.append(str(sym_value))
+        else:
+            parts.append(token)
+    parts = "".join(parts)
+    if not validate_expression(parts):
+        raise ValueError('Unsupported expression: %s' % parts)
+    return eval(parts)
+
+
 def arg_qualify(arg):
     """
     look at arg and qualify its type:
@@ -289,8 +304,12 @@ def arg_qualify(arg):
         return ARG(IMM, int(arg), arg)
     except ValueError:
         pass
-    entry = symbols.get_sym(arg)
-    return ARG(SYM, entry, arg)
+    try:
+        entry = symbols.get_sym(arg)
+        return ARG(SYM, entry, arg)
+    except KeyError:
+        pass
+    return ARG(IMM, int(eval_arg(arg)), arg)
 
 
 def get_reg(arg):
diff --git a/esp32_ulp/util.py b/esp32_ulp/util.py
index c184414..8d2832f 100644
--- a/esp32_ulp/util.py
+++ b/esp32_ulp/util.py
@@ -2,6 +2,8 @@
 
 import gc
 
+NORMAL, WHITESPACE = 0, 1
+
 
 def garbage_collect(msg, verbose=DEBUG):
     free_before = gc.mem_free()
@@ -9,3 +11,59 @@ def garbage_collect(msg, verbose=DEBUG):
     free_after = gc.mem_free()
     if verbose:
         print("%s: %d --gc--> %d bytes free" % (msg, free_before, free_after))
+
+
+def split_tokens(line):
+    buf = ""
+    tokens = []
+    state = NORMAL
+    for c in line:
+        if ('a' <= c <= 'z') or ('A' <= c <= 'Z') or ('0' <= c <= '9') or c == '_':
+            if state != NORMAL:
+                if len(buf) > 0:
+                    tokens.append(buf)
+                    buf = ""
+                state = NORMAL
+            buf += c
+        elif c == ' ' or c == '\t':
+            if state != WHITESPACE:
+                if len(buf) > 0:
+                    tokens.append(buf)
+                    buf = ""
+                state = WHITESPACE
+            buf += c
+        else:
+            if len(buf) > 0:
+                tokens.append(buf)
+                buf = ""
+            tokens.append(c)
+
+    if len(buf) > 0:
+        tokens.append(buf)
+
+    return tokens
+
+
+def validate_expression(param):
+    for token in split_tokens(param):
+        state = 0
+        for c in token:
+            if c not in ' \t+-*/%()<>&|~x0123456789abcdef':
+                return False
+
+            # the following allows hex digits a-f after 0x but not otherwise
+            if state == 0:
+                if c in 'abcdef':
+                    return False
+                if c == '0':
+                    state = 1
+                continue
+
+            if state == 1:
+                state = 2 if c == 'x' else 0
+                continue
+
+            if state == 2:
+                if c not in '0123456789abcdef':
+                    state = 0
+    return True
diff --git a/tests/00_unit_tests.sh b/tests/00_unit_tests.sh
index 07d221f..c7e2f89 100755
--- a/tests/00_unit_tests.sh
+++ b/tests/00_unit_tests.sh
@@ -4,7 +4,7 @@
 
 set -e
 
-for file in opcodes assemble link ; do
+for file in opcodes assemble link util; do
     echo testing $file...
     micropython $file.py
 done
diff --git a/tests/assemble.py b/tests/assemble.py
index cc59377..ac2d423 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -135,6 +135,28 @@ def test_assemble_uppercase_opcode():
     assert not raised
 
 
+def test_assemble_evalulate_expressions():
+    src_w_expr = """\
+    .set shft, 2
+    .set loops, (1 << shft)
+
+entry:
+    move r0, 1+1
+    move r1, loops
+    move r2, (shft + 10) * 2
+    move r3, entry << 2
+"""
+    a = Assembler()
+    a.assemble(src_w_expr)
+
+    assert a.symbols.has_sym('shft')
+    assert a.symbols.has_sym('loops')
+    assert a.symbols.has_sym('entry')
+    assert a.symbols.get_sym('shft') == (ABS, None, 2)
+    assert a.symbols.get_sym('loops') == (ABS, None, 4)
+    assert a.symbols.get_sym('entry') == (REL, TEXT, 0)
+
+
 def test_symbols():
     st = SymbolTable({}, {}, {})
     for entry in [
@@ -195,4 +217,5 @@ def test_symbols():
 test_assemble_bss_with_value()
 test_assemble_global()
 test_assemble_uppercase_opcode()
+test_assemble_evalulate_expressions()
 test_symbols()
diff --git a/tests/opcodes.py b/tests/opcodes.py
index 54bb673..f14829a 100644
--- a/tests/opcodes.py
+++ b/tests/opcodes.py
@@ -1,6 +1,8 @@
 from uctypes import UINT32, BFUINT32, BF_POS, BF_LEN
 from esp32_ulp.opcodes import make_ins, make_ins_struct_def
-from esp32_ulp.opcodes import get_reg, get_imm, get_cond, arg_qualify, ARG, REG, IMM, COND
+from esp32_ulp.opcodes import get_reg, get_imm, get_cond, arg_qualify, eval_arg, ARG, REG, IMM, SYM, COND
+from esp32_ulp.assemble import SymbolTable, ABS, REL, TEXT
+import esp32_ulp.opcodes as opcodes
 
 OPCODE_DELAY = 4
 LAYOUT_DELAY = """
@@ -43,6 +45,19 @@ def test_arg_qualify():
     assert arg_qualify('Eq') == ARG(COND, 'eq', 'Eq')
     assert arg_qualify('EQ') == ARG(COND, 'eq', 'EQ')
 
+    # for the next tests, ensure the opcodes module has a SymbolTable
+    opcodes.symbols = SymbolTable({}, {}, {})
+    opcodes.symbols.set_sym('const', ABS, None, 42)  # constant as defined by .set
+    opcodes.symbols.set_sym('entry', REL, TEXT, 4)  # label pointing to code
+
+    assert arg_qualify('1+1') == ARG(IMM, 2, '1+1')
+    assert arg_qualify('const >> 1') == ARG(IMM, 21, 'const >> 1')
+    assert arg_qualify('entry') == ARG(SYM, (REL, TEXT, 4), 'entry')  # symbols should not (yet) be evaluated
+    assert arg_qualify('entry + const') == ARG(IMM, 46, 'entry + const')
+
+    # clean up
+    opcodes.symbols = None
+
 
 def test_get_reg():
     assert get_reg('r0') == 0
@@ -57,9 +72,46 @@ def test_get_cond():
     assert get_cond('Eq') == 'eq'
 
 
+def test_eval_arg():
+    opcodes.symbols = SymbolTable({}, {}, {})
+    opcodes.symbols.set_sym('const', ABS, None, 42)  # constant
+    opcodes.symbols.set_sym('raise', ABS, None, 99)  # constant using a python keyword as name (is allowed)
+
+    assert eval_arg('1+1') == 2
+    assert eval_arg('1+const') == 43
+    assert eval_arg('raise*2/3') == 66
+    assert eval_arg('raise-const') == 57
+    assert eval_arg('(raise-const)*2') == 114
+    assert eval_arg('const    % 5') == 2
+    assert eval_arg('const + 0x19af') == 0x19af + 42
+    assert eval_arg('const & ~2') == 40
+    assert eval_arg('const << 3') == 336
+    assert eval_arg('const >> 1') == 21
+    assert eval_arg('(const|4)&0xf') == 0xe
+
+    assert_raises(ValueError, eval_arg, 'evil()')
+    assert_raises(ValueError, eval_arg, 'def cafe()')
+    assert_raises(ValueError, eval_arg, '1 ^ 2')
+    assert_raises(ValueError, eval_arg, '!100')
+
+    # clean up
+    opcodes.symbols = None
+
+
+def assert_raises(exception, func, *args):
+    try:
+        func(*args)
+    except exception:
+        raised = True
+    else:
+        raised = False
+    assert raised
+
+
 test_make_ins_struct_def()
 test_make_ins()
 test_arg_qualify()
 test_get_reg()
 test_get_imm()
 test_get_cond()
+test_eval_arg()
\ No newline at end of file
diff --git a/tests/util.py b/tests/util.py
new file mode 100644
index 0000000..18ab54e
--- /dev/null
+++ b/tests/util.py
@@ -0,0 +1,62 @@
+from esp32_ulp.util import split_tokens, validate_expression
+
+tests = []
+
+
+def test(param):
+    """
+    the @test decorator
+    """
+    tests.append(param)
+
+
+@test
+def test_split_tokens():
+    assert split_tokens("") == []
+    assert split_tokens("t") == ['t']
+    assert split_tokens("test") == ['test']
+    assert split_tokens("t t") == ['t', ' ', 't']
+    assert split_tokens("t,t") == ['t', ',', 't']
+    assert split_tokens("test(arg)") == ['test', '(', 'arg', ')']
+    assert split_tokens("test(arg,arg2)") == ['test', '(', 'arg', ',', 'arg2', ')']
+    assert split_tokens("test(arg,arg2)") == ['test', '(', 'arg', ',', 'arg2', ')']
+    assert split_tokens("  test(  arg,  arg2)") == ['  ', 'test', '(', '  ', 'arg', ',', '  ', 'arg2', ')']
+    assert split_tokens("  test(  arg )  ") == ['  ', 'test', '(', '  ', 'arg', ' ', ')', '  ']
+    assert split_tokens("\t  test  \t  ") == ['\t  ', 'test', "  \t  "]
+    assert split_tokens("test\nrow2") == ['test', "\n", "row2"]
+
+    # split_token does not support comments. should generally only be used after comments are already stripped
+    assert split_tokens("test(arg /*comment*/)") == ['test', '(', 'arg', ' ', '/', '*', 'comment', '*', '/', ')']
+    assert split_tokens("#test") == ['#', 'test']
+
+
+@test
+def test_validate_expression():
+    assert validate_expression('') is True
+    assert validate_expression('1') is True
+    assert validate_expression('1+1') is True
+    assert validate_expression('(1+1)') is True
+    assert validate_expression('(1+1)*2') is True
+    assert validate_expression('(1 + 1)') is True
+    assert validate_expression('10 % 2') is True
+    assert validate_expression('0x100 << 2') is True
+    assert validate_expression('0x100 & ~2') is True
+    assert validate_expression('0xabcdef') is True
+    assert validate_expression('0x123def') is True
+    assert validate_expression('2*3+4/5&6|7') is True
+    assert validate_expression('(((((1+1) * 2') is True  # valid characters, even if expression is not valid
+
+    assert validate_expression(':') is False
+    assert validate_expression('_') is False
+    assert validate_expression('=') is False
+    assert validate_expression('.') is False
+    assert validate_expression('!') is False
+    assert validate_expression('123 ^ 4') is False  # operator not supported for now
+    assert validate_expression('evil()') is False
+    assert validate_expression('def cafe()') is False  # valid hex digits, but potentially dangerous code
+
+
+if __name__ == '__main__':
+    # run all methods marked with @test
+    for t in tests:
+        t()

From 87507c9f48c1e9450adf52f2a37b1ccaa0653038 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Fri, 23 Jul 2021 16:35:52 +0300
Subject: [PATCH 12/29] add a compatibility test for evaluating expressions

This is a common example, where the ADC is read multiple times (oversampled)
and the oversampling factor is calculated by a shift-left, which makes
calculating the average of samples as easy as shifting their sum right by
the same number of bits. Using .set directives with expressions makes
the oversampling factor easily configurable.
---
 tests/compat/expr.S | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 tests/compat/expr.S

diff --git a/tests/compat/expr.S b/tests/compat/expr.S
new file mode 100644
index 0000000..48f7304
--- /dev/null
+++ b/tests/compat/expr.S
@@ -0,0 +1,44 @@
+  .set adc_channel, 6
+
+  .set adc_oversampling_factor_log, 2
+  .set adc_oversampling_factor, (1 << adc_oversampling_factor_log)
+
+.data
+
+result:
+  .long 0
+
+  .text
+  .global entry
+entry:
+  move r0, 0
+  stage_rst
+
+measure:
+  adc r1, 0, adc_channel + 1
+  add r0, r0, r1
+
+  stage_inc 1
+  jumps measure, adc_oversampling_factor, lt
+
+  rsh r0, r0, adc_oversampling_factor_log
+
+  move r3, result
+  st r0, r3, 0
+
+  #test that expressions evaluate correctly for all supported operators
+  move r3, 1+2
+  move r3, 3-5
+  move r3, -5
+  move r3, 2*3
+  move r3, 4/2
+  move r3, 4 % 3
+  move r3, 0xff << 2
+  move r3, 0xff >> 1
+  move r3, (0xabcdef | 0xff) & 0xff
+  move r3, 0x1234 & ~2
+  move r3, 42|4&0xf  # 46 (4&0xf is evaluated first)
+  move r3, (42|4)&0xf  # 14 (42|4 is evaluated first)
+
+exit:
+  halt

From 99352a3b1b2a79f04266e64443680f9d282cf3c4 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 29 Jul 2021 22:32:17 +0300
Subject: [PATCH 13/29] docs: add that expressions are now supported

---
 README.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.rst b/README.rst
index 3952878..56395d1 100644
--- a/README.rst
+++ b/README.rst
@@ -17,6 +17,10 @@ Status
 
 The most commonly used simple stuff should work.
 
+Expressions in assembly source code are supported and get evaluated during
+assembling. Only expressions evaluating to a single integer are supported.
+Constants defined with ``.set`` are supported in expressions.
+
 We have some unit tests and also compatibility tests that compare the output
 whether it is identical with binutils-esp32ulp output.
 

From d76fd2696eb72987834ffd549924604cd7f3a295 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Fri, 23 Jul 2021 16:59:42 +0300
Subject: [PATCH 14/29] add preprocessor that can replace simple #define values
 in code

The preprocessor strips all comments and lines containing a define statement
but keeps the empty lines in the output (to preserve line numbering). The
output is then passed directly into the assembler.

The preprocessor does not support "function style" #define macros (e.g.
ADD(a,b) a+b), but this is not needed for expanding the constants used by
WRITE_RTC_REG(), et al.
---
 esp32_ulp/__main__.py            |   2 +
 esp32_ulp/preprocess.py          |  57 ++++++++++
 tests/00_unit_tests.sh           |   2 +-
 tests/01_compat_tests.sh         |   4 +-
 tests/compat/preprocess_simple.S |   7 ++
 tests/preprocess.py              | 175 +++++++++++++++++++++++++++++++
 6 files changed, 245 insertions(+), 2 deletions(-)
 create mode 100644 esp32_ulp/preprocess.py
 create mode 100644 tests/compat/preprocess_simple.S
 create mode 100644 tests/preprocess.py

diff --git a/esp32_ulp/__main__.py b/esp32_ulp/__main__.py
index 584a3dd..b24578a 100644
--- a/esp32_ulp/__main__.py
+++ b/esp32_ulp/__main__.py
@@ -2,6 +2,7 @@
 
 from .util import garbage_collect
 
+from .preprocess import preprocess
 from .assemble import Assembler
 from .link import make_binary
 garbage_collect('after import')
@@ -23,6 +24,7 @@ def main(fn):
     with open(fn) as f:
         src = f.read()
 
+    src = preprocess(src)
     binary = src_to_binary(src)
 
     if fn.endswith('.s') or fn.endswith('.S'):
diff --git a/esp32_ulp/preprocess.py b/esp32_ulp/preprocess.py
new file mode 100644
index 0000000..12c13b1
--- /dev/null
+++ b/esp32_ulp/preprocess.py
@@ -0,0 +1,57 @@
+from . import nocomment
+from .util import split_tokens
+
+
+class Preprocessor:
+    def __init__(self):
+        self._defines = {}
+
+    def parse_defines(self, content):
+        result = {}
+        for line in content.splitlines():
+            line = line.strip()
+            if not line.startswith("#define"):
+                # skip lines not containing #define
+                continue
+            line = line[8:].strip()  # remove #define
+            parts = line.split(None, 1)
+            if len(parts) != 2:
+                # skip defines without value
+                continue
+            identifier, value = parts
+            tmp = identifier.split('(', 1)
+            if len(tmp) == 2:
+                # skip parameterised defines (macros)
+                continue
+            value = "".join(nocomment.remove_comments(value)).strip()
+            result[identifier] = value
+        self._defines = result
+        return result
+
+    def expand_defines(self, line):
+        found = True
+        while found:  # do as many passed as needed, until nothing was replaced anymore
+            found = False
+            tokens = split_tokens(line)
+            line = ""
+            for t in tokens:
+                lu = self._defines.get(t, t)
+                if lu != t:
+                    found = True
+                line += lu
+
+        return line
+
+    def preprocess(self, content):
+        self.parse_defines(content)
+        lines = nocomment.remove_comments(content)
+        result = []
+        for line in lines:
+            line = self.expand_defines(line)
+            result.append(line)
+        result = "\n".join(result)
+        return result
+
+
+def preprocess(content):
+    return Preprocessor().preprocess(content)
diff --git a/tests/00_unit_tests.sh b/tests/00_unit_tests.sh
index c7e2f89..efd5b64 100755
--- a/tests/00_unit_tests.sh
+++ b/tests/00_unit_tests.sh
@@ -4,7 +4,7 @@
 
 set -e
 
-for file in opcodes assemble link util; do
+for file in opcodes assemble link util preprocess; do
     echo testing $file...
     micropython $file.py
 done
diff --git a/tests/01_compat_tests.sh b/tests/01_compat_tests.sh
index c565aa1..68f8bdc 100755
--- a/tests/01_compat_tests.sh
+++ b/tests/01_compat_tests.sh
@@ -13,12 +13,14 @@ for src_file in $(ls -1 compat/*.S); do
     log_file="${src_name}.log"
     micropython -m esp32_ulp $src_file 1>$log_file   # generates $ulp_file
 
+    pre_file="${src_name}.pre"
     obj_file="${src_name}.o"
     elf_file="${src_name}.elf"
     bin_file="${src_name}.bin"
 
     echo -e "\tBuilding using binutils"
-    esp32ulp-elf-as -o $obj_file $src_file
+    gcc -E -o ${pre_file} $src_file
+    esp32ulp-elf-as -o $obj_file ${pre_file}
     esp32ulp-elf-ld -T esp32.ulp.ld -o $elf_file $obj_file
     esp32ulp-elf-objcopy -O binary $elf_file $bin_file
 
diff --git a/tests/compat/preprocess_simple.S b/tests/compat/preprocess_simple.S
new file mode 100644
index 0000000..b6a61e8
--- /dev/null
+++ b/tests/compat/preprocess_simple.S
@@ -0,0 +1,7 @@
+#define GPIO 2
+#define BASE 0x100
+#define ADDR (BASE + GPIO)
+
+entry:
+  move r0, GPIO
+  move r1, ADDR
diff --git a/tests/preprocess.py b/tests/preprocess.py
new file mode 100644
index 0000000..bfca066
--- /dev/null
+++ b/tests/preprocess.py
@@ -0,0 +1,175 @@
+from esp32_ulp.preprocess import Preprocessor
+
+tests = []
+
+
+def test(param):
+    tests.append(param)
+
+
+@test
+def test_replace_defines_should_return_empty_line_given_empty_string():
+    p = Preprocessor()
+
+    assert p.preprocess("") == ""
+
+
+@test
+def replace_defines_should_return_remove_comments():
+    p = Preprocessor()
+
+    line = "// some comment"
+    expected = ""
+    assert p.preprocess(line) == expected
+
+
+@test
+def test_parse_defines():
+    p = Preprocessor()
+
+    assert p.parse_defines("") == {}
+    assert p.parse_defines("// comment") == {}
+    assert p.parse_defines("  // comment") == {}
+    assert p.parse_defines("  /* comment */") == {}
+    assert p.parse_defines("  /* comment */ #define A 42") == {}  # #define must be the first thing on a line
+    assert p.parse_defines("#define a 1") == {"a": "1"}
+    assert p.parse_defines(" #define a 1") == {"a": "1"}
+    assert p.parse_defines("#define a 1 2") == {"a": "1 2"}
+    assert p.parse_defines("#define f(a,b) 1") == {}  # macros not supported
+    assert p.parse_defines("#define f(a, b) 1") == {}  # macros not supported
+    assert p.parse_defines("#define f (a,b) 1") == {"f": "(a,b) 1"}  # f is not a macro
+    assert p.parse_defines("#define f (a, b) 1") == {"f": "(a, b) 1"}  # f is not a macro
+    assert p.parse_defines("#define RTC_ADDR       0x12345    // start of range") == {"RTC_ADDR": "0x12345"}
+
+
+@test
+def test_parse_defines_handles_multiple_input_lines():
+    p = Preprocessor()
+
+    multi_line_1 = """\
+#define ID_WITH_UNDERSCORE something
+#define ID2 somethingelse
+"""
+    assert p.parse_defines(multi_line_1) == {"ID_WITH_UNDERSCORE": "something", "ID2": "somethingelse"}
+
+
+@test
+def test_parse_defines_does_not_understand_comments_by_current_design():
+    # comments are not understood. lines are expected to already have comments removed!
+    p = Preprocessor()
+
+    multi_line_2 = """\
+#define ID_WITH_UNDERSCORE something
+/*
+#define ID2 somethingelse
+*/
+"""
+    assert "ID2" in p.parse_defines(multi_line_2)
+
+
+@test
+def test_parse_defines_does_not_understand_line_continuations_with_backslash_by_current_design():
+    p = Preprocessor()
+
+    multi_line_3 = r"""
+    #define ID_WITH_UNDERSCORE something \
+           line2
+    """
+
+    assert p.parse_defines(multi_line_3) == {"ID_WITH_UNDERSCORE": "something \\"}
+
+
+@test
+def preprocess_should_remove_comments_and_defines_but_keep_the_lines_as_empty_lines():
+    p = Preprocessor()
+
+    lines = """\
+    // copyright
+    #define A 1
+
+    move r1, r2"""
+
+    assert p.preprocess(lines) == "\n\n\n\tmove r1, r2"
+
+
+@test
+def preprocess_should_replace_words_defined():
+    p = Preprocessor()
+
+    lines = """\
+    #define DR_REG_RTCIO_BASE 0x3ff48400
+
+    move r1, DR_REG_RTCIO_BASE"""
+
+    assert "move r1, 0x3ff48400" in p.preprocess(lines)
+
+
+@test
+def preprocess_should_replace_words_defined_multiple_times():
+    p = Preprocessor()
+
+    lines = """\
+    #define DR_REG_RTCIO_BASE 0x3ff48400
+
+    move r1, DR_REG_RTCIO_BASE  #once
+    move r2, DR_REG_RTCIO_BASE  #second time"""
+
+    assert "move r1, 0x3ff48400" in p.preprocess(lines)
+    assert "move r2, 0x3ff48400" in p.preprocess(lines)
+
+
+@test
+def preprocess_should_replace_all_defined_words():
+    p = Preprocessor()
+
+    lines = """\
+    #define DR_REG_RTCIO_BASE 0x3ff48400
+    #define SOME_OFFSET 4
+
+    move r1, DR_REG_RTCIO_BASE
+    add r2, r1, SOME_OFFSET"""
+
+    assert "move r1, 0x3ff48400" in p.preprocess(lines)
+    assert "add r2, r1, 4" in p.preprocess(lines)
+
+
+@test
+def preprocess_should_not_replace_substrings_within_identifiers():
+    p = Preprocessor()
+
+    # ie. if AAA is defined don't touch PREFIX_AAA_SUFFIX
+    lines = """\
+    #define RTCIO 4
+    move r1, DR_REG_RTCIO_BASE"""
+
+    assert "DR_REG_4_BASE" not in p.preprocess(lines)
+
+    # ie. if A and AA are defined, don't replace AA as two A's but with AA
+    lines = """\
+    #define A 4
+    #define AA 8
+    move r1, A
+    move r2, AA"""
+
+    assert "move r1, 4" in p.preprocess(lines)
+    assert "move r2, 8" in p.preprocess(lines)
+
+
+@test
+def preprocess_should_replace_defines_used_in_defines():
+    p = Preprocessor()
+
+    lines = """\
+    #define BITS (BASE << 4)
+    #define BASE 0x1234
+
+    move r1, BITS
+    move r2, BASE"""
+
+    assert "move r1, (0x1234 << 4)" in p.preprocess(lines)
+
+
+if __name__ == '__main__':
+    # run all methods marked with @test
+    for t in tests:
+        t()

From 4dded94fa2ca0fc663cbc04561c2c92247c42834 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Sat, 7 Aug 2021 17:35:07 +0300
Subject: [PATCH 15/29] allow assembler to skip comment removal to avoid
 removing comments twice

Since the preprocessor was introduced, which already removes comments, the
assembler does not need to remove comments anymore in the usual case. The
assembler still retains the ability to remove comments (enabled by default)
in case it is used without the preprocessor. The `remove_comments` argument
to the `assemble()` method can be used to control whether comments will be
removed during assembly or not.
---
 esp32_ulp/__main__.py |  2 +-
 esp32_ulp/assemble.py |  6 +++---
 tests/assemble.py     | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/esp32_ulp/__main__.py b/esp32_ulp/__main__.py
index b24578a..d9555fd 100644
--- a/esp32_ulp/__main__.py
+++ b/esp32_ulp/__main__.py
@@ -10,7 +10,7 @@
 
 def src_to_binary(src):
     assembler = Assembler()
-    assembler.assemble(src)
+    assembler.assemble(src, remove_comments=False)  # comments already removed by preprocessor
     garbage_collect('before symbols export')
     addrs_syms = assembler.symbols.export()
     for addr, sym in addrs_syms:
diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 2fdd154..297ebb8 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -3,7 +3,7 @@
 """
 
 from . import opcodes
-from .nocomment import remove_comments
+from .nocomment import remove_comments as do_remove_comments
 from .util import garbage_collect
 
 TEXT, DATA, BSS = 'text', 'data', 'bss'
@@ -283,8 +283,8 @@ def assembler_pass(self, lines):
                 raise ValueError('Unknown opcode or directive: %s' % opcode)
         self.finalize_sections()
 
-    def assemble(self, text):
-        lines = remove_comments(text)
+    def assemble(self, text, remove_comments=True):
+        lines = do_remove_comments(text) if remove_comments else text.splitlines()
         self.init(1)  # pass 1 is only to get the symbol table right
         self.assembler_pass(lines)
         self.symbols.set_bases(self.compute_bases())
diff --git a/tests/assemble.py b/tests/assemble.py
index ac2d423..496d4a9 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -157,6 +157,24 @@ def test_assemble_evalulate_expressions():
     assert a.symbols.get_sym('entry') == (REL, TEXT, 0)
 
 
+def test_assemble_optional_comment_removal():
+    line = " move r1, 123  # comment"
+
+    a = Assembler()
+
+    # first assemble as normal (comments will be removed by default)
+    a.assemble(line)
+
+    # now assemble with comment removal disabled
+    try:
+        a.assemble(line, remove_comments=False)
+    except ValueError as e:
+        raised = True
+    else:
+        raised = False
+    assert raised
+
+
 def test_symbols():
     st = SymbolTable({}, {}, {})
     for entry in [
@@ -218,4 +236,5 @@ def test_symbols():
 test_assemble_global()
 test_assemble_uppercase_opcode()
 test_assemble_evalulate_expressions()
+test_assemble_optional_comment_removal()
 test_symbols()

From 219f939a4242040a7209e3cb019f8076af15f3a8 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Sun, 25 Jul 2021 18:15:57 +0300
Subject: [PATCH 16/29] fix evaluation of expressions during first assembler
 pass

During the first assembler pass, the SymbolTable does not yet have all
symbols. During a symbol lookup the SymbolTable has so far returned a
fake symbol for non-existing symbols, to make the assembler happy
(values are not really being used during the first pass, so it's ok).

However now that expressions are supported, when the symbol lookup
encountered expressions during pass 1, it assumed those expressions were
"not-yet-existing-symbols", which is of course incorrect as they will
eventually be evaluated to integer values. Some opcodes were unhappy with
receiving an expression during pass 1 (e.g. the reg_wr opcode, which
expects a sane address as a first argument).

This commit simply skips creating instructions during the first pass,
because all instructions are 32-bit (4 bytes) long anyway, so the content
doesn't matter during that first assembler pass, which only measures
section sizes.
---
 esp32_ulp/assemble.py |  2 +-
 tests/assemble.py     | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 297ebb8..12fae70 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -277,7 +277,7 @@ def assembler_pass(self, lines):
                     # machine instruction
                     func = getattr(opcodes, 'i_' + opcode.lower(), None)
                     if func is not None:
-                        instruction = func(*args)
+                        instruction = 0 if self.a_pass == 1 else func(*args)
                         self.append_section(instruction.to_bytes(4, 'little'), TEXT)
                         continue
                 raise ValueError('Unknown opcode or directive: %s' % opcode)
diff --git a/tests/assemble.py b/tests/assemble.py
index 496d4a9..f1a5b45 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -175,6 +175,19 @@ def test_assemble_optional_comment_removal():
     assert raised
 
 
+def test_assemble_test_regressions_from_evaluation():
+    line = " reg_wr (0x3ff48400 + 0x10), 1, 1, 1"
+
+    a = Assembler()
+    raised = False
+    try:
+        a.assemble(line)
+    except ValueError as e:
+        if str(e) == 'invalid register base':  # ensure we trapped the expected Exception
+            raised = True
+    assert not raised
+
+
 def test_symbols():
     st = SymbolTable({}, {}, {})
     for entry in [
@@ -237,4 +250,5 @@ def test_symbols():
 test_assemble_uppercase_opcode()
 test_assemble_evalulate_expressions()
 test_assemble_optional_comment_removal()
+test_assemble_test_regressions_from_evaluation()
 test_symbols()

From 5c3eeb85529dd4d5dd271f6b8afeda924264cd64 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Mon, 26 Jul 2021 06:46:47 +0300
Subject: [PATCH 17/29] remove no-longer-needed pass dependent code from
 SymbolTable

SymbolTable used the pass number to handle some special cases in pass 1
of assembling, by returning dummy values. Since the first pass no longer
creates actual instructions, the SymbolTable no longer needs to do this
and no longer needs to be aware of the pass the assembler is in.
---
 esp32_ulp/assemble.py | 21 ++-------------------
 tests/assemble.py     | 27 +++++++++++++++------------
 2 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 12fae70..9180d8a 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -16,10 +16,6 @@ def __init__(self, symbols, bases, globals):
         self._symbols = symbols
         self._bases = bases
         self._globals = globals
-        self._pass = None
-
-    def set_pass(self, _pass):
-        self._pass = _pass
 
     def set_bases(self, bases):
         self._bases = bases
@@ -40,13 +36,7 @@ def has_sym(self, symbol):
         return symbol in self._symbols
         
     def get_sym(self, symbol):
-        try:
-            entry = self._symbols[symbol]
-        except KeyError:
-            if self._pass == 1:
-                entry = (REL, TEXT, 0)  # for a dummy, this is good enough
-            else:
-                raise
+        entry = self._symbols[symbol]
         return entry
 
     def dump(self):
@@ -60,13 +50,7 @@ def export(self, incl_non_globals=False):
         return sorted(addrs_syms)
 
     def to_abs_addr(self, section, offset):
-        try:
-            base = self._bases[section]
-        except KeyError:
-            if self._pass == 1:
-                base = 0  # for a dummy this is good enough
-            else:
-                raise
+        base = self._bases[section]
         return base + offset
 
     def resolve_absolute(self, symbol):
@@ -109,7 +93,6 @@ def __init__(self, symbols=None, bases=None, globls=None):
 
     def init(self, a_pass):
         self.a_pass = a_pass
-        self.symbols.set_pass(a_pass)
         self.sections = dict(text=[], data=[])
         self.offsets = dict(text=0, data=0, bss=0)
         self.section = TEXT
diff --git a/tests/assemble.py b/tests/assemble.py
index f1a5b45..e607ba2 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -199,25 +199,28 @@ def test_symbols():
     ]:
         st.set_sym(*entry)
     # PASS 1 ========================================================
-    st.set_pass(1)
     assert st.has_sym('abs_t4')
     assert st.get_sym('abs_t4') == (ABS, TEXT, 4)
     assert not st.has_sym('notexist')
-    assert st.get_sym('notexist') == (REL, TEXT, 0)  # pass1 -> dummy
+    try:
+        st.get_sym('notexist')  # pass1 -> raises
+    except KeyError:
+        raised = True
+    else:
+        raised = False
+    assert raised
     assert st.resolve_absolute('abs_t4') == 4
-    assert st.resolve_absolute('abs_d4') == 4
-    assert st.resolve_absolute('rel_t4') == 4
-    assert st.resolve_absolute('rel_d4') == 4
-    assert st.resolve_absolute('const') == 123
-    st.set_from(TEXT, 8)
-    assert st.resolve_relative('abs_t4') == -4
-    assert st.resolve_relative('abs_d4') == -4
-    assert st.resolve_relative('rel_t4') == -4
-    assert st.resolve_relative('rel_d4') == -4
+    try:
+        # relative symbols cannot be resolved, because in pass 1 section bases are not yet defined
+        st.resolve_absolute('rel_t4')
+    except KeyError:
+        raised = True
+    else:
+        raised = False
+    assert raised
     assert st.resolve_absolute('const') == 123
     # PASS 2 ========================================================
     st.set_bases({TEXT: 100, DATA: 200})
-    st.set_pass(2)
     assert st.has_sym('abs_t4')
     assert st.get_sym('abs_t4') == (ABS, TEXT, 4)
     assert not st.has_sym('notexist')

From 3e8c0d515392d53f7ff6a0d328d9cf9cade4192d Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Mon, 26 Jul 2021 06:53:02 +0300
Subject: [PATCH 18/29] add support for macros such as WRITE_RTC_REG

This is a simplified implementation rather than "proper" support for macros,
because we don't need more. The macros WRITE_RTC_REG, READ_RTC_REG, WRITE_RTC_FIELD
and READ_RTC_FIELD are simply expanded in a predefined way. If they are also defined
as macros in the source code, those macros in the source will be ignored.
---
 esp32_ulp/preprocess.py | 50 +++++++++++++++++++++++++++++++++++++++++
 tests/preprocess.py     | 12 ++++++++++
 2 files changed, 62 insertions(+)

diff --git a/esp32_ulp/preprocess.py b/esp32_ulp/preprocess.py
index 12c13b1..1eae375 100644
--- a/esp32_ulp/preprocess.py
+++ b/esp32_ulp/preprocess.py
@@ -2,6 +2,34 @@
 from .util import split_tokens
 
 
+class RTC_Macros:
+    @staticmethod
+    def READ_RTC_REG(rtc_reg, low_bit, bit_width):
+        return '\treg_rd ' + ', '.join((
+            rtc_reg,
+            '%s + %s - 1' % (low_bit, bit_width),
+            low_bit
+        ))
+
+    @staticmethod
+    def WRITE_RTC_REG(rtc_reg, low_bit, bit_width, value):
+        args = (
+            rtc_reg,
+            '%s + %s - 1' % (low_bit, bit_width),
+            low_bit,
+            value
+        )
+        return '\treg_wr ' + ', '.join(args)
+
+    @staticmethod
+    def READ_RTC_FIELD(rtc_reg, low_bit):
+        return RTC_Macros.READ_RTC_REG(rtc_reg, low_bit, 1)
+
+    @staticmethod
+    def WRITE_RTC_FIELD(rtc_reg, low_bit, value):
+        return RTC_Macros.WRITE_RTC_REG(rtc_reg, low_bit, 1, value + ' & 1')
+
+
 class Preprocessor:
     def __init__(self):
         self._defines = {}
@@ -42,12 +70,34 @@ def expand_defines(self, line):
 
         return line
 
+    def expand_rtc_macros(self, line):
+        clean_line = line.strip()
+        if not clean_line:
+            return line
+
+        macro = clean_line.split('(', 1)
+        if len(macro) != 2:
+            return line
+
+        macro_name, macro_args = macro
+
+        macro_fn = getattr(RTC_Macros, macro_name, None)
+        if macro_fn is None:
+            return line
+
+        macro_args, _ = macro_args.rsplit(')', 1)  # trim away right bracket. safe as comments already stripped
+        macro_args = macro_args.split(',')  # not safe when args contain ',' but we should not have those
+        macro_args = [x.strip() for x in macro_args]
+
+        return macro_fn(*macro_args)
+
     def preprocess(self, content):
         self.parse_defines(content)
         lines = nocomment.remove_comments(content)
         result = []
         for line in lines:
             line = self.expand_defines(line)
+            line = self.expand_rtc_macros(line)
             result.append(line)
         result = "\n".join(result)
         return result
diff --git a/tests/preprocess.py b/tests/preprocess.py
index bfca066..a31fe1b 100644
--- a/tests/preprocess.py
+++ b/tests/preprocess.py
@@ -169,6 +169,18 @@ def preprocess_should_replace_defines_used_in_defines():
     assert "move r1, (0x1234 << 4)" in p.preprocess(lines)
 
 
+@test
+def test_expand_rtc_macros():
+    p = Preprocessor()
+
+    assert p.expand_rtc_macros("") == ""
+    assert p.expand_rtc_macros("abc") == "abc"
+    assert p.expand_rtc_macros("WRITE_RTC_REG(1, 2, 3, 4)") == "\treg_wr 1, 2 + 3 - 1, 2, 4"
+    assert p.expand_rtc_macros("READ_RTC_REG(1, 2, 3)") == "\treg_rd 1, 2 + 3 - 1, 2"
+    assert p.expand_rtc_macros("WRITE_RTC_FIELD(1, 2, 3)") == "\treg_wr 1, 2 + 1 - 1, 2, 3 & 1"
+    assert p.expand_rtc_macros("READ_RTC_FIELD(1, 2)") == "\treg_rd 1, 2 + 1 - 1, 2"
+
+
 if __name__ == '__main__':
     # run all methods marked with @test
     for t in tests:

From ac1de99fe6f29735c689a4c52665cb2257323705 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Mon, 26 Jul 2021 08:13:43 +0300
Subject: [PATCH 19/29] add simple include file processing

This will not work on the ESP32 for large files, due to limited memory.
But this is only the first step. Next we'll add a database for storing
defines from include files.
---
 esp32_ulp/preprocess.py | 50 +++++++++++++++++++++++++----------------
 tests/fixtures/incl.h   |  5 +++++
 tests/fixtures/incl2.h  |  2 ++
 tests/preprocess.py     | 49 +++++++++++++++++++++++++++++-----------
 4 files changed, 74 insertions(+), 32 deletions(-)
 create mode 100644 tests/fixtures/incl.h
 create mode 100644 tests/fixtures/incl2.h

diff --git a/esp32_ulp/preprocess.py b/esp32_ulp/preprocess.py
index 1eae375..51ec8ef 100644
--- a/esp32_ulp/preprocess.py
+++ b/esp32_ulp/preprocess.py
@@ -32,29 +32,31 @@ def WRITE_RTC_FIELD(rtc_reg, low_bit, value):
 
 class Preprocessor:
     def __init__(self):
+        self._defines_db = None
         self._defines = {}
 
+    def parse_define_line(self, line):
+        line = line.strip()
+        if not line.startswith("#define"):
+            # skip lines not containing #define
+            return {}
+        line = line[8:].strip()  # remove #define
+        parts = line.split(None, 1)
+        if len(parts) != 2:
+            # skip defines without value
+            return {}
+        identifier, value = parts
+        tmp = identifier.split('(', 1)
+        if len(tmp) == 2:
+            # skip parameterised defines (macros)
+            return {}
+        value = "".join(nocomment.remove_comments(value)).strip()
+        return {identifier: value}
+
     def parse_defines(self, content):
-        result = {}
         for line in content.splitlines():
-            line = line.strip()
-            if not line.startswith("#define"):
-                # skip lines not containing #define
-                continue
-            line = line[8:].strip()  # remove #define
-            parts = line.split(None, 1)
-            if len(parts) != 2:
-                # skip defines without value
-                continue
-            identifier, value = parts
-            tmp = identifier.split('(', 1)
-            if len(tmp) == 2:
-                # skip parameterised defines (macros)
-                continue
-            value = "".join(nocomment.remove_comments(value)).strip()
-            result[identifier] = value
-        self._defines = result
-        return result
+            self._defines.update(self.parse_define_line(line))
+        return self._defines
 
     def expand_defines(self, line):
         found = True
@@ -70,6 +72,16 @@ def expand_defines(self, line):
 
         return line
 
+    def process_include_file(self, filename):
+        defines = self._defines
+
+        with open(filename, 'r') as f:
+            for line in f:
+                result = self.parse_defines(line)
+                defines.update(result)
+
+        return defines
+
     def expand_rtc_macros(self, line):
         clean_line = line.strip()
         if not clean_line:
diff --git a/tests/fixtures/incl.h b/tests/fixtures/incl.h
new file mode 100644
index 0000000..5c8415e
--- /dev/null
+++ b/tests/fixtures/incl.h
@@ -0,0 +1,5 @@
+#define CONST1 42
+#define MACRO(x,y) x+y
+#define MULTI_LINE abc \
+                   xyz
+#define CONST2 99
\ No newline at end of file
diff --git a/tests/fixtures/incl2.h b/tests/fixtures/incl2.h
new file mode 100644
index 0000000..09775d1
--- /dev/null
+++ b/tests/fixtures/incl2.h
@@ -0,0 +1,2 @@
+#define CONST2 123
+#define CONST3 777
\ No newline at end of file
diff --git a/tests/preprocess.py b/tests/preprocess.py
index a31fe1b..f9fe936 100644
--- a/tests/preprocess.py
+++ b/tests/preprocess.py
@@ -27,19 +27,19 @@ def replace_defines_should_return_remove_comments():
 def test_parse_defines():
     p = Preprocessor()
 
-    assert p.parse_defines("") == {}
-    assert p.parse_defines("// comment") == {}
-    assert p.parse_defines("  // comment") == {}
-    assert p.parse_defines("  /* comment */") == {}
-    assert p.parse_defines("  /* comment */ #define A 42") == {}  # #define must be the first thing on a line
-    assert p.parse_defines("#define a 1") == {"a": "1"}
-    assert p.parse_defines(" #define a 1") == {"a": "1"}
-    assert p.parse_defines("#define a 1 2") == {"a": "1 2"}
-    assert p.parse_defines("#define f(a,b) 1") == {}  # macros not supported
-    assert p.parse_defines("#define f(a, b) 1") == {}  # macros not supported
-    assert p.parse_defines("#define f (a,b) 1") == {"f": "(a,b) 1"}  # f is not a macro
-    assert p.parse_defines("#define f (a, b) 1") == {"f": "(a, b) 1"}  # f is not a macro
-    assert p.parse_defines("#define RTC_ADDR       0x12345    // start of range") == {"RTC_ADDR": "0x12345"}
+    assert p.parse_define_line("") == {}
+    assert p.parse_define_line("// comment") == {}
+    assert p.parse_define_line("  // comment") == {}
+    assert p.parse_define_line("  /* comment */") == {}
+    assert p.parse_define_line("  /* comment */ #define A 42") == {}  # #define must be the first thing on a line
+    assert p.parse_define_line("#define a 1") == {"a": "1"}
+    assert p.parse_define_line(" #define a 1") == {"a": "1"}
+    assert p.parse_define_line("#define a 1 2") == {"a": "1 2"}
+    assert p.parse_define_line("#define f(a,b) 1") == {}  # macros not supported
+    assert p.parse_define_line("#define f(a, b) 1") == {}  # macros not supported
+    assert p.parse_define_line("#define f (a,b) 1") == {"f": "(a,b) 1"}  # f is not a macro
+    assert p.parse_define_line("#define f (a, b) 1") == {"f": "(a, b) 1"}  # f is not a macro
+    assert p.parse_define_line("#define RTC_ADDR       0x12345    // start of range") == {"RTC_ADDR": "0x12345"}
 
 
 @test
@@ -181,6 +181,29 @@ def test_expand_rtc_macros():
     assert p.expand_rtc_macros("READ_RTC_FIELD(1, 2)") == "\treg_rd 1, 2 + 1 - 1, 2"
 
 
+@test
+def test_process_include_file():
+    p = Preprocessor()
+
+    defines = p.process_include_file('fixtures/incl.h')
+    assert defines['CONST1'] == '42'
+    assert defines['CONST2'] == '99'
+    assert defines.get('MULTI_LINE', None) == 'abc \\'  # correct. line continuations not supported
+    assert 'MACRO' not in defines
+
+
+@test
+def test_process_include_file_with_multiple_files():
+    p = Preprocessor()
+
+    defines = p.process_include_file('fixtures/incl.h')
+    defines = p.process_include_file('fixtures/incl2.h')
+
+    assert defines['CONST1'] == '42', "constant from incl.h"
+    assert defines['CONST2'] == '123', "constant overridden by incl2.h"
+    assert defines['CONST3'] == '777', "constant from incl2.h"
+
+
 if __name__ == '__main__':
     # run all methods marked with @test
     for t in tests:

From 8d88fd1dd82b57746497687831db494060792f4a Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Tue, 27 Jul 2021 18:02:01 +0300
Subject: [PATCH 20/29] add support for using a btree database (DefinesDB) to
 store defines for preprocessing

The btree module, which ships with MicroPython, can efficiently manage a large
number of key-value pairs with minimal memory. It automatically initialises
to appropriate memory and cache limits, based on the device it's running
on, but if needed those parameters can be tuned too, e.g. to restrict memory
usage further.

The database is optional and must be supplied to the Preprocessor via the
use_db() method. It's safe however to always supply it, because a non-existing
database will behave like an empty database.

Care is taken not to unnecessarily create an empty db, when only reading from it
and not to unnecessarily check the file-system whether the database exists.

Inside the Preprocessor the database is opened and closed with a context manager.
This ensures the database will be closed properly again. While DefinesDB opens
the underlying database automatically, it cannot automatically close the database
again (using a destructor __del__ does not work, and MicroPython does not have
the "atexit" exit handler on the esp32). By using a context manager, the code
becomes cleaner, while still ensuring the database is closed at the end.
---
 esp32_ulp/definesdb.py  | 75 +++++++++++++++++++++++++++++++++
 esp32_ulp/preprocess.py | 61 ++++++++++++++++++++-------
 esp32_ulp/util.py       | 10 +++++
 tests/00_unit_tests.sh  |  2 +-
 tests/definesdb.py      | 60 ++++++++++++++++++++++++++
 tests/preprocess.py     | 93 +++++++++++++++++++++++++++++++++++++++++
 tests/util.py           | 16 ++++++-
 7 files changed, 299 insertions(+), 18 deletions(-)
 create mode 100644 esp32_ulp/definesdb.py
 create mode 100644 tests/definesdb.py

diff --git a/esp32_ulp/definesdb.py b/esp32_ulp/definesdb.py
new file mode 100644
index 0000000..ce1d232
--- /dev/null
+++ b/esp32_ulp/definesdb.py
@@ -0,0 +1,75 @@
+import os
+import btree
+from .util import file_exists
+
+DBNAME = 'defines.db'
+
+
+class DefinesDB:
+    def __init__(self):
+        self._file = None
+        self._db = None
+        self._db_exists = None
+
+    def clear(self):
+        self.close()
+        try:
+            os.remove(DBNAME)
+            self._db_exists = False
+        except OSError:
+            pass
+
+    def open(self):
+        if self._db:
+            return
+        try:
+            self._file = open(DBNAME, 'r+b')
+        except OSError:
+            self._file = open(DBNAME, 'w+b')
+        self._db = btree.open(self._file)
+        self._db_exists = True
+
+    def close(self):
+        if not self._db:
+            return
+        self._db.close()
+        self._db = None
+        self._file.close()
+        self._file = None
+
+    def db_exists(self):
+        if self._db_exists is None:
+            self._db_exists = file_exists(DBNAME)
+        return self._db_exists
+
+    def update(self, dictionary):
+        for k, v in dictionary.items():
+            self.__setitem__(k, v)
+
+    def get(self, key, default):
+        try:
+            result = self.__getitem__(key)
+        except KeyError:
+            result = default
+        return result
+
+    def keys(self):
+        if not self.db_exists():
+            return []
+
+        self.open()
+        return [k.decode() for k in self._db.keys()]
+
+    def __getitem__(self, key):
+        if not self.db_exists():
+            raise KeyError
+
+        self.open()
+        return self._db[key.encode()].decode()
+
+    def __setitem__(self, key, value):
+        self.open()
+        self._db[key.encode()] = str(value).encode()
+
+    def __iter__(self):
+        return iter(self.keys())
diff --git a/esp32_ulp/preprocess.py b/esp32_ulp/preprocess.py
index 51ec8ef..c0be8eb 100644
--- a/esp32_ulp/preprocess.py
+++ b/esp32_ulp/preprocess.py
@@ -1,5 +1,6 @@
 from . import nocomment
 from .util import split_tokens
+from .definesdb import DefinesDB
 
 
 class RTC_Macros:
@@ -56,6 +57,7 @@ def parse_define_line(self, line):
     def parse_defines(self, content):
         for line in content.splitlines():
             self._defines.update(self.parse_define_line(line))
+
         return self._defines
 
     def expand_defines(self, line):
@@ -66,6 +68,8 @@ def expand_defines(self, line):
             line = ""
             for t in tokens:
                 lu = self._defines.get(t, t)
+                if lu == t and self._defines_db:
+                    lu = self._defines_db.get(t, t)
                 if lu != t:
                     found = True
                 line += lu
@@ -73,14 +77,13 @@ def expand_defines(self, line):
         return line
 
     def process_include_file(self, filename):
-        defines = self._defines
-
-        with open(filename, 'r') as f:
-            for line in f:
-                result = self.parse_defines(line)
-                defines.update(result)
+        with self.open_db() as db:
+            with open(filename, 'r') as f:
+                for line in f:
+                    result = self.parse_define_line(line)
+                    db.update(result)
 
-        return defines
+        return db
 
     def expand_rtc_macros(self, line):
         clean_line = line.strip()
@@ -103,17 +106,43 @@ def expand_rtc_macros(self, line):
 
         return macro_fn(*macro_args)
 
+    def use_db(self, defines_db):
+        self._defines_db = defines_db
+
+    def open_db(self):
+        class ctx:
+            def __init__(self, db):
+                self._db = db
+
+            def __enter__(self):
+                # not opening DefinesDB - it opens itself when needed
+                return self._db
+
+            def __exit__(self, type, value, traceback):
+                if isinstance(self._db, DefinesDB):
+                    self._db.close()
+
+        if self._defines_db:
+            return ctx(self._defines_db)
+
+        return ctx(self._defines)
+
     def preprocess(self, content):
         self.parse_defines(content)
-        lines = nocomment.remove_comments(content)
-        result = []
-        for line in lines:
-            line = self.expand_defines(line)
-            line = self.expand_rtc_macros(line)
-            result.append(line)
-        result = "\n".join(result)
+
+        with self.open_db():
+            lines = nocomment.remove_comments(content)
+            result = []
+            for line in lines:
+                line = self.expand_defines(line)
+                line = self.expand_rtc_macros(line)
+                result.append(line)
+            result = "\n".join(result)
+
         return result
 
 
-def preprocess(content):
-    return Preprocessor().preprocess(content)
+def preprocess(content, use_defines_db=True):
+    preprocessor = Preprocessor()
+    preprocessor.use_db(DefinesDB() if use_defines_db else None)  # honour the opt-out flag
+    return preprocessor.preprocess(content)
diff --git a/esp32_ulp/util.py b/esp32_ulp/util.py
index 8d2832f..0dacf72 100644
--- a/esp32_ulp/util.py
+++ b/esp32_ulp/util.py
@@ -1,6 +1,7 @@
 DEBUG = False
 
 import gc
+import os
 
 NORMAL, WHITESPACE = 0, 1
 
@@ -67,3 +68,12 @@ def validate_expression(param):
                 if c not in '0123456789abcdef':
                     state = 0
     return True
+
+
+def file_exists(filename):
+    try:
+        os.stat(filename)
+        return True
+    except OSError:
+        pass
+    return False
diff --git a/tests/00_unit_tests.sh b/tests/00_unit_tests.sh
index efd5b64..ee1a239 100755
--- a/tests/00_unit_tests.sh
+++ b/tests/00_unit_tests.sh
@@ -4,7 +4,7 @@
 
 set -e
 
-for file in opcodes assemble link util preprocess; do
+for file in opcodes assemble link util preprocess definesdb; do
     echo testing $file...
     micropython $file.py
 done
diff --git a/tests/definesdb.py b/tests/definesdb.py
new file mode 100644
index 0000000..5e2100c
--- /dev/null
+++ b/tests/definesdb.py
@@ -0,0 +1,60 @@
+import os
+
+from esp32_ulp.definesdb import DefinesDB, DBNAME
+from esp32_ulp.util import file_exists
+
+tests = []
+
+
+def test(param):
+    tests.append(param)
+
+
+@test
+def test_definesdb_clear_removes_all_keys():
+    db = DefinesDB()
+    db.open()
+    db.update({'KEY1': 'VALUE1'})
+
+    db.clear()
+
+    assert 'KEY1' not in db
+
+    db.close()
+
+
+@test
+def test_definesdb_persists_data_across_instantiations():
+    db = DefinesDB()
+    db.open()
+    db.clear()
+
+    db.update({'KEY1': 'VALUE1'})
+
+    assert 'KEY1' in db
+
+    db.close()
+    del db
+    db = DefinesDB()
+    db.open()
+
+    assert db.get('KEY1', None) == 'VALUE1'
+
+    db.close()
+
+
+@test
+def test_definesdb_should_not_create_a_db_file_when_only_reading():
+    db = DefinesDB()
+
+    db.clear()
+    assert not file_exists(DBNAME)
+
+    assert db.get('some-key', None) is None
+    assert not file_exists(DBNAME)
+
+
+if __name__ == '__main__':
+    # run all methods marked with @test
+    for t in tests:
+        t()
diff --git a/tests/preprocess.py b/tests/preprocess.py
index f9fe936..e275707 100644
--- a/tests/preprocess.py
+++ b/tests/preprocess.py
@@ -1,4 +1,8 @@
+import os
+
 from esp32_ulp.preprocess import Preprocessor
+from esp32_ulp.definesdb import DefinesDB, DBNAME
+from esp32_ulp.util import file_exists
 
 tests = []
 
@@ -186,6 +190,7 @@ def test_process_include_file():
     p = Preprocessor()
 
     defines = p.process_include_file('fixtures/incl.h')
+
     assert defines['CONST1'] == '42'
     assert defines['CONST2'] == '99'
     assert defines.get('MULTI_LINE', None) == 'abc \\'  # correct. line continuations not supported
@@ -204,6 +209,94 @@ def test_process_include_file_with_multiple_files():
     assert defines['CONST3'] == '777', "constant from incl2.h"
 
 
+@test
+def test_process_include_file_using_database():
+    db = DefinesDB()
+    db.clear()
+
+    p = Preprocessor()
+    p.use_db(db)
+
+    p.process_include_file('fixtures/incl.h')
+    p.process_include_file('fixtures/incl2.h')
+
+    assert db['CONST1'] == '42', "constant from incl.h"
+    assert db['CONST2'] == '123', "constant overridden by incl2.h"
+    assert db['CONST3'] == '777', "constant from incl2.h"
+
+    db.close()
+
+
+@test
+def test_process_include_file_should_not_load_database_keys_into_instance_defines_dictionary():
+    db = DefinesDB()
+    db.clear()
+
+    p = Preprocessor()
+    p.use_db(db)
+
+    p.process_include_file('fixtures/incl.h')
+
+    # a bit hackish to reference instance-internal state
+    # but it's important to verify this, as we otherwise run out of memory on device
+    assert 'CONST2' not in p._defines
+
+
+
+@test
+def test_preprocess_should_use_definesdb_when_provided():
+    p = Preprocessor()
+
+    content = """\
+#define LOCALCONST 42
+
+entry:
+    move r1, LOCALCONST
+    move r2, DBKEY
+"""
+
+    # first try without db
+    result = p.preprocess(content)
+
+    assert "move r1, 42" in result
+    assert "move r2, DBKEY" in result
+    assert "move r2, 99" not in result
+
+    # now try with db
+    db = DefinesDB()
+    db.clear()
+    db.update({'DBKEY': '99'})
+    p.use_db(db)
+
+    result = p.preprocess(content)
+
+    assert "move r1, 42" in result
+    assert "move r2, 99" in result
+    assert "move r2, DBKEY" not in result
+
+
+@test
+def test_preprocess_should_ensure_no_definesdb_is_created_when_only_reading_from_it():
+    content = """\
+    #define CONST 42
+    move r1, CONST"""
+
+    # remove any existing db
+    db = DefinesDB()
+    db.clear()
+    assert not file_exists(DBNAME)
+
+    # now preprocess using db
+    p = Preprocessor()
+    p.use_db(db)
+
+    result = p.preprocess(content)
+
+    assert "move r1, 42" in result
+
+    assert not file_exists(DBNAME)
+
+
 if __name__ == '__main__':
     # run all methods marked with @test
     for t in tests:
diff --git a/tests/util.py b/tests/util.py
index 18ab54e..009f3f1 100644
--- a/tests/util.py
+++ b/tests/util.py
@@ -1,4 +1,5 @@
-from esp32_ulp.util import split_tokens, validate_expression
+import os
+from esp32_ulp.util import split_tokens, validate_expression, file_exists
 
 tests = []
 
@@ -56,6 +57,19 @@ def test_validate_expression():
     assert validate_expression('def cafe()') is False  # valid hex digits, but potentially dangerous code
 
 
+@test
+def test_file_exists():
+    testfile = '.testfile'
+    with open(testfile, 'w') as f:
+        f.write('contents')
+
+    assert file_exists(testfile)
+
+    os.remove(testfile)
+
+    assert not file_exists(testfile)
+
+
 if __name__ == '__main__':
     # run all methods marked with @test
     for t in tests:

From 46f1442b25353c854ad244669d378335f794c2c8 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Tue, 27 Jul 2021 23:05:48 +0300
Subject: [PATCH 21/29] add special handling for the BIT macro used in the
 esp-idf framework

The functions the preprocessor supports (WRITE_RTC_*/READ_RTC_*) do not need the
value returned by the BIT macro. Instead, they use the bit number specified to the
BIT macro, i.e. for BIT(x) they need x. So this change handles BIT by simply
replacing it with an empty string, and BIT(x) results in (x) in the preprocessor
output.
---
 esp32_ulp/preprocess.py |  9 +++++++++
 tests/preprocess.py     | 15 +++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/esp32_ulp/preprocess.py b/esp32_ulp/preprocess.py
index c0be8eb..a890005 100644
--- a/esp32_ulp/preprocess.py
+++ b/esp32_ulp/preprocess.py
@@ -70,6 +70,15 @@ def expand_defines(self, line):
                 lu = self._defines.get(t, t)
                 if lu == t and self._defines_db:
                     lu = self._defines_db.get(t, t)
+                if lu == t and t == 'BIT':
+                    # Special hack: BIT(..) translates to a 32-bit mask where only the specified bit is set.
+                    # But the reg_wr and reg_rd opcodes expect actual bit numbers for argument 2 and 3.
+                    # While the real READ_RTC_*/WRITE_RTC_* macros take in the output of BIT(x), they
+                    # ultimately convert these back (via helper macros) to the bit number (x). And since this
+                    # preprocessor does not (aim to) implement "proper" macro-processing, we can simply
+                    # short-circuit this round-trip via macros and replace "BIT" with nothing so that
+                    # "BIT(x)" gets mapped to "(x)".
+                    continue
                 if lu != t:
                     found = True
                 line += lu
diff --git a/tests/preprocess.py b/tests/preprocess.py
index e275707..30f4e49 100644
--- a/tests/preprocess.py
+++ b/tests/preprocess.py
@@ -185,6 +185,21 @@ def test_expand_rtc_macros():
     assert p.expand_rtc_macros("READ_RTC_FIELD(1, 2)") == "\treg_rd 1, 2 + 1 - 1, 2"
 
 
+@test
+def preprocess_should_replace_BIT_with_empty_string_unless_defined():
+    # by default replace BIT with empty string (see description for why in the code)
+    src = " move r1, 0x123 << BIT(24)"
+    assert "move r1, 0x123 << (24)" in Preprocessor().preprocess(src)
+
+    # but if BIT is defined, use that
+    src = """\
+    #define BIT 12
+
+    move r1, BIT"""
+
+    assert "move r1, 12" in Preprocessor().preprocess(src)
+
+
 @test
 def test_process_include_file():
     p = Preprocessor()

From 2f6ee78d156d7ee15b44119db2a738125989107c Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Wed, 28 Jul 2021 07:29:04 +0300
Subject: [PATCH 22/29] add include processor tool for populating a defines.db
 from include files

---
 esp32_ulp/parse_to_db.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 esp32_ulp/parse_to_db.py

diff --git a/esp32_ulp/parse_to_db.py b/esp32_ulp/parse_to_db.py
new file mode 100644
index 0000000..ac61f98
--- /dev/null
+++ b/esp32_ulp/parse_to_db.py
@@ -0,0 +1,23 @@
+import sys
+
+from .preprocess import Preprocessor
+from .definesdb import DefinesDB
+
+
+def parse(files):
+    db = DefinesDB()
+
+    p = Preprocessor()
+    p.use_db(db)
+
+    for f in files:
+        print('Processing file:', f)
+
+        p.process_include_file(f)
+
+    print('Done.')
+
+
+if __name__ == '__main__':
+    parse(sys.argv[1:])
+

From 69ae94696bc9b4334a7934fdb8bd5d744df8a769 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Wed, 28 Jul 2021 07:26:18 +0300
Subject: [PATCH 23/29] add compatibility tests using good example code off the
 net

The test script will fetch the ESP-IDF framework to have all necessary
include files, and will then fetch two sources of example code (ulptool and
binutil-esp32_ulp's own test examples).

The examples are fetched rather than duplicated into this repo, to avoid
potential licensing and attribution issues.
---
 .github/workflows/run_tests.yaml |  10 ++-
 tests/02_compat_rtc_tests.sh     | 118 +++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100755 tests/02_compat_rtc_tests.sh

diff --git a/.github/workflows/run_tests.yaml b/.github/workflows/run_tests.yaml
index 166a2e5..e9fdb6d 100644
--- a/.github/workflows/run_tests.yaml
+++ b/.github/workflows/run_tests.yaml
@@ -70,5 +70,13 @@ jobs:
         export PATH=$PATH:${{ steps.build_micropython.outputs.bin_dir }}
         export PATH=$PATH:${{ steps.build_binutils.outputs.bin_dir }}
         cd tests
-        ln -s ../binutils-esp32ulp  # already cloned earlier. reuse.
         ./01_compat_tests.sh
+
+    - name: Run compat tests with RTC macros
+      id: compat_rtc_tests
+      run: |
+        export PATH=$PATH:${{ steps.build_micropython.outputs.bin_dir }}
+        export PATH=$PATH:${{ steps.build_binutils.outputs.bin_dir }}
+        cd tests
+        ln -s ../binutils-esp32ulp  # already cloned earlier. reuse.
+        ./02_compat_rtc_tests.sh
diff --git a/tests/02_compat_rtc_tests.sh b/tests/02_compat_rtc_tests.sh
new file mode 100755
index 0000000..0f64864
--- /dev/null
+++ b/tests/02_compat_rtc_tests.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+# export PYTHONPATH=.:$PYTHONPATH
+
+set -e
+
+make_log_dir() {
+   mkdir -p log
+}
+
+fetch_esp_idf() {
+    [ -d esp-idf ] && return
+
+    echo "Fetching esp-idf"
+    log_file=log/fetch-esp-idf.log
+    git clone --depth 1 \
+        https://github.com/espressif/esp-idf.git 1>$log_file 2>&1
+}
+
+fetch_ulptool_examples() {
+    [ -d ulptool ] && return
+
+    echo "Fetching ulptool examples"
+    log_file=log/fetch-ulptool.log
+    git clone --depth 1 \
+        https://github.com/duff2013/ulptool 1>$log_file 2>&1
+}
+
+fetch_binutils_esp32ulp_examples() {
+    [ -d binutils-esp32ulp ] && return
+
+    echo "Fetching binutils-esp32ulp examples"
+    log_file=log/fetch-binutils.log
+    git clone --depth 1 \
+        https://github.com/espressif/binutils-esp32ulp.git 1>$log_file 2>&1
+}
+
+build_defines_db() {
+    local defines_db=defines.db
+
+    if [ "$1" = "-r" ] && [ -s "${defines_db}" ]; then
+        # reuse existing defines.db
+        return
+    fi
+
+    echo "Building defines DB from include files"
+    log_file=log/build_defines_db.log
+    rm -f "${defines_db}"
+    micropython -m esp32_ulp.parse_to_db \
+        esp-idf/components/soc/esp32/include/soc/*.h \
+        esp-idf/components/esp_common/include/*.h 1>$log_file
+}
+
+make_log_dir
+fetch_esp_idf
+fetch_ulptool_examples
+fetch_binutils_esp32ulp_examples
+build_defines_db $1
+
+for src_file in ulptool/src/ulp_examples/*/*.s binutils-esp32ulp/gas/testsuite/gas/esp32ulp/esp32/*.s; do
+
+    src_name="${src_file%.s}"
+
+    echo "Testing $src_file"
+
+    test_name="${src_name##*/}"
+
+    # for now, skip files that contain known bugs in esp32_ulp (essentially a todo list of what to fix)
+    for I in rtcio esp32ulp_all esp32ulp_globals esp32ulp_jumpr esp32ulp_ranges test_reg; do
+        if [ "${test_name}" = "$I" ]; then
+            # these are old bugs, and not related to the RTC macro handling functionality
+            # they will still be great to fix over time
+            echo -e "\tSkipping... known bugs in esp32_ulp"
+            continue 2
+        fi
+    done
+
+    # for now, skip files that contain unsupported things (macros)
+    for I in i2c i2c_dev stack i2c_wr test1 test_jumpr test_macro; do
+        if [ "${test_name}" = "$I" ]; then
+            echo -e "\tSkipping... not yet supported"
+            continue 2
+        fi
+    done
+
+    echo -e "\tBuilding using py-esp32-ulp"
+    ulp_file="${src_name}.ulp"
+    log_file="${src_name}.log"
+    micropython -m esp32_ulp $src_file 1>$log_file   # generates $ulp_file
+
+    pre_file="${src_name}.pre"
+    obj_file="${src_name}.o"
+    elf_file="${src_name}.elf"
+    bin_file="${src_name}.bin"
+
+    echo -e "\tBuilding using binutils"
+    gcc -I esp-idf/components/soc/esp32/include -I esp-idf/components/esp_common/include \
+        -x assembler-with-cpp \
+        -E -o ${pre_file} $src_file
+    esp32ulp-elf-as -o $obj_file ${pre_file}
+    esp32ulp-elf-ld -T esp32.ulp.ld -o $elf_file $obj_file
+    esp32ulp-elf-objcopy -O binary $elf_file $bin_file
+
+    if ! diff $ulp_file $bin_file 1>/dev/null; then
+        echo -e "\tBuild outputs differ!"
+        echo ""
+        echo "Compatibility test failed for $src_file"
+        echo "py-esp32-ulp log:"
+        cat $log_file
+        echo "py-esp32-ulp output:"
+        xxd $ulp_file
+        echo "binutils output:"
+        xxd $bin_file
+        exit 1
+    else
+        echo -e "\tBuild outputs match"
+    fi
+done

From 4f90f762d2dbcea09fc8d3ecd0fda293bf74a935 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 29 Jul 2021 21:59:48 +0300
Subject: [PATCH 24/29] add documentation for the preprocessor

This defines what the preprocessor aims to do, why and what its
intentional limitations are.

Examples on how to use it and how to use the "Defines DB" are also
provided
---
 README.rst          |   5 ++
 docs/preprocess.rst | 138 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 docs/preprocess.rst

diff --git a/README.rst b/README.rst
index 56395d1..2afa421 100644
--- a/README.rst
+++ b/README.rst
@@ -24,6 +24,11 @@ Constants defined with ``.set`` are supported in expressions.
 We have some unit tests and also compatibility tests that compare the output
 whether it is identical with binutils-esp32ulp output.
 
+There is a simple preprocessor that understands just enough to allow assembling
+ULP source files containing convenience macros such as WRITE_RTC_REG. The
+preprocessor and how to use it is documented here:
+`Preprocessor support <docs/preprocess.rst>`_.
+
 There might be some stuff missing, some bugs and other symptoms of alpha
 software. Also, error and exception handling is rather rough yet.
 
diff --git a/docs/preprocess.rst b/docs/preprocess.rst
new file mode 100644
index 0000000..0716e69
--- /dev/null
+++ b/docs/preprocess.rst
@@ -0,0 +1,138 @@
+Preprocessor
+---------------------
+
+py-esp32-ulp contains a small preprocessor, which aims to fulfill one goal:
+facilitate assembling of ULP code from Espressif and other open-source
+projects to loadable/executable machine code without modification.
+
+Such code uses convenience macros (``READ_RTC_*`` and ``WRITE_RTC_*``)
+provided by the ESP-IDF framework, along with constants defined in the
+framework's include files (such as ``RTC_GPIO_IN_REG``), to make reading
+and writing from/to peripheral registers much easier.
+
+In order to do this the preprocessor has two capabilities:
+
+1. Parse and replace identifiers defined with ``#define``
+2. Recognise the ``WRITE_RTC_*`` and ``READ_RTC_*`` macros and expand
+   them in a way that mirrors what the real ESP-IDF macros do.
+
+
+Usage
+------------------------
+
+Normally the assembler is called as follows
+
+.. code-block:: python
+
+    src = "..full assembler file contents"
+    assembler = Assembler()
+    assembler.assemble(src)
+    ...
+
+With the preprocessor, simply pass the source code via the preprocessor first:
+
+.. code-block:: python
+
+    from preprocess import preprocess
+
+    src = "..full assembler file contents"
+    src = preprocess(src)
+    assembler = Assembler()
+    assembler.assemble(src)
+    ...
+
+
+Using a "Defines Database"
+--------------------------
+
+Because the py-esp32-ulp assembler was built for running on the ESP32
+microcontroller with limited RAM, the preprocessor aims to work there too.
+
+To handle a large number of defined constants (such as the ``RTC_*`` constants from
+the ESP-IDF) the preprocessor can use a database (based on Berkeley DB) stored on the
+device's filesystem for looking up defines.
+
+The database needs to be populated before preprocessing. (Usually, when only using
+constants from the ESP-IDF, this is a one-time step, because the include files
+don't change.) The database can be reused for all subsequent preprocessor runs.
+
+(The database can also be generated on a PC and then deployed to the ESP32, to
+save processing effort on the device. In that case the include files themselves
+are not needed on the device either.)
+
+1. Build the defines database
+
+   The ``esp32_ulp.parse_to_db`` tool can be used to generate the defines
+   database from include files. The resulting file will be called
+   ``defines.db``.
+
+   (The following assumes running on a PC. To do this on device, refer to the
+   `esp32_ulp/parse_to_db.py <../esp32_ulp/parse_to_db.py>`_ file.)
+
+   .. code-block:: bash
+
+      # general command
+      micropython -m esp32_ulp.parse_to_db path/to/include.h
+
+      # loading specific ESP-IDF include files
+      micropython -m esp32_ulp.parse_to_db esp-idf/components/soc/esp32/include/soc/soc_ulp.h
+
+      # loading multiple files at once
+      micropython -m esp32_ulp.parse_to_db esp-idf/components/soc/esp32/include/soc/*.h
+
+      # if file system space is not a concern, the following can be convenient
+      # by including all relevant include files from the ESP-IDF framework.
+      # This results in an approximately 2MB large database.
+      micropython -m esp32_ulp.parse_to_db \
+        esp-idf/components/soc/esp32/include/soc/*.h \
+        esp-idf/components/esp_common/include/*.h
+
+      # most ULP code uses only 5 include files. Parsing only those into the
+      # database should thus allow assembling virtually all ULP code one would
+      # find or want to write.
+      # This results in an approximately 250kB large database.
+      micropython -m esp32_ulp.parse_to_db \
+        esp-idf/components/soc/esp32/include/soc/{soc,soc_ulp,rtc_cntl_reg,rtc_io_reg,sens_reg}.h
+
+2. Using the defines database during preprocessing
+
+   The preprocessor will automatically use a defines database, when using the
+   ``preprocess.preprocess`` convenience function, even when the database does
+   not exist (an absent database is treated like an empty database, and care
+   is taken not to create an empty database file, cluttering up the filesystem,
+   when not needed).
+
+   If you do not want the preprocessor to use a DefinesDB, pass ``False`` to
+   the ``use_defines_db`` argument of the ``preprocess`` convenience function,
+   or instantiate the ``Preprocessor`` class directly, without passing it a
+   DefinesDB instance via ``use_db``.
+
+Design choices
+--------------
+
+The preprocessor does not support:
+
+1. Function style macros such as :code:`#define f(a,b) (a+b)`
+
+   This is not important, because there are only a few RTC macros that need
+   to be supported and they are simply implemented as Python functions.
+
+   Since the preprocessor will understand ``#define`` directives directly in the
+   assembler source file, include mechanisms are not needed in some cases
+   (simply copying the needed ``#define`` statements from include files into the
+   assembler source will work).
+
+2. ``#include`` directives
+
+   The preprocessor does not currently follow ``#include`` directives. To
+   limit space requirements (both in memory and on the filesystem), the
+   preprocessor relies on a database of defines (key/value pairs). This
+   database should be populated before using the preprocessor, by using the
+   ``esp32_ulp.parse_to_db`` tool (see section above), which parses include
+   files for identifiers defined therein.
+
+3. Preserving comments
+
+   The assumption is that the output will almost always go into the
+   assembler directly, so preserving comments is not very useful and
+   would add a lot of complexity.

From d44384f1790d173fa2a1ff390da0948196f37d26 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Wed, 28 Jul 2021 07:52:02 +0300
Subject: [PATCH 25/29] fix use of treg field in i_move instruction to match
 binutils-esp32 output in all cases

This fix makes compat tests pass for:
https://github.com/duff2013/ulptool/blob/master/src/ulp_examples/ulp_rtc_gpio/rtcio.s
---
 esp32_ulp/opcodes.py         | 2 +-
 tests/02_compat_rtc_tests.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/esp32_ulp/opcodes.py b/esp32_ulp/opcodes.py
index 59006f2..10b5bd5 100644
--- a/esp32_ulp/opcodes.py
+++ b/esp32_ulp/opcodes.py
@@ -482,7 +482,7 @@ def i_move(reg_dest, reg_imm_src):
     if src.type == REG:
         _alu_reg.dreg = dest
         _alu_reg.sreg = src.value
-        _alu_reg.treg = 1  # XXX undocumented, this is the value binutils-esp32 uses
+        _alu_reg.treg = src.value  # XXX undocumented, this is the value binutils-esp32 uses
         _alu_reg.unused = 0
         _alu_reg.sel = ALU_SEL_MOV
         _alu_reg.sub_opcode = SUB_OPCODE_ALU_REG
diff --git a/tests/02_compat_rtc_tests.sh b/tests/02_compat_rtc_tests.sh
index 0f64864..2904ee6 100755
--- a/tests/02_compat_rtc_tests.sh
+++ b/tests/02_compat_rtc_tests.sh
@@ -66,7 +66,7 @@ for src_file in ulptool/src/ulp_examples/*/*.s binutils-esp32ulp/gas/testsuite/g
     test_name="${src_name##*/}"
 
     # for now, skip files that contain known bugs in esp32_ulp (essentially a todo list of what to fix)
-    for I in rtcio esp32ulp_all esp32ulp_globals esp32ulp_jumpr esp32ulp_ranges test_reg; do
+    for I in esp32ulp_all esp32ulp_globals esp32ulp_jumpr esp32ulp_ranges test_reg; do
         if [ "${test_name}" = "$I" ]; then
             # these are old bugs, and not related to the RTC macro handling functionality
             # they will still be great to fix over time

From 254adf983fcd7bfb7c673b91f3e7fbab2c97ae0d Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Wed, 28 Jul 2021 09:18:50 +0300
Subject: [PATCH 26/29] allow specifying the address for reg_rd and reg_wr in
 32-bit words

This change allows specifying the address in 32-bit words (i.e. the
address as seen from the ULP), in addition to the existing mode of
specifying a register's full address on the DPORT bus.

If an address is between 0 and DR_REG_MAX_DIRECT (0x3ff), treat it
as a word offset (ULP address), otherwise treat it as a full address
on the DPORT bus as before.
---
 esp32_ulp/opcodes.py | 16 ++++++++++++----
 tests/compat/fixes.S |  5 +++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/esp32_ulp/opcodes.py b/esp32_ulp/opcodes.py
index 10b5bd5..8a5b6d7 100644
--- a/esp32_ulp/opcodes.py
+++ b/esp32_ulp/opcodes.py
@@ -16,6 +16,7 @@
 OPCODE_WR_REG = 1
 OPCODE_RD_REG = 2
 
+DR_REG_MAX_DIRECT = 0x3ff
 RD_REG_PERIPH_RTC_CNTL = 0
 RD_REG_PERIPH_RTC_IO = 1
 RD_REG_PERIPH_SENS = 2
@@ -353,8 +354,9 @@ def get_cond(arg):
 
 def _soc_reg_to_ulp_periph_sel(reg):
     # Map SoC peripheral register to periph_sel field of RD_REG and WR_REG instructions.
-    ret = 3
-    if reg < DR_REG_RTCCNTL_BASE:
+    if reg < DR_REG_MAX_DIRECT:
+        ret = RD_REG_PERIPH_RTC_CNTL
+    elif reg < DR_REG_RTCCNTL_BASE:
         raise ValueError("invalid register base")
     elif reg < DR_REG_RTCIO_BASE:
         ret = RD_REG_PERIPH_RTC_CNTL
@@ -371,7 +373,10 @@ def _soc_reg_to_ulp_periph_sel(reg):
 
 def i_reg_wr(reg, high_bit, low_bit, val):
     reg = get_imm(reg)
-    _wr_reg.addr = (reg & 0xff) >> 2
+    if reg < DR_REG_MAX_DIRECT:  # see https://github.com/espressif/binutils-esp32ulp/blob/master/gas/config/tc-esp32ulp_esp32.c
+        _wr_reg.addr = reg
+    else:
+        _wr_reg.addr = (reg & 0xff) >> 2
     _wr_reg.periph_sel = _soc_reg_to_ulp_periph_sel(reg)
     _wr_reg.data = get_imm(val)
     _wr_reg.low = get_imm(low_bit)
@@ -382,7 +387,10 @@ def i_reg_wr(reg, high_bit, low_bit, val):
 
 def i_reg_rd(reg, high_bit, low_bit):
     reg = get_imm(reg)
-    _rd_reg.addr = (reg & 0xff) >> 2
+    if reg < DR_REG_MAX_DIRECT: # see https://github.com/espressif/binutils-esp32ulp/blob/master/gas/config/tc-esp32ulp_esp32.c
+        _rd_reg.addr = reg
+    else:
+        _rd_reg.addr = (reg & 0xff) >> 2
     _rd_reg.periph_sel = _soc_reg_to_ulp_periph_sel(reg)
     _rd_reg.unused = 0
     _rd_reg.low = get_imm(low_bit)
diff --git a/tests/compat/fixes.S b/tests/compat/fixes.S
index 0c84f1b..022951a 100644
--- a/tests/compat/fixes.S
+++ b/tests/compat/fixes.S
@@ -17,4 +17,9 @@ counter:
 entry:
   MOVE R1, gpio
   WAIT 42
+
+  # reg_rd/reg_wr with "short" and "long" address notation
+  reg_rd 12, 7, 0
+  reg_rd 0x3ff48000, 7, 0
+
   halt

From c3bd1010746324b9409e81360f75715c10d15d37 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Thu, 29 Jul 2021 21:31:00 +0300
Subject: [PATCH 27/29] support .int data type

.long and .int are the same as per GNU assembler manual:
https://sourceware.org/binutils/docs/as/Long.html

binutils-esp32ulp also treats them the same (compat test included
to verify this)
---
 esp32_ulp/assemble.py | 5 +++++
 tests/compat/fixes.S  | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 9180d8a..7a92a8e 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -240,6 +240,11 @@ def d_word(self, *args):
         self.append_data(2, args)
 
     def d_long(self, *args):
+        self.d_int(*args)
+
+    def d_int(self, *args):
+        # .long and .int are identical as per GNU assembler documentation
+        # https://sourceware.org/binutils/docs/as/Long.html
         self.append_data(4, args)
 
     def assembler_pass(self, lines):
diff --git a/tests/compat/fixes.S b/tests/compat/fixes.S
index 022951a..9e4d0ef 100644
--- a/tests/compat/fixes.S
+++ b/tests/compat/fixes.S
@@ -12,6 +12,9 @@
 counter:
 .long 0
 
+.data
+var2: .int 1111
+
   .text
   .global entry
 entry:

From 2a0a39a810c70218a02c7ec9e0b33945ba064e23 Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Mon, 9 Aug 2021 19:45:36 +0300
Subject: [PATCH 28/29] refactor: small improvements based on PR comments.

---
 esp32_ulp/__main__.py   |  2 +-
 esp32_ulp/assemble.py   | 12 +++++++++---
 esp32_ulp/definesdb.py  |  7 +++++--
 esp32_ulp/opcodes.py    |  6 +++---
 esp32_ulp/preprocess.py |  5 ++---
 esp32_ulp/util.py       |  4 ++--
 tests/compat/expr.S     | 12 ++++++++----
 tests/fixtures/incl.h   |  2 +-
 tests/fixtures/incl2.h  |  2 +-
 tests/preprocess.py     | 20 ++++++++++++++++++++
 10 files changed, 52 insertions(+), 20 deletions(-)

diff --git a/esp32_ulp/__main__.py b/esp32_ulp/__main__.py
index d9555fd..209656f 100644
--- a/esp32_ulp/__main__.py
+++ b/esp32_ulp/__main__.py
@@ -10,6 +10,7 @@
 
 def src_to_binary(src):
     assembler = Assembler()
+    src = preprocess(src)
     assembler.assemble(src, remove_comments=False)  # comments already removed by preprocessor
     garbage_collect('before symbols export')
     addrs_syms = assembler.symbols.export()
@@ -24,7 +25,6 @@ def main(fn):
     with open(fn) as f:
         src = f.read()
 
-    src = preprocess(src)
     binary = src_to_binary(src)
 
     if fn.endswith('.s') or fn.endswith('.S'):
diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index 7a92a8e..e348363 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -87,8 +87,8 @@ def set_global(self, symbol):
 
 class Assembler:
 
-    def __init__(self, symbols=None, bases=None, globls=None):
-        self.symbols = SymbolTable(symbols or {}, bases or {}, globls or {})
+    def __init__(self, symbols=None, bases=None, globals=None):
+        self.symbols = SymbolTable(symbols or {}, bases or {}, globals or {})
         opcodes.symbols = self.symbols  # XXX dirty hack
 
     def init(self, a_pass):
@@ -223,7 +223,7 @@ def d_align(self, align=4, fill=None):
             self.fill(self.section, amount, fill)
 
     def d_set(self, symbol, expr):
-        value = int(opcodes.eval_arg(expr))  # TODO: support more than just integers
+        value = int(opcodes.eval_arg(expr))
         self.symbols.set_sym(symbol, ABS, None, value)
 
     def d_global(self, symbol):
@@ -265,6 +265,12 @@ def assembler_pass(self, lines):
                     # machine instruction
                     func = getattr(opcodes, 'i_' + opcode.lower(), None)
                     if func is not None:
+                        # during the first pass, not all symbols are known yet,
+                        # so some expressions cannot be evaluated (yet).
+                        # building an instruction requires valid arguments, however.
+                        # since all instructions are 4 bytes long, we simply skip
+                        # building instructions during pass 1, and append an "empty
+                        # instruction" to the section to get the right section size.
                         instruction = 0 if self.a_pass == 1 else func(*args)
                         self.append_section(instruction.to_bytes(4, 'little'), TEXT)
                         continue
diff --git a/esp32_ulp/definesdb.py b/esp32_ulp/definesdb.py
index ce1d232..4a05459 100644
--- a/esp32_ulp/definesdb.py
+++ b/esp32_ulp/definesdb.py
@@ -19,8 +19,11 @@ def clear(self):
         except OSError:
             pass
 
+    def is_open(self):
+        return self._db is not None
+
     def open(self):
-        if self._db:
+        if self.is_open():
             return
         try:
             self._file = open(DBNAME, 'r+b')
@@ -30,7 +33,7 @@ def open(self):
         self._db_exists = True
 
     def close(self):
-        if not self._db:
+        if not self.is_open():
             return
         self._db.close()
         self._db = None
diff --git a/esp32_ulp/opcodes.py b/esp32_ulp/opcodes.py
index 8a5b6d7..103b1f7 100644
--- a/esp32_ulp/opcodes.py
+++ b/esp32_ulp/opcodes.py
@@ -307,10 +307,10 @@ def arg_qualify(arg):
         pass
     try:
         entry = symbols.get_sym(arg)
-        return ARG(SYM, entry, arg)
     except KeyError:
-        pass
-    return ARG(IMM, int(eval_arg(arg)), arg)
+        return ARG(IMM, int(eval_arg(arg)), arg)
+    else:
+        return ARG(SYM, entry, arg)
 
 
 def get_reg(arg):
diff --git a/esp32_ulp/preprocess.py b/esp32_ulp/preprocess.py
index a890005..03a9317 100644
--- a/esp32_ulp/preprocess.py
+++ b/esp32_ulp/preprocess.py
@@ -14,13 +14,12 @@ def READ_RTC_REG(rtc_reg, low_bit, bit_width):
 
     @staticmethod
     def WRITE_RTC_REG(rtc_reg, low_bit, bit_width, value):
-        args = (
+        return '\treg_wr ' + ', '.join((
             rtc_reg,
             '%s + %s - 1' % (low_bit, bit_width),
             low_bit,
             value
-        )
-        return '\treg_wr ' + ', '.join(args)
+        ))
 
     @staticmethod
     def READ_RTC_FIELD(rtc_reg, low_bit):
diff --git a/esp32_ulp/util.py b/esp32_ulp/util.py
index 0dacf72..d79c538 100644
--- a/esp32_ulp/util.py
+++ b/esp32_ulp/util.py
@@ -19,14 +19,14 @@ def split_tokens(line):
     tokens = []
     state = NORMAL
     for c in line:
-        if ('a' <= c <= 'z') or ('A' <= c <= 'Z') or ('0' <= c <= '9') or c == '_':
+        if c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_":
             if state != NORMAL:
                 if len(buf) > 0:
                     tokens.append(buf)
                     buf = ""
                 state = NORMAL
             buf += c
-        elif c == ' ' or c == '\t':
+        elif c in " \t":
             if state != WHITESPACE:
                 if len(buf) > 0:
                     tokens.append(buf)
diff --git a/tests/compat/expr.S b/tests/compat/expr.S
index 48f7304..3650623 100644
--- a/tests/compat/expr.S
+++ b/tests/compat/expr.S
@@ -1,3 +1,4 @@
+# common example of real world code using expressions
   .set adc_channel, 6
 
   .set adc_oversampling_factor_log, 2
@@ -26,7 +27,13 @@ measure:
   move r3, result
   st r0, r3, 0
 
-  #test that expressions evaluate correctly for all supported operators
+exit:
+  halt
+
+
+# ---
+# test that expressions evaluate correctly for all supported operators
+# (these statements do not mean anything other than testing the operations)
   move r3, 1+2
   move r3, 3-5
   move r3, -5
@@ -39,6 +46,3 @@ measure:
   move r3, 0x1234 & ~2
   move r3, 42|4&0xf  # 46 (4&0xf is evaluated first)
   move r3, (42|4)&0xf  # 14 (42|4 is evaluated first)
-
-exit:
-  halt
diff --git a/tests/fixtures/incl.h b/tests/fixtures/incl.h
index 5c8415e..712aa7c 100644
--- a/tests/fixtures/incl.h
+++ b/tests/fixtures/incl.h
@@ -2,4 +2,4 @@
 #define MACRO(x,y) x+y
 #define MULTI_LINE abc \
                    xyz
-#define CONST2 99
\ No newline at end of file
+#define CONST2 99
diff --git a/tests/fixtures/incl2.h b/tests/fixtures/incl2.h
index 09775d1..d19aeba 100644
--- a/tests/fixtures/incl2.h
+++ b/tests/fixtures/incl2.h
@@ -1,2 +1,2 @@
 #define CONST2 123
-#define CONST3 777
\ No newline at end of file
+#define CONST3 777
diff --git a/tests/preprocess.py b/tests/preprocess.py
index 30f4e49..5a3825d 100644
--- a/tests/preprocess.py
+++ b/tests/preprocess.py
@@ -312,6 +312,26 @@ def test_preprocess_should_ensure_no_definesdb_is_created_when_only_reading_from
     assert not file_exists(DBNAME)
 
 
+@test
+def test_preprocess_should_ensure_the_definesdb_is_properly_closed_after_use():
+    content = """\
+    #define CONST 42
+    move r1, CONST"""
+
+    # open a db up front, so we can check that it gets closed again
+    db = DefinesDB()
+    db.open()
+    assert db.is_open()
+
+    # now preprocess using db
+    p = Preprocessor()
+    p.use_db(db)
+
+    p.preprocess(content)
+
+    assert not db.is_open()
+
+
 if __name__ == '__main__':
     # run all methods marked with @test
     for t in tests:

From 47d5e8a9e9e309cd8e50eeb9f8d8f36a7f67055a Mon Sep 17 00:00:00 2001
From: Wilko Nienhaus <wilko.nienhaus@gmail.com>
Date: Mon, 9 Aug 2021 20:22:22 +0300
Subject: [PATCH 29/29] Updated LICENSE file and added AUTHORS file

---
 AUTHORS | 8 ++++++++
 LICENSE | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 AUTHORS

diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..571f8ee
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,8 @@
+E-mail addresses listed here are not intended for support.
+
+py-esp32-ulp authors
+--------------------
+py-esp32-ulp is written and maintained by Thomas Waldmann and various contributors:
+
+- Thomas Waldmann <tw@waldmann-edv.de>
+- Wilko Nienhaus <wilko.nienhaus@gmail.com>
diff --git a/LICENSE b/LICENSE
index 6fc734f..46bf124 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (c) 2018 Thomas Waldmann
+Copyright 2018-2021 by the py-esp32-ulp authors, see AUTHORS file
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal