diff --git a/.github/workflows/run_tests.yaml b/.github/workflows/run_tests.yaml
index 166a2e5..e9fdb6d 100644
--- a/.github/workflows/run_tests.yaml
+++ b/.github/workflows/run_tests.yaml
@@ -70,5 +70,13 @@ jobs:
         export PATH=$PATH:${{ steps.build_micropython.outputs.bin_dir }}
         export PATH=$PATH:${{ steps.build_binutils.outputs.bin_dir }}
         cd tests
-        ln -s ../binutils-esp32ulp  # already cloned earlier. reuse.
         ./01_compat_tests.sh
+
+    - name: Run compat tests with RTC macros
+      id: compat_rtc_tests
+      run: |
+        export PATH=$PATH:${{ steps.build_micropython.outputs.bin_dir }}
+        export PATH=$PATH:${{ steps.build_binutils.outputs.bin_dir }}
+        cd tests
+        ln -s ../binutils-esp32ulp  # already cloned earlier. reuse.
+        ./02_compat_rtc_tests.sh
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..571f8ee
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,8 @@
+E-mail addresses listed here are not intended for support.
+
+py-esp32-ulp authors
+--------------------
+py-esp32-ulp is written and maintained by Thomas Waldmann and various contributors:
+
+- Thomas Waldmann <tw@waldmann-edv.de>
+- Wilko Nienhaus <wilko.nienhaus@gmail.com>
diff --git a/LICENSE b/LICENSE
index 6fc734f..46bf124 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (c) 2018 Thomas Waldmann
+Copyright 2018-2021 by the py-esp32-ulp authors, see AUTHORS file
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.rst b/README.rst
index 3952878..2afa421 100644
--- a/README.rst
+++ b/README.rst
@@ -17,9 +17,18 @@ Status
 
 The most commonly used simple stuff should work.
 
+Expressions in assembly source code are supported and get evaluated during
+assembling. Only expressions evaluating to a single integer are supported.
+Constants defined with ``.set`` are supported in expressions.
+
 We have some unit tests and also compatibility tests that compare the output
 whether it is identical with binutils-esp32ulp output.
 
+There is a simple preprocessor that understands just enough to allow assembling
+ULP source files containing convenience macros such as WRITE_RTC_REG. The
+preprocessor and how to use it are documented here:
+`Preprocessor support <docs/preprocess.rst>`_.
+
 There might be some stuff missing, some bugs and other symptoms of alpha
 software. Also, error and exception handling is rather rough yet.
 
diff --git a/docs/preprocess.rst b/docs/preprocess.rst
new file mode 100644
index 0000000..0716e69
--- /dev/null
+++ b/docs/preprocess.rst
@@ -0,0 +1,138 @@
+Preprocessor
+---------------------
+
+py-esp32-ulp contains a small preprocessor, which aims to fulfill one goal:
+facilitate assembling of ULP code from Espressif and other open-source
+projects to loadable/executable machine code without modification.
+
+Such code uses convenience macros (``READ_RTC_*`` and ``WRITE_RTC_*``)
+provided by the ESP-IDF framework, along with constants defined in the
+framework's include files (such as ``RTC_GPIO_IN_REG``), to make reading
+and writing from/to peripheral registers much easier.
+
+In order to do this the preprocessor has two capabilities:
+
+1. Parse and replace identifiers defined with ``#define``
+2. Recognise the ``WRITE_RTC_*`` and ``READ_RTC_*`` macros and expand
+   them in a way that mirrors what the real ESP-IDF macros do.
+
+
+Usage
+------------------------
+
+Normally the assembler is called as follows
+
+.. code-block:: python
+
+    src = "..full assembler file contents"
+    assembler = Assembler()
+    assembler.assemble(src)
+    ...
+
+With the preprocessor, simply pass the source code via the preprocessor first:
+
+.. code-block:: python
+
+    from esp32_ulp.preprocess import preprocess
+
+    src = "..full assembler file contents"
+    src = preprocess(src)
+    assembler = Assembler()
+    assembler.assemble(src)
+    ...
+
+
+Using a "Defines Database"
+--------------------------
+
+Because the py-esp32-ulp assembler was built for running on the ESP32
+microcontroller with limited RAM, the preprocessor aims to work there too.
+
+To handle a large number of defined constants (such as the ``RTC_*`` constants from
+the ESP-IDF) the preprocessor can use a database (based on BerkeleyDB) stored on the
+device's filesystem for looking up defines.
+
+The database needs to be populated before preprocessing. (Usually, when only using
+constants from the ESP-IDF, this is a one-time step, because the include files
+don't change.) The database can be reused for all subsequent preprocessor runs.
+
+(The database can also be generated on a PC and then deployed to the ESP32, to
+save processing effort on the device. In that case the include files themselves
+are not needed on the device either.)
+
+1. Build the defines database
+
+   The ``esp32_ulp.parse_to_db`` tool can be used to generate the defines
+   database from include files. The resulting file will be called
+   ``defines.db``.
+
+   (The following assumes running on a PC. To do this on device, refer to the
+   `esp32_ulp/parse_to_db.py <../esp32_ulp/parse_to_db.py>`_ file.)
+
+   .. code-block:: bash
+
+      # general command
+      micropython -m esp32_ulp.parse_to_db path/to/include.h
+
+      # loading specific ESP-IDF include files
+      micropython -m esp32_ulp.parse_to_db esp-idf/components/soc/esp32/include/soc/soc_ulp.h
+
+      # loading multiple files at once
+      micropython -m esp32_ulp.parse_to_db esp-idf/components/soc/esp32/include/soc/*.h
+
+      # if file system space is not a concern, the following can be convenient
+      # by including all relevant include files from the ESP-IDF framework.
+      # This results in an approximately 2MB large database.
+      micropython -m esp32_ulp.parse_to_db \
+        esp-idf/components/soc/esp32/include/soc/*.h \
+        esp-idf/components/esp_common/include/*.h
+
+      # most ULP code uses only 5 include files. Parsing only those into the
+      # database should thus allow assembling virtually all ULP code one would
+      # find or want to write.
+      # This results in an approximately 250kB large database.
+      micropython -m esp32_ulp.parse_to_db \
+        esp-idf/components/soc/esp32/include/soc/{soc,soc_ulp,rtc_cntl_reg,rtc_io_reg,sens_reg}.h
+
+2. Using the defines database during preprocessing
+
+   The preprocessor will automatically use a defines database, when using the
+   ``preprocess.preprocess`` convenience function, even when the database does
+   not exist (an absent database is treated like an empty database, and care
+   is taken not to create an empty database file, cluttering up the filesystem,
+   when not needed).
+
+   If you do not want the preprocessor to use a DefinesDB, pass ``False`` to
+   the ``use_defines_db`` argument of the ``preprocess`` convenience function,
+   or instantiate the ``Preprocessor`` class directly, without passing it a
+   DefinesDB instance via ``use_db``.
+
+Design choices
+--------------
+
+The preprocessor does not support:
+
+1. Function style macros such as :code:`#define f(a,b) (a+b)`
+
+   This is not important, because there are only few RTC macros that need
+   to be supported and they are simply implemented as Python functions.
+
+   Since the preprocessor will understand ``#define`` directives directly in the
+   assembler source file, include mechanisms are not needed in some cases
+   (simply copying the needed ``#define`` statements from include files into the
+   assembler source will work).
+
+2. ``#include`` directives
+
+   The preprocessor does not currently follow ``#include`` directives. To
+   limit space requirements (both in memory and on the filesystem), the
+   preprocessor relies on a database of defines (key/value pairs). This
+   database should be populated before using the preprocessor, by using the
+   ``esp32_ulp.parse_to_db`` tool (see section above), which parses include
+   files for identifiers defined therein.
+
+3. Preserving comments
+
+   The assumption is that the output will almost always go into the
+   assembler directly, so preserving comments is not very useful and
+   would add a lot of complexity.
diff --git a/esp32_ulp/__main__.py b/esp32_ulp/__main__.py
index 584a3dd..209656f 100644
--- a/esp32_ulp/__main__.py
+++ b/esp32_ulp/__main__.py
@@ -2,6 +2,7 @@
 
 from .util import garbage_collect
 
+from .preprocess import preprocess
 from .assemble import Assembler
 from .link import make_binary
 garbage_collect('after import')
@@ -9,7 +10,8 @@
 
 def src_to_binary(src):
     assembler = Assembler()
-    assembler.assemble(src)
+    src = preprocess(src)
+    assembler.assemble(src, remove_comments=False)  # comments already removed by preprocessor
     garbage_collect('before symbols export')
     addrs_syms = assembler.symbols.export()
     for addr, sym in addrs_syms:
diff --git a/esp32_ulp/assemble.py b/esp32_ulp/assemble.py
index d0b1ff2..e348363 100644
--- a/esp32_ulp/assemble.py
+++ b/esp32_ulp/assemble.py
@@ -3,7 +3,7 @@
 """
 
 from . import opcodes
-from .nocomment import remove_comments
+from .nocomment import remove_comments as do_remove_comments
 from .util import garbage_collect
 
 TEXT, DATA, BSS = 'text', 'data', 'bss'
@@ -12,13 +12,10 @@
 
 
 class SymbolTable:
-    def __init__(self, symbols, bases):
+    def __init__(self, symbols, bases, globals):
         self._symbols = symbols
         self._bases = bases
-        self._pass = None
-
-    def set_pass(self, _pass):
-        self._pass = _pass
+        self._globals = globals
 
     def set_bases(self, bases):
         self._bases = bases
@@ -32,38 +29,28 @@ def get_from(self):
     def set_sym(self, symbol, stype, section, value):
         entry = (stype, section, value)
         if symbol in self._symbols and entry != self._symbols[symbol]:
-            raise Exception('redefining symbol %s with different value %r -> %r.' % (label, self._symbols[symbol], entry))
+            raise Exception('redefining symbol %s with different value %r -> %r.' % (symbol, self._symbols[symbol], entry))
         self._symbols[symbol] = entry
 
     def has_sym(self, symbol):
         return symbol in self._symbols
         
     def get_sym(self, symbol):
-        try:
-            entry = self._symbols[symbol]
-        except KeyError:
-            if self._pass == 1:
-                entry = (REL, TEXT, 0)  # for a dummy, this is good enough
-            else:
-                raise
+        entry = self._symbols[symbol]
         return entry
 
     def dump(self):
         for symbol, entry in self._symbols.items():
             print(symbol, entry)
 
-    def export(self):
-        addrs_syms = [(self.resolve_absolute(entry), symbol) for symbol, entry in self._symbols.items()]
+    def export(self, incl_non_globals=False):
+        addrs_syms = [(self.resolve_absolute(entry), symbol)
+                      for symbol, entry in self._symbols.items()
+                      if incl_non_globals or symbol in self._globals]
         return sorted(addrs_syms)
 
     def to_abs_addr(self, section, offset):
-        try:
-            base = self._bases[section]
-        except KeyError:
-            if self._pass == 1:
-                base = 0  # for a dummy this is good enough
-            else:
-                raise
+        base = self._bases[section]
         return base + offset
 
     def resolve_absolute(self, symbol):
@@ -93,16 +80,19 @@ def resolve_relative(self, symbol):
         from_addr = self.to_abs_addr(self._from_section, self._from_offset)
         return sym_addr - from_addr
 
+    def set_global(self, symbol):
+        self._globals[symbol] = True
+        pass
+
 
 class Assembler:
 
-    def __init__(self, symbols=None, bases=None):
-        self.symbols = SymbolTable(symbols or {}, bases or {})
+    def __init__(self, symbols=None, bases=None, globals=None):
+        self.symbols = SymbolTable(symbols or {}, bases or {}, globals or {})
         opcodes.symbols = self.symbols  # XXX dirty hack
 
     def init(self, a_pass):
         self.a_pass = a_pass
-        self.symbols.set_pass(a_pass)
         self.sections = dict(text=[], data=[])
         self.offsets = dict(text=0, data=0, bss=0)
         self.section = TEXT
@@ -118,7 +108,7 @@ def parse_line(self, line):
         """
         if not line:
             return
-        has_label = line[0] not in '\t '
+        has_label = line[0] not in '\t .'
         if has_label:
             label_line = line.split(None, 1)
             if len(label_line) == 2:
@@ -150,8 +140,10 @@ def append_section(self, value, expected_section=None):
         if expected_section is not None and s is not expected_section:
             raise TypeError('only allowed in %s section' % expected_section)
         if s is BSS:
-            # just increase BSS size by value
-            self.offsets[s] += value
+            if int.from_bytes(value, 'little') != 0:
+                raise ValueError('attempt to store non-zero value in section .bss')
+            # just increase BSS size by length of value
+            self.offsets[s] += len(value)
         else:
             self.sections[s].append(value)
             self.offsets[s] += len(value)
@@ -231,9 +223,12 @@ def d_align(self, align=4, fill=None):
             self.fill(self.section, amount, fill)
 
     def d_set(self, symbol, expr):
-        value = int(expr)  # TODO: support more than just integers
+        value = int(opcodes.eval_arg(expr))
         self.symbols.set_sym(symbol, ABS, None, value)
 
+    def d_global(self, symbol):
+        self.symbols.set_global(symbol)
+
     def append_data(self, wordlen, args):
         data = [int(arg).to_bytes(wordlen, 'little') for arg in args]
         self.append_section(b''.join(data))
@@ -245,6 +240,11 @@ def d_word(self, *args):
         self.append_data(2, args)
 
     def d_long(self, *args):
+        self.d_int(*args)
+
+    def d_int(self, *args):
+        # .long and .int are identical as per GNU assembler documentation
+        # https://sourceware.org/binutils/docs/as/Long.html
         self.append_data(4, args)
 
     def assembler_pass(self, lines):
@@ -263,16 +263,22 @@ def assembler_pass(self, lines):
                         continue
                 else:
                     # machine instruction
-                    func = getattr(opcodes, 'i_' + opcode, None)
+                    func = getattr(opcodes, 'i_' + opcode.lower(), None)
                     if func is not None:
-                        instruction = func(*args)
+                        # during the first pass, symbols are not all known yet.
+                        # so some expressions may not evaluate to something (yet).
+                        # instruction building requires sane arguments however.
+                        # since all instructions are 4 bytes long, we simply skip
+                        # building instructions during pass 1, and append an "empty
+                        # instruction" to the section to get the right section size.
+                        instruction = 0 if self.a_pass == 1 else func(*args)
                         self.append_section(instruction.to_bytes(4, 'little'), TEXT)
                         continue
-                raise Exception('Unknown opcode or directive: %s' % opcode)
+                raise ValueError('Unknown opcode or directive: %s' % opcode)
         self.finalize_sections()
 
-    def assemble(self, text):
-        lines = remove_comments(text)
+    def assemble(self, text, remove_comments=True):
+        lines = do_remove_comments(text) if remove_comments else text.splitlines()
         self.init(1)  # pass 1 is only to get the symbol table right
         self.assembler_pass(lines)
         self.symbols.set_bases(self.compute_bases())
diff --git a/esp32_ulp/definesdb.py b/esp32_ulp/definesdb.py
new file mode 100644
index 0000000..4a05459
--- /dev/null
+++ b/esp32_ulp/definesdb.py
@@ -0,0 +1,78 @@
+import os
+import btree
+from .util import file_exists
+
+DBNAME = 'defines.db'
+
+
+class DefinesDB:
+    def __init__(self):
+        self._file = None
+        self._db = None
+        self._db_exists = None
+
+    def clear(self):
+        self.close()
+        try:
+            os.remove(DBNAME)
+            self._db_exists = False
+        except OSError:
+            pass
+
+    def is_open(self):
+        return self._db is not None
+
+    def open(self):
+        if self.is_open():
+            return
+        try:
+            self._file = open(DBNAME, 'r+b')
+        except OSError:
+            self._file = open(DBNAME, 'w+b')
+        self._db = btree.open(self._file)
+        self._db_exists = True
+
+    def close(self):
+        if not self.is_open():
+            return
+        self._db.close()
+        self._db = None
+        self._file.close()
+        self._file = None
+
+    def db_exists(self):
+        if self._db_exists is None:
+            self._db_exists = file_exists(DBNAME)
+        return self._db_exists
+
+    def update(self, dictionary):
+        for k, v in dictionary.items():
+            self.__setitem__(k, v)
+
+    def get(self, key, default):
+        try:
+            result = self.__getitem__(key)
+        except KeyError:
+            result = default
+        return result
+
+    def keys(self):
+        if not self.db_exists():
+            return []
+
+        self.open()
+        return [k.decode() for k in self._db.keys()]
+
+    def __getitem__(self, key):
+        if not self.db_exists():
+            raise KeyError
+
+        self.open()
+        return self._db[key.encode()].decode()
+
+    def __setitem__(self, key, value):
+        self.open()
+        self._db[key.encode()] = str(value).encode()
+
+    def __iter__(self):
+        return iter(self.keys())
diff --git a/esp32_ulp/opcodes.py b/esp32_ulp/opcodes.py
index 4e2ca04..103b1f7 100644
--- a/esp32_ulp/opcodes.py
+++ b/esp32_ulp/opcodes.py
@@ -6,6 +6,7 @@
 from uctypes import struct, addressof, LITTLE_ENDIAN, UINT32, BFUINT32, BF_POS, BF_LEN
 
 from .soc import *
+from .util import split_tokens, validate_expression
 
 # XXX dirty hack: use a global for the symbol table
 symbols = None
@@ -15,6 +16,7 @@
 OPCODE_WR_REG = 1
 OPCODE_RD_REG = 2
 
+DR_REG_MAX_DIRECT = 0x3ff
 RD_REG_PERIPH_RTC_CNTL = 0
 RD_REG_PERIPH_RTC_IO = 1
 RD_REG_PERIPH_SENS = 2
@@ -112,7 +114,7 @@ def make_ins(layout):
     unused : 8      # Unused
     low : 5         # Low bit
     high : 5        # High bit
-    opcode : 4      # Opcode (OPCODE_WR_REG)
+    opcode : 4      # Opcode (OPCODE_RD_REG)
 """)
 
 
@@ -267,6 +269,20 @@ def make_ins(layout):
 ARG = namedtuple('ARG', ('type', 'value', 'raw'))
 
 
+def eval_arg(arg):
+    parts = []
+    for token in split_tokens(arg):
+        if symbols.has_sym(token):
+            _, _, sym_value = symbols.get_sym(token)
+            parts.append(str(sym_value))
+        else:
+            parts.append(token)
+    parts = "".join(parts)
+    if not validate_expression(parts):
+        raise ValueError('Unsupported expression: %s' % parts)
+    return eval(parts)
+
+
 def arg_qualify(arg):
     """
     look at arg and qualify its type:
@@ -289,8 +305,12 @@ def arg_qualify(arg):
         return ARG(IMM, int(arg), arg)
     except ValueError:
         pass
-    entry = symbols.get_sym(arg)
-    return ARG(SYM, entry, arg)
+    try:
+        entry = symbols.get_sym(arg)
+    except KeyError:
+        return ARG(IMM, int(eval_arg(arg)), arg)
+    else:
+        return ARG(SYM, entry, arg)
 
 
 def get_reg(arg):
@@ -334,8 +354,9 @@ def get_cond(arg):
 
 def _soc_reg_to_ulp_periph_sel(reg):
     # Map SoC peripheral register to periph_sel field of RD_REG and WR_REG instructions.
-    ret = 3
-    if reg < DR_REG_RTCCNTL_BASE:
+    if reg < DR_REG_MAX_DIRECT:
+        ret = RD_REG_PERIPH_RTC_CNTL
+    elif reg < DR_REG_RTCCNTL_BASE:
         raise ValueError("invalid register base")
     elif reg < DR_REG_RTCIO_BASE:
         ret = RD_REG_PERIPH_RTC_CNTL
@@ -352,7 +373,10 @@ def _soc_reg_to_ulp_periph_sel(reg):
 
 def i_reg_wr(reg, high_bit, low_bit, val):
     reg = get_imm(reg)
-    _wr_reg.addr = (reg & 0xff) >> 2
+    if reg < DR_REG_MAX_DIRECT:  # see https://github.com/espressif/binutils-esp32ulp/blob/master/gas/config/tc-esp32ulp_esp32.c
+        _wr_reg.addr = reg
+    else:
+        _wr_reg.addr = (reg & 0xff) >> 2
     _wr_reg.periph_sel = _soc_reg_to_ulp_periph_sel(reg)
     _wr_reg.data = get_imm(val)
     _wr_reg.low = get_imm(low_bit)
@@ -363,7 +387,10 @@ def i_reg_wr(reg, high_bit, low_bit, val):
 
 def i_reg_rd(reg, high_bit, low_bit):
     reg = get_imm(reg)
-    _rd_reg.addr = (reg & 0xff) >> 2
+    if reg < DR_REG_MAX_DIRECT: # see https://github.com/espressif/binutils-esp32ulp/blob/master/gas/config/tc-esp32ulp_esp32.c
+        _rd_reg.addr = reg
+    else:
+        _rd_reg.addr = (reg & 0xff) >> 2
     _rd_reg.periph_sel = _soc_reg_to_ulp_periph_sel(reg)
     _rd_reg.unused = 0
     _rd_reg.low = get_imm(low_bit)
@@ -463,7 +490,7 @@ def i_move(reg_dest, reg_imm_src):
     if src.type == REG:
         _alu_reg.dreg = dest
         _alu_reg.sreg = src.value
-        _alu_reg.treg = 1  # XXX undocumented, this is the value binutils-esp32 uses
+        _alu_reg.treg = src.value  # XXX undocumented, this is the value binutils-esp32 uses
         _alu_reg.unused = 0
         _alu_reg.sel = ALU_SEL_MOV
         _alu_reg.sub_opcode = SUB_OPCODE_ALU_REG
diff --git a/esp32_ulp/parse_to_db.py b/esp32_ulp/parse_to_db.py
new file mode 100644
index 0000000..ac61f98
--- /dev/null
+++ b/esp32_ulp/parse_to_db.py
@@ -0,0 +1,23 @@
+import sys
+
+from .preprocess import Preprocessor
+from .definesdb import DefinesDB
+
+
+def parse(files):
+    db = DefinesDB()
+
+    p = Preprocessor()
+    p.use_db(db)
+
+    for f in files:
+        print('Processing file:', f)
+
+        p.process_include_file(f)
+
+    print('Done.')
+
+
+if __name__ == '__main__':
+    parse(sys.argv[1:])
+
diff --git a/esp32_ulp/preprocess.py b/esp32_ulp/preprocess.py
new file mode 100644
index 0000000..03a9317
--- /dev/null
+++ b/esp32_ulp/preprocess.py
@@ -0,0 +1,156 @@
+from . import nocomment
+from .util import split_tokens
+from .definesdb import DefinesDB
+
+
+class RTC_Macros:
+    @staticmethod
+    def READ_RTC_REG(rtc_reg, low_bit, bit_width):
+        return '\treg_rd ' + ', '.join((
+            rtc_reg,
+            '%s + %s - 1' % (low_bit, bit_width),
+            low_bit
+        ))
+
+    @staticmethod
+    def WRITE_RTC_REG(rtc_reg, low_bit, bit_width, value):
+        return '\treg_wr ' + ', '.join((
+            rtc_reg,
+            '%s + %s - 1' % (low_bit, bit_width),
+            low_bit,
+            value
+        ))
+
+    @staticmethod
+    def READ_RTC_FIELD(rtc_reg, low_bit):
+        return RTC_Macros.READ_RTC_REG(rtc_reg, low_bit, 1)
+
+    @staticmethod
+    def WRITE_RTC_FIELD(rtc_reg, low_bit, value):
+        return RTC_Macros.WRITE_RTC_REG(rtc_reg, low_bit, 1, value + ' & 1')
+
+
+class Preprocessor:
+    def __init__(self):
+        self._defines_db = None
+        self._defines = {}
+
+    def parse_define_line(self, line):
+        line = line.strip()
+        if not line.startswith("#define"):
+            # skip lines not containing #define
+            return {}
+        line = line[8:].strip()  # remove #define
+        parts = line.split(None, 1)
+        if len(parts) != 2:
+            # skip defines without value
+            return {}
+        identifier, value = parts
+        tmp = identifier.split('(', 1)
+        if len(tmp) == 2:
+            # skip parameterised defines (macros)
+            return {}
+        value = "".join(nocomment.remove_comments(value)).strip()
+        return {identifier: value}
+
+    def parse_defines(self, content):
+        for line in content.splitlines():
+            self._defines.update(self.parse_define_line(line))
+
+        return self._defines
+
+    def expand_defines(self, line):
+        found = True
+        while found:  # do as many passed as needed, until nothing was replaced anymore
+            found = False
+            tokens = split_tokens(line)
+            line = ""
+            for t in tokens:
+                lu = self._defines.get(t, t)
+                if lu == t and self._defines_db:
+                    lu = self._defines_db.get(t, t)
+                if lu == t and t == 'BIT':
+                    # Special hack: BIT(..) translates to a 32-bit mask where only the specified bit is set.
+                    # But the reg_wr and reg_rd opcodes expect actual bit numbers for argument 2 and 3.
+                    # While the real READ_RTC_*/WRITE_RTC_* macros take in the output of BIT(x), they
+                    # ultimately convert these back (via helper macros) to the bit number (x). And since this
+                    # preprocessor does not (aim to) implement "proper" macro-processing, we can simply
+                    # short-circuit this round-trip via macros and replace "BIT" with nothing so that
+                    # "BIT(x)" gets mapped to "(x)".
+                    continue
+                if lu != t:
+                    found = True
+                line += lu
+
+        return line
+
+    def process_include_file(self, filename):
+        with self.open_db() as db:
+            with open(filename, 'r') as f:
+                for line in f:
+                    result = self.parse_define_line(line)
+                    db.update(result)
+
+        return db
+
+    def expand_rtc_macros(self, line):
+        clean_line = line.strip()
+        if not clean_line:
+            return line
+
+        macro = clean_line.split('(', 1)
+        if len(macro) != 2:
+            return line
+
+        macro_name, macro_args = macro
+
+        macro_fn = getattr(RTC_Macros, macro_name, None)
+        if macro_fn is None:
+            return line
+
+        macro_args, _ = macro_args.rsplit(')', 1)  # trim away right bracket. safe as comments already stripped
+        macro_args = macro_args.split(',')  # not safe when args contain ',' but we should not have those
+        macro_args = [x.strip() for x in macro_args]
+
+        return macro_fn(*macro_args)
+
+    def use_db(self, defines_db):
+        self._defines_db = defines_db
+
+    def open_db(self):
+        class ctx:
+            def __init__(self, db):
+                self._db = db
+
+            def __enter__(self):
+                # not opening DefinesDB - it opens itself when needed
+                return self._db
+
+            def __exit__(self, type, value, traceback):
+                if isinstance(self._db, DefinesDB):
+                    self._db.close()
+
+        if self._defines_db:
+            return ctx(self._defines_db)
+
+        return ctx(self._defines)
+
+    def preprocess(self, content):
+        self.parse_defines(content)
+
+        with self.open_db():
+            lines = nocomment.remove_comments(content)
+            result = []
+            for line in lines:
+                line = self.expand_defines(line)
+                line = self.expand_rtc_macros(line)
+                result.append(line)
+            result = "\n".join(result)
+
+        return result
+
+
+def preprocess(content, use_defines_db=True):
+    preprocessor = Preprocessor()
+    preprocessor.use_db(DefinesDB() if use_defines_db else None)  # honour use_defines_db: None disables DB lookups
+    return preprocessor.preprocess(content)
diff --git a/esp32_ulp/util.py b/esp32_ulp/util.py
index c184414..d79c538 100644
--- a/esp32_ulp/util.py
+++ b/esp32_ulp/util.py
@@ -1,6 +1,9 @@
 DEBUG = False
 
 import gc
+import os
+
+NORMAL, WHITESPACE = 0, 1
 
 
 def garbage_collect(msg, verbose=DEBUG):
@@ -9,3 +12,68 @@ def garbage_collect(msg, verbose=DEBUG):
     free_after = gc.mem_free()
     if verbose:
         print("%s: %d --gc--> %d bytes free" % (msg, free_before, free_after))
+
+
+def split_tokens(line):
+    buf = ""
+    tokens = []
+    state = NORMAL
+    for c in line:
+        if c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_":
+            if state != NORMAL:
+                if len(buf) > 0:
+                    tokens.append(buf)
+                    buf = ""
+                state = NORMAL
+            buf += c
+        elif c in " \t":
+            if state != WHITESPACE:
+                if len(buf) > 0:
+                    tokens.append(buf)
+                    buf = ""
+                state = WHITESPACE
+            buf += c
+        else:
+            if len(buf) > 0:
+                tokens.append(buf)
+                buf = ""
+            tokens.append(c)
+
+    if len(buf) > 0:
+        tokens.append(buf)
+
+    return tokens
+
+
+def validate_expression(param):
+    for token in split_tokens(param):
+        state = 0
+        for c in token:
+            if c not in ' \t+-*/%()<>&|~x0123456789abcdef':
+                return False
+
+            # the following allows hex digits a-f after 0x but not otherwise
+            if state == 0:
+                if c in 'abcdef':
+                    return False
+                if c == '0':
+                    state = 1
+                continue
+
+            if state == 1:
+                state = 2 if c == 'x' else 0
+                continue
+
+            if state == 2:
+                if c not in '0123456789abcdef':
+                    state = 0
+    return True
+
+
+def file_exists(filename):
+    try:
+        os.stat(filename)
+        return True
+    except OSError:
+        pass
+    return False
diff --git a/tests/00_unit_tests.sh b/tests/00_unit_tests.sh
index 07d221f..ee1a239 100755
--- a/tests/00_unit_tests.sh
+++ b/tests/00_unit_tests.sh
@@ -4,7 +4,7 @@
 
 set -e
 
-for file in opcodes assemble link ; do
+for file in opcodes assemble link util preprocess definesdb; do
     echo testing $file...
     micropython $file.py
 done
diff --git a/tests/01_compat_tests.sh b/tests/01_compat_tests.sh
index c565aa1..68f8bdc 100755
--- a/tests/01_compat_tests.sh
+++ b/tests/01_compat_tests.sh
@@ -13,12 +13,14 @@ for src_file in $(ls -1 compat/*.S); do
     log_file="${src_name}.log"
     micropython -m esp32_ulp $src_file 1>$log_file   # generates $ulp_file
 
+    pre_file="${src_name}.pre"
     obj_file="${src_name}.o"
     elf_file="${src_name}.elf"
     bin_file="${src_name}.bin"
 
     echo -e "\tBuilding using binutils"
-    esp32ulp-elf-as -o $obj_file $src_file
+    gcc -E -o ${pre_file} $src_file
+    esp32ulp-elf-as -o $obj_file ${pre_file}
     esp32ulp-elf-ld -T esp32.ulp.ld -o $elf_file $obj_file
     esp32ulp-elf-objcopy -O binary $elf_file $bin_file
 
diff --git a/tests/02_compat_rtc_tests.sh b/tests/02_compat_rtc_tests.sh
new file mode 100755
index 0000000..2904ee6
--- /dev/null
+++ b/tests/02_compat_rtc_tests.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+# export PYTHONPATH=.:$PYTHONPATH
+
+set -e
+
+make_log_dir() {
+   mkdir -p log
+}
+
+fetch_esp_idf() {
+    [ -d esp-idf ] && return
+
+    echo "Fetching esp-idf"
+    log_file=log/fetch-esp-idf.log
+    git clone --depth 1 \
+        https://github.com/espressif/esp-idf.git 1>$log_file 2>&1
+}
+
+fetch_ulptool_examples() {
+    [ -d ulptool ] && return
+
+    echo "Fetching ulptool examples"
+    log_file=log/fetch-ulptool.log
+    git clone --depth 1 \
+        https://github.com/duff2013/ulptool 1>$log_file 2>&1
+}
+
+fetch_binutils_esp32ulp_examples() {
+    [ -d binutils-esp32ulp ] && return
+
+    echo "Fetching binutils-esp32ulp examples"
+    log_file=log/fetch-binutils.log
+    git clone --depth 1 \
+        https://github.com/espressif/binutils-esp32ulp.git 1>$log_file 2>&1
+}
+
+build_defines_db() {
+    local defines_db=defines.db
+
+    if [ "$1" = "-r" ] && [ -s "${defines_db}" ]; then
+        # reuse existing defines.db
+        return
+    fi
+
+    echo "Building defines DB from include files"
+    log_file=log/build_defines_db.log
+    rm -f "${defines_db}"
+    micropython -m esp32_ulp.parse_to_db \
+        esp-idf/components/soc/esp32/include/soc/*.h \
+        esp-idf/components/esp_common/include/*.h 1>$log_file
+}
+
+make_log_dir
+fetch_esp_idf
+fetch_ulptool_examples
+fetch_binutils_esp32ulp_examples
+build_defines_db $1
+
+for src_file in ulptool/src/ulp_examples/*/*.s binutils-esp32ulp/gas/testsuite/gas/esp32ulp/esp32/*.s; do
+
+    src_name="${src_file%.s}"
+
+    echo "Testing $src_file"
+
+    test_name="${src_name##*/}"
+
+    # for now, skip files that contain known bugs in esp32_ulp (essentially a todo list of what to fix)
+    for I in esp32ulp_all esp32ulp_globals esp32ulp_jumpr esp32ulp_ranges test_reg; do
+        if [ "${test_name}" = "$I" ]; then
+            # these are old bugs, and not related to the RTC macro handling functionality
+            # they will still be great to fix over time
+            echo -e "\tSkipping... known bugs in esp32_ulp"
+            continue 2
+        fi
+    done
+
+    # for now, skip files that contain unsupported things (macros)
+    for I in i2c i2c_dev stack i2c_wr test1 test_jumpr test_macro; do
+        if [ "${test_name}" = "$I" ]; then
+            echo -e "\tSkipping... not yet supported"
+            continue 2
+        fi
+    done
+
+    echo -e "\tBuilding using py-esp32-ulp"
+    ulp_file="${src_name}.ulp"
+    log_file="${src_name}.log"
+    micropython -m esp32_ulp $src_file 1>$log_file   # generates $ulp_file
+
+    pre_file="${src_name}.pre"
+    obj_file="${src_name}.o"
+    elf_file="${src_name}.elf"
+    bin_file="${src_name}.bin"
+
+    echo -e "\tBuilding using binutils"
+    gcc -I esp-idf/components/soc/esp32/include -I esp-idf/components/esp_common/include \
+        -x assembler-with-cpp \
+        -E -o ${pre_file} $src_file
+    esp32ulp-elf-as -o $obj_file ${pre_file}
+    esp32ulp-elf-ld -T esp32.ulp.ld -o $elf_file $obj_file
+    esp32ulp-elf-objcopy -O binary $elf_file $bin_file
+
+    if ! diff $ulp_file $bin_file 1>/dev/null; then
+        echo -e "\tBuild outputs differ!"
+        echo ""
+        echo "Compatibility test failed for $src_file"
+        echo "py-esp32-ulp log:"
+        cat $log_file
+        echo "py-esp32-ulp output:"
+        xxd $ulp_file
+        echo "binutils output:"
+        xxd $bin_file
+        exit 1
+    else
+        echo -e "\tBuild outputs match"
+    fi
+done
diff --git a/tests/assemble.py b/tests/assemble.py
index 3875ee0..e607ba2 100644
--- a/tests/assemble.py
+++ b/tests/assemble.py
@@ -3,25 +3,54 @@
 from esp32_ulp.nocomment import remove_comments
 
 src = """\
+        .set const, 123
+.set const_left, 976
 
 start:  wait 42
         ld r0, r1, 0
         st  r0,  r1,0
         halt
 end:
+.data
+"""
+
+src_bss = """\
+  .bss
+
+label:
+  .long 0
+"""
+
+
+src_global = """\
+
+  .global counter
+counter:
+  .long 0
+
+internal:
+  .long 0
+
+  .text
+  .global entry
+entry:
+  wait 42
+  halt
 """
 
 
 def test_parse_line():
     a = Assembler()
-    lines = src.splitlines()
-    # note: line number = index + 1
-    assert a.parse_line(lines[0]) == None
-    assert a.parse_line(lines[1]) == ('start', 'wait', ('42', ))
-    assert a.parse_line(lines[2]) == (None, 'ld', ('r0', 'r1', '0', ))
-    assert a.parse_line(lines[3]) == (None, 'st', ('r0', 'r1', '0', ))
-    assert a.parse_line(lines[4]) == (None, 'halt', ())
-    assert a.parse_line(lines[5]) == ('end', None, ())
+    lines = iter(src.splitlines())
+    assert a.parse_line(next(lines)) == (None, '.set', ('const', '123', ))
+    assert a.parse_line(next(lines)) == (None, '.set', ('const_left', '976', ))
+    assert a.parse_line(next(lines)) == None
+    assert a.parse_line(next(lines)) == ('start', 'wait', ('42', ))
+    assert a.parse_line(next(lines)) == (None, 'ld', ('r0', 'r1', '0', ))
+    assert a.parse_line(next(lines)) == (None, 'st', ('r0', 'r1', '0', ))
+    assert a.parse_line(next(lines)) == (None, 'halt', ())
+    assert a.parse_line(next(lines)) == ('end', None, ())
+    assert a.parse_line(next(lines)) == (None, '.data', ())  # test left-aligned directive is not treated as label
 
 
 def test_parse():
@@ -34,8 +63,12 @@ def test_parse():
 def test_assemble():
     a = Assembler()
     a.assemble(src)
+    assert a.symbols.has_sym('const')
+    assert a.symbols.has_sym('const_left')
     assert a.symbols.has_sym('start')
     assert a.symbols.has_sym('end')
+    assert a.symbols.get_sym('const') == (ABS, None, 123)
+    assert a.symbols.get_sym('const_left') == (ABS, None, 976)
     assert a.symbols.get_sym('start') == (REL, TEXT, 0)
     assert a.symbols.get_sym('end') == (REL, TEXT, 4)
     assert len(b''.join(a.sections[TEXT])) == 16  # 4 instructions * 4B
@@ -43,33 +76,151 @@ def test_assemble():
     assert a.offsets[BSS] == 0
 
 
+def test_assemble_bss():
+    a = Assembler()
+    try:
+        a.assemble(src_bss)
+    except TypeError:
+        raised = True
+    else:
+        raised = False
+    assert not raised
+    assert a.offsets[BSS] == 4  # 1 word * 4B
+
+
+def test_assemble_bss_with_value():
+    lines = """\
+.bss
+    .long 3  #non-zero value not allowed in bss section
+"""
+
+    a = Assembler()
+    try:
+        a.assemble(lines)
+    except ValueError as e:
+        if str(e) != "attempt to store non-zero value in section .bss":
+            raise  # re-raise failures we didn't expect
+        raised = True
+    else:
+        raised = False
+
+    assert raised
+
+
+def test_assemble_global():
+    a = Assembler()
+    a.assemble(src_global)
+    assert a.symbols.has_sym('counter')
+    assert a.symbols.has_sym('internal')
+    assert a.symbols.has_sym('entry')
+
+    exported_symbols = a.symbols.export()
+    assert exported_symbols == [(0, 'counter'), (2, 'entry')]  # internal not exported
+
+    exported_symbols = a.symbols.export(True)  # include non-global symbols
+    assert exported_symbols == [(0, 'counter'), (1, 'internal'), (2, 'entry')]
+
+
+def test_assemble_uppercase_opcode():
+    a = Assembler()
+    try:
+        a.assemble("  WAIT 42")
+    except ValueError as e:
+        if str(e) != "Unknown opcode or directive: WAIT":
+            # re-raise failures we didn't expect
+            raise
+        raised = True
+    else:
+        raised = False
+    assert not raised
+
+
+def test_assemble_evalulate_expressions():
+    src_w_expr = """\
+    .set shft, 2
+    .set loops, (1 << shft)
+
+entry:
+    move r0, 1+1
+    move r1, loops
+    move r2, (shft + 10) * 2
+    move r3, entry << 2
+"""
+    a = Assembler()
+    a.assemble(src_w_expr)
+
+    assert a.symbols.has_sym('shft')
+    assert a.symbols.has_sym('loops')
+    assert a.symbols.has_sym('entry')
+    assert a.symbols.get_sym('shft') == (ABS, None, 2)
+    assert a.symbols.get_sym('loops') == (ABS, None, 4)
+    assert a.symbols.get_sym('entry') == (REL, TEXT, 0)
+
+
+def test_assemble_optional_comment_removal():
+    line = " move r1, 123  # comment"
+
+    a = Assembler()
+
+    # first assemble as normal (comments will be removed by default)
+    a.assemble(line)
+
+    # now assemble with comment removal disabled
+    try:
+        a.assemble(line, remove_comments=False)
+    except ValueError as e:
+        raised = True
+    else:
+        raised = False
+    assert raised
+
+
+def test_assemble_test_regressions_from_evaluation():
+    line = " reg_wr (0x3ff48400 + 0x10), 1, 1, 1"
+
+    a = Assembler()
+    raised = False
+    try:
+        a.assemble(line)
+    except ValueError as e:
+        if str(e) == 'invalid register base':  # ensure we trapped the expected Exception
+            raised = True
+    assert not raised
+
+
 def test_symbols():
-    st = SymbolTable({}, {})
+    st = SymbolTable({}, {}, {})
     for entry in [
         ('rel_t4', REL, TEXT, 4),
         ('abs_t4', ABS, TEXT, 4),
         ('rel_d4', REL, DATA, 4),
         ('abs_d4', ABS, DATA, 4),
+        ('const', ABS, None, 123),
     ]:
         st.set_sym(*entry)
     # PASS 1 ========================================================
-    st.set_pass(1)
     assert st.has_sym('abs_t4')
     assert st.get_sym('abs_t4') == (ABS, TEXT, 4)
     assert not st.has_sym('notexist')
-    assert st.get_sym('notexist') == (REL, TEXT, 0)  # pass1 -> dummy
+    try:
+        st.get_sym('notexist')  # pass1 -> raises
+    except KeyError:
+        raised = True
+    else:
+        raised = False
+    assert raised
     assert st.resolve_absolute('abs_t4') == 4
-    assert st.resolve_absolute('abs_d4') == 4
-    assert st.resolve_absolute('rel_t4') == 4
-    assert st.resolve_absolute('rel_d4') == 4
-    st.set_from(TEXT, 8)
-    assert st.resolve_relative('abs_t4') == -4
-    assert st.resolve_relative('abs_d4') == -4
-    assert st.resolve_relative('rel_t4') == -4
-    assert st.resolve_relative('rel_d4') == -4
+    try:
+        # relative symbols cannot be resolved, because in pass 1 section bases are not yet defined
+        st.resolve_absolute('rel_t4')
+    except KeyError:
+        raised = True
+    else:
+        raised = False
+    assert raised
+    assert st.resolve_absolute('const') == 123
     # PASS 2 ========================================================
     st.set_bases({TEXT: 100, DATA: 200})
-    st.set_pass(2)
     assert st.has_sym('abs_t4')
     assert st.get_sym('abs_t4') == (ABS, TEXT, 4)
     assert not st.has_sym('notexist')
@@ -84,14 +235,23 @@ def test_symbols():
     assert st.resolve_absolute('abs_d4') == 4
     assert st.resolve_absolute('rel_t4') == 100 + 4
     assert st.resolve_absolute('rel_d4') == 200 + 4
+    assert st.resolve_absolute('const') == 123
     st.set_from(TEXT, 8)
     assert st.resolve_relative('abs_t4') == 4 - 108
     assert st.resolve_relative('abs_d4') == 4 - 108
     assert st.resolve_relative('rel_t4') == 104 - 108
     assert st.resolve_relative('rel_d4') == 204 - 108
+    assert st.resolve_absolute('const') == 123
 
 
 test_parse_line()
 test_parse()
 test_assemble()
+test_assemble_bss()
+test_assemble_bss_with_value()
+test_assemble_global()
+test_assemble_uppercase_opcode()
+test_assemble_evalulate_expressions()
+test_assemble_optional_comment_removal()
+test_assemble_test_regressions_from_evaluation()
 test_symbols()
diff --git a/tests/compat/expr.S b/tests/compat/expr.S
new file mode 100644
index 0000000..3650623
--- /dev/null
+++ b/tests/compat/expr.S
@@ -0,0 +1,48 @@
+# common example of real world code using expressions
+  .set adc_channel, 6
+
+  .set adc_oversampling_factor_log, 2
+  .set adc_oversampling_factor, (1 << adc_oversampling_factor_log)
+
+.data
+
+result:
+  .long 0
+
+  .text
+  .global entry
+entry:
+  move r0, 0
+  stage_rst
+
+measure:
+  adc r1, 0, adc_channel + 1
+  add r0, r0, r1
+
+  stage_inc 1
+  jumps measure, adc_oversampling_factor, lt
+
+  rsh r0, r0, adc_oversampling_factor_log
+
+  move r3, result
+  st r0, r3, 0
+
+exit:
+  halt
+
+
+# ---
+# test that expressions evaluate correctly for all supported operators
+# (these statements do not mean anything other than testing the operations)
+  move r3, 1+2
+  move r3, 3-5
+  move r3, -5
+  move r3, 2*3
+  move r3, 4/2
+  move r3, 4 % 3
+  move r3, 0xff << 2
+  move r3, 0xff >> 1
+  move r3, (0xabcdef | 0xff) & 0xff
+  move r3, 0x1234 & ~2
+  move r3, 42|4&0xf  # 46 (4&0xf is evaluated first)
+  move r3, (42|4)&0xf  # 14 (42|4 is evaluated first)
diff --git a/tests/compat/fixes.S b/tests/compat/fixes.S
new file mode 100644
index 0000000..9e4d0ef
--- /dev/null
+++ b/tests/compat/fixes.S
@@ -0,0 +1,28 @@
+# This file tests various fixes to the assembler,
+# to ensure the binary output matches that of binutils.
+# a) support for left-aligned directives (e.g. .set without preceding whitespace)
+# b) a crash-fix related to data items in the .bss section
+# c) support for marking labels as global
+# d) support for upper case ULP opcode names
+#
+.set gpio, 2
+
+.bss
+
+counter:
+.long 0
+
+.data
+var2: .int 1111
+
+  .text
+  .global entry
+entry:
+  MOVE R1, gpio
+  WAIT 42
+
+  # reg_rd/reg_wr with "short" and "long" address notation
+  reg_rd 12, 7, 0
+  reg_rd 0x3ff48000, 7, 0
+
+  halt
diff --git a/tests/compat/preprocess_simple.S b/tests/compat/preprocess_simple.S
new file mode 100644
index 0000000..b6a61e8
--- /dev/null
+++ b/tests/compat/preprocess_simple.S
@@ -0,0 +1,7 @@
+#define GPIO 2
+#define BASE 0x100
+#define ADDR (BASE + GPIO)
+
+entry:
+  move r0, GPIO
+  move r1, ADDR
diff --git a/tests/compat/symbols.S b/tests/compat/symbols.S
index bf59c3b..359fa15 100644
--- a/tests/compat/symbols.S
+++ b/tests/compat/symbols.S
@@ -1,10 +1,12 @@
             .text
 
             .set constant42, 42
+.set notindented, 1
 
 start:      move r0, data0
             move r1, data1
             move r2, constant42
+            move r3, notindented
 
             # count from 0 .. 42 in stage register
             stage_rst
diff --git a/tests/definesdb.py b/tests/definesdb.py
new file mode 100644
index 0000000..5e2100c
--- /dev/null
+++ b/tests/definesdb.py
@@ -0,0 +1,60 @@
+import os
+
+from esp32_ulp.definesdb import DefinesDB, DBNAME
+from esp32_ulp.util import file_exists
+
+tests = []
+
+
+def test(param):
+    tests.append(param)
+
+
+@test
+def test_definesdb_clear_removes_all_keys():
+    db = DefinesDB()
+    db.open()
+    db.update({'KEY1': 'VALUE1'})
+
+    db.clear()
+
+    assert 'KEY1' not in db
+
+    db.close()
+
+
+@test
+def test_definesdb_persists_data_across_instantiations():
+    db = DefinesDB()
+    db.open()
+    db.clear()
+
+    db.update({'KEY1': 'VALUE1'})
+
+    assert 'KEY1' in db
+
+    db.close()
+    del db
+    db = DefinesDB()
+    db.open()
+
+    assert db.get('KEY1', None) == 'VALUE1'
+
+    db.close()
+
+
+@test
+def test_definesdb_should_not_create_a_db_file_when_only_reading():
+    db = DefinesDB()
+
+    db.clear()
+    assert not file_exists(DBNAME)
+
+    assert db.get('some-key', None) is None
+    assert not file_exists(DBNAME)
+
+
+if __name__ == '__main__':
+    # run all methods marked with @test
+    for t in tests:
+        t()
diff --git a/tests/fixtures/incl.h b/tests/fixtures/incl.h
new file mode 100644
index 0000000..712aa7c
--- /dev/null
+++ b/tests/fixtures/incl.h
@@ -0,0 +1,5 @@
+#define CONST1 42
+#define MACRO(x,y) x+y
+#define MULTI_LINE abc \
+                   xyz
+#define CONST2 99
diff --git a/tests/fixtures/incl2.h b/tests/fixtures/incl2.h
new file mode 100644
index 0000000..d19aeba
--- /dev/null
+++ b/tests/fixtures/incl2.h
@@ -0,0 +1,2 @@
+#define CONST2 123
+#define CONST3 777
diff --git a/tests/opcodes.py b/tests/opcodes.py
index 54bb673..f14829a 100644
--- a/tests/opcodes.py
+++ b/tests/opcodes.py
@@ -1,6 +1,8 @@
 from uctypes import UINT32, BFUINT32, BF_POS, BF_LEN
 from esp32_ulp.opcodes import make_ins, make_ins_struct_def
-from esp32_ulp.opcodes import get_reg, get_imm, get_cond, arg_qualify, ARG, REG, IMM, COND
+from esp32_ulp.opcodes import get_reg, get_imm, get_cond, arg_qualify, eval_arg, ARG, REG, IMM, SYM, COND
+from esp32_ulp.assemble import SymbolTable, ABS, REL, TEXT
+import esp32_ulp.opcodes as opcodes
 
 OPCODE_DELAY = 4
 LAYOUT_DELAY = """
@@ -43,6 +45,19 @@ def test_arg_qualify():
     assert arg_qualify('Eq') == ARG(COND, 'eq', 'Eq')
     assert arg_qualify('EQ') == ARG(COND, 'eq', 'EQ')
 
+    # for the next tests, ensure the opcodes module has a SymbolTable
+    opcodes.symbols = SymbolTable({}, {}, {})
+    opcodes.symbols.set_sym('const', ABS, None, 42)  # constant as defined by .set
+    opcodes.symbols.set_sym('entry', REL, TEXT, 4)  # label pointing to code
+
+    assert arg_qualify('1+1') == ARG(IMM, 2, '1+1')
+    assert arg_qualify('const >> 1') == ARG(IMM, 21, 'const >> 1')
+    assert arg_qualify('entry') == ARG(SYM, (REL, TEXT, 4), 'entry')  # symbols should not (yet) be evaluated
+    assert arg_qualify('entry + const') == ARG(IMM, 46, 'entry + const')
+
+    # clean up
+    opcodes.symbols = None
+
 
 def test_get_reg():
     assert get_reg('r0') == 0
@@ -57,9 +72,46 @@ def test_get_cond():
     assert get_cond('Eq') == 'eq'
 
 
+def test_eval_arg():
+    opcodes.symbols = SymbolTable({}, {}, {})
+    opcodes.symbols.set_sym('const', ABS, None, 42)  # constant
+    opcodes.symbols.set_sym('raise', ABS, None, 99)  # constant using a python keyword as name (is allowed)
+
+    assert eval_arg('1+1') == 2
+    assert eval_arg('1+const') == 43
+    assert eval_arg('raise*2/3') == 66
+    assert eval_arg('raise-const') == 57
+    assert eval_arg('(raise-const)*2') == 114
+    assert eval_arg('const    % 5') == 2
+    assert eval_arg('const + 0x19af') == 0x19af + 42
+    assert eval_arg('const & ~2') == 40
+    assert eval_arg('const << 3') == 336
+    assert eval_arg('const >> 1') == 21
+    assert eval_arg('(const|4)&0xf') == 0xe
+
+    assert_raises(ValueError, eval_arg, 'evil()')
+    assert_raises(ValueError, eval_arg, 'def cafe()')
+    assert_raises(ValueError, eval_arg, '1 ^ 2')
+    assert_raises(ValueError, eval_arg, '!100')
+
+    # clean up
+    opcodes.symbols = None
+
+
+def assert_raises(exception, func, *args):
+    try:
+        func(*args)
+    except exception:
+        raised = True
+    else:
+        raised = False
+    assert raised
+
+
 test_make_ins_struct_def()
 test_make_ins()
 test_arg_qualify()
 test_get_reg()
 test_get_imm()
 test_get_cond()
+test_eval_arg()
\ No newline at end of file
diff --git a/tests/preprocess.py b/tests/preprocess.py
new file mode 100644
index 0000000..5a3825d
--- /dev/null
+++ b/tests/preprocess.py
@@ -0,0 +1,338 @@
+import os
+
+from esp32_ulp.preprocess import Preprocessor
+from esp32_ulp.definesdb import DefinesDB, DBNAME
+from esp32_ulp.util import file_exists
+
+tests = []
+
+
+def test(param):
+    tests.append(param)
+
+
+@test
+def test_replace_defines_should_return_empty_line_given_empty_string():
+    p = Preprocessor()
+
+    assert p.preprocess("") == ""
+
+
+@test
+def replace_defines_should_return_remove_comments():
+    p = Preprocessor()
+
+    line = "// some comment"
+    expected = ""
+    assert p.preprocess(line) == expected
+
+
+@test
+def test_parse_defines():
+    p = Preprocessor()
+
+    assert p.parse_define_line("") == {}
+    assert p.parse_define_line("// comment") == {}
+    assert p.parse_define_line("  // comment") == {}
+    assert p.parse_define_line("  /* comment */") == {}
+    assert p.parse_define_line("  /* comment */ #define A 42") == {}  # #define must be the first thing on a line
+    assert p.parse_define_line("#define a 1") == {"a": "1"}
+    assert p.parse_define_line(" #define a 1") == {"a": "1"}
+    assert p.parse_define_line("#define a 1 2") == {"a": "1 2"}
+    assert p.parse_define_line("#define f(a,b) 1") == {}  # macros not supported
+    assert p.parse_define_line("#define f(a, b) 1") == {}  # macros not supported
+    assert p.parse_define_line("#define f (a,b) 1") == {"f": "(a,b) 1"}  # f is not a macro
+    assert p.parse_define_line("#define f (a, b) 1") == {"f": "(a, b) 1"}  # f is not a macro
+    assert p.parse_define_line("#define RTC_ADDR       0x12345    // start of range") == {"RTC_ADDR": "0x12345"}
+
+
+@test
+def test_parse_defines_handles_multiple_input_lines():
+    p = Preprocessor()
+
+    multi_line_1 = """\
+#define ID_WITH_UNDERSCORE something
+#define ID2 somethingelse
+"""
+    assert p.parse_defines(multi_line_1) == {"ID_WITH_UNDERSCORE": "something", "ID2": "somethingelse"}
+
+
+@test
+def test_parse_defines_does_not_understand_comments_by_current_design():
+    # comments are not understood. lines are expected to already have comments removed!
+    p = Preprocessor()
+
+    multi_line_2 = """\
+#define ID_WITH_UNDERSCORE something
+/*
+#define ID2 somethingelse
+*/
+"""
+    assert "ID2" in p.parse_defines(multi_line_2)
+
+
+@test
+def test_parse_defines_does_not_understand_line_continuations_with_backslash_by_current_design():
+    p = Preprocessor()
+
+    multi_line_3 = r"""
+    #define ID_WITH_UNDERSCORE something \
+           line2
+    """
+
+    assert p.parse_defines(multi_line_3) == {"ID_WITH_UNDERSCORE": "something \\"}
+
+
+@test
+def preprocess_should_remove_comments_and_defines_but_keep_the_lines_as_empty_lines():
+    p = Preprocessor()
+
+    lines = """\
+    // copyright
+    #define A 1
+
+    move r1, r2"""
+
+    assert p.preprocess(lines) == "\n\n\n\tmove r1, r2"
+
+
+@test
+def preprocess_should_replace_words_defined():
+    p = Preprocessor()
+
+    lines = """\
+    #define DR_REG_RTCIO_BASE 0x3ff48400
+
+    move r1, DR_REG_RTCIO_BASE"""
+
+    assert "move r1, 0x3ff48400" in p.preprocess(lines)
+
+
+@test
+def preprocess_should_replace_words_defined_multiple_times():
+    p = Preprocessor()
+
+    lines = """\
+    #define DR_REG_RTCIO_BASE 0x3ff48400
+
+    move r1, DR_REG_RTCIO_BASE  #once
+    move r2, DR_REG_RTCIO_BASE  #second time"""
+
+    assert "move r1, 0x3ff48400" in p.preprocess(lines)
+    assert "move r2, 0x3ff48400" in p.preprocess(lines)
+
+
+@test
+def preprocess_should_replace_all_defined_words():
+    p = Preprocessor()
+
+    lines = """\
+    #define DR_REG_RTCIO_BASE 0x3ff48400
+    #define SOME_OFFSET 4
+
+    move r1, DR_REG_RTCIO_BASE
+    add r2, r1, SOME_OFFSET"""
+
+    assert "move r1, 0x3ff48400" in p.preprocess(lines)
+    assert "add r2, r1, 4" in p.preprocess(lines)
+
+
+@test
+def preprocess_should_not_replace_substrings_within_identifiers():
+    p = Preprocessor()
+
+    # i.e. if AAA is defined, don't touch PREFIX_AAA_SUFFIX
+    lines = """\
+    #define RTCIO 4
+    move r1, DR_REG_RTCIO_BASE"""
+
+    assert "DR_REG_4_BASE" not in p.preprocess(lines)
+
+    # i.e. if both A and AA are defined, replace AA with the value of AA, not with two A's
+    lines = """\
+    #define A 4
+    #define AA 8
+    move r1, A
+    move r2, AA"""
+
+    assert "move r1, 4" in p.preprocess(lines)
+    assert "move r2, 8" in p.preprocess(lines)
+
+
+@test
+def preprocess_should_replace_defines_used_in_defines():
+    p = Preprocessor()
+
+    lines = """\
+    #define BITS (BASE << 4)
+    #define BASE 0x1234
+
+    move r1, BITS
+    move r2, BASE"""
+
+    assert "move r1, (0x1234 << 4)" in p.preprocess(lines)
+
+
+@test
+def test_expand_rtc_macros():
+    p = Preprocessor()
+
+    assert p.expand_rtc_macros("") == ""
+    assert p.expand_rtc_macros("abc") == "abc"
+    assert p.expand_rtc_macros("WRITE_RTC_REG(1, 2, 3, 4)") == "\treg_wr 1, 2 + 3 - 1, 2, 4"
+    assert p.expand_rtc_macros("READ_RTC_REG(1, 2, 3)") == "\treg_rd 1, 2 + 3 - 1, 2"
+    assert p.expand_rtc_macros("WRITE_RTC_FIELD(1, 2, 3)") == "\treg_wr 1, 2 + 1 - 1, 2, 3 & 1"
+    assert p.expand_rtc_macros("READ_RTC_FIELD(1, 2)") == "\treg_rd 1, 2 + 1 - 1, 2"
+
+
+@test
+def preprocess_should_replace_BIT_with_empty_string_unless_defined():
+    # by default, replace BIT with an empty string (see the description in the code for why)
+    src = " move r1, 0x123 << BIT(24)"
+    assert "move r1, 0x123 << (24)" in Preprocessor().preprocess(src)
+
+    # but if BIT is defined, use that
+    src = """\
+    #define BIT 12
+
+    move r1, BIT"""
+
+    assert "move r1, 12" in Preprocessor().preprocess(src)
+
+
+@test
+def test_process_include_file():
+    p = Preprocessor()
+
+    defines = p.process_include_file('fixtures/incl.h')
+
+    assert defines['CONST1'] == '42'
+    assert defines['CONST2'] == '99'
+    assert defines.get('MULTI_LINE', None) == 'abc \\'  # correct. line continuations not supported
+    assert 'MACRO' not in defines
+
+
+@test
+def test_process_include_file_with_multiple_files():
+    p = Preprocessor()
+
+    defines = p.process_include_file('fixtures/incl.h')
+    defines = p.process_include_file('fixtures/incl2.h')
+
+    assert defines['CONST1'] == '42', "constant from incl.h"
+    assert defines['CONST2'] == '123', "constant overridden by incl2.h"
+    assert defines['CONST3'] == '777', "constant from incl2.h"
+
+
+@test
+def test_process_include_file_using_database():
+    db = DefinesDB()
+    db.clear()
+
+    p = Preprocessor()
+    p.use_db(db)
+
+    p.process_include_file('fixtures/incl.h')
+    p.process_include_file('fixtures/incl2.h')
+
+    assert db['CONST1'] == '42', "constant from incl.h"
+    assert db['CONST2'] == '123', "constant overridden by incl2.h"
+    assert db['CONST3'] == '777', "constant from incl2.h"
+
+    db.close()
+
+
+@test
+def test_process_include_file_should_not_load_database_keys_into_instance_defines_dictionary():
+    db = DefinesDB()
+    db.clear()
+
+    p = Preprocessor()
+    p.use_db(db)
+
+    p.process_include_file('fixtures/incl.h')
+
+    # a bit hackish to reference instance-internal state
+    # but it's important to verify this, as we otherwise run out of memory on device
+    assert 'CONST2' not in p._defines
+
+
+
+@test
+def test_preprocess_should_use_definesdb_when_provided():
+    p = Preprocessor()
+
+    content = """\
+#define LOCALCONST 42
+
+entry:
+    move r1, LOCALCONST
+    move r2, DBKEY
+"""
+
+    # first try without db
+    result = p.preprocess(content)
+
+    assert "move r1, 42" in result
+    assert "move r2, DBKEY" in result
+    assert "move r2, 99" not in result
+
+    # now try with db
+    db = DefinesDB()
+    db.clear()
+    db.update({'DBKEY': '99'})
+    p.use_db(db)
+
+    result = p.preprocess(content)
+
+    assert "move r1, 42" in result
+    assert "move r2, 99" in result
+    assert "move r2, DBKEY" not in result
+
+
+@test
+def test_preprocess_should_ensure_no_definesdb_is_created_when_only_reading_from_it():
+    content = """\
+    #define CONST 42
+    move r1, CONST"""
+
+    # remove any existing db
+    db = DefinesDB()
+    db.clear()
+    assert not file_exists(DBNAME)
+
+    # now preprocess using db
+    p = Preprocessor()
+    p.use_db(db)
+
+    result = p.preprocess(content)
+
+    assert "move r1, 42" in result
+
+    assert not file_exists(DBNAME)
+
+
+@test
+def test_preprocess_should_ensure_the_definesdb_is_properly_closed_after_use():
+    content = """\
+    #define CONST 42
+    move r1, CONST"""
+
+    # open the db up front, so we can verify below that preprocessing closes it
+    db = DefinesDB()
+    db.open()
+    assert db.is_open()
+
+    # now preprocess using db
+    p = Preprocessor()
+    p.use_db(db)
+
+    p.preprocess(content)
+
+    assert not db.is_open()
+
+
+if __name__ == '__main__':
+    # run all methods marked with @test
+    for t in tests:
+        t()
diff --git a/tests/util.py b/tests/util.py
new file mode 100644
index 0000000..009f3f1
--- /dev/null
+++ b/tests/util.py
@@ -0,0 +1,76 @@
+import os
+from esp32_ulp.util import split_tokens, validate_expression, file_exists
+
+tests = []
+
+
+def test(param):
+    """
+    the @test decorator
+    """
+    tests.append(param)
+
+
+@test
+def test_split_tokens():
+    assert split_tokens("") == []
+    assert split_tokens("t") == ['t']
+    assert split_tokens("test") == ['test']
+    assert split_tokens("t t") == ['t', ' ', 't']
+    assert split_tokens("t,t") == ['t', ',', 't']
+    assert split_tokens("test(arg)") == ['test', '(', 'arg', ')']
+    assert split_tokens("test(arg,arg2)") == ['test', '(', 'arg', ',', 'arg2', ')']
+    assert split_tokens("test(arg,arg2)") == ['test', '(', 'arg', ',', 'arg2', ')']
+    assert split_tokens("  test(  arg,  arg2)") == ['  ', 'test', '(', '  ', 'arg', ',', '  ', 'arg2', ')']
+    assert split_tokens("  test(  arg )  ") == ['  ', 'test', '(', '  ', 'arg', ' ', ')', '  ']
+    assert split_tokens("\t  test  \t  ") == ['\t  ', 'test', "  \t  "]
+    assert split_tokens("test\nrow2") == ['test', "\n", "row2"]
+
+    # split_tokens does not support comments. It should generally only be used after comments are already stripped
+    assert split_tokens("test(arg /*comment*/)") == ['test', '(', 'arg', ' ', '/', '*', 'comment', '*', '/', ')']
+    assert split_tokens("#test") == ['#', 'test']
+
+
+@test
+def test_validate_expression():
+    assert validate_expression('') is True
+    assert validate_expression('1') is True
+    assert validate_expression('1+1') is True
+    assert validate_expression('(1+1)') is True
+    assert validate_expression('(1+1)*2') is True
+    assert validate_expression('(1 + 1)') is True
+    assert validate_expression('10 % 2') is True
+    assert validate_expression('0x100 << 2') is True
+    assert validate_expression('0x100 & ~2') is True
+    assert validate_expression('0xabcdef') is True
+    assert validate_expression('0x123def') is True
+    assert validate_expression('2*3+4/5&6|7') is True
+    assert validate_expression('(((((1+1) * 2') is True  # valid characters, even if expression is not valid
+
+    assert validate_expression(':') is False
+    assert validate_expression('_') is False
+    assert validate_expression('=') is False
+    assert validate_expression('.') is False
+    assert validate_expression('!') is False
+    assert validate_expression('123 ^ 4') is False  # operator not supported for now
+    assert validate_expression('evil()') is False
+    assert validate_expression('def cafe()') is False  # valid hex digits, but potentially dangerous code
+
+
+@test
+def test_file_exists():
+    testfile = '.testfile'
+    with open(testfile, 'w') as f:
+        f.write('contents')
+
+    assert file_exists(testfile)
+
+    os.remove(testfile)
+
+    assert not file_exists(testfile)
+
+
+if __name__ == '__main__':
+    # run all methods marked with @test
+    for t in tests:
+        t()