diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 42e6a9d..b0077f8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,37 +6,56 @@ permissions: contents: read env: - CIBW_BEFORE_BUILD: pip install setuptools oldest-supported-numpy + CIBW_BEFORE_BUILD: pip install setuptools oldest-supported-numpy pytest + CIBW_BEFORE_TEST: pip install pytest CIBW_BUILD_VERBOSITY: 1 - CIBW_TEST_COMMAND: python -c "import sys, numexpr; sys.exit(0 if numexpr.test().wasSuccessful() else 1)" - CIBW_TEST_SKIP: "*macosx*arm64*" + CIBW_TEST_COMMAND: pytest --pyargs numexpr + # Testing on aarch64 takes too long, as it is currently emulated on GitHub Actions + CIBW_TEST_SKIP: "*linux*aarch64*" # Building for musllinux and aarch64 takes way too much time. # Moreover, NumPy is not providing musllinux for x86_64 either, so it's not worth it. CIBW_SKIP: "*musllinux*aarch64* *musllinux*x86_64*" jobs: build_wheels: - name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} - ${{ matrix.p_ver }} - runs-on: ${{ matrix.os }} + name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} + runs-on: ${{ matrix.runs-on || matrix.os }} permissions: contents: write env: - CIBW_BUILD: ${{ matrix.cibw_build }} + CIBW_BUILD: ${{ matrix.cibw_pattern }} CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_ARCHS_MACOS: "x86_64 arm64" + CIBW_ENABLE: cpython-freethreading strategy: + fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - arch: [x86_64, aarch64] - cibw_build: ["cp3{10,11,12,13}-*"] - p_ver: ["3.10-3.13"] - exclude: - - os: windows-latest - arch: aarch64 - # cibuild is already in charge to build aarch64 (see CIBW_ARCHS_MACOS) - - os: macos-latest + include: + # Linux x86_64 builds + - os: ubuntu-latest + arch: x86_64 + cibw_pattern: "cp3{10,11,12,13,13t}-manylinux*" + artifact_name: "linux-x86_64" + + # Linux ARM64 builds (native runners) + - os: ubuntu-latest arch: aarch64 + cibw_pattern: "cp3{10,11,12,13,13t}-manylinux*" + 
artifact_name: "linux-aarch64" + # Don't use native runners for now (looks like wait times are too long) + #runs-on: ["ubuntu-latest", "arm64"] + # Windows builds + - os: windows-latest + arch: x86_64 + cibw_pattern: "cp3{10,11,12,13,13t}-win*" + artifact_name: "windows-x86_64" + + # macOS builds (universal2) + - os: macos-latest + arch: x86_64 + cibw_pattern: "cp3{10,11,12,13,13t}-macosx*" + artifact_name: "macos-universal2" steps: - uses: actions/checkout@v3 @@ -45,17 +64,22 @@ jobs: with: python-version: '3.x' - - name: Install cibuildwheel + - name: Setup free-threading variables + if: ${{ contains(matrix.cibw_pattern, '13t') }} + shell: bash -l {0} run: | - python -m pip install cibuildwheel + echo "CIBW_BEFORE_BUILD=pip install setuptools numpy" >> "$GITHUB_ENV" + echo "CIBW_BEFORE_TEST=pip install pytest pytest-run-parallel" >> "$GITHUB_ENV" + echo "CIBW_TEST_COMMAND=pytest --parallel-threads=4 --pyargs numexpr" >> "$GITHUB_ENV" - - uses: docker/setup-qemu-action@v2 - if: ${{ matrix.arch == 'aarch64' }} - name: Set up QEMU + - name: Set up QEMU + if: matrix.arch == 'aarch64' + uses: docker/setup-qemu-action@v3 + with: + platforms: arm64 - name: Build wheels - run: | - python -m cibuildwheel --output-dir wheelhouse + uses: pypa/cibuildwheel@v2.23 - name: Make sdist if: ${{ matrix.os == 'windows-latest' }} @@ -65,6 +89,7 @@ jobs: - uses: actions/upload-artifact@v4 with: + name: ${{ matrix.artifact_name }} path: ./wheelhouse/* - name: Upload to GitHub Release diff --git a/.gitignore b/.gitignore index 928bf15..7bf6f98 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ artifact/ numexpr.egg-info/ *.pyc *.swp +*.so *~ doc/_build site.cfg diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..cb4e829 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml 
+ - id: debug-statements + +# Too many things to fix, let's just ignore it for now +#- repo: https://github.com/pycqa/flake8 +# rev: 7.0.0 +# hooks: +# - id: flake8 +# +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + +# Too many things to fix, let's just ignore it for now +#- repo: https://github.com/pre-commit/mirrors-mypy +# rev: v1.8.0 +# hooks: +# - id: mypy +# exclude: ^(docs/|setup.py) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index d2c3d13..3803a41 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -19,4 +19,4 @@ sphinx: # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - - requirements: doc/requirements.txt \ No newline at end of file + - requirements: doc/requirements.txt diff --git a/AUTHORS.txt b/AUTHORS.txt index 88b9047..57410db 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -23,7 +23,7 @@ Google Inc. contributed bug fixes. David Cox improved readability of the Readme. -Robert A. McLeod contributed bug fixes and ported the documentation to +Robert A. McLeod contributed bug fixes and ported the documentation to numexpr.readthedocs.io. He has served as the maintainer of the package since 2016 to 2023. diff --git a/README.rst b/README.rst index 9033d51..264fd2b 100644 --- a/README.rst +++ b/README.rst @@ -159,6 +159,24 @@ Usage array([ True, False, False], dtype=bool) +Free-threading support +---------------------- +Starting on CPython 3.13 onwards there is a new distribution that disables the +Global Interpreter Lock (GIL) altogether, thus increasing the performance yields +under multi-threaded conditions on a single interpreter, as opposed to having to use +multiprocessing. 
+ +Whilst numexpr has been demonstrated to work under free-threaded +CPython, considerations need to be taken when using numexpr native parallel +implementation vs using Python threads directly in order to prevent oversubscription. +We recommend either using the main CPython interpreter thread to spawn multiple C threads +using the parallel numexpr API, or spawning multiple CPython threads that do not use +the parallel API. + +For more information about free-threaded CPython, we recommend visiting the following +`community Wiki <https://py-free-threading.github.io/>`_ + + Documentation ------------- diff --git a/bench/boolean_timing.py b/bench/boolean_timing.py index fe07b31..0be0bf7 100644 --- a/bench/boolean_timing.py +++ b/bench/boolean_timing.py @@ -9,8 +9,10 @@ #################################################################### from __future__ import print_function + import sys import timeit + import numpy array_size = 5_000_000 diff --git a/bench/issue-36.py b/bench/issue-36.py index 9c356cf..611bddb 100644 --- a/bench/issue-36.py +++ b/bench/issue-36.py @@ -2,10 +2,14 @@ # performs better than the serial code. See issue #36 for details.
from __future__ import print_function + +from time import time + import numpy as np -import numexpr as ne from numpy.testing import assert_array_equal -from time import time + +import numexpr as ne + def bench(N): print("*** array length:", N) @@ -31,4 +35,3 @@ def bench(N): ne.set_num_threads(2) for N in range(10, 20): bench(2**N) - diff --git a/bench/issue-47.py b/bench/issue-47.py index 31c68a6..a48fbe2 100644 --- a/bench/issue-47.py +++ b/bench/issue-47.py @@ -1,4 +1,5 @@ import numpy + import numexpr numexpr.set_num_threads(8) diff --git a/bench/large_array_vs_numpy.py b/bench/large_array_vs_numpy.py index 72219a1..b480261 100644 --- a/bench/large_array_vs_numpy.py +++ b/bench/large_array_vs_numpy.py @@ -31,10 +31,12 @@ import os os.environ["NUMEXPR_NUM_THREADS"] = "16" +import threading +import timeit + import numpy as np + import numexpr as ne -import timeit -import threading array_size = 10**8 num_runs = 10 diff --git a/bench/multidim.py b/bench/multidim.py index 587f100..eeccd0b 100644 --- a/bench/multidim.py +++ b/bench/multidim.py @@ -12,9 +12,12 @@ # Based on a script provided by Andrew Collette. 
from __future__ import print_function + +import time + import numpy as np + import numexpr as nx -import time test_shapes = [ (100*100*100), @@ -90,5 +93,3 @@ def test_func(a, b, c): print("Simple: ", (stop1-start1)/nruns) print("Numexpr: ", (stop2-start2)/nruns) print("Chunked: ", (stop3-start3)/nruns) - - diff --git a/bench/poly.py b/bench/poly.py index 0f50290..3eb12b1 100644 --- a/bench/poly.py +++ b/bench/poly.py @@ -17,11 +17,13 @@ ####################################################################### from __future__ import print_function + import sys from time import time + import numpy as np -import numexpr as ne +import numexpr as ne #expr = ".25*x**3 + .75*x**2 - 1.5*x - 2" # the polynomial to compute expr = "((.25*x + .75)*x - 1.5)*x - 2" # a computer-friendly polynomial diff --git a/bench/timing.py b/bench/timing.py index c84a6f4..9c70610 100644 --- a/bench/timing.py +++ b/bench/timing.py @@ -9,7 +9,10 @@ #################################################################### from __future__ import print_function -import timeit, numpy + +import timeit + +import numpy array_size = 5e6 iterations = 2 diff --git a/bench/unaligned-simple.py b/bench/unaligned-simple.py index e168c78..b653c7a 100644 --- a/bench/unaligned-simple.py +++ b/bench/unaligned-simple.py @@ -13,8 +13,11 @@ """ from __future__ import print_function + from timeit import Timer + import numpy as np + import numexpr as ne niter = 10 diff --git a/bench/varying-expr.py b/bench/varying-expr.py index d04ab35..df7419c 100644 --- a/bench/varying-expr.py +++ b/bench/varying-expr.py @@ -13,9 +13,12 @@ # the latency of numexpr when working with small arrays. 
from __future__ import print_function + import sys from time import time + import numpy as np + import numexpr as ne N = 100 diff --git a/bench/vml_timing.py b/bench/vml_timing.py index 52f5003..57dd4d2 100644 --- a/bench/vml_timing.py +++ b/bench/vml_timing.py @@ -9,9 +9,12 @@ #################################################################### from __future__ import print_function + import sys import timeit + import numpy + import numexpr array_size = 5_000_000 diff --git a/bench/vml_timing2.py b/bench/vml_timing2.py index 32fdc62..1c460d0 100644 --- a/bench/vml_timing2.py +++ b/bench/vml_timing2.py @@ -4,11 +4,14 @@ # https://github.com/pydata/numexpr/wiki/NumexprMKL from __future__ import print_function + import datetime import sys +from time import time + import numpy as np + import numexpr as ne -from time import time N = int(2**26) diff --git a/bench/vml_timing3.py b/bench/vml_timing3.py index 04997ff..0086421 100644 --- a/bench/vml_timing3.py +++ b/bench/vml_timing3.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- +from timeit import default_timer as timer + import numpy as np + import numexpr as ne -from timeit import default_timer as timer x = np.ones(100000) scaler = -1J diff --git a/doc/api.rst b/doc/api.rst index 7d750e3..5d1bb0f 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -3,11 +3,11 @@ NumExpr API .. automodule:: numexpr :members: evaluate, re_evaluate, disassemble, NumExpr, get_vml_version, set_vml_accuracy_mode, set_vml_num_threads, set_num_threads, detect_number_of_cores, detect_number_of_threads - + .. py:attribute:: ncores The number of (virtual) cores detected. - + .. py:attribute:: nthreads The number of threads currently in-use. @@ -18,11 +18,11 @@ NumExpr API .. py:attribute:: version - The version of NumExpr. - - + The version of NumExpr. + + Tests submodule --------------- .. 
automodule:: numexpr.tests - :members: test, print_versions \ No newline at end of file + :members: test, print_versions diff --git a/doc/index.rst b/doc/index.rst index 02922c3..d517391 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -25,4 +25,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/doc/intro.rst b/doc/intro.rst index 11dbaaf..0d31925 100644 --- a/doc/intro.rst +++ b/doc/intro.rst @@ -1,25 +1,25 @@ How it works ============ -The string passed to :code:`evaluate` is compiled into an object representing the +The string passed to :code:`evaluate` is compiled into an object representing the expression and types of the arrays used by the function :code:`numexpr`. -The expression is first compiled using Python's :code:`compile` function (this means -that the expressions have to be valid Python expressions). From this, the -variable names can be taken. The expression is then evaluated using instances -of a special object that keep track of what is being done to them, and which +The expression is first compiled using Python's :code:`compile` function (this means +that the expressions have to be valid Python expressions). From this, the +variable names can be taken. The expression is then evaluated using instances +of a special object that keep track of what is being done to them, and which builds up the parse tree of the expression. -This parse tree is then compiled to a bytecode program, which describes how to -perform the operation element-wise. The virtual machine uses "vector registers": -each register is many elements wide (by default 4096 elements). The key to +This parse tree is then compiled to a bytecode program, which describes how to +perform the operation element-wise. The virtual machine uses "vector registers": +each register is many elements wide (by default 4096 elements). The key to NumExpr's speed is handling chunks of elements at a time. -There are two extremes to evaluating an expression elementwise. 
You can do each -operation as arrays, returning temporary arrays. This is what you do when you -use NumPy: :code:`2*a+3*b` uses three temporary arrays as large as :code:`a` or -:code:`b`. This strategy wastes memory (a problem if your arrays are large), -and also is not a good use of cache memory: for large arrays, the results of +There are two extremes to evaluating an expression elementwise. You can do each +operation as arrays, returning temporary arrays. This is what you do when you +use NumPy: :code:`2*a+3*b` uses three temporary arrays as large as :code:`a` or +:code:`b`. This strategy wastes memory (a problem if your arrays are large), +and also is not a good use of cache memory: for large arrays, the results of :code:`2*a` and :code:`3*b` won't be in cache when you do the add. The other extreme is to loop over each element, as in:: @@ -27,13 +27,13 @@ The other extreme is to loop over each element, as in:: for i in xrange(len(a)): c[i] = 2*a[i] + 3*b[i] -This doesn't consume extra memory, and is good for the cache, but, if the -expression is not compiled to machine code, you will have a big case statement -(or a bunch of if's) inside the loop, which adds a large overhead for each +This doesn't consume extra memory, and is good for the cache, but, if the +expression is not compiled to machine code, you will have a big case statement +(or a bunch of if's) inside the loop, which adds a large overhead for each element, and will hurt the branch-prediction used on the CPU. -:code:`numexpr` uses a in-between approach. Arrays are handled as chunks (of -4096 elements) at a time, using a register machine. As Python code, +:code:`numexpr` uses a in-between approach. Arrays are handled as chunks (of +4096 elements) at a time, using a register machine. 
As Python code, it looks something like this:: for i in xrange(0, len(a), 256): @@ -44,11 +44,11 @@ it looks something like this:: add(r2, r3, r2) c[i:i+128] = r2 -(remember that the 3-arg form stores the result in the third argument, -instead of allocating a new array). This achieves a good balance between -cache and branch-prediction. And the virtual machine is written entirely in -C, which makes it faster than the Python above. Furthermore the virtual machine -is also multi-threaded, which allows for efficient parallelization of NumPy +(remember that the 3-arg form stores the result in the third argument, +instead of allocating a new array). This achieves a good balance between +cache and branch-prediction. And the virtual machine is written entirely in +C, which makes it faster than the Python above. Furthermore the virtual machine +is also multi-threaded, which allows for efficient parallelization of NumPy operations. There is some more information and history at: @@ -58,12 +58,12 @@ http://www.bitsofbits.com/2014/09/21/numpy-micro-optimization-and-numexpr/ Expected performance ==================== -The range of speed-ups for NumExpr respect to NumPy can vary from 0.95x and 20x, -being 2x, 3x or 4x typical values, depending on the complexity of the -expression and the internal optimization of the operators used. The strided and -unaligned case has been optimized too, so if the expression contains such -arrays, the speed-up can increase significantly. Of course, you will need to -operate with large arrays (typically larger than the cache size of your CPU) +The range of speed-ups for NumExpr respect to NumPy can vary from 0.95x and 20x, +being 2x, 3x or 4x typical values, depending on the complexity of the +expression and the internal optimization of the operators used. The strided and +unaligned case has been optimized too, so if the expression contains such +arrays, the speed-up can increase significantly. 
Of course, you will need to +operate with large arrays (typically larger than the cache size of your CPU) to see these improvements in performance. Here there are some real timings. For the contiguous case:: diff --git a/doc/mkl.rst b/doc/mkl.rst index 6951655..0c706bb 100644 --- a/doc/mkl.rst +++ b/doc/mkl.rst @@ -1,19 +1,19 @@ NumExpr with Intel MKL ====================== -Numexpr has support for Intel's VML (included in Intel's MKL) in order to -accelerate the evaluation of transcendental functions on Intel CPUs. Here it +Numexpr has support for Intel's VML (included in Intel's MKL) in order to +accelerate the evaluation of transcendental functions on Intel CPUs. Here it is a small example on the kind of improvement you may get by using it. A first benchmark ----------------- -Firstly, we are going to exercise how MKL performs when computing a couple of -simple expressions. One is a pure algebraic one: :code:`2*y + 4*x` and the other +Firstly, we are going to exercise how MKL performs when computing a couple of +simple expressions. One is a pure algebraic one: :code:`2*y + 4*x` and the other contains transcendental functions: :code:`sin(x)**2 + cos(y)**2`. -For this, we are going to use this worksheet_. I (Francesc Alted) ran this -benchmark on a Intel Xeon E3-1245 v5 @ 3.50GHz. Here are the results when +For this, we are going to use this worksheet_. I (Francesc Alted) ran this +benchmark on a Intel Xeon E3-1245 v5 @ 3.50GHz. Here are the results when not using MKL:: NumPy version: 1.11.1 @@ -22,7 +22,7 @@ not using MKL:: Numexpr version: 2.6.1. Using MKL: False Time for an algebraic expression: 0.058 s / 19.116 GB/s Time for a transcendental expression: 0.283 s / 3.950 GB/s - + And now, using MKL:: @@ -34,14 +34,14 @@ And now, using MKL:: Time for a transcendental expression: 0.075 s / 14.975 GB/s -As you can see, numexpr using MKL can be up to 3.8x faster for the case of the -transcendental expression. 
Also, you can notice that the pure algebraic -expression is not accelerated at all. This is completely expected, as the -MKL is offering accelerations for CPU bounded functions (sin, cos, tan, exp, +As you can see, numexpr using MKL can be up to 3.8x faster for the case of the +transcendental expression. Also, you can notice that the pure algebraic +expression is not accelerated at all. This is completely expected, as the +MKL is offering accelerations for CPU bounded functions (sin, cos, tan, exp, log, sinh...) and not pure multiplications or adds. -Finally, note how numexpr+MKL can be up to 26x faster than using a pure NumPy -solution. And this was using a processor with just four physical cores; you +Finally, note how numexpr+MKL can be up to 26x faster than using a pure NumPy +solution. And this was using a processor with just four physical cores; you should expect more speedup as you throw more cores at that. .. _worksheet: https://github.com/pydata/numexpr/blob/master/bench/vml_timing2.py @@ -49,28 +49,28 @@ should expect more speedup as you throw more cores at that. More benchmarks (older) ----------------------- -Numexpr & VML can both use several threads for doing computations. Let's see -how performance improves by using 1 or 2 threads on a 2-core Intel CPU (Core2 +Numexpr & VML can both use several threads for doing computations. Let's see +how performance improves by using 1 or 2 threads on a 2-core Intel CPU (Core2 E8400 @ 3.00GHz). Using 1 thread ^^^^^^^^^^^^^^ -Here we have some benchmarks on the improvement of speed that Intel's VML can -achieve. First, look at times by some easy expression containing sine and +Here we have some benchmarks on the improvement of speed that Intel's VML can +achieve. 
First, look at times by some easy expression containing sine and cosine operations *without* using VML:: In [17]: ne.use_vml Out[17]: False - + In [18]: x = np.linspace(-1, 1, 1e6) - + In [19]: timeit np.sin(x)**2+np.cos(x)**2 10 loops, best of 3: 43.1 ms per loop - + In [20]: ne.set_num_threads(1) Out[20]: 2 - + In [21]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 10 loops, best of 3: 29.5 ms per loop @@ -79,15 +79,15 @@ and now using VML:: In [37]: ne.use_vml Out[37]: True - + In [38]: x = np.linspace(-1, 1, 1e6) - + In [39]: timeit np.sin(x)**2+np.cos(x)**2 10 loops, best of 3: 42.8 ms per loop - + In [40]: ne.set_num_threads(1) Out[40]: 2 - + In [41]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 19.8 ms per loop @@ -96,37 +96,37 @@ Hey, VML can accelerate computations by a 50% using a single CPU. That's great! Using 2 threads ^^^^^^^^^^^^^^^ -First, look at the time of the non-VML numexpr when using 2 threads:: +First, look at the time of the non-VML numexpr when using 2 threads:: In [22]: ne.set_num_threads(2) Out[22]: 1 - + In [23]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 15.3 ms per loop -OK. We've got an almost perfect 2x improvement in speed with regard to the 1 +OK. We've got an almost perfect 2x improvement in speed with regard to the 1 thread case. Let's see about the VML-powered numexpr version:: In [43]: ne.set_num_threads(2) Out[43]: 1 - + In [44]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 12.2 ms per loop -Ok, that's about 1.6x improvement over the 1 thread VML computation, and -still a 25% of improvement over the non-VML version. Good, native numexpr +Ok, that's about 1.6x improvement over the 1 thread VML computation, and +still a 25% of improvement over the non-VML version. Good, native numexpr multithreading code really looks very efficient! 
Numexpr native threading code vs VML's one ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -You may already know that both numexpr and Intel's VML do have support for -multithreaded computations, but you might be curious about which one is more -efficient, so here it goes a hint. First, using the VML multithreaded +You may already know that both numexpr and Intel's VML do have support for +multithreaded computations, but you might be curious about which one is more +efficient, so here it goes a hint. First, using the VML multithreaded implementation:: In [49]: ne.set_vml_num_threads(2) - + In [50]: ne.set_num_threads(1) Out[50]: 1 @@ -146,14 +146,14 @@ and now, using the native numexpr threading code:: 100 loops, best of 3: 12 ms per loop -This means that numexpr's native multithreaded code is about 40% faster than -VML's for this case. So, in general, you should use the former with numexpr +This means that numexpr's native multithreaded code is about 40% faster than +VML's for this case. So, in general, you should use the former with numexpr (and this is the default actually). Mixing numexpr's and VML multithreading capabilities ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Finally, you might be tempted to use both multithreading codes at the same +Finally, you might be tempted to use both multithreading codes at the same time, but you will be deceived about the improvement in performance:: In [57]: ne.set_vml_num_threads(2) @@ -161,7 +161,7 @@ time, but you will be deceived about the improvement in performance:: In [58]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 17.7 ms per loop -Your code actually performs much worse. That's normal too because you are -trying to run 4 threads on a 2-core CPU. For CPUs with many cores, you may -want to try with different threading configurations, but as a rule of thumb, -numexpr's one will generally win. \ No newline at end of file +Your code actually performs much worse. 
That's normal too because you are +trying to run 4 threads on a 2-core CPU. For CPUs with many cores, you may +want to try with different threading configurations, but as a rule of thumb, +numexpr's one will generally win. diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 081e7f4..51d3212 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -1,4 +1,4 @@ Release Notes ============= -.. include:: ../RELEASE_NOTES.rst \ No newline at end of file +.. include:: ../RELEASE_NOTES.rst diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 3a3cf63..ce2ff9d 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -30,7 +30,7 @@ and it can also re_evaluate an expression:: Building -------- -*NumExpr* requires Python_ 3.7 or greater, and NumPy_ 1.13 or greater. It is +*NumExpr* requires Python_ 3.7 or greater, and NumPy_ 1.13 or greater. It is built in the standard Python way: .. code-block:: bash @@ -39,7 +39,7 @@ built in the standard Python way: You must have a C-compiler (i.e. MSVC Build tools on Windows and GCC on Linux) installed. -Then change to a directory that is not the repository directory (e.g. `/tmp`) and +Then change to a directory that is not the repository directory (e.g. `/tmp`) and test :code:`numexpr` with: .. code-block:: bash @@ -73,23 +73,23 @@ affect performance). Threadpool Configuration ------------------------ -Threads are spawned at import-time, with the number being set by the environment -variable ``NUMEXPR_MAX_THREADS``. The default maximum thread count is **64**. +Threads are spawned at import-time, with the number being set by the environment +variable ``NUMEXPR_MAX_THREADS``. The default maximum thread count is **64**. There is no advantage to spawning more threads than the number of virtual cores -available on the computing node. Practically NumExpr scales at large thread -count (`> 8`) only on very large matrices (`> 2**22`). 
Spawning large numbers -of threads is not free, and can increase import times for NumExpr or packages +available on the computing node. Practically NumExpr scales at large thread +count (`> 8`) only on very large matrices (`> 2**22`). Spawning large numbers +of threads is not free, and can increase import times for NumExpr or packages that import it such as Pandas or PyTables. -If desired, the number of threads in the pool used can be adjusted via an -environment variable, ``NUMEXPR_NUM_THREADS`` (preferred) or ``OMP_NUM_THREADS``. -Typically only setting ``NUMEXPR_MAX_THREADS`` is sufficient; the number of -threads used can be adjusted dynamically via ``numexpr.set_num_threads(int)``. +If desired, the number of threads in the pool used can be adjusted via an +environment variable, ``NUMEXPR_NUM_THREADS`` (preferred) or ``OMP_NUM_THREADS``. +Typically only setting ``NUMEXPR_MAX_THREADS`` is sufficient; the number of +threads used can be adjusted dynamically via ``numexpr.set_num_threads(int)``. The number of threads can never exceed that set by ``NUMEXPR_MAX_THREADS``. -If the user has not configured the environment prior to importing NumExpr, info -logs will be generated, and the initial number of threads *that are used*_ will -be set to the number of cores detected in the system or 8, whichever is *less*. +If the user has not configured the environment prior to importing NumExpr, info +logs will be generated, and the initial number of threads *that are used*_ will +be set to the number of cores detected in the system or 8, whichever is *less*. Usage:: @@ -111,16 +111,16 @@ function's frame (through the use of :code:`sys._getframe()`). Alternatively, they can be specified using the :code:`local_dict` or :code:`global_dict` arguments, or passed as keyword arguments. -The :code:`optimization` parameter can take the values :code:`'moderate'` -or :code:`'aggressive'`. :code:`'moderate'` means that no optimization is made -that can affect precision at all. 
:code:`'aggressive'` (the default) means that -the expression can be rewritten in a way that precision *could* be affected, but -normally very little. For example, in :code:`'aggressive'` mode, the -transformation :code:`x~**3` -> :code:`x*x*x` is made, but not in +The :code:`optimization` parameter can take the values :code:`'moderate'` +or :code:`'aggressive'`. :code:`'moderate'` means that no optimization is made +that can affect precision at all. :code:`'aggressive'` (the default) means that +the expression can be rewritten in a way that precision *could* be affected, but +normally very little. For example, in :code:`'aggressive'` mode, the +transformation :code:`x~**3` -> :code:`x*x*x` is made, but not in :code:`'moderate'` mode. -The `truediv` parameter specifies whether the division is a 'floor division' -(False) or a 'true division' (True). The default is the value of +The `truediv` parameter specifies whether the division is a 'floor division' +(False) or a 'true division' (True). The default is the value of `__future__.division` in the interpreter. See PEP 238 for details. Expressions are cached, so reuse is fast. Arrays or scalars are @@ -164,22 +164,22 @@ Casting rules in NumExpr follow closely those of *NumPy*. However, for implementation reasons, there are some known exceptions to this rule, namely: - * When an array with type :code:`int8`, :code:`uint8`, :code:`int16` or - :code:`uint16` is used inside NumExpr, it is internally upcasted to an - :code:`int` (or :code:`int32` in NumPy notation). - * When an array with type :code:`uint32` is used inside NumExpr, it is - internally upcasted to a :code:`long` (or :code:`int64` in NumPy notation). - * A floating point function (e.g. :code:`sin`) acting on :code:`int8` or - :code:`int16` types returns a :code:`float64` type, instead of the - :code:`float32` that is returned by NumPy functions. 
This is mainly due + * When an array with type :code:`int8`, :code:`uint8`, :code:`int16` or + :code:`uint16` is used inside NumExpr, it is internally upcasted to an + :code:`int` (or :code:`int32` in NumPy notation). + * When an array with type :code:`uint32` is used inside NumExpr, it is + internally upcasted to a :code:`long` (or :code:`int64` in NumPy notation). + * A floating point function (e.g. :code:`sin`) acting on :code:`int8` or + :code:`int16` types returns a :code:`float64` type, instead of the + :code:`float32` that is returned by NumPy functions. This is mainly due to the absence of native :code:`int8` or :code:`int16` types in NumExpr. - * In operations implying a scalar and an array, the normal rules of casting - are used in NumExpr, in contrast with NumPy, where array types takes - priority. For example, if :code:`a` is an array of type :code:`float32` - and :code:`b` is an scalar of type :code:`float64` (or Python :code:`float` - type, which is equivalent), then :code:`a*b` returns a :code:`float64` in - NumExpr, but a :code:`float32` in NumPy (i.e. array operands take priority - in determining the result type). If you need to keep the result a + * In operations implying a scalar and an array, the normal rules of casting + are used in NumExpr, in contrast with NumPy, where array types takes + priority. For example, if :code:`a` is an array of type :code:`float32` + and :code:`b` is an scalar of type :code:`float64` (or Python :code:`float` + type, which is equivalent), then :code:`a*b` returns a :code:`float64` in + NumExpr, but a :code:`float32` in NumPy (i.e. array operands take priority + in determining the result type). If you need to keep the result a :code:`float32`, be sure you use a :code:`float32` scalar too. 
@@ -199,42 +199,42 @@ Supported functions The next are the current supported set: - * :code:`where(bool, number1, number2): number` -- number1 if the bool condition + * :code:`where(bool, number1, number2): number` -- number1 if the bool condition is true, number2 otherwise. - * :code:`{sin,cos,tan}(float|complex): float|complex` -- trigonometric sine, + * :code:`{sin,cos,tan}(float|complex): float|complex` -- trigonometric sine, cosine or tangent. - * :code:`{arcsin,arccos,arctan}(float|complex): float|complex` -- trigonometric + * :code:`{arcsin,arccos,arctan}(float|complex): float|complex` -- trigonometric inverse sine, cosine or tangent. - * :code:`arctan2(float1, float2): float` -- trigonometric inverse tangent of + * :code:`arctan2(float1, float2): float` -- trigonometric inverse tangent of float1/float2. - * :code:`{sinh,cosh,tanh}(float|complex): float|complex` -- hyperbolic sine, + * :code:`{sinh,cosh,tanh}(float|complex): float|complex` -- hyperbolic sine, cosine or tangent. - * :code:`{arcsinh,arccosh,arctanh}(float|complex): float|complex` -- hyperbolic + * :code:`{arcsinh,arccosh,arctanh}(float|complex): float|complex` -- hyperbolic inverse sine, cosine or tangent. - * :code:`{log,log10,log1p}(float|complex): float|complex` -- natural, base-10 and + * :code:`{log,log10,log1p}(float|complex): float|complex` -- natural, base-10 and log(1+x) logarithms. - * :code:`{exp,expm1}(float|complex): float|complex` -- exponential and exponential + * :code:`{exp,expm1}(float|complex): float|complex` -- exponential and exponential minus one. * :code:`sqrt(float|complex): float|complex` -- square root. * :code:`abs(float|complex): float|complex` -- absolute value. * :code:`conj(complex): complex` -- conjugate value. * :code:`{real,imag}(complex): float` -- real or imaginary part of complex. - * :code:`complex(float, float): complex` -- complex from real and imaginary + * :code:`complex(float, float): complex` -- complex from real and imaginary parts. 
- * :code:`contains(np.str, np.str): bool` -- returns True for every string in :code:`op1` that + * :code:`contains(np.str, np.str): bool` -- returns True for every string in :code:`op1` that contains :code:`op2`. Notes ----- * :code:`abs()` for complex inputs returns a :code:`complex` output too. This is a - departure from NumPy where a :code:`float` is returned instead. However, - NumExpr is not flexible enough yet so as to allow this to happen. - Meanwhile, if you want to mimic NumPy behaviour, you may want to select the - real part via the :code:`real` function (e.g. :code:`real(abs(cplx))`) or via the + departure from NumPy where a :code:`float` is returned instead. However, + NumExpr is not flexible enough yet so as to allow this to happen. + Meanwhile, if you want to mimic NumPy behaviour, you may want to select the + real part via the :code:`real` function (e.g. :code:`real(abs(cplx))`) or via the :code:`real` selector (e.g. :code:`abs(cplx).real`). -More functions can be added if you need them. Note however that NumExpr 2.6 is +More functions can be added if you need them. Note however that NumExpr 2.6 is in maintenance mode and a new major revision is under development. Supported reduction operations @@ -242,12 +242,12 @@ Supported reduction operations The next are the current supported set: - * :code:`sum(number, axis=None)`: Sum of array elements over a given axis. + * :code:`sum(number, axis=None)`: Sum of array elements over a given axis. Negative axis are not supported. - * :code:`prod(number, axis=None)`: Product of array elements over a given axis. + * :code:`prod(number, axis=None)`: Product of array elements over a given axis. Negative axis are not supported. -*Note:* because of internal limitations, reduction operations must appear the +*Note:* because of internal limitations, reduction operations must appear the last in the stack. If not, it will be issued an error like:: >>> ne.evaluate('sum(1)*(-1)') @@ -256,23 +256,23 @@ last in the stack. 
If not, it will be issued an error like:: General routines ---------------- - * :code:`evaluate(expression, local_dict=None, global_dict=None, - optimization='aggressive', truediv='auto')`: Evaluate a simple array + * :code:`evaluate(expression, local_dict=None, global_dict=None, + optimization='aggressive', truediv='auto')`: Evaluate a simple array expression element-wise. See examples above. - * :code:`re_evaluate(local_dict=None)`: Re-evaluate the last array expression - without any check. This is meant for accelerating loops that are re-evaluating - the same expression repeatedly without changing anything else than the operands. + * :code:`re_evaluate(local_dict=None)`: Re-evaluate the last array expression + without any check. This is meant for accelerating loops that are re-evaluating + the same expression repeatedly without changing anything else than the operands. If unsure, use evaluate() which is safer. * :code:`test()`: Run all the tests in the test suite. * :code:`print_versions()`: Print the versions of software that numexpr relies on. - * :code:`set_num_threads(nthreads)`: Sets a number of threads to be used in operations. - Returns the previous setting for the number of threads. See note below to see + * :code:`set_num_threads(nthreads)`: Sets a number of threads to be used in operations. + Returns the previous setting for the number of threads. See note below to see how the number of threads is set via environment variables. - If you are using VML, you may want to use *set_vml_num_threads(nthreads)* to - perform the parallel job with VML instead. However, you should get very - similar performance with VML-optimized functions, and VML's parallelizer - cannot deal with common expressions like `(x+1)*(x-2)`, while NumExpr's + If you are using VML, you may want to use *set_vml_num_threads(nthreads)* to + perform the parallel job with VML instead. 
However, you should get very + similar performance with VML-optimized functions, and VML's parallelizer + cannot deal with common expressions like `(x+1)*(x-2)`, while NumExpr's one can. * :code:`detect_number_of_cores()`: Detects the number of cores on a system. @@ -324,4 +324,4 @@ License NumExpr is distributed under the MIT_ license. -.. _MIT: http://www.opensource.org/licenses/mit-license.php \ No newline at end of file +.. _MIT: http://www.opensource.org/licenses/mit-license.php diff --git a/doc/vm2.rst b/doc/vm2.rst index 45e9fc9..01c9826 100644 --- a/doc/vm2.rst +++ b/doc/vm2.rst @@ -1,32 +1,32 @@ Performance of the Virtual Machine in NumExpr2.0 ================================================ -Numexpr 2.0 leverages a new virtual machine completely based on the new ndarray -iterator introduced in NumPy 1.6. This represents a nice combination of the -advantages of using the new iterator, while retaining the ability to avoid -copies in memory as well as the multi-threading capabilities of the previous +Numexpr 2.0 leverages a new virtual machine completely based on the new ndarray +iterator introduced in NumPy 1.6. This represents a nice combination of the +advantages of using the new iterator, while retaining the ability to avoid +copies in memory as well as the multi-threading capabilities of the previous virtual machine (1.x series). -The increased performance of the new virtual machine can be seen in several +The increased performance of the new virtual machine can be seen in several scenarios, like: - * *Broadcasting*. Expressions containing arrays that needs to be broadcasted, + * *Broadcasting*. Expressions containing arrays that needs to be broadcasted, will not need additional memory (i.e. they will be broadcasted on-the-fly). - * *Non-native dtypes*. These will be translated to native dtypes on-the-fly, + * *Non-native dtypes*. These will be translated to native dtypes on-the-fly, so there is not need to convert the whole arrays first. 
- * *Fortran-ordered arrays*. The new iterator will find the best path to + * *Fortran-ordered arrays*. The new iterator will find the best path to optimize operations on such arrays, without the need to transpose them first. -There is a drawback though: performance with small arrays suffers a bit because -of higher set-up times for the new virtual machine. See below for detailed +There is a drawback though: performance with small arrays suffers a bit because +of higher set-up times for the new virtual machine. See below for detailed benchmarks. Some benchmarks for best-case scenarios --------------------------------------- -Here you have some benchmarks of some scenarios where the new virtual machine -actually represents an advantage in terms of speed (also memory, but this is -not shown here). As you will see, the improvement is notable in many areas, +Here you have some benchmarks of some scenarios where the new virtual machine +actually represents an advantage in terms of speed (also memory, but this is +not shown here). As you will see, the improvement is notable in many areas, ranging from 3x to 6x faster operations. Broadcasting @@ -85,7 +85,7 @@ Mix of 'non-native' arrays, Fortran-ordered, and using broadcasting Longer setup-time ^^^^^^^^^^^^^^^^^ -The only drawback of the new virtual machine is during the computation of +The only drawback of the new virtual machine is during the computation of small arrays:: >>> a = np.arange(10) @@ -98,8 +98,8 @@ small arrays:: 10000 loops, best of 3: 30.6 µs per loop -i.e. the new virtual machine takes a bit more time to set-up (around 8 µs in -this machine). However, this should be not too important because for such a +i.e. the new virtual machine takes a bit more time to set-up (around 8 µs in +this machine). 
However, this should be not too important because for such a small arrays NumPy is always a better option:: >>> timeit c = a*(b+1) @@ -121,8 +121,8 @@ And for arrays large enough the difference is negligible:: Conclusion ---------- -The new virtual machine introduced in numexpr 2.0 brings more performance in -many different scenarios (broadcast, non-native dtypes, Fortran-orderd arrays), -while it shows slightly worse performance for small arrays. However, as -numexpr is more geared to compute large arrays, the new virtual machine should -be good news for numexpr users in general. \ No newline at end of file +The new virtual machine introduced in numexpr 2.0 brings more performance in +many different scenarios (broadcast, non-native dtypes, Fortran-orderd arrays), +while it shows slightly worse performance for small arrays. However, as +numexpr is more geared to compute large arrays, the new virtual machine should +be good news for numexpr users in general. diff --git a/issues/issue418.py b/issues/issue418.py index b871c65..31ca2fc 100644 --- a/issues/issue418.py +++ b/issues/issue418.py @@ -1,7 +1,9 @@ +from time import perf_counter as pc + +import matplotlib.pyplot as plt import numpy as np + import numexpr as ne -import matplotlib.pyplot as plt -from time import perf_counter as pc # geomspace seems to be very slow, just a warning about setting `n` too high. # n = 2**24 diff --git a/numexpr/__init__.py b/numexpr/__init__.py index 648b869..63bb9e9 100644 --- a/numexpr/__init__.py +++ b/numexpr/__init__.py @@ -21,21 +21,20 @@ """ -from numexpr.interpreter import MAX_THREADS, use_vml, __BLOCK_SIZE1__ +from numexpr.interpreter import __BLOCK_SIZE1__, MAX_THREADS, use_vml is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE -# cpuinfo imports were moved into the test submodule function that calls them +# cpuinfo imports were moved into the test submodule function that calls them # to improve import times. 
from numexpr.expressions import E -from numexpr.necompiler import (NumExpr, disassemble, evaluate, re_evaluate, - validate) - -from numexpr.utils import (_init_num_threads, - get_vml_version, set_vml_accuracy_mode, set_vml_num_threads, - set_num_threads, get_num_threads, - detect_number_of_cores, detect_number_of_threads) +from numexpr.necompiler import (NumExpr, disassemble, evaluate, re_evaluate, + validate) +from numexpr.utils import (_init_num_threads, detect_number_of_cores, + detect_number_of_threads, get_num_threads, + get_vml_version, set_num_threads, + set_vml_accuracy_mode, set_vml_num_threads) # Detect the number of cores ncores = detect_number_of_cores() @@ -45,6 +44,7 @@ # set_vml_num_threads(1) from . import version + __version__ = version.version def print_versions(): @@ -63,4 +63,4 @@ def test(verbosity=1): return numexpr.tests.test(verbosity=verbosity) except ImportError: # To maintain Python 2.6 compatibility we have simple error handling - raise ImportError('`numexpr.tests` could not be imported, likely it was excluded from the distribution.') \ No newline at end of file + raise ImportError('`numexpr.tests` could not be imported, likely it was excluded from the distribution.') diff --git a/numexpr/cpuinfo.py b/numexpr/cpuinfo.py index 4a57d3c..897a4ca 100755 --- a/numexpr/cpuinfo.py +++ b/numexpr/cpuinfo.py @@ -23,12 +23,14 @@ __all__ = ['cpu'] -import sys, re, types +import inspect import os +import platform +import re import subprocess +import sys +import types import warnings -import platform -import inspect is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE diff --git a/numexpr/expressions.py b/numexpr/expressions.py index 419d7dc..5924c5f 100644 --- a/numexpr/expressions.py +++ b/numexpr/expressions.py @@ -35,6 +35,7 @@ from numexpr import interpreter + class Expression(): def __getattr__(self, name): @@ -269,10 +270,10 @@ def rtruediv_op(a, b): @ophelper def pow_op(a, b): - + if isinstance(b, ConstantNode): x 
= b.value - if ( a.astKind in ('int', 'long') and + if ( a.astKind in ('int', 'long') and b.astKind in ('int', 'long') and x < 0) : raise ValueError( 'Integers to negative integer powers are not allowed.') diff --git a/numexpr/interp_body.cpp b/numexpr/interp_body.cpp index 09b9da9..573ce8c 100644 --- a/numexpr/interp_body.cpp +++ b/numexpr/interp_body.cpp @@ -7,13 +7,13 @@ See LICENSE.txt for details about copyright and rights to use. **********************************************************************/ -// WARNING: This file is included multiple times in `interpreter.cpp`. It is -// essentially a very macro-heavy jump table. Interpretation is best done by +// WARNING: This file is included multiple times in `interpreter.cpp`. It is +// essentially a very macro-heavy jump table. Interpretation is best done by // the developer by expanding all macros (e.g. adding `'-E'` to the `extra_cflags` // argument in `setup.py` and looking at the resulting `interpreter.cpp`. // -// Changes made to this file will not be recognized by the compile, so the developer -// must make a trivial change is made to `interpreter.cpp` or delete the `build/` +// Changes made to this file will not be recognized by the compile, so the developer +// must make a trivial change is made to `interpreter.cpp` or delete the `build/` // directory in-between each build. 
{ #define VEC_LOOP(expr) for(j = 0; j < BLOCK_SIZE; j++) { \ diff --git a/numexpr/interpreter.cpp b/numexpr/interpreter.cpp index edebd71..dbfcca1 100644 --- a/numexpr/interpreter.cpp +++ b/numexpr/interpreter.cpp @@ -25,7 +25,7 @@ #define fmin min #define NE_INFINITY (DBL_MAX+DBL_MAX) #define NE_NAN (INFINITY-INFINITY) -#else +#else #define NE_INFINITY INFINITY #define NE_NAN NAN #endif @@ -556,7 +556,7 @@ stringcontains(const char *haystack_start, const char *needle_start, npy_intp ma size_t si = 0; size_t min_len = min(needle_len, haystack_len); - while (*haystack && *needle && si < min_len) + while (si < min_len && *haystack && *needle) { ok &= *haystack++ == *needle++; si++; @@ -573,7 +573,7 @@ stringcontains(const char *haystack_start, const char *needle_start, npy_intp ma } /* calc haystack length */ - while (*haystack && si < haystack_len) { + while (si < haystack_len && *haystack) { haystack++; si++; } @@ -652,6 +652,7 @@ int vm_engine_iter_task(NpyIter *iter, npy_intp *memsteps, /* Then finish off the rest */ if (block_size > 0) do { + block_size = *size_ptr; #define REDUCTION_INNER_LOOP #define BLOCK_SIZE block_size #include "interp_body.cpp" @@ -698,6 +699,7 @@ vm_engine_iter_outer_reduce_task(NpyIter *iter, npy_intp *memsteps, /* Then finish off the rest */ if (block_size > 0) do { + block_size = *size_ptr; #define BLOCK_SIZE block_size #define NO_OUTPUT_BUFFERING // Because it's a reduction #include "interp_body.cpp" @@ -1260,7 +1262,7 @@ NumExpr_run(NumExprObject *self, PyObject *args, PyObject *kwds) PyArrayObject *singleton; bool writeback; // NOTE: cannot assign on declaration due to `goto` statements - singleton = NULL; + singleton = NULL; writeback = false; if (n_inputs == 0) { char retsig = get_return_sig(self->program); @@ -1319,10 +1321,10 @@ NumExpr_run(NumExprObject *self, PyObject *args, PyObject *kwds) /* Allocate the iterator or nested iterators */ if (reduction_size < 0 || full_reduction) { /* When there's no reduction, reduction_size 
is 1 as well */ - // RAM: in issue #277 this was also the case for reductions on arrays - // with axis=0 having singleton dimension, i.e. such ops were interpreted - // as full_reductions when they weren't in Numpy. As such, the default - // reduction_size is now -1 and we add the flag for full_reduction, + // RAM: in issue #277 this was also the case for reductions on arrays + // with axis=0 having singleton dimension, i.e. such ops were interpreted + // as full_reductions when they weren't in Numpy. As such, the default + // reduction_size is now -1 and we add the flag for full_reduction, // e.g. ne.evaluate("sum(a)")" iter = NpyIter_AdvancedNew(n_inputs+1, operands, NPY_ITER_BUFFERED| diff --git a/numexpr/interpreter.hpp b/numexpr/interpreter.hpp index f9ac1c7..93c6e49 100644 --- a/numexpr/interpreter.hpp +++ b/numexpr/interpreter.hpp @@ -75,7 +75,7 @@ struct thread_data { int ret_code; int *pc_error; char **errmsg; - // NOTE: memsteps, iter, and reduce_iter are arrays, they MUST be allocated + // NOTE: memsteps, iter, and reduce_iter are arrays, they MUST be allocated // to length `global_max_threads` before module load. 
// One memsteps array per thread // npy_intp *memsteps[MAX_THREADS]; diff --git a/numexpr/module.cpp b/numexpr/module.cpp index 66b5b77..e7d6ded 100644 --- a/numexpr/module.cpp +++ b/numexpr/module.cpp @@ -51,7 +51,9 @@ void *th_worker(void *tidptr) while (1) { /* Sentinels have to be initialised yet */ - gs.init_sentinels_done = 0; + if (tid == 0) { + gs.init_sentinels_done = 0; + } /* Meeting point for all threads (wait for initialization) */ pthread_mutex_lock(&gs.count_threads_mutex); @@ -380,7 +382,7 @@ Py_set_num_threads(PyObject *self, PyObject *args) } static PyObject* -Py_get_num_threads(PyObject *self, PyObject *args) +Py_get_num_threads(PyObject *self, PyObject *args) { int n_thread; n_thread = gs.nthreads; @@ -477,6 +479,10 @@ PyInit_interpreter(void) { if (m == NULL) INITERROR; + #ifdef Py_GIL_DISABLED + PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); + #endif + Py_INCREF(&NumExprType); PyModule_AddObject(m, "NumExpr", (PyObject *)&NumExprType); diff --git a/numexpr/module.hpp b/numexpr/module.hpp index cf7b571..079a17f 100644 --- a/numexpr/module.hpp +++ b/numexpr/module.hpp @@ -23,7 +23,7 @@ struct global_state { int end_threads; /* should exisiting threads end? */ // pthread_t threads[MAX_THREADS]; /* opaque structure for threads */ // int tids[MAX_THREADS]; /* ID per each thread */ - /* NOTE: threads and tids are arrays, they MUST be allocated to length + /* NOTE: threads and tids are arrays, they MUST be allocated to length `global_max_threads` before module load. 
*/ pthread_t *threads; /* opaque structure for threads */ int *tids; /* ID per each thread */ @@ -36,7 +36,7 @@ struct global_state { /* Synchronization variables for threadpool state */ pthread_mutex_t count_mutex; int count_threads; - int barrier_passed; /* indicates if the thread pool's thread barrier + int barrier_passed; /* indicates if the thread pool's thread barrier is unlocked and ready for the VM to process.*/ pthread_mutex_t count_threads_mutex; pthread_cond_t count_threads_cv; diff --git a/numexpr/necompiler.py b/numexpr/necompiler.py index 98aee4c..4ada878 100644 --- a/numexpr/necompiler.py +++ b/numexpr/necompiler.py @@ -8,17 +8,18 @@ # rights to use. #################################################################### -from typing import Optional, Dict import __future__ -import sys + import os -import threading import re +import sys +import threading +from typing import Dict, Optional import numpy is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE -from numexpr import interpreter, expressions, use_vml +from numexpr import expressions, interpreter, use_vml from numexpr.utils import CacheDict, ContextDict # Declare a double type that does not exist in Python space @@ -28,7 +29,7 @@ int_ = numpy.int32 long_ = numpy.int64 -typecode_to_kind = {'b': 'bool', 'i': 'int', 'l': 'long', 'f': 'float', 'd': 'double', +typecode_to_kind = {'b': 'bool', 'i': 'int', 'l': 'long', 'f': 'float', 'd': 'double', 'c': 'complex', 'n': 'none', 's': 'str'} kind_to_typecode = {'bool': 'b', 'int': 'i', 'long': 'l', 'float': 'f', 'double': 'd', 'complex': 'c', 'bytes': 's', 'str': 's', 'none': 'n'} @@ -104,11 +105,11 @@ def __eq__(self, other): if getattr(self, name) != getattr(other, name): return False return True - + def __lt__(self,other): - # RAM: this is a fix for issue #88 whereby sorting on constants + # RAM: this is a fix for issue #88 whereby sorting on constants # that may be of astKind == 'complex' but type(self.value) == int or float - 
# Here we let NumPy sort as it will cast data properly for comparison + # Here we let NumPy sort as it will cast data properly for comparison # when the Python built-ins will raise an error. if self.astType == 'constant': if self.astKind == other.astKind: @@ -271,7 +272,7 @@ def __str__(self): def stringToExpression(s, types, context, sanitize: bool=True): """Given a string, convert it to a tree of ExpressionNode's. """ - # sanitize the string for obvious attack vectors that NumExpr cannot + # sanitize the string for obvious attack vectors that NumExpr cannot # parse into its homebrew AST. This is to protect the call to `eval` below. # We forbid `;`, `:`. `[` and `__`, and attribute access via '.'. # We cannot ban `.real` or `.imag` however... @@ -281,7 +282,7 @@ def stringToExpression(s, types, context, sanitize: bool=True): skip_quotes = re.sub(r'(\'[^\']*\')', '', no_whitespace) if _blacklist_re.search(skip_quotes) is not None: raise ValueError(f'Expression {s} has forbidden control characters.') - + old_ctx = expressions._context.get_current_context() try: expressions._context.set_new_context(context) @@ -307,7 +308,7 @@ def stringToExpression(s, types, context, sanitize: bool=True): # now build the expression ex = eval(c, names) - + if expressions.isConstant(ex): ex = expressions.ConstantNode(ex, expressions.getKind(ex)) elif not isinstance(ex, expressions.ExpressionNode): @@ -363,7 +364,7 @@ def getConstants(ast): a = 1 + 3j; b = 5.0 ne.evaluate('a*2 + 15j - b') """ - constant_registers = set([node.reg for node in ast.allOf("constant")]) + constant_registers = set([node.reg for node in ast.allOf("constant")]) constants_order = sorted([r.node for r in constant_registers]) constants = [convertConstantToKind(a.value, a.astKind) for a in constants_order] @@ -557,7 +558,7 @@ def getContext(kwargs, _frame_depth=1): context[name] = value else: raise ValueError("'%s' must be one of %s" % (name, allowed)) - + if d: raise ValueError("Unknown keyword argument '%s'" % 
d.popitem()[0]) if context['truediv'] == 'auto': @@ -657,7 +658,7 @@ def disassemble(nex): def parseOp(op): name, sig = [*op.rsplit(b'_', 1), ''][:2] - return name, sig + return name, sig def getArg(pc, offset): arg = nex.program[pc + (offset if offset < 4 else offset+1)] @@ -752,7 +753,7 @@ def getArguments(names, local_dict=None, global_dict=None, _frame_depth: int=2): if global_dict is None: global_dict = frame_globals - # If `call_frame` is the top frame of the interpreter we can't clear its + # If `call_frame` is the top frame of the interpreter we can't clear its # `local_dict`, because it is actually the `global_dict`. clear_local_dict = clear_local_dict and not frame_globals is local_dict @@ -774,23 +775,26 @@ def getArguments(names, local_dict=None, global_dict=None, _frame_depth: int=2): # Dictionaries for caching variable names and compiled expressions -_names_cache = CacheDict(256) -_numexpr_cache = CacheDict(256) -_numexpr_last = ContextDict() +# _names_cache = CacheDict(256) +_names_cache = threading.local() +# _numexpr_cache = CacheDict(256) +_numexpr_cache = threading.local() +# _numexpr_last = ContextDict() +_numexpr_last = threading.local() evaluate_lock = threading.Lock() -def validate(ex: str, - local_dict: Optional[Dict] = None, +def validate(ex: str, + local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, - out: numpy.ndarray = None, - order: str = 'K', - casting: str = 'safe', + out: numpy.ndarray = None, + order: str = 'K', + casting: str = 'safe', _frame_depth: int = 2, sanitize: Optional[bool] = None, **kwargs) -> Optional[Exception]: r""" Validate a NumExpr expression with the given `local_dict` or `locals()`. - Returns `None` on success and the Exception object if one occurs. Note that + Returns `None` on success and the Exception object if one occurs. Note that you can proceed directly to call `re_evaluate()` if you use `validate()` to sanitize your expressions and variables in advance. 
@@ -835,30 +839,38 @@ def validate(ex: str, * 'unsafe' means any data conversions may be done. sanitize: Optional[bool] - Both `validate` and by extension `evaluate` call `eval(ex)`, which is - potentially dangerous on unsanitized inputs. As such, NumExpr by default - performs simple sanitization, banning the character ':;[', the + Both `validate` and by extension `evaluate` call `eval(ex)`, which is + potentially dangerous on unsanitized inputs. As such, NumExpr by default + performs simple sanitization, banning the character ':;[', the dunder '__[\w+]__', and attribute access to all but '.real' and '.imag'. - - Using `None` defaults to `True` unless the environment variable - `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. + + Using `None` defaults to `True` unless the environment variable + `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. Nominally this can be set via `os.environ` before `import numexpr`. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. 
Note ---- - + """ global _numexpr_last + if not hasattr(_numexpr_last, 'l'): + _numexpr_last.l = ContextDict() + + if not hasattr(_names_cache, 'c'): + _names_cache.c = CacheDict(256) + + if not hasattr(_numexpr_cache, 'c'): + _numexpr_cache.c = CacheDict(256) try: - + if not isinstance(ex, str): raise ValueError("must specify expression as a string") - + if sanitize is None: if 'NUMEXPR_SANITIZE' in os.environ: sanitize = bool(int(os.environ['NUMEXPR_SANITIZE'])) @@ -868,9 +880,9 @@ def validate(ex: str, # Get the names for this expression context = getContext(kwargs) expr_key = (ex, tuple(sorted(context.items()))) - if expr_key not in _names_cache: - _names_cache[expr_key] = getExprNames(ex, context, sanitize=sanitize) - names, ex_uses_vml = _names_cache[expr_key] + if expr_key not in _names_cache.c: + _names_cache.c[expr_key] = getExprNames(ex, context, sanitize=sanitize) + names, ex_uses_vml = _names_cache.c[expr_key] arguments = getArguments(names, local_dict, global_dict, _frame_depth=_frame_depth) # Create a signature @@ -880,22 +892,22 @@ def validate(ex: str, # Look up numexpr if possible. 
numexpr_key = expr_key + (tuple(signature),) try: - compiled_ex = _numexpr_cache[numexpr_key] + compiled_ex = _numexpr_cache.c[numexpr_key] except KeyError: - compiled_ex = _numexpr_cache[numexpr_key] = NumExpr(ex, signature, sanitize=sanitize, **context) + compiled_ex = _numexpr_cache.c[numexpr_key] = NumExpr(ex, signature, sanitize=sanitize, **context) kwargs = {'out': out, 'order': order, 'casting': casting, 'ex_uses_vml': ex_uses_vml} - _numexpr_last.set(ex=compiled_ex, argnames=names, kwargs=kwargs) + _numexpr_last.l.set(ex=compiled_ex, argnames=names, kwargs=kwargs) except Exception as e: return e return None -def evaluate(ex: str, - local_dict: Optional[Dict] = None, +def evaluate(ex: str, + local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, - out: numpy.ndarray = None, - order: str = 'K', - casting: str = 'safe', + out: numpy.ndarray = None, + order: str = 'K', + casting: str = 'safe', sanitize: Optional[bool] = None, _frame_depth: int = 3, **kwargs) -> numpy.ndarray: @@ -948,27 +960,27 @@ def evaluate(ex: str, performs simple sanitization, banning the characters ':;[', the dunder '__[\w+]__', and attribute access to all but '.real' and '.imag'. - Using `None` defaults to `True` unless the environment variable - `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. + Using `None` defaults to `True` unless the environment variable + `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. Nominally this can be set via `os.environ` before `import numexpr`. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. 
""" - # We could avoid code duplication if we called validate and then re_evaluate + # We could avoid code duplication if we called validate and then re_evaluate # here, but we have difficulties with the `sys.getframe(2)` call in # `getArguments` - e = validate(ex, local_dict=local_dict, global_dict=global_dict, - out=out, order=order, casting=casting, + e = validate(ex, local_dict=local_dict, global_dict=global_dict, + out=out, order=order, casting=casting, _frame_depth=_frame_depth, sanitize=sanitize, **kwargs) if e is None: return re_evaluate(local_dict=local_dict, global_dict=global_dict, _frame_depth=_frame_depth) else: raise e - -def re_evaluate(local_dict: Optional[Dict] = None, + +def re_evaluate(local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, _frame_depth: int=2) -> numpy.ndarray: """ @@ -983,17 +995,19 @@ def re_evaluate(local_dict: Optional[Dict] = None, local_dict: dictionary, optional A dictionary that replaces the local operands in current frame. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. 
""" global _numexpr_last + if not hasattr(_numexpr_last, 'l'): + _numexpr_last.l = ContextDict() try: - compiled_ex = _numexpr_last['ex'] + compiled_ex = _numexpr_last.l['ex'] except KeyError: raise RuntimeError("A previous evaluate() execution was not found, please call `validate` or `evaluate` once before `re_evaluate`") - argnames = _numexpr_last['argnames'] + argnames = _numexpr_last.l['argnames'] args = getArguments(argnames, local_dict, global_dict, _frame_depth=_frame_depth) - kwargs = _numexpr_last['kwargs'] + kwargs = _numexpr_last.l['kwargs'] with evaluate_lock: return compiled_ex(*args, **kwargs) diff --git a/numexpr/numexpr_config.hpp b/numexpr/numexpr_config.hpp index 0663c6d..2bf0091 100644 --- a/numexpr/numexpr_config.hpp +++ b/numexpr/numexpr_config.hpp @@ -19,7 +19,7 @@ #define BLOCK_SIZE1 1024 #endif -// The default threadpool size. It's prefer that the user set this via an +// The default threadpool size. It's prefer that the user set this via an // environment variable, "NUMEXPR_MAX_THREADS" #define DEFAULT_MAX_THREADS 64 diff --git a/numexpr/numexpr_object.cpp b/numexpr/numexpr_object.cpp index e788d1c..b20aef0 100644 --- a/numexpr/numexpr_object.cpp +++ b/numexpr/numexpr_object.cpp @@ -405,4 +405,3 @@ PyTypeObject NumExprType = { 0, /* tp_alloc */ NumExpr_new, /* tp_new */ }; - diff --git a/numexpr/tests/__init__.py b/numexpr/tests/__init__.py index 3fff411..f47c8cc 100644 --- a/numexpr/tests/__init__.py +++ b/numexpr/tests/__init__.py @@ -8,7 +8,7 @@ # rights to use. 
#################################################################### -from numexpr.tests.test_numexpr import test, print_versions +from numexpr.tests.test_numexpr import print_versions, test if __name__ == '__main__': test() diff --git a/numexpr/tests/test_numexpr.py b/numexpr/tests/test_numexpr.py index 62210b4..2731b32 100644 --- a/numexpr/tests/test_numexpr.py +++ b/numexpr/tests/test_numexpr.py @@ -11,31 +11,37 @@ import os -import sys import platform +import subprocess +import sys +import unittest import warnings from contextlib import contextmanager -import subprocess +from unittest.mock import MagicMock -import numpy as np -from numpy import ( - array, arange, empty, zeros, int32, int64, uint16, cdouble, float64, rec, - copy, ones_like, where, all as alltrue, linspace, - sum, prod, sqrt, fmod, floor, ceil, - sin, cos, tan, arcsin, arccos, arctan, arctan2, - sinh, cosh, tanh, arcsinh, arccosh, arctanh, - log, log1p, log10, exp, expm1, conj) import numpy -from numpy.testing import (assert_equal, assert_array_equal, - assert_array_almost_equal, assert_allclose) -from numpy import shape, allclose, array_equal, ravel, isnan, isinf +import numpy as np +from numpy import all as alltrue +from numpy import (allclose, arange, arccos, arccosh, arcsin, arcsinh, arctan, + arctan2, arctanh, array, array_equal, cdouble, ceil, conj, + copy, cos, cosh, empty, exp, expm1, float64, floor, fmod, + int32, int64, isinf, isnan, linspace, log, log1p, log10, + ones_like, prod, ravel, rec, shape, sin, sinh, sqrt, sum, + tan, tanh, uint16, where, zeros) +from numpy.testing import (assert_allclose, assert_array_almost_equal, + assert_array_equal, assert_equal) import numexpr -from numexpr import E, NumExpr, evaluate, re_evaluate, validate, disassemble, use_vml +from numexpr import (E, NumExpr, disassemble, evaluate, re_evaluate, use_vml, + validate) from numexpr.expressions import ConstantNode from numexpr.utils import detect_number_of_cores -import unittest +try: + import pytest + 
pytest_available = True +except ImportError: + pytest_available = False TestCase = unittest.TestCase @@ -44,6 +50,15 @@ MAX_THREADS = 16 +if not pytest_available: + def identity(f): + return f + + pytest = MagicMock() + pytest.mark = MagicMock() + pytest.mark.thread_unsafe = identity + + class test_numexpr(TestCase): """Testing with 1 thread""" nthreads = 1 @@ -318,6 +333,7 @@ def test_refcount(self): evaluate('1') assert sys.getrefcount(a) == 2 + @pytest.mark.thread_unsafe def test_locals_clears_globals(self): # Check for issue #313, whereby clearing f_locals also clear f_globals # if in the top-frame. This cannot be done inside `unittest` as it is always @@ -341,6 +357,7 @@ def test_locals_clears_globals(self): +@pytest.mark.thread_unsafe class test_numexpr2(test_numexpr): """Testing with 2 threads""" nthreads = 2 @@ -512,6 +529,7 @@ def test_illegal_value(self): else: self.fail() + @pytest.mark.thread_unsafe def test_sanitize(self): with _environment('NUMEXPR_SANITIZE', '1'): # Forbid dunder @@ -590,7 +608,7 @@ def test_sanitize(self): x = np.array(['a', 'b'], dtype=bytes) evaluate("x == 'b:'") - + @pytest.mark.thread_unsafe def test_no_sanitize(self): try: # Errors on compile() after eval() evaluate('import os;', sanitize=False) @@ -677,6 +695,7 @@ def test_ex_uses_vml(self): if 'sparc' not in platform.machine(): # Execution order set here so as to not use too many threads # during the rest of the execution. See #33 for details. 
+ @pytest.mark.thread_unsafe def test_changing_nthreads_00_inc(self): a = linspace(-1, 1, 1000000) b = ((.25 * a + .75) * a - 1.5) * a - 2 @@ -685,6 +704,7 @@ def test_changing_nthreads_00_inc(self): c = evaluate("((.25*a + .75)*a - 1.5)*a - 2") assert_array_almost_equal(b, c) + @pytest.mark.thread_unsafe def test_changing_nthreads_01_dec(self): a = linspace(-1, 1, 1000000) b = ((.25 * a + .75) * a - 1.5) * a - 2 @@ -1123,6 +1143,7 @@ def _environment(key, value): del os.environ[key] # Test cases for the threading configuration +@pytest.mark.thread_unsafe class test_threading_config(TestCase): def test_max_threads_unset(self): # Has to be done in a subprocess as `importlib.reload` doesn't let us @@ -1306,6 +1327,7 @@ def _worker(qout=None): # Case test for subprocesses (via multiprocessing module) class test_subprocess(TestCase): + @pytest.mark.thread_unsafe def test_multiprocess(self): try: import multiprocessing as mp @@ -1328,9 +1350,10 @@ def test_multiprocess(self): def print_versions(): """Print the versions of software that numexpr relies on.""" # from pkg_resources import parse_version - from numexpr.cpuinfo import cpu import platform + from numexpr.cpuinfo import cpu + print('-=' * 38) print('Numexpr version: %s' % numexpr.__version__) print('NumPy version: %s' % np.__version__) @@ -1371,8 +1394,8 @@ def test(verbosity=1): def suite(): - import unittest import platform as pl + import unittest theSuite = unittest.TestSuite() niter = 1 diff --git a/numexpr/utils.py b/numexpr/utils.py index cc61833..9e45fbe 100644 --- a/numexpr/utils.py +++ b/numexpr/utils.py @@ -9,20 +9,22 @@ #################################################################### import logging + log = logging.getLogger(__name__) +import contextvars import os import subprocess -import contextvars -from numexpr.interpreter import _set_num_threads, _get_num_threads, MAX_THREADS from numexpr import use_vml +from numexpr.interpreter import MAX_THREADS, _get_num_threads, _set_num_threads + from . 
import version if use_vml: - from numexpr.interpreter import ( - _get_vml_version, _set_vml_accuracy_mode, _set_vml_num_threads, - _get_vml_num_threads) + from numexpr.interpreter import (_get_vml_num_threads, _get_vml_version, + _set_vml_accuracy_mode, + _set_vml_num_threads) def get_vml_version(): @@ -118,9 +120,9 @@ def get_num_threads(): def _init_num_threads(): """ - Detects the environment variable 'NUMEXPR_MAX_THREADS' to set the threadpool - size, and if necessary the slightly redundant 'NUMEXPR_NUM_THREADS' or - 'OMP_NUM_THREADS' env vars to set the initial number of threads used by + Detects the environment variable 'NUMEXPR_MAX_THREADS' to set the threadpool + size, and if necessary the slightly redundant 'NUMEXPR_NUM_THREADS' or + 'OMP_NUM_THREADS' env vars to set the initial number of threads used by the virtual machine. """ # Any platform-specific short-circuits @@ -140,7 +142,7 @@ def _init_num_threads(): env_configured = True n_cores = MAX_THREADS else: - # The use has not set 'NUMEXPR_MAX_THREADS', so likely they have not + # The use has not set 'NUMEXPR_MAX_THREADS', so likely they have not # configured NumExpr as desired, so we emit info logs. if n_cores > MAX_THREADS: log.info('Note: detected %d virtual cores but NumExpr set to maximum of %d, check "NUMEXPR_MAX_THREADS" environment variable.'%(n_cores, MAX_THREADS)) @@ -149,7 +151,7 @@ def _init_num_threads(): log.info('Note: NumExpr detected %d cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.'%n_cores) n_cores = 16 - # Now we check for 'NUMEXPR_NUM_THREADS' or 'OMP_NUM_THREADS' to set the + # Now we check for 'NUMEXPR_NUM_THREADS' or 'OMP_NUM_THREADS' to set the # actual number of threads used. 
if 'NUMEXPR_NUM_THREADS' in os.environ and os.environ['NUMEXPR_NUM_THREADS'] != '': requested_threads = int(os.environ['NUMEXPR_NUM_THREADS']) @@ -165,7 +167,7 @@ def _init_num_threads(): set_num_threads(requested_threads) return requested_threads - + def detect_number_of_cores(): """ Detects the number of cores on a system. Cribbed from pp. diff --git a/numexpr/win32/stdint.h b/numexpr/win32/stdint.h index b7e7112..c66267a 100644 --- a/numexpr/win32/stdint.h +++ b/numexpr/win32/stdint.h @@ -17,7 +17,7 @@ * * mwb: This was modified in the following ways: * - * - make it compatible with Visual C++ 6 (which uses + * - make it compatible with Visual C++ 6 (which uses * non-standard keywords and suffixes for 64-bit types) * - some environments need stddef.h included (for wchar stuff?) * - handle the fact that Microsoft's limits.h header defines @@ -70,9 +70,9 @@ typedef unsigned uint_least32_t; typedef __STDINT_LONGLONG int_least64_t; typedef unsigned __STDINT_LONGLONG uint_least64_t; -/* 7.18.1.3 Fastest minimum-width integer types +/* 7.18.1.3 Fastest minimum-width integer types * Not actually guaranteed to be fastest for all purposes - * Here we use the exact-width types for 8 and 16-bit ints. + * Here we use the exact-width types for 8 and 16-bit ints. 
*/ typedef char int_fast8_t; typedef unsigned char uint_fast8_t; @@ -110,7 +110,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS) /* 7.18.2.1 Limits of exact-width integer types */ -#define INT8_MIN (-128) +#define INT8_MIN (-128) #define INT16_MIN (-32768) #define INT32_MIN (-2147483647 - 1) #define INT64_MIN (PASTE( -9223372036854775807, __STDINT_LONGLONG_SUFFIX) - 1) @@ -158,7 +158,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #define UINT_FAST64_MAX UINT64_MAX /* 7.18.2.4 Limits of integer types capable of holding - object pointers */ + object pointers */ #ifdef _WIN64 #define INTPTR_MIN INT64_MIN #define INTPTR_MAX INT64_MAX @@ -186,7 +186,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #define SIZE_MAX UINTPTR_MAX #endif -#ifndef WCHAR_MIN /* also in wchar.h */ +#ifndef WCHAR_MIN /* also in wchar.h */ #define WCHAR_MIN 0 #define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */ #endif diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..4fec170 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + thread_unsafe: mark a test as thread unsafe diff --git a/setup.py b/setup.py index 82f3651..64d9f20 100644 --- a/setup.py +++ b/setup.py @@ -9,12 +9,13 @@ # rights to use. #################################################################### -import os, os.path as op -import platform import configparser -import numpy as np -from setuptools import setup, Extension +import os +import os.path as op +import platform +import numpy as np +from setuptools import Extension, setup with open('requirements.txt') as f: requirements = f.read().splitlines() @@ -40,7 +41,7 @@ libs = [] # Pre-built libraries ONLY, like python36.so clibs = [] def_macros = [ - # keep in sync with minimal runtime requirement (requirements.txt) + # keep in sync with minimal runtime requirement (requirements.txt) ('NPY_TARGET_VERSION', 'NPY_1_23_API_VERSION') ] sources = ['numexpr/interpreter.cpp',