diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 42e6a9d..b0077f8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,37 +6,56 @@ permissions: contents: read env: - CIBW_BEFORE_BUILD: pip install setuptools oldest-supported-numpy + CIBW_BEFORE_BUILD: pip install setuptools oldest-supported-numpy pytest + CIBW_BEFORE_TEST: pip install pytest CIBW_BUILD_VERBOSITY: 1 - CIBW_TEST_COMMAND: python -c "import sys, numexpr; sys.exit(0 if numexpr.test().wasSuccessful() else 1)" - CIBW_TEST_SKIP: "*macosx*arm64*" + CIBW_TEST_COMMAND: pytest --pyargs numexpr + # Testing on aarch64 takes too long, as it is currently emulated on GitHub Actions + CIBW_TEST_SKIP: "*linux*aarch64*" # Building for musllinux and aarch64 takes way too much time. # Moreover, NumPy is not providing musllinux for x86_64 either, so it's not worth it. CIBW_SKIP: "*musllinux*aarch64* *musllinux*x86_64*" jobs: build_wheels: - name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} - ${{ matrix.p_ver }} - runs-on: ${{ matrix.os }} + name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} + runs-on: ${{ matrix.runs-on || matrix.os }} permissions: contents: write env: - CIBW_BUILD: ${{ matrix.cibw_build }} + CIBW_BUILD: ${{ matrix.cibw_pattern }} CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_ARCHS_MACOS: "x86_64 arm64" + CIBW_ENABLE: cpython-freethreading strategy: + fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - arch: [x86_64, aarch64] - cibw_build: ["cp3{10,11,12,13}-*"] - p_ver: ["3.10-3.13"] - exclude: - - os: windows-latest - arch: aarch64 - # cibuild is already in charge to build aarch64 (see CIBW_ARCHS_MACOS) - - os: macos-latest + include: + # Linux x86_64 builds + - os: ubuntu-latest + arch: x86_64 + cibw_pattern: "cp3{10,11,12,13,13t}-manylinux*" + artifact_name: "linux-x86_64" + + # Linux ARM64 builds (native runners) + - os: ubuntu-latest arch: aarch64 + cibw_pattern: "cp3{10,11,12,13,13t}-manylinux*" + 
artifact_name: "linux-aarch64" + # Don't use native runners for now (looks like wait times are too long) + #runs-on: ["ubuntu-latest", "arm64"] + # Windows builds + - os: windows-latest + arch: x86_64 + cibw_pattern: "cp3{10,11,12,13,13t}-win*" + artifact_name: "windows-x86_64" + + # macOS builds (universal2) + - os: macos-latest + arch: x86_64 + cibw_pattern: "cp3{10,11,12,13,13t}-macosx*" + artifact_name: "macos-universal2" steps: - uses: actions/checkout@v3 @@ -45,17 +64,22 @@ jobs: with: python-version: '3.x' - - name: Install cibuildwheel + - name: Setup free-threading variables + if: ${{ contains(matrix.cibw_pattern, '13t') }} + shell: bash -l {0} run: | - python -m pip install cibuildwheel + echo "CIBW_BEFORE_BUILD=pip install setuptools numpy" >> "$GITHUB_ENV" + echo "CIBW_BEFORE_TEST=pip install pytest pytest-run-parallel" >> "$GITHUB_ENV" + echo "CIBW_TEST_COMMAND=pytest --parallel-threads=4 --pyargs numexpr" >> "$GITHUB_ENV" - - uses: docker/setup-qemu-action@v2 - if: ${{ matrix.arch == 'aarch64' }} - name: Set up QEMU + - name: Set up QEMU + if: matrix.arch == 'aarch64' + uses: docker/setup-qemu-action@v3 + with: + platforms: arm64 - name: Build wheels - run: | - python -m cibuildwheel --output-dir wheelhouse + uses: pypa/cibuildwheel@v2.23 - name: Make sdist if: ${{ matrix.os == 'windows-latest' }} @@ -65,6 +89,7 @@ jobs: - uses: actions/upload-artifact@v4 with: + name: ${{ matrix.artifact_name }} path: ./wheelhouse/* - name: Upload to GitHub Release diff --git a/.gitignore b/.gitignore index 928bf15..7bf6f98 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ artifact/ numexpr.egg-info/ *.pyc *.swp +*.so *~ doc/_build site.cfg diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..cb4e829 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml 
+ - id: debug-statements + +# Too many things to fix, let's just ignore it for now +#- repo: https://github.com/pycqa/flake8 +# rev: 7.0.0 +# hooks: +# - id: flake8 +# +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + +# Too many things to fix, let's just ignore it for now +#- repo: https://github.com/pre-commit/mirrors-mypy +# rev: v1.8.0 +# hooks: +# - id: mypy +# exclude: ^(docs/|setup.py) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index d2c3d13..3803a41 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -19,4 +19,4 @@ sphinx: # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - - requirements: doc/requirements.txt \ No newline at end of file + - requirements: doc/requirements.txt diff --git a/AUTHORS.txt b/AUTHORS.txt index 88b9047..57410db 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -23,7 +23,7 @@ Google Inc. contributed bug fixes. David Cox improved readability of the Readme. -Robert A. McLeod contributed bug fixes and ported the documentation to +Robert A. McLeod contributed bug fixes and ported the documentation to numexpr.readthedocs.io. He has served as the maintainer of the package since 2016 to 2023. diff --git a/README.rst b/README.rst index 9033d51..264fd2b 100644 --- a/README.rst +++ b/README.rst @@ -159,6 +159,24 @@ Usage array([ True, False, False], dtype=bool) +Free-threading support +---------------------- +Starting on CPython 3.13 onwards there is a new distribution that disables the +Global Interpreter Lock (GIL) altogether, thus increasing the performance yields +under multi-threaded conditions on a single interpreter, as opposed to having to use +multiprocessing. 
+ +Whilst numexpr has been demonstrated to work under free-threaded +CPython, considerations need to be taken when using numexpr native parallel +implementation vs using Python threads directly in order to prevent oversubscription. +We recommend either using the main CPython interpreter thread to spawn multiple C threads +using the parallel numexpr API, or spawning multiple CPython threads that do not use +the parallel API. + +For more information about free-threaded CPython, we recommend visiting the following +`community Wiki <https://py-free-threading.github.io/>`_ + + Documentation ------------- diff --git a/bench/boolean_timing.py b/bench/boolean_timing.py index fe07b31..0be0bf7 100644 --- a/bench/boolean_timing.py +++ b/bench/boolean_timing.py @@ -9,8 +9,10 @@ #################################################################### from __future__ import print_function + import sys import timeit + import numpy array_size = 5_000_000 diff --git a/bench/issue-36.py b/bench/issue-36.py index 9c356cf..611bddb 100644 --- a/bench/issue-36.py +++ b/bench/issue-36.py @@ -2,10 +2,14 @@ # performs better than the serial code. See issue #36 for details.
from __future__ import print_function + +from time import time + import numpy as np -import numexpr as ne from numpy.testing import assert_array_equal -from time import time + +import numexpr as ne + def bench(N): print("*** array length:", N) @@ -31,4 +35,3 @@ def bench(N): ne.set_num_threads(2) for N in range(10, 20): bench(2**N) - diff --git a/bench/issue-47.py b/bench/issue-47.py index 31c68a6..a48fbe2 100644 --- a/bench/issue-47.py +++ b/bench/issue-47.py @@ -1,4 +1,5 @@ import numpy + import numexpr numexpr.set_num_threads(8) diff --git a/bench/large_array_vs_numpy.py b/bench/large_array_vs_numpy.py index 72219a1..b480261 100644 --- a/bench/large_array_vs_numpy.py +++ b/bench/large_array_vs_numpy.py @@ -31,10 +31,12 @@ import os os.environ["NUMEXPR_NUM_THREADS"] = "16" +import threading +import timeit + import numpy as np + import numexpr as ne -import timeit -import threading array_size = 10**8 num_runs = 10 diff --git a/bench/multidim.py b/bench/multidim.py index 587f100..eeccd0b 100644 --- a/bench/multidim.py +++ b/bench/multidim.py @@ -12,9 +12,12 @@ # Based on a script provided by Andrew Collette. 
from __future__ import print_function + +import time + import numpy as np + import numexpr as nx -import time test_shapes = [ (100*100*100), @@ -90,5 +93,3 @@ def test_func(a, b, c): print("Simple: ", (stop1-start1)/nruns) print("Numexpr: ", (stop2-start2)/nruns) print("Chunked: ", (stop3-start3)/nruns) - - diff --git a/bench/poly.py b/bench/poly.py index 0f50290..3eb12b1 100644 --- a/bench/poly.py +++ b/bench/poly.py @@ -17,11 +17,13 @@ ####################################################################### from __future__ import print_function + import sys from time import time + import numpy as np -import numexpr as ne +import numexpr as ne #expr = ".25*x**3 + .75*x**2 - 1.5*x - 2" # the polynomial to compute expr = "((.25*x + .75)*x - 1.5)*x - 2" # a computer-friendly polynomial diff --git a/bench/timing.py b/bench/timing.py index c84a6f4..9c70610 100644 --- a/bench/timing.py +++ b/bench/timing.py @@ -9,7 +9,10 @@ #################################################################### from __future__ import print_function -import timeit, numpy + +import timeit + +import numpy array_size = 5e6 iterations = 2 diff --git a/bench/unaligned-simple.py b/bench/unaligned-simple.py index e168c78..b653c7a 100644 --- a/bench/unaligned-simple.py +++ b/bench/unaligned-simple.py @@ -13,8 +13,11 @@ """ from __future__ import print_function + from timeit import Timer + import numpy as np + import numexpr as ne niter = 10 diff --git a/bench/varying-expr.py b/bench/varying-expr.py index d04ab35..df7419c 100644 --- a/bench/varying-expr.py +++ b/bench/varying-expr.py @@ -13,9 +13,12 @@ # the latency of numexpr when working with small arrays. 
from __future__ import print_function + import sys from time import time + import numpy as np + import numexpr as ne N = 100 diff --git a/bench/vml_timing.py b/bench/vml_timing.py index 52f5003..57dd4d2 100644 --- a/bench/vml_timing.py +++ b/bench/vml_timing.py @@ -9,9 +9,12 @@ #################################################################### from __future__ import print_function + import sys import timeit + import numpy + import numexpr array_size = 5_000_000 diff --git a/bench/vml_timing2.py b/bench/vml_timing2.py index 32fdc62..1c460d0 100644 --- a/bench/vml_timing2.py +++ b/bench/vml_timing2.py @@ -4,11 +4,14 @@ # https://github.com/pydata/numexpr/wiki/NumexprMKL from __future__ import print_function + import datetime import sys +from time import time + import numpy as np + import numexpr as ne -from time import time N = int(2**26) diff --git a/bench/vml_timing3.py b/bench/vml_timing3.py index 04997ff..0086421 100644 --- a/bench/vml_timing3.py +++ b/bench/vml_timing3.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- +from timeit import default_timer as timer + import numpy as np + import numexpr as ne -from timeit import default_timer as timer x = np.ones(100000) scaler = -1J diff --git a/doc/api.rst b/doc/api.rst index 7d750e3..5d1bb0f 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -3,11 +3,11 @@ NumExpr API .. automodule:: numexpr :members: evaluate, re_evaluate, disassemble, NumExpr, get_vml_version, set_vml_accuracy_mode, set_vml_num_threads, set_num_threads, detect_number_of_cores, detect_number_of_threads - + .. py:attribute:: ncores The number of (virtual) cores detected. - + .. py:attribute:: nthreads The number of threads currently in-use. @@ -18,11 +18,11 @@ NumExpr API .. py:attribute:: version - The version of NumExpr. - - + The version of NumExpr. + + Tests submodule --------------- .. 
automodule:: numexpr.tests - :members: test, print_versions \ No newline at end of file + :members: test, print_versions diff --git a/doc/index.rst b/doc/index.rst index 02922c3..d517391 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -25,4 +25,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/doc/intro.rst b/doc/intro.rst index 11dbaaf..0d31925 100644 --- a/doc/intro.rst +++ b/doc/intro.rst @@ -1,25 +1,25 @@ How it works ============ -The string passed to :code:`evaluate` is compiled into an object representing the +The string passed to :code:`evaluate` is compiled into an object representing the expression and types of the arrays used by the function :code:`numexpr`. -The expression is first compiled using Python's :code:`compile` function (this means -that the expressions have to be valid Python expressions). From this, the -variable names can be taken. The expression is then evaluated using instances -of a special object that keep track of what is being done to them, and which +The expression is first compiled using Python's :code:`compile` function (this means +that the expressions have to be valid Python expressions). From this, the +variable names can be taken. The expression is then evaluated using instances +of a special object that keep track of what is being done to them, and which builds up the parse tree of the expression. -This parse tree is then compiled to a bytecode program, which describes how to -perform the operation element-wise. The virtual machine uses "vector registers": -each register is many elements wide (by default 4096 elements). The key to +This parse tree is then compiled to a bytecode program, which describes how to +perform the operation element-wise. The virtual machine uses "vector registers": +each register is many elements wide (by default 4096 elements). The key to NumExpr's speed is handling chunks of elements at a time. -There are two extremes to evaluating an expression elementwise. 
You can do each -operation as arrays, returning temporary arrays. This is what you do when you -use NumPy: :code:`2*a+3*b` uses three temporary arrays as large as :code:`a` or -:code:`b`. This strategy wastes memory (a problem if your arrays are large), -and also is not a good use of cache memory: for large arrays, the results of +There are two extremes to evaluating an expression elementwise. You can do each +operation as arrays, returning temporary arrays. This is what you do when you +use NumPy: :code:`2*a+3*b` uses three temporary arrays as large as :code:`a` or +:code:`b`. This strategy wastes memory (a problem if your arrays are large), +and also is not a good use of cache memory: for large arrays, the results of :code:`2*a` and :code:`3*b` won't be in cache when you do the add. The other extreme is to loop over each element, as in:: @@ -27,13 +27,13 @@ The other extreme is to loop over each element, as in:: for i in xrange(len(a)): c[i] = 2*a[i] + 3*b[i] -This doesn't consume extra memory, and is good for the cache, but, if the -expression is not compiled to machine code, you will have a big case statement -(or a bunch of if's) inside the loop, which adds a large overhead for each +This doesn't consume extra memory, and is good for the cache, but, if the +expression is not compiled to machine code, you will have a big case statement +(or a bunch of if's) inside the loop, which adds a large overhead for each element, and will hurt the branch-prediction used on the CPU. -:code:`numexpr` uses a in-between approach. Arrays are handled as chunks (of -4096 elements) at a time, using a register machine. As Python code, +:code:`numexpr` uses a in-between approach. Arrays are handled as chunks (of +4096 elements) at a time, using a register machine. 
As Python code, it looks something like this:: for i in xrange(0, len(a), 256): @@ -44,11 +44,11 @@ it looks something like this:: add(r2, r3, r2) c[i:i+128] = r2 -(remember that the 3-arg form stores the result in the third argument, -instead of allocating a new array). This achieves a good balance between -cache and branch-prediction. And the virtual machine is written entirely in -C, which makes it faster than the Python above. Furthermore the virtual machine -is also multi-threaded, which allows for efficient parallelization of NumPy +(remember that the 3-arg form stores the result in the third argument, +instead of allocating a new array). This achieves a good balance between +cache and branch-prediction. And the virtual machine is written entirely in +C, which makes it faster than the Python above. Furthermore the virtual machine +is also multi-threaded, which allows for efficient parallelization of NumPy operations. There is some more information and history at: @@ -58,12 +58,12 @@ http://www.bitsofbits.com/2014/09/21/numpy-micro-optimization-and-numexpr/ Expected performance ==================== -The range of speed-ups for NumExpr respect to NumPy can vary from 0.95x and 20x, -being 2x, 3x or 4x typical values, depending on the complexity of the -expression and the internal optimization of the operators used. The strided and -unaligned case has been optimized too, so if the expression contains such -arrays, the speed-up can increase significantly. Of course, you will need to -operate with large arrays (typically larger than the cache size of your CPU) +The range of speed-ups for NumExpr respect to NumPy can vary from 0.95x and 20x, +being 2x, 3x or 4x typical values, depending on the complexity of the +expression and the internal optimization of the operators used. The strided and +unaligned case has been optimized too, so if the expression contains such +arrays, the speed-up can increase significantly. 
Of course, you will need to +operate with large arrays (typically larger than the cache size of your CPU) to see these improvements in performance. Here there are some real timings. For the contiguous case:: diff --git a/doc/mkl.rst b/doc/mkl.rst index 6951655..0c706bb 100644 --- a/doc/mkl.rst +++ b/doc/mkl.rst @@ -1,19 +1,19 @@ NumExpr with Intel MKL ====================== -Numexpr has support for Intel's VML (included in Intel's MKL) in order to -accelerate the evaluation of transcendental functions on Intel CPUs. Here it +Numexpr has support for Intel's VML (included in Intel's MKL) in order to +accelerate the evaluation of transcendental functions on Intel CPUs. Here it is a small example on the kind of improvement you may get by using it. A first benchmark ----------------- -Firstly, we are going to exercise how MKL performs when computing a couple of -simple expressions. One is a pure algebraic one: :code:`2*y + 4*x` and the other +Firstly, we are going to exercise how MKL performs when computing a couple of +simple expressions. One is a pure algebraic one: :code:`2*y + 4*x` and the other contains transcendental functions: :code:`sin(x)**2 + cos(y)**2`. -For this, we are going to use this worksheet_. I (Francesc Alted) ran this -benchmark on a Intel Xeon E3-1245 v5 @ 3.50GHz. Here are the results when +For this, we are going to use this worksheet_. I (Francesc Alted) ran this +benchmark on a Intel Xeon E3-1245 v5 @ 3.50GHz. Here are the results when not using MKL:: NumPy version: 1.11.1 @@ -22,7 +22,7 @@ not using MKL:: Numexpr version: 2.6.1. Using MKL: False Time for an algebraic expression: 0.058 s / 19.116 GB/s Time for a transcendental expression: 0.283 s / 3.950 GB/s - + And now, using MKL:: @@ -34,14 +34,14 @@ And now, using MKL:: Time for a transcendental expression: 0.075 s / 14.975 GB/s -As you can see, numexpr using MKL can be up to 3.8x faster for the case of the -transcendental expression. 
Also, you can notice that the pure algebraic -expression is not accelerated at all. This is completely expected, as the -MKL is offering accelerations for CPU bounded functions (sin, cos, tan, exp, +As you can see, numexpr using MKL can be up to 3.8x faster for the case of the +transcendental expression. Also, you can notice that the pure algebraic +expression is not accelerated at all. This is completely expected, as the +MKL is offering accelerations for CPU bounded functions (sin, cos, tan, exp, log, sinh...) and not pure multiplications or adds. -Finally, note how numexpr+MKL can be up to 26x faster than using a pure NumPy -solution. And this was using a processor with just four physical cores; you +Finally, note how numexpr+MKL can be up to 26x faster than using a pure NumPy +solution. And this was using a processor with just four physical cores; you should expect more speedup as you throw more cores at that. .. _worksheet: https://github.com/pydata/numexpr/blob/master/bench/vml_timing2.py @@ -49,28 +49,28 @@ should expect more speedup as you throw more cores at that. More benchmarks (older) ----------------------- -Numexpr & VML can both use several threads for doing computations. Let's see -how performance improves by using 1 or 2 threads on a 2-core Intel CPU (Core2 +Numexpr & VML can both use several threads for doing computations. Let's see +how performance improves by using 1 or 2 threads on a 2-core Intel CPU (Core2 E8400 @ 3.00GHz). Using 1 thread ^^^^^^^^^^^^^^ -Here we have some benchmarks on the improvement of speed that Intel's VML can -achieve. First, look at times by some easy expression containing sine and +Here we have some benchmarks on the improvement of speed that Intel's VML can +achieve. 
First, look at times by some easy expression containing sine and cosine operations *without* using VML:: In [17]: ne.use_vml Out[17]: False - + In [18]: x = np.linspace(-1, 1, 1e6) - + In [19]: timeit np.sin(x)**2+np.cos(x)**2 10 loops, best of 3: 43.1 ms per loop - + In [20]: ne.set_num_threads(1) Out[20]: 2 - + In [21]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 10 loops, best of 3: 29.5 ms per loop @@ -79,15 +79,15 @@ and now using VML:: In [37]: ne.use_vml Out[37]: True - + In [38]: x = np.linspace(-1, 1, 1e6) - + In [39]: timeit np.sin(x)**2+np.cos(x)**2 10 loops, best of 3: 42.8 ms per loop - + In [40]: ne.set_num_threads(1) Out[40]: 2 - + In [41]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 19.8 ms per loop @@ -96,37 +96,37 @@ Hey, VML can accelerate computations by a 50% using a single CPU. That's great! Using 2 threads ^^^^^^^^^^^^^^^ -First, look at the time of the non-VML numexpr when using 2 threads:: +First, look at the time of the non-VML numexpr when using 2 threads:: In [22]: ne.set_num_threads(2) Out[22]: 1 - + In [23]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 15.3 ms per loop -OK. We've got an almost perfect 2x improvement in speed with regard to the 1 +OK. We've got an almost perfect 2x improvement in speed with regard to the 1 thread case. Let's see about the VML-powered numexpr version:: In [43]: ne.set_num_threads(2) Out[43]: 1 - + In [44]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 12.2 ms per loop -Ok, that's about 1.6x improvement over the 1 thread VML computation, and -still a 25% of improvement over the non-VML version. Good, native numexpr +Ok, that's about 1.6x improvement over the 1 thread VML computation, and +still a 25% of improvement over the non-VML version. Good, native numexpr multithreading code really looks very efficient! 
Numexpr native threading code vs VML's one ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -You may already know that both numexpr and Intel's VML do have support for -multithreaded computations, but you might be curious about which one is more -efficient, so here it goes a hint. First, using the VML multithreaded +You may already know that both numexpr and Intel's VML do have support for +multithreaded computations, but you might be curious about which one is more +efficient, so here it goes a hint. First, using the VML multithreaded implementation:: In [49]: ne.set_vml_num_threads(2) - + In [50]: ne.set_num_threads(1) Out[50]: 1 @@ -146,14 +146,14 @@ and now, using the native numexpr threading code:: 100 loops, best of 3: 12 ms per loop -This means that numexpr's native multithreaded code is about 40% faster than -VML's for this case. So, in general, you should use the former with numexpr +This means that numexpr's native multithreaded code is about 40% faster than +VML's for this case. So, in general, you should use the former with numexpr (and this is the default actually). Mixing numexpr's and VML multithreading capabilities ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Finally, you might be tempted to use both multithreading codes at the same +Finally, you might be tempted to use both multithreading codes at the same time, but you will be deceived about the improvement in performance:: In [57]: ne.set_vml_num_threads(2) @@ -161,7 +161,7 @@ time, but you will be deceived about the improvement in performance:: In [58]: timeit ne.evaluate('sin(x)**2+cos(x)**2') 100 loops, best of 3: 17.7 ms per loop -Your code actually performs much worse. That's normal too because you are -trying to run 4 threads on a 2-core CPU. For CPUs with many cores, you may -want to try with different threading configurations, but as a rule of thumb, -numexpr's one will generally win. \ No newline at end of file +Your code actually performs much worse. 
That's normal too because you are +trying to run 4 threads on a 2-core CPU. For CPUs with many cores, you may +want to try with different threading configurations, but as a rule of thumb, +numexpr's one will generally win. diff --git a/doc/release_notes.rst b/doc/release_notes.rst index 081e7f4..51d3212 100644 --- a/doc/release_notes.rst +++ b/doc/release_notes.rst @@ -1,4 +1,4 @@ Release Notes ============= -.. include:: ../RELEASE_NOTES.rst \ No newline at end of file +.. include:: ../RELEASE_NOTES.rst diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 3a3cf63..ce2ff9d 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -30,7 +30,7 @@ and it can also re_evaluate an expression:: Building -------- -*NumExpr* requires Python_ 3.7 or greater, and NumPy_ 1.13 or greater. It is +*NumExpr* requires Python_ 3.7 or greater, and NumPy_ 1.13 or greater. It is built in the standard Python way: .. code-block:: bash @@ -39,7 +39,7 @@ built in the standard Python way: You must have a C-compiler (i.e. MSVC Build tools on Windows and GCC on Linux) installed. -Then change to a directory that is not the repository directory (e.g. `/tmp`) and +Then change to a directory that is not the repository directory (e.g. `/tmp`) and test :code:`numexpr` with: .. code-block:: bash @@ -73,23 +73,23 @@ affect performance). Threadpool Configuration ------------------------ -Threads are spawned at import-time, with the number being set by the environment -variable ``NUMEXPR_MAX_THREADS``. The default maximum thread count is **64**. +Threads are spawned at import-time, with the number being set by the environment +variable ``NUMEXPR_MAX_THREADS``. The default maximum thread count is **64**. There is no advantage to spawning more threads than the number of virtual cores -available on the computing node. Practically NumExpr scales at large thread -count (`> 8`) only on very large matrices (`> 2**22`). 
Spawning large numbers -of threads is not free, and can increase import times for NumExpr or packages +available on the computing node. Practically NumExpr scales at large thread +count (`> 8`) only on very large matrices (`> 2**22`). Spawning large numbers +of threads is not free, and can increase import times for NumExpr or packages that import it such as Pandas or PyTables. -If desired, the number of threads in the pool used can be adjusted via an -environment variable, ``NUMEXPR_NUM_THREADS`` (preferred) or ``OMP_NUM_THREADS``. -Typically only setting ``NUMEXPR_MAX_THREADS`` is sufficient; the number of -threads used can be adjusted dynamically via ``numexpr.set_num_threads(int)``. +If desired, the number of threads in the pool used can be adjusted via an +environment variable, ``NUMEXPR_NUM_THREADS`` (preferred) or ``OMP_NUM_THREADS``. +Typically only setting ``NUMEXPR_MAX_THREADS`` is sufficient; the number of +threads used can be adjusted dynamically via ``numexpr.set_num_threads(int)``. The number of threads can never exceed that set by ``NUMEXPR_MAX_THREADS``. -If the user has not configured the environment prior to importing NumExpr, info -logs will be generated, and the initial number of threads *that are used*_ will -be set to the number of cores detected in the system or 8, whichever is *less*. +If the user has not configured the environment prior to importing NumExpr, info +logs will be generated, and the initial number of threads *that are used*_ will +be set to the number of cores detected in the system or 8, whichever is *less*. Usage:: @@ -111,16 +111,16 @@ function's frame (through the use of :code:`sys._getframe()`). Alternatively, they can be specified using the :code:`local_dict` or :code:`global_dict` arguments, or passed as keyword arguments. -The :code:`optimization` parameter can take the values :code:`'moderate'` -or :code:`'aggressive'`. :code:`'moderate'` means that no optimization is made -that can affect precision at all. 
:code:`'aggressive'` (the default) means that -the expression can be rewritten in a way that precision *could* be affected, but -normally very little. For example, in :code:`'aggressive'` mode, the -transformation :code:`x~**3` -> :code:`x*x*x` is made, but not in +The :code:`optimization` parameter can take the values :code:`'moderate'` +or :code:`'aggressive'`. :code:`'moderate'` means that no optimization is made +that can affect precision at all. :code:`'aggressive'` (the default) means that +the expression can be rewritten in a way that precision *could* be affected, but +normally very little. For example, in :code:`'aggressive'` mode, the +transformation :code:`x~**3` -> :code:`x*x*x` is made, but not in :code:`'moderate'` mode. -The `truediv` parameter specifies whether the division is a 'floor division' -(False) or a 'true division' (True). The default is the value of +The `truediv` parameter specifies whether the division is a 'floor division' +(False) or a 'true division' (True). The default is the value of `__future__.division` in the interpreter. See PEP 238 for details. Expressions are cached, so reuse is fast. Arrays or scalars are @@ -164,22 +164,22 @@ Casting rules in NumExpr follow closely those of *NumPy*. However, for implementation reasons, there are some known exceptions to this rule, namely: - * When an array with type :code:`int8`, :code:`uint8`, :code:`int16` or - :code:`uint16` is used inside NumExpr, it is internally upcasted to an - :code:`int` (or :code:`int32` in NumPy notation). - * When an array with type :code:`uint32` is used inside NumExpr, it is - internally upcasted to a :code:`long` (or :code:`int64` in NumPy notation). - * A floating point function (e.g. :code:`sin`) acting on :code:`int8` or - :code:`int16` types returns a :code:`float64` type, instead of the - :code:`float32` that is returned by NumPy functions. 
This is mainly due + * When an array with type :code:`int8`, :code:`uint8`, :code:`int16` or + :code:`uint16` is used inside NumExpr, it is internally upcasted to an + :code:`int` (or :code:`int32` in NumPy notation). + * When an array with type :code:`uint32` is used inside NumExpr, it is + internally upcasted to a :code:`long` (or :code:`int64` in NumPy notation). + * A floating point function (e.g. :code:`sin`) acting on :code:`int8` or + :code:`int16` types returns a :code:`float64` type, instead of the + :code:`float32` that is returned by NumPy functions. This is mainly due to the absence of native :code:`int8` or :code:`int16` types in NumExpr. - * In operations implying a scalar and an array, the normal rules of casting - are used in NumExpr, in contrast with NumPy, where array types takes - priority. For example, if :code:`a` is an array of type :code:`float32` - and :code:`b` is an scalar of type :code:`float64` (or Python :code:`float` - type, which is equivalent), then :code:`a*b` returns a :code:`float64` in - NumExpr, but a :code:`float32` in NumPy (i.e. array operands take priority - in determining the result type). If you need to keep the result a + * In operations implying a scalar and an array, the normal rules of casting + are used in NumExpr, in contrast with NumPy, where array types takes + priority. For example, if :code:`a` is an array of type :code:`float32` + and :code:`b` is an scalar of type :code:`float64` (or Python :code:`float` + type, which is equivalent), then :code:`a*b` returns a :code:`float64` in + NumExpr, but a :code:`float32` in NumPy (i.e. array operands take priority + in determining the result type). If you need to keep the result a :code:`float32`, be sure you use a :code:`float32` scalar too. 
@@ -199,42 +199,42 @@ Supported functions The next are the current supported set: - * :code:`where(bool, number1, number2): number` -- number1 if the bool condition + * :code:`where(bool, number1, number2): number` -- number1 if the bool condition is true, number2 otherwise. - * :code:`{sin,cos,tan}(float|complex): float|complex` -- trigonometric sine, + * :code:`{sin,cos,tan}(float|complex): float|complex` -- trigonometric sine, cosine or tangent. - * :code:`{arcsin,arccos,arctan}(float|complex): float|complex` -- trigonometric + * :code:`{arcsin,arccos,arctan}(float|complex): float|complex` -- trigonometric inverse sine, cosine or tangent. - * :code:`arctan2(float1, float2): float` -- trigonometric inverse tangent of + * :code:`arctan2(float1, float2): float` -- trigonometric inverse tangent of float1/float2. - * :code:`{sinh,cosh,tanh}(float|complex): float|complex` -- hyperbolic sine, + * :code:`{sinh,cosh,tanh}(float|complex): float|complex` -- hyperbolic sine, cosine or tangent. - * :code:`{arcsinh,arccosh,arctanh}(float|complex): float|complex` -- hyperbolic + * :code:`{arcsinh,arccosh,arctanh}(float|complex): float|complex` -- hyperbolic inverse sine, cosine or tangent. - * :code:`{log,log10,log1p}(float|complex): float|complex` -- natural, base-10 and + * :code:`{log,log10,log1p}(float|complex): float|complex` -- natural, base-10 and log(1+x) logarithms. - * :code:`{exp,expm1}(float|complex): float|complex` -- exponential and exponential + * :code:`{exp,expm1}(float|complex): float|complex` -- exponential and exponential minus one. * :code:`sqrt(float|complex): float|complex` -- square root. * :code:`abs(float|complex): float|complex` -- absolute value. * :code:`conj(complex): complex` -- conjugate value. * :code:`{real,imag}(complex): float` -- real or imaginary part of complex. - * :code:`complex(float, float): complex` -- complex from real and imaginary + * :code:`complex(float, float): complex` -- complex from real and imaginary parts. 
- * :code:`contains(np.str, np.str): bool` -- returns True for every string in :code:`op1` that + * :code:`contains(np.str, np.str): bool` -- returns True for every string in :code:`op1` that contains :code:`op2`. Notes ----- * :code:`abs()` for complex inputs returns a :code:`complex` output too. This is a - departure from NumPy where a :code:`float` is returned instead. However, - NumExpr is not flexible enough yet so as to allow this to happen. - Meanwhile, if you want to mimic NumPy behaviour, you may want to select the - real part via the :code:`real` function (e.g. :code:`real(abs(cplx))`) or via the + departure from NumPy where a :code:`float` is returned instead. However, + NumExpr is not flexible enough yet so as to allow this to happen. + Meanwhile, if you want to mimic NumPy behaviour, you may want to select the + real part via the :code:`real` function (e.g. :code:`real(abs(cplx))`) or via the :code:`real` selector (e.g. :code:`abs(cplx).real`). -More functions can be added if you need them. Note however that NumExpr 2.6 is +More functions can be added if you need them. Note however that NumExpr 2.6 is in maintenance mode and a new major revision is under development. Supported reduction operations @@ -242,12 +242,12 @@ Supported reduction operations The next are the current supported set: - * :code:`sum(number, axis=None)`: Sum of array elements over a given axis. + * :code:`sum(number, axis=None)`: Sum of array elements over a given axis. Negative axis are not supported. - * :code:`prod(number, axis=None)`: Product of array elements over a given axis. + * :code:`prod(number, axis=None)`: Product of array elements over a given axis. Negative axis are not supported. -*Note:* because of internal limitations, reduction operations must appear the +*Note:* because of internal limitations, reduction operations must appear the last in the stack. If not, it will be issued an error like:: >>> ne.evaluate('sum(1)*(-1)') @@ -256,23 +256,23 @@ last in the stack. 
If not, it will be issued an error like:: General routines ---------------- - * :code:`evaluate(expression, local_dict=None, global_dict=None, - optimization='aggressive', truediv='auto')`: Evaluate a simple array + * :code:`evaluate(expression, local_dict=None, global_dict=None, + optimization='aggressive', truediv='auto')`: Evaluate a simple array expression element-wise. See examples above. - * :code:`re_evaluate(local_dict=None)`: Re-evaluate the last array expression - without any check. This is meant for accelerating loops that are re-evaluating - the same expression repeatedly without changing anything else than the operands. + * :code:`re_evaluate(local_dict=None)`: Re-evaluate the last array expression + without any check. This is meant for accelerating loops that are re-evaluating + the same expression repeatedly without changing anything else than the operands. If unsure, use evaluate() which is safer. * :code:`test()`: Run all the tests in the test suite. * :code:`print_versions()`: Print the versions of software that numexpr relies on. - * :code:`set_num_threads(nthreads)`: Sets a number of threads to be used in operations. - Returns the previous setting for the number of threads. See note below to see + * :code:`set_num_threads(nthreads)`: Sets a number of threads to be used in operations. + Returns the previous setting for the number of threads. See note below to see how the number of threads is set via environment variables. - If you are using VML, you may want to use *set_vml_num_threads(nthreads)* to - perform the parallel job with VML instead. However, you should get very - similar performance with VML-optimized functions, and VML's parallelizer - cannot deal with common expressions like `(x+1)*(x-2)`, while NumExpr's + If you are using VML, you may want to use *set_vml_num_threads(nthreads)* to + perform the parallel job with VML instead. 
However, you should get very + similar performance with VML-optimized functions, and VML's parallelizer + cannot deal with common expressions like `(x+1)*(x-2)`, while NumExpr's one can. * :code:`detect_number_of_cores()`: Detects the number of cores on a system. @@ -324,4 +324,4 @@ License NumExpr is distributed under the MIT_ license. -.. _MIT: http://www.opensource.org/licenses/mit-license.php \ No newline at end of file +.. _MIT: http://www.opensource.org/licenses/mit-license.php diff --git a/doc/vm2.rst b/doc/vm2.rst index 45e9fc9..01c9826 100644 --- a/doc/vm2.rst +++ b/doc/vm2.rst @@ -1,32 +1,32 @@ Performance of the Virtual Machine in NumExpr2.0 ================================================ -Numexpr 2.0 leverages a new virtual machine completely based on the new ndarray -iterator introduced in NumPy 1.6. This represents a nice combination of the -advantages of using the new iterator, while retaining the ability to avoid -copies in memory as well as the multi-threading capabilities of the previous +Numexpr 2.0 leverages a new virtual machine completely based on the new ndarray +iterator introduced in NumPy 1.6. This represents a nice combination of the +advantages of using the new iterator, while retaining the ability to avoid +copies in memory as well as the multi-threading capabilities of the previous virtual machine (1.x series). -The increased performance of the new virtual machine can be seen in several +The increased performance of the new virtual machine can be seen in several scenarios, like: - * *Broadcasting*. Expressions containing arrays that needs to be broadcasted, + * *Broadcasting*. Expressions containing arrays that needs to be broadcasted, will not need additional memory (i.e. they will be broadcasted on-the-fly). - * *Non-native dtypes*. These will be translated to native dtypes on-the-fly, + * *Non-native dtypes*. These will be translated to native dtypes on-the-fly, so there is not need to convert the whole arrays first. 
- * *Fortran-ordered arrays*. The new iterator will find the best path to + * *Fortran-ordered arrays*. The new iterator will find the best path to optimize operations on such arrays, without the need to transpose them first. -There is a drawback though: performance with small arrays suffers a bit because -of higher set-up times for the new virtual machine. See below for detailed +There is a drawback though: performance with small arrays suffers a bit because +of higher set-up times for the new virtual machine. See below for detailed benchmarks. Some benchmarks for best-case scenarios --------------------------------------- -Here you have some benchmarks of some scenarios where the new virtual machine -actually represents an advantage in terms of speed (also memory, but this is -not shown here). As you will see, the improvement is notable in many areas, +Here you have some benchmarks of some scenarios where the new virtual machine +actually represents an advantage in terms of speed (also memory, but this is +not shown here). As you will see, the improvement is notable in many areas, ranging from 3x to 6x faster operations. Broadcasting @@ -85,7 +85,7 @@ Mix of 'non-native' arrays, Fortran-ordered, and using broadcasting Longer setup-time ^^^^^^^^^^^^^^^^^ -The only drawback of the new virtual machine is during the computation of +The only drawback of the new virtual machine is during the computation of small arrays:: >>> a = np.arange(10) @@ -98,8 +98,8 @@ small arrays:: 10000 loops, best of 3: 30.6 µs per loop -i.e. the new virtual machine takes a bit more time to set-up (around 8 µs in -this machine). However, this should be not too important because for such a +i.e. the new virtual machine takes a bit more time to set-up (around 8 µs in +this machine). 
However, this should be not too important because for such a small arrays NumPy is always a better option:: >>> timeit c = a*(b+1) @@ -121,8 +121,8 @@ And for arrays large enough the difference is negligible:: Conclusion ---------- -The new virtual machine introduced in numexpr 2.0 brings more performance in -many different scenarios (broadcast, non-native dtypes, Fortran-orderd arrays), -while it shows slightly worse performance for small arrays. However, as -numexpr is more geared to compute large arrays, the new virtual machine should -be good news for numexpr users in general. \ No newline at end of file +The new virtual machine introduced in numexpr 2.0 brings more performance in +many different scenarios (broadcast, non-native dtypes, Fortran-orderd arrays), +while it shows slightly worse performance for small arrays. However, as +numexpr is more geared to compute large arrays, the new virtual machine should +be good news for numexpr users in general. diff --git a/issues/issue418.py b/issues/issue418.py index b871c65..31ca2fc 100644 --- a/issues/issue418.py +++ b/issues/issue418.py @@ -1,7 +1,9 @@ +from time import perf_counter as pc + +import matplotlib.pyplot as plt import numpy as np + import numexpr as ne -import matplotlib.pyplot as plt -from time import perf_counter as pc # geomspace seems to be very slow, just a warning about setting `n` too high. # n = 2**24 diff --git a/numexpr/__init__.py b/numexpr/__init__.py index 648b869..63bb9e9 100644 --- a/numexpr/__init__.py +++ b/numexpr/__init__.py @@ -21,21 +21,20 @@ """ -from numexpr.interpreter import MAX_THREADS, use_vml, __BLOCK_SIZE1__ +from numexpr.interpreter import __BLOCK_SIZE1__, MAX_THREADS, use_vml is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE -# cpuinfo imports were moved into the test submodule function that calls them +# cpuinfo imports were moved into the test submodule function that calls them # to improve import times. 
from numexpr.expressions import E -from numexpr.necompiler import (NumExpr, disassemble, evaluate, re_evaluate, - validate) - -from numexpr.utils import (_init_num_threads, - get_vml_version, set_vml_accuracy_mode, set_vml_num_threads, - set_num_threads, get_num_threads, - detect_number_of_cores, detect_number_of_threads) +from numexpr.necompiler import (NumExpr, disassemble, evaluate, re_evaluate, + validate) +from numexpr.utils import (_init_num_threads, detect_number_of_cores, + detect_number_of_threads, get_num_threads, + get_vml_version, set_num_threads, + set_vml_accuracy_mode, set_vml_num_threads) # Detect the number of cores ncores = detect_number_of_cores() @@ -45,6 +44,7 @@ # set_vml_num_threads(1) from . import version + __version__ = version.version def print_versions(): @@ -63,4 +63,4 @@ def test(verbosity=1): return numexpr.tests.test(verbosity=verbosity) except ImportError: # To maintain Python 2.6 compatibility we have simple error handling - raise ImportError('`numexpr.tests` could not be imported, likely it was excluded from the distribution.') \ No newline at end of file + raise ImportError('`numexpr.tests` could not be imported, likely it was excluded from the distribution.') diff --git a/numexpr/cpuinfo.py b/numexpr/cpuinfo.py index 4a57d3c..897a4ca 100755 --- a/numexpr/cpuinfo.py +++ b/numexpr/cpuinfo.py @@ -23,12 +23,14 @@ __all__ = ['cpu'] -import sys, re, types +import inspect import os +import platform +import re import subprocess +import sys +import types import warnings -import platform -import inspect is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE diff --git a/numexpr/expressions.py b/numexpr/expressions.py index 419d7dc..5924c5f 100644 --- a/numexpr/expressions.py +++ b/numexpr/expressions.py @@ -35,6 +35,7 @@ from numexpr import interpreter + class Expression(): def __getattr__(self, name): @@ -269,10 +270,10 @@ def rtruediv_op(a, b): @ophelper def pow_op(a, b): - + if isinstance(b, ConstantNode): x 
= b.value - if ( a.astKind in ('int', 'long') and + if ( a.astKind in ('int', 'long') and b.astKind in ('int', 'long') and x < 0) : raise ValueError( 'Integers to negative integer powers are not allowed.') diff --git a/numexpr/interp_body.cpp b/numexpr/interp_body.cpp index 09b9da9..573ce8c 100644 --- a/numexpr/interp_body.cpp +++ b/numexpr/interp_body.cpp @@ -7,13 +7,13 @@ See LICENSE.txt for details about copyright and rights to use. **********************************************************************/ -// WARNING: This file is included multiple times in `interpreter.cpp`. It is -// essentially a very macro-heavy jump table. Interpretation is best done by +// WARNING: This file is included multiple times in `interpreter.cpp`. It is +// essentially a very macro-heavy jump table. Interpretation is best done by // the developer by expanding all macros (e.g. adding `'-E'` to the `extra_cflags` // argument in `setup.py` and looking at the resulting `interpreter.cpp`. // -// Changes made to this file will not be recognized by the compile, so the developer -// must make a trivial change is made to `interpreter.cpp` or delete the `build/` +// Changes made to this file will not be recognized by the compile, so the developer +// must make a trivial change is made to `interpreter.cpp` or delete the `build/` // directory in-between each build. 
{ #define VEC_LOOP(expr) for(j = 0; j < BLOCK_SIZE; j++) { \ diff --git a/numexpr/interpreter.cpp b/numexpr/interpreter.cpp index edebd71..dbfcca1 100644 --- a/numexpr/interpreter.cpp +++ b/numexpr/interpreter.cpp @@ -25,7 +25,7 @@ #define fmin min #define NE_INFINITY (DBL_MAX+DBL_MAX) #define NE_NAN (INFINITY-INFINITY) -#else +#else #define NE_INFINITY INFINITY #define NE_NAN NAN #endif @@ -556,7 +556,7 @@ stringcontains(const char *haystack_start, const char *needle_start, npy_intp ma size_t si = 0; size_t min_len = min(needle_len, haystack_len); - while (*haystack && *needle && si < min_len) + while (si < min_len && *haystack && *needle) { ok &= *haystack++ == *needle++; si++; @@ -573,7 +573,7 @@ stringcontains(const char *haystack_start, const char *needle_start, npy_intp ma } /* calc haystack length */ - while (*haystack && si < haystack_len) { + while (si < haystack_len && *haystack) { haystack++; si++; } @@ -652,6 +652,7 @@ int vm_engine_iter_task(NpyIter *iter, npy_intp *memsteps, /* Then finish off the rest */ if (block_size > 0) do { + block_size = *size_ptr; #define REDUCTION_INNER_LOOP #define BLOCK_SIZE block_size #include "interp_body.cpp" @@ -698,6 +699,7 @@ vm_engine_iter_outer_reduce_task(NpyIter *iter, npy_intp *memsteps, /* Then finish off the rest */ if (block_size > 0) do { + block_size = *size_ptr; #define BLOCK_SIZE block_size #define NO_OUTPUT_BUFFERING // Because it's a reduction #include "interp_body.cpp" @@ -1260,7 +1262,7 @@ NumExpr_run(NumExprObject *self, PyObject *args, PyObject *kwds) PyArrayObject *singleton; bool writeback; // NOTE: cannot assign on declaration due to `goto` statements - singleton = NULL; + singleton = NULL; writeback = false; if (n_inputs == 0) { char retsig = get_return_sig(self->program); @@ -1319,10 +1321,10 @@ NumExpr_run(NumExprObject *self, PyObject *args, PyObject *kwds) /* Allocate the iterator or nested iterators */ if (reduction_size < 0 || full_reduction) { /* When there's no reduction, reduction_size 
is 1 as well */ - // RAM: in issue #277 this was also the case for reductions on arrays - // with axis=0 having singleton dimension, i.e. such ops were interpreted - // as full_reductions when they weren't in Numpy. As such, the default - // reduction_size is now -1 and we add the flag for full_reduction, + // RAM: in issue #277 this was also the case for reductions on arrays + // with axis=0 having singleton dimension, i.e. such ops were interpreted + // as full_reductions when they weren't in Numpy. As such, the default + // reduction_size is now -1 and we add the flag for full_reduction, // e.g. ne.evaluate("sum(a)")" iter = NpyIter_AdvancedNew(n_inputs+1, operands, NPY_ITER_BUFFERED| diff --git a/numexpr/interpreter.hpp b/numexpr/interpreter.hpp index f9ac1c7..93c6e49 100644 --- a/numexpr/interpreter.hpp +++ b/numexpr/interpreter.hpp @@ -75,7 +75,7 @@ struct thread_data { int ret_code; int *pc_error; char **errmsg; - // NOTE: memsteps, iter, and reduce_iter are arrays, they MUST be allocated + // NOTE: memsteps, iter, and reduce_iter are arrays, they MUST be allocated // to length `global_max_threads` before module load. 
// One memsteps array per thread // npy_intp *memsteps[MAX_THREADS]; diff --git a/numexpr/module.cpp b/numexpr/module.cpp index 66b5b77..e7d6ded 100644 --- a/numexpr/module.cpp +++ b/numexpr/module.cpp @@ -51,7 +51,9 @@ void *th_worker(void *tidptr) while (1) { /* Sentinels have to be initialised yet */ - gs.init_sentinels_done = 0; + if (tid == 0) { + gs.init_sentinels_done = 0; + } /* Meeting point for all threads (wait for initialization) */ pthread_mutex_lock(&gs.count_threads_mutex); @@ -380,7 +382,7 @@ Py_set_num_threads(PyObject *self, PyObject *args) } static PyObject* -Py_get_num_threads(PyObject *self, PyObject *args) +Py_get_num_threads(PyObject *self, PyObject *args) { int n_thread; n_thread = gs.nthreads; @@ -477,6 +479,10 @@ PyInit_interpreter(void) { if (m == NULL) INITERROR; + #ifdef Py_GIL_DISABLED + PyUnstable_Module_SetGIL(m, Py_MOD_GIL_NOT_USED); + #endif + Py_INCREF(&NumExprType); PyModule_AddObject(m, "NumExpr", (PyObject *)&NumExprType); diff --git a/numexpr/module.hpp b/numexpr/module.hpp index cf7b571..079a17f 100644 --- a/numexpr/module.hpp +++ b/numexpr/module.hpp @@ -23,7 +23,7 @@ struct global_state { int end_threads; /* should exisiting threads end? */ // pthread_t threads[MAX_THREADS]; /* opaque structure for threads */ // int tids[MAX_THREADS]; /* ID per each thread */ - /* NOTE: threads and tids are arrays, they MUST be allocated to length + /* NOTE: threads and tids are arrays, they MUST be allocated to length `global_max_threads` before module load. 
*/ pthread_t *threads; /* opaque structure for threads */ int *tids; /* ID per each thread */ @@ -36,7 +36,7 @@ struct global_state { /* Synchronization variables for threadpool state */ pthread_mutex_t count_mutex; int count_threads; - int barrier_passed; /* indicates if the thread pool's thread barrier + int barrier_passed; /* indicates if the thread pool's thread barrier is unlocked and ready for the VM to process.*/ pthread_mutex_t count_threads_mutex; pthread_cond_t count_threads_cv; diff --git a/numexpr/necompiler.py b/numexpr/necompiler.py index 98aee4c..4ada878 100644 --- a/numexpr/necompiler.py +++ b/numexpr/necompiler.py @@ -8,17 +8,18 @@ # rights to use. #################################################################### -from typing import Optional, Dict import __future__ -import sys + import os -import threading import re +import sys +import threading +from typing import Dict, Optional import numpy is_cpu_amd_intel = False # DEPRECATION WARNING: WILL BE REMOVED IN FUTURE RELEASE -from numexpr import interpreter, expressions, use_vml +from numexpr import expressions, interpreter, use_vml from numexpr.utils import CacheDict, ContextDict # Declare a double type that does not exist in Python space @@ -28,7 +29,7 @@ int_ = numpy.int32 long_ = numpy.int64 -typecode_to_kind = {'b': 'bool', 'i': 'int', 'l': 'long', 'f': 'float', 'd': 'double', +typecode_to_kind = {'b': 'bool', 'i': 'int', 'l': 'long', 'f': 'float', 'd': 'double', 'c': 'complex', 'n': 'none', 's': 'str'} kind_to_typecode = {'bool': 'b', 'int': 'i', 'long': 'l', 'float': 'f', 'double': 'd', 'complex': 'c', 'bytes': 's', 'str': 's', 'none': 'n'} @@ -104,11 +105,11 @@ def __eq__(self, other): if getattr(self, name) != getattr(other, name): return False return True - + def __lt__(self,other): - # RAM: this is a fix for issue #88 whereby sorting on constants + # RAM: this is a fix for issue #88 whereby sorting on constants # that may be of astKind == 'complex' but type(self.value) == int or float - 
# Here we let NumPy sort as it will cast data properly for comparison + # Here we let NumPy sort as it will cast data properly for comparison # when the Python built-ins will raise an error. if self.astType == 'constant': if self.astKind == other.astKind: @@ -271,7 +272,7 @@ def __str__(self): def stringToExpression(s, types, context, sanitize: bool=True): """Given a string, convert it to a tree of ExpressionNode's. """ - # sanitize the string for obvious attack vectors that NumExpr cannot + # sanitize the string for obvious attack vectors that NumExpr cannot # parse into its homebrew AST. This is to protect the call to `eval` below. # We forbid `;`, `:`. `[` and `__`, and attribute access via '.'. # We cannot ban `.real` or `.imag` however... @@ -281,7 +282,7 @@ def stringToExpression(s, types, context, sanitize: bool=True): skip_quotes = re.sub(r'(\'[^\']*\')', '', no_whitespace) if _blacklist_re.search(skip_quotes) is not None: raise ValueError(f'Expression {s} has forbidden control characters.') - + old_ctx = expressions._context.get_current_context() try: expressions._context.set_new_context(context) @@ -307,7 +308,7 @@ def stringToExpression(s, types, context, sanitize: bool=True): # now build the expression ex = eval(c, names) - + if expressions.isConstant(ex): ex = expressions.ConstantNode(ex, expressions.getKind(ex)) elif not isinstance(ex, expressions.ExpressionNode): @@ -363,7 +364,7 @@ def getConstants(ast): a = 1 + 3j; b = 5.0 ne.evaluate('a*2 + 15j - b') """ - constant_registers = set([node.reg for node in ast.allOf("constant")]) + constant_registers = set([node.reg for node in ast.allOf("constant")]) constants_order = sorted([r.node for r in constant_registers]) constants = [convertConstantToKind(a.value, a.astKind) for a in constants_order] @@ -557,7 +558,7 @@ def getContext(kwargs, _frame_depth=1): context[name] = value else: raise ValueError("'%s' must be one of %s" % (name, allowed)) - + if d: raise ValueError("Unknown keyword argument '%s'" % 
d.popitem()[0]) if context['truediv'] == 'auto': @@ -657,7 +658,7 @@ def disassemble(nex): def parseOp(op): name, sig = [*op.rsplit(b'_', 1), ''][:2] - return name, sig + return name, sig def getArg(pc, offset): arg = nex.program[pc + (offset if offset < 4 else offset+1)] @@ -752,7 +753,7 @@ def getArguments(names, local_dict=None, global_dict=None, _frame_depth: int=2): if global_dict is None: global_dict = frame_globals - # If `call_frame` is the top frame of the interpreter we can't clear its + # If `call_frame` is the top frame of the interpreter we can't clear its # `local_dict`, because it is actually the `global_dict`. clear_local_dict = clear_local_dict and not frame_globals is local_dict @@ -774,23 +775,26 @@ def getArguments(names, local_dict=None, global_dict=None, _frame_depth: int=2): # Dictionaries for caching variable names and compiled expressions -_names_cache = CacheDict(256) -_numexpr_cache = CacheDict(256) -_numexpr_last = ContextDict() +# _names_cache = CacheDict(256) +_names_cache = threading.local() +# _numexpr_cache = CacheDict(256) +_numexpr_cache = threading.local() +# _numexpr_last = ContextDict() +_numexpr_last = threading.local() evaluate_lock = threading.Lock() -def validate(ex: str, - local_dict: Optional[Dict] = None, +def validate(ex: str, + local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, - out: numpy.ndarray = None, - order: str = 'K', - casting: str = 'safe', + out: numpy.ndarray = None, + order: str = 'K', + casting: str = 'safe', _frame_depth: int = 2, sanitize: Optional[bool] = None, **kwargs) -> Optional[Exception]: r""" Validate a NumExpr expression with the given `local_dict` or `locals()`. - Returns `None` on success and the Exception object if one occurs. Note that + Returns `None` on success and the Exception object if one occurs. Note that you can proceed directly to call `re_evaluate()` if you use `validate()` to sanitize your expressions and variables in advance. 
@@ -835,30 +839,38 @@ def validate(ex: str, * 'unsafe' means any data conversions may be done. sanitize: Optional[bool] - Both `validate` and by extension `evaluate` call `eval(ex)`, which is - potentially dangerous on unsanitized inputs. As such, NumExpr by default - performs simple sanitization, banning the character ':;[', the + Both `validate` and by extension `evaluate` call `eval(ex)`, which is + potentially dangerous on unsanitized inputs. As such, NumExpr by default + performs simple sanitization, banning the character ':;[', the dunder '__[\w+]__', and attribute access to all but '.real' and '.imag'. - - Using `None` defaults to `True` unless the environment variable - `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. + + Using `None` defaults to `True` unless the environment variable + `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. Nominally this can be set via `os.environ` before `import numexpr`. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. 
Note ---- - + """ global _numexpr_last + if not hasattr(_numexpr_last, 'l'): + _numexpr_last.l = ContextDict() + + if not hasattr(_names_cache, 'c'): + _names_cache.c = CacheDict(256) + + if not hasattr(_numexpr_cache, 'c'): + _numexpr_cache.c = CacheDict(256) try: - + if not isinstance(ex, str): raise ValueError("must specify expression as a string") - + if sanitize is None: if 'NUMEXPR_SANITIZE' in os.environ: sanitize = bool(int(os.environ['NUMEXPR_SANITIZE'])) @@ -868,9 +880,9 @@ def validate(ex: str, # Get the names for this expression context = getContext(kwargs) expr_key = (ex, tuple(sorted(context.items()))) - if expr_key not in _names_cache: - _names_cache[expr_key] = getExprNames(ex, context, sanitize=sanitize) - names, ex_uses_vml = _names_cache[expr_key] + if expr_key not in _names_cache.c: + _names_cache.c[expr_key] = getExprNames(ex, context, sanitize=sanitize) + names, ex_uses_vml = _names_cache.c[expr_key] arguments = getArguments(names, local_dict, global_dict, _frame_depth=_frame_depth) # Create a signature @@ -880,22 +892,22 @@ def validate(ex: str, # Look up numexpr if possible. 
numexpr_key = expr_key + (tuple(signature),) try: - compiled_ex = _numexpr_cache[numexpr_key] + compiled_ex = _numexpr_cache.c[numexpr_key] except KeyError: - compiled_ex = _numexpr_cache[numexpr_key] = NumExpr(ex, signature, sanitize=sanitize, **context) + compiled_ex = _numexpr_cache.c[numexpr_key] = NumExpr(ex, signature, sanitize=sanitize, **context) kwargs = {'out': out, 'order': order, 'casting': casting, 'ex_uses_vml': ex_uses_vml} - _numexpr_last.set(ex=compiled_ex, argnames=names, kwargs=kwargs) + _numexpr_last.l.set(ex=compiled_ex, argnames=names, kwargs=kwargs) except Exception as e: return e return None -def evaluate(ex: str, - local_dict: Optional[Dict] = None, +def evaluate(ex: str, + local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, - out: numpy.ndarray = None, - order: str = 'K', - casting: str = 'safe', + out: numpy.ndarray = None, + order: str = 'K', + casting: str = 'safe', sanitize: Optional[bool] = None, _frame_depth: int = 3, **kwargs) -> numpy.ndarray: @@ -948,27 +960,27 @@ def evaluate(ex: str, performs simple sanitization, banning the characters ':;[', the dunder '__[\w+]__', and attribute access to all but '.real' and '.imag'. - Using `None` defaults to `True` unless the environment variable - `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. + Using `None` defaults to `True` unless the environment variable + `NUMEXPR_SANITIZE=0` is set, in which case the default is `False`. Nominally this can be set via `os.environ` before `import numexpr`. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. 
""" - # We could avoid code duplication if we called validate and then re_evaluate + # We could avoid code duplication if we called validate and then re_evaluate # here, but we have difficulties with the `sys.getframe(2)` call in # `getArguments` - e = validate(ex, local_dict=local_dict, global_dict=global_dict, - out=out, order=order, casting=casting, + e = validate(ex, local_dict=local_dict, global_dict=global_dict, + out=out, order=order, casting=casting, _frame_depth=_frame_depth, sanitize=sanitize, **kwargs) if e is None: return re_evaluate(local_dict=local_dict, global_dict=global_dict, _frame_depth=_frame_depth) else: raise e - -def re_evaluate(local_dict: Optional[Dict] = None, + +def re_evaluate(local_dict: Optional[Dict] = None, global_dict: Optional[Dict] = None, _frame_depth: int=2) -> numpy.ndarray: """ @@ -983,17 +995,19 @@ def re_evaluate(local_dict: Optional[Dict] = None, local_dict: dictionary, optional A dictionary that replaces the local operands in current frame. _frame_depth: int - The calling frame depth. Unless you are a NumExpr developer you should + The calling frame depth. Unless you are a NumExpr developer you should not set this value. 
""" global _numexpr_last + if not hasattr(_numexpr_last, 'l'): + _numexpr_last.l = ContextDict() try: - compiled_ex = _numexpr_last['ex'] + compiled_ex = _numexpr_last.l['ex'] except KeyError: raise RuntimeError("A previous evaluate() execution was not found, please call `validate` or `evaluate` once before `re_evaluate`") - argnames = _numexpr_last['argnames'] + argnames = _numexpr_last.l['argnames'] args = getArguments(argnames, local_dict, global_dict, _frame_depth=_frame_depth) - kwargs = _numexpr_last['kwargs'] + kwargs = _numexpr_last.l['kwargs'] with evaluate_lock: return compiled_ex(*args, **kwargs) diff --git a/numexpr/numexpr_config.hpp b/numexpr/numexpr_config.hpp index 0663c6d..2bf0091 100644 --- a/numexpr/numexpr_config.hpp +++ b/numexpr/numexpr_config.hpp @@ -19,7 +19,7 @@ #define BLOCK_SIZE1 1024 #endif -// The default threadpool size. It's prefer that the user set this via an +// The default threadpool size. It's prefer that the user set this via an // environment variable, "NUMEXPR_MAX_THREADS" #define DEFAULT_MAX_THREADS 64 diff --git a/numexpr/numexpr_object.cpp b/numexpr/numexpr_object.cpp index e788d1c..b20aef0 100644 --- a/numexpr/numexpr_object.cpp +++ b/numexpr/numexpr_object.cpp @@ -405,4 +405,3 @@ PyTypeObject NumExprType = { 0, /* tp_alloc */ NumExpr_new, /* tp_new */ }; - diff --git a/numexpr/tests/__init__.py b/numexpr/tests/__init__.py index 3fff411..f47c8cc 100644 --- a/numexpr/tests/__init__.py +++ b/numexpr/tests/__init__.py @@ -8,7 +8,7 @@ # rights to use. 
#################################################################### -from numexpr.tests.test_numexpr import test, print_versions +from numexpr.tests.test_numexpr import print_versions, test if __name__ == '__main__': test() diff --git a/numexpr/tests/test_numexpr.py b/numexpr/tests/test_numexpr.py index 62210b4..2731b32 100644 --- a/numexpr/tests/test_numexpr.py +++ b/numexpr/tests/test_numexpr.py @@ -11,31 +11,37 @@ import os -import sys import platform +import subprocess +import sys +import unittest import warnings from contextlib import contextmanager -import subprocess +from unittest.mock import MagicMock -import numpy as np -from numpy import ( - array, arange, empty, zeros, int32, int64, uint16, cdouble, float64, rec, - copy, ones_like, where, all as alltrue, linspace, - sum, prod, sqrt, fmod, floor, ceil, - sin, cos, tan, arcsin, arccos, arctan, arctan2, - sinh, cosh, tanh, arcsinh, arccosh, arctanh, - log, log1p, log10, exp, expm1, conj) import numpy -from numpy.testing import (assert_equal, assert_array_equal, - assert_array_almost_equal, assert_allclose) -from numpy import shape, allclose, array_equal, ravel, isnan, isinf +import numpy as np +from numpy import all as alltrue +from numpy import (allclose, arange, arccos, arccosh, arcsin, arcsinh, arctan, + arctan2, arctanh, array, array_equal, cdouble, ceil, conj, + copy, cos, cosh, empty, exp, expm1, float64, floor, fmod, + int32, int64, isinf, isnan, linspace, log, log1p, log10, + ones_like, prod, ravel, rec, shape, sin, sinh, sqrt, sum, + tan, tanh, uint16, where, zeros) +from numpy.testing import (assert_allclose, assert_array_almost_equal, + assert_array_equal, assert_equal) import numexpr -from numexpr import E, NumExpr, evaluate, re_evaluate, validate, disassemble, use_vml +from numexpr import (E, NumExpr, disassemble, evaluate, re_evaluate, use_vml, + validate) from numexpr.expressions import ConstantNode from numexpr.utils import detect_number_of_cores -import unittest +try: + import pytest + 
pytest_available = True +except ImportError: + pytest_available = False TestCase = unittest.TestCase @@ -44,6 +50,15 @@ MAX_THREADS = 16 +if not pytest_available: + def identity(f): + return f + + pytest = MagicMock() + pytest.mark = MagicMock() + pytest.mark.thread_unsafe = identity + + class test_numexpr(TestCase): """Testing with 1 thread""" nthreads = 1 @@ -318,6 +333,7 @@ def test_refcount(self): evaluate('1') assert sys.getrefcount(a) == 2 + @pytest.mark.thread_unsafe def test_locals_clears_globals(self): # Check for issue #313, whereby clearing f_locals also clear f_globals # if in the top-frame. This cannot be done inside `unittest` as it is always @@ -341,6 +357,7 @@ def test_locals_clears_globals(self): +@pytest.mark.thread_unsafe class test_numexpr2(test_numexpr): """Testing with 2 threads""" nthreads = 2 @@ -512,6 +529,7 @@ def test_illegal_value(self): else: self.fail() + @pytest.mark.thread_unsafe def test_sanitize(self): with _environment('NUMEXPR_SANITIZE', '1'): # Forbid dunder @@ -590,7 +608,7 @@ def test_sanitize(self): x = np.array(['a', 'b'], dtype=bytes) evaluate("x == 'b:'") - + @pytest.mark.thread_unsafe def test_no_sanitize(self): try: # Errors on compile() after eval() evaluate('import os;', sanitize=False) @@ -677,6 +695,7 @@ def test_ex_uses_vml(self): if 'sparc' not in platform.machine(): # Execution order set here so as to not use too many threads # during the rest of the execution. See #33 for details. 
+ @pytest.mark.thread_unsafe def test_changing_nthreads_00_inc(self): a = linspace(-1, 1, 1000000) b = ((.25 * a + .75) * a - 1.5) * a - 2 @@ -685,6 +704,7 @@ def test_changing_nthreads_00_inc(self): c = evaluate("((.25*a + .75)*a - 1.5)*a - 2") assert_array_almost_equal(b, c) + @pytest.mark.thread_unsafe def test_changing_nthreads_01_dec(self): a = linspace(-1, 1, 1000000) b = ((.25 * a + .75) * a - 1.5) * a - 2 @@ -1123,6 +1143,7 @@ def _environment(key, value): del os.environ[key] # Test cases for the threading configuration +@pytest.mark.thread_unsafe class test_threading_config(TestCase): def test_max_threads_unset(self): # Has to be done in a subprocess as `importlib.reload` doesn't let us @@ -1306,6 +1327,7 @@ def _worker(qout=None): # Case test for subprocesses (via multiprocessing module) class test_subprocess(TestCase): + @pytest.mark.thread_unsafe def test_multiprocess(self): try: import multiprocessing as mp @@ -1328,9 +1350,10 @@ def test_multiprocess(self): def print_versions(): """Print the versions of software that numexpr relies on.""" # from pkg_resources import parse_version - from numexpr.cpuinfo import cpu import platform + from numexpr.cpuinfo import cpu + print('-=' * 38) print('Numexpr version: %s' % numexpr.__version__) print('NumPy version: %s' % np.__version__) @@ -1371,8 +1394,8 @@ def test(verbosity=1): def suite(): - import unittest import platform as pl + import unittest theSuite = unittest.TestSuite() niter = 1 diff --git a/numexpr/utils.py b/numexpr/utils.py index cc61833..9e45fbe 100644 --- a/numexpr/utils.py +++ b/numexpr/utils.py @@ -9,20 +9,22 @@ #################################################################### import logging + log = logging.getLogger(__name__) +import contextvars import os import subprocess -import contextvars -from numexpr.interpreter import _set_num_threads, _get_num_threads, MAX_THREADS from numexpr import use_vml +from numexpr.interpreter import MAX_THREADS, _get_num_threads, _set_num_threads + from . 
import version if use_vml: - from numexpr.interpreter import ( - _get_vml_version, _set_vml_accuracy_mode, _set_vml_num_threads, - _get_vml_num_threads) + from numexpr.interpreter import (_get_vml_num_threads, _get_vml_version, + _set_vml_accuracy_mode, + _set_vml_num_threads) def get_vml_version(): @@ -118,9 +120,9 @@ def get_num_threads(): def _init_num_threads(): """ - Detects the environment variable 'NUMEXPR_MAX_THREADS' to set the threadpool - size, and if necessary the slightly redundant 'NUMEXPR_NUM_THREADS' or - 'OMP_NUM_THREADS' env vars to set the initial number of threads used by + Detects the environment variable 'NUMEXPR_MAX_THREADS' to set the threadpool + size, and if necessary the slightly redundant 'NUMEXPR_NUM_THREADS' or + 'OMP_NUM_THREADS' env vars to set the initial number of threads used by the virtual machine. """ # Any platform-specific short-circuits @@ -140,7 +142,7 @@ def _init_num_threads(): env_configured = True n_cores = MAX_THREADS else: - # The use has not set 'NUMEXPR_MAX_THREADS', so likely they have not + # The use has not set 'NUMEXPR_MAX_THREADS', so likely they have not # configured NumExpr as desired, so we emit info logs. if n_cores > MAX_THREADS: log.info('Note: detected %d virtual cores but NumExpr set to maximum of %d, check "NUMEXPR_MAX_THREADS" environment variable.'%(n_cores, MAX_THREADS)) @@ -149,7 +151,7 @@ def _init_num_threads(): log.info('Note: NumExpr detected %d cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.'%n_cores) n_cores = 16 - # Now we check for 'NUMEXPR_NUM_THREADS' or 'OMP_NUM_THREADS' to set the + # Now we check for 'NUMEXPR_NUM_THREADS' or 'OMP_NUM_THREADS' to set the # actual number of threads used. 
if 'NUMEXPR_NUM_THREADS' in os.environ and os.environ['NUMEXPR_NUM_THREADS'] != '': requested_threads = int(os.environ['NUMEXPR_NUM_THREADS']) @@ -165,7 +167,7 @@ def _init_num_threads(): set_num_threads(requested_threads) return requested_threads - + def detect_number_of_cores(): """ Detects the number of cores on a system. Cribbed from pp. diff --git a/numexpr/win32/stdint.h b/numexpr/win32/stdint.h index b7e7112..c66267a 100644 --- a/numexpr/win32/stdint.h +++ b/numexpr/win32/stdint.h @@ -17,7 +17,7 @@ * * mwb: This was modified in the following ways: * - * - make it compatible with Visual C++ 6 (which uses + * - make it compatible with Visual C++ 6 (which uses * non-standard keywords and suffixes for 64-bit types) * - some environments need stddef.h included (for wchar stuff?) * - handle the fact that Microsoft's limits.h header defines @@ -70,9 +70,9 @@ typedef unsigned uint_least32_t; typedef __STDINT_LONGLONG int_least64_t; typedef unsigned __STDINT_LONGLONG uint_least64_t; -/* 7.18.1.3 Fastest minimum-width integer types +/* 7.18.1.3 Fastest minimum-width integer types * Not actually guaranteed to be fastest for all purposes - * Here we use the exact-width types for 8 and 16-bit ints. + * Here we use the exact-width types for 8 and 16-bit ints. 
*/ typedef char int_fast8_t; typedef unsigned char uint_fast8_t; @@ -110,7 +110,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS) /* 7.18.2.1 Limits of exact-width integer types */ -#define INT8_MIN (-128) +#define INT8_MIN (-128) #define INT16_MIN (-32768) #define INT32_MIN (-2147483647 - 1) #define INT64_MIN (PASTE( -9223372036854775807, __STDINT_LONGLONG_SUFFIX) - 1) @@ -158,7 +158,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #define UINT_FAST64_MAX UINT64_MAX /* 7.18.2.4 Limits of integer types capable of holding - object pointers */ + object pointers */ #ifdef _WIN64 #define INTPTR_MIN INT64_MIN #define INTPTR_MAX INT64_MAX @@ -186,7 +186,7 @@ typedef unsigned __STDINT_LONGLONG uintmax_t; #define SIZE_MAX UINTPTR_MAX #endif -#ifndef WCHAR_MIN /* also in wchar.h */ +#ifndef WCHAR_MIN /* also in wchar.h */ #define WCHAR_MIN 0 #define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */ #endif diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..4fec170 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + thread_unsafe: mark a test as thread unsafe diff --git a/setup.py b/setup.py index 82f3651..64d9f20 100644 --- a/setup.py +++ b/setup.py @@ -9,12 +9,13 @@ # rights to use. #################################################################### -import os, os.path as op -import platform import configparser -import numpy as np -from setuptools import setup, Extension +import os +import os.path as op +import platform +import numpy as np +from setuptools import Extension, setup with open('requirements.txt') as f: requirements = f.read().splitlines() @@ -40,7 +41,7 @@ libs = [] # Pre-built libraries ONLY, like python36.so clibs = [] def_macros = [ - # keep in sync with minimal runtime requirement (requirements.txt) + # keep in sync with minimal runtime requirement (requirements.txt) ('NPY_TARGET_VERSION', 'NPY_1_23_API_VERSION') ] sources = ['numexpr/interpreter.cpp',