Merge branch 'release/0.7.3'

fjarri · May 2, 2019 · 3cbed28 · 3cbed28
2 parents 98c1e87 + 73825a4
commit 3cbed28
Show file tree

Hide file tree

Showing 20 changed files with 674 additions and 110 deletions.
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -1,6 +1,6 @@
 Reikna is licensed under the MIT license:
 
-Copyright (c) 2012-2013 Bogdan Opanchuk
+Copyright (c) 2012-2018 Bogdan Opanchuk
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 

diff --git a/TODO.rst b/TODO.rst
diff --git a/doc/source/api/core.rst b/doc/source/api/core.rst
@@ -47,3 +47,9 @@ Result and attribute classes
 
 .. automodule:: reikna.core.transformation
     :members: TransformationParameter, KernelParameter
+
+
+Array tools
+-----------
+
+.. autofunction:: reikna.concatenate
diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -41,7 +41,7 @@
 
 # General information about the project.
 project = 'reikna'
-copyright = '2012-2015, Bogdan Opanchuk'
+copyright = '2012-2018, Bogdan Opanchuk'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the

diff --git a/doc/source/history.rst b/doc/source/history.rst
@@ -3,6 +3,38 @@ Release history
 ***************
 
 
+0.7.3 (1 May 2019)
+==================
+
+* ADDED: ``inverse`` parameter for :py:class:`~reikna.fft.FFTShift` (contributed by @drtpotter).
+
+* ADDED: :py:meth:`~reikna.core.Type.with_dtype` method for :py:class:`~reikna.core.Type`.
+
+* ADDED: :py:func:`~reikna.transformations.cast` transformation.
+
+* ADDED: :py:meth:`~reikna.core.Type.broadcastable_to` method for :py:class:`~reikna.core.Type`.
+
+* ADDED: :py:func:`~reikna.transformations.copy_broadcasted` transformation.
+
+* ADDED: :py:meth:`~reikna.cluda.api.Thread.get_cached_computation` method for :py:class:`~reikna.cluda.api.Thread`.
+
+* ADDED: arrays now support setting arbitrary slices with scalars or arrays.
+
+* ADDED: support for ``get()`` method for non-contiguous arrays.
+
+* ADDED: :py:func:`~reikna.concatenate` for concatenating GPU arrays.
+
+* ADDED: :py:func:`~reikna.roll` for GPU arrays and the inplace version :py:meth:`~reikna.cluda.api.Array.roll`.
+
+* FIXED: updated the CUDA backend for the change ``async`` -> ``async_`` in the new versions of PyCUDA. Bumped PyOpenCL and PyCUDA versions to 2018.1.1.
+
+* FIXED: an error in the conversion of ``numpy.int64`` to ctype for Windows.
+
+* FIXED: an unstable type of ``nbytes`` in ``Thread.array()``, leading to problems with calling the C++ backend later on.
+
+* FIXED: a bug where a nonzero offset was ignored when building an accessor macro for an array with a zero-length shape.
+
+
 0.7.2 (16 Sep 2018)
 ===================
 

diff --git a/reikna/__init__.py b/reikna/__init__.py
@@ -1 +1,2 @@
+from reikna.cluda.array_helpers import concatenate, roll
 
diff --git a/reikna/cluda/api.py b/reikna/cluda/api.py
@@ -31,6 +31,16 @@
 
         The total size of the array data plus the offset (in bytes).
 
+    .. py:method:: roll(shift, axis=-1)
+
+        Cyclically shifts elements of ``array`` **inplace**
+        by ``shift`` positions to the right along ``axis``.
+        ``shift`` can be negative (in which case the elements are shifted to the left).
+        Elements that are shifted beyond the last position are re-introduced at the first
+        (and vice versa).
+
+        Works equivalently to ``numpy.roll`` (except ``axis=None`` is not supported).
+
     .. py:method:: get()
 
         Returns ``numpy.ndarray`` with the contents of the array.
@@ -188,7 +198,6 @@ def create(cls, interactive=False, device_filters=None, **thread_kwds):
         :param device_filters: keywords to filter devices
             (see the keywords for :py:func:`~reikna.cluda.find_devices`).
         :param thread_kwds: keywords to pass to :py:class:`Thread` constructor.
-        :param kwds: same as in :py:class:`Thread`.
         """
 
         if device_filters is None:
@@ -267,6 +276,34 @@ def __init__(self, cqd, async_=True, temp_alloc=None):
             pack_on_alloc=temp_alloc_params['pack_on_alloc'],
             pack_on_free=temp_alloc_params['pack_on_free'])
 
+        self._computation_cache = dict()
+
+    def get_cached_computation(self, cls, *args, **kwds):
+        """
+        Builds ``cls(*args, **kwds)``, compiles it for this thread and returns the result.
+        Compiled computations are memoized per thread, so requesting the same ``cls``
+        with the same arguments again returns the previously compiled object.
+
+        .. note::
+
+            All of ``args`` and ``kwds`` must be hashable! If any of those are arrays,
+            they can be passed through ``Type.from_value()``.
+        """
+
+        # TODO: we could make given numpy arrays and GPU arrays hashable
+        # by calling ``Type.from_value()`` on them automatically,
+        # but that would require importing the ``Array`` type, creating a circular dependency.
+
+        # Keyword arguments are ordered by name so that the order
+        # in which they were passed does not affect the cache key.
+        cache = self._computation_cache
+        cache_key = (cls, args, tuple((name, kwds[name]) for name in sorted(kwds)))
+
+        if cache_key not in cache:
+            cache[cache_key] = cls(*args, **kwds).compile(self)
+
+        return cache[cache_key]
+
     def allocate(self, size):
         """
         Creates an untyped memory allocation object of type :py:class:`Buffer` with size ``size``.
@@ -517,6 +554,8 @@ def release(self):
         # and auto-releasable when not in use (crucial for CUDA because of its
         # stupid stateful contexts), I'll leave it as it is for now.
 
+        self._computation_cache.clear()
+
         try:
             released = self._released
         except AttributeError:

diff --git a/reikna/cluda/array_helpers.py b/reikna/cluda/array_helpers.py
@@ -0,0 +1,199 @@
+# FIXME: This module is a part of Array functionality, so it is located on CLUDA level,
+# but it requires some high-level Reikna functionality (computations and transformations).
+# So it is a bit of a circular dependency.
+# Ideally, this should be moved to computation level, perhaps creating two versions of Array -
+# CLUDA level (without __setitem__) and Reikna level (with one).
+
+import numpy
+
+import reikna.cluda.dtypes as dtypes
+import reikna.transformations as transformations
+import reikna.cluda.functions as functions
+from reikna.algorithms import PureParallel
+from reikna.core import Type, Parameter, Annotation, Computation
+
+
+def normalize_value(thr, gpu_array_type, val):
+    """
+    Transforms a given value (a scalar or an array)
+    to a value that can be passed to a kernel.
+
+    :param thr: the thread whose ``to_device()`` is used to upload ``numpy`` arrays.
+    :param gpu_array_type: the GPU array class; its instances are returned unchanged.
+    :param val: a GPU array, a ``numpy.ndarray``, or a scalar.
+    :returns: ``val`` itself for a GPU array, the result of ``thr.to_device(val)``
+        for a ``numpy`` array, and otherwise ``val`` converted to the dtype
+        reported by ``dtypes.detect_type()``.
+    """
+    if isinstance(val, gpu_array_type):
+        return val
+    if isinstance(val, numpy.ndarray):
+        return thr.to_device(val)
+
+    # ``numpy.cast`` was removed in NumPy 2.0;
+    # ``numpy.asarray(..., dtype=...)`` is its documented replacement.
+    dtype = dtypes.detect_type(val)
+    return numpy.asarray(val, dtype=dtype)
+
+
+def setitem_computation(dest, source):
+    """
+    Returns a computation that writes ``source`` into ``dest``,
+    where ``dest`` describes a GPU array, and ``source`` describes
+    either a GPU array or a scalar (an object with an empty ``shape``).
+    The caller is expected to compile the result,
+    e.g. by requesting it through ``Thread.get_cached_computation()``.
+    """
+    if len(source.shape) != 0:
+        # Array source: a plain copy into ``dest``, with a cast transformation
+        # attached to the input so that ``source`` can keep its own dtype.
+        source_as_dest_dtype = Type.from_value(source).with_dtype(dest.dtype)
+        copy_trf = transformations.copy(source_as_dest_dtype, dest)
+        comp = PureParallel.from_trf(copy_trf, guiding_array=copy_trf.output)
+        cast_trf = transformations.cast(source, dest.dtype)
+        comp.parameter.input.connect(cast_trf, cast_trf.output, src_input=cast_trf.input)
+        return comp
+
+    # Scalar source: fill the whole of ``dest`` with a single value.
+    broadcast_trf = transformations.broadcast_param(dest)
+    return PureParallel.from_trf(broadcast_trf, guiding_array=broadcast_trf.output)
+
+
+def setitem_method(array, index, value):
+    """
+    Implements ``array[index] = value`` for GPU arrays,
+    where ``value`` is a scalar, a ``numpy`` array or a GPU array.
+    """
+    # Shared by ``cuda.Array`` and ``ocl.Array``, which is why it is a standalone function.
+
+    # PyOpenCL and PyCUDA do implement __setitem__() for a restricted set of cases,
+    # but working out when they apply is too complicated,
+    # so our own implementation is used unconditionally.
+
+    target = array[index]
+    thr = array.thread
+    source = normalize_value(thr, type(array), value)
+    assign = thr.get_cached_computation(
+        setitem_computation, Type.from_value(target), Type.from_value(source))
+    assign(target, source)
+
+
+def get_method(array):
+    """
+    Returns the result of ``get()`` called on a freshly allocated array
+    of the same shape and dtype, into which ``array`` was copied on the device.
+    """
+    thr = array.thread
+    staging = thr.array(array.shape, array.dtype)
+    copy_comp = thr.get_cached_computation(
+        setitem_computation, Type.from_value(staging), Type.from_value(array))
+    copy_comp(staging, array)
+    return staging.get()
+
+
+def is_shape_compatible(template_shape, shape, axis):
+    """
+    Checks whether an array of shape ``shape`` can be joined along ``axis``
+    with an array of shape ``template_shape``.
+
+    :param template_shape: the reference shape.
+    :param shape: the shape being checked.
+    :param axis: non-negative index of the axis whose length is allowed to differ.
+    :returns: ``True`` if both shapes have the same number of dimensions
+        and equal lengths along every axis except ``axis``, ``False`` otherwise.
+    """
+    # Shapes of different ranks can never be concatenated; checking this first also
+    # prevents indexing past the end of the shorter shape.
+    if len(shape) != len(template_shape):
+        return False
+    return all(
+        length == template_length
+        for i, (template_length, length) in enumerate(zip(template_shape, shape))
+        if i != axis)
+
+
+def concatenate(arrays, axis=0, out=None):
+    """
+    Concatenate an iterable of arrays along ``axis`` and write them to ``out``
+    (allocating it if it is set to ``None``).
+
+    Works analogously to ``numpy.concatenate()`` (except ``axis=None`` is not supported).
+
+    :param arrays: an iterable of GPU arrays with the same dtype and with shapes
+        that differ only along ``axis``.
+    :param axis: the axis to join along (can be negative).
+    :param out: an optional destination array of the resulting shape and dtype.
+    :returns: ``out``, or the newly allocated array if ``out`` was ``None``.
+    :raises ValueError: if ``arrays`` is empty or contains zero-dimensional arrays,
+        if dtypes or shapes of the arrays do not match,
+        or if ``out`` has an unexpected shape or dtype.
+    """
+
+    # TODO: support axis=None.
+    # Requires Array.ravel() returning an Array instead of CUDA/PyOpenCL array.
+
+    # Materialize the input, since it is measured, indexed and traversed several times below,
+    # which would not work for generators and other one-shot iterables.
+    arrays = list(arrays)
+
+    if len(arrays) == 0:
+        raise ValueError("Need at least one array to concatenate")
+    if any(array.dtype != arrays[0].dtype for array in arrays[1:]):
+        raise ValueError("Data types of all arrays must be the same")
+
+    dtype = arrays[0].dtype
+    thread = arrays[0].thread
+
+    template_shape = arrays[0].shape
+    if len(template_shape) == 0:
+        # Without this check the axis normalization below fails with ZeroDivisionError.
+        raise ValueError("Zero-dimensional arrays cannot be concatenated")
+    axis = axis % len(template_shape)
+    for array in arrays[1:]:
+        if not is_shape_compatible(template_shape, array.shape, axis):
+            raise ValueError(
+                "Shapes are not compatible: "
+                + str(template_shape) + " and " + str(array.shape))
+
+    out_shape = list(template_shape)
+    out_shape[axis] = sum(array.shape[axis] for array in arrays)
+    out_shape = tuple(out_shape)
+
+    if out is None:
+        out = thread.array(out_shape, dtype)
+    else:
+        if out.shape != out_shape:
+            raise ValueError(
+                "Incorrect output shape: expected " + str(out_shape) + ", got " + str(out.shape))
+        if out.dtype != dtype:
+            raise ValueError(
+                "Incorrect output dtype: expected " + str(dtype) + ", got " + str(out.dtype))
+
+    # Write every array into its own window of ``out`` along ``axis``.
+    offset = 0
+    slices = [slice(None)] * len(out_shape)
+    for array in arrays:
+        slices[axis] = slice(offset, offset + array.shape[axis])
+        out[tuple(slices)] = array
+        offset += array.shape[axis]
+
+    return out
+
+
+def roll_computation(array, axis):
+    """
+    Returns a ``PureParallel`` computation with the parameters ``output``, ``input``
+    (both annotated with ``array``) and ``shift`` (a 32-bit integer scalar).
+    Every element of ``input`` is stored in ``output`` at the same position,
+    except for its index along ``axis``, which is cyclically moved by ``shift``.
+
+    ``axis`` is compared directly with the dimension number in the kernel template,
+    so it has to be a non-negative index (the callers in this module
+    normalize it with a modulo before passing it here).
+    """
+    return PureParallel(
+        [
+            Parameter('output', Annotation(array, 'o')),
+            Parameter('input', Annotation(array, 'i')),
+            Parameter('shift', Annotation(Type(numpy.int32)))],
+        """
+        <%
+            shape = input.shape
+        %>
+        %for i in range(len(shape)):
+            VSIZE_T output_${idxs[i]} =
+                %if i == axis:
+                ${shift} == 0 ?
+                    ${idxs[i]} :
+                    ## Since ``shift`` can be negative, and its absolute value greater than
+                    ## ``shape[i]``, a double modulo division is necessary
+                    ## (the ``%`` operator preserves the sign of the dividend in C).
+                    (${idxs[i]} + (${shape[i]} + ${shift} % ${shape[i]})) % ${shape[i]};
+                %else:
+                ${idxs[i]};
+                %endif
+        %endfor
+        ${output.store_idx}(
+            ${", ".join("output_" + name for name in idxs)},
+            ${input.load_idx}(${", ".join(idxs)}));
+        """,
+        guiding_array='input',
+        render_kwds=dict(axis=axis))
+
+
+class RollInplace(Computation):
+    """
+    A computation that cyclically shifts the elements of ``array`` along a fixed axis,
+    writing the result back into the same array.
+    Takes two parameters: ``array`` (input and output) and ``shift`` (a 32-bit integer scalar).
+    """
+
+    def __init__(self, array, axis):
+        # The axis is fixed at construction time; only the shift is a runtime parameter.
+        self._axis = axis
+        parameters = [
+            Parameter('array', Annotation(array, 'io')),
+            Parameter('shift', Annotation(Type(numpy.int32)))]
+        Computation.__init__(self, parameters)
+
+    def _build_plan(self, plan_factory, device_params, array, shift):
+        plan = plan_factory()
+
+        # Step 1: write the shifted elements into a temporary array.
+        scratch = plan.temp_array_like(array)
+        roll_comp = roll_computation(array, self._axis)
+        plan.computation_call(roll_comp, scratch, array, shift)
+
+        # Step 2: copy the temporary array back over the original one.
+        copy_trf = transformations.copy(scratch, out_arr_t=array)
+        copy_back = PureParallel.from_trf(copy_trf, guiding_array=copy_trf.output)
+        plan.computation_call(copy_back, array, scratch)
+
+        return plan
+
+
+def roll(array, shift, axis=-1):
+    """
+    Returns a new array with the elements of ``array`` cyclically shifted
+    by ``shift`` positions to the right along ``axis``.
+    A negative ``shift`` moves the elements to the left.
+    Elements pushed past the last position reappear at the first one (and vice versa).
+
+    Works equivalently to ``numpy.roll`` (except ``axis=None`` is not supported).
+    """
+    thr = array.thread
+    rolled = thr.array(array.shape, array.dtype)
+
+    # The modulo maps a negative ``axis`` to the corresponding non-negative index.
+    normalized_axis = axis % len(array.shape)
+    roll_comp = thr.get_cached_computation(
+        roll_computation, Type.from_value(array), normalized_axis)
+    roll_comp(rolled, array, shift)
+
+    return rolled
+
+
+def roll_method(array, shift, axis=-1):
+    """
+    Cyclically shifts the elements of ``array`` by ``shift`` positions along ``axis``,
+    modifying ``array`` itself (through the ``RollInplace`` computation).
+    """
+    # The modulo maps a negative ``axis`` to the corresponding non-negative index.
+    normalized_axis = axis % len(array.shape)
+    roll_inplace = array.thread.get_cached_computation(
+        RollInplace, Type.from_value(array), normalized_axis)
+    roll_inplace(array, shift)
diff --git a/reikna/cluda/cuda.py b/reikna/cluda/cuda.py
@@ -13,6 +13,8 @@
 from reikna.helpers import factors, wrap_in_tuple, product, min_buffer_size
 import reikna.cluda.api as api_base
 
+from reikna.cluda.array_helpers import setitem_method, get_method, roll_method
+
 
 cuda.init()
 
@@ -101,10 +103,7 @@ def copy(self):
         so we're overriding it.
         """
         new_arr = self._new_like_me()
-        # FIXME: a temporary workaround for PyCUDA not being compatible with Py3.7
-        # where `async` is a keyword.
-        kwds = {'async': True, 'stream': self.thread._queue}
-        gpuarray._memcpy_discontig(new_arr, self, **kwds)
+        gpuarray._memcpy_discontig(new_arr, self, async_=True, stream=self.thread._queue)
         return new_arr
 
     def _new_like_me(self, dtype=None):
@@ -125,6 +124,18 @@ def __getitem__(self, index):
             base_data=self.base_data,
             offset=int(res.gpudata) - int(self.base_data))
 
+    def __setitem__(self, index, value):
+        # Delegates to the backend-independent implementation from ``array_helpers``.
+        setitem_method(self, index, value)
+
+    def roll(self, shift, axis=-1):
+        # Delegates to the backend-independent implementation from ``array_helpers``,
+        # which modifies this array itself.
+        roll_method(self, shift, axis=axis)
+
+    def get(self):
+        # ``GPUArray.get()`` is only used when the ``forc`` flag is set;
+        # any other array goes through the helper that copies it into a new array first.
+        if not self.flags.forc:
+            return get_method(self)
+        return gpuarray.GPUArray.get(self)
+
     def _tempalloc_update_buffer(self, data):
         self.base_data = data
         self.gpudata = int(self.base_data) + self.offset
@@ -166,7 +177,7 @@ def array(
         dtype = dtypes.normalize_type(dtype)
         shape = wrap_in_tuple(shape)
         if nbytes is None:
-            nbytes = min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset)
+            nbytes = int(min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset))
 
         if (offset != 0 or strides is not None) and base_data is None and base is None:
             base_data = allocator(nbytes)

diff --git a/reikna/cluda/dtypes.py b/reikna/cluda/dtypes.py
@@ -185,7 +185,7 @@ def _fill_dtype_registry(respect_windows=True):
     _register_dtype(numpy.int32, "int")
     _register_dtype(numpy.uint32, "unsigned int")
 
-    if platform.system == 'Windows' and respect_windows:
+    if platform.system() == 'Windows' and respect_windows:
         i64_name = "long long"
     else:
         i64_name = "long"