diff --git a/.github/workflows/docker-bases.yml b/.github/workflows/docker-bases.yml index 9d964c6548..090ff631da 100644 --- a/.github/workflows/docker-bases.yml +++ b/.github/workflows/docker-bases.yml @@ -255,4 +255,6 @@ jobs: file: './docker/Dockerfile.amd' push: true target: 'hip' + build-args: | + ROCM_VERSION=6.3.4 tags: devitocodes/bases:amd-hip diff --git a/devito/__init__.py b/devito/__init__.py index b8d7621297..e6cf0092bf 100644 --- a/devito/__init__.py +++ b/devito/__init__.py @@ -56,7 +56,8 @@ def reinit_compiler(val): """ Re-initialize the Compiler. """ - configuration['compiler'].__init__(suffix=configuration['compiler'].suffix, + configuration['compiler'].__init__(name=configuration['compiler'].name, + suffix=configuration['compiler'].suffix, mpi=configuration['mpi']) return val @@ -65,7 +66,7 @@ def reinit_compiler(val): configuration.add('platform', 'cpu64', list(platform_registry), callback=lambda i: platform_registry[i]()) configuration.add('compiler', 'custom', compiler_registry, - callback=lambda i: compiler_registry[i]()) + callback=lambda i: compiler_registry[i](name=i)) # Setup language for shared-memory parallelism preprocessor = lambda i: {0: 'C', 1: 'openmp'}.get(i, i) # Handles DEVITO_OPENMP deprec diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 6987ce3cb6..04bd30bcbf 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -330,6 +330,69 @@ def cbk(deviceid=0): except OSError: pass + # *** Third try: `sycl-ls`, clearly only works with Intel cards + try: + gpu_infos = {} + + # sycl-ls sometimes finds gpu twice with opencl and without so + # we need to make sure we don't get duplicates + selected_platform = None + platform_block = "" + + proc = Popen(["sycl-ls", "--verbose"], stdout=PIPE, stderr=DEVNULL, text=True) + sycl_output, _ = proc.communicate() + + # Extract platform blocks + platforms = re.findall(r"Platform \[#(\d+)\]:([\s\S]*?)(?=Platform \[#\d+\]:|$)", + sycl_output) + + # Select Level-Zero 
if available, otherwise use OpenCL + for platform_id, platform_content in platforms: + if "Intel(R) Level-Zero" in platform_content: + selected_platform = platform_id + platform_block = platform_content + break + elif "Intel(R) OpenCL Graphics" in platform_content and \ + selected_platform is None: + selected_platform = platform_id + platform_block = platform_content + + # Extract GPU devices from the selected platform + devices = re.findall(r"Device \[#(\d+)\]:([\s\S]*?)(?=Device \[#\d+\]:|$)", + platform_block) + + for device_id, device_block in devices: + if re.search(r"^\s*Type\s*:\s*gpu", device_block, re.MULTILINE): + name_match = re.search(r"^\s*Name\s*:\s*(.+)", device_block, re.MULTILINE) + + if name_match: + name = name_match.group(1).strip() + + # Store GPU info with correct physical ID + gpu_infos[device_id] = { + "physicalid": device_id, + "product": name + } + + gpu_info = homogenise_gpus(list(gpu_infos.values())) + + # Also attach callbacks to retrieve instantaneous memory info + # Now this should be done using xpu-smi but for some reason + # it throws a lot of weird errors in docker so skipping for now + for i in ['total', 'free', 'used']: + def make_cbk(i): + def cbk(deviceid=0): + return None + return cbk + + gpu_info['mem.%s' % i] = make_cbk(i) + + gpu_info['architecture'] = 'Intel' + return gpu_info + + except OSError: + pass + # *** Second try: `lshw` try: info_cmd = ['lshw', '-C', 'video'] @@ -391,7 +454,7 @@ def parse_product_arch(): gpu_infos = [] for line in lines: # Graphics cards are listed as VGA or 3D controllers in lspci - if 'VGA' in line or '3D' in line: + if any(i in line for i in ('VGA', '3D', 'Display')): gpu_info = {} # Lines produced by lspci command are of the form: # xxxx:xx:xx.x Device Type: Name diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py index aa8cfa0795..a2b9334f54 100644 --- a/devito/arch/compiler.py +++ b/devito/arch/compiler.py @@ -180,12 +180,21 @@ def __init__(self): """ fields = {'cc', 'ld'} - 
_cpp = False + _default_cpp = False + _cxxstd = 'c++14' + _cstd = 'c99' def __init__(self, **kwargs): + maybe_name = kwargs.pop('name', self.__class__.__name__) + if isinstance(maybe_name, Compiler): + self._name = maybe_name.name + else: + self._name = maybe_name + super().__init__(**kwargs) self.__lookup_cmds__() + self._cpp = kwargs.get('cpp', self._default_cpp) self.suffix = kwargs.get('suffix') if not kwargs.get('mpi'): @@ -195,7 +204,7 @@ def __init__(self, **kwargs): self.cc = self.MPICC if self._cpp is False else self.MPICXX self.ld = self.cc # Wanted by the superclass - self.cflags = ['-O3', '-g', '-fPIC', '-Wall', '-std=c99'] + self.cflags = ['-O3', '-g', '-fPIC', '-Wall', f'-std={self.std}'] self.ldflags = ['-shared'] self.include_dirs = [] @@ -225,13 +234,13 @@ def __new_with__(self, **kwargs): Create a new Compiler from an existing one, inherenting from it the flags that are not specified via ``kwargs``. """ - return self.__class__(suffix=kwargs.pop('suffix', self.suffix), + return self.__class__(name=self.name, suffix=kwargs.pop('suffix', self.suffix), mpi=kwargs.pop('mpi', configuration['mpi']), **kwargs) @property def name(self): - return self.__class__.__name__ + return self._name @property def version(self): @@ -247,6 +256,10 @@ def version(self): return version + @property + def std(self): + return self._cxxstd if self._cpp else self._cstd + def get_version(self): result, stdout, stderr = call_capture_output((self.cc, "--version")) if result != 0: @@ -482,7 +495,7 @@ def __init_finalize__(self, **kwargs): platform = kwargs.pop('platform', configuration['platform']) if isinstance(platform, NvidiaDevice): - self.cflags.remove('-std=c99') + self.cflags.remove(f'-std={self.std}') # Add flags for OpenMP offloading if language in ['C', 'openmp']: cc = get_nvidia_cc() @@ -490,7 +503,7 @@ def __init_finalize__(self, **kwargs): self.cflags += ['-Xopenmp-target', f'-march=sm_{cc}'] self.ldflags += ['-fopenmp', '-fopenmp-targets=nvptx64-nvidia-cuda'] elif 
platform is AMDGPUX: - self.cflags.remove('-std=c99') + self.cflags.remove(f'-std={self.std}') # Add flags for OpenMP offloading if language in ['C', 'openmp']: self.ldflags += ['-target', 'x86_64-pc-linux-gnu'] @@ -553,9 +566,9 @@ def __init_finalize__(self, **kwargs): self.ldflags += ['-fopenmp'] if isinstance(platform, NvidiaDevice): - self.cflags.remove('-std=c99') + self.cflags.remove(f'-std={self.std}') elif platform is AMDGPUX: - self.cflags.remove('-std=c99') + self.cflags.remove(f'-std={self.std}') # Add flags for OpenMP offloading if language in ['C', 'openmp']: self.ldflags += ['-target', 'x86_64-pc-linux-gnu'] @@ -590,16 +603,13 @@ def __lookup_cmds__(self): class PGICompiler(Compiler): - _cpp = True + _default_cpp = True def __init_finalize__(self, **kwargs): - self.cflags.remove('-std=c99') self.cflags.remove('-O3') self.cflags.remove('-Wall') - self.cflags.append('-std=c++11') - language = kwargs.pop('language', configuration['language']) platform = kwargs.pop('platform', configuration['platform']) @@ -643,14 +653,13 @@ def __lookup_cmds__(self): class CudaCompiler(Compiler): - _cpp = True + _default_cpp = True def __init_finalize__(self, **kwargs): - self.cflags.remove('-std=c99') self.cflags.remove('-Wall') self.cflags.remove('-fPIC') - self.cflags.extend(['-std=c++14', '-Xcompiler', '-fPIC']) + self.cflags.extend(['-Xcompiler', '-fPIC']) if configuration['mpi']: # We rather use `nvcc` to compile MPI, but for this we have to @@ -717,15 +726,10 @@ def __lookup_cmds__(self): class HipCompiler(Compiler): - _cpp = True + _default_cpp = True def __init_finalize__(self, **kwargs): - self.cflags.remove('-std=c99') - self.cflags.remove('-Wall') - self.cflags.remove('-fPIC') - self.cflags.extend(['-std=c++14', '-fPIC']) - if configuration['mpi']: # We rather use `hipcc` to compile MPI, but for this we have to # explicitly pass the flags that an `mpicc` would implicitly use @@ -831,7 +835,7 @@ def __init_finalize__(self, **kwargs): language = 
kwargs.pop('language', configuration['language']) if language == 'sycl': - raise ValueError("Use SyclCompiler to jit-compile sycl") + warning(f"Use SyclCompiler (`sycl`) to jit-compile sycl, not {self.name}") elif language == 'openmp': # Earlier versions to OneAPI 2023.2.0 (clang17 underneath), have an @@ -878,7 +882,7 @@ def __lookup_cmds__(self): class SyclCompiler(OneapiCompiler): - _cpp = True + _default_cpp = True def __init_finalize__(self, **kwargs): IntelCompiler.__init_finalize__(self, **kwargs) @@ -887,9 +891,9 @@ def __init_finalize__(self, **kwargs): language = kwargs.pop('language', configuration['language']) if language != 'sycl': - raise ValueError("Expected language sycl with SyclCompiler") + warning(f"Expected language sycl with SyclCompiler, not {language}") - self.cflags.remove('-std=c99') + self.cflags.remove(f'-std={self.std}') self.cflags.append('-fsycl') self.cflags.remove('-g') # -g disables some optimizations in IGC @@ -903,7 +907,7 @@ def __init_finalize__(self, **kwargs): elif isinstance(platform, IntelDevice): self.cflags.append('-fsycl-targets=spir64') else: - raise NotImplementedError(f"Unsupported platform {platform}") + warning(f"Unsupported platform {platform}") class CustomCompiler(Compiler): @@ -945,7 +949,6 @@ def __new__(cls, *args, **kwargs): obj = super().__new__(cls) # Keep base to initialize accordingly obj._base = kwargs.pop('base', _base) - obj._cpp = obj._base._cpp return obj @@ -976,6 +979,10 @@ def __lookup_cmds__(self): def __new_with__(self, **kwargs): return super().__new_with__(base=self._base, **kwargs) + @property + def _default_cpp(self): + return self._base._default_cpp + class CompilerRegistry(dict): """ @@ -984,15 +991,19 @@ class CompilerRegistry(dict): """ def __getitem__(self, key): + if isinstance(key, Compiler): + key = key.name + if key.startswith('gcc-'): i = key.split('-')[1] return partial(GNUCompiler, suffix=i) + return super().__getitem__(key) - def __contains__(self, k): - if isinstance(k, 
Compiler): - k = k.name - return k in self.keys() or k.startswith('gcc-') + def __contains__(self, key): + if isinstance(key, Compiler): + key = key.name + return key in self.keys() or key.startswith('gcc-') _compiler_registry = { @@ -1011,6 +1022,7 @@ def __contains__(self, k): 'nvc++': NvidiaCompiler, 'nvidia': NvidiaCompiler, 'cuda': CudaCompiler, + 'nvcc': CudaCompiler, 'osx': ClangCompiler, 'intel': OneapiCompiler, 'icx': OneapiCompiler, diff --git a/devito/builtins/arithmetic.py b/devito/builtins/arithmetic.py index 93fbc917b5..5e807219f5 100644 --- a/devito/builtins/arithmetic.py +++ b/devito/builtins/arithmetic.py @@ -32,13 +32,13 @@ def norm(f, order=2): s = dv.types.Symbol(name='sum', dtype=n.dtype) op = dv.Operator([dv.Eq(s, 0.0)] + eqns + - [dv.Inc(s, dv.Abs(Pow(p, order))), dv.Eq(n[0], s)], + [dv.Inc(s, Pow(dv.Abs(p), order)), dv.Eq(n[0], s)], name='norm%d' % order) op.apply(**kwargs) v = np.power(n.data[0], 1/order) - return f.dtype(v) + return np.real(f.dtype(v)) @dv.switchconfig(log_level='ERROR') diff --git a/devito/builtins/utils.py b/devito/builtins/utils.py index 67aef28ba0..a83dd765a8 100644 --- a/devito/builtins/utils.py +++ b/devito/builtins/utils.py @@ -23,7 +23,10 @@ # NOTE: np.float128 isn't really a thing, see for example # https://github.com/numpy/numpy/issues/10288 # https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1070 - np.float64: np.float64 + np.float64: np.float64, + # ComplexX accumulates on Complex2X + np.complex64: np.complex128, + np.complex128: np.complex128, } diff --git a/devito/core/__init__.py b/devito/core/__init__.py index 507c5edf15..5b7ad7878b 100644 --- a/devito/core/__init__.py +++ b/devito/core/__init__.py @@ -1,41 +1,75 @@ from devito.arch import Cpu64, Intel64, Arm, Power, Device -from devito.core.cpu import (Cpu64NoopCOperator, Cpu64NoopOmpOperator, - Cpu64AdvCOperator, Cpu64AdvOmpOperator, - Cpu64FsgCOperator, Cpu64FsgOmpOperator, - Cpu64CustomOperator) -from devito.core.intel import 
(Intel64AdvCOperator, Intel64AdvOmpOperator, - Intel64FsgCOperator, Intel64FsgOmpOperator) -from devito.core.arm import ArmAdvCOperator, ArmAdvOmpOperator -from devito.core.power import PowerAdvCOperator, PowerAdvOmpOperator -from devito.core.gpu import (DeviceNoopOmpOperator, DeviceNoopAccOperator, - DeviceAdvOmpOperator, DeviceAdvAccOperator, - DeviceFsgOmpOperator, DeviceFsgAccOperator, - DeviceCustomOmpOperator, DeviceCustomAccOperator) +from devito.core.cpu import ( + Cpu64NoopCOperator, Cpu64NoopOmpOperator, + Cpu64AdvCOperator, Cpu64AdvOmpOperator, + Cpu64FsgCOperator, Cpu64FsgOmpOperator, + Cpu64CustomOperator, Cpu64CustomCXXOperator, + Cpu64CXXNoopCOperator, Cpu64CXXNoopOmpOperator, + Cpu64AdvCXXOperator, Cpu64AdvCXXOmpOperator, + Cpu64FsgCXXOperator, Cpu64FsgCXXOmpOperator +) +from devito.core.intel import ( + Intel64AdvCOperator, Intel64AdvOmpOperator, + Intel64FsgCOperator, Intel64FsgOmpOperator, + Intel64CXXAdvCOperator, Intel64AdvCXXOmpOperator, + Intel64FsgCXXOperator, Intel64FsgCXXOmpOperator +) +from devito.core.arm import ( + ArmAdvCOperator, ArmAdvOmpOperator, + ArmAdvCXXOperator, ArmAdvCXXOmpOperator +) +from devito.core.power import ( + PowerAdvCOperator, PowerAdvOmpOperator, + PowerCXXAdvCOperator, PowerAdvCXXOmpOperator +) +from devito.core.gpu import ( + DeviceNoopOmpOperator, DeviceNoopAccOperator, + DeviceAdvOmpOperator, DeviceAdvAccOperator, + DeviceFsgOmpOperator, DeviceFsgAccOperator, + DeviceCustomOmpOperator, DeviceCustomAccOperator +) from devito.operator.registry import operator_registry # Register CPU Operators operator_registry.add(Cpu64CustomOperator, Cpu64, 'custom', 'C') operator_registry.add(Cpu64CustomOperator, Cpu64, 'custom', 'openmp') +operator_registry.add(Cpu64CustomCXXOperator, Cpu64, 'custom', 'CXX') +operator_registry.add(Cpu64CustomCXXOperator, Cpu64, 'custom', 'CXXopenmp') operator_registry.add(Cpu64NoopCOperator, Cpu64, 'noop', 'C') operator_registry.add(Cpu64NoopOmpOperator, Cpu64, 'noop', 'openmp') 
+operator_registry.add(Cpu64CXXNoopCOperator, Cpu64, 'noop', 'CXX') +operator_registry.add(Cpu64CXXNoopOmpOperator, Cpu64, 'noop', 'CXXopenmp') operator_registry.add(Cpu64AdvCOperator, Cpu64, 'advanced', 'C') operator_registry.add(Cpu64AdvOmpOperator, Cpu64, 'advanced', 'openmp') +operator_registry.add(Cpu64AdvCXXOperator, Cpu64, 'advanced', 'CXX') +operator_registry.add(Cpu64AdvCXXOmpOperator, Cpu64, 'advanced', 'CXXopenmp') operator_registry.add(Cpu64FsgCOperator, Cpu64, 'advanced-fsg', 'C') operator_registry.add(Cpu64FsgOmpOperator, Cpu64, 'advanced-fsg', 'openmp') +operator_registry.add(Cpu64FsgCXXOperator, Cpu64, 'advanced-fsg', 'CXX') +operator_registry.add(Cpu64FsgCXXOmpOperator, Cpu64, 'advanced-fsg', 'CXXopenmp') operator_registry.add(Intel64AdvCOperator, Intel64, 'advanced', 'C') operator_registry.add(Intel64AdvOmpOperator, Intel64, 'advanced', 'openmp') +operator_registry.add(Intel64CXXAdvCOperator, Intel64, 'advanced', 'CXX') +operator_registry.add(Intel64AdvCXXOmpOperator, Intel64, 'advanced', 'CXXopenmp') + operator_registry.add(Intel64FsgCOperator, Intel64, 'advanced-fsg', 'C') operator_registry.add(Intel64FsgOmpOperator, Intel64, 'advanced-fsg', 'openmp') +operator_registry.add(Intel64FsgCXXOperator, Intel64, 'advanced-fsg', 'CXX') +operator_registry.add(Intel64FsgCXXOmpOperator, Intel64, 'advanced-fsg', 'CXXopenmp') operator_registry.add(ArmAdvCOperator, Arm, 'advanced', 'C') operator_registry.add(ArmAdvOmpOperator, Arm, 'advanced', 'openmp') +operator_registry.add(ArmAdvCXXOperator, Arm, 'advanced', 'CXX') +operator_registry.add(ArmAdvCXXOmpOperator, Arm, 'advanced', 'CXXopenmp') operator_registry.add(PowerAdvCOperator, Power, 'advanced', 'C') operator_registry.add(PowerAdvOmpOperator, Power, 'advanced', 'openmp') +operator_registry.add(PowerCXXAdvCOperator, Power, 'advanced', 'CXX') +operator_registry.add(PowerAdvCXXOmpOperator, Power, 'advanced', 'CXXopenmp') # Register Device Operators operator_registry.add(DeviceCustomOmpOperator, Device, 
'custom', 'C') diff --git a/devito/core/arm.py b/devito/core/arm.py index 0b765c1b52..f990ef31e0 100644 --- a/devito/core/arm.py +++ b/devito/core/arm.py @@ -1,19 +1,23 @@ -from devito.core.cpu import Cpu64AdvOperator -from devito.passes.iet import CTarget, OmpTarget +from devito.core.cpu import (Cpu64AdvOperator, Cpu64AdvCXXOperator, + Cpu64AdvCOperator) +from devito.passes.iet import OmpTarget, CXXOmpTarget -__all__ = ['ArmAdvCOperator', 'ArmAdvOmpOperator'] +__all__ = ['ArmAdvCOperator', 'ArmAdvOmpOperator', 'ArmAdvCXXOperator', + 'ArmAdvCXXOmpOperator'] -class ArmAdvOperator(Cpu64AdvOperator): - pass +ArmAdvOperator = Cpu64AdvOperator +ArmAdvCOperator = Cpu64AdvCOperator +ArmAdvCXXOperator = Cpu64AdvCXXOperator -class ArmAdvCOperator(ArmAdvOperator): - _Target = CTarget - - -class ArmAdvOmpOperator(ArmAdvOperator): +class ArmAdvOmpOperator(ArmAdvCOperator): _Target = OmpTarget # Avoid nested parallelism on ThunderX2 PAR_NESTED = 4 + + +class ArmAdvCXXOmpOperator(ArmAdvOmpOperator): + _Target = CXXOmpTarget + LINEARIZE = True diff --git a/devito/core/cpu.py b/devito/core/cpu.py index b9baedb237..d96d03e31d 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -8,14 +8,16 @@ from devito.passes.clusters import (Lift, blocking, buffering, cire, cse, factorize, fission, fuse, optimize_pows, optimize_hyperplanes) -from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, linearize, +from devito.passes.iet import (CTarget, CXXTarget, COmpTarget, CXXOmpTarget, + avoid_denormals, linearize, mpiize, hoist_prodders, relax_incr_dimensions, check_stability) from devito.tools import timed_pass __all__ = ['Cpu64NoopCOperator', 'Cpu64NoopOmpOperator', 'Cpu64AdvCOperator', 'Cpu64AdvOmpOperator', 'Cpu64FsgCOperator', 'Cpu64FsgOmpOperator', - 'Cpu64CustomOperator'] + 'Cpu64CustomOperator', 'Cpu64CustomCXXOperator', 'Cpu64AdvCXXOperator', + 'Cpu64AdvCXXOmpOperator', 'Cpu64FsgCXXOperator', 'Cpu64FsgCXXOmpOperator'] class Cpu64OperatorMixin: @@ -77,11 +79,12 @@ def 
_normalize_kwargs(cls, **kwargs): # Misc o['opt-comms'] = oo.pop('opt-comms', True) - o['linearize'] = oo.pop('linearize', False) + o['linearize'] = oo.pop('linearize', cls.LINEARIZE) o['mapify-reduce'] = oo.pop('mapify-reduce', cls.MAPIFY_REDUCE) o['index-mode'] = oo.pop('index-mode', cls.INDEX_MODE) o['place-transfers'] = oo.pop('place-transfers', True) o['errctl'] = oo.pop('errctl', cls.ERRCTL) + o['scalar-min-type'] = oo.pop('scalar-min-type', cls.SCALAR_MIN_TYPE) # Recognised but unused by the CPU backend oo.pop('par-disabled', None) @@ -244,7 +247,7 @@ def _normalize_kwargs(cls, **kwargs): class Cpu64CustomOperator(Cpu64OperatorMixin, CustomOperator): - _Target = OmpTarget + _Target = COmpTarget @classmethod def _make_dsl_passes_mapper(cls, **kwargs): @@ -317,6 +320,11 @@ def _make_iet_passes_mapper(cls, **kwargs): assert not (set(_known_passes) & set(_known_passes_disabled)) +class Cpu64CustomCXXOperator(Cpu64CustomOperator): + + _Target = CXXTarget + LINEARIZE = True + # Language level @@ -324,21 +332,51 @@ class Cpu64NoopCOperator(Cpu64NoopOperator): _Target = CTarget +class Cpu64CXXNoopCOperator(Cpu64NoopOperator): + _Target = CXXTarget + LINEARIZE = True + + class Cpu64NoopOmpOperator(Cpu64NoopOperator): - _Target = OmpTarget + _Target = COmpTarget + + +class Cpu64CXXNoopOmpOperator(Cpu64NoopOperator): + _Target = CXXOmpTarget + LINEARIZE = True class Cpu64AdvCOperator(Cpu64AdvOperator): _Target = CTarget +class Cpu64AdvCXXOperator(Cpu64AdvOperator): + _Target = CXXTarget + LINEARIZE = True + + class Cpu64AdvOmpOperator(Cpu64AdvOperator): - _Target = OmpTarget + _Target = COmpTarget + + +class Cpu64AdvCXXOmpOperator(Cpu64AdvOperator): + _Target = CXXOmpTarget + LINEARIZE = True class Cpu64FsgCOperator(Cpu64FsgOperator): _Target = CTarget +class Cpu64FsgCXXOperator(Cpu64FsgOperator): + _Target = CXXTarget + LINEARIZE = True + + class Cpu64FsgOmpOperator(Cpu64FsgOperator): - _Target = OmpTarget + _Target = COmpTarget + + +class 
Cpu64FsgCXXOmpOperator(Cpu64FsgOperator): + _Target = CXXOmpTarget + LINEARIZE = True diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 0e42b4886c..37a2e7228d 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -95,6 +95,7 @@ def _normalize_kwargs(cls, **kwargs): o['index-mode'] = oo.pop('index-mode', cls.INDEX_MODE) o['place-transfers'] = oo.pop('place-transfers', True) o['errctl'] = oo.pop('errctl', cls.ERRCTL) + o['scalar-min-type'] = oo.pop('scalar-min-type', cls.SCALAR_MIN_TYPE) if oo: raise InvalidOperator("Unsupported optimization options: [%s]" diff --git a/devito/core/intel.py b/devito/core/intel.py index 3b8f8b0208..9e378ffc12 100644 --- a/devito/core/intel.py +++ b/devito/core/intel.py @@ -1,11 +1,18 @@ from devito.core.cpu import (Cpu64AdvCOperator, Cpu64AdvOmpOperator, - Cpu64FsgCOperator, Cpu64FsgOmpOperator) + Cpu64FsgCOperator, Cpu64FsgOmpOperator, + Cpu64AdvCXXOperator, Cpu64AdvCXXOmpOperator, + Cpu64FsgCXXOperator, Cpu64FsgCXXOmpOperator) __all__ = ['Intel64AdvCOperator', 'Intel64AdvOmpOperator', 'Intel64FsgCOperator', - 'Intel64FsgOmpOperator'] + 'Intel64FsgOmpOperator', 'Intel64CXXAdvCOperator', 'Intel64AdvCXXOmpOperator', + 'Intel64FsgCXXOperator', 'Intel64FsgCXXOmpOperator'] Intel64AdvCOperator = Cpu64AdvCOperator Intel64AdvOmpOperator = Cpu64AdvOmpOperator Intel64FsgCOperator = Cpu64FsgCOperator Intel64FsgOmpOperator = Cpu64FsgOmpOperator +Intel64CXXAdvCOperator = Cpu64AdvCXXOperator +Intel64AdvCXXOmpOperator = Cpu64AdvCXXOmpOperator +Intel64FsgCXXOperator = Cpu64FsgCXXOperator +Intel64FsgCXXOmpOperator = Cpu64FsgCXXOmpOperator diff --git a/devito/core/operator.py b/devito/core/operator.py index e6bfd18916..cd094ed04b 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -1,6 +1,8 @@ from collections.abc import Iterable from functools import cached_property +import numpy as np + from devito.core.autotuning import autotune from devito.exceptions import InvalidOperator from devito.ir import FindSymbols @@ -67,6 
+69,11 @@ class BasicOperator(Operator): intensity of the generated kernel. """ + SCALAR_MIN_TYPE = np.float16 + """ + Minimum datatype for a scalar arising from a common sub-expression or CIRE temp. + """ + PAR_COLLAPSE_NCORES = 4 """ Use a collapse clause if the number of available physical cores is greater @@ -131,6 +138,11 @@ class BasicOperator(Operator): (default) or `int64`. """ + LINEARIZE = False + """ + Linearize n-dimensional Indexeds. + """ + ERRCTL = None """ Runtime error checking. If this option is enabled, the generated code will diff --git a/devito/core/power.py b/devito/core/power.py index ab651a1910..65cf4c3cf3 100644 --- a/devito/core/power.py +++ b/devito/core/power.py @@ -1,6 +1,10 @@ -from devito.core.cpu import Cpu64AdvCOperator, Cpu64AdvOmpOperator +from devito.core.cpu import (Cpu64AdvCOperator, Cpu64AdvOmpOperator, + Cpu64AdvCXXOperator, Cpu64AdvCXXOmpOperator) -__all__ = ['PowerAdvCOperator', 'PowerAdvOmpOperator'] +__all__ = ['PowerAdvCOperator', 'PowerAdvOmpOperator', + 'PowerCXXAdvCOperator', 'PowerAdvCXXOmpOperator'] PowerAdvCOperator = Cpu64AdvCOperator PowerAdvOmpOperator = Cpu64AdvOmpOperator +PowerCXXAdvCOperator = Cpu64AdvCXXOperator +PowerAdvCXXOmpOperator = Cpu64AdvCXXOmpOperator diff --git a/devito/data/allocators.py b/devito/data/allocators.py index aff28ef108..4ccd7cddfc 100644 --- a/devito/data/allocators.py +++ b/devito/data/allocators.py @@ -1,6 +1,4 @@ import abc -from functools import reduce -from operator import mul import ctypes from ctypes.util import find_library import mmap @@ -11,7 +9,7 @@ from devito.logger import logger from devito.parameters import configuration -from devito.tools import dtype_to_ctype, is_integer +from devito.tools import is_integer, infer_datasize __all__ = ['ALLOC_ALIGNED', 'ALLOC_NUMA_LOCAL', 'ALLOC_NUMA_ANY', 'ALLOC_KNL_MCDRAM', 'ALLOC_KNL_DRAM', 'ALLOC_GUARD', @@ -92,8 +90,7 @@ def initialize(cls): return def alloc(self, shape, dtype, padding=0): - datasize = int(reduce(mul, shape)) - 
ctype = dtype_to_ctype(dtype) + ctype, datasize = infer_datasize(dtype, shape) # Add padding, if any try: diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py index 2009f7a772..39458424d4 100644 --- a/devito/finite_differences/differentiable.py +++ b/devito/finite_differences/differentiable.py @@ -18,8 +18,7 @@ from devito.logger import warning from devito.tools import (as_tuple, filter_ordered, flatten, frozendict, infer_dtype, is_integer, split) -from devito.types import (Array, DimensionTuple, Evaluable, Indexed, - StencilDimension) +from devito.types import Array, DimensionTuple, Evaluable, StencilDimension from devito.types.basic import AbstractFunction __all__ = ['Differentiable', 'DiffDerivative', 'IndexDerivative', 'EvalDerivative', @@ -74,7 +73,7 @@ def grid(self): @cached_property def dtype(self): - dtypes = {f.dtype for f in self.find(Indexed)} - {None} + dtypes = {f.dtype for f in self._functions} - {None} return infer_dtype(dtypes) @cached_property diff --git a/devito/ir/__init__.py b/devito/ir/__init__.py index 0cfa5730eb..78a10d0a8c 100644 --- a/devito/ir/__init__.py +++ b/devito/ir/__init__.py @@ -2,3 +2,4 @@ from devito.ir.equations import * # noqa from devito.ir.clusters import * # noqa from devito.ir.iet import * # noqa +from devito.ir.cgen import * # noqa \ No newline at end of file diff --git a/devito/ir/cgen/__init__.py b/devito/ir/cgen/__init__.py new file mode 100644 index 0000000000..b9d0e6ed2c --- /dev/null +++ b/devito/ir/cgen/__init__.py @@ -0,0 +1 @@ +from devito.ir.cgen.printer import * # noqa \ No newline at end of file diff --git a/devito/symbolics/printer.py b/devito/ir/cgen/printer.py similarity index 52% rename from devito/symbolics/printer.py rename to devito/ir/cgen/printer.py index 2c366a389e..cf6eee1a7c 100644 --- a/devito/symbolics/printer.py +++ b/devito/ir/cgen/printer.py @@ -1,28 +1,28 @@ """ Utilities to turn SymPy objects into C strings. 
""" - import numpy as np import sympy from mpmath.libmp import prec_to_dps, to_str from packaging.version import Version -from numbers import Real from sympy.core import S from sympy.core.numbers import equal_valued, Float +from sympy.printing.codeprinter import CodePrinter from sympy.logic.boolalg import BooleanFunction from sympy.printing.precedence import PRECEDENCE_VALUES, precedence -from sympy.printing.c import C99CodePrinter +from devito import configuration from devito.arch.compiler import AOMPCompiler from devito.symbolics.inspection import has_integer_args, sympy_dtype from devito.types.basic import AbstractFunction +from devito.tools import ctypes_to_cstr, dtype_to_ctype, ctypes_vector_mapper -__all__ = ['ccode'] +__all__ = ['BasePrinter', 'ccode'] -class CodePrinter(C99CodePrinter): +class BasePrinter(CodePrinter): """ Decorator for sympy.printing.ccode.CCodePrinter. @@ -33,25 +33,97 @@ class CodePrinter(C99CodePrinter): Options for code printing. """ _default_settings = {'compiler': None, 'dtype': np.float32, - **C99CodePrinter._default_settings} + **CodePrinter._default_settings} + + _func_prefix = {} + _func_literals = {} + _prec_literals = {np.float32: 'F', np.complex64: 'F'} + + _qualifiers_mapper = { + 'is_const': 'const', + 'is_volatile': 'volatile', + '_mem_constant': 'static', + '_mem_shared': '', + } + + _restrict_keyword = 'restrict' + + _includes = [] + _namespaces = [] + _headers = [('_POSIX_C_SOURCE', '200809L')] @property def dtype(self): - return self._settings['dtype'] + try: + return self._settings['dtype'].nptype + except AttributeError: + return self._settings['dtype'] @property def compiler(self): - return self._settings['compiler'] + return self._settings['compiler'] or configuration['compiler'] + + def doprint(self, expr, assign_to=None): + """ + The sympy code printer does a lot of extra things we do not need + as we handle all of it in the compiler so we directly default to `_print`. 
+ """ + return self._print(expr) + + def _prec(self, expr): + dtype = sympy_dtype(expr, default=self.dtype) + if dtype is None or np.issubdtype(dtype, np.integer): + if any(isinstance(i, Float) for i in expr.atoms()): + try: + return np.promote_types(self.dtype, np.float32).type + except np.exceptions.DTypePromotionError: + # Corner cases, e.g. Void, cannot (shouldn't) be promoted + return self.dtype + else: + return dtype or self.dtype + else: + return dtype or self.dtype - def single_prec(self, expr=None): - dtype = sympy_dtype(expr) if expr is not None else self.dtype - return dtype in [np.float32, np.float16] + def prec_literal(self, expr): + return self._prec_literals.get(self._prec(expr), '') + + def func_literal(self, expr): + return self._func_literals.get(self._prec(expr), '') + + def func_prefix(self, expr, mfunc=False): + prefix = self._func_prefix.get(self._prec(expr), '') + if mfunc: + return prefix + elif prefix == 'f': + return '' + else: + return prefix def parenthesize(self, item, level, strict=False): if isinstance(item, BooleanFunction): - return "(%s)" % self._print(item) + return f"({self._print(item)})" return super().parenthesize(item, level, strict=strict) + def _print_PyCPointerType(self, expr): + ctype = f'{self._print_type(expr._type_)}' + if ctype.endswith('*'): + return f'{ctype}*' + else: + return f'{ctype} *' + + def _print_type(self, expr): + try: + expr = dtype_to_ctype(expr) + except TypeError: + pass + try: + return self.type_mappings[expr] + except KeyError: + return ctypes_to_cstr(expr) + + def _print_VoidDType(self, expr): + return ctypes_vector_mapper[expr].__name__ + def _print_Function(self, expr): if isinstance(expr, AbstractFunction): return str(expr) @@ -61,7 +133,7 @@ def _print_Function(self, expr): return super()._print_Function(expr) def _print_CondEq(self, expr): - return "%s == %s" % (self._print(expr.lhs), self._print(expr.rhs)) + return f"{self._print(expr.lhs)} == {self._print(expr.rhs)}" def _print_Indexed(self, 
expr): """ @@ -72,7 +144,7 @@ def _print_Indexed(self, expr): U[t,x,y,z] -> U[t][x][y][z] """ inds = ''.join(['[' + self._print(x) + ']' for x in expr.indices]) - return '%s%s' % (self._print(expr.base.label), inds) + return f'{self._print(expr.base.label)}{inds}' def _print_FIndexed(self, expr): """ @@ -87,7 +159,7 @@ def _print_FIndexed(self, expr): label = expr.accessor.label except AttributeError: label = expr.base.label - return '%s(%s)' % (self._print(label), inds) + return f'{self._print(label)}({inds})' def _print_Rational(self, expr): """Print a Rational as a C-like float/float division.""" @@ -96,10 +168,8 @@ def _print_Rational(self, expr): # to be 32-bit floats. # http://en.cppreference.com/w/cpp/language/floating_literal p, q = int(expr.p), int(expr.q) - if self.dtype == np.float64: - return '%d.0/%d.0' % (p, q) - else: - return '%d.0F/%d.0F' % (p, q) + prec = self.prec_literal(expr) + return f'{p}.0{prec}/{q}.0{prec}' def _print_math_func(self, expr, nest=False, known=None): cls = type(expr) @@ -113,8 +183,7 @@ def _print_math_func(self, expr, nest=False, known=None): if cname not in self._prec_funcs: return super()._print_math_func(expr, nest=nest, known=known) - if self.single_prec(expr): - cname = '%sf' % cname + cname = f'{self.func_prefix(expr)}{cname}{self.func_literal(expr)}' if nest and len(expr.args) > 2: args = ', '.join([self._print(expr.args[0]), @@ -122,7 +191,7 @@ def _print_math_func(self, expr, nest=False, known=None): else: args = ', '.join([self._print(arg) for arg in expr.args]) - return f'{cname}({args})' + return f'{self._ns}{cname}({args})' def _print_Pow(self, expr): # Completely reimplement `_print_Pow` from sympy, since it doesn't @@ -130,16 +199,17 @@ def _print_Pow(self, expr): if "Pow" in self.known_functions: return self._print_Function(expr) PREC = precedence(expr) - suffix = 'f' if self.single_prec(expr) else '' + suffix = self.func_literal(expr) + base = self._print(expr.base) if equal_valued(expr.exp, -1): return 
self._print_Float(Float(1.0)) + '/' + \ self.parenthesize(expr.base, PREC) elif equal_valued(expr.exp, 0.5): - return f'sqrt{suffix}({self._print(expr.base)})' + return f'{self._ns}sqrt{suffix}({base})' elif expr.exp == S.One/3 and self.standard != 'C89': - return f'cbrt{suffix}({self._print(expr.base)})' + return f'{self._ns}cbrt{suffix}({base})' else: - return f'pow{suffix}({self._print(expr.base)}, {self._print(expr.exp)})' + return f'{self._ns}pow{suffix}({base}, {self._print(expr.exp)})' def _print_SafeInv(self, expr): """Print a SafeInv as a C-like division with a check for zero.""" @@ -149,42 +219,57 @@ def _print_SafeInv(self, expr): def _print_Mod(self, expr): """Print a Mod as a C-like %-based operation.""" - args = ['(%s)' % self._print(a) for a in expr.args] + args = [f'({self._print(a)})' for a in expr.args] return '%'.join(args) def _print_Mul(self, expr): - term = super()._print_Mul(expr) - return term.replace("(-1)*", "-") + args = [a for a in expr.args if a != -1] + neg = (len(expr.args) - len(args)) % 2 + + if len(args) > 1: + term = super()._print_Mul(expr.func(*args, evaluate=False)) + else: + term = self.parenthesize(args[0], precedence(expr)) + + if neg: + return f'-{term}' + else: + return term + + def _print_fmath_func(self, name, expr): + args = ",".join([self._print(i) for i in expr.args]) + func = f'{self.func_prefix(expr, mfunc=True)}{name}{self.func_literal(expr)}' + return f"{self._ns}{func}({args})" def _print_Min(self, expr): - if has_integer_args(*expr.args) and len(expr.args) == 2: - return "MIN(%s)" % self._print(expr.args)[1:-1] + if len(expr.args) > 2: + return self._print_Min(expr.func(expr.args[0], + expr.func(*expr.args[1:]), + evaluate=False)) + elif has_integer_args(*expr.args) and len(expr.args) == 2: + return f"MIN({self._print(expr.args)[1:-1]})" else: - return super()._print_Min(expr) + return self._print_fmath_func('min', expr) def _print_Max(self, expr): - if has_integer_args(*expr.args) and len(expr.args) == 2: - 
return "MAX(%s)" % self._print(expr.args)[1:-1] + if len(expr.args) > 2: + return self._print_Max(expr.func(expr.args[0], + expr.func(*expr.args[1:]), + evaluate=False)) + elif has_integer_args(*expr.args) and len(expr.args) == 2: + return f"MAX({self._print(expr.args)[1:-1]})" else: - return super()._print_Max(expr) + return self._print_fmath_func('max', expr) def _print_Abs(self, expr): """Print an absolute value. Use `abs` if can infer it is an Integer""" + # Unary function, single argument + arg = expr.args[0] # AOMPCC errors with abs, always use fabs - if isinstance(self.compiler, AOMPCompiler): - return "fabs(%s)" % self._print(expr.args[0]) - # Check if argument is an integer - if has_integer_args(*expr.args[0].args): - func = "abs" - elif self.single_prec(expr): - func = "fabsf" - elif any([isinstance(a, Real) for a in expr.args[0].args]): - # The previous condition isn't sufficient to detect case with - # Python `float`s in that case, fall back to the "default" - func = "fabsf" if self.single_prec() else "fabs" - else: - func = "fabs" - return f"{func}({self._print(expr.args[0])})" + if isinstance(self.compiler, AOMPCompiler) and \ + not np.issubdtype(self._prec(expr), np.integer): + return f"fabs({self._print(arg)})" + return self._print_fmath_func('abs', expr) def _print_Add(self, expr, order=None): """" @@ -197,7 +282,7 @@ def _print_Add(self, expr, order=None): for term in terms: t = self._print(term) if precedence(term) < PREC: - l.extend(["+", "(%s)" % t]) + l.extend(["+", f"({t})"]) elif t.startswith('-'): l.extend(["-", t[1:]]) else: @@ -234,75 +319,77 @@ def _print_Float(self, expr): if 'e' not in rv: rv = rv.rstrip('0') + "0" - if self.single_prec(): - rv = '%sF' % rv - - return rv + return f'{rv}{self.prec_literal(expr)}' def _print_Differentiable(self, expr): - return "(%s)" % self._print(expr._expr) + return f"({self._print(expr._expr)})" - _print_EvalDerivative = C99CodePrinter._print_Add + _print_EvalDerivative = _print_Add def 
_print_CallFromPointer(self, expr): indices = [self._print(i) for i in expr.params] - return "%s->%s(%s)" % (expr.pointer, expr.call, ', '.join(indices)) + return f"{expr.pointer}->{expr.call}({', '.join(indices)})" def _print_CallFromComposite(self, expr): indices = [self._print(i) for i in expr.params] - return "%s.%s(%s)" % (expr.pointer, expr.call, ', '.join(indices)) + return f"{expr.pointer}.{expr.call}({', '.join(indices)})" def _print_FieldFromPointer(self, expr): - return "%s->%s" % (expr.pointer, expr.field) + return f"{expr.pointer}->{expr.field}" def _print_FieldFromComposite(self, expr): - return "%s.%s" % (expr.pointer, expr.field) + return f"{expr.pointer}.{expr.field}" def _print_ListInitializer(self, expr): - return "{%s}" % ', '.join([self._print(i) for i in expr.params]) + return f"{{{', '.join(self._print(i) for i in expr.params)}}}" def _print_IndexedPointer(self, expr): - return "%s%s" % (expr.base, ''.join('[%s]' % self._print(i) for i in expr.index)) + return f"{expr.base}{''.join(f'[{self._print(i)}]' for i in expr.index)}" def _print_IntDiv(self, expr): lhs = self._print(expr.lhs) if not expr.lhs.is_Atom: - lhs = '(%s)' % (lhs) + lhs = f"({lhs})" rhs = self._print(expr.rhs) PREC = precedence(expr) - return self.parenthesize("%s / %s" % (lhs, rhs), PREC) + return self.parenthesize(f"{lhs} / {rhs}", PREC) def _print_InlineIf(self, expr): cond = self._print(expr.cond) true_expr = self._print(expr.true_expr) false_expr = self._print(expr.false_expr) PREC = precedence(expr) - return self.parenthesize("(%s) ? %s : %s" % (cond, true_expr, false_expr), PREC) + return self.parenthesize(f"({cond}) ? 
{true_expr} : {false_expr}", PREC) - def _print_UnaryOp(self, expr): - if expr.base.is_Symbol: - return "%s%s" % (expr._op, self._print(expr.base)) - else: - return "%s(%s)" % (expr._op, self._print(expr.base)) + def _print_UnaryOp(self, expr, op=None, parenthesize=False): + op = op or expr._op + base = self._print(expr.base) + if not expr.base.is_Symbol or parenthesize: + base = f'({base})' + return f'{op}{base}' - def _print_ComponentAccess(self, expr): - return "%s.%s" % (self._print(expr.base), expr.sindex) + def _print_Cast(self, expr): + cast = f'({self._print(expr._C_ctype)}{self._print(expr.stars)})' + return self._print_UnaryOp(expr, op=cast) - def _print_TrigonometricFunction(self, expr): - func_name = str(expr.func) - if self.single_prec(): - func_name = '%sf' % func_name - return '%s(%s)' % (func_name, self._print(*expr.args)) + def _print_ComponentAccess(self, expr): + return f"{self._print(expr.base)}.{expr.sindex}" def _print_DefFunction(self, expr): arguments = [self._print(i) for i in expr.arguments] if expr.template: - template = '<%s>' % ','.join([str(i) for i in expr.template]) + ctemplate = ','.join([str(i) for i in expr.template]) + template = f'<{ctemplate}>' else: template = '' - return "%s%s(%s)" % (expr.name, template, ','.join(arguments)) + args = ','.join(arguments) + return f"{expr.name}{template}({args})" - _print_MathFunction = _print_DefFunction + def _print_SizeOf(self, expr): + return f'sizeof({self._print(expr.intype)}{self._print(expr.stars)})' + + def _print_MathFunction(self, expr): + return f"{self._ns}{self._print_DefFunction(expr)}" def _print_Fallback(self, expr): return expr.__str__() @@ -318,7 +405,7 @@ def _print_Fallback(self, expr): # Lifted from SymPy so that we go through our own `_print_math_func` for k in ('exp log sin cos tan ceiling floor').split(): - setattr(CodePrinter, '_print_%s' % k, CodePrinter._print_math_func) + setattr(BasePrinter, f'_print_{k}', BasePrinter._print_math_func) # Always parenthesize IntDiv 
and InlineIf within expressions @@ -326,7 +413,13 @@ def _print_Fallback(self, expr): PRECEDENCE_VALUES['InlineIf'] = 1 -def ccode(expr, **settings): +# Sympy 1.11 has introduced a bug in `_print_Add`, so we enforce here +# to always use the correct one from our printer +if Version(sympy.__version__) >= Version("1.11"): + setattr(sympy.printing.str.StrPrinter, '_print_Add', BasePrinter._print_Add) + + +def ccode(expr, printer=None, **settings): """Generate C++ code from an expression. Parameters @@ -342,10 +435,7 @@ def ccode(expr, **settings): The resulting code as a C++ string. If something went south, returns the input ``expr`` itself. """ - return CodePrinter(settings=settings).doprint(expr, None) - - -# Sympy 1.11 has introduced a bug in `_print_Add`, so we enforce here -# to always use the correct one from our printer -if Version(sympy.__version__) >= Version("1.11"): - setattr(sympy.printing.str.StrPrinter, '_print_Add', CodePrinter._print_Add) + if printer is None: + from devito.passes.iet.languages.C import CPrinter + printer = CPrinter + return printer(settings=settings).doprint(expr, None) diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index d45c9af939..60646066b7 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -1,6 +1,7 @@ """The Iteration/Expression Tree (IET) hierarchy.""" import abc +import ctypes import inspect from functools import cached_property from collections import OrderedDict, namedtuple @@ -10,11 +11,12 @@ from sympy import IndexedBase, sympify from devito.data import FULL +from devito.ir.cgen import ccode from devito.ir.equations import DummyEq, OpInc, OpMin, OpMax from devito.ir.support import (INBOUND, SEQUENTIAL, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, VECTORIZED, AFFINE, Property, Forward, WithLock, PrefetchUpdate, detect_io) -from devito.symbolics import ListInitializer, CallFromPointer, ccode +from devito.symbolics import ListInitializer, CallFromPointer from devito.tools import (Signer, 
as_tuple, filter_ordered, filter_sorted, flatten, ctypes_to_cstr) from devito.types.basic import (AbstractFunction, AbstractSymbol, Basic, Indexed, @@ -28,7 +30,7 @@ 'Increment', 'Return', 'While', 'ListMajor', 'ParallelIteration', 'ParallelBlock', 'Dereference', 'Lambda', 'SyncSpot', 'Pragma', 'DummyExpr', 'BlankLine', 'ParallelTree', 'BusyWait', 'UsingNamespace', - 'CallableBody', 'Transfer'] + 'Using', 'CallableBody', 'Transfer'] # First-class IET nodes @@ -62,12 +64,6 @@ class Node(Signer): appears in this list are treated as traversable fields. """ - _ccode_handler = None - """ - Customizable by subclasses, in particular Operator subclasses which define - backend-specific nodes and, as such, require node-specific handlers. - """ - def __new__(cls, *args, **kwargs): obj = super().__new__(cls) argnames, _, _, defaultvalues, _, _, _ = inspect.getfullargspec(cls.__init__) @@ -152,7 +148,7 @@ def writes(self): return () def _signature_items(self): - return (str(self.ccode),) + return (str(self),) class ExprStmt: @@ -1043,6 +1039,8 @@ class Dereference(ExprStmt, Node): * `pointer` is a PointerArray or TempFunction, and `pointee` is an Array. * `pointer` is an ArrayObject representing a pointer to a C struct, and `pointee` is a field in `pointer`. + * `pointer` is a Symbol with its _C_ctype deriving from ct._Pointer, and + `pointee` is a Symbol representing the dereferenced value.
""" is_Dereference = True @@ -1061,13 +1059,18 @@ def functions(self): @property def expr_symbols(self): - ret = [self.pointer.indexed] - if self.pointer.is_PointerArray or self.pointer.is_TempFunction: - ret.append(self.pointee.indexed) - ret.extend(flatten(i.free_symbols for i in self.pointee.symbolic_shape[1:])) + ret = [] + if self.pointer.is_Symbol: + assert issubclass(self.pointer._C_ctype, ctypes._Pointer), \ + "Scalar dereference must have a pointer ctype" + ret.extend([self.pointer._C_symbol, self.pointee._C_symbol]) + elif self.pointer.is_PointerArray or self.pointer.is_TempFunction: + ret.extend([self.pointer.indexed, self.pointee.indexed]) + ret.extend(flatten(i.free_symbols + for i in self.pointee.symbolic_shape[1:])) ret.extend(self.pointer.free_symbols) else: - ret.append(self.pointee._C_symbol) + ret.extend([self.pointer.indexed, self.pointee._C_symbol]) return tuple(filter_ordered(ret)) @property @@ -1214,6 +1217,19 @@ def periodic(self): return self._periodic +class Using(Node): + + """ + A C++ using directive. + """ + + def __init__(self, name): + self.name = name + + def __repr__(self): + return "<Using(%s)>" % self.name + + class UsingNamespace(Node): """ diff --git a/devito/ir/iet/utils.py b/devito/ir/iet/utils.py index a31c82973a..0ffe7c3d36 100644 --- a/devito/ir/iet/utils.py +++ b/devito/ir/iet/utils.py @@ -1,3 +1,5 @@ +import numpy as np + from devito.ir.iet import FindSections, FindSymbols from devito.symbolics import Keyword, Macro from devito.tools import filter_ordered @@ -166,3 +168,20 @@ def maybe_alias(obj, candidate): # the __rkwargs__ except for e.g. the name return False + + +def has_dtype(iet, dtype): + """ + Check if the given IET has at least one symbol with the given dtype or + dtype kind.
+ """ + for f in FindSymbols().visit(iet): + try: + # Check if the dtype matches exactly (dtype input) + # or matches the generic kind (dtype generic input) + if np.issubdtype(f.dtype, dtype) or f.dtype == dtype: + return True + except TypeError: + continue + else: + return False diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index 360d5253f5..3817c4e39d 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -18,11 +18,12 @@ Call, Lambda, BlankLine, Section, ListMajor) from devito.ir.support.space import Backward from devito.symbolics import (FieldFromComposite, FieldFromPointer, - ListInitializer, ccode, uxreplace) -from devito.tools import (GenericVisitor, as_tuple, ctypes_to_cstr, filter_ordered, + ListInitializer, uxreplace) +from devito.symbolics.extended_dtypes import NoDeclStruct +from devito.tools import (GenericVisitor, as_tuple, filter_ordered, filter_sorted, flatten, is_external_ctype, c_restrict_void_p, sorted_priority) -from devito.types.basic import AbstractFunction, Basic +from devito.types.basic import AbstractFunction, AbstractSymbol, Basic from devito.types import (ArrayObject, CompositeObject, Dimension, Pointer, IndexedData, DeviceMap) @@ -79,23 +80,24 @@ def indent(self): return ' ' * self._depth def visit_Node(self, o): - return self.indent + '<%s>' % o.__class__.__name__ + return self.indent + f'<{o.__class__.__name__}>' def visit_Generable(self, o): - body = ' %s' % str(o) if self.verbose else '' - return self.indent + '<C.%s%s>' % (o.__class__.__name__, body) + body = f" {str(o) if self.verbose else ''}" + return self.indent + f'<C.{o.__class__.__name__}{body}>' def visit_Callable(self, o): self._depth += 1 body = self._visit(o.children) self._depth -= 1 - return self.indent + '<Callable %s>\n%s' % (o.name, body) + return self.indent + f'<Callable {o.name}>\n{body}' def visit_CallableBody(self, o): self._depth += 1 body = [self._visit(o.init), self._visit(o.unpacks), self._visit(o.body)] self._depth -= 1 - return self.indent + "%s\n%s" % (o.__repr__(), '\n'.join([i for i
in body if i])) + cbody = '\n'.join([i for i in body if i]) + return self.indent + f"{o.__repr__()}\n{cbody}" def visit_list(self, o): return ('\n').join([self._visit(i) for i in o]) @@ -110,43 +112,49 @@ def visit_List(self, o): else: body = [self._visit(o.body)] self._depth -= 1 - return self.indent + "%s\n%s" % (o.__repr__(), '\n'.join(body)) + cbody = '\n'.join(body) + return self.indent + f"{o.__repr__()}\n{cbody}" def visit_TimedList(self, o): self._depth += 1 body = [self._visit(o.body)] self._depth -= 1 - return self.indent + "%s\n%s" % (o.__repr__(), '\n'.join(body)) + cbody = '\n'.join(body) + return self.indent + f"{o.__repr__()}\n{cbody}" def visit_Iteration(self, o): self._depth += 1 body = self._visit(o.children) self._depth -= 1 if self.verbose: - detail = '::%s::%s' % (o.index, o.limits) + detail = f'::{o.index}::{o.limits}' props = [str(i) for i in o.properties] - props = '[%s] ' % ','.join(props) if props else '' + if props: + cprops = ','.join(props) + props = f'[{cprops}] ' + else: + props = '' else: detail, props = '', '' - return self.indent + "<%sIteration %s%s>\n%s" % (props, o.dim.name, detail, body) + return self.indent + f"<{props}Iteration {o.dim.name}{detail}>\n{body}" def visit_While(self, o): self._depth += 1 body = self._visit(o.children) self._depth -= 1 - return self.indent + "<While %s>\n%s" % (o.condition, body) + return self.indent + f"<While {o.condition}>\n{body}" def visit_Expression(self, o): if self.verbose: - body = "%s = %s" % (o.expr.lhs, o.expr.rhs) - return self.indent + "<Expression %s>" % body + body = f"{o.expr.lhs} = {o.expr.rhs}" + return self.indent + f"<Expression {body}>" else: return self.indent + str(o) def visit_AugmentedExpression(self, o): if self.verbose: - body = "%s %s= %s" % (o.expr.lhs, o.op, o.expr.rhs) - return self.indent + "<%s %s>" % (o.__class__.__name__, body) + body = f"{o.expr.lhs} {o.op}= {o.expr.rhs}" + return self.indent + f"<{o.__class__.__name__} {body}>" else: return self.indent + str(o) @@ -154,7 +162,7 @@ def visit_HaloSpot(self, o): self._depth +=
1 body = self._visit(o.children) self._depth -= 1 - return self.indent + "%s\n%s" % (o.__repr__(), body) + return self.indent + f"{o.__repr__()}\n{body}" def visit_Conditional(self, o): self._depth += 1 @@ -162,10 +170,9 @@ def visit_Conditional(self, o): self._depth -= 1 if o.else_body: else_body = self._visit(o.else_body) - return self.indent + "<If %s>\n%s\n<Else>\n%s" % (o.condition, - then_body, else_body) + return self.indent + f"<If {o.condition}>\n{then_body}\n<Else>\n{else_body}" else: - return self.indent + "<If %s>\n%s" % (o.condition, then_body) + return self.indent + f"<If {o.condition}>\n{then_body}" class CGen(Visitor): @@ -174,19 +181,23 @@ class CGen(Visitor): Return a representation of the Iteration/Expression tree as a :module:`cgen` tree. """ - def __init__(self, *args, compiler=None, **kwargs): + def __init__(self, *args, printer=None, **kwargs): super().__init__(*args, **kwargs) - self._compiler = compiler - - # The following mappers may be customized by subclasses (that is, - # backend-specific CGen-erators) - _qualifiers_mapper = { - 'is_const': 'const', - 'is_volatile': 'volatile', - '_mem_constant': 'static', - '_mem_shared': '', - } - _restrict_keyword = 'restrict' + if printer is None: + from devito.passes.iet.languages.C import CPrinter + printer = CPrinter + self.printer = printer + + def ccode(self, expr, **kwargs): + return self.printer(settings=kwargs).doprint(expr, None) + + @property + def _qualifiers_mapper(self): + return self.printer._qualifiers_mapper + + @property + def _restrict_keyword(self): + return self.printer._restrict_keyword def _gen_struct_decl(self, obj, masked=()): """ @@ -197,7 +208,8 @@ def _gen_struct_decl(self, obj, masked=()): while issubclass(ctype, ctypes._Pointer): ctype = ctype._type_ - if not issubclass(ctype, ctypes.Structure): + if not issubclass(ctype, ctypes.Structure) or \ + issubclass(ctype, NoDeclStruct): return None except TypeError: # E.g., `ctype` is of type `dtypes_lowering.CustomDtype` @@ -221,9 +233,9 @@ def _gen_struct_decl(self, obj, masked=()): try:
entries.append(self._gen_value(i, 0, masked=('const',))) except AttributeError: - cstr = ctypes_to_cstr(ct) + cstr = self.ccode(ct) if ct is c_restrict_void_p: - cstr = '%srestrict' % cstr + cstr = f'{cstr}{self._restrict_keyword}' entries.append(c.Value(cstr, n)) return c.Struct(ctype.__name__, entries) @@ -243,24 +255,24 @@ def _gen_value(self, obj, mode=1, masked=()): if getattr(obj.function, k, False) and v not in masked] if (obj._mem_stack or obj._mem_constant) and mode == 1: - strtype = obj._C_typedata - strshape = ''.join('[%s]' % ccode(i) for i in obj.symbolic_shape) + strtype = self.ccode(obj._C_typedata) + strshape = ''.join(f'[{self.ccode(i)}]' for i in obj.symbolic_shape) else: - strtype = ctypes_to_cstr(obj._C_ctype) + strtype = self.ccode(obj._C_ctype) strshape = '' if isinstance(obj, (AbstractFunction, IndexedData)) and mode >= 1: if not obj._mem_stack: - strtype = '%s%s' % (strtype, self._restrict_keyword) + strtype = f'{strtype}{self._restrict_keyword}' strtype = ' '.join(qualifiers + [strtype]) if obj.is_LocalObject and obj._C_modifier is not None and mode == 2: strtype += obj._C_modifier strname = obj._C_name - strobj = '%s%s' % (strname, strshape) + strobj = f'{strname}{strshape}' if obj.is_LocalObject and obj.cargs and mode == 1: - arguments = [ccode(i) for i in obj.cargs] + arguments = [self.ccode(i) for i in obj.cargs] strobj = MultilineCall(strobj, arguments, True) value = c.Value(strtype, strobj) @@ -274,9 +286,9 @@ def _gen_value(self, obj, mode=1, masked=()): if obj.is_Array and obj.initvalue is not None and mode == 1: init = ListInitializer(obj.initvalue) if not obj._mem_constant or init.is_numeric: - value = c.Initializer(value, ccode(init)) + value = c.Initializer(value, self.ccode(init)) elif obj.is_LocalObject and obj.initvalue is not None and mode == 1: - value = c.Initializer(value, ccode(obj.initvalue)) + value = c.Initializer(value, self.ccode(obj.initvalue)) return value @@ -310,7 +322,7 @@ def _args_call(self, args): else: 
ret.append(i._C_name) except AttributeError: - ret.append(ccode(i)) + ret.append(self.ccode(i)) return ret def _gen_signature(self, o, is_declaration=False): @@ -387,19 +399,20 @@ def visit_tuple(self, o): def visit_PointerCast(self, o): f = o.function i = f.indexed + cstr = self.ccode(i._C_typedata) if f.is_PointerArray: # lvalue - lvalue = c.Value(i._C_typedata, '**%s' % f.name) + lvalue = c.Value(cstr, f'**{f.name}') # rvalue if isinstance(o.obj, ArrayObject): - v = '%s->%s' % (o.obj.name, f._C_name) + v = f'{o.obj.name}->{f._C_name}' elif isinstance(o.obj, IndexedData): v = f._C_name else: assert False - rvalue = '(%s**) %s' % (i._C_typedata, v) + rvalue = f'({cstr}**) {v}' else: # lvalue @@ -408,12 +421,12 @@ def visit_PointerCast(self, o): else: v = f.name if o.flat is None: - shape = ''.join("[%s]" % ccode(i) for i in o.castshape) - rshape = '(*)%s' % shape - lvalue = c.Value(i._C_typedata, '(*restrict %s)%s' % (v, shape)) + shape = ''.join(f"[{self.ccode(i)}]" for i in o.castshape) + rshape = f'(*){shape}' + lvalue = c.Value(cstr, f'(*{self._restrict_keyword} {v}){shape}') else: rshape = '*' - lvalue = c.Value(i._C_typedata, '*%s' % v) + lvalue = c.Value(cstr, f'*{v}') if o.alignment and f._data_alignment: lvalue = c.AlignedAttribute(f._data_alignment, lvalue) @@ -426,14 +439,14 @@ def visit_PointerCast(self, o): else: assert False - rvalue = '(%s %s) %s->%s' % (i._C_typedata, rshape, f._C_name, v) + rvalue = f'({cstr} {rshape}) {f._C_name}->{v}' else: if isinstance(o.obj, Pointer): v = o.obj.name else: v = f._C_name - rvalue = '(%s %s) %s' % (i._C_typedata, rshape, v) + rvalue = f'({cstr} {rshape}) {v}' return c.Initializer(lvalue, rvalue) @@ -441,19 +454,21 @@ def visit_Dereference(self, o): a0, a1 = o.functions if a1.is_PointerArray or a1.is_TempFunction: i = a1.indexed + cstr = self.ccode(i._C_typedata) if o.flat is None: - shape = ''.join("[%s]" % ccode(i) for i in a0.symbolic_shape[1:]) - rvalue = '(%s (*)%s) %s[%s]' % (i._C_typedata, shape, a1.name, 
- a1.dim.name) - lvalue = c.Value(i._C_typedata, - '(*restrict %s)%s' % (a0.name, shape)) + shape = ''.join(f"[{self.ccode(i)}]" for i in a0.symbolic_shape[1:]) + rvalue = f'({cstr} (*){shape}) {a1.name}[{a1.dim.name}]' + lvalue = c.Value(cstr, f'(*{self._restrict_keyword} {a0.name}){shape}') else: - rvalue = '(%s *) %s[%s]' % (i._C_typedata, a1.name, a1.dim.name) - lvalue = c.Value(i._C_typedata, '*restrict %s' % a0.name) + rvalue = f'({cstr} *) {a1.name}[{a1.dim.name}]' + lvalue = c.Value(cstr, f'*{self._restrict_keyword} {a0.name}') if a0._data_alignment: lvalue = c.AlignedAttribute(a0._data_alignment, lvalue) else: - rvalue = '%s->%s' % (a1.name, a0._C_name) + if a1.is_Symbol: + rvalue = f'*{a1.name}' + else: + rvalue = f'{a1.name}->{a0._C_name}' lvalue = self._gen_value(a0, 0) return c.Initializer(lvalue, rvalue) @@ -475,15 +490,15 @@ def visit_Break(self, o): def visit_Return(self, o): v = 'return' if o.value is not None: - v += ' %s' % o.value + v += f' {self.ccode(o.value)}' return c.Statement(v) def visit_Definition(self, o): return self._gen_value(o.function) def visit_Expression(self, o): - lhs = ccode(o.expr.lhs, dtype=o.dtype, compiler=self._compiler) - rhs = ccode(o.expr.rhs, dtype=o.dtype, compiler=self._compiler) + lhs = self.ccode(o.expr.lhs, dtype=o.dtype) + rhs = self.ccode(o.expr.rhs, dtype=o.dtype) if o.init: code = c.Initializer(self._gen_value(o.expr.lhs, 0), rhs) @@ -496,9 +511,9 @@ def visit_Expression(self, o): return code def visit_AugmentedExpression(self, o): - c_lhs = ccode(o.expr.lhs, dtype=o.dtype, compiler=self._compiler) - c_rhs = ccode(o.expr.rhs, dtype=o.dtype, compiler=self._compiler) - code = c.Statement("%s %s= %s" % (c_lhs, o.op, c_rhs)) + c_lhs = self.ccode(o.expr.lhs, dtype=o.dtype) + c_rhs = self.ccode(o.expr.rhs, dtype=o.dtype) + code = c.Statement(f"{c_lhs} {o.op}= {c_rhs}") if o.pragmas: code = c.Module(self._visit(o.pragmas) + (code,)) return code @@ -516,7 +531,7 @@ def visit_Call(self, o, nested_call=False): 
o.templates) if retobj.is_Indexed or \ isinstance(retobj, (FieldFromComposite, FieldFromPointer)): - return c.Assign(ccode(retobj), call) + return c.Assign(self.ccode(retobj), call) else: return c.Initializer(c.Value(rettype, retobj._C_name), call) @@ -530,9 +545,9 @@ def visit_Conditional(self, o): then_body = c.Block(self._visit(then_body)) if else_body: else_body = c.Block(self._visit(else_body)) - return c.If(ccode(o.condition), then_body, else_body) + return c.If(self.ccode(o.condition), then_body, else_body) else: - return c.If(ccode(o.condition), then_body) + return c.If(self.ccode(o.condition), then_body) def visit_Iteration(self, o): body = flatten(self._visit(i) for i in self._blankline_logic(o.children)) @@ -542,23 +557,23 @@ def visit_Iteration(self, o): # For backward direction flip loop bounds if o.direction == Backward: - loop_init = 'int %s = %s' % (o.index, ccode(_max)) - loop_cond = '%s >= %s' % (o.index, ccode(_min)) - loop_inc = '%s -= %s' % (o.index, o.limits[2]) + loop_init = f'int {o.index} = {self.ccode(_max)}' + loop_cond = f'{o.index} >= {self.ccode(_min)}' + loop_inc = f'{o.index} -= {o.limits[2]}' else: - loop_init = 'int %s = %s' % (o.index, ccode(_min)) - loop_cond = '%s <= %s' % (o.index, ccode(_max)) - loop_inc = '%s += %s' % (o.index, o.limits[2]) + loop_init = f'int {o.index} = {self.ccode(_min)}' + loop_cond = f'{o.index} <= {self.ccode(_max)}' + loop_inc = f'{o.index} += {o.limits[2]}' # Append unbounded indices, if any if o.uindices: - uinit = ['%s = %s' % (i.name, ccode(i.symbolic_min)) for i in o.uindices] + uinit = [f'{i.name} = {self.ccode(i.symbolic_min)}' for i in o.uindices] loop_init = c.Line(', '.join([loop_init] + uinit)) ustep = [] for i in o.uindices: op = '=' if i.is_Modulo else '+=' - ustep.append('%s %s %s' % (i.name, op, ccode(i.symbolic_incr))) + ustep.append(f'{i.name} {op} {self.ccode(i.symbolic_incr)}') loop_inc = c.Line(', '.join([loop_inc] + ustep)) # Create For header+body @@ -575,13 +590,13 @@ def 
visit_Pragma(self, o): return c.Pragma(o._generate) def visit_While(self, o): - condition = ccode(o.condition) + condition = self.ccode(o.condition) if o.body: body = flatten(self._visit(i) for i in o.children) return c.While(condition, c.Block(body)) else: # Hack: cgen doesn't support body-less while-loops, i.e. `while(...);` - return c.Statement('while(%s)' % condition) + return c.Statement(f'while({condition})') def visit_Callable(self, o): body = flatten(self._visit(i) for i in o.children) @@ -600,8 +615,11 @@ def visit_MultiTraversable(self, o): body.extend(as_tuple(v)) return c.Collection(body) + def visit_Using(self, o): + return c.Statement(f'using {str(o.name)}') + def visit_UsingNamespace(self, o): - return c.Statement('using namespace %s' % ccode(o.namespace)) + return c.Statement(f'using namespace {str(o.namespace)}') def visit_Lambda(self, o): body = [] @@ -611,17 +629,19 @@ def visit_Lambda(self, o): if body: body.append(c.Line()) body.extend(as_tuple(v)) + captures = [str(i) for i in o.captures] decls = [i.inline() for i in self._args_decl(o.parameters)] + extra = [] if o.special: extra.append(' ') extra.append(' '.join(str(i) for i in o.special)) if o.attributes: extra.append(' ') - extra.append(' '.join('[[%s]]' % i for i in o.attributes)) - top = c.Line('[%s](%s)%s' % - (', '.join(captures), ', '.join(decls), ''.join(extra))) + extra.append(' '.join(f'[[{i}]]' for i in o.attributes)) + + top = c.Line(f"[{', '.join(captures)}]({', '.join(decls)}){''.join(extra)}") return LambdaCollection([top, c.Block(body)]) def visit_HaloSpot(self, o): @@ -630,7 +650,7 @@ def visit_HaloSpot(self, o): def visit_KernelLaunch(self, o): if o.templates: - templates = '<%s>' % ','.join([str(i) for i in o.templates]) + templates = f"<{','.join([str(i) for i in o.templates])}>" else: templates = '' @@ -644,14 +664,34 @@ def visit_KernelLaunch(self, o): arguments = self._args_call(o.arguments) arguments = ','.join(arguments) - return c.Statement('%s%s<<<%s>>>(%s)' - % 
(o.name, templates, launch_config, arguments)) + return c.Statement(f'{o.name}{templates}<<<{launch_config}>>>({arguments})') # Operator-handle machinery def _operator_includes(self, o): + """ + Generate cgen includes from an iterable of symbols and expressions. + """ return [c.Include(i, system=(False if i.endswith('.h') else True)) - for i in o._includes] + for i in o.includes] + [blankline] + + def _operator_namespaces(self, o): + """ + Generate cgen namespaces from an iterable of symbols and expressions. + """ + namespaces = [self._visit(i) for i in o.namespaces] + if namespaces: + namespaces.append(blankline) + return namespaces + + def _operator_headers(self, o): + """ + Generate cgen headers from an iterable of symbols and expressions. + """ + headers = [c.Define(*as_tuple(i)) for i in o.headers] + if headers: + headers.append(blankline) + return headers def _operator_typedecls(self, o, mode='all'): xfilter0 = lambda i: self._gen_struct_decl(i) is not None @@ -709,15 +749,13 @@ def visit_Operator(self, o, mode='all'): efuncs.extend([self._visit(i), blankline]) # Definitions - headers = [c.Define(*i) for i in o._headers] + [blankline] + headers = self._operator_headers(o) # Header files - includes = self._operator_includes(o) + [blankline] + includes = self._operator_includes(o) # Namespaces - namespaces = [self._visit(i) for i in o._namespaces] - if namespaces: - namespaces.append(blankline) + namespaces = self._operator_namespaces(o) # Type declarations typedecls = self._operator_typedecls(o, mode) @@ -738,7 +776,7 @@ class CInterface(CGen): def _operator_includes(self, o): includes = super()._operator_includes(o) - includes.append(c.Include("%s.h" % o.name, system=False)) + includes.append(c.Include(f"{o.name}.h", system=False)) return includes @@ -750,7 +788,7 @@ def visit_Operator(self, o): typedecls = self._operator_typedecls(o, mode='public') guarded_typedecls = [] for i in typedecls: - guard = "DEVITO_%s" % i.tpname.upper() + guard = 
f"DEVITO_{i.tpname.upper()}" iflines = [c.Define(guard, ""), blankline, i, blankline] guarded_typedecl = c.IfNDef(guard, iflines, []) guarded_typedecls.extend([guarded_typedecl, blankline]) @@ -956,6 +994,7 @@ def default_retval(cls): Drive the search. Accepted: - `symbolics`: Collect all AbstractFunction objects, default - `basics`: Collect all Basic objects + - `symbols`: Collect all AbstractSymbol objects - `dimensions`: Collect all Dimensions - `indexeds`: Collect all Indexed objects - `indexedbases`: Collect all IndexedBase objects @@ -976,6 +1015,8 @@ def _defines_aliases(n): rules = { 'symbolics': lambda n: n.functions, 'basics': lambda n: [i for i in n.expr_symbols if isinstance(i, Basic)], + 'symbols': lambda n: [i for i in n.expr_symbols + if isinstance(i, AbstractSymbol)], 'dimensions': lambda n: [i for i in n.expr_symbols if isinstance(i, Dimension)], 'indexeds': lambda n: [i for i in n.expr_symbols if i.is_Indexed], 'indexedbases': lambda n: [i for i in n.expr_symbols @@ -1376,13 +1417,14 @@ def __init__(self, name, arguments, is_expr=False, is_indirect=False, def generate(self): if self.templates: - tip = "%s<%s>" % (self.name, ", ".join(str(i) for i in self.templates)) + tip = f"{self.name}<{', '.join(str(i) for i in self.templates)}>" else: tip = self.name if not self.is_indirect: - tip = "%s(" % tip + tip = f"{tip}(" else: - tip = "%s%s" % (tip, ',' if self.arguments else '') + cargs = ',' if self.arguments else '' + tip = f"{tip}{cargs}" processed = [] for i in self.arguments: if isinstance(i, (MultilineCall, LambdaCollection)): @@ -1404,7 +1446,7 @@ def generate(self): if not self.is_expr: tip += ";" if self.cast: - tip = '(%s)%s' % (self.cast, tip) + tip = f'({self.cast}){tip}' yield tip diff --git a/devito/mpi/routines.py b/devito/mpi/routines.py index 8b4987c8bb..745a30df2a 100644 --- a/devito/mpi/routines.py +++ b/devito/mpi/routines.py @@ -15,8 +15,8 @@ Transformer, ElementalCall, CommCallable) from devito.mpi import MPI from
devito.symbolics import (Byref, CondNe, FieldFromPointer, FieldFromComposite, - IndexedPointer, Macro, cast_mapper, subs_op_args) -from devito.tools import (as_mapper, dtype_to_mpitype, dtype_len, dtype_to_ctype, + IndexedPointer, Macro, cast, subs_op_args) +from devito.tools import (as_mapper, dtype_to_mpitype, dtype_len, infer_datasize, flatten, generator, is_integer, split) from devito.types import (Array, Bag, Dimension, Eq, Symbol, LocalObject, CompositeObject, CustomDimension) @@ -605,7 +605,7 @@ def _make_msg(self, f, hse, key): return MPIMsg('msg%d' % key, f, halos) def _make_sendrecv(self, f, hse, key, msg=None): - cast = cast_mapper[(f.c0.dtype, '*')] + fcast = cast(f.c0.dtype, '*') comm = f.grid.distributor._obj_comm bufg = FieldFromPointer(msg._C_field_bufg, msg) @@ -619,7 +619,7 @@ def _make_sendrecv(self, f, hse, key, msg=None): sizes = [FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg) for i in range(len(f._dist_dimensions))] - arguments = [cast(bufg)] + sizes + list(f.handles) + ofsg + arguments = [fcast(bufg)] + sizes + list(f.handles) + ofsg gather = Gather('gather%s' % key, arguments) # The `gather` is unnecessary if sending to MPI.PROC_NULL gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather) @@ -671,7 +671,7 @@ def _call_compute(self, hs, compute, *args): return compute.make_call(dynamic_args_mapper=hs.omapper.core) def _make_wait(self, f, hse, key, msg=None): - cast = cast_mapper[(f.c0.dtype, '*')] + fcast = cast(f.c0.dtype, '*') bufs = FieldFromPointer(msg._C_field_bufs, msg) @@ -681,7 +681,7 @@ def _make_wait(self, f, hse, key, msg=None): sizes = [FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg) for i in range(len(f._dist_dimensions))] - arguments = [cast(bufs)] + sizes + list(f.handles) + ofss + arguments = [fcast(bufs)] + sizes + list(f.handles) + ofss scatter = Scatter('scatter%s' % key, arguments) # The `scatter` must be guarded as we must not alter the halo values along @@ -772,7 +772,7 @@ def 
_call_sendrecv(self, *args): return def _make_haloupdate(self, f, hse, key, *args, msg=None): - cast = cast_mapper[(f.c0.dtype, '*')] + fcast = cast(f.c0.dtype, '*') comm = f.grid.distributor._obj_comm fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices} @@ -794,7 +794,7 @@ def _make_haloupdate(self, f, hse, key, *args, msg=None): ofsg = [fixed.get(d) or ofsg.pop(0) for d in f.dimensions] # The `gather` is unnecessary if sending to MPI.PROC_NULL - arguments = [cast(bufg)] + sizes + list(f.handles) + ofsg + arguments = [fcast(bufg)] + sizes + list(f.handles) + ofsg gather = Gather('gather%s' % key, arguments) gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather) @@ -819,7 +819,7 @@ def _call_haloupdate(self, name, f, hse, msg): return HaloUpdateCall(name, args) def _make_halowait(self, f, hse, key, *args, msg=None): - cast = cast_mapper[(f.c0.dtype, '*')] + fcast = cast(f.c0.dtype, '*') fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices} @@ -839,7 +839,7 @@ def _make_halowait(self, f, hse, key, *args, msg=None): # The `scatter` must be guarded as we must not alter the halo values along # the domain boundary, where the sender is actually MPI.PROC_NULL - arguments = [cast(bufs)] + sizes + list(f.handles) + ofss + arguments = [fcast(bufs)] + sizes + list(f.handles) + ofss scatter = Scatter('scatter%s' % key, arguments) scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter) @@ -1204,8 +1204,8 @@ def _arg_defaults(self, allocator, alias, args=None): entry.sizes = (c_int*len(shape))(*shape) # Allocate the send/recv buffers - size = reduce(mul, shape)*dtype_len(self.target.dtype) - ctype = dtype_to_ctype(f.dtype) + ctype, datasize = infer_datasize(f.dtype, shape) + size = datasize * dtype_len(self.target.dtype) entry.bufg, bufg_memfree_args = allocator._alloc_C_libcall(size, ctype) entry.bufs, bufs_memfree_args = allocator._alloc_C_libcall(size, ctype) diff --git a/devito/operator/operator.py 
b/devito/operator/operator.py index 57b29e90de..0a9f9db62f 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -27,6 +27,7 @@ from devito.passes import (Graph, lower_index_derivatives, generate_implicit, generate_macros, minimize_symbols, unevaluate, error_mapper, is_on_device) +from devito.passes.iet.dtypes import lower_dtypes from devito.symbolics import estimate_cost, subs_op_args from devito.tools import (DAG, OrderedSet, Signer, ReducerMap, as_mapper, as_tuple, flatten, filter_sorted, frozendict, is_integer, @@ -143,11 +144,6 @@ class Operator(Callable): refer to the relevant documentation. """ - _default_headers = [('_POSIX_C_SOURCE', '200809L')] - _default_includes = ['stdlib.h', 'math.h', 'sys/time.h'] - _default_globals = [] - _default_namespaces = [] - def __new__(cls, expressions, **kwargs): if expressions is None: # Return a dummy Callable. This is exploited by unpickling. Users @@ -207,15 +203,11 @@ def _build(cls, expressions, **kwargs): Callable.__init__(op, **op.args) # Header files, etc. 
- op._headers = OrderedSet(*cls._default_headers) - op._headers.update(byproduct.headers) - op._globals = OrderedSet(*cls._default_globals) - op._globals.update(byproduct.globals) - op._includes = OrderedSet(*cls._default_includes) - op._includes.update(profiler._default_includes) + op._headers = OrderedSet(*byproduct.headers) + op._globals = OrderedSet(*byproduct.globals) + op._includes = OrderedSet(*profiler._default_includes) op._includes.update(byproduct.includes) - op._namespaces = OrderedSet(*cls._default_namespaces) - op._namespaces.update(byproduct.namespaces) + op._namespaces = OrderedSet(*byproduct.namespaces) # Required for the jit-compilation op._compiler = kwargs['compiler'] @@ -268,6 +260,9 @@ def _lower(cls, expressions, **kwargs): """ # Create a symbol registry kwargs.setdefault('sregistry', SymbolRegistry()) + # Add lang-base kwargs + kwargs.setdefault('langbb', cls._Target.langbb()) + kwargs.setdefault('printer', cls._Target.Printer) expressions = as_tuple(expressions) @@ -493,6 +488,9 @@ def _lower_iet(cls, uiet, profiler=None, **kwargs): # Extract the necessary macros from the symbolic objects generate_macros(graph, **kwargs) + # Add type specific metadata + lower_dtypes(graph, **kwargs) + # Target-independent optimizations minimize_symbols(graph) @@ -766,13 +764,26 @@ def _soname(self): """A unique name for the shared object resulting from JIT compilation.""" return Signer._digest(self, configuration) + @cached_property + def _printer(self): + return self._Target.Printer + + @cached_property + def headers(self): + return OrderedSet(*self._printer._headers).union(self._headers) + + @cached_property + def includes(self): + return OrderedSet(*self._printer._includes).union(self._includes) + + @cached_property + def namespaces(self): + return OrderedSet(*self._printer._namespaces).union(self._namespaces) + @cached_property def ccode(self): - try: - return self._ccode_handler(compiler=self._compiler).visit(self) - except (AttributeError, TypeError): 
- from devito.ir.iet.visitors import CGen - return CGen(compiler=self._compiler).visit(self) + from devito.ir.iet.visitors import CGen + return CGen(printer=self._printer).visit(self) def _jit_compile(self): """ @@ -787,11 +798,11 @@ def _jit_compile(self): elapsed = self._profiler.py_timers['jit-compile'] if recompiled: - perf("Operator `%s` jit-compiled `%s` in %.2f s with `%s`" % - (self.name, src_file, elapsed, self._compiler)) + perf(f"Operator `{self.name}` jit-compiled `{src_file}` in " + f"{elapsed:.2f} s with `{self._compiler}`") else: - perf("Operator `%s` fetched `%s` in %.2f s from jit-cache" % - (self.name, src_file, elapsed)) + perf(f"Operator `{self.name}` fetched `{src_file}` in " + f"{elapsed:.2f} s from jit-cache") @property def cfunction(self): @@ -825,7 +836,7 @@ def cinterface(self, force=False): dest = self._compiler.get_jit_dir() name = dest.joinpath(self.name) - cfile = name.with_suffix(".%s" % self._compiler.src_ext) + cfile = name.with_suffix(f".{self._compiler.src_ext}") hfile = name.with_suffix('.h') # Generate the .c and .h code @@ -833,11 +844,11 @@ def cinterface(self, force=False): for f, code in [(cfile, ccode), (hfile, hcode)]: if not force and f.is_file(): - debug("`%s` was not saved in `%s` as it already exists" % (f.name, dest)) + debug(f"`{f.name}` was not saved in `{dest}` as it already exists") else: with open(str(f), 'w') as ff: ff.write(str(code)) - debug("`%s` successfully saved in `%s`" % (f.name, dest)) + debug(f"`{f.name}` successfully saved in `{dest}`") return ccode, hcode @@ -954,7 +965,7 @@ def _emit_build_profiling(self): timings = self._profiler.py_timers.copy() tot = timings.pop('op-compile') - perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot))) + perf(f"Operator `{self.name}` generated in {fround(tot):.2f} s") max_hotspots = 3 threshold = 20. 
@@ -966,14 +977,14 @@ def _emit_timings(timings, indent=''): v = fround(timings[i]['total']) perc = fround(v/tot*100, n=10) if perc > threshold: - perf("%s%s: %.2f s (%.1f %%)" % (indent, i.lstrip('_'), v, perc)) + perf(f"{indent}{i.lstrip('_')}: {v:.2f} s ({perc:.1f} %)") _emit_timings(timings[i], ' '*len(indent) + ' * ') _emit_timings(timings, ' * ') if self._profiler._ops: ops = ['%d --> %d' % i for i in self._profiler._ops] - perf("Flops reduction after symbolic optimization: [%s]" % ' ; '.join(ops)) + perf(f"Flops reduction after symbolic optimization: [{' ; '.join(ops)}]") def _emit_apply_profiling(self, args): """Produce a performance summary of the profiled sections.""" @@ -981,7 +992,7 @@ def _emit_apply_profiling(self, args): fround = lambda i: ceil(i * 100) / 100 elapsed = fround(self._profiler.py_timers['apply']) - info("Operator `%s` ran in %.2f s" % (self.name, elapsed)) + info(f"Operator `{self.name}` ran in {elapsed:.2f} s") summary = self._profiler.summary(args, self._dtype, reduce_over=elapsed) @@ -999,16 +1010,16 @@ def _emit_apply_profiling(self, args): v = summary.globals.get('vanilla') if v is not None: if v.oi is not None: - metrics.append("OI=%.2f" % fround(v.oi)) + metrics.append(f"OI={fround(v.oi):.2f}") if v.gflopss is not None and np.isfinite(v.gflopss): - metrics.append("%.2f GFlops/s" % fround(v.gflopss)) + metrics.append(f"{fround(v.gflopss):.2f} GFlops/s") v = summary.globals.get('fdlike') if v is not None: - metrics.append("%.2f GPts/s" % fround(v.gpointss)) + metrics.append(f"{fround(v.gpointss):.2f} GPts/s") if metrics: - perf("Global performance: [%s]" % ', '.join(metrics)) + perf(f"Global performance: [{', '.join(metrics)}]") # Same as above, but excluding the setup phase, e.g. the CPU-GPU # data transfers in the case of a GPU run, mallocs, frees, etc. 
@@ -1016,10 +1027,10 @@ def _emit_apply_profiling(self, args): v = summary.globals.get('fdlike-nosetup') if v is not None: - metrics.append("%.2f s" % fround(v.time)) - metrics.append("%.2f GPts/s" % fround(v.gpointss)) + metrics.append(f"{fround(v.time):.2f} s") + metrics.append(f"{fround(v.gpointss):.2f} GPts/s") - perf("Global performance : [%s]" % ', '.join(metrics)) + perf(f"Global performance : [{', '.join(metrics)}]") # Prepare for the local performance indicators perf("Local performance:") @@ -1032,34 +1043,33 @@ def _emit_apply_profiling(self, args): def lower_perfentry(v): values = [] if v.oi: - values.append("OI=%.2f" % fround(v.oi)) + values.append(f"OI={fround(v.oi):.2f}") if v.gflopss: - values.append("%.2f GFlops/s" % fround(v.gflopss)) + values.append(f"{fround(v.gflopss):.2f} GFlops/s") if v.gpointss: - values.append("%.2f GPts/s" % fround(v.gpointss)) + values.append(f"{fround(v.gpointss):.2f} GPts/s") if values: - return "[%s]" % ", ".join(values) + return f"[{', '.join(values)}]" else: return "" for k, v in summary.items(): - rank = "[rank%d]" % k.rank if k.rank is not None else "" - name = "%s%s" % (k.name, rank) + rank = f"[rank{k.rank if k.rank is not None else ''}]" + name = f"{k.name}{rank}" if v.time <= 0.01: # Trim down the output for very fast sections - perf("%s* %s ran in %.2f s" % (indent, name, fround(v.time))) + perf(f"{indent}* {name} ran in {fround(v.time):.2f} s") continue metrics = lower_perfentry(v) - perf("%s* %s ran in %.2f s %s" % (indent, name, fround(v.time), metrics)) + perf(f"{indent}* {name} ran in {fround(v.time):.2f} s {metrics}") for n, v1 in summary.subsections.get(k.name, {}).items(): metrics = lower_perfentry(v1) - perf("%s+ %s ran in %.2f s [%.2f%%] %s" % - (indent*2, n, fround(v1.time), fround(v1.time/v.time*100), - metrics)) + perf(f"{indent*2}+ {n} ran in {fround(v1.time):.2f} s " + f"[{fround(v1.time/v.time*100):.2f}%] {metrics}") # Emit performance mode and arguments perf_args = {} @@ -1077,7 +1087,7 @@ def 
lower_perfentry(v): if is_integer(self.npthreads): perf_args['pthreads'] = self.npthreads perf_args = {k: perf_args[k] for k in sorted(perf_args)} - perf("Performance[mode=%s] arguments: %s" % (self._mode, perf_args)) + perf(f"Performance[mode={self._mode}] arguments: {perf_args}") return summary @@ -1128,7 +1138,7 @@ def __setstate__(self, state): self._lib.name = soname self._allocator = default_allocator( - '%s.%s.%s' % (self._compiler.name, self._language, self._platform) + f'{type(self._compiler).__name__}.{self._language}.{self._platform}' ) @@ -1318,7 +1328,7 @@ def parse_kwargs(**kwargs): warning("Both `dle` and `opt` were passed; ignoring `dle` argument") opt = kwargs.pop('opt') else: - warning("Setting `opt=%s`" % str(dle)) + warning(f"Setting `opt={str(dle)}`") opt = dle elif 'opt' in kwargs: opt = kwargs.pop('opt') @@ -1338,7 +1348,7 @@ def parse_kwargs(**kwargs): else: mode, options = tuple(flatten(i.split(',') for i in opt)), {} else: - raise InvalidOperator("Illegal `opt=%s`" % str(opt)) + raise InvalidOperator(f"Illegal `opt={str(opt)}`") # `opt`, deprecated kwargs kwopenmp = kwargs.get('openmp', options.get('openmp')) @@ -1358,7 +1368,7 @@ def parse_kwargs(**kwargs): for i in deprecated_options: try: options.pop(i) - warning("Ignoring deprecated optimization option `%s`" % i) + warning(f"Ignoring deprecated optimization option `{i}`") except KeyError: pass kwargs['options'] = options @@ -1374,7 +1384,7 @@ def parse_kwargs(**kwargs): if not isinstance(platform, str): raise ValueError("Argument `platform` should be a `str`") if platform not in configuration._accepted['platform']: - raise InvalidOperator("Illegal `platform=%s`" % str(platform)) + raise InvalidOperator(f"Illegal `platform={str(platform)}`") kwargs['platform'] = platform_registry[platform]() else: kwargs['platform'] = configuration['platform'] @@ -1385,7 +1395,7 @@ def parse_kwargs(**kwargs): if not isinstance(language, str): raise ValueError("Argument `language` should be a `str`") if 
language not in configuration._accepted['language']: - raise InvalidOperator("Illegal `language=%s`" % str(language)) + raise InvalidOperator(f"Illegal `language={str(language)}`") kwargs['language'] = language elif kwopenmp is not None: # Handle deprecated `openmp` kwarg for backward compatibility @@ -1399,10 +1409,11 @@ def parse_kwargs(**kwargs): if not isinstance(compiler, str): raise ValueError("Argument `compiler` should be a `str`") if compiler not in configuration._accepted['compiler']: - raise InvalidOperator("Illegal `compiler=%s`" % str(compiler)) + raise InvalidOperator(f"Illegal `compiler={str(compiler)}`") kwargs['compiler'] = compiler_registry[compiler](platform=kwargs['platform'], language=kwargs['language'], - mpi=configuration['mpi']) + mpi=configuration['mpi'], + name=compiler) elif any([platform, language]): kwargs['compiler'] =\ configuration['compiler'].__new_with__(platform=kwargs['platform'], @@ -1411,11 +1422,18 @@ def parse_kwargs(**kwargs): else: kwargs['compiler'] = configuration['compiler'].__new_with__() + # Make sure compiler and language are compatible + if compiler is not None and kwargs['compiler']._cpp and \ + kwargs['language'] in ['C', 'openmp']: + kwargs['language'] = 'CXX' if kwargs['language'] == 'C' else 'CXXopenmp' + if 'CXX' in kwargs['language'] and not kwargs['compiler']._cpp: + kwargs['compiler'] = kwargs['compiler'].__new_with__(cpp=True) + # `allocator` kwargs['allocator'] = default_allocator( - '%s.%s.%s' % (kwargs['compiler'].name, - kwargs['language'], - kwargs['platform']) + f"{kwargs['compiler'].__class__.__name__}" + f".{kwargs['language']}" + f".{kwargs['platform']}" ) # Normalize `subs`, if any diff --git a/devito/operator/registry.py b/devito/operator/registry.py index 04c1000866..c8aac315b7 100644 --- a/devito/operator/registry.py +++ b/devito/operator/registry.py @@ -26,7 +26,8 @@ class OperatorRegistry(OrderedDict, metaclass=Singleton): """ _modes = ('noop', 'advanced', 'advanced-fsg') - _languages = ('C', 
'openmp', 'openacc', 'cuda', 'hip', 'sycl') + _languages = ('C', 'CXX', 'openmp', 'Copenmp', 'CXXopenmp', + 'openacc', 'cuda', 'hip', 'sycl') _accepted = _modes + tuple(product(_modes, _languages)) def add(self, operator, platform, mode, language='C'): diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 9c5ecde279..e828782812 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -109,11 +109,11 @@ class CireTransformer: def __init__(self, sregistry, options, platform): self.sregistry = sregistry self.platform = platform - self.opt_minstorage = options['min-storage'] self.opt_rotate = options['cire-rotate'] self.opt_ftemps = options['cire-ftemps'] self.opt_mingain = options['cire-mingain'] + self.opt_min_dtype = options['scalar-min-type'] self.opt_multisubdomain = True def _aliases_from_clusters(self, clusters, exclude, meta): @@ -143,7 +143,7 @@ def _aliases_from_clusters(self, clusters, exclude, meta): # Schedule -> [Clusters]_k processed, subs = lower_schedule(schedule, meta, self.sregistry, - self.opt_ftemps) + self.opt_ftemps, self.opt_min_dtype) # [Clusters]_k -> [Clusters]_k (optimization) if self.opt_multisubdomain: @@ -831,7 +831,7 @@ def optimize_schedule_rotations(schedule, sregistry): return schedule.rebuild(*processed, rmapper=rmapper) -def lower_schedule(schedule, meta, sregistry, ftemps): +def lower_schedule(schedule, meta, sregistry, ftemps, min_dtype): """ Turn a Schedule into a sequence of Clusters. 
""" @@ -849,7 +849,6 @@ def lower_schedule(schedule, meta, sregistry, ftemps): # This prevents cases such as `floor(a*b)` with `a` and `b` floats # that would creat a temporary `int r = b` leading to erronous # numerical results - dtype = sympy_dtype(pivot, base=meta.dtype) if writeto: # The Dimensions defining the shape of Array @@ -881,6 +880,7 @@ def lower_schedule(schedule, meta, sregistry, ftemps): # E.g., `z` -- a non-shifted Dimension indices.append(i.dim - i.lower) + dtype = sympy_dtype(pivot, base=meta.dtype) obj = make(name=name, dimensions=dimensions, halo=halo, dtype=dtype) expression = Eq(obj[indices], uxreplace(pivot, subs)) @@ -889,6 +889,7 @@ def lower_schedule(schedule, meta, sregistry, ftemps): # Degenerate case: scalar expression assert writeto.size == 0 + dtype = sympy_dtype(pivot, base=meta.dtype, smin=min_dtype) obj = Temp(name=name, dtype=dtype) expression = Eq(obj, uxreplace(pivot, subs)) diff --git a/devito/passes/clusters/cse.py b/devito/passes/clusters/cse.py index 30b4afa0af..f2677f9ca8 100644 --- a/devito/passes/clusters/cse.py +++ b/devito/passes/clusters/cse.py @@ -1,6 +1,7 @@ from collections import defaultdict from functools import cached_property, singledispatch +import numpy as np import sympy from sympy import Add, Function, Indexed, Mul, Pow try: @@ -69,11 +70,15 @@ def cse(cluster, sregistry=None, options=None, **kwargs): """ min_cost = options['cse-min-cost'] mode = options['cse-algo'] + try: + dtype = np.promote_types(options['scalar-min-type'], cluster.dtype).type + except TypeError: + dtype = cluster.dtype if cluster.is_fence: return cluster - make = lambda: CTemp(name=sregistry.make_name(), dtype=cluster.dtype) + make = lambda: CTemp(name=sregistry.make_name(), dtype=dtype) exprs = _cse(cluster, make, min_cost=min_cost, mode=mode) diff --git a/devito/passes/clusters/derivatives.py b/devito/passes/clusters/derivatives.py index f8f339aa1e..5af92a3208 100644 --- a/devito/passes/clusters/derivatives.py +++ 
b/devito/passes/clusters/derivatives.py @@ -1,6 +1,7 @@ from functools import singledispatch from sympy import S +import numpy as np from devito.finite_differences import IndexDerivative from devito.ir import Backward, Forward, Interval, IterationSpace, Queue @@ -157,7 +158,7 @@ def _(expr, c, ispace, weights, reusables, mapper, **kwargs): # NOTE: created before recurring so that we ultimately get a sound ordering try: s = reusables.pop() - assert s.dtype is dtype + assert np.can_cast(s.dtype, dtype) except KeyError: name = sregistry.make_name(prefix='r') s = Symbol(name=name, dtype=dtype) diff --git a/devito/passes/clusters/factorization.py b/devito/passes/clusters/factorization.py index 8007d3fee2..45d140a253 100644 --- a/devito/passes/clusters/factorization.py +++ b/devito/passes/clusters/factorization.py @@ -195,7 +195,6 @@ def _collect_nested(expr, strategy): Recursion helper for `collect_nested`. """ # Return semantic (rebuilt expression, factorization candidates) - if expr.is_Number: return expr, {'coeffs': expr} elif q_routine(expr): diff --git a/devito/passes/iet/__init__.py b/devito/passes/iet/__init__.py index c09db00c9b..1cdb97c794 100644 --- a/devito/passes/iet/__init__.py +++ b/devito/passes/iet/__init__.py @@ -8,3 +8,4 @@ from .instrument import * # noqa from .languages import * # noqa from .errors import * # noqa +from .dtypes import * # noqa diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py index 3532169754..3cc5446d57 100644 --- a/devito/passes/iet/definitions.py +++ b/devito/passes/iet/definitions.py @@ -16,7 +16,7 @@ from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import LangBB from devito.symbolics import (Byref, DefFunction, FieldFromPointer, IndexedPointer, - SizeOf, VOID, Keyword, pow_to_mul) + SizeOf, VOID, pow_to_mul) from devito.tools import as_mapper, as_list, as_tuple, filter_sorted, flatten from devito.types import (Array, ComponentAccess, CustomDimension, DeviceMap, 
DeviceRM, Eq, Symbol) @@ -70,7 +70,7 @@ def map(self, key, site, k, v): class DataManager: - lang = LangBB + langbb = LangBB """ The language used to express data allocations, deletions, and host-device transfers. """ @@ -127,13 +127,13 @@ def _alloc_array_on_global_mem(self, site, obj, storage): name = self.sregistry.make_name(prefix='init_global') nbytes = SizeOf(obj._C_typedata)*obj.size body = [Definition(src), - self.lang['alloc-global-symbol'](obj.indexed, src.indexed, nbytes)] + self.langbb['alloc-global-symbol'](obj.indexed, src.indexed, nbytes)] efunc = make_callable(name, body) alloc = Call(name, efunc.parameters) storage.update(obj, site, allocs=alloc, efuncs=efunc) - return self.lang['header-memcpy'] + return self.langbb['header-memcpy'] def _alloc_scalar_on_low_lat_mem(self, site, expr, storage): """ @@ -150,9 +150,9 @@ def _alloc_host_array_on_high_bw_mem(self, site, obj, storage, *args): memptr = VOID(Byref(obj._C_symbol), '**') alignment = obj._data_alignment nbytes = SizeOf(obj._C_typedata)*obj.size - alloc = self.lang['host-alloc'](memptr, alignment, nbytes) + alloc = self.langbb['host-alloc'](memptr, alignment, nbytes) - free = self.lang['host-free'](obj._C_symbol) + free = self.langbb['host-free'](obj._C_symbol) storage.update(obj, site, allocs=(decl, alloc), frees=free) @@ -172,7 +172,7 @@ def _alloc_mapped_array_on_high_bw_mem(self, site, obj, storage, *args): memptr = VOID(Byref(obj._C_symbol), '**') alignment = obj._data_alignment nbytes = SizeOf(obj._C_typedata) - allocs = [self.lang['host-alloc'](memptr, alignment, nbytes)] + allocs = [self.langbb['host-alloc'](memptr, alignment, nbytes)] nbytes_param = Symbol(name='nbytes', dtype=np.uint64, is_const=True) nbytes_arg = SizeOf(obj.indexed._C_typedata)*obj.size @@ -180,7 +180,7 @@ def _alloc_mapped_array_on_high_bw_mem(self, site, obj, storage, *args): # Allocate the underlying host data ffp0 = FieldFromPointer(obj._C_field_data, obj._C_symbol) memptr = VOID(Byref(ffp0), '**') - 
allocs.append(self.lang['host-alloc-pin'](memptr, alignment, nbytes_param)) + allocs.append(self.langbb['host-alloc-pin'](memptr, alignment, nbytes_param)) # Initialize the Array struct ffp1 = FieldFromPointer(obj._C_field_nbytes, obj._C_symbol) @@ -188,8 +188,8 @@ def _alloc_mapped_array_on_high_bw_mem(self, site, obj, storage, *args): ffp2 = FieldFromPointer(obj._C_field_size, obj._C_symbol) init1 = DummyExpr(ffp2, 0) - frees = [self.lang['host-free-pin'](ffp0), - self.lang['host-free'](obj._C_symbol)] + frees = [self.langbb['host-free-pin'](ffp0), + self.langbb['host-free'](obj._C_symbol)] # Allocate the underlying device data, if required by the backend alloc, free = self._make_dmap_allocfree(obj, nbytes_param) @@ -226,7 +226,7 @@ def _alloc_bundle_struct_on_high_bw_mem(self, site, obj, storage): memptr = VOID(Byref(obj._C_symbol), '**') alignment = obj._data_alignment nbytes = SizeOf(obj._C_typedata) - alloc = self.lang['host-alloc'](memptr, alignment, nbytes) + alloc = self.langbb['host-alloc'](memptr, alignment, nbytes) nbytes_param = Symbol(name='nbytes', dtype=np.uint64, is_const=True) nbytes_arg = SizeOf(obj.indexed._C_typedata)*obj.size @@ -237,7 +237,7 @@ def _alloc_bundle_struct_on_high_bw_mem(self, site, obj, storage): ffp2 = FieldFromPointer(obj._C_field_size, obj._C_symbol) init1 = DummyExpr(ffp2, 0) - free = self.lang['host-free'](obj._C_symbol) + free = self.langbb['host-free'](obj._C_symbol) ret = Return(obj._C_symbol) @@ -276,18 +276,18 @@ def _alloc_pointed_array_on_high_bw_mem(self, site, obj, storage): memptr = VOID(Byref(obj._C_symbol), '**') alignment = obj._data_alignment - nbytes = SizeOf(Keyword('%s*' % obj._C_typedata))*obj.dim.symbolic_size - alloc0 = self.lang['host-alloc'](memptr, alignment, nbytes) + nbytes = SizeOf(obj._C_typedata, stars='*')*obj.dim.symbolic_size + alloc0 = self.langbb['host-alloc'](memptr, alignment, nbytes) - free0 = self.lang['host-free'](obj._C_symbol) + free0 = self.langbb['host-free'](obj._C_symbol) # The 
pointee Array pobj = IndexedPointer(obj._C_symbol, obj.dim) memptr = VOID(Byref(pobj), '**') nbytes = SizeOf(obj._C_typedata)*obj.array.size - alloc1 = self.lang['host-alloc'](memptr, alignment, nbytes) + alloc1 = self.langbb['host-alloc'](memptr, alignment, nbytes) - free1 = self.lang['host-free'](pobj) + free1 = self.langbb['host-free'](pobj) # Dump if obj.dim is self.sregistry.threadid: @@ -322,15 +322,15 @@ def _inject_definitions(self, iet, storage): allocs = as_list(cbody.allocs) + flatten(v.allocs) stacks = as_list(cbody.stacks) + flatten(v.stacks) for tid, body in as_mapper(v.pallocs, itemgetter(0), itemgetter(1)).items(): - header = self.lang.Region._make_header(tid.symbolic_size) - init = self.lang['thread-num'](retobj=tid) + header = self.langbb.Region._make_header(tid.symbolic_size) + init = self.langbb['thread-num'](retobj=tid) allocs.append(Block(header=header, body=[init] + body)) # frees/pfrees frees = [] for tid, body in as_mapper(v.pfrees, itemgetter(0), itemgetter(1)).items(): - header = self.lang.Region._make_header(tid.symbolic_size) - init = self.lang['thread-num'](retobj=tid) + header = self.langbb.Region._make_header(tid.symbolic_size) + init = self.langbb['thread-num'](retobj=tid) frees.append(Block(header=header, body=[init] + body)) frees.extend(as_list(cbody.frees) + flatten(v.frees)) @@ -455,7 +455,7 @@ def place_casts(self, iet, **kwargs): bases = [i for i in bases if i.name != i.function._C_name] # Create and attach the type casts - casts = tuple(self.lang.PointerCast(i.function, obj=i) for i in bases + casts = tuple(self.langbb.PointerCast(i.function, obj=i) for i in bases if i not in defines) if casts: iet = iet._rebuild(body=iet.body._rebuild(casts=casts + iet.body.casts)) @@ -483,9 +483,9 @@ def _alloc_local_array_on_high_bw_mem(self, site, obj, storage): """ Allocate a local Array in the device high bandwidth memory. 
""" - deviceid = DefFunction(self.lang['device-get'].name) - doalloc = self.lang['device-alloc'] - dofree = self.lang['device-free'] + deviceid = DefFunction(self.langbb['device-get'].name) + doalloc = self.langbb['device-alloc'] + dofree = self.langbb['device-free'] nbytes = SizeOf(obj._C_typedata)*obj.size init = doalloc(nbytes, deviceid, retobj=obj) @@ -503,8 +503,8 @@ def _map_array_on_high_bw_mem(self, site, obj, storage): if not obj._mem_mapped: return - mmap = self.lang._map_alloc(obj) - unmap = self.lang._map_delete(obj) + mmap = self.langbb._map_alloc(obj) + unmap = self.langbb._map_delete(obj) storage.update(obj, site, maps=mmap, unmaps=unmap) @@ -521,22 +521,22 @@ def _map_function_on_high_bw_mem(self, site, obj, storage, devicerm, read_only=F """ if read_only is False: if is_gpu_create(obj, self.gpu_create): - mmap = self.lang._map_alloc(obj) + mmap = self.langbb._map_alloc(obj) efuncs, init = make_zero_init(obj, self.rcompile, self.sregistry) mmap = (mmap, init) else: - mmap = self.lang._map_to(obj) + mmap = self.langbb._map_to(obj) efuncs = () # Copy back to host memory, release device memory - unmap = (self.lang._map_update(obj), - self.lang._map_release(obj, devicerm=devicerm)) + unmap = (self.langbb._map_update(obj), + self.langbb._map_release(obj, devicerm=devicerm)) else: - mmap = self.lang._map_to(obj) + mmap = self.langbb._map_to(obj) efuncs = () - unmap = self.lang._map_delete(obj, devicerm=devicerm) + unmap = self.langbb._map_delete(obj, devicerm=devicerm) storage.update(obj, site, maps=mmap, unmaps=unmap, efuncs=efuncs) @@ -587,7 +587,7 @@ def place_devptr(self, iet, **kwargs): dmaps = [i for i in FindSymbols('basics').visit(iet) if isinstance(i, DeviceMap) and i not in defines] - maps = [self.lang.PointerCast(i.function, obj=i) for i in dmaps] + maps = [self.langbb.PointerCast(i.function, obj=i) for i in dmaps] body = iet.body._rebuild(maps=iet.body.maps + tuple(maps)) iet = iet._rebuild(body=body) diff --git a/devito/passes/iet/dtypes.py 
b/devito/passes/iet/dtypes.py new file mode 100644 index 0000000000..28d30df13b --- /dev/null +++ b/devito/passes/iet/dtypes.py @@ -0,0 +1,55 @@ +import numpy as np + +from devito.arch.compiler import Compiler +from devito.ir import Callable, SymbolRegistry +from devito.ir.iet.utils import has_dtype +from devito.passes.iet.engine import iet_pass +from devito.passes.iet.langbase import LangBB +from devito.tools import as_tuple + +__all__ = ['lower_dtypes'] + + +@iet_pass +def _complex_includes(iet: Callable, langbb: type[LangBB], compiler: Compiler, + sregistry: SymbolRegistry) -> tuple[Callable, dict]: + """ + Includes complex arithmetic headers for the given language, if needed. + """ + # Check if there are complex numbers that always take dtype precedence + if not has_dtype(iet, np.complexfloating): + return iet, {} + + metadata = {} + lib = as_tuple(langbb['includes-complex']) + + if langbb.get('complex-namespace') is not None: + metadata['namespaces'] = langbb['complex-namespace'] + + # Some languages such as c++11 need some extra arithmetic definitions + if langbb.get('def-complex'): + dest = compiler.get_jit_dir() + hfile = dest.joinpath('complex_arith.h') + with open(str(hfile), 'w') as ff: + ff.write(str(langbb['def-complex'])) + lib += (str(hfile),) + + metadata['includes'] = lib + + return iet, metadata + + +dtype_passes = [_complex_includes] + + +def lower_dtypes(graph: Callable, + langbb: type[LangBB] = None, + compiler: Compiler = None, + sregistry: SymbolRegistry = None, **kwargs) -> tuple[Callable, dict]: + """ + Lowers float16 scalar types to pointers since we can't directly pass their + value. Also includes headers for complex arithmetic if needed.
+ """ + + for dtype_pass in dtype_passes: + dtype_pass(graph, langbb=langbb, compiler=compiler, sregistry=sregistry) diff --git a/devito/passes/iet/errors.py b/devito/passes/iet/errors.py index fb89934e46..8082ae53cf 100644 --- a/devito/passes/iet/errors.py +++ b/devito/passes/iet/errors.py @@ -6,7 +6,7 @@ List, Break, Return, FindNodes, FindSymbols, Transformer, make_callable) from devito.passes.iet.engine import iet_pass -from devito.symbolics import CondEq, DefFunction +from devito.symbolics import CondEq, MathFunction from devito.tools import dtype_to_ctype from devito.types import Eq, Inc, LocalObject, Symbol @@ -58,7 +58,7 @@ def _check_stability(iet, wmovs=(), rcompile=None, sregistry=None): irs, byproduct = rcompile(eqns) name = sregistry.make_name(prefix='is_finite') - retval = Return(DefFunction('isfinite', accumulator)) + retval = Return(MathFunction('isfinite', accumulator)) body = irs.iet.body.body + (retval,) efunc = make_callable(name, body, retval='int') diff --git a/devito/passes/iet/instrument.py b/devito/passes/iet/instrument.py index 7462c8f07c..5ad934d13f 100644 --- a/devito/passes/iet/instrument.py +++ b/devito/passes/iet/instrument.py @@ -121,12 +121,12 @@ def instrument_sections(iet, **kwargs): @iet_pass -def sync_sections(iet, lang=None, profiler=None, **kwargs): +def sync_sections(iet, langbb=None, profiler=None, **kwargs): """ Wrap sections within global barriers if deemed necessary by the profiler. 
""" try: - sync = lang['map-wait'] + sync = langbb['map-wait'] except (KeyError, NotImplementedError): return iet, {} @@ -137,7 +137,7 @@ def sync_sections(iet, lang=None, profiler=None, **kwargs): for tl in FindNodes(TimedList).visit(iet): symbols = FindSymbols().visit(tl) - queues = [i for i in symbols if isinstance(i, lang.AsyncQueue)] + queues = [i for i in symbols if isinstance(i, langbb.AsyncQueue)] unnecessary = any(FindNodes(BusyWait).visit(tl)) if queues and not unnecessary: waits = tuple(sync(i) for i in queues) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index d27674c419..d2542309e2 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -31,6 +31,9 @@ def __getitem__(self, k): raise NotImplementedError("Missing required mapping for `%s`" % k) return self.mapper[k] + def get(self, k, v=None): + return self.mapper.get(k, v) + class LangBB(metaclass=LangMeta): @@ -151,7 +154,7 @@ class LangTransformer(ABC): an IET for a certain target language (e.g., C, C+OpenMP). """ - lang = LangBB + langbb = LangBB """ The constructs of the target language. To be specialized by a subclass. 
""" @@ -202,19 +205,19 @@ def initialize(self, iet, options=None): @property def Region(self): - return self.lang.Region + return self.langbb.Region @property def HostIteration(self): - return self.lang.HostIteration + return self.langbb.HostIteration @property def DeviceIteration(self): - return self.lang.DeviceIteration + return self.langbb.DeviceIteration @property def Prodder(self): - return self.lang.Prodder + return self.langbb.Prodder class ShmTransformer(LangTransformer): @@ -433,11 +436,11 @@ def _(iet): except AttributeError: pass - devicetype = as_list(self.lang[self.platform]) + devicetype = as_list(self.langbb[self.platform]) deviceid = self.deviceid try: - lang_init = [self.lang['init'](devicetype)] + lang_init = [self.langbb['init'](devicetype)] except TypeError: # Not all target languages need to be explicitly initialized lang_init = [] @@ -447,10 +450,10 @@ def _(iet): rank_decl = DummyExpr(rank, 0) rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)]) - ngpus, call_ngpus = self.lang._get_num_devices(self.platform) + ngpus, call_ngpus = self.langbb._get_num_devices(self.platform) - osdd_then = self.lang['set-device']([deviceid] + devicetype) - osdd_else = self.lang['set-device']([rank % ngpus] + devicetype) + osdd_then = self.langbb['set-device']([deviceid] + devicetype) + osdd_else = self.langbb['set-device']([rank % ngpus] + devicetype) body = lang_init + [Conditional( CondNe(deviceid, -1), @@ -458,16 +461,16 @@ def _(iet): List(body=[rank_decl, rank_init, call_ngpus, osdd_else]), )] - header = c.Comment('Begin of %s+MPI setup' % self.lang['name']) - footer = c.Comment('End of %s+MPI setup' % self.lang['name']) + header = c.Comment('Begin of %s+MPI setup' % self.langbb['name']) + footer = c.Comment('End of %s+MPI setup' % self.langbb['name']) else: body = lang_init + [Conditional( CondNe(deviceid, -1), - self.lang['set-device']([deviceid] + devicetype) + self.langbb['set-device']([deviceid] + devicetype) )] - header = c.Comment('Begin of %s 
setup' % self.lang['name']) - footer = c.Comment('End of %s setup' % self.lang['name']) + header = c.Comment('Begin of %s setup' % self.langbb['name']) + footer = c.Comment('End of %s setup' % self.langbb['name']) init = List(header=header, body=body, footer=footer) iet = iet._rebuild(body=iet.body._rebuild(init=init)) @@ -476,12 +479,12 @@ def _(iet): @_initialize.register(AsyncCallable) def _(iet): - devicetype = as_list(self.lang[self.platform]) + devicetype = as_list(self.langbb[self.platform]) deviceid = self.deviceid init = Conditional( CondNe(deviceid, -1), - self.lang['set-device']([deviceid] + devicetype) + self.langbb['set-device']([deviceid] + devicetype) ) iet = iet._rebuild(body=iet.body._rebuild(init=init)) diff --git a/devito/passes/iet/languages/C.py b/devito/passes/iet/languages/C.py index 4b3358798d..bfb0935a20 100644 --- a/devito/passes/iet/languages/C.py +++ b/devito/passes/iet/languages/C.py @@ -1,7 +1,11 @@ -from devito.ir import Call +import numpy as np +from sympy.printing.c import C99CodePrinter + +from devito.ir import Call, BasePrinter from devito.passes.iet.definitions import DataManager from devito.passes.iet.orchestration import Orchestrator from devito.passes.iet.langbase import LangBB +from devito.symbolics.extended_dtypes import c_complex, c_double_complex __all__ = ['CBB', 'CDataManager', 'COrchestrator'] @@ -9,6 +13,9 @@ class CBB(LangBB): mapper = { + # Complex + 'includes-complex': 'complex.h', + # Allocs 'header-memcpy': 'string.h', 'host-alloc': lambda i, j, k: Call('posix_memalign', (i, j, k)), @@ -19,13 +26,32 @@ class CBB(LangBB): 'host-free-pin': lambda i: Call('free', (i,)), 'alloc-global-symbol': lambda i, j, k: - Call('memcpy', (i, j, k)) + Call('memcpy', (i, j, k)), } class CDataManager(DataManager): - lang = CBB + langbb = CBB class COrchestrator(Orchestrator): - lang = CBB + langbb = CBB + + +class CPrinter(BasePrinter, C99CodePrinter): + + _default_settings = {**BasePrinter._default_settings, + 
**C99CodePrinter._default_settings} + _func_literals = {np.float32: 'f', np.complex64: 'f'} + _func_prefix = {np.float32: 'f', np.float64: 'f', + np.complex64: 'c', np.complex128: 'c'} + _includes = ['stdlib.h', 'math.h', 'sys/time.h'] + + # These cannot go through _print_xxx because they are classes not + # instances + type_mappings = {**C99CodePrinter.type_mappings, + c_complex: 'float _Complex', + c_double_complex: 'double _Complex'} + + def _print_ImaginaryUnit(self, expr): + return '_Complex_I' diff --git a/devito/passes/iet/languages/CXX.py b/devito/passes/iet/languages/CXX.py new file mode 100644 index 0000000000..82e099e88c --- /dev/null +++ b/devito/passes/iet/languages/CXX.py @@ -0,0 +1,114 @@ +import numpy as np +from sympy.printing.cxx import CXX11CodePrinter + +from devito.ir import Call, UsingNamespace, BasePrinter +from devito.passes.iet.langbase import LangBB +from devito.symbolics.extended_dtypes import c_complex, c_double_complex + +__all__ = ['CXXBB'] + + +def std_arith(prefix=None): + if prefix: + # Method definition prefix, e.g. 
"__host__" + # Make sure there is a space between the prefix and the method name + prefix = prefix if prefix.endswith(" ") else f"{prefix} " + else: + prefix = "" + return f""" +#include <complex> + +template<typename _Tp, typename _Ti> +{prefix}std::complex<_Tp> operator * (const _Ti & a, const std::complex<_Tp> & b){{ + return std::complex<_Tp>(b.real() * a, b.imag() * a); +}} + +template<typename _Tp, typename _Ti> +{prefix}std::complex<_Tp> operator * (const std::complex<_Tp> & b, const _Ti & a){{ + return std::complex<_Tp>(b.real() * a, b.imag() * a); +}} + +template<typename _Tp, typename _Ti> +{prefix}std::complex<_Tp> operator / (const _Ti & a, const std::complex<_Tp> & b){{ + _Tp denom = b.real() * b.real () + b.imag() * b.imag(); + return std::complex<_Tp>(b.real() * a / denom, - b.imag() * a / denom); +}} + +template<typename _Tp, typename _Ti> +{prefix}std::complex<_Tp> operator / (const std::complex<_Tp> & b, const _Ti & a){{ + return std::complex<_Tp>(b.real() / a, b.imag() / a); +}} + +template<typename _Tp, typename _Ti> +{prefix}std::complex<_Tp> operator + (const _Ti & a, const std::complex<_Tp> & b){{ + return std::complex<_Tp>(b.real() + a, b.imag()); +}} + +template<typename _Tp, typename _Ti> +{prefix}std::complex<_Tp> operator + (const std::complex<_Tp> & b, const _Ti & a){{ + return std::complex<_Tp>(b.real() + a, b.imag()); +}} + +template<typename _Tp, typename _Ti> +{prefix}std::complex<_Tp> operator - (const _Ti & a, const std::complex<_Tp> & b){{ + return std::complex<_Tp>(a - b.real(), -b.imag()); +}} + +template<typename _Tp, typename _Ti> +{prefix}std::complex<_Tp> operator - (const std::complex<_Tp> & b, const _Ti & a){{ + return std::complex<_Tp>(b.real() - a, b.imag()); +}} + +""" + + +class CXXBB(LangBB): + + mapper = { + # Complex + 'includes-complex': 'complex', + 'complex-namespace': [UsingNamespace('std::complex_literals')], + 'def-complex': std_arith(), + # Allocs + 'header-memcpy': 'string.h', + 'header-math': 'algorithm', + 
'host-alloc': lambda i, j, k: + Call('posix_memalign', (i, j, k)), + 'host-alloc-pin': lambda i, j, k: + Call('posix_memalign', (i, j, k)), + 'host-free': lambda i: + Call('free', (i,)), + 'host-free-pin': lambda i: + Call('free', 
(i,)), + 'alloc-global-symbol': lambda i, j, k: + Call('memcpy', (i, j, k)), + } + + +class CXXPrinter(BasePrinter, CXX11CodePrinter): + + _default_settings = {**BasePrinter._default_settings, + **CXX11CodePrinter._default_settings} + _ns = "std::" + _func_literals = {} + _func_prefix = {np.float32: 'f', np.float64: 'f'} + _restrict_keyword = '__restrict' + _includes = ['stdlib.h', 'cmath', 'sys/time.h'] + + # These cannot go through _print_xxx because they are classes not + # instances + type_mappings = {**CXX11CodePrinter.type_mappings, + c_complex: 'std::complex<float>', + c_double_complex: 'std::complex<double>'} + + def _print_ImaginaryUnit(self, expr): + return f'1i{self.prec_literal(expr).lower()}' + + def _print_Cast(self, expr): + # The CXX recommended way to cast is to use static_cast + tstr = self._print(expr._C_ctype) + if 'void' in tstr: + return super()._print_Cast(expr) + caster = 'reinterpret_cast' if expr.reinterpret else 'static_cast' + cast = f'{caster}<{tstr}{self._print(expr.stars)}>' + return self._print_UnaryOp(expr, op=cast, parenthesize=True) diff --git a/devito/passes/iet/languages/openacc.py b/devito/passes/iet/languages/openacc.py index bcd2c8d006..bacd2a5f66 100644 --- a/devito/passes/iet/languages/openacc.py +++ b/devito/passes/iet/languages/openacc.py @@ -9,9 +9,9 @@ from devito.passes.iet.orchestration import Orchestrator from devito.passes.iet.parpragma import (PragmaDeviceAwareTransformer, PragmaLangBB, PragmaIteration, PragmaTransfer) -from devito.passes.iet.languages.C import CBB +from devito.passes.iet.languages.CXX import CXXBB, CXXPrinter from devito.passes.iet.languages.openmp import OmpRegion, OmpIteration -from devito.symbolics import FieldFromPointer, Macro, cast_mapper +from devito.symbolics import FieldFromPointer, Macro, cast from devito.tools import filter_ordered, UnboundTuple from devito.types import Symbol @@ -122,7 +122,8 @@ class AccBB(PragmaLangBB): 'device-free': lambda i, *a: Call('acc_free', (i,)) } - 
mapper.update(CBB.mapper) + + mapper.update(CXXBB.mapper) Region = OmpRegion HostIteration = OmpIteration # Host parallelism still goes via OpenMP @@ -161,7 +162,7 @@ def _map_update_device_async(cls, f, imask=None, qid=None): class DeviceAccizer(PragmaDeviceAwareTransformer): - lang = AccBB + langbb = AccBB def _make_partree(self, candidates, nthreads=None): assert candidates @@ -186,7 +187,7 @@ def _make_partree(self, candidates, nthreads=None): class DeviceAccDataManager(DeviceAwareDataManager): - lang = AccBB + langbb = AccBB @iet_pass def place_devptr(self, iet, **kwargs): @@ -234,17 +235,17 @@ def place_devptr(self, iet, **kwargs): init = DummyExpr(tdp, 0, init=True) dpf = List(body=[ - self.lang.mapper['map-serial-present'](hp, tdp), - Block(body=DummyExpr(tdp, cast_mapper[tdp.dtype](hp))) + self.langbb.mapper['map-serial-present'](hp, tdp), + Block(body=DummyExpr(tdp, cast(tdp.dtype)(hp, reinterpret=True))) ]) ffp = FieldFromPointer(f._C_field_dmap, f._C_symbol) - ctdp = cast_mapper[(hp.dtype, '*')](tdp) - cast = DummyExpr(ffp, ctdp) + ctdp = cast(hp.dtype, '*')(tdp, reinterpret=True) + castf = DummyExpr(ffp, ctdp) ret = Return(ctdp) - body = List(body=[init, dpf, cast, ret]) + body = List(body=[init, dpf, castf, ret]) name = self.sregistry.make_name(prefix='map_device_ptr') efuncs.append(make_callable(name, body, retval=hp)) @@ -261,4 +262,8 @@ def place_devptr(self, iet, **kwargs): class AccOrchestrator(Orchestrator): - lang = AccBB + langbb = AccBB + + +class AccPrinter(CXXPrinter): + pass diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index ad12879b25..94b68acfb6 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -87,7 +87,7 @@ class ThreadedProdder(Conditional, Prodder): def __init__(self, prodder, arguments=None): # Atomic-ize any single-thread Prodders in the parallel tree - condition = CondEq(DefFunction(Ompizer.lang['thread-num']().name), 0) + condition = 
CondEq(DefFunction(Ompizer.langbb['thread-num']().name), 0) # Prod within a while loop until all communications have completed # In other words, the thread delegated to prodding is entrapped for as long @@ -210,12 +210,12 @@ def _map_delete(cls, f, imask=None, devicerm=None): class SimdOmpizer(PragmaSimdTransformer): - lang = OmpBB + langbb = OmpBB class Ompizer(PragmaShmTransformer): - lang = OmpBB + langbb = OmpBB @classmethod def _support_array_reduction(cls, compiler): @@ -228,20 +228,20 @@ def _support_array_reduction(cls, compiler): class DeviceOmpizer(PragmaDeviceAwareTransformer): - lang = DeviceOmpBB + langbb = DeviceOmpBB class OmpDataManager(DataManager): - lang = OmpBB + langbb = OmpBB class DeviceOmpDataManager(DeviceAwareDataManager): - lang = DeviceOmpBB + langbb = DeviceOmpBB class OmpOrchestrator(Orchestrator): - lang = OmpBB + langbb = OmpBB class DeviceOmpOrchestrator(Orchestrator): - lang = DeviceOmpBB + langbb = DeviceOmpBB diff --git a/devito/passes/iet/languages/targets.py b/devito/passes/iet/languages/targets.py index 4ac8d94398..3ca64e1c10 100644 --- a/devito/passes/iet/languages/targets.py +++ b/devito/passes/iet/languages/targets.py @@ -1,47 +1,81 @@ -from devito.passes.iet.languages.C import CDataManager, COrchestrator +from devito.passes.iet.languages.C import CDataManager, COrchestrator, CPrinter +from devito.passes.iet.languages.CXX import CXXPrinter from devito.passes.iet.languages.openmp import (SimdOmpizer, Ompizer, DeviceOmpizer, OmpDataManager, DeviceOmpDataManager, OmpOrchestrator, DeviceOmpOrchestrator) from devito.passes.iet.languages.openacc import (DeviceAccizer, DeviceAccDataManager, - AccOrchestrator) + AccOrchestrator, AccPrinter) from devito.passes.iet.instrument import instrument -__all__ = ['CTarget', 'OmpTarget', 'DeviceOmpTarget', 'DeviceAccTarget'] +__all__ = ['CTarget', 'OmpTarget', 'COmpTarget', 'DeviceOmpTarget', 'DeviceAccTarget', + 'CXXTarget', 'CXXOmpTarget', 'DeviceCXXOmpTarget'] class Target: Parizer = None 
DataManager = None Orchestrator = None + Printer = None @classmethod - def lang(cls): - return cls.Parizer.lang + def langbb(cls): + return cls.Parizer.langbb @classmethod def instrument(cls, *args, **kwargs): - instrument(*args, lang=cls.lang(), **kwargs) + instrument(*args, **kwargs) class CTarget(Target): Parizer = SimdOmpizer DataManager = CDataManager Orchestrator = COrchestrator + Printer = CPrinter -class OmpTarget(Target): +class CXXTarget(Target): + Parizer = SimdOmpizer + DataManager = CDataManager + Orchestrator = COrchestrator + Printer = CXXPrinter + + +class COmpTarget(Target): Parizer = Ompizer DataManager = OmpDataManager Orchestrator = OmpOrchestrator + Printer = CPrinter + + +OmpTarget = COmpTarget + + +class CXXOmpTarget(Target): + Parizer = Ompizer + DataManager = OmpDataManager + Orchestrator = OmpOrchestrator + Printer = CXXPrinter + + +class DeviceCOmpTarget(Target): + Parizer = DeviceOmpizer + DataManager = DeviceOmpDataManager + Orchestrator = DeviceOmpOrchestrator + Printer = CPrinter + + +DeviceOmpTarget = DeviceCOmpTarget -class DeviceOmpTarget(Target): +class DeviceCXXOmpTarget(Target): Parizer = DeviceOmpizer DataManager = DeviceOmpDataManager Orchestrator = DeviceOmpOrchestrator + Printer = CXXPrinter class DeviceAccTarget(Target): Parizer = DeviceAccizer DataManager = DeviceAccDataManager Orchestrator = AccOrchestrator + Printer = AccPrinter diff --git a/devito/passes/iet/misc.py b/devito/passes/iet/misc.py index 34f4f367c3..e404a8e373 100644 --- a/devito/passes/iet/misc.py +++ b/devito/passes/iet/misc.py @@ -6,15 +6,16 @@ from devito.finite_differences import Max, Min from devito.finite_differences.differentiable import SafeInv +from devito.logger import warning from devito.ir import (Any, Forward, DummyExpr, Iteration, List, Prodder, FindApplications, FindNodes, FindSymbols, Transformer, Uxreplace, filter_iterations, retrieve_iteration_tree, pull_dims) from devito.passes.iet.engine import iet_pass +from 
devito.passes.iet.languages.C import CPrinter from devito.ir.iet.efunc import DeviceFunction, EntryFunction -from devito.symbolics import (ValueLimit, evalrel, has_integer_args, limits_mapper, - ccode) -from devito.tools import Bunch, as_mapper, filter_ordered, split +from devito.symbolics import (ValueLimit, evalrel, has_integer_args, limits_mapper, Cast) +from devito.tools import Bunch, as_mapper, filter_ordered, split, as_tuple from devito.types import FIndexed __all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions', @@ -144,16 +145,19 @@ def generate_macros(graph, **kwargs): @iet_pass -def _generate_macros(iet, tracker=None, **kwargs): +def _generate_macros(iet, tracker=None, langbb=None, printer=CPrinter, **kwargs): # Derive the Macros necessary for the FIndexeds iet = _generate_macros_findexeds(iet, tracker=tracker, **kwargs) # NOTE: sorting is necessary to ensure deterministic code generation headers = [i.header for i in tracker.values()] - headers = sorted((ccode(define), ccode(expr)) for define, expr in headers) + headers = sorted((printer()._print(define), printer()._print(expr)) + for define, expr in headers) # Generate Macros from higher-level SymPy objects - headers.extend(sorted(_generate_macros_math(iet), key=str)) + mheaders, includes = _generate_macros_math(iet, langbb=langbb) + includes = sorted(includes, key=str) + headers.extend(sorted(mheaders, key=str)) # Remove redundancies while preserving the order headers = filter_ordered(headers) @@ -161,11 +165,10 @@ def _generate_macros(iet, tracker=None, **kwargs): # Some special Symbols may represent Macros defined in standard libraries, # so we need to include the respective includes limits = FindApplications(ValueLimit).visit(iet) - includes = set() if limits & (set(limits_mapper[np.int32]) | set(limits_mapper[np.int64])): - includes.add('limits.h') + includes.append('limits.h') elif limits & (set(limits_mapper[np.float32]) | set(limits_mapper[np.float64])): - includes.add('float.h') 
+ includes.append('float.h') return iet, {'headers': headers, 'includes': includes} @@ -196,42 +199,50 @@ def _generate_macros_findexeds(iet, sregistry=None, tracker=None, **kwargs): return iet -def _generate_macros_math(iet): +def _generate_macros_math(iet, langbb=None): headers = [] + includes = [] for i in FindApplications().visit(iet): - headers.extend(_lower_macro_math(i)) + header, include = _lower_macro_math(i, langbb) + headers.extend(header) + includes.extend(include) - return headers + return headers, set(includes) - {None} @singledispatch -def _lower_macro_math(expr): - return () +def _lower_macro_math(expr, langbb): + return (), {} @_lower_macro_math.register(Min) @_lower_macro_math.register(sympy.Min) -def _(expr): - if has_integer_args(*expr.args) and len(expr.args) == 2: - return (('MIN(a,b)', ('(((a) < (b)) ? (a) : (b))')),) +def _(expr, langbb): + if has_integer_args(*expr.args): + return (('MIN(a,b)', ('(((a) < (b)) ? (a) : (b))')),), {} else: - return () + return (), as_tuple(langbb.get('header-math')) @_lower_macro_math.register(Max) @_lower_macro_math.register(sympy.Max) -def _(expr): - if has_integer_args(*expr.args) and len(expr.args) == 2: - return (('MAX(a,b)', ('(((a) > (b)) ? (a) : (b))')),) +def _(expr, langbb): + if has_integer_args(*expr.args): + return (('MAX(a,b)', ('(((a) > (b)) ? (a) : (b))')),), {} else: - return () + return (), as_tuple(langbb.get('header-math')) @_lower_macro_math.register(SafeInv) -def _(expr): - eps = np.finfo(np.float32).resolution**2 +def _(expr, langbb): + try: + eps = np.finfo(expr.base.dtype).resolution**2 + except ValueError: + warning(f"dtype not recognized in SafeInv for {expr.base}, assuming float32") + eps = np.finfo(np.float32).resolution**2 + b = Cast('b', dtype=np.float32) return (('SAFEINV(a, b)', - f'(((a) < {eps} || (b) < {eps}) ? (0.0F) : (1.0F / (a)))'),) + f'(((a) < {eps}F || ({b}) < {eps}F) ? 
(0.0F) : ((1.0F) / (a)))'),), {} @iet_pass diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index b807fd561b..cd3cbf17b3 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -24,7 +24,7 @@ class Orchestrator: Lower the SyncSpot in IET for efficient host-device asynchronous computation. """ - lang = LangBB + langbb = LangBB """ The language used to implement host-device data movements. """ @@ -55,7 +55,7 @@ def _make_releaselock(self, iet, sync_ops, *args): return iet, [efunc] def _make_withlock(self, iet, sync_ops, layer): - body, prefix = withlock(layer, iet, sync_ops, self.lang, self.sregistry) + body, prefix = withlock(layer, iet, sync_ops, self.langbb, self.sregistry) # Turn `iet` into an AsyncCallable so that subsequent passes know # that we're happy for this Callable to be executed asynchronously @@ -94,7 +94,7 @@ def _make_syncarray(self, iet, sync_ops, layer): body = list(iet.body) try: - body.extend([self.lang._map_update_device(s.target, s.imask, qid=qid) + body.extend([self.langbb._map_update_device(s.target, s.imask, qid=qid) for s in sync_ops]) except NotImplementedError: pass @@ -103,7 +103,7 @@ def _make_syncarray(self, iet, sync_ops, layer): return iet, [] def _make_prefetchupdate(self, iet, sync_ops, layer): - body, prefix = prefetchupdate(layer, iet, sync_ops, self.lang, self.sregistry) + body, prefix = prefetchupdate(layer, iet, sync_ops, self.langbb, self.sregistry) # Turn `iet` into an AsyncCallable so that subsequent passes know # that we're happy for this Callable to be executed asynchronously diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index c3ed016a94..9cefc4786b 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -9,12 +9,12 @@ from devito.ir import (Conditional, DummyEq, Dereference, Expression, ExpressionBundle, FindSymbols, FindNodes, ParallelIteration, ParallelTree, Pragma, Prodder, Transfer, List, 
Transformer, - IsPerfectIteration, OpInc, filter_iterations, + IsPerfectIteration, OpInc, filter_iterations, ccode, retrieve_iteration_tree, IMask, VECTORIZED) from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import (LangBB, LangTransformer, DeviceAwareMixin, ShmTransformer, make_sections_from_imask) -from devito.symbolics import INT, ccode +from devito.symbolics import INT from devito.tools import as_tuple, flatten, is_integer, prod from devito.types import Symbol @@ -50,10 +50,10 @@ def _make_simd_pragma(self, iet): indexeds = FindSymbols('indexeds').visit(iet) aligned = {i.base for i in indexeds if i.function.is_DiscreteFunction} if aligned: - simd = self.lang['simd-for-aligned'] + simd = self.langbb['simd-for-aligned'] simd = as_tuple(simd(self.simd_reg_nbytes, *aligned)) else: - simd = as_tuple(self.lang['simd-for']) + simd = as_tuple(self.langbb['simd-for']) return simd @@ -239,7 +239,7 @@ def _make_reductions(self, partree): mapper = {partree.root: partree.root._rebuild(reduction=reductions)} elif all(i is OpInc for _, _, i in reductions): # Use atomic increments - mapper = {i: i._rebuild(pragmas=self.lang['atomic']) for i in exprs} + mapper = {i: i._rebuild(pragmas=self.langbb['atomic']) for i in exprs} else: raise NotImplementedError @@ -292,7 +292,7 @@ def _make_partree(self, candidates, nthreads=None): **root.args) niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable]) - value = INT(Max(niters / (nthreads*self.chunk_nonaffine), 1)) + value = INT(Max(INT(niters / (nthreads*self.chunk_nonaffine)), 1)) prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))] # Create a ParallelTree @@ -319,7 +319,7 @@ def _make_parregion(self, partree, parrays): vexpandeds.append(VExpanded(i, pi)) if vexpandeds: - init = self.lang['thread-num'](retobj=self.threadid) + init = self.langbb['thread-num'](retobj=self.threadid) prefix = List(body=[init] + vexpandeds + list(partree.prefix), footer=c.Line()) partree = 
partree._rebuild(prefix=prefix) @@ -417,7 +417,7 @@ def _make_parallel(self, iet, sync_mapper=None): iet = Transformer(mapper).visit(iet) - return iet, {'includes': [self.lang['header']]} + return iet, {'includes': [self.langbb['header']]} def make_parallel(self, graph): return self._make_parallel(graph, sync_mapper=graph.sync_mapper) diff --git a/devito/symbolics/__init__.py b/devito/symbolics/__init__.py index 0f5c261471..3f1525297a 100644 --- a/devito/symbolics/__init__.py +++ b/devito/symbolics/__init__.py @@ -1,6 +1,6 @@ from devito.symbolics.extended_sympy import * # noqa +from devito.symbolics.extended_dtypes import * # noqa from devito.symbolics.queries import * # noqa from devito.symbolics.search import * # noqa -from devito.symbolics.printer import * # noqa from devito.symbolics.inspection import * # noqa from devito.symbolics.manipulation import * # noqa diff --git a/devito/symbolics/extended_dtypes.py b/devito/symbolics/extended_dtypes.py new file mode 100644 index 0000000000..080ef09fbb --- /dev/null +++ b/devito/symbolics/extended_dtypes.py @@ -0,0 +1,97 @@ +import ctypes +import numpy as np + +from devito.symbolics.extended_sympy import ReservedWord, Cast, ValueLimit +from devito.tools import (Bunch, float2, float3, float4, double2, double3, double4, # noqa + int2, int3, int4, ctypes_vector_mapper) +from devito.tools.dtypes_lowering import dtype_mapper + +__all__ = ['cast', 'CustomType', 'limits_mapper', 'INT', 'FLOAT', 'BaseCast', # noqa + 'DOUBLE', 'VOID', 'NoDeclStruct', 'c_complex', 'c_double_complex'] + + +limits_mapper = { + np.int32: Bunch(min=ValueLimit('INT_MIN'), max=ValueLimit('INT_MAX')), + np.int64: Bunch(min=ValueLimit('LONG_MIN'), max=ValueLimit('LONG_MAX')), + np.float32: Bunch(min=-ValueLimit('FLT_MAX'), max=ValueLimit('FLT_MAX')), + np.float64: Bunch(min=-ValueLimit('DBL_MAX'), max=ValueLimit('DBL_MAX')), +} + + +class NoDeclStruct(ctypes.Structure): + """ + A ctypes.Structure that does not generate a struct definition. 
+ + Some foreign types (e.g. complex) need to be passed to C/C++ as a struct + that mimics an existing type, but the struct types themselves don't show + up in the kernel, so we don't need to generate their definitions. + """ + + pass + + +class c_complex(NoDeclStruct): + """ + Structure for passing complex float to C/C++ + """ + + _fields_ = [('real', ctypes.c_float), ('imag', ctypes.c_float)] + + _base_dtype = True + + @classmethod + def from_param(cls, val): + return cls(val.real, val.imag) + + +class c_double_complex(NoDeclStruct): + """ + Structure for passing complex double to C/C++ + """ + + _fields_ = [('real', ctypes.c_double), ('imag', ctypes.c_double)] + + _base_dtype = True + + @classmethod + def from_param(cls, val): + return cls(val.real, val.imag) + + +ctypes_vector_mapper.update({np.complex64: c_complex, + np.complex128: c_double_complex}) + + +class CustomType(ReservedWord): + pass + + +def cast(casttype, stars=None): + return lambda v, dtype=None, **kw: Cast(v, dtype=casttype, stars=stars, **kw) + + +ULONG = cast(np.uint64) +UINTP = cast(np.uint32, '*') + + +# Standard ones, needed as class for e.g. single dispatch +class BaseCast(Cast): + + def __new__(cls, base, stars=None, **kwargs): + kwargs['dtype'] = cls._dtype + return super().__new__(cls, base, stars=stars, **kwargs) + + +class VOID(BaseCast): + + _dtype = 'void' + + +# Dynamically create INT, INT2, .... INTP, INT2P, ... FLOAT, ... 
+for (base_name, dtype) in dtype_mapper.items(): + name = base_name.upper() + globals()[name] = type(name, (BaseCast,), {'_dtype': dtype}) + for i in ['2', '3', '4']: + v = '%s%s' % (base_name, i) + globals()[v.upper()] = cast(v) + globals()[f'{v.upper()}P'] = cast(v, '*') diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index 5b13262ded..8acb5cac99 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -10,7 +10,8 @@ from devito.finite_differences.elementary import Min, Max from devito.tools import (Pickable, Bunch, as_tuple, is_integer, float2, # noqa float3, float4, double2, double3, double4, int2, int3, - int4) + int4, dtype_to_ctype, ctypes_to_cstr, ctypes_vector_mapper, + ctypes_to_cstr) from devito.types import Symbol from devito.types.basic import Basic @@ -18,9 +19,8 @@ 'CallFromComposite', 'FieldFromPointer', 'FieldFromComposite', 'ListInitializer', 'Byref', 'IndexedPointer', 'Cast', 'DefFunction', 'MathFunction', 'InlineIf', 'ReservedWord', 'Keyword', 'String', - 'Macro', 'Class', 'MacroArgument', 'CustomType', 'Deref', 'Namespace', - 'Rvalue', 'INT', 'FLOAT', 'DOUBLE', 'VOID', 'Null', 'SizeOf', 'rfunc', - 'cast_mapper', 'BasicWrapperMixin', 'ValueLimit', 'limits_mapper'] + 'Macro', 'Class', 'MacroArgument', 'Deref', 'Namespace', + 'Rvalue', 'Null', 'SizeOf', 'rfunc', 'BasicWrapperMixin', 'ValueLimit'] class CondEq(sympy.Eq): @@ -89,10 +89,10 @@ def __new__(cls, lhs, rhs, params=None): if not is_integer(rhs): # Perhaps it's a symbolic RHS -- but we wanna be sure it's of type int if not hasattr(rhs, 'dtype'): - raise ValueError("Symbolic RHS `%s` lacks dtype" % rhs) + raise ValueError(f"Symbolic RHS `{rhs}` lacks dtype") if not issubclass(rhs.dtype, np.integer): - raise ValueError("Symbolic RHS `%s` must be of type `int`, found " - "`%s` instead" % (rhs, rhs.dtype)) + raise ValueError(f"Symbolic RHS `{rhs}` must be of type `int`, found " + f"`{rhs.dtype}` instead") rhs = sympify(rhs) 
obj = sympy.Expr.__new__(cls, lhs, rhs) @@ -103,7 +103,7 @@ def __new__(cls, lhs, rhs, params=None): return obj def __str__(self): - return "IntDiv(%s, %s)" % (self.lhs, self.rhs) + return f"IntDiv({self.lhs}, {self.rhs})" __repr__ = __str__ @@ -191,8 +191,8 @@ def __new__(cls, call, pointer, params=None, **kwargs): return obj def __str__(self): - return '%s->%s(%s)' % (self.pointer, self.call, - ", ".join(str(i) for i in as_tuple(self.params))) + params = ", ".join(str(i) for i in as_tuple(self.params)) + return f'{self.pointer}->{self.call}({params})' __repr__ = __str__ @@ -228,8 +228,8 @@ class CallFromComposite(CallFromPointer, Pickable): """ def __str__(self): - return '%s.%s(%s)' % (self.pointer, self.call, - ", ".join(str(i) for i in as_tuple(self.params))) + params = ", ".join(str(i) for i in as_tuple(self.params)) + return f'{self.pointer}.{self.call}({params})' __repr__ = __str__ @@ -246,7 +246,7 @@ def __new__(cls, field, pointer, *args, **kwargs): return CallFromPointer.__new__(cls, field, pointer) def __str__(self): - return '%s->%s' % (self.pointer, self.field) + return f'{self.pointer}->{self.field}' @property def field(self): @@ -268,7 +268,7 @@ def __new__(cls, field, composite, *args, **kwargs): return CallFromPointer.__new__(cls, field, composite) def __str__(self): - return '%s.%s' % (self.composite, self.field) + return f'{self.composite}.{self.field}' @property def field(self): @@ -295,13 +295,13 @@ def __new__(cls, params): try: args.append(sympify(p)) except sympy.SympifyError: - raise ValueError("Illegal param `%s`" % p) + raise ValueError(f"Illegal param `{p}`") obj = sympy.Expr.__new__(cls, *args) obj.params = tuple(args) return obj def __str__(self): - return "{%s}" % ", ".join(str(i) for i in self.params) + return f"{{{', '.join(str(i) for i in self.params)}}}" __repr__ = __str__ @@ -349,9 +349,9 @@ def free_symbols(self): def __str__(self): if self.base.is_Symbol: - return "%s%s" % (self._op, str(self.base)) + return 
f'{self._op}{self.base}' else: - return "%s(%s)" % (self._op, str(self.base)) + return f'{self._op}({self.base})' __repr__ = __str__ @@ -383,24 +383,22 @@ class Cast(UnaryOp): Symbolic representation of the C notation `(type)expr`. """ - _base_typ = '' + __rargs__ = ('base', ) + __rkwargs__ = ('dtype', 'stars', 'reinterpret') - __rkwargs__ = ('stars',) - - def __new__(cls, base, stars=None, **kwargs): - # Attempt simplifcation - # E.g., `FLOAT(32) -> 32.0` of type `sympy.Float` + def __new__(cls, base, dtype=None, stars=None, reinterpret=False, **kwargs): try: - return sympify(eval(cls._base_typ)(base)) - except (NameError, SyntaxError): - # E.g., `_base_typ` is "char" or "unsigned long" - pass + if issubclass(dtype, np.generic) and sympify(base).is_Number: + base = sympify(dtype(base)) except TypeError: - # `base` ain't a number + # E.g. void pass obj = super().__new__(cls, base) - obj._stars = stars + obj._stars = stars or '' + obj._dtype = dtype + obj._reinterpret = reinterpret + return obj def _hashable_content(self): @@ -413,12 +411,28 @@ def stars(self): return self._stars @property - def typ(self): - return '%s%s' % (self._base_typ, self.stars or '') + def dtype(self): + return self._dtype + + @property + def reinterpret(self): + return self._reinterpret + + @property + def _C_ctype(self): + ctype = ctypes_vector_mapper.get(self.dtype, self.dtype) + try: + ctype = dtype_to_ctype(ctype) + except TypeError: + pass + return ctype @property def _op(self): - return '(%s)' % self.typ + return f'({ctypes_to_cstr(self._C_ctype)})' + + def __str__(self): + return f"{self._op}{self.base}" class IndexedPointer(sympy.Expr, Pickable, BasicWrapperMixin): @@ -457,7 +471,8 @@ def index(self): return self._index def __str__(self): - return "%s%s" % (self.base, ''.join('[%s]' % i for i in self.index)) + indices = ''.join(f'[{i}]' for i in self.index) + return f"{self.base}{indices}" __repr__ = __str__ @@ -484,7 +499,7 @@ class ReservedWord(sympy.Atom, Pickable): def 
__new__(cls, value, **kwargs): if not isinstance(value, str): - raise TypeError("Expected str, got `%s`" % type(value)) + raise TypeError(f"Expected str, got `{type(value)}`") obj = sympy.Atom.__new__(cls, **kwargs) obj.value = value @@ -509,10 +524,6 @@ class Keyword(ReservedWord): pass -class CustomType(ReservedWord): - pass - - class String(ReservedWord): pass @@ -524,7 +535,7 @@ class Macro(ReservedWord): class Class(ReservedWord): def __str__(self): - return "class %s" % self.value + return f"class {self.value}" __repr__ = __str__ @@ -532,7 +543,7 @@ def __str__(self): class MacroArgument(sympy.Symbol): def __str__(self): - return "(%s)" % self.name + return f"({self.name})" __repr__ = __str__ @@ -547,14 +558,6 @@ class ValueLimit(ReservedWord, sympy.Expr): pass -limits_mapper = { - np.int32: Bunch(min=ValueLimit('INT_MIN'), max=ValueLimit('INT_MAX')), - np.int64: Bunch(min=ValueLimit('LONG_MIN'), max=ValueLimit('LONG_MAX')), - np.float32: Bunch(min=-ValueLimit('FLT_MAX'), max=ValueLimit('FLT_MAX')), - np.float64: Bunch(min=-ValueLimit('DBL_MAX'), max=ValueLimit('DBL_MAX')), -} - - class DefFunction(Function, Pickable): """ @@ -613,11 +616,11 @@ def template(self): def __str__(self): if self.template: - template = '<%s>' % ','.join(str(i) for i in self.template) + template = f"<{','.join(str(i) for i in self.template)}>" else: template = '' arguments = ', '.join(str(i) for i in self.arguments) - return "%s%s(%s)" % (self.name, template, arguments) + return f"{self.name}{template}({arguments})" __repr__ = __str__ @@ -670,7 +673,7 @@ def false_expr(self): return self._false_expr def __str__(self): - return "(%s) ? %s : %s" % (self.cond, self.true_expr, self.false_expr) + return f"({self.cond}) ? 
{self.true_expr} : {self.false_expr}" __repr__ = __str__ @@ -754,125 +757,43 @@ def init(self): def __str__(self): rvalue = str(self.expr) if self.namespace: - rvalue = "%s::%s" % (self.namespace, rvalue) + rvalue = f"{self.namespace}::{rvalue}" if self.init: - rvalue = "%s%s" % (rvalue, self.init) + rvalue = f"{rvalue}{self.init}" return rvalue __repr__ = __str__ -# *** Casting - -class CastStar: - - base = None - - def __new__(cls, base=''): - return cls.base(base, '*') - - -# Dynamically create INT, INT2, .... INTP, INT2P, ... FLOAT, ... -for base_name in ['int', 'float', 'double']: - for i in ['', '2', '3', '4']: - v = '%s%s' % (base_name, i) - cls = type(v.upper(), (Cast,), {'_base_typ': v}) - globals()[cls.__name__] = cls - - clsp = type('%sP' % v.upper(), (CastStar,), {'base': cls}) - globals()[clsp.__name__] = clsp - - -class CHAR(Cast): - _base_typ = 'char' - - -class SHORT(Cast): - _base_typ = 'short' - - -class USHORT(Cast): - _base_typ = 'unsigned short' - - -class UCHAR(Cast): - _base_typ = 'unsigned char' - - -class UINT(Cast): - _base_typ = 'unsigned int' - - -class UINTP(CastStar): - base = UINT - - -class LONG(Cast): - _base_typ = 'long' - - -class ULONG(Cast): - _base_typ = 'unsigned long' - - -class VOID(Cast): - _base_typ = 'void' - - -class CHARP(CastStar): - base = CHAR - - -class UCHARP(CastStar): - base = UCHAR - - -class SHORTP(CastStar): - base = SHORT - - -class USHORTP(CastStar): - base = USHORT - +# Some other utility objects +Null = Macro('NULL') -cast_mapper = { - np.int8: CHAR, - np.uint8: UCHAR, - np.int16: SHORT, # noqa - np.uint16: USHORT, # noqa - int: INT, # noqa - np.int32: INT, # noqa - np.int64: LONG, - np.uint64: ULONG, - np.float32: FLOAT, # noqa - float: DOUBLE, # noqa - np.float64: DOUBLE, # noqa - (np.int8, '*'): CHARP, - (np.uint8, '*'): UCHARP, - (int, '*'): INTP, # noqa - (np.uint16, '*'): USHORTP, # noqa - (np.int16, '*'): SHORTP, # noqa - (np.int32, '*'): INTP, # noqa - (np.int64, '*'): INTP, # noqa - (np.float32, 
'*'): FLOATP, # noqa - (float, '*'): DOUBLEP, # noqa - (np.float64, '*'): DOUBLEP # noqa -} +# DefFunction, unlike sympy.Function, generates e.g. `sizeof(float)`, not `sizeof(float_)` +class SizeOf(DefFunction): + + __rargs__ = ('intype', 'stars') + + def __new__(cls, intype, stars=None, **kwargs): + stars = stars or '' + if not isinstance(intype, (str, ReservedWord)): + ctype = dtype_to_ctype(intype) + for k, v in ctypes_vector_mapper.items(): + if ctype is v: + intype = k + break + else: + intype = ctypes_to_cstr(ctype) -for base_name in ['int', 'float', 'double']: - for i in [2, 3, 4]: - v = '%s%d' % (base_name, i) - cls = locals()[v] - cast_mapper[cls] = locals()[v.upper()] - cast_mapper[(cls, '*')] = locals()['%sP' % v.upper()] + newobj = super().__new__(cls, 'sizeof', arguments=f'{intype}{stars}', **kwargs) + newobj.stars = stars + newobj.intype = intype + return newobj -# Some other utility objects -Null = Macro('NULL') - -# DefFunction, unlike sympy.Function, generates e.g. `sizeof(float)`, not `sizeof(float_)` -SizeOf = lambda *args: DefFunction('sizeof', tuple(args)) + @property + def args(self): + return super().args[1] def rfunc(func, item, *args): diff --git a/devito/symbolics/inspection.py b/devito/symbolics/inspection.py index 18e2623764..0ff7fcf6ba 100644 --- a/devito/symbolics/inspection.py +++ b/devito/symbolics/inspection.py @@ -3,11 +3,13 @@ import numpy as np from sympy import (Function, Indexed, Integer, Mul, Number, Pow, S, Symbol, Tuple) +from sympy.core.numbers import ImaginaryUnit from devito.finite_differences import Derivative from devito.finite_differences.differentiable import IndexDerivative from devito.logger import warning -from devito.symbolics.extended_sympy import (INT, CallFromPointer, Cast, +from devito.symbolics.extended_dtypes import INT +from devito.symbolics.extended_sympy import (CallFromPointer, Cast, DefFunction, ReservedWord) from devito.symbolics.queries import q_routine from devito.tools import as_tuple, prod @@ 
-142,6 +144,8 @@ def wrapper(expr, estimate, seen): def _estimate_cost(expr, estimate, seen): # Retval: flops (int), flag (bool) # The flag tells wether it's an integer expression (implying flops==0) or not + if not expr.args: + return 0, False flops, flags = zip(*[_estimate_cost(a, estimate, seen) for a in expr.args]) flops = sum(flops) if all(flags): @@ -168,6 +172,7 @@ def _(expr, estimate, seen): return 0, True +@_estimate_cost.register(ImaginaryUnit) @_estimate_cost.register(Number) @_estimate_cost.register(ReservedWord) def _(expr, estimate, seen): @@ -190,6 +195,8 @@ def _(expr, estimate, seen): flops, flags = _estimate_cost.registry[object](expr, estimate, seen) if {S.One, S.NegativeOne}.intersection(expr.args): flops -= 1 + if ImaginaryUnit in expr.args: + flops *= 2 return flops, flags @@ -282,7 +289,9 @@ def has_integer_args(*args): res = True for a in args: try: - if len(a.args) > 0: + if isinstance(a, INT): + res = res and True + elif len(a.args) > 0: res = res and has_integer_args(*a.args) else: res = res and has_integer_args(a) @@ -291,14 +300,31 @@ def has_integer_args(*args): return res -def sympy_dtype(expr, base=None): +def sympy_dtype(expr, base=None, default=None, smin=None): """ Infer the dtype of the expression. 
""" + if expr is None: + return default + dtypes = {base} - {None} for i in expr.free_symbols: try: dtypes.add(i.dtype) except AttributeError: pass - return infer_dtype(dtypes) + + dtype = infer_dtype(dtypes) + + # Promote if we missed complex number, i.e f + I + is_im = np.issubdtype(dtype, np.complexfloating) + if expr.has(ImaginaryUnit) and not is_im: + if dtype is None: + dtype = default or np.complex64 + else: + dtype = np.promote_types(dtype, np.complex64).type + + if smin is not None and not np.issubdtype(dtype, np.integer): + dtype = np.promote_types(dtype, smin).type + + return dtype diff --git a/devito/symbolics/manipulation.py b/devito/symbolics/manipulation.py index f5992ac8be..bf795cb86d 100644 --- a/devito/symbolics/manipulation.py +++ b/devito/symbolics/manipulation.py @@ -13,6 +13,7 @@ from devito.symbolics.extended_sympy import DefFunction, rfunc from devito.symbolics.queries import q_leaf from devito.symbolics.search import retrieve_indexed, retrieve_functions +from devito.symbolics.unevaluation import Mul as UMul from devito.tools import as_list, as_tuple, flatten, split, transitive_closure from devito.types.basic import Basic, Indexed from devito.types.array import ComponentAccess @@ -128,7 +129,10 @@ def _(mapper, rule): @singledispatch def _uxreplace_handle(expr, args, kwargs): - return expr.func(*args) + try: + return expr.func(*args, evaluate=False) + except TypeError: + return expr.func(*args) @_uxreplace_handle.register(Min) @@ -329,24 +333,25 @@ def pow_to_mul(expr): if exp > 10 or exp < -10 or exp == 0: # Large powers remain untouched return expr - elif exp == -1 or (int(exp) - exp != 0): - # Reciprocals and fractional powers also remain untouched, + elif (int(exp) - exp != 0): + # Fractional powers also remain untouched, # but at least we traverse the base looking for other Pows return expr.func(pow_to_mul(base), exp, evaluate=False) elif exp > 0: - return Mul(*[pow_to_mul(base)]*int(exp), evaluate=False) + return 
UMul(*[pow_to_mul(base)]*int(exp), evaluate=False) + elif exp < 0: + # Reciprocal powers become inverse of the negative power + # for example Pow(expr, -2) becomes Pow(expr * expr, -1) + return expr.func(pow_to_mul(base**(-int(exp))), -1, evaluate=False) else: - # SymPy represents 1/x as Pow(x,-1). Also, it represents - # 2/x as Mul(2, Pow(x, -1)). So we shouldn't end up here, - # but just in case SymPy changes its internal conventions... - posexpr = Mul(*[base]*(-int(exp)), evaluate=False) - return Pow(posexpr, -1, evaluate=False) + # Default. We should not end up here as all cases are handled + return expr else: args = [pow_to_mul(i) for i in expr.args] # Some SymPy versions will evaluate the two-args case # `(negative integer, mul)` despite the `evaluate=False`. For example, - # `Mul(-2, a*a, evaluate=False)` gets evaluated to `-2/a**2`. By swapping + # `Mul(-2, 1/a*a, evaluate=False)` gets evaluated to `-2/a**2`. By swapping # the args, the issue disappears... try: a0, a1 = args @@ -393,7 +398,7 @@ def normalize_args(args): for k, v in args.items(): try: retval[k] = sympify(v, strict=True) - except SympifyError: + except (TypeError, SympifyError): continue return retval diff --git a/devito/tools/data_structures.py b/devito/tools/data_structures.py index de7aa69d0d..c32360b6a7 100644 --- a/devito/tools/data_structures.py +++ b/devito/tools/data_structures.py @@ -267,6 +267,11 @@ def update(self, *args, **kwargs): for e in s: self.add(e) + def union(self, *args): + ret = OrderedSet(*self) + ret.update(*args) + return ret + def add(self, elem): self[elem] = None @@ -299,7 +304,6 @@ def __str__(self): issuperset = property(lambda self: self.__ge__) symmetric_difference = property(lambda self: self.__xor__) symmetric_difference_update = property(lambda self: self.__ixor__) - union = property(lambda self: self.__or__) class Ordering(tuple): diff --git a/devito/tools/dtypes_lowering.py b/devito/tools/dtypes_lowering.py index b5b564a4d7..8f3111ccaa 100644 --- 
a/devito/tools/dtypes_lowering.py +++ b/devito/tools/dtypes_lowering.py @@ -3,6 +3,8 @@ """ import ctypes +from functools import reduce +from operator import mul import numpy as np from cgen import dtype_to_ctype as cgen_dtype_to_ctype @@ -11,24 +13,25 @@ __all__ = ['int2', 'int3', 'int4', 'float2', 'float3', 'float4', 'double2', # noqa 'double3', 'double4', 'dtypes_vector_mapper', 'dtype_to_mpidtype', - 'dtype_to_cstr', 'dtype_to_ctype', 'dtype_to_mpitype', 'dtype_len', - 'ctypes_to_cstr', 'c_restrict_void_p', 'ctypes_vector_mapper', - 'is_external_ctype', 'infer_dtype', 'CustomDtype'] + 'dtype_to_cstr', 'dtype_to_ctype', 'infer_datasize', 'dtype_to_mpitype', + 'dtype_len', 'ctypes_to_cstr', 'c_restrict_void_p', 'ctypes_vector_mapper', + 'is_external_ctype', 'infer_dtype', 'CustomDtype', 'mpi4py_mapper'] # *** Custom np.dtypes # NOTE: the following is inspired by pyopencl.cltypes -mapper = { +dtype_mapper = { "int": np.int32, "float": np.float32, "double": np.float64 } -def build_dtypes_vector(field_names, counts): +def build_dtypes_vector(field_names, counts, mapper=None): ret = {} + mapper = mapper or dtype_mapper for base_name, base_dtype in mapper.items(): for count in counts: name = "%s%d" % (base_name, count) @@ -92,7 +95,7 @@ def get_base_dtype(self, v, default=None): # Standard vector dtypes dtypes_vector_mapper.update(build_dtypes_vector(field_names, counts)) # Fallbacks -dtypes_vector_mapper.update({(v, 1): v for v in mapper.values()}) +dtypes_vector_mapper.update({(v, 1): v for v in dtype_mapper.values()}) # *** Custom types escaping both the numpy and ctypes namespaces @@ -133,6 +136,9 @@ def dtype_to_cstr(dtype): def dtype_to_ctype(dtype): """Translate numpy.dtype into a ctypes type.""" + if isinstance(dtype, CustomDtype): + return dtype + try: return ctypes_vector_mapper[dtype] except KeyError: @@ -140,7 +146,7 @@ def dtype_to_ctype(dtype): if isinstance(dtype, CustomDtype): return dtype - elif issubclass(dtype, ctypes._SimpleCData): + elif 
issubclass(dtype, (ctypes._Pointer, ctypes.Structure, ctypes._SimpleCData)): # Bypass np.ctypeslib's normalization rules such as # `np.ctypeslib.as_ctypes_type(ctypes.c_void_p) -> ctypes.c_ulong` return dtype @@ -148,20 +154,53 @@ def dtype_to_ctype(dtype): return np.ctypeslib.as_ctypes_type(dtype) +def infer_datasize(dtype, shape): + """ + Translate numpy.dtype to (ctype, int): type and scale for correct C allocation size. + """ + datasize = int(reduce(mul, shape)) + if isinstance(dtype, CustomDtype): + return dtype, datasize + + try: + return ctypes_vector_mapper[dtype], datasize + except KeyError: + pass + + if issubclass(dtype, ctypes._SimpleCData): + return dtype, datasize + + if dtype == np.float16: + # Allocate half float as unsigned short + return ctypes.c_uint16, datasize + + if np.issubdtype(dtype, np.complexfloating): + # For complex float, allocate twice the size of real/imaginary part + return np.ctypeslib.as_ctypes_type(dtype(0).real.__class__), 2 * datasize + + return np.ctypeslib.as_ctypes_type(dtype), datasize + + +mpi4py_mapper = {} +mpi_mapper = { + np.ubyte: 'MPI_BYTE', + np.ushort: 'MPI_UNSIGNED_SHORT', + np.int32: 'MPI_INT', + np.float32: 'MPI_FLOAT', + np.int64: 'MPI_LONG', + np.float64: 'MPI_DOUBLE', + np.complex64: 'MPI_C_COMPLEX', + np.complex128: 'MPI_C_DOUBLE_COMPLEX' +} + + def dtype_to_mpitype(dtype): """Map numpy types to MPI datatypes.""" # Resolve vector dtype if necessary dtype = dtypes_vector_mapper.get_base_dtype(dtype) - return { - np.ubyte: 'MPI_BYTE', - np.ushort: 'MPI_UNSIGNED_SHORT', - np.int32: 'MPI_INT', - np.float32: 'MPI_FLOAT', - np.int64: 'MPI_LONG', - np.float64: 'MPI_DOUBLE' - }[dtype] + return mpi_mapper[dtype] def dtype_to_mpidtype(dtype): @@ -192,7 +231,7 @@ class c_restrict_void_p(ctypes.c_void_p): ctypes_vector_mapper = {} -for base_name, base_dtype in mapper.items(): +for base_name, base_dtype in dtype_mapper.items(): base_ctype = dtype_to_ctype(base_dtype) for count in counts: @@ -200,7 +239,8 @@ class 
c_restrict_void_p(ctypes.c_void_p): name = "%s%d" % (base_name, count) ctype = type(name, (ctypes.Structure,), - {'_fields_': [(i, base_ctype)] for i in field_names[:count]}) + {'_fields_': [(i, base_ctype) for i in field_names[:count]], + '_base_dtype': True}) ctypes_vector_mapper[dtype] = ctype @@ -232,7 +272,6 @@ def ctypes_to_cstr(ctype, toarray=None): retval = '%s[%d]' % (ctypes_to_cstr(ctype._type_, toarray), ctype._length_) elif ctype.__name__.startswith('c_'): name = ctype.__name__[2:] - # A primitive datatype # FIXME: Is there a better way of extracting the C typename ? # Here, we're following the ctypes convention that each basic type has @@ -268,11 +307,6 @@ def ctypes_to_cstr(ctype, toarray=None): return retval -known_ctypes = { - 'vector_types.h': list(ctypes_vector_mapper.values()), -} - - def is_external_ctype(ctype, includes): """ True if `ctype` is known to be declared in one of the given `includes` @@ -285,13 +319,22 @@ def is_external_ctype(ctype, includes): if issubclass(ctype, ctypes._SimpleCData): return False - for k, v in known_ctypes.items(): - if ctype in v: - return True + if ctype in ctypes_vector_mapper.values(): + return True return False +def is_numpy_dtype(dtype): + """ + True if `dtype` is a numpy dtype, False otherwise. 
+ """ + try: + return issubclass(dtype, np.generic) + except TypeError: + return False + + def infer_dtype(dtypes): """ Given a set of np.dtypes, return the "winning" dtype: @@ -302,8 +345,11 @@ def infer_dtype(dtypes): """ # Resolve the vector types, if any dtypes = {dtypes_vector_mapper.get_base_dtype(i, i) for i in dtypes} - - fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating)} + # Only keep number types + dtypes = {i for i in dtypes if is_numpy_dtype(i)} + # Separate floating point types from the rest + fdtypes = {i for i in dtypes if np.issubdtype(i, np.floating) or + np.issubdtype(i, np.complexfloating)} if len(fdtypes) > 1: return max(fdtypes, key=lambda i: np.dtype(i).itemsize) elif len(fdtypes) == 1: diff --git a/devito/types/array.py b/devito/types/array.py index 7eda353b9b..105cdd21da 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -365,6 +365,9 @@ class Bundle(ArrayBasic): __rkwargs__ = AbstractFunction.__rkwargs__ + ('components',) + def __new__(cls, *args, components=(), **kwargs): + return super().__new__(cls, *args, components=as_tuple(components), **kwargs) + def __init_finalize__(self, *args, components=(), **kwargs): super().__init_finalize__(*args, components=components, **kwargs) diff --git a/devito/types/basic.py b/devito/types/basic.py index bd847f3a98..04e4281f12 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -1,7 +1,7 @@ import abc import inspect from collections import namedtuple -from ctypes import POINTER, _Pointer, c_char_p, c_char +from ctypes import POINTER, _Pointer, c_char_p, c_char, Structure from functools import reduce, cached_property from operator import mul @@ -13,8 +13,8 @@ from devito.data import default_allocator from devito.parameters import configuration -from devito.tools import (Pickable, as_tuple, ctypes_to_cstr, dtype_to_ctype, - frozendict, memoized_meth, sympy_mutex) +from devito.tools import (Pickable, as_tuple, dtype_to_ctype, + frozendict, memoized_meth, 
sympy_mutex, CustomDtype) from devito.types.args import ArgProvider from devito.types.caching import Cached, Uncached from devito.types.lazy import Evaluable @@ -81,9 +81,12 @@ def _C_name(self): @property def _C_typedata(self): """ - The type of the object in the generated code as a `str`. + The type of the object's data in the generated code. """ _type = self._C_ctype + if isinstance(_type, CustomDtype): + return _type + while issubclass(_type, _Pointer): _type = _type._type_ @@ -91,7 +94,15 @@ def _C_typedata(self): if _type is c_char_p: _type = c_char - return ctypes_to_cstr(_type) + try: + # We have internal types such as c_complex that are + # Structure too but should be treated as plain c_type + _type._base_dtype + except AttributeError: + if issubclass(_type, Structure): + _type = f'struct {_type.__name__}' + + return _type @abc.abstractproperty def _C_ctype(self): @@ -338,8 +349,6 @@ class AbstractSymbol(sympy.Symbol, Basic, Pickable, Evaluable): is_Symbol = True # SymPy default assumptions - is_real = True - is_imaginary = False is_commutative = True __rkwargs__ = ('name', 'dtype', 'is_const') @@ -400,6 +409,12 @@ def _hashable_content(self): def dtype(self): return self._dtype + def _eval_is_real(self): + return not self.is_imaginary + + def _eval_is_imaginary(self): + return np.iscomplexobj(self.dtype(0)) + @property def indices(self): return () @@ -848,8 +863,6 @@ class AbstractFunction(sympy.Function, Basic, Pickable, Evaluable): is_AbstractFunction = True # SymPy default assumptions - is_real = True - is_imaginary = False is_commutative = True # Devito default assumptions @@ -945,6 +958,8 @@ def _sympystr(self, printer, **kwargs): return str(self) _latex = _sympystr + _eval_is_real = AbstractSymbol._eval_is_real + _eval_is_imaginary = AbstractSymbol._eval_is_imaginary def _pretty(self, printer, **kwargs): return printer._print_Function(self, func_name=self.name) diff --git a/devito/types/dense.py b/devito/types/dense.py index 9af238a6d0..efdb22cca5 
100644 --- a/devito/types/dense.py +++ b/devito/types/dense.py @@ -21,7 +21,8 @@ from devito.finite_differences import Differentiable, generate_fd_shortcuts from devito.finite_differences.tools import fd_weights_registry from devito.tools import (ReducerMap, as_tuple, c_restrict_void_p, flatten, - is_integer, memoized_meth, dtype_to_ctype, humanbytes) + is_integer, memoized_meth, dtype_to_ctype, humanbytes, + mpi4py_mapper) from devito.types.dimension import Dimension from devito.types.args import ArgProvider from devito.types.caching import CacheManager @@ -786,6 +787,7 @@ def _halo_exchange(self): neighborhood = self._distributor.neighborhood comm = self._distributor.comm + comm_dtype = mpi4py_mapper.get(self.dtype, self.dtype) for d in self._dist_dimensions: for i in [LEFT, RIGHT]: @@ -795,18 +797,18 @@ def _halo_exchange(self): # Gather send data data = self._data_in_region(OWNED, d, i) - sendbuf = np.ascontiguousarray(data) + sendbuf = np.ascontiguousarray(data.view(comm_dtype)) # Setup recv buffer shape = self._data_in_region(HALO, d, i.flip()).shape - recvbuf = np.ndarray(shape=shape, dtype=self.dtype) + recvbuf = np.ndarray(shape=shape, dtype=comm_dtype) # Communication comm.Sendrecv(sendbuf, dest=dest, recvbuf=recvbuf, source=source) # Scatter received data if recvbuf is not None and source != MPI.PROC_NULL: - self._data_in_region(HALO, d, i.flip())[:] = recvbuf + self._data_in_region(HALO, d, i.flip())[:] = recvbuf.view(self.dtype) self._is_halo_dirty = False diff --git a/devito/types/grid.py b/devito/types/grid.py index 6683ab5713..f0d4a440d5 100644 --- a/devito/types/grid.py +++ b/devito/types/grid.py @@ -78,9 +78,10 @@ class Grid(CartesianDiscretization, ArgProvider): ---------- shape : tuple of ints Shape of the computational domain in grid points. - extent : tuple of floats, default=unit box of extent 1m in all dimensions + extent : tuple of values interpretable as dtype, default=unit box of extent 1m + in all dimensions. 
Physical extent of the domain in m. - origin : tuple of floats, default=0.0 in all dimensions + origin : tuple of values interpretable as dtype, default=0.0 in all dimensions Physical coordinate of the origin of the domain. dimensions : tuple of SpaceDimension, optional The dimensions of the computational domain encapsulated by this Grid. @@ -189,9 +190,12 @@ def __init__(self, shape, extent=None, origin=None, dimensions=None, self._distributor = Distributor(shape, dimensions, comm, self._topology) # The physical extent - self._extent = as_tuple(extent or tuple(1. for _ in self.shape)) + extent = as_tuple(extent or tuple(1. for _ in self.shape)) + self._extent = tuple(dtype(e) for e in extent) - self._origin = as_tuple(origin or tuple(0. for _ in self.shape)) + # The origin of the grid + origin = as_tuple(origin or tuple(0. for _ in self.shape)) + self._origin = tuple(dtype(o) for o in origin) self._origin_symbols = tuple(Scalar(name='o_%s' % d.name, dtype=dtype, is_const=True) for d in self.dimensions) diff --git a/devito/types/object.py b/devito/types/object.py index cba54b0add..032bca303a 100644 --- a/devito/types/object.py +++ b/devito/types/object.py @@ -54,6 +54,7 @@ def _sympystr(self, printer): return str(self) _ccode = _sympystr + _cxxcode = _sympystr def _hashable_content(self): return (self.name, self.dtype) diff --git a/examples/compiler/03_iet-A.ipynb b/examples/compiler/03_iet-A.ipynb index 8f8e26be34..3a40f70c86 100644 --- a/examples/compiler/03_iet-A.ipynb +++ b/examples/compiler/03_iet-A.ipynb @@ -227,7 +227,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this example, `op` is represented as a ``. Attached to it are metadata, such as `_headers` and `_includes`, as well as the `body`, which includes the children IET nodes. Here, the body is the concatenation of an `PointerCast` and a `List` object.\n" + "In this example, `op` is represented as a ``. 
Attached to it are metadata, such as `headers` and `includes`, as well as the `body`, which includes the children IET nodes. Here, the body is the concatenation of an `PointerCast` and a `List` object.\n" ] }, { @@ -247,7 +247,7 @@ } ], "source": [ - "op._headers" + "op.headers" ] }, { @@ -267,7 +267,7 @@ } ], "source": [ - "op._includes" + "op.includes" ] }, { @@ -460,7 +460,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -474,7 +474,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.12.9" } }, "nbformat": 4, diff --git a/examples/seismic/tutorials/17_fourier_mode.ipynb b/examples/seismic/tutorials/17_fourier_mode.ipynb new file mode 100644 index 0000000000..c79edb329b --- /dev/null +++ b/examples/seismic/tutorials/17_fourier_mode.ipynb @@ -0,0 +1,1519 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 17 - On-the-fly discrete Fourier transform\n", + "\n", + "In this notebook, we show how to compute Fourier modes during the propagation of the wavefield (on-the-fly) using the discrete Fourier transform (DFT). This method is known to be a memory efficient way to compute gradients for seismic inversion as only a few model-size Fourier modes are necessary instead of the full time history.\n", + "\n", + "This notebook illustrates the forward modeling needed to implement the method described in the following paper:\n", + "\n", + "- *Compressive least-squares migration with on-the-fly Fourier transforms*, Philipp A. Witte, Mathias Louboutin, Fabio Luporini, Gerard J. Gorman, and Felix J. Herrmann, 2019, Geophysics, 84(5), R655-R672. 
[DOI](https://doi.org/10.1190/geo2018-0490.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Theory\n", + "\n", + "The Fourier transform is a mathematical operation that transforms a function of time (or space) into a function of frequency. In the context of wave propagation, it allows us to analyze the frequency content of the wavefield.\n", + "\n", + "The discrete Fourier transform (DFT) is a specific kind of Fourier transform used for discrete data. It is defined as:\n", + "\n", + "$$ X_k = \\sum_{n=0}^{N-1} x_n e^{-i 2 \\pi k n / N} $$\n", + "\n", + "where:\n", + "- $X_k$ is the DFT of the sequence $x_n$ \n", + "- $N$ is the number of samples\n", + "- $k$ is the frequency index\n", + "- $n$ is the time index\n", + "\n", + "In seismic applications, the DFT can be used to compute the frequency components of the wavefield as it propagates through the subsurface." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## On-the-fly Fourier Transform\n", + "\n", + "The on-the-fly Fourier transform is a technique where the Fourier transform is computed during the wavefield propagation, rather than storing the entire time history of the wavefield. This approach is memory efficient and particularly useful for large-scale seismic inversion problems.\n", + "\n", + "The key idea is to update the Fourier modes at each time step using the current wavefield values. This can be expressed as:\n", + "\n", + "$$ F_k(t+\\Delta t) = F_k(t) + u(t) e^{-i \\omega_k t} $$\n", + "\n", + "where:\n", + "- $F_k(t)$ is the Fourier mode at frequency $\\omega_k$ and time $t$ \n", + "- $u(t)$ is the wavefield at time $t$\n", + "- $\\Delta t$ is the time step\n", + "\n", + "By updating the Fourier modes on-the-fly, we avoid the need to store the entire wavefield history, thus saving memory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from devito import *\n", + "\n", + "from examples.seismic import demo_model, AcquisitionGeometry, plot_velocity\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import Code" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "NUMA domain count autodetection failed, assuming 1\n", + "Operator `initdamp` ran in 0.01 s\n", + "Operator `initdamp` ran in 0.01 s\n" + ] + } + ], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "model = demo_model('layers-isotropic', vp=3.0, origin=(0., 0.), shape=(101, 101), spacing=(10., 10.), nbl=40, nlayers=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "plot_velocity(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "# Define acquisition geometry: source\n", + "\n", + "# First, position source centrally in all dimensions, then set depth\n", + "src_coordinates = np.empty((1, 2))\n", + "src_coordinates[0, :] = np.array(model.domain_size) * .5\n", + "src_coordinates[0, -1] = 20. # Depth is 20m\n", + "\n", + "\n", + "# Define acquisition geometry: receivers\n", + "\n", + "# Initialize receivers for synthetic and imaging data\n", + "nreceivers = 101\n", + "rec_coordinates = np.empty((nreceivers, 2))\n", + "rec_coordinates[:, 0] = np.linspace(0, model.domain_size[0], num=nreceivers)\n", + "rec_coordinates[:, 1] = 30.\n", + "\n", + "# Geometry\n", + "t0 = 0.\n", + "tn = 500. # Simulation lasts 0.5 seconds (500 ms)\n", + "dt = model.critical_dt\n", + "geometry = AcquisitionGeometry(model, rec_coordinates, src_coordinates, t0, tn, f0=.010, src_type='Ricker')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/latex": [ + "$\\displaystyle u(t + dt, x, y) = \\frac{- \\frac{- \\frac{2.0 u(t, x, y)}{dt^{2}} + \\frac{u(t - dt, x, y)}{dt^{2}}}{vp(x, y)} + \\frac{\\partial^{2}}{\\partial x^{2}} u(t, x, y) + \\frac{\\partial^{2}}{\\partial y^{2}} u(t, x, y) + \\frac{damp(x, y) u(t, x, y)}{dt}}{\\frac{damp(x, y)}{dt} + \\frac{1}{dt^{2} vp(x, y)}}$" + ], + "text/plain": [ + "Eq(u(t + dt, x, y), (-(-2.0*u(t, x, y)/dt**2 + u(t - dt, x, y)/dt**2)/vp(x, y)**2 + Derivative(u(t, x, y), (x, 2)) + Derivative(u(t, x, y), (y, 2)) + damp(x, y)*u(t, x, y)/dt)/(damp(x, y)/dt + 1/(dt**2*vp(x, y)**2)))" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the wavefield with the size of the model and the time 
dimension\n", + "u = TimeFunction(name=\"u\", grid=model.grid, time_order=2, space_order=2)\n", + "\n", + "# We can now write the PDE\n", + "pde = model.m * u.dt2 - u.laplace + model.damp * u.dt\n", + "\n", + "\n", + "# Stencil update\n", + "stencil = Eq(u.forward, solve(pde, u.forward))\n", + "stencil" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "freq_mode = Function(name='freq_modes', grid=model.grid, space_order=0, dtype=np.complex64)\n", + "freq = .01 # Compute 10Hz slice\n", + "omega = 2 * np.pi * freq\n", + "\n", + "basis = exp(-1j * omega * model.grid.time_dim * model.grid.time_dim.spacing)\n", + "dft = [Inc(freq_mode, basis * u)]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Finally we define the source injection and receiver read function to generate the corresponding code\n", + "src = geometry.src\n", + "src_term = src.inject(field=u.forward, expr=src * dt**2 / model.m)\n", + "\n", + "# Create interpolation expression for receivers\n", + "rec = geometry.rec\n", + "rec_term = rec.interpolate(expr=u.forward)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "op = Operator([stencil] + src_term + rec_term + dft, subs=model.spacing_map)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#define _POSIX_C_SOURCE 200809L\n",
+       "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n",
+       "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n",
+       "#define MAX(a,b) (((a) > (b)) ? (a) : (b))\n",
+       "\n",
+       "#include "stdlib.h"\n",
+       "#include "math.h"\n",
+       "#include "sys/time.h"\n",
+       "#include "omp.h"\n",
+       "#include "complex.h"\n",
+       "\n",
+       "struct dataobj\n",
+       "{\n",
+       "  void *restrict data;\n",
+       "  unsigned long * size;\n",
+       "  unsigned long * npsize;\n",
+       "  unsigned long * dsize;\n",
+       "  int * hsize;\n",
+       "  int * hofs;\n",
+       "  int * oofs;\n",
+       "  void * dmap;\n",
+       "} ;\n",
+       "\n",
+       "struct profiler\n",
+       "{\n",
+       "  double section0;\n",
+       "  double section1;\n",
+       "  double section2;\n",
+       "} ;\n",
+       "\n",
+       "\n",
+       "int Kernel(struct dataobj *restrict damp_vec, struct dataobj *restrict freq_modes_vec, struct dataobj *restrict rec_vec, struct dataobj *restrict rec_coords_vec, struct dataobj *restrict src_vec, struct dataobj *restrict src_coords_vec, struct dataobj *restrict u_vec, struct dataobj *restrict vp_vec, const int x_M, const int x_m, const int y_M, const int y_m, const float dt, const float o_x, const float o_y, const int p_rec_M, const int p_rec_m, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int nthreads, const int nthreads_nonaffine, struct profiler * timers)\n",
+       "{\n",
+       "  float (*restrict damp)[damp_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[damp_vec->size[1]]) damp_vec->data;\n",
+       "  float _Complex (*restrict freq_modes)[freq_modes_vec->size[1]] __attribute__ ((aligned (64))) = (float _Complex (*)[freq_modes_vec->size[1]]) freq_modes_vec->data;\n",
+       "  float (*restrict rec)[rec_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[rec_vec->size[1]]) rec_vec->data;\n",
+       "  float (*restrict rec_coords)[rec_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[rec_coords_vec->size[1]]) rec_coords_vec->data;\n",
+       "  float (*restrict src)[src_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_vec->size[1]]) src_vec->data;\n",
+       "  float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;\n",
+       "  float (*restrict u)[u_vec->size[1]][u_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]]) u_vec->data;\n",
+       "  float (*restrict vp)[vp_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[vp_vec->size[1]]) vp_vec->data;\n",
+       "\n",
+       "  float _Complex r2 = 1.0F/(dt*dt);\n",
+       "  float _Complex r3 = 1.0F/dt;\n",
+       "\n",
+       "  for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))\n",
+       "  {\n",
+       "    float _Complex r1 = cexpf(-6.28318530717959e-2F*time*_Complex_I*dt);\n",
+       "    START(section0)\n",
+       "    #pragma omp parallel num_threads(nthreads)\n",
+       "    {\n",
+       "      #pragma omp for schedule(dynamic,1)\n",
+       "      for (int x = x_m; x <= x_M; x += 1)\n",
+       "      {\n",
+       "        #pragma omp simd aligned(damp,freq_modes,u,vp:16)\n",
+       "        for (int y = y_m; y <= y_M; y += 1)\n",
+       "        {\n",
+       "          float _Complex r4 = 1.0F/(vp[x + 2][y + 2]*vp[x + 2][y + 2]);\n",
+       "          u[t2][x + 2][y + 2] = (-r4*(-2.0F*r2*u[t0][x + 2][y + 2] + r2*u[t1][x + 2][y + 2]) + r3*damp[x + 2][y + 2]*u[t0][x + 2][y + 2] + 1.0e-2F*(u[t0][x + 1][y + 2] + u[t0][x + 2][y + 1] + u[t0][x + 2][y + 3] + u[t0][x + 3][y + 2]) - 3.99999991e-2F*u[t0][x + 2][y + 2])/(r4*r2 + r3*damp[x + 2][y + 2]);\n",
+       "          freq_modes[x][y] += r1*u[t0][x + 2][y + 2];\n",
+       "        }\n",
+       "      }\n",
+       "    }\n",
+       "    STOP(section0,timers)\n",
+       "\n",
+       "    START(section1)\n",
+       "    #pragma omp parallel num_threads(nthreads_nonaffine)\n",
+       "    {\n",
+       "      int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_src_M - p_src_m + 1)/nthreads_nonaffine)));\n",
+       "      #pragma omp for schedule(dynamic,chunk_size)\n",
+       "      for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)\n",
+       "      {\n",
+       "        for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)\n",
+       "        {\n",
+       "          for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)\n",
+       "          {\n",
+       "            int posx = (int)(floorf(1.0e-1*(-o_x + src_coords[p_src][0])));\n",
+       "            int posy = (int)(floorf(1.0e-1*(-o_y + src_coords[p_src][1])));\n",
+       "            float px = 1.0e-1F*(-o_x + src_coords[p_src][0]) - floorf(1.0e-1F*(-o_x + src_coords[p_src][0]));\n",
+       "            float py = 1.0e-1F*(-o_y + src_coords[p_src][1]) - floorf(1.0e-1F*(-o_y + src_coords[p_src][1]));\n",
+       "            if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1)\n",
+       "            {\n",
+       "              float r0 = 3.06250F*(vp[posx + 2][posy + 2]*vp[posx + 2][posy + 2])*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*src[time][p_src];\n",
+       "              #pragma omp atomic update\n",
+       "              u[t2][rsrcx + posx + 2][rsrcy + posy + 2] += r0;\n",
+       "            }\n",
+       "          }\n",
+       "        }\n",
+       "      }\n",
+       "    }\n",
+       "    STOP(section1,timers)\n",
+       "\n",
+       "    START(section2)\n",
+       "    #pragma omp parallel num_threads(nthreads_nonaffine)\n",
+       "    {\n",
+       "      int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_rec_M - p_rec_m + 1)/nthreads_nonaffine)));\n",
+       "      #pragma omp for schedule(dynamic,chunk_size)\n",
+       "      for (int p_rec = p_rec_m; p_rec <= p_rec_M; p_rec += 1)\n",
+       "      {\n",
+       "        float r7 = 1.0e-1F*(-o_x + rec_coords[p_rec][0]);\n",
+       "        float r5 = floorf(r7);\n",
+       "        int posx = (int)r5;\n",
+       "        float r8 = 1.0e-1F*(-o_y + rec_coords[p_rec][1]);\n",
+       "        float r6 = floorf(r8);\n",
+       "        int posy = (int)r6;\n",
+       "        float px = -r5 + r7;\n",
+       "        float py = -r6 + r8;\n",
+       "        float sum = 0.0F;\n",
+       "\n",
+       "        for (int rrecx = 0; rrecx <= 1; rrecx += 1)\n",
+       "        {\n",
+       "          for (int rrecy = 0; rrecy <= 1; rrecy += 1)\n",
+       "          {\n",
+       "            if (rrecx + posx >= x_m - 1 && rrecy + posy >= y_m - 1 && rrecx + posx <= x_M + 1 && rrecy + posy <= y_M + 1)\n",
+       "            {\n",
+       "              sum += (rrecx*px + (1 - rrecx)*(1 - px))*(rrecy*py + (1 - rrecy)*(1 - py))*u[t2][rrecx + posx + 2][rrecy + posy + 2];\n",
+       "            }\n",
+       "          }\n",
+       "        }\n",
+       "\n",
+       "        rec[time][p_rec] = sum;\n",
+       "      }\n",
+       "    }\n",
+       "    STOP(section2,timers)\n",
+       "  }\n",
+       "\n",
+       "  return 0;\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define \\PYZus{}POSIX\\PYZus{}C\\PYZus{}SOURCE 200809L}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define START(S) struct timeval start\\PYZus{} \\PYZsh{}\\PYZsh{} S , end\\PYZus{} \\PYZsh{}\\PYZsh{} S ; gettimeofday(\\PYZam{}start\\PYZus{} \\PYZsh{}\\PYZsh{} S , NULL);}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define STOP(S,T) gettimeofday(\\PYZam{}end\\PYZus{} \\PYZsh{}\\PYZsh{} S, NULL); T\\PYZhy{}\\PYZgt{}S += (double)(end\\PYZus{} \\PYZsh{}\\PYZsh{} S .tv\\PYZus{}sec\\PYZhy{}start\\PYZus{} \\PYZsh{}\\PYZsh{} S.tv\\PYZus{}sec)+(double)(end\\PYZus{} \\PYZsh{}\\PYZsh{} S .tv\\PYZus{}usec\\PYZhy{}start\\PYZus{} \\PYZsh{}\\PYZsh{} S .tv\\PYZus{}usec)}\\PY{c+cp}{/}\\PY{c+cp}{1000000;}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define MAX(a,b) (((a) \\PYZgt{} (b)) ? (a) : (b))}\n", + "\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}stdlib.h\\PYZdq{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}math.h\\PYZdq{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}sys/time.h\\PYZdq{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}omp.h\\PYZdq{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}complex.h\\PYZdq{}}\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{unsigned}\\PY{+w}{ }\\PY{k+kt}{long}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{size}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{unsigned}\\PY{+w}{ }\\PY{k+kt}{long}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{npsize}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{unsigned}\\PY{+w}{ }\\PY{k+kt}{long}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{dsize}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ 
}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{hsize}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{hofs}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{oofs}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{dmap}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{p}{;}\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{profiler}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{section0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{section1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{section2}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{p}{;}\n", + "\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{Kernel}\\PY{p}{(}\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{damp\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{src\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{u\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{vp\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ 
}\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{y\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{dt}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{o\\PYZus{}x}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{o\\PYZus{}y}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{time\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{time\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{nthreads}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{nthreads\\PYZus{}nonaffine}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{profiler}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{timers}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{damp}\\PY{p}{)}\\PY{p}{[}\\PY{n}{damp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ 
}\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{damp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{damp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{freq\\PYZus{}modes}\\PY{p}{)}\\PY{p}{[}\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{rec}\\PY{p}{)}\\PY{p}{[}\\PY{n}{rec\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{rec\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ 
}\\PY{n}{rec\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords}\\PY{p}{)}\\PY{p}{[}\\PY{n}{rec\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{rec\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{src}\\PY{p}{)}\\PY{p}{[}\\PY{n}{src\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{src\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{src\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{)}\\PY{p}{[}\\PY{n}{src\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ 
}\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{src\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{u}\\PY{p}{)}\\PY{p}{[}\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{[}\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{[}\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{vp}\\PY{p}{)}\\PY{p}{[}\\PY{n}{vp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{vp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{vp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{n}{r2}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0F}\\PY{o}{/}\\PY{p}{(}\\PY{n}{dt}\\PY{o}{*}\\PY{n}{dt}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{n}{r3}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0F}\\PY{o}{/}\\PY{n}{dt}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{time}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{time\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t2}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{time}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{time\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t2}\\PY{+w}{ 
}\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{n}{r1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cexpf}\\PY{p}{(}\\PY{l+m+mf}{\\PYZhy{}6.28318530717959e\\PYZhy{}2F}\\PY{o}{*}\\PY{n}{time}\\PY{o}{*}\\PY{n}{\\PYZus{}Complex\\PYZus{}I}\\PY{o}{*}\\PY{n}{dt}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{START}\\PY{p}{(}\\PY{n}{section0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel num\\PYZus{}threads(nthreads)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp for schedule(dynamic,1)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp simd aligned(damp,freq\\PYZus{}modes,u,vp:16)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{n}{r4}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0F}\\PY{o}{/}\\PY{p}{(}\\PY{n}{vp}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ 
}\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{o}{*}\\PY{n}{vp}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{r4}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mf}{\\PYZhy{}2.0F}\\PY{o}{*}\\PY{n}{r2}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r2}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t1}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r3}\\PY{o}{*}\\PY{n}{damp}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}2F}\\PY{o}{*}\\PY{p}{(}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ 
}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{3}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{3}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mf}{3.99999991e\\PYZhy{}2F}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{o}{/}\\PY{p}{(}\\PY{n}{r4}\\PY{o}{*}\\PY{n}{r2}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r3}\\PY{o}{*}\\PY{n}{damp}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{freq\\PYZus{}modes}\\PY{p}{[}\\PY{n}{x}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{r1}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{n}{STOP}\\PY{p}{(}\\PY{n}{section0}\\PY{p}{,}\\PY{n}{timers}\\PY{p}{)}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{START}\\PY{p}{(}\\PY{n}{section1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel num\\PYZus{}threads(nthreads\\PYZus{}nonaffine)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{chunk\\PYZus{}size}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{n}{MAX}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{p}{(}\\PY{l+m+mf}{1.0}\\PY{o}{/}\\PY{l+m+mf}{3.0}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{p\\PYZus{}src\\PYZus{}M}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}m}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{o}{/}\\PY{n}{nthreads\\PYZus{}nonaffine}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp for schedule(dynamic,chunk\\PYZus{}size)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{n}{floorf}\\PY{p}{(}\\PY{l+m+mf}{1.0e\\PYZhy{}1}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{n}{floorf}\\PY{p}{(}\\PY{l+m+mf}{1.0e\\PYZhy{}1}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{px}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{floorf}\\PY{p}{(}\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{py}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{floorf}\\PY{p}{(}\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ 
}\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}m}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{3.06250F}\\PY{o}{*}\\PY{p}{(}\\PY{n}{vp}\\PY{p}{[}\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{o}{*}\\PY{n}{vp}\\PY{p}{[}\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{rsrcx}\\PY{o}{*}\\PY{n}{px}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{px}\\PY{p}{)}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{rsrcy}\\PY{o}{*}\\PY{n}{py}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ 
}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{py}\\PY{p}{)}\\PY{p}{)}\\PY{o}{*}\\PY{n}{src}\\PY{p}{[}\\PY{n}{time}\\PY{p}{]}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp atomic update}\n", + "\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{r0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{n}{STOP}\\PY{p}{(}\\PY{n}{section1}\\PY{p}{,}\\PY{n}{timers}\\PY{p}{)}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{START}\\PY{p}{(}\\PY{n}{section2}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel num\\PYZus{}threads(nthreads\\PYZus{}nonaffine)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{chunk\\PYZus{}size}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{n}{MAX}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{p}{(}\\PY{l+m+mf}{1.0}\\PY{o}{/}\\PY{l+m+mf}{3.0}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{p\\PYZus{}rec\\PYZus{}M}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}m}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{o}{/}\\PY{n}{nthreads\\PYZus{}nonaffine}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp for schedule(dynamic,chunk\\PYZus{}size)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ 
}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r7}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}rec}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r5}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{floorf}\\PY{p}{(}\\PY{n}{r7}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{n}{r5}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r8}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}rec}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r6}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{floorf}\\PY{p}{(}\\PY{n}{r8}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{n}{r6}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{px}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{n}{r5}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r7}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{py}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{n}{r6}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r8}\\PY{p}{;}\n", + 
"\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{sum}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{0.0F}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}m}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{sum}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{rrecx}\\PY{o}{*}\\PY{n}{px}\\PY{+w}{ 
}\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{px}\\PY{p}{)}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{rrecy}\\PY{o}{*}\\PY{n}{py}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{py}\\PY{p}{)}\\PY{p}{)}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{rec}\\PY{p}{[}\\PY{n}{time}\\PY{p}{]}\\PY{p}{[}\\PY{n}{p\\PYZus{}rec}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{sum}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{n}{STOP}\\PY{p}{(}\\PY{n}{section2}\\PY{p}{,}\\PY{n}{timers}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "#define _POSIX_C_SOURCE 200809L\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define MAX(a,b) (((a) > (b)) ? 
(a) : (b))\n", + "\n", + "#include \"stdlib.h\"\n", + "#include \"math.h\"\n", + "#include \"sys/time.h\"\n", + "#include \"omp.h\"\n", + "#include \"complex.h\"\n", + "\n", + "struct dataobj\n", + "{\n", + " void *restrict data;\n", + " unsigned long * size;\n", + " unsigned long * npsize;\n", + " unsigned long * dsize;\n", + " int * hsize;\n", + " int * hofs;\n", + " int * oofs;\n", + " void * dmap;\n", + "} ;\n", + "\n", + "struct profiler\n", + "{\n", + " double section0;\n", + " double section1;\n", + " double section2;\n", + "} ;\n", + "\n", + "\n", + "int Kernel(struct dataobj *restrict damp_vec, struct dataobj *restrict freq_modes_vec, struct dataobj *restrict rec_vec, struct dataobj *restrict rec_coords_vec, struct dataobj *restrict src_vec, struct dataobj *restrict src_coords_vec, struct dataobj *restrict u_vec, struct dataobj *restrict vp_vec, const int x_M, const int x_m, const int y_M, const int y_m, const float dt, const float o_x, const float o_y, const int p_rec_M, const int p_rec_m, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int nthreads, const int nthreads_nonaffine, struct profiler * timers)\n", + "{\n", + " float (*restrict damp)[damp_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[damp_vec->size[1]]) damp_vec->data;\n", + " float _Complex (*restrict freq_modes)[freq_modes_vec->size[1]] __attribute__ ((aligned (64))) = (float _Complex (*)[freq_modes_vec->size[1]]) freq_modes_vec->data;\n", + " float (*restrict rec)[rec_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[rec_vec->size[1]]) rec_vec->data;\n", + " float (*restrict rec_coords)[rec_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[rec_coords_vec->size[1]]) rec_coords_vec->data;\n", + " float (*restrict src)[src_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_vec->size[1]]) src_vec->data;\n", + " float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float 
(*)[src_coords_vec->size[1]]) src_coords_vec->data;\n", + " float (*restrict u)[u_vec->size[1]][u_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]]) u_vec->data;\n", + " float (*restrict vp)[vp_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[vp_vec->size[1]]) vp_vec->data;\n", + "\n", + " float _Complex r2 = 1.0F/(dt*dt);\n", + " float _Complex r3 = 1.0F/dt;\n", + "\n", + " for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))\n", + " {\n", + " float _Complex r1 = cexpf(-6.28318530717959e-2F*time*_Complex_I*dt);\n", + " START(section0)\n", + " #pragma omp parallel num_threads(nthreads)\n", + " {\n", + " #pragma omp for schedule(dynamic,1)\n", + " for (int x = x_m; x <= x_M; x += 1)\n", + " {\n", + " #pragma omp simd aligned(damp,freq_modes,u,vp:16)\n", + " for (int y = y_m; y <= y_M; y += 1)\n", + " {\n", + " float _Complex r4 = 1.0F/(vp[x + 2][y + 2]*vp[x + 2][y + 2]);\n", + " u[t2][x + 2][y + 2] = (-r4*(-2.0F*r2*u[t0][x + 2][y + 2] + r2*u[t1][x + 2][y + 2]) + r3*damp[x + 2][y + 2]*u[t0][x + 2][y + 2] + 1.0e-2F*(u[t0][x + 1][y + 2] + u[t0][x + 2][y + 1] + u[t0][x + 2][y + 3] + u[t0][x + 3][y + 2]) - 3.99999991e-2F*u[t0][x + 2][y + 2])/(r4*r2 + r3*damp[x + 2][y + 2]);\n", + " freq_modes[x][y] += r1*u[t0][x + 2][y + 2];\n", + " }\n", + " }\n", + " }\n", + " STOP(section0,timers)\n", + "\n", + " START(section1)\n", + " #pragma omp parallel num_threads(nthreads_nonaffine)\n", + " {\n", + " int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_src_M - p_src_m + 1)/nthreads_nonaffine)));\n", + " #pragma omp for schedule(dynamic,chunk_size)\n", + " for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)\n", + " {\n", + " for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)\n", + " {\n", + " for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)\n", + " {\n", + " int posx = (int)(floorf(1.0e-1*(-o_x + src_coords[p_src][0])));\n", + " 
int posy = (int)(floorf(1.0e-1*(-o_y + src_coords[p_src][1])));\n", + " float px = 1.0e-1F*(-o_x + src_coords[p_src][0]) - floorf(1.0e-1F*(-o_x + src_coords[p_src][0]));\n", + " float py = 1.0e-1F*(-o_y + src_coords[p_src][1]) - floorf(1.0e-1F*(-o_y + src_coords[p_src][1]));\n", + " if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1)\n", + " {\n", + " float r0 = 3.06250F*(vp[posx + 2][posy + 2]*vp[posx + 2][posy + 2])*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*src[time][p_src];\n", + " #pragma omp atomic update\n", + " u[t2][rsrcx + posx + 2][rsrcy + posy + 2] += r0;\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " STOP(section1,timers)\n", + "\n", + " START(section2)\n", + " #pragma omp parallel num_threads(nthreads_nonaffine)\n", + " {\n", + " int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_rec_M - p_rec_m + 1)/nthreads_nonaffine)));\n", + " #pragma omp for schedule(dynamic,chunk_size)\n", + " for (int p_rec = p_rec_m; p_rec <= p_rec_M; p_rec += 1)\n", + " {\n", + " float r7 = 1.0e-1F*(-o_x + rec_coords[p_rec][0]);\n", + " float r5 = floorf(r7);\n", + " int posx = (int)r5;\n", + " float r8 = 1.0e-1F*(-o_y + rec_coords[p_rec][1]);\n", + " float r6 = floorf(r8);\n", + " int posy = (int)r6;\n", + " float px = -r5 + r7;\n", + " float py = -r6 + r8;\n", + " float sum = 0.0F;\n", + "\n", + " for (int rrecx = 0; rrecx <= 1; rrecx += 1)\n", + " {\n", + " for (int rrecy = 0; rrecy <= 1; rrecy += 1)\n", + " {\n", + " if (rrecx + posx >= x_m - 1 && rrecy + posy >= y_m - 1 && rrecx + posx <= x_M + 1 && rrecy + posy <= y_M + 1)\n", + " {\n", + " sum += (rrecx*px + (1 - rrecx)*(1 - px))*(rrecy*py + (1 - rrecy)*(1 - py))*u[t2][rrecx + posx + 2][rrecy + posy + 2];\n", + " }\n", + " }\n", + " }\n", + "\n", + " rec[time][p_rec] = sum;\n", + " }\n", + " }\n", + " STOP(section2,timers)\n", + " }\n", + "\n", + " return 0;\n", + "}" + ] + }, + "execution_count": 9, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "Code(str(op.ccode), language='C')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Operator `Kernel` ran in 0.04 s\n" + ] + }, + { + "data": { + "text/plain": [ + "PerformanceSummary([(PerfKey(name='section0', rank=None),\n", + " PerfEntry(time=0.031197000000000023, gflopss=0.0, gpointss=0.0, oi=0.0, ops=0, itershapes=[])),\n", + " (PerfKey(name='section1', rank=None),\n", + " PerfEntry(time=0.0028049999999999933, gflopss=0.0, gpointss=0.0, oi=0.0, ops=0, itershapes=[])),\n", + " (PerfKey(name='section2', rank=None),\n", + " PerfEntry(time=0.0026769999999999967, gflopss=0.0, gpointss=0.0, oi=0.0, ops=0, itershapes=[]))])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "op(dt=model.critical_dt)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "plt.figure(figsize=(12, 6))\n", + "plt.subplot(1, 2, 1)\n", + "plt.imshow(np.real(freq_mode.data.T), cmap='seismic', vmin=-1e2, vmax=1e2)\n", + "plt.colorbar()\n", + "plt.subplot(1, 2, 2)\n", + "plt.imshow(np.imag(freq_mode.data.T), cmap='seismic', vmin=-1e2, vmax=1e2)\n", + "plt.colorbar()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.isclose(norm(freq_mode), 13873.049, atol=0, rtol=1e-4)\n", + "assert np.isclose(norm(u), 323.74207, atol=0, rtol=1e-4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiple frequencies\n", + "\n", + "We can now extend the method to multiple frequencies. The idea is to compute the Fourier modes for a set of frequencies simultaneously. This can be done by adding a frequency dimension to the Fourier modes and updating them for each frequency at each time step.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "nfreq = 5\n", + "f = Dimension(name='f')\n", + "\n", + "frequencies = Function(name='frequencies', dimensions=(f,), shape=(nfreq,), dtype=np.float32)\n", + "frequencies.data[:] = np.linspace(0.005, 0.015, num=nfreq)\n", + "\n", + "freq_modes = Function(name='freq_modes', grid=model.grid, space_order=0, dtype=np.complex64,\n", + " dimensions=(f, *model.grid.dimensions), shape=(nfreq, *model.grid.shape))\n", + "\n", + "omega = 2 * np.pi * frequencies\n", + "basis = exp(-1j * omega * model.grid.time_dim * model.grid.time_dim.spacing)\n", + "dfts = [Inc(freq_modes, basis * u)]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "op = Operator([stencil] + src_term + rec_term + dfts, subs=model.spacing_map)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
#define _POSIX_C_SOURCE 200809L\n",
+       "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n",
+       "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n",
+       "#define MAX(a,b) (((a) > (b)) ? (a) : (b))\n",
+       "#define MIN(a,b) (((a) < (b)) ? (a) : (b))\n",
+       "\n",
+       "#include "stdlib.h"\n",
+       "#include "math.h"\n",
+       "#include "sys/time.h"\n",
+       "#include "omp.h"\n",
+       "#include "complex.h"\n",
+       "\n",
+       "struct dataobj\n",
+       "{\n",
+       "  void *restrict data;\n",
+       "  unsigned long * size;\n",
+       "  unsigned long * npsize;\n",
+       "  unsigned long * dsize;\n",
+       "  int * hsize;\n",
+       "  int * hofs;\n",
+       "  int * oofs;\n",
+       "  void * dmap;\n",
+       "} ;\n",
+       "\n",
+       "struct profiler\n",
+       "{\n",
+       "  double section0;\n",
+       "  double section1;\n",
+       "  double section2;\n",
+       "  double section3;\n",
+       "  double section4;\n",
+       "} ;\n",
+       "\n",
+       "\n",
+       "int Kernel(struct dataobj *restrict damp_vec, struct dataobj *restrict freq_modes_vec, struct dataobj *restrict frequencies_vec, struct dataobj *restrict rec_vec, struct dataobj *restrict rec_coords_vec, struct dataobj *restrict src_vec, struct dataobj *restrict src_coords_vec, struct dataobj *restrict u_vec, struct dataobj *restrict vp_vec, const int x_M, const int x_m, const int y_M, const int y_m, const float dt, const int f0_blk0_size, const int f_M, const int f_m, const float o_x, const float o_y, const int p_rec_M, const int p_rec_m, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int nthreads, const int nthreads_nonaffine, const int f_size, struct profiler * timers)\n",
+       "{\n",
+       "  float _Complex *restrict r1_vec __attribute__ ((aligned (64)));\n",
+       "  posix_memalign((void**)(&r1_vec),64,f_size*sizeof(float _Complex));\n",
+       "\n",
+       "  float (*restrict damp)[damp_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[damp_vec->size[1]]) damp_vec->data;\n",
+       "  float _Complex (*restrict freq_modes)[freq_modes_vec->size[1]][freq_modes_vec->size[2]] __attribute__ ((aligned (64))) = (float _Complex (*)[freq_modes_vec->size[1]][freq_modes_vec->size[2]]) freq_modes_vec->data;\n",
+       "  float (*restrict frequencies) __attribute__ ((aligned (64))) = (float (*)) frequencies_vec->data;\n",
+       "  float _Complex (*restrict r1) __attribute__ ((aligned (64))) = (float _Complex (*)) r1_vec;\n",
+       "  float (*restrict rec)[rec_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[rec_vec->size[1]]) rec_vec->data;\n",
+       "  float (*restrict rec_coords)[rec_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[rec_coords_vec->size[1]]) rec_coords_vec->data;\n",
+       "  float (*restrict src)[src_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_vec->size[1]]) src_vec->data;\n",
+       "  float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;\n",
+       "  float (*restrict u)[u_vec->size[1]][u_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]]) u_vec->data;\n",
+       "  float (*restrict vp)[vp_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[vp_vec->size[1]]) vp_vec->data;\n",
+       "\n",
+       "  float r2 = 1.0F/(dt*dt);\n",
+       "  float r3 = 1.0F/dt;\n",
+       "\n",
+       "  for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))\n",
+       "  {\n",
+       "    START(section0)\n",
+       "    #pragma omp parallel num_threads(nthreads)\n",
+       "    {\n",
+       "      #pragma omp for schedule(dynamic,1)\n",
+       "      for (int x = x_m; x <= x_M; x += 1)\n",
+       "      {\n",
+       "        #pragma omp simd aligned(damp,u,vp:16)\n",
+       "        for (int y = y_m; y <= y_M; y += 1)\n",
+       "        {\n",
+       "          float r4 = 1.0F/(vp[x + 2][y + 2]*vp[x + 2][y + 2]);\n",
+       "          u[t2][x + 2][y + 2] = (-r4*(-2.0F*r2*u[t0][x + 2][y + 2] + r2*u[t1][x + 2][y + 2]) + r3*damp[x + 2][y + 2]*u[t0][x + 2][y + 2] + 1.0e-2F*(u[t0][x + 1][y + 2] + u[t0][x + 2][y + 1] + u[t0][x + 2][y + 3] + u[t0][x + 3][y + 2]) - 3.99999991e-2F*u[t0][x + 2][y + 2])/(r4*r2 + r3*damp[x + 2][y + 2]);\n",
+       "        }\n",
+       "      }\n",
+       "    }\n",
+       "    STOP(section0,timers)\n",
+       "\n",
+       "    START(section1)\n",
+       "    #pragma omp parallel num_threads(nthreads_nonaffine)\n",
+       "    {\n",
+       "      int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_src_M - p_src_m + 1)/nthreads_nonaffine)));\n",
+       "      #pragma omp for schedule(dynamic,chunk_size)\n",
+       "      for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)\n",
+       "      {\n",
+       "        for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)\n",
+       "        {\n",
+       "          for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)\n",
+       "          {\n",
+       "            int posx = (int)(floorf(1.0e-1*(-o_x + src_coords[p_src][0])));\n",
+       "            int posy = (int)(floorf(1.0e-1*(-o_y + src_coords[p_src][1])));\n",
+       "            float px = 1.0e-1F*(-o_x + src_coords[p_src][0]) - floorf(1.0e-1F*(-o_x + src_coords[p_src][0]));\n",
+       "            float py = 1.0e-1F*(-o_y + src_coords[p_src][1]) - floorf(1.0e-1F*(-o_y + src_coords[p_src][1]));\n",
+       "            if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1)\n",
+       "            {\n",
+       "              float r0 = 3.06250F*(vp[posx + 2][posy + 2]*vp[posx + 2][posy + 2])*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*src[time][p_src];\n",
+       "              #pragma omp atomic update\n",
+       "              u[t2][rsrcx + posx + 2][rsrcy + posy + 2] += r0;\n",
+       "            }\n",
+       "          }\n",
+       "        }\n",
+       "      }\n",
+       "    }\n",
+       "    STOP(section1,timers)\n",
+       "\n",
+       "    START(section2)\n",
+       "    #pragma omp parallel num_threads(nthreads_nonaffine)\n",
+       "    {\n",
+       "      int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_rec_M - p_rec_m + 1)/nthreads_nonaffine)));\n",
+       "      #pragma omp for schedule(dynamic,chunk_size)\n",
+       "      for (int p_rec = p_rec_m; p_rec <= p_rec_M; p_rec += 1)\n",
+       "      {\n",
+       "        float r7 = 1.0e-1F*(-o_x + rec_coords[p_rec][0]);\n",
+       "        float r5 = floorf(r7);\n",
+       "        int posx = (int)r5;\n",
+       "        float r8 = 1.0e-1F*(-o_y + rec_coords[p_rec][1]);\n",
+       "        float r6 = floorf(r8);\n",
+       "        int posy = (int)r6;\n",
+       "        float px = -r5 + r7;\n",
+       "        float py = -r6 + r8;\n",
+       "        float sum = 0.0F;\n",
+       "\n",
+       "        for (int rrecx = 0; rrecx <= 1; rrecx += 1)\n",
+       "        {\n",
+       "          for (int rrecy = 0; rrecy <= 1; rrecy += 1)\n",
+       "          {\n",
+       "            if (rrecx + posx >= x_m - 1 && rrecy + posy >= y_m - 1 && rrecx + posx <= x_M + 1 && rrecy + posy <= y_M + 1)\n",
+       "            {\n",
+       "              sum += (rrecx*px + (1 - rrecx)*(1 - px))*(rrecy*py + (1 - rrecy)*(1 - py))*u[t2][rrecx + posx + 2][rrecy + posy + 2];\n",
+       "            }\n",
+       "          }\n",
+       "        }\n",
+       "\n",
+       "        rec[time][p_rec] = sum;\n",
+       "      }\n",
+       "    }\n",
+       "    STOP(section2,timers)\n",
+       "\n",
+       "    START(section3)\n",
+       "    #pragma omp parallel num_threads(nthreads)\n",
+       "    {\n",
+       "      #pragma omp for schedule(static,1)\n",
+       "      for (int f = f_m; f <= f_M; f += 1)\n",
+       "      {\n",
+       "        r1[f] = cexpf(-6.283185307179590F*time*_Complex_I*dt*frequencies[f]);\n",
+       "      }\n",
+       "    }\n",
+       "    STOP(section3,timers)\n",
+       "\n",
+       "    START(section4)\n",
+       "    #pragma omp parallel num_threads(nthreads)\n",
+       "    {\n",
+       "      #pragma omp for collapse(2) schedule(static,1)\n",
+       "      for (int f0_blk0 = f_m; f0_blk0 <= f_M; f0_blk0 += f0_blk0_size)\n",
+       "      {\n",
+       "        for (int x0_blk0 = x_m; x0_blk0 <= x_M; x0_blk0 += x0_blk0_size)\n",
+       "        {\n",
+       "          for (int f = f0_blk0; f <= MIN(f_M, f0_blk0 + f0_blk0_size - 1); f += 1)\n",
+       "          {\n",
+       "            for (int x = x0_blk0; x <= MIN(x_M, x0_blk0 + x0_blk0_size - 1); x += 1)\n",
+       "            {\n",
+       "              #pragma omp simd aligned(freq_modes,u:16)\n",
+       "              for (int y = y_m; y <= y_M; y += 1)\n",
+       "              {\n",
+       "                freq_modes[f][x][y] += r1[f]*u[t0][x + 2][y + 2];\n",
+       "              }\n",
+       "            }\n",
+       "          }\n",
+       "        }\n",
+       "      }\n",
+       "    }\n",
+       "    STOP(section4,timers)\n",
+       "  }\n",
+       "\n",
+       "  free(r1_vec);\n",
+       "\n",
+       "  return 0;\n",
+       "}\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define \\PYZus{}POSIX\\PYZus{}C\\PYZus{}SOURCE 200809L}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define START(S) struct timeval start\\PYZus{} \\PYZsh{}\\PYZsh{} S , end\\PYZus{} \\PYZsh{}\\PYZsh{} S ; gettimeofday(\\PYZam{}start\\PYZus{} \\PYZsh{}\\PYZsh{} S , NULL);}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define STOP(S,T) gettimeofday(\\PYZam{}end\\PYZus{} \\PYZsh{}\\PYZsh{} S, NULL); T\\PYZhy{}\\PYZgt{}S += (double)(end\\PYZus{} \\PYZsh{}\\PYZsh{} S .tv\\PYZus{}sec\\PYZhy{}start\\PYZus{} \\PYZsh{}\\PYZsh{} S.tv\\PYZus{}sec)+(double)(end\\PYZus{} \\PYZsh{}\\PYZsh{} S .tv\\PYZus{}usec\\PYZhy{}start\\PYZus{} \\PYZsh{}\\PYZsh{} S .tv\\PYZus{}usec)}\\PY{c+cp}{/}\\PY{c+cp}{1000000;}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define MAX(a,b) (((a) \\PYZgt{} (b)) ? (a) : (b))}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{define MIN(a,b) (((a) \\PYZlt{} (b)) ? (a) : (b))}\n", + "\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}stdlib.h\\PYZdq{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}math.h\\PYZdq{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}sys/time.h\\PYZdq{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}omp.h\\PYZdq{}}\n", + "\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{include}\\PY{+w}{ }\\PY{c+cpf}{\\PYZdq{}complex.h\\PYZdq{}}\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{unsigned}\\PY{+w}{ }\\PY{k+kt}{long}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{size}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{unsigned}\\PY{+w}{ }\\PY{k+kt}{long}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{npsize}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{unsigned}\\PY{+w}{ }\\PY{k+kt}{long}\\PY{+w}{ 
}\\PY{o}{*}\\PY{+w}{ }\\PY{n}{dsize}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{hsize}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{hofs}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{oofs}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{void}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{dmap}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{p}{;}\n", + "\n", + "\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{profiler}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{section0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{section1}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{section2}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{section3}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{double}\\PY{+w}{ }\\PY{n}{section4}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\\PY{+w}{ }\\PY{p}{;}\n", + "\n", + "\n", + "\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n+nf}{Kernel}\\PY{p}{(}\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{damp\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{frequencies\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{src\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ 
}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{u\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{dataobj}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{vp\\PYZus{}vec}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{y\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{dt}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{f0\\PYZus{}blk0\\PYZus{}size}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{f\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{f\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{o\\PYZus{}x}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{o\\PYZus{}y}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{time\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{time\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ 
}\\PY{n}{x0\\PYZus{}blk0\\PYZus{}size}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{nthreads}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{nthreads\\PYZus{}nonaffine}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{const}\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{f\\PYZus{}size}\\PY{p}{,}\\PY{+w}{ }\\PY{k}{struct}\\PY{+w}{ }\\PY{n+nc}{profiler}\\PY{+w}{ }\\PY{o}{*}\\PY{+w}{ }\\PY{n}{timers}\\PY{p}{)}\n", + "\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{r1\\PYZus{}vec}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{posix\\PYZus{}memalign}\\PY{p}{(}\\PY{p}{(}\\PY{k+kt}{void}\\PY{o}{*}\\PY{o}{*}\\PY{p}{)}\\PY{p}{(}\\PY{o}{\\PYZam{}}\\PY{n}{r1\\PYZus{}vec}\\PY{p}{)}\\PY{p}{,}\\PY{l+m+mi}{64}\\PY{p}{,}\\PY{n}{f\\PYZus{}size}\\PY{o}{*}\\PY{k}{sizeof}\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{damp}\\PY{p}{)}\\PY{p}{[}\\PY{n}{damp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{damp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{damp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ 
}\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{freq\\PYZus{}modes}\\PY{p}{)}\\PY{p}{[}\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{[}\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{[}\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{freq\\PYZus{}modes\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{frequencies}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{frequencies\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{r1}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ 
}\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{k+kt}{\\PYZus{}Complex}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{r1\\PYZus{}vec}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{rec}\\PY{p}{)}\\PY{p}{[}\\PY{n}{rec\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{rec\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords}\\PY{p}{)}\\PY{p}{[}\\PY{n}{rec\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{rec\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ 
}\\PY{n}{src}\\PY{p}{)}\\PY{p}{[}\\PY{n}{src\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{src\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{src\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{)}\\PY{p}{[}\\PY{n}{src\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{src\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{u}\\PY{p}{)}\\PY{p}{[}\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{[}\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ 
}\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{[}\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{u\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{k+kr}{restrict}\\PY{+w}{ }\\PY{n}{vp}\\PY{p}{)}\\PY{p}{[}\\PY{n}{vp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{+w}{ }\\PY{n}{\\PYZus{}\\PYZus{}attribute\\PYZus{}\\PYZus{}}\\PY{+w}{ }\\PY{p}{(}\\PY{p}{(}\\PY{n}{aligned}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{64}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{float}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{*}\\PY{p}{)}\\PY{p}{[}\\PY{n}{vp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{size}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{n}{vp\\PYZus{}vec}\\PY{o}{\\PYZhy{}}\\PY{o}{\\PYZgt{}}\\PY{n}{data}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r2}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0F}\\PY{o}{/}\\PY{p}{(}\\PY{n}{dt}\\PY{o}{*}\\PY{n}{dt}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r3}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0F}\\PY{o}{/}\\PY{n}{dt}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{time}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{time\\PYZus{}m}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{+w}{ 
}\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t2}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{time}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{time\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t1}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{t2}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{time}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{o}{\\PYZpc{}}\\PY{p}{(}\\PY{l+m+mi}{3}\\PY{p}{)}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{START}\\PY{p}{(}\\PY{n}{section0}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel num\\PYZus{}threads(nthreads)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp for schedule(dynamic,1)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp simd aligned(damp,u,vp:16)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{n}{y\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r4}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0F}\\PY{o}{/}\\PY{p}{(}\\PY{n}{vp}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{o}{*}\\PY{n}{vp}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{r4}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mf}{\\PYZhy{}2.0F}\\PY{o}{*}\\PY{n}{r2}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r2}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t1}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r3}\\PY{o}{*}\\PY{n}{damp}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ 
}\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}2F}\\PY{o}{*}\\PY{p}{(}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{3}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{3}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mf}{3.99999991e\\PYZhy{}2F}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{o}{/}\\PY{p}{(}\\PY{n}{r4}\\PY{o}{*}\\PY{n}{r2}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r3}\\PY{o}{*}\\PY{n}{damp}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{n}{STOP}\\PY{p}{(}\\PY{n}{section0}\\PY{p}{,}\\PY{n}{timers}\\PY{p}{)}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{START}\\PY{p}{(}\\PY{n}{section1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel num\\PYZus{}threads(nthreads\\PYZus{}nonaffine)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ 
}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{chunk\\PYZus{}size}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{n}{MAX}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{p}{(}\\PY{l+m+mf}{1.0}\\PY{o}{/}\\PY{l+m+mf}{3.0}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{p\\PYZus{}src\\PYZus{}M}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}m}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{o}{/}\\PY{n}{nthreads\\PYZus{}nonaffine}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp for schedule(dynamic,chunk\\PYZus{}size)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{p\\PYZus{}src}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ 
}\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{n}{floorf}\\PY{p}{(}\\PY{l+m+mf}{1.0e\\PYZhy{}1}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{n}{floorf}\\PY{p}{(}\\PY{l+m+mf}{1.0e\\PYZhy{}1}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{px}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{floorf}\\PY{p}{(}\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{py}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{floorf}\\PY{p}{(}\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ 
}\\PY{n}{src\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}m}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{3.06250F}\\PY{o}{*}\\PY{p}{(}\\PY{n}{vp}\\PY{p}{[}\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{o}{*}\\PY{n}{vp}\\PY{p}{[}\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{rsrcx}\\PY{o}{*}\\PY{n}{px}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{rsrcx}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{px}\\PY{p}{)}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{rsrcy}\\PY{o}{*}\\PY{n}{py}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ 
}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{rsrcy}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{py}\\PY{p}{)}\\PY{p}{)}\\PY{o}{*}\\PY{n}{src}\\PY{p}{[}\\PY{n}{time}\\PY{p}{]}\\PY{p}{[}\\PY{n}{p\\PYZus{}src}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp atomic update}\n", + "\\PY{+w}{ }\\PY{n}{u}\\PY{p}{[}\\PY{n}{t2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{rsrcx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{rsrcy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{r0}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{n}{STOP}\\PY{p}{(}\\PY{n}{section1}\\PY{p}{,}\\PY{n}{timers}\\PY{p}{)}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{START}\\PY{p}{(}\\PY{n}{section2}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel num\\PYZus{}threads(nthreads\\PYZus{}nonaffine)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{chunk\\PYZus{}size}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{n}{MAX}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{p}{,}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{p}{(}\\PY{p}{(}\\PY{l+m+mf}{1.0}\\PY{o}{/}\\PY{l+m+mf}{3.0}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{p\\PYZus{}rec\\PYZus{}M}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}m}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{o}{/}\\PY{n}{nthreads\\PYZus{}nonaffine}\\PY{p}{)}\\PY{p}{)}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp for schedule(dynamic,chunk\\PYZus{}size)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ 
}\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{p\\PYZus{}rec}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r7}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}rec}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{0}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r5}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{floorf}\\PY{p}{(}\\PY{n}{r7}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{n}{r5}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r8}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{1.0e\\PYZhy{}1F}\\PY{o}{*}\\PY{p}{(}\\PY{o}{\\PYZhy{}}\\PY{n}{o\\PYZus{}y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{rec\\PYZus{}coords}\\PY{p}{[}\\PY{n}{p\\PYZus{}rec}\\PY{p}{]}\\PY{p}{[}\\PY{l+m+mi}{1}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{r6}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{floorf}\\PY{p}{(}\\PY{n}{r8}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{p}{)}\\PY{n}{r6}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{px}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{n}{r5}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r7}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{py}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{n}{r6}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{r8}\\PY{p}{;}\n", + 
"\\PY{+w}{ }\\PY{k+kt}{float}\\PY{+w}{ }\\PY{n}{sum}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mf}{0.0F}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{if}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{\\PYZgt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}m}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZam{}}\\PY{o}{\\PYZam{}}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{sum}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{p}{(}\\PY{n}{rrecx}\\PY{o}{*}\\PY{n}{px}\\PY{+w}{ 
}\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{rrecx}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{px}\\PY{p}{)}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{n}{rrecy}\\PY{o}{*}\\PY{n}{py}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{rrecy}\\PY{p}{)}\\PY{o}{*}\\PY{p}{(}\\PY{l+m+mi}{1}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{n}{py}\\PY{p}{)}\\PY{p}{)}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{rrecx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posx}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{rrecy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{posy}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{rec}\\PY{p}{[}\\PY{n}{time}\\PY{p}{]}\\PY{p}{[}\\PY{n}{p\\PYZus{}rec}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{sum}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{n}{STOP}\\PY{p}{(}\\PY{n}{section2}\\PY{p}{,}\\PY{n}{timers}\\PY{p}{)}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{START}\\PY{p}{(}\\PY{n}{section3}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel num\\PYZus{}threads(nthreads)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp for schedule(static,1)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{f}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{f\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{f}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{f\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{f}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ 
}\\PY{n}{r1}\\PY{p}{[}\\PY{n}{f}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{cexpf}\\PY{p}{(}\\PY{l+m+mf}{\\PYZhy{}6.283185307179590F}\\PY{o}{*}\\PY{n}{time}\\PY{o}{*}\\PY{n}{\\PYZus{}Complex\\PYZus{}I}\\PY{o}{*}\\PY{n}{dt}\\PY{o}{*}\\PY{n}{frequencies}\\PY{p}{[}\\PY{n}{f}\\PY{p}{]}\\PY{p}{)}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{n}{STOP}\\PY{p}{(}\\PY{n}{section3}\\PY{p}{,}\\PY{n}{timers}\\PY{p}{)}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{START}\\PY{p}{(}\\PY{n}{section4}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp parallel num\\PYZus{}threads(nthreads)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp for collapse(2) schedule(static,1)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{f0\\PYZus{}blk0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{f\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{f0\\PYZus{}blk0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{f\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{f0\\PYZus{}blk0}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{f0\\PYZus{}blk0\\PYZus{}size}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{x0\\PYZus{}blk0}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{x0\\PYZus{}blk0}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{x0\\PYZus{}blk0}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x0\\PYZus{}blk0\\PYZus{}size}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{f}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{f0\\PYZus{}blk0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{f}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{MIN}\\PY{p}{(}\\PY{n}{f\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{f0\\PYZus{}blk0}\\PY{+w}{ 
}\\PY{o}{+}\\PY{+w}{ }\\PY{n}{f0\\PYZus{}blk0\\PYZus{}size}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{f}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{x0\\PYZus{}blk0}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{MIN}\\PY{p}{(}\\PY{n}{x\\PYZus{}M}\\PY{p}{,}\\PY{+w}{ }\\PY{n}{x0\\PYZus{}blk0}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{n}{x0\\PYZus{}blk0\\PYZus{}size}\\PY{+w}{ }\\PY{o}{\\PYZhy{}}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{c+cp}{\\PYZsh{}}\\PY{c+cp}{pragma omp simd aligned(freq\\PYZus{}modes,u:16)}\n", + "\\PY{+w}{ }\\PY{k}{for}\\PY{+w}{ }\\PY{p}{(}\\PY{k+kt}{int}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}m}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{\\PYZlt{}}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{y\\PYZus{}M}\\PY{p}{;}\\PY{+w}{ }\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{l+m+mi}{1}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZob{}}\n", + "\\PY{+w}{ }\\PY{n}{freq\\PYZus{}modes}\\PY{p}{[}\\PY{n}{f}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{p}{]}\\PY{+w}{ }\\PY{o}{+}\\PY{o}{=}\\PY{+w}{ }\\PY{n}{r1}\\PY{p}{[}\\PY{n}{f}\\PY{p}{]}\\PY{o}{*}\\PY{n}{u}\\PY{p}{[}\\PY{n}{t0}\\PY{p}{]}\\PY{p}{[}\\PY{n}{x}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{[}\\PY{n}{y}\\PY{+w}{ }\\PY{o}{+}\\PY{+w}{ }\\PY{l+m+mi}{2}\\PY{p}{]}\\PY{p}{;}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\\PY{+w}{ 
}\\PY{n}{STOP}\\PY{p}{(}\\PY{n}{section4}\\PY{p}{,}\\PY{n}{timers}\\PY{p}{)}\n", + "\\PY{+w}{ }\\PY{p}{\\PYZcb{}}\n", + "\n", + "\\PY{+w}{ }\\PY{n}{free}\\PY{p}{(}\\PY{n}{r1\\PYZus{}vec}\\PY{p}{)}\\PY{p}{;}\n", + "\n", + "\\PY{+w}{ }\\PY{k}{return}\\PY{+w}{ }\\PY{l+m+mi}{0}\\PY{p}{;}\n", + "\\PY{p}{\\PYZcb{}}\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "#define _POSIX_C_SOURCE 200809L\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define MAX(a,b) (((a) > (b)) ? (a) : (b))\n", + "#define MIN(a,b) (((a) < (b)) ? (a) : (b))\n", + "\n", + "#include \"stdlib.h\"\n", + "#include \"math.h\"\n", + "#include \"sys/time.h\"\n", + "#include \"omp.h\"\n", + "#include \"complex.h\"\n", + "\n", + "struct dataobj\n", + "{\n", + " void *restrict data;\n", + " unsigned long * size;\n", + " unsigned long * npsize;\n", + " unsigned long * dsize;\n", + " int * hsize;\n", + " int * hofs;\n", + " int * oofs;\n", + " void * dmap;\n", + "} ;\n", + "\n", + "struct profiler\n", + "{\n", + " double section0;\n", + " double section1;\n", + " double section2;\n", + " double section3;\n", + " double section4;\n", + "} ;\n", + "\n", + "\n", + "int Kernel(struct dataobj *restrict damp_vec, struct dataobj *restrict freq_modes_vec, struct dataobj *restrict frequencies_vec, struct dataobj *restrict rec_vec, struct dataobj *restrict rec_coords_vec, struct dataobj *restrict src_vec, struct dataobj *restrict src_coords_vec, struct dataobj *restrict u_vec, struct dataobj *restrict vp_vec, const int x_M, const int x_m, const int y_M, const int y_m, const float dt, const int f0_blk0_size, const int f_M, const int f_m, const float o_x, const float o_y, const int p_rec_M, const int p_rec_m, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int 
x0_blk0_size, const int nthreads, const int nthreads_nonaffine, const int f_size, struct profiler * timers)\n", + "{\n", + " float _Complex *restrict r1_vec __attribute__ ((aligned (64)));\n", + " posix_memalign((void**)(&r1_vec),64,f_size*sizeof(float _Complex));\n", + "\n", + " float (*restrict damp)[damp_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[damp_vec->size[1]]) damp_vec->data;\n", + " float _Complex (*restrict freq_modes)[freq_modes_vec->size[1]][freq_modes_vec->size[2]] __attribute__ ((aligned (64))) = (float _Complex (*)[freq_modes_vec->size[1]][freq_modes_vec->size[2]]) freq_modes_vec->data;\n", + " float (*restrict frequencies) __attribute__ ((aligned (64))) = (float (*)) frequencies_vec->data;\n", + " float _Complex (*restrict r1) __attribute__ ((aligned (64))) = (float _Complex (*)) r1_vec;\n", + " float (*restrict rec)[rec_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[rec_vec->size[1]]) rec_vec->data;\n", + " float (*restrict rec_coords)[rec_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[rec_coords_vec->size[1]]) rec_coords_vec->data;\n", + " float (*restrict src)[src_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_vec->size[1]]) src_vec->data;\n", + " float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;\n", + " float (*restrict u)[u_vec->size[1]][u_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]]) u_vec->data;\n", + " float (*restrict vp)[vp_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[vp_vec->size[1]]) vp_vec->data;\n", + "\n", + " float r2 = 1.0F/(dt*dt);\n", + " float r3 = 1.0F/dt;\n", + "\n", + " for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))\n", + " {\n", + " START(section0)\n", + " #pragma omp parallel 
num_threads(nthreads)\n", + " {\n", + " #pragma omp for schedule(dynamic,1)\n", + " for (int x = x_m; x <= x_M; x += 1)\n", + " {\n", + " #pragma omp simd aligned(damp,u,vp:16)\n", + " for (int y = y_m; y <= y_M; y += 1)\n", + " {\n", + " float r4 = 1.0F/(vp[x + 2][y + 2]*vp[x + 2][y + 2]);\n", + " u[t2][x + 2][y + 2] = (-r4*(-2.0F*r2*u[t0][x + 2][y + 2] + r2*u[t1][x + 2][y + 2]) + r3*damp[x + 2][y + 2]*u[t0][x + 2][y + 2] + 1.0e-2F*(u[t0][x + 1][y + 2] + u[t0][x + 2][y + 1] + u[t0][x + 2][y + 3] + u[t0][x + 3][y + 2]) - 3.99999991e-2F*u[t0][x + 2][y + 2])/(r4*r2 + r3*damp[x + 2][y + 2]);\n", + " }\n", + " }\n", + " }\n", + " STOP(section0,timers)\n", + "\n", + " START(section1)\n", + " #pragma omp parallel num_threads(nthreads_nonaffine)\n", + " {\n", + " int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_src_M - p_src_m + 1)/nthreads_nonaffine)));\n", + " #pragma omp for schedule(dynamic,chunk_size)\n", + " for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)\n", + " {\n", + " for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)\n", + " {\n", + " for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)\n", + " {\n", + " int posx = (int)(floorf(1.0e-1*(-o_x + src_coords[p_src][0])));\n", + " int posy = (int)(floorf(1.0e-1*(-o_y + src_coords[p_src][1])));\n", + " float px = 1.0e-1F*(-o_x + src_coords[p_src][0]) - floorf(1.0e-1F*(-o_x + src_coords[p_src][0]));\n", + " float py = 1.0e-1F*(-o_y + src_coords[p_src][1]) - floorf(1.0e-1F*(-o_y + src_coords[p_src][1]));\n", + " if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1)\n", + " {\n", + " float r0 = 3.06250F*(vp[posx + 2][posy + 2]*vp[posx + 2][posy + 2])*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*src[time][p_src];\n", + " #pragma omp atomic update\n", + " u[t2][rsrcx + posx + 2][rsrcy + posy + 2] += r0;\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " STOP(section1,timers)\n", + "\n", + " START(section2)\n", + " #pragma omp parallel 
num_threads(nthreads_nonaffine)\n", + " {\n", + " int chunk_size = (int)(MAX(1, (int)((1.0/3.0)*(p_rec_M - p_rec_m + 1)/nthreads_nonaffine)));\n", + " #pragma omp for schedule(dynamic,chunk_size)\n", + " for (int p_rec = p_rec_m; p_rec <= p_rec_M; p_rec += 1)\n", + " {\n", + " float r7 = 1.0e-1F*(-o_x + rec_coords[p_rec][0]);\n", + " float r5 = floorf(r7);\n", + " int posx = (int)r5;\n", + " float r8 = 1.0e-1F*(-o_y + rec_coords[p_rec][1]);\n", + " float r6 = floorf(r8);\n", + " int posy = (int)r6;\n", + " float px = -r5 + r7;\n", + " float py = -r6 + r8;\n", + " float sum = 0.0F;\n", + "\n", + " for (int rrecx = 0; rrecx <= 1; rrecx += 1)\n", + " {\n", + " for (int rrecy = 0; rrecy <= 1; rrecy += 1)\n", + " {\n", + " if (rrecx + posx >= x_m - 1 && rrecy + posy >= y_m - 1 && rrecx + posx <= x_M + 1 && rrecy + posy <= y_M + 1)\n", + " {\n", + " sum += (rrecx*px + (1 - rrecx)*(1 - px))*(rrecy*py + (1 - rrecy)*(1 - py))*u[t2][rrecx + posx + 2][rrecy + posy + 2];\n", + " }\n", + " }\n", + " }\n", + "\n", + " rec[time][p_rec] = sum;\n", + " }\n", + " }\n", + " STOP(section2,timers)\n", + "\n", + " START(section3)\n", + " #pragma omp parallel num_threads(nthreads)\n", + " {\n", + " #pragma omp for schedule(static,1)\n", + " for (int f = f_m; f <= f_M; f += 1)\n", + " {\n", + " r1[f] = cexpf(-6.283185307179590F*time*_Complex_I*dt*frequencies[f]);\n", + " }\n", + " }\n", + " STOP(section3,timers)\n", + "\n", + " START(section4)\n", + " #pragma omp parallel num_threads(nthreads)\n", + " {\n", + " #pragma omp for collapse(2) schedule(static,1)\n", + " for (int f0_blk0 = f_m; f0_blk0 <= f_M; f0_blk0 += f0_blk0_size)\n", + " {\n", + " for (int x0_blk0 = x_m; x0_blk0 <= x_M; x0_blk0 += x0_blk0_size)\n", + " {\n", + " for (int f = f0_blk0; f <= MIN(f_M, f0_blk0 + f0_blk0_size - 1); f += 1)\n", + " {\n", + " for (int x = x0_blk0; x <= MIN(x_M, x0_blk0 + x0_blk0_size - 1); x += 1)\n", + " {\n", + " #pragma omp simd aligned(freq_modes,u:16)\n", + " for (int y = y_m; y <= y_M; y += 
1)\n", + " {\n", + " freq_modes[f][x][y] += r1[f]*u[t0][x + 2][y + 2];\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " }\n", + " STOP(section4,timers)\n", + " }\n", + "\n", + " free(r1_vec);\n", + "\n", + " return 0;\n", + "}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "Code(str(op.ccode), language='C')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Operator `Kernel` ran in 0.05 s\n" + ] + }, + { + "data": { + "text/plain": [ + "PerformanceSummary([(PerfKey(name='section0', rank=None),\n", + " PerfEntry(time=0.007022999999999993, gflopss=0.0, gpointss=0.0, oi=0.0, ops=0, itershapes=[])),\n", + " (PerfKey(name='section1', rank=None),\n", + " PerfEntry(time=0.002476999999999992, gflopss=0.0, gpointss=0.0, oi=0.0, ops=0, itershapes=[])),\n", + " (PerfKey(name='section2', rank=None),\n", + " PerfEntry(time=0.002686000000000001, gflopss=0.0, gpointss=0.0, oi=0.0, ops=0, itershapes=[])),\n", + " (PerfKey(name='section3', rank=None),\n", + " PerfEntry(time=0.0023849999999999943, gflopss=0.0, gpointss=0.0, oi=0.0, ops=0, itershapes=[])),\n", + " (PerfKey(name='section4', rank=None),\n", + " PerfEntry(time=0.025902999999999992, gflopss=0.0, gpointss=0.0, oi=0.0, ops=0, itershapes=[]))])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "u.data.fill(0)\n", + "op(dt=model.critical_dt)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#NBVAL_IGNORE_OUTPUT\n", + "plt.figure(figsize=(12, 30))\n", + "for i in range(5):\n", + " plt.subplot(5, 2, 2*i+1)\n", + " plt.imshow(np.real(freq_modes.data[i].T), cmap='seismic', vmin=-1e2, vmax=1e2)\n", + " plt.title(f'Real part {1e3*frequencies.data[i]} Hz')\n", + " plt.subplot(5, 2, 2*i+2)\n", + " plt.imshow(np.imag(freq_modes.data[i].T), cmap='seismic', vmin=-1e2, vmax=1e2)\n", + " plt.title(f'Imaginary part {1e3*frequencies.data[i]} Hz')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.isclose(norm(freq_modes), 26016.113, atol=0, rtol=1e-4)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/userapi/06_sparse_operations.ipynb b/examples/userapi/06_sparse_operations.ipynb index 26359cf6e1..b4c131e247 100644 --- a/examples/userapi/06_sparse_operations.ipynb +++ b/examples/userapi/06_sparse_operations.ipynb @@ -277,8 +277,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Eq(posx, (int)(floor((-o_x + s_coords(p_s, 0))/h_x)))\n", - "Eq(posy, (int)(floor((-o_y + s_coords(p_s, 1))/h_y)))\n", + "Eq(posx, (int)floor((-o_x + s_coords(p_s, 0))/h_x))\n", + "Eq(posy, (int)floor((-o_y + s_coords(p_s, 1))/h_y))\n", "Eq(px, -floor((-o_x + s_coords(p_s, 0))/h_x) + (-o_x + s_coords(p_s, 0))/h_x)\n", "Eq(py, -floor((-o_y + s_coords(p_s, 1))/h_y) + (-o_y + s_coords(p_s, 1))/h_y)\n", "Eq(sum, 0.0)\n", @@ -484,8 +484,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Eq(posx, 
(int)(floor((-o_x + s_coords(p_s, 0))/h_x)))\n", - "Eq(posy, (int)(floor((-o_y + s_coords(p_s, 1))/h_y)))\n", + "Eq(posx, (int)floor((-o_x + s_coords(p_s, 0))/h_x))\n", + "Eq(posy, (int)floor((-o_y + s_coords(p_s, 1))/h_y))\n", "Eq(sum, 0.0)\n", "Inc(sum, wsincrsx(p_s, rsx + 3)*wsincrsy(p_s, rsy + 3)*f(t, rsx + posx, rsy + posy))\n", "Eq(s(time, p_s), sum)\n" diff --git a/tests/test_builtins.py b/tests/test_builtins.py index 483bd6a135..8df6c4ad2c 100644 --- a/tests/test_builtins.py +++ b/tests/test_builtins.py @@ -155,7 +155,7 @@ def test_gs_1d_float(self, sigma): def test_gs_2d_int(self, sigma): """Test the Gaussian smoother in 2d.""" - a = ascent() + a = ascent().astype(np.int32) sp_smoothed = gaussian_filter(a, sigma=sigma) dv_smoothed = gaussian_smooth(a, sigma=sigma) @@ -169,8 +169,7 @@ def test_gs_2d_int(self, sigma): def test_gs_2d_float(self, sigma): """Test the Gaussian smoother in 2d.""" - a = ascent() - a = a+0.1 + a = ascent()+0.1 sp_smoothed = gaussian_filter(a, sigma=sigma) dv_smoothed = gaussian_smooth(a, sigma=sigma) @@ -375,6 +374,23 @@ def test_inner_sparse(self): term2 = np.inner(rec0.data.reshape(-1), rec1.data.reshape(-1)) assert np.isclose(term1/term2 - 1, 0.0, rtol=0.0, atol=1e-5) + @pytest.mark.parametrize('dtype', [np.float32, np.complex64]) + def test_norm_dense(self, dtype): + """ + Test that norm produces the correct result against NumPy + """ + grid = Grid((101, 101), extent=(1000., 1000.)) + + f = Function(name='f', grid=grid, dtype=dtype) + + f.data[:] = 1 + np.random.randn(*f.shape).astype(grid.dtype) + if np.iscomplexobj(f.data): + f.data[:] += 1j*np.random.randn(*f.shape).astype(grid.dtype) + term1 = np.linalg.norm(f.data) + term2 = norm(f) + assert np.isreal(term2) + assert np.isclose(term1/term2 - 1, 0.0, rtol=0.0, atol=1e-5) + def test_norm_sparse(self): """ Test that norm produces the correct result against NumPy diff --git a/tests/test_caching.py b/tests/test_caching.py index 47d90680d9..26ce3b6721 100644 --- 
a/tests/test_caching.py +++ b/tests/test_caching.py @@ -453,8 +453,16 @@ def test_grid_objs(self): assert y0 is y1 assert x0.spacing is x1.spacing assert y0.spacing is y1.spacing - assert ox0 is ox1 - assert oy0 is oy1 + + def test_grid_dtypes(self): + """ + Test that two grids with different dtypes have different hash values. + """ + + grid0 = Grid(shape=(4, 4), dtype=np.float32) + grid1 = Grid(shape=(4, 4), dtype=np.float64) + + assert hash(grid0) != hash(grid1) def test_special_symbols(self): """ diff --git a/tests/test_differentiable.py b/tests/test_differentiable.py index ef73579ef5..cfc254c3b9 100644 --- a/tests/test_differentiable.py +++ b/tests/test_differentiable.py @@ -17,14 +17,17 @@ def test_differentiable(): assert isinstance(e * a, Mul) assert isinstance(a * a, Pow) assert isinstance(1 / (a * a), Pow) + assert (a + e*a).dtype == a.dtype addition = a + 1.2 * a.dx assert isinstance(addition, Add) assert all(isinstance(a, Differentiable) for a in addition.args) + assert addition.dtype == a.dtype addition2 = a + e * a.dx assert isinstance(addition2, Add) assert all(isinstance(a, Differentiable) for a in addition2.args) + assert addition2.dtype == a.dtype def test_diffify(): diff --git a/tests/test_dtypes.py b/tests/test_dtypes.py new file mode 100644 index 0000000000..fad1840383 --- /dev/null +++ b/tests/test_dtypes.py @@ -0,0 +1,273 @@ +import numpy as np +import pytest +import sympy + +from devito import Constant, Eq, Function, Grid, Operator, exp, log, sin +from devito.ir.cgen.printer import BasePrinter +from devito.passes.iet.langbase import LangBB +from devito.passes.iet.languages.C import CBB, CPrinter +from devito.passes.iet.languages.openacc import AccBB, AccPrinter +from devito.passes.iet.languages.openmp import OmpBB +from devito.symbolics.extended_dtypes import ctypes_vector_mapper +from devito.types.basic import Basic, Scalar, Symbol +from devito.types.dense import TimeFunction + +# Mappers for language-specific types and headers +_languages: 
dict[str, type[LangBB]] = { + 'C': CBB, + 'openmp': OmpBB, + 'openacc': AccBB +} + +_printers: dict[str, type[BasePrinter]] = { + 'C': CPrinter, + 'openmp': CPrinter, + 'openacc': AccPrinter +} + + +def _get_language(language: str, **_) -> type[LangBB]: + """ + Gets the language building block type from parametrized kwargs. + """ + return _languages[language] + + +def _get_printer(language: str, **_) -> type[BasePrinter]: + """ + Gets the printer building block type from parametrized kwargs. + """ + return _printers[language] + + +def _config_kwargs(platform: str, language: str) -> dict[str, str]: + """ + Generates kwargs for Operator to test language-specific behavior. + """ + return { + 'platform': platform, + 'language': language, + } + + +# List of parametrized operator kwargs for testing language-specific behavior +_configs: list[dict[str, str]] = [ + _config_kwargs(*cfg) for cfg in [ + ('cpu64', 'C'), + ('cpu64', 'openmp'), + ('nvidiaX', 'openacc') + ] +] + + +def kw_id(kwargs): + # For more readable log + return "-".join(f'{k}' for k in kwargs.values()) + + +@pytest.mark.parametrize('dtype', [np.complex64, np.complex128]) +@pytest.mark.parametrize('kwargs', _configs, ids=kw_id) +def test_dtype_mapping(dtype: np.dtype[np.inexact], kwargs: dict[str, str], + expected=None) -> None: + """ + Tests that half and complex floats' dtypes result in the correct type + strings in generated code. 
+ """ + # Set up an operator + grid = Grid(shape=(3, 3)) + x, y = grid.dimensions + + c = Constant(name='c', dtype=dtype) + u = Function(name='u', grid=grid, dtype=dtype) + eq = Eq(u, c * x * y) + op = Operator(eq, **kwargs) + + # Check ctypes of the mapped parameters + params: dict[str, Basic] = {p.name: p for p in op.parameters} + _u, _c = params['u'], params['c'] + assert type(_u.indexed._C_ctype._type_()) == ctypes_vector_mapper[dtype] + assert _c._C_ctype == expected or ctypes_vector_mapper[dtype] + + +@pytest.mark.parametrize('dtype', [np.complex64, np.complex128]) +@pytest.mark.parametrize('kwargs', _configs, ids=kw_id) +def test_cse_ctypes(dtype: np.dtype[np.inexact], kwargs: dict[str, str]) -> None: + """ + Tests that variables introduced by CSE have the correct type strings in + the generated code. + """ + # Retrieve the language-specific type mapping + printer: type[BasePrinter] = _get_printer(**kwargs) + + # Set up an operator + grid = Grid(shape=(3, 3)) + x, y = grid.dimensions + + c = Constant(name='c', dtype=dtype) + u = Function(name='u', grid=grid, dtype=dtype) + # sin(c) should be CSE'd + eq = Eq(u, x * x.spacing + y * y.spacing * sympy.sin(c)) + op = Operator(eq, **kwargs) + + # Ensure the CSE'd variable has the correct type + assert f'{printer()._print(ctypes_vector_mapper[dtype])} r0' in str(op) + + +@pytest.mark.parametrize('dtype', [np.float32, np.complex64, np.complex128]) +@pytest.mark.parametrize('kwargs', _configs, ids=kw_id) +def test_complex_headers(dtype: np.dtype[np.inexact], kwargs: dict[str, str]) -> None: + np.dtype + """ + Tests that the correct complex headers are included when complex dtypes + are present in the operator, and omitted otherwise. 
+ """ + # Set up an operator + grid = Grid(shape=(3, 3)) + x, y = grid.dimensions + + c = Constant(name='c', dtype=dtype) + u = Function(name='u', grid=grid, dtype=dtype) + eq = Eq(u, c * x * y) + op = Operator(eq, **kwargs) + + # Check that the complex header is included <=> complex dtypes are present + header: str = _get_language(**kwargs).get('includes-complex') + if np.issubdtype(dtype, np.complexfloating): + assert header in op._includes + else: + assert header not in op._includes + + +@pytest.mark.parametrize('dtype', [np.complex64, np.complex128]) +@pytest.mark.parametrize('kwargs', _configs, ids=kw_id) +def test_imag_unit(dtype: np.complexfloating, kwargs: dict[str, str]) -> None: + """ + Tests that the correct literal is used for the imaginary unit. + """ + # Determine the expected imaginary unit string + unit_str: str + if kwargs['platform'] == 'cpu64': + # In C we multiply by the _Complex_I macro constant + unit_str = '_Complex_I' + else: + # C++ provides imaginary literals + if dtype == np.complex64: + unit_str = '1if' + else: + unit_str = '1i' + + # Set up an operator + s = Symbol(name='s', dtype=dtype) + eq = Eq(s, 2.0 + 3.0j) + op = Operator(eq, **kwargs) + + # Check that the correct imaginary unit is used + assert unit_str in str(op) + + +@pytest.mark.parametrize('dtype', [np.float32, np.float64, + np.complex64, np.complex128]) +@pytest.mark.parametrize(['sym', 'fun'], [(exp, np.exp), + (log, np.log), + (sin, np.sin)]) +def test_math_functions(dtype: np.dtype[np.inexact], + sym: sympy.Function, fun: np.ufunc) -> None: + """ + Tests that the correct math functions are used, and their results cast + and assigned appropriately for different float precisions and for + complex floats/doubles. 
+ """ + # Get the expected function call string + call_str = str(sym) + if np.issubdtype(dtype, np.complexfloating): + # Complex functions have a 'c' prefix + call_str = 'c%s' % call_str + if dtype(0).real.itemsize <= 4: + # Single-precision variants have an 'f' suffix (half is promoted to single) + call_str = '%sf' % call_str + + # Operator setup + a = Symbol(name='a', dtype=dtype) + b = Scalar(name='b', dtype=dtype) + + eq = Eq(a, sym(b)) + op = Operator(eq) + + # Ensure the generated function call has the correct form + assert call_str in str(op) + + +@pytest.mark.parametrize('dtype', [np.complex64, np.complex128]) +def test_complex_override(dtype: np.dtype[np.complexfloating]) -> None: + """ + Tests overriding complex values in op.apply(). + """ + grid = Grid(shape=(5, 5)) + x, y = grid.dimensions + + c = Constant(name='c', dtype=dtype, value=1.0 + 0.0j) + u = Function(name='u', grid=grid, dtype=dtype) + eq = Eq(u, x * x.spacing + c * y * y.spacing) + op = Operator(eq) + op.apply(c=2.0 + 1.0j) + + # Check against numpy result + dx, dy = grid.spacing_map.values() + xx, yy = np.meshgrid(np.linspace(0, 4, 5, dtype=dtype), + np.linspace(0, 4, 5, dtype=dtype)) + expected = xx * dx + yy * dy * dtype(2.0 + 1.0j) + assert np.allclose(u.data.T, expected) + + +@pytest.mark.parametrize('dtype', [np.complex64, np.complex128]) +def test_complex_time_deriv(dtype: np.dtype[np.complexfloating]) -> None: + """ + Tests taking the time derivative of a complex-valued function. 
+ """ + grid = Grid(shape=(5, 5)) + x, y = grid.dimensions + t = grid.time_dim + + f = TimeFunction(name='f', grid=grid, space_order=2, dtype=dtype) + g = Function(name='g', grid=grid, dtype=dtype) + eqns = [Eq(f.forward, t * x * x.spacing * (1.0 + 0.0j) + + t * y * y.spacing * (0.0 + 1.0j)), + Eq(g, f.dt)] + op = Operator(eqns) + op.apply(time=10, dt=1.0) + + # Check against expected result + dx, dy = grid.spacing_map.values() + xx, yy = np.meshgrid(np.linspace(0, 4, 5, dtype=dtype), + np.linspace(0, 4, 5, dtype=dtype)) + expected = xx * dx + yy * dy * dtype(0.0 + 1.0j) + assert np.allclose(g.data.T, expected) + + +@pytest.mark.parametrize('dtype', [np.complex64, np.complex128]) +def test_complex_space_deriv(dtype: np.dtype[np.complexfloating]) -> None: + """ + Tests taking the space derivative of a complex-valued function, with + respect to the real and imaginary axes. + """ + grid = Grid(shape=(7, 7), dtype=dtype) + x, y = grid.dimensions + + # Operator setup + f = Function(name='f', grid=grid, space_order=2) + g = Function(name='g', grid=grid) + h = Function(name='h', grid=grid) + eqns = [Eq(f, x * x.spacing + y * y.spacing), + Eq(g, f.dx, subdomain=grid.interior), + Eq(h, f.dy, subdomain=grid.interior)] + op = Operator(eqns) + + dx = 1.0 + 0.0j + dy = 0.0 + 1.0j + op.apply(h_x=dx, h_y=dy) + + # Check against expected result (1 within the interior) + dfdx = g.data.T[1:-1, 1:-1] + dfdy = h.data.T[1:-1, 1:-1] + assert np.allclose(dfdx, np.ones((5, 5), dtype=dtype)) + assert np.allclose(dfdy, np.ones((5, 5), dtype=dtype)) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index a22ab93df4..13239687bc 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -2,12 +2,13 @@ import pytest import numpy as np +import sympy import scipy.sparse from conftest import assert_structure from devito import (Constant, Eq, Inc, Grid, Function, ConditionalDimension, Dimension, MatrixSparseTimeFunction, SparseTimeFunction, - SubDimension, SubDomain, 
SubDomainSet, TimeFunction, + SubDimension, SubDomain, SubDomainSet, TimeFunction, exp, Operator, configuration, switchconfig, TensorTimeFunction, Buffer, assign) from devito.arch import get_gpu_info, get_cpu_info, Device, Cpu64 @@ -76,6 +77,25 @@ def test_maxpar_option(self): assert trees[0][0] is trees[1][0] assert trees[0][1] is not trees[1][1] + @pytest.mark.parametrize('dtype', [np.complex64, np.complex128]) + def test_complex(self, dtype): + grid = Grid((5, 5)) + x, y = grid.dimensions + + c = Constant(name='c', dtype=dtype) + u = Function(name="u", grid=grid, dtype=dtype) + + eq = Eq(u, x + sympy.I*y + exp(sympy.I + x.spacing) * c) + op = Operator(eq) + op(c=1.0 + 2.0j) + + # Check against numpy + dx = grid.spacing_map[x.spacing] + xx, yy = np.meshgrid(np.linspace(0, 4, 5), np.linspace(0, 4, 5)) + npres = xx + 1j*yy + np.exp(1j + dx) * (1.0 + 2.0j) + + assert np.allclose(u.data, npres.T, rtol=5e-7, atol=0) + class TestPassesOptional: diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index bdf732a12d..8c4813db0b 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -2,7 +2,7 @@ import numpy as np from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Eq, Operator, - norm, solve) + norm, solve, Max) from conftest import skipif, assert_blocking, opts_device_tiling from devito.data import LEFT from devito.exceptions import InvalidOperator @@ -171,6 +171,17 @@ def test_multi_tile_blocking_structure(self): assert len(iters) == len(v) assert all(i.step == j for i, j in zip(iters, v)) + def test_std_max(self): + grid = Grid(shape=(3, 3, 3)) + x, y, z = grid.dimensions + + u = Function(name='u', grid=grid) + + op = Operator(Eq(u, Max(1.2 * x / y, 2.3 * y / x)), + platform='nvidiaX', language='openacc') + + assert '' in str(op) + class TestOperator: diff --git a/tests/test_grid.py b/tests/test_grid.py new file mode 100644 index 0000000000..5753e17d7b --- /dev/null +++ b/tests/test_grid.py @@ -0,0 +1,27 @@ +import 
numpy as np +import pytest + +from devito import Grid + + +# Unsigned ints are unreasonable but not necessarily invalid +@pytest.mark.parametrize('dtype', [np.float16, np.float32, np.float64, np.longdouble, + np.complex64, np.complex128, np.int8, np.int16, + np.int32, np.int64, np.uint8, np.uint16, np.uint32, + np.uint64]) +def test_extent_dtypes(dtype: np.dtype[np.number]) -> None: + """ + Test that grid spacings are correctly computed for different dtypes. + """ + + # Construct a grid with the dtype and retrieve the spacing values + extent = (1, 1j) if np.issubdtype(dtype, np.complexfloating) else (2, 4) + grid = Grid(shape=(5, 5), extent=extent, dtype=dtype) + dx, dy = grid.spacing_map.values() + + # Check that the spacings have the correct dtype + assert dx.dtype == dy.dtype == dtype + + # Check that the spacings have the correct values + assert dx == dtype(extent[0] / 4) + assert dy == dtype(extent[1] / 4) diff --git a/tests/test_ir.py b/tests/test_ir.py index e6ea3f89e3..350de3ae20 100644 --- a/tests/test_ir.py +++ b/tests/test_ir.py @@ -5,6 +5,7 @@ from conftest import EVAL, skipif # noqa from devito import (Eq, Inc, Grid, Constant, Function, TimeFunction, # noqa Operator, Dimension, SubDimension, switchconfig) +from devito.ir.cgen import ccode from devito.ir.equations import LoweredEq from devito.ir.equations.algorithms import dimension_sort from devito.ir.iet import Iteration, FindNodes @@ -14,7 +15,7 @@ from devito.ir.support.space import (NullInterval, Interval, Forward, Backward, IterationSpace) from devito.ir.support.guards import GuardOverflow -from devito.symbolics import DefFunction, FieldFromPointer, ccode +from devito.symbolics import DefFunction, FieldFromPointer from devito.tools import prod from devito.types import Array, CriticalRegion, Jump, Scalar, Symbol diff --git a/tests/test_linearize.py b/tests/test_linearize.py index 1b531e6ab8..56f31ebe1f 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -190,7 +190,7 @@ def 
test_codegen_quality0(mode): # Only four access macros necessary, namely `uL0`, `bufL0`, `bufL1` # for the efunc args # (the other three obviously are _POSIX_C_SOURCE, START, STOP) - assert len(op._headers) == 6 + assert len(op.headers) == 6 def test_codegen_quality1(): @@ -212,7 +212,7 @@ def test_codegen_quality1(): # Only two access macros necessary, namely `uL0` and `r1L0` (the other five # obviously are _POSIX_C_SOURCE, MIN, MAX, START, STOP) - assert len(op._headers) == 6 + assert len(op.headers) == 6 def test_pow(): diff --git a/tests/test_mpi.py b/tests/test_mpi.py index fed3ae0905..8761069ff4 100644 --- a/tests/test_mpi.py +++ b/tests/test_mpi.py @@ -21,10 +21,7 @@ from devito.tools import Bunch from examples.seismic.acoustic import acoustic_setup -try: - from tests.test_dse import TestTTI -except ImportError: - TestTTI = None +from test_dse import TestTTI class TestDistributor: diff --git a/tests/test_operator.py b/tests/test_operator.py index d5759c1c92..edd5e9ec9c 100644 --- a/tests/test_operator.py +++ b/tests/test_operator.py @@ -9,14 +9,15 @@ SparseFunction, SparseTimeFunction, Dimension, error, SpaceDimension, NODE, CELL, dimensions, configuration, TensorFunction, TensorTimeFunction, VectorFunction, VectorTimeFunction, - div, grad, switchconfig) + div, grad, switchconfig, exp) from devito import Inc, Le, Lt, Ge, Gt # noqa from devito.exceptions import InvalidOperator from devito.finite_differences.differentiable import diff2sympy from devito.ir.equations import ClusterizedEq from devito.ir.equations.algorithms import lower_exprs from devito.ir.iet import (Callable, Conditional, Expression, Iteration, TimedList, - FindNodes, IsPerfectIteration, retrieve_iteration_tree) + FindNodes, IsPerfectIteration, retrieve_iteration_tree, + FindSymbols) from devito.ir.support import Any, Backward, Forward from devito.passes.iet.languages.C import CDataManager from devito.symbolics import ListInitializer, indexify, retrieve_indexed @@ -348,6 +349,19 @@ def 
test_nested_lowering_indexify(self): assert np.all(u0.data[1, :] == 4) assert np.all(u0.data[2, :] == 8) + def test_scalar_type(self): + grid = Grid(shape=(4, 4), dtype=np.float32) + u = Function(name='u', grid=grid, space_order=4) + + eq = Eq(u, u.laplace) + op0 = Operator(eq) + scalars = [s for s in FindSymbols().visit(op0) if s.name.startswith('r')] + assert all(s.dtype == np.float32 for s in scalars) + + op1 = Operator(eq, opt=('advanced', {'scalar-min-type': np.float64})) + scalars = [s for s in FindSymbols().visit(op1) if s.name.startswith('r')] + assert all(s.dtype == np.float64 for s in scalars) + class TestArithmetic: @@ -640,6 +654,25 @@ def test_tensor(self, func1): op2 = Operator([Eq(f, f.dx) for f in f1.values()]) assert str(op1.ccode) == str(op2.ccode) + @pytest.mark.parametrize('dtype', [np.complex64, np.complex128]) + def test_complex(self, dtype): + grid = Grid((5, 5)) + x, y = grid.dimensions + + c = Constant(name='c', dtype=dtype) + u = Function(name="u", grid=grid, dtype=dtype) + + eq = Eq(u, x + sympy.I*y + exp(sympy.I + x.spacing) * c) + op = Operator(eq) + op(c=1.0 + 2.0j) + + # Check against numpy + dx = grid.spacing_map[x.spacing] + xx, yy = np.meshgrid(np.linspace(0, 4, 5), np.linspace(0, 4, 5)) + npres = xx + 1j*yy + np.exp(1j + dx) * (1.0 + 2.0j) + + assert np.allclose(u.data, npres.T, rtol=1e-7, atol=0) + class TestAllocation: @@ -724,10 +757,10 @@ def verify_parameters(self, parameters, expected): """ boilerplate = ['timers'] parameters = [p.name for p in parameters] - for exp in expected: - if exp not in parameters + boilerplate: - error("Missing parameter: %s" % exp) - assert exp in parameters + boilerplate + for expi in expected: + if expi not in parameters + boilerplate: + error("Missing parameter: %s" % expi) + assert expi in parameters + boilerplate extra = [p for p in parameters if p not in expected and p not in boilerplate] if len(extra) > 0: error("Redundant parameters: %s" % str(extra)) diff --git a/tests/test_pickle.py 
b/tests/test_pickle.py index 373b4dca29..df8c3a79b2 100644 --- a/tests/test_pickle.py +++ b/tests/test_pickle.py @@ -1,3 +1,4 @@ +import ctypes import pickle as pickle0 import cloudpickle as pickle1 @@ -22,7 +23,8 @@ from devito.types.basic import BoundSymbol, AbstractSymbol from devito.tools import EnrichedTuple from devito.symbolics import (IntDiv, ListInitializer, FieldFromPointer, - CallFromPointer, DefFunction) + CallFromPointer, DefFunction, Cast, SizeOf, + pow_to_mul) from examples.seismic import (demo_model, AcquisitionGeometry, TimeAxis, RickerSource, Receiver) @@ -581,14 +583,45 @@ def test_equation(self, pickle): eq = Eq(f, f+1, implicit_dims=xs) - pkl_eq = pickle0.dumps(eq) - new_eq = pickle0.loads(pkl_eq) + pkl_eq = pickle.dumps(eq) + new_eq = pickle.loads(pkl_eq) assert new_eq.lhs.name == f.name assert str(new_eq.rhs) == 'f(x) + 1' assert new_eq.implicit_dims[0].name == 'xs' assert new_eq.implicit_dims[0].factor.data == 4 + @pytest.mark.parametrize('typ', [ctypes.c_float, 'struct truct']) + def test_Cast(self, pickle, typ): + a = Symbol('a') + un = Cast(a, dtype=typ) + + pkl_un = pickle.dumps(un) + new_un = pickle.loads(pkl_un) + + assert un == new_un + + @pytest.mark.parametrize('typ', [ctypes.c_float, 'struct truct']) + def test_SizeOf(self, pickle, typ): + un = SizeOf(typ) + + pkl_un = pickle.dumps(un) + new_un = pickle.loads(pkl_un) + + assert un == new_un + + def test_pow_to_mul(self, pickle): + grid = Grid(shape=(3,)) + f = Function(name='f', grid=grid) + expr = pow_to_mul(f ** 2) + + assert expr.is_Mul + + pkl_expr = pickle.dumps(expr) + new_expr = pickle.loads(pkl_expr) + + assert new_expr.is_Mul + class TestAdvanced: diff --git a/tests/test_symbolics.py b/tests/test_symbolics.py index 2bae5679c8..9644270a6f 100644 --- a/tests/test_symbolics.py +++ b/tests/test_symbolics.py @@ -9,12 +9,12 @@ Operator, SubDimension, norm, Le, Ge, Gt, Lt, Abs, sin, cos, Min, Max) from devito.finite_differences.differentiable import SafeInv, Weights -from 
devito.ir import Expression, FindNodes +from devito.ir import Expression, FindNodes, ccode from devito.symbolics import (retrieve_functions, retrieve_indexed, evalrel, # noqa CallFromPointer, Cast, DefFunction, FieldFromPointer, INT, FieldFromComposite, IntDiv, Namespace, Rvalue, - ReservedWord, ListInitializer, ccode, uxreplace, - retrieve_derivatives) + ReservedWord, ListInitializer, uxreplace, + retrieve_derivatives, BaseCast) from devito.tools import as_tuple from devito.types import (Array, Bundle, FIndexed, LocalObject, Object, ComponentAccess, StencilDimension, Symbol as dSymbol) @@ -112,6 +112,19 @@ def test_modified_sympy_assumptions(): assert s2 == s1 +def test_real(): + for dtype in [np.float32, np.complex64]: + c = Constant(name='c', dtype=dtype) + assert c.is_real is not np.iscomplexobj(dtype(0)) + assert c.is_imaginary is np.iscomplexobj(dtype(0)) + f = Function(name='f', dtype=dtype, grid=Grid((11,))) + assert f.is_real is not np.iscomplexobj(dtype(0)) + assert f.is_imaginary is np.iscomplexobj(dtype(0)) + s = dSymbol(name='s', dtype=dtype) + assert s.is_real is not np.iscomplexobj(dtype(0)) + assert s.is_imaginary is np.iscomplexobj(dtype(0)) + + def test_constant(): c = Constant(name='c') @@ -409,8 +422,8 @@ def test_rvalue(): def test_cast(): s = Symbol(name='s', dtype=np.float32) - class BarCast(Cast): - _base_typ = 'bar' + class BarCast(BaseCast): + _dtype = 'bar' v = BarCast(s, '**') assert ccode(v) == '(bar**)s'