diff --git a/Dockerfile_CPU b/Dockerfile_CPU
index 370d361..92011c1 100644
--- a/Dockerfile_CPU
+++ b/Dockerfile_CPU
@@ -1,8 +1,9 @@
-FROM python:3
+FROM ubuntu:latest
+RUN apt-get update && apt-get -y upgrade && apt-get -y install python3-pip build-essential
 WORKDIR /usr/src/app
-RUN pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+RUN pip3 install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
 COPY requirements.txt joey/
-RUN pip install -r joey/requirements.txt
+RUN pip3 install -r joey/requirements.txt
 COPY . joey/
-RUN pip install -e joey
+RUN pip3 install -e joey
 WORKDIR /usr/src/app/joey
diff --git a/joey/base.py b/joey/base.py
index f970f78..b0bbcce 100644
--- a/joey/base.py
+++ b/joey/base.py
@@ -47,7 +47,9 @@ def __init__(self, kernel_size,
                        dim_allocator_func)

         if generate_code:
-            self._op = Operator(self.equations())
+            eqs, args = self.equations()
+            self._arg_dict = dict(args)
+            self._op = Operator(eqs)
             self._op.cfunction

     @property
@@ -120,14 +122,14 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,

     @abstractmethod
     def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
-        self._op.apply()
+        self._op.apply(**self._arg_dict)
         return self._R.data

     @abstractmethod
-    def equations(self, input_function=None) -> list:
+    def equations(self) -> (list, list):
         pass

     @abstractmethod
     def backprop_equations(self, prev_layer, next_layer,
-                           batch_constant, backward_arg_dict) -> list:
+                           batch_constant) -> (list, list):
         pass
diff --git a/joey/layers.py b/joey/layers.py
index 4091606..ba48483 100644
--- a/joey/layers.py
+++ b/joey/layers.py
@@ -3,9 +3,9 @@
 from joey import activation
 from joey import default_name_allocator as alloc
 from joey import default_dim_allocator as dim_alloc
-from devito import Grid, Function, Constant, Eq, Inc, Ne, \
+from devito import Grid, Function, Constant, Eq, Inc, \
     ConditionalDimension
-from sympy import exp, Max, And
+from sympy import exp, Max, And, Min, sign
 import numpy as np


@@ -72,13 +72,15 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
         map_width = input_size[3] + 2 * self._padding[1]
         _, _, kernel_height, kernel_width = kernel_size

-        gridK = Grid(shape=kernel_size, dimensions=dim_allocator_func(4))
+        t1, t2, t3, t4, t5, t6, t7, t8, t9, t10 = dim_allocator_func(10)
+
+        gridK = Grid(shape=kernel_size, dimensions=(t1, t2, t3, t4))
         K = Function(name=name_allocator_func(), grid=gridK, space_order=0,
                      dtype=np.float64)

         gridB = Grid(shape=(input_size[0], input_size[1], map_height,
                             map_width),
-                     dimensions=dim_allocator_func(4))
+                     dimensions=(t5, t6, t7, t8))
         B = Function(name=name_allocator_func(), grid=gridB, space_order=0,
                      dtype=np.float64)

@@ -87,12 +89,12 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
                             // self._stride[0],
                             (map_width - kernel_width + self._stride[1])
                             // self._stride[1]),
-                     dimensions=dim_allocator_func(4))
+                     dimensions=(t5, t1, t9, t10))
         R = Function(name=name_allocator_func(), grid=gridR, space_order=0,
                      dtype=np.float64)

         bias_grid = Grid(shape=kernel_size[0],
-                         dimensions=dim_allocator_func(1))
+                         dimensions=(t1,))
         bias = Function(name=name_allocator_func(), grid=bias_grid,
                         space_order=0, dtype=np.float64)

@@ -103,7 +105,7 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
                                grid=Grid(shape=(gridR.shape[1],
                                                 gridR.shape[2],
                                                 gridR.shape[3]),
-                                         dimensions=dim_allocator_func(3)),
+                                         dimensions=(t1, t9, t10)),
                                space_order=0, dtype=np.float64)
         bias_grad = Function(name=name_allocator_func(),
@@ -133,32 +135,28 @@ def execute(self, input_data, bias, kernel_data=None):

         return super().execute()

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
-
+    def equations(self):
         a, b, c, d = self._R.dimensions
         _, _, kernel_height, kernel_width = self._kernel_size
-        batch_size, channels, _, _ = input_function.shape
+        batch_size, channels, _, _ = self._I.shape

         e, f, g, h = self._K.dimensions

-        rhs = sum([self._K[e, f, x, y] *
-                   input_function[a, f, self._stride[0] * c + x,
-                                   self._stride[1] * d + y]
-                   for x in range(kernel_height)
-                   for y in range(kernel_width)])
+        rhs = self._K[b, f, g, h] * \
+            self._I[a, f, self._stride[0] * c + g,
+                    self._stride[1] * d + h]

-        eqs = [Inc(self._R[a, e, c, d], rhs),
-               Inc(self._R[a, e, c, d], self._bias[e])]
+        eqs = [Inc(self._R[a, b, c, d], rhs)]

         if self._activation is not None:
-            eqs.append(Eq(self._R[a, e, c, d],
-                          self._activation(self._R[a, e, c, d])))
+            eqs.append(Eq(self._R[a, b, c, d],
+                          self._activation(self._R[a, b, c, d] +
+                                           self._bias[b])))
+        else:
+            eqs.append(Inc(self._R[a, b, c, d], self._bias[b]))

-        return eqs
+        return (eqs, [])

-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict=None):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         layer = self

         kernel_dims = layer.kernel_gradients.dimensions
@@ -168,7 +166,7 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
         eqs = [Eq(layer.bias_gradients,
                   batch_constant * layer.bias_gradients),
                Inc(layer.bias_gradients[bias_dims[0]],
-                   layer.result_gradients[bias_dims[0], dims[1], dims[2]]),
+                   layer.result_gradients[dims[0], dims[1], dims[2]]),
                Eq(layer.bias_gradients,
                   layer.bias_gradients / (batch_constant + 1))]
@@ -198,9 +196,9 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
                                          kernel_dims[2], kernel_dims[3]],
                    layer.result_gradients[kernel_dims[0],
                                           dims[1], dims[2]] *
-                   next_layer.result[batch_constant, kernel_dims[1],
-                                     kernel_dims[2] + dims[1],
-                                     kernel_dims[3] + dims[2]]),
+                   layer.input[batch_constant, kernel_dims[1],
+                               kernel_dims[2] + dims[1],
+                               kernel_dims[3] + dims[2]]),
                Eq(layer.kernel_gradients,
                   layer.kernel_gradients / (batch_constant + 1)),
                Eq(next_layer.result_gradients, 0),
@@ -232,7 +230,7 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
                Eq(layer.kernel_gradients,
                   layer.kernel_gradients / (batch_constant + 1))]

-        return eqs
+        return (eqs, [])


 class Pooling(Layer):
@@ -292,20 +290,20 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
         map_width = input_size[3] + 2 * self._padding[1]
         kernel_height, kernel_width = kernel_size

-        a, b, c, d = dim_allocator_func(4)
+        t1, t2, t3, t4, t5, t6 = dim_allocator_func(6)
+
         gridB = Grid(shape=(input_size[0], input_size[1], map_height,
                             map_width),
-                     dimensions=(a, b, c, d))
+                     dimensions=(t1, t2, t3, t4))
         B = Function(name=name_allocator_func(), grid=gridB, space_order=0,
                      dtype=np.float64)

-        e, f, g, h = dim_allocator_func(4)
         gridR = Grid(shape=(input_size[0], input_size[1],
                             (map_height - kernel_height + self._stride[0])
                             // self._stride[0],
                             (map_width - kernel_width + self._stride[1])
                             // self._stride[1]),
-                     dimensions=(e, f, g, h))
+                     dimensions=(t1, t2, t5, t6))
         R = Function(name=name_allocator_func(), grid=gridR, space_order=0,
                      dtype=np.float64)

@@ -314,7 +312,7 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
                                grid=Grid(shape=(gridR.shape[1],
                                                 gridR.shape[2],
                                                 gridR.shape[3]),
-                                         dimensions=dim_allocator_func(3)),
+                                         dimensions=(t2, t5, t6)),
                                space_order=0, dtype=np.float64)

         return (None, B, R, None, None, output_grad, None)
@@ -342,85 +340,91 @@ def execute(self, input_data):

         return super().execute()

     @abstractmethod
-    def equations(self, input_function=None):
+    def equations(self):
         pass

     @abstractmethod
-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         pass


 class MaxPooling(Pooling):
     def __init__(self, *args, **kwargs):
+        self._indices = None
+        self._forward_tmp_constants = None
+        self._backward_tmp_constants = None
         super().__init__(*args, **kwargs)

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
+    def equations(self):
+        if self._forward_tmp_constants is None:
+            self._forward_tmp_constants = \
+                [Constant(name=alloc(), dtype=np.float64)]
+
+        if self._indices is None:
+            self._indices = \
+                Function(name=alloc(),
+                         grid=self._R.grid,
+                         space_order=0,
+                         dtype=np.int32)

         a, b, c, d = self._R.dimensions
         kernel_height, kernel_width = self._kernel_size
-
-        rhs = Max(*[input_function[a, b,
-                                   self._stride[0] * c + i,
-                                   self._stride[1] * d + j]
-                    for i in range(kernel_height)
-                    for j in range(kernel_width)])
+        i, j = dim_alloc(2)
+
+        args = [(i.name + '_M', kernel_height - 1),
+                (j.name + '_M', kernel_width - 1)]
+
+        old = self._forward_tmp_constants[0]
+
+        cond1 = abs(sign(self._R[a, b, c, d] - old)) * kernel_width * \
+            kernel_height
+        cond2 = abs(sign(self._I[a, b, self._stride[0] * c + i,
+                                 self._stride[1] * d + j] -
+                         self._R[a, b, c, d])) * kernel_width * kernel_height
+
+        eqs = [Eq(self._indices, kernel_height * kernel_width),
+               Eq(self._R[a, b, c, d], self._I[a, b,
+                                               self._stride[0] * c,
+                                               self._stride[1] * d]),
+               Eq(old, self._R[a, b, c, d], implicit_dims=(i, j)),
+               Eq(self._R[a, b, c, d], Max(self._R[a, b, c, d],
+                                           self._I[a, b,
+                                                   self._stride[0] * c + i,
+                                                   self._stride[1] * d + j])),
+               Eq(self._indices[a, b, c, d],
+                  Min(self._indices[a, b, c, d] + cond1,
+                      i * kernel_width + j + cond2))]

         if self._activation is not None:
-            rhs = self._activation(rhs)
+            eqs.append(Eq(self._R, self._activation(self._R)))

-        return [Eq(self._R[a, b, c, d], rhs)]
+        return (eqs, args)

-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         if next_layer is None:
-            return []
-
-        layer = self
-
-        a, b = dim_alloc(2)
-        backward_arg_dict[a.name + '_M'] = layer.kernel_size[0] - 1
-        backward_arg_dict[b.name + '_M'] = layer.kernel_size[1] - 1
-        processed = Function(name=alloc(), grid=layer.result.grid,
-                             space_order=0, dtype=np.float64)
-
-        dims = layer.result.dimensions
-
-        # The first dimension corresponding to a batch index must be
-        # discarded here.
-        dims = dims[1:]
-
-        stride_rows, stride_cols = layer.stride
-
-        cd1 = ConditionalDimension(name=alloc(), parent=b,
-                                   condition=And(Ne(processed[batch_constant,
-                                                              dims[0],
-                                                              dims[1],
-                                                              dims[2]], 1),
-                                                 ~Ne(next_layer
-                                                     .result[batch_constant,
-                                                             dims[0],
-                                                             stride_rows *
-                                                             dims[1] + a,
-                                                             stride_cols *
-                                                             dims[2] + b],
-                                                     layer.result[batch_constant,
-                                                                  dims[0],
-                                                                  dims[1],
-                                                                  dims[2]])))
-
-        return [Eq(next_layer.result_gradients, 0),
-                Eq(processed, 0),
-                Eq(next_layer.result_gradients[dims[0], stride_rows * dims[1] +
-                                               a, stride_cols * dims[2] + b],
-                   layer.result_gradients[dims[0], dims[1], dims[2]],
-                   implicit_dims=cd1),
-                Eq(processed[batch_constant, dims[0], dims[1], dims[2]],
-                   1, implicit_dims=(a, b, cd1))] + \
-            next_layer.activation.backprop_eqs(next_layer,
-                                               batch_constant)
+            return ([], [])
+
+        if self._backward_tmp_constants is None:
+            self._backward_tmp_constants = \
+                [Constant(name=alloc(), dtype=np.int32),
+                 Constant(name=alloc(), dtype=np.int32)]
+
+        dims = self._R.dimensions
+        stride_rows, stride_cols = self.stride
+
+        index = self._indices[batch_constant, dims[1], dims[2], dims[3]]
+        a = self._backward_tmp_constants[0]
+        b = self._backward_tmp_constants[1]
+
+        return ([Eq(next_layer.result_gradients, 0),
+                 Eq(a, index // 2),
+                 Eq(b, index % 2),
+                 Inc(next_layer.result_gradients[dims[1],
+                                                 stride_rows * dims[2] + a,
+                                                 stride_cols * dims[3] + b],
+                     self.result_gradients[dims[1], dims[2], dims[3]])] +
+                next_layer.activation.backprop_eqs(next_layer,
+                                                   batch_constant), [])


 class FullyConnected(Layer):
@@ -438,25 +442,25 @@ def _allocate(self, weight_size, input_size, name_allocator_func,
                   dim_allocator_func):
         self._input_is_vector = type(input_size) == int

-        self._dimensions = dim_allocator_func(3)
-        a, b, c = self._dimensions
+        t1, t2, t3 = dim_allocator_func(3)
+        self._dimensions = (t1, t2, t3)

-        gridW = Grid(shape=weight_size, dimensions=(a, b))
+        gridW = Grid(shape=weight_size, dimensions=(t1, t2))
         W = Function(name=name_allocator_func(), grid=gridW, space_order=0,
                      dtype=np.float64)

         if self._input_is_vector:
-            gridV_dimensions = (b,)
-            gridR_dimensions = (a,)
+            gridV_dimensions = (t2,)
+            gridR_dimensions = (t1,)
             gridR_shape = weight_size[0]
             output_grad_grid = Grid(shape=gridR_shape,
-                                    dimensions=dim_allocator_func(1))
+                                    dimensions=gridR_dimensions)
         else:
-            gridV_dimensions = (b, c)
-            gridR_dimensions = (a, c)
+            gridV_dimensions = (t2, t3)
+            gridR_dimensions = (t1, t3)
             gridR_shape = (weight_size[0], input_size[1])
             output_grad_grid = Grid(shape=weight_size[0],
-                                    dimensions=dim_allocator_func(1))
+                                    dimensions=(t1,))

         gridV = Grid(shape=input_size, dimensions=gridV_dimensions)
         V = Function(name=name_allocator_func(), grid=gridV, space_order=0,
@@ -471,7 +475,7 @@ def _allocate(self, weight_size, input_size, name_allocator_func,
                                space_order=0, dtype=np.float64)

         bias_grid = Grid(shape=weight_size[0],
-                         dimensions=dim_allocator_func(1))
+                         dimensions=(t1,))
         bias = Function(name=name_allocator_func(), grid=bias_grid,
                         space_order=0, dtype=np.float64)

@@ -501,26 +505,22 @@ def execute(self, input_data, bias, weight_data=None):

         return super().execute()

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
-
+    def equations(self):
         a, b, c = self._dimensions

         if self._input_is_vector:
-            eqs = [Inc(self._R[a], self._K[a, b] * input_function[b])]
+            eqs = [Inc(self._R[a], self._K[a, b] * self._I[b])]
         else:
-            eqs = [Inc(self._R[a, c], self._K[a, b] * input_function[b, c])]
-
-        eqs.append(Inc(self._R[a, c], self._bias[a]))
+            eqs = [Inc(self._R[a, c],
+                       self._K[a, b] * self._I[b, c])]

         if self._activation is not None:
-            eqs.append(Eq(self._R, self._activation(self._R)))
+            eqs.append(Eq(self._R, self._activation(self._bias[a] + self._R)))
+        else:
+            eqs.append(Inc(self._R[a, c], self._bias[a]))

-        return eqs
+        return (eqs, [])

-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict=None):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         layer = self

         dims = layer.result_gradients.dimensions
@@ -528,41 +528,42 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
         kernel_dims = layer.kernel_gradients.dimensions

         if prev_layer is None:
-            return [Eq(layer.bias_gradients,
-                       layer.bias_gradients * batch_constant),
-                    Inc(layer.bias_gradients[bias_dims[0]],
-                        layer.result_gradients[bias_dims[0]]),
-                    Eq(layer.bias_gradients,
-                       layer.bias_gradients / (batch_constant + 1)),
-                    Eq(layer.kernel_gradients,
-                       layer.kernel_gradients * batch_constant),
-                    Inc(layer.kernel_gradients[kernel_dims[0], kernel_dims[1]],
-                        next_layer.result[kernel_dims[1],
-                                          batch_constant] *
-                        layer.result_gradients[kernel_dims[0]]),
-                    Eq(layer.kernel_gradients,
-                       layer.kernel_gradients / (batch_constant + 1))]
+            return ([Eq(layer.bias_gradients,
+                        layer.bias_gradients * batch_constant),
+                     Inc(layer.bias_gradients[bias_dims[0]],
+                         layer.result_gradients[bias_dims[0]]),
+                     Eq(layer.bias_gradients,
+                        layer.bias_gradients / (batch_constant + 1)),
+                     Eq(layer.kernel_gradients,
+                        layer.kernel_gradients * batch_constant),
+                     Inc(layer.kernel_gradients[kernel_dims[0],
+                                                kernel_dims[1]],
+                         layer.input[kernel_dims[1],
+                                     batch_constant] *
+                         layer.result_gradients[kernel_dims[0]]),
+                     Eq(layer.kernel_gradients,
+                        layer.kernel_gradients / (batch_constant + 1))], [])

         prev_dims = prev_layer.result_gradients.dimensions

-        return [Eq(layer.result_gradients, 0),
-                Inc(layer.result_gradients[dims[0]],
-                    prev_layer.kernel[prev_dims[0], dims[0]] *
-                    prev_layer.result_gradients[prev_dims[0]])] + \
-            layer.activation.backprop_eqs(layer, batch_constant) + \
-            [Eq(layer.bias_gradients,
-                layer.bias_gradients * batch_constant),
-             Inc(layer.bias_gradients[bias_dims[0]],
-                 layer.result_gradients[bias_dims[0]]),
-             Eq(layer.bias_gradients,
-                layer.bias_gradients / (batch_constant + 1)),
-             Eq(layer.kernel_gradients,
-                layer.kernel_gradients * batch_constant),
-             Inc(layer.kernel_gradients[kernel_dims[0], kernel_dims[1]],
-                 next_layer.result[kernel_dims[1], batch_constant] *
-                 layer.result_gradients[kernel_dims[0]]),
-             Eq(layer.kernel_gradients,
-                layer.kernel_gradients / (batch_constant + 1))]
+        return ([Eq(layer.result_gradients, 0),
+                 Inc(layer.result_gradients[dims[0]],
+                     prev_layer.kernel[prev_dims[0], dims[0]] *
+                     prev_layer.result_gradients[prev_dims[0]])] +
+                layer.activation.backprop_eqs(layer, batch_constant) +
+                [Eq(layer.bias_gradients,
+                    layer.bias_gradients * batch_constant),
+                 Inc(layer.bias_gradients[bias_dims[0]],
+                     layer.result_gradients[bias_dims[0]]),
+                 Eq(layer.bias_gradients,
+                    layer.bias_gradients / (batch_constant + 1)),
+                 Eq(layer.kernel_gradients,
+                    layer.kernel_gradients * batch_constant),
+                 Inc(layer.kernel_gradients[kernel_dims[0], kernel_dims[1]],
+                     layer.input[kernel_dims[1], batch_constant] *
+                     layer.result_gradients[kernel_dims[0]]),
+                 Eq(layer.kernel_gradients,
+                    layer.kernel_gradients / (batch_constant + 1))], [])


 class FullyConnectedSoftmax(FullyConnected):
@@ -575,37 +576,34 @@ def __init__(self, weight_size, input_size, name_allocator_func=alloc,
         super().__init__(weight_size, input_size, name_allocator_func,
                          dim_allocator_func, activation.Dummy(), generate_code)

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
-
+    def equations(self):
         if self._input_is_vector:
-            return self._equations_vector(input_function)
+            return (self._equations_vector(), [])
         else:
-            return self._equations_matrix(input_function)
+            return (self._equations_matrix(), [])

-    def _equations_vector(self, input_function):
+    def _equations_vector(self):
         C = Constant(name=self._name_allocator())
         a, b, c = self._dimensions

-        return [Inc(self._T[a], self._K[a, b] * input_function[b]),
+        return [Inc(self._T[a], self._K[a, b] * self._I[b]),
                 Inc(self._T, self._bias),
                 Eq(C, sum([exp(self._T[i])
                            for i in range(self._R.shape[0])])),
                 Eq(self._R, exp(self._T) / C)]

-    def _equations_matrix(self, input_function):
-        gridC = Grid(shape=self._R.shape[1], dimensions=self._dim_allocator(1))
+    def _equations_matrix(self):
+        a, b, c = self._dimensions
+
+        gridC = Grid(shape=self._R.shape[1], dimensions=(c,))
         C = Function(name=self._name_allocator(), grid=gridC, space_order=0,
                      dtype=np.float64)
         M = Function(name=self._name_allocator(), grid=gridC, space_order=0,
                      dtype=np.float64)
-        x = C.dimensions[0]
-        a, b, c = self._dimensions

-        return [Inc(self._T[a, c], self._K[a, b] * input_function[b, c]),
+        return [Inc(self._T[a, c], self._K[a, b] * self._I[b, c]),
                 Inc(self._T[a, c], self._bias[a]),
-                Eq(M[x], Max(*[self._T[i, x]
+                Eq(M[c], Max(*[self._T[i, c]
                                for i in range(self._R.shape[0])])),
-                Eq(C[x], sum([exp(self._T[i, x] - M[x])
+                Eq(C[c], sum([exp(self._T[i, c] - M[c])
                               for i in range(self._R.shape[0])])),
                 Eq(self._R[a, b], exp(self._T[a, b] - M[b]) / C[b]),
                 Eq(self._T, 0)]
@@ -621,19 +619,21 @@ def __init__(self, input_size, name_allocator_func=alloc,

     def _allocate(self, kernel_size, input_size, name_allocator_func,
                   dim_allocator_func):
-        gridI = Grid(shape=input_size, dimensions=dim_allocator_func(4))
+        t1, t2, t3, t4, t5 = dim_allocator_func(5)
+
+        gridI = Grid(shape=input_size, dimensions=(t1, t2, t3, t4))
         I = Function(name=name_allocator_func(), grid=gridI, space_order=0,
                      dtype=np.float64)

         gridR = Grid(shape=(input_size[1]*input_size[2]*input_size[3],
                             input_size[0]),
-                     dimensions=dim_allocator_func(2))
+                     dimensions=(t5, t1))
         R = Function(name=name_allocator_func(), grid=gridR, space_order=0,
                      dtype=np.float64)

         output_grad = Function(name=name_allocator_func(),
                                grid=Grid(shape=gridR.shape[0],
-                                         dimensions=dim_allocator_func(1)),
+                                         dimensions=(t5,)),
                                space_order=0, dtype=np.float64)

         return (None, I, R, None, None, output_grad, None)
@@ -642,18 +642,14 @@ def execute(self, input_data):
         self._I.data[:] = input_data
         return super().execute()

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
-
-        _, b, c, d = input_function.dimensions
-        batch_size, channels, height, width = input_function.shape
+    def equations(self):
+        _, b, c, d = self._I.dimensions
+        batch_size, channels, height, width = self._I.shape

-        return [Eq(self._R[b * height * width + c * height + d, a],
-                   input_function[a, b, c, d]) for a in range(batch_size)]
+        return ([Eq(self._R[b * height * width + c * height + d, a],
+                    self._I[a, b, c, d]) for a in range(batch_size)], [])

-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict=None):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         layer = self

         prev_kernel_dims = prev_layer.kernel_gradients.dimensions
@@ -662,13 +658,14 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
         _, height, width = next_layer.result_gradients.shape
         next_dims = next_layer.result_gradients.dimensions

-        return [Eq(layer.result_gradients, 0),
-                Inc(layer.result_gradients[dims[0]],
-                    prev_layer.kernel[prev_kernel_dims[0], dims[0]] *
-                    prev_layer.result_gradients[prev_kernel_dims[0]]),
-                Eq(next_layer.result_gradients[next_dims[0], next_dims[1],
-                                               next_dims[2]],
-                   layer.result_gradients[next_dims[0] * height * width +
-                                          next_dims[1] * height +
-                                          next_dims[2]])] + \
-            next_layer.activation.backprop_eqs(next_layer, batch_constant)
+        return ([Eq(layer.result_gradients, 0),
+                 Inc(layer.result_gradients[dims[0]],
+                     prev_layer.kernel[prev_kernel_dims[0], dims[0]] *
+                     prev_layer.result_gradients[prev_kernel_dims[0]]),
+                 Eq(next_layer.result_gradients[next_dims[0], next_dims[1],
+                                                next_dims[2]],
+                    layer.result_gradients[next_dims[0] * height * width +
+                                           next_dims[1] * height +
+                                           next_dims[2]])] +
+                next_layer.activation.backprop_eqs(next_layer, batch_constant),
+                [])
diff --git a/joey/net.py b/joey/net.py
index b69cb89..dc9d033 100644
--- a/joey/net.py
+++ b/joey/net.py
@@ -11,10 +11,17 @@ class Net:
     def __init__(self, layers: list):
         self._layers = layers
         self._batch_constant = Constant(name='batch', dtype=np.int32)
+        self._forward_arg_dict = {}
         self._backward_arg_dict = {}

-        eqs = self._gen_eqs()
-        backprop_eqs = self._gen_backprop_eqs()
+        eqs, args = self._gen_eqs()
+        backprop_eqs, backprop_args = self._gen_backprop_eqs()
+
+        for (key, value) in args:
+            self._forward_arg_dict[key] = value
+
+        for (key, value) in backprop_args:
+            self._backward_arg_dict[key] = value

         parameter_lists = list(map(ml.Layer.pytorch_parameters, self._layers))
         parameters = []
@@ -35,23 +42,34 @@ def __init__(self, layers: list):
     def _init_parameters(self):
         for layer in self._layers:
             if layer.kernel is not None:
-                layer.kernel.data[:] = np.random.rand(*layer.kernel.shape) - 0.5
+                layer.kernel.data[:] = \
+                    np.random.rand(*layer.kernel.shape) - 0.5

             if layer.bias is not None:
                 layer.bias.data[:] = np.random.rand(*layer.bias.shape) - 0.5

     def _gen_eqs(self):
         eqs = []
+        args = []
+
         input_function = None

         for layer in self._layers:
-            eqs += layer.equations(input_function=input_function)
+            if input_function is not None:
+                dims = input_function.dimensions
+                eqs.append(Eq(layer.input[dims], input_function[dims]))
+
+            layer_eqs, layer_args = layer.equations()
+
+            args += layer_args
+            eqs += layer_eqs
             input_function = layer.result

-        return eqs
+        return (eqs, args)

     def _gen_backprop_eqs(self):
         eqs = []
+        args = []

         for i in range(len(self._layers) - 1, -1, -1):
             if i < len(self._layers) - 1:
@@ -64,11 +82,14 @@ def _gen_backprop_eqs(self):
             else:
                 next_layer = None

-            eqs += self._layers[i].backprop_equations(prev_layer, next_layer,
-                                                      self._batch_constant,
-                                                      self._backward_arg_dict)
+            layer_eqs, layer_args = \
+                self._layers[i].backprop_equations(prev_layer, next_layer,
+                                                   self._batch_constant)
+
+            args += layer_args
+            eqs += layer_eqs

-        return eqs
+        return (eqs, args)

     @property
     def pytorch_parameters(self):
@@ -79,7 +100,7 @@ def forward(self, input_data):
             layer.result.data[:] = 0

         self._layers[0].input.data[:] = input_data
-        self._forward_operator.apply()
+        self._forward_operator.apply(**self._forward_arg_dict)
         return self._layers[-1].result.data

     def backward(self, loss_gradient_func, pytorch_optimizer=None):
diff --git a/tests/test_layers.py b/tests/test_layers.py
index 47cf845..cc1a0fe 100644
--- a/tests/test_layers.py
+++ b/tests/test_layers.py
@@ -1,6 +1,10 @@
 import joey
 import numpy as np
 from joey.activation import ReLU
+from devito import logger
+from utils import get_run_count
+
+logger.set_log_noperf()


 def test_conv():
@@ -13,27 +17,29 @@ def test_conv():
                                 [11, 12]],
                                [[13, -14],
                                 [-15, 16]]]]
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[0, 0, 1],
-                                       [0, 1, 0],
-                                       [0, 0, 2]]],
-                                     [[[-1, -2, -3],
-                                       [4, 6, 8],
-                                       [11, 0, 2]],
-                                      [[9, 8, 7],
-                                       [6, 5, 4],
-                                       [3, 2, 1]]]]), [0, 0])
-
-    assert(np.array_equal(output, [[[[37.5, 48],
-                                     [53, 75]],
-                                    [[130.5, 109],
-                                     [171, 253.5]]],
-                                   [[[216.5, 205],
-                                     [123, 69.5]],
-                                    [[100.5, 146],
-                                     [138, 42]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[0, 0, 1],
+                                           [0, 1, 0],
+                                           [0, 0, 2]]],
+                                         [[[-1, -2, -3],
+                                           [4, 6, 8],
+                                           [11, 0, 2]],
+                                          [[9, 8, 7],
+                                           [6, 5, 4],
+                                           [3, 2, 1]]]]), [0, 0])
+
+        assert(np.array_equal(output, [[[[37.5, 48],
+                                         [53, 75]],
+                                        [[130.5, 109],
+                                         [171, 253.5]]],
+                                       [[[216.5, 205],
+                                         [123, 69.5]],
+                                        [[100.5, 146],
+                                         [138, 42]]]]))


 def test_conv_relu():
@@ -47,27 +53,29 @@ def test_conv_relu():
                                 [11, 12]],
                                [[13, -14],
                                 [-15, 16]]]]
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[0, 0, 1],
-                                       [0, 1, 0],
-                                       [0, 0, 2]]],
-                                     [[[-1, -2, -3],
-                                       [4, 6, 8],
-                                       [11, 0, 2]],
-                                      [[9, 8, 7],
-                                       [6, 5, 4],
-                                       [3, 2, 1]]]]), [-50, -79.75])
-
-    assert(np.array_equal(output, [[[[0, 0],
-                                     [3, 25]],
-                                    [[50.75, 29.25],
-                                     [91.25, 173.75]]],
-                                   [[[166.5, 155],
-                                     [73, 19.5]],
-                                    [[20.75, 66.25],
-                                     [58.25, 0]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[0, 0, 1],
+                                           [0, 1, 0],
+                                           [0, 0, 2]]],
+                                         [[[-1, -2, -3],
+                                           [4, 6, 8],
+                                           [11, 0, 2]],
+                                          [[9, 8, 7],
+                                           [6, 5, 4],
+                                           [3, 2, 1]]]]), [-50, -79.75])
+
+        assert(np.array_equal(output, [[[[0, 0],
+                                         [3, 25]],
+                                        [[50.75, 29.25],
+                                         [91.25, 173.75]]],
+                                       [[[166.5, 155],
+                                         [73, 19.5]],
+                                        [[20.75, 66.25],
+                                         [58.25, 0]]]]))


 def test_conv_larger_stride():
@@ -81,135 +89,145 @@ def test_conv_larger_stride():
                                 [11, 12]],
                                [[13, -14],
                                 [-15, 16]]]]
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[0, 0, 1],
-                                       [0, 1, 0],
-                                       [0, 0, 2]]],
-                                     [[[-1, -2, -3],
-                                       [4, 6, 8],
-                                       [11, 0, 2]],
-                                      [[9, 8, 7],
-                                       [6, 5, 4],
-                                       [3, 2, 1]]]]), [0, 0])
-
-    assert(np.array_equal(output, [[[[37.5, 48]],
-                                    [[130.5, 109]]],
-                                   [[[216.5, 205]],
-                                    [[100.5, 146]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[0, 0, 1],
+                                           [0, 1, 0],
+                                           [0, 0, 2]]],
+                                         [[[-1, -2, -3],
+                                           [4, 6, 8],
+                                           [11, 0, 2]],
+                                          [[9, 8, 7],
+                                           [6, 5, 4],
+                                           [3, 2, 1]]]]), [0, 0])
+
+        assert(np.array_equal(output, [[[[37.5, 48]],
+                                        [[130.5, 109]]],
+                                       [[[216.5, 205]],
+                                        [[100.5, 146]]]]))


 def test_max_pooling():
     layer = joey.MaxPooling(kernel_size=(2, 2), input_size=(2, 2, 3, 3))
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[10, 11, 12],
-                                       [13, 14, 15],
-                                       [16, 17, 18]]],
-                                     [[[19, 20, 21],
-                                       [22, 23, 24],
-                                       [25, 26, 27]],
-                                      [[28, 29, 30],
-                                       [31, 32, 33],
-                                       [34, 35, 36]]]]))
-
-    assert(np.array_equal(output, [[[[5, 6],
-                                     [8, 9]],
-                                    [[14, 15],
-                                     [17, 18]]],
-                                   [[[23, 24],
-                                     [26, 27]],
-                                    [[32, 33],
-                                     [35, 36]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[10, 11, 12],
+                                           [13, 14, 15],
+                                           [16, 17, 18]]],
+                                         [[[19, 20, 21],
+                                           [22, 23, 24],
+                                           [25, 26, 27]],
+                                          [[28, 29, 30],
+                                           [31, 32, 33],
+                                           [34, 35, 36]]]]))
+
+        assert(np.array_equal(output, [[[[5, 6],
+                                         [8, 9]],
+                                        [[14, 15],
+                                         [17, 18]]],
+                                       [[[23, 24],
+                                         [26, 27]],
+                                        [[32, 33],
+                                         [35, 36]]]]))


 def test_max_pooling_larger_stride():
     layer = joey.MaxPooling(kernel_size=(2, 2), input_size=(2, 2, 3, 3),
                             stride=(1, 2), strict_stride_check=False)
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[10, 11, 12],
-                                       [13, 14, 15],
-                                       [16, 17, 18]]],
-                                     [[[19, 20, 21],
-                                       [22, 23, 24],
-                                       [25, 26, 27]],
-                                      [[28, 29, 30],
-                                       [31, 32, 33],
-                                       [34, 35, 36]]]]))
-
-    assert(np.array_equal(output, [[[[5],
-                                     [8]],
-                                    [[14],
-                                     [17]]],
-                                   [[[23],
-                                     [26]],
-                                    [[32],
-                                     [35]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[10, 11, 12],
+                                           [13, 14, 15],
+                                           [16, 17, 18]]],
+                                         [[[19, 20, 21],
+                                           [22, 23, 24],
+                                           [25, 26, 27]],
+                                          [[28, 29, 30],
+                                           [31, 32, 33],
+                                           [34, 35, 36]]]]))
+
+        assert(np.array_equal(output, [[[[5],
+                                         [8]],
+                                        [[14],
+                                         [17]]],
+                                       [[[23],
+                                         [26]],
+                                        [[32],
+                                         [35]]]]))


 def test_max_pooling_relu():
     layer = joey.MaxPooling(kernel_size=(2, 2), input_size=(2, 2, 3, 3),
                             activation=ReLU())
-    output = layer.execute(np.array([[[[-1, -2, 3],
-                                       [-4, -5, 6],
-                                       [7, 8, 9]],
-                                      [[10, 11, 12],
-                                       [13, -14, -15],
-                                       [16, -17, -18]]],
-                                     [[[19, -20, -21],
-                                       [22, -23, -24],
-                                       [25, 26, 27]],
-                                      [[28, 29, 30],
-                                       [31, 32, 33],
-                                       [34, 35, 36]]]]))
-
-    assert(np.array_equal(output, [[[[0, 6],
-                                     [8, 9]],
-                                    [[13, 12],
-                                     [16, 0]]],
-                                   [[[22, 0],
-                                     [26, 27]],
-                                    [[32, 33],
-                                     [35, 36]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[-1, -2, 3],
+                                           [-4, -5, 6],
+                                           [7, 8, 9]],
+                                          [[10, 11, 12],
+                                           [13, -14, -15],
+                                           [16, -17, -18]]],
+                                         [[[19, -20, -21],
+                                           [22, -23, -24],
+                                           [25, 26, 27]],
+                                          [[28, 29, 30],
+                                           [31, 32, 33],
+                                           [34, 35, 36]]]]))
+
+        assert(np.array_equal(output, [[[[0, 6],
+                                         [8, 9]],
+                                        [[13, 12],
+                                         [16, 0]]],
+                                       [[[22, 0],
+                                         [26, 27]],
+                                        [[32, 33],
+                                         [35, 36]]]]))


 def test_flat():
     layer = joey.Flat(input_size=(2, 2, 3, 3))
-    output = layer.execute([[[[1, 2, 3],
-                              [4, 5, 6],
-                              [7, 8, 9]],
-                             [[10, 11, 12],
-                              [13, 14, 15],
-                              [16, 17, 18]]],
-                            [[[19, 20, 21],
-                              [22, 23, 24],
-                              [25, 26, 27]],
-                             [[28, 29, 30],
-                              [31, 32, 33],
-                              [34, 35, 36]]]])
-
-    assert(np.array_equal(output, [[1, 19],
-                                   [2, 20],
-                                   [3, 21],
-                                   [4, 22],
-                                   [5, 23],
-                                   [6, 24],
-                                   [7, 25],
-                                   [8, 26],
-                                   [9, 27],
-                                   [10, 28],
-                                   [11, 29],
-                                   [12, 30],
-                                   [13, 31],
-                                   [14, 32],
-                                   [15, 33],
-                                   [16, 34],
-                                   [17, 35],
-                                   [18, 36]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute([[[[1, 2, 3],
+                                  [4, 5, 6],
+                                  [7, 8, 9]],
+                                 [[10, 11, 12],
+                                  [13, 14, 15],
+                                  [16, 17, 18]]],
+                                [[[19, 20, 21],
+                                  [22, 23, 24],
+                                  [25, 26, 27]],
+                                 [[28, 29, 30],
+                                  [31, 32, 33],
+                                  [34, 35, 36]]]])
+
+        assert(np.array_equal(output, [[1, 19],
+                                       [2, 20],
+                                       [3, 21],
+                                       [4, 22],
+                                       [5, 23],
+                                       [6, 24],
+                                       [7, 25],
+                                       [8, 26],
+                                       [9, 27],
+                                       [10, 28],
+                                       [11, 29],
+                                       [12, 30],
+                                       [13, 31],
+                                       [14, 32],
+                                       [15, 33],
+                                       [16, 34],
+                                       [17, 35],
+                                       [18, 36]]))


 def test_fully_connected():
@@ -217,9 +235,11 @@ def test_fully_connected():
     layer.kernel.data[:] = [[1, 2, 3],
                             [4, 5, 6],
                             [7, 8, 9]]
-    output = layer.execute([[-1], [1], [-2]], [4, 1, -2])
-    assert(np.array_equal(output, [[-1], [-10], [-19]]))
+    for i in range(get_run_count()):
+        output = layer.execute([[-1], [1], [-2]], [4, 1, -2])
+
+        assert(np.array_equal(output, [[-1], [-10], [-19]]))


 def test_fully_connected_relu():
@@ -228,6 +248,8 @@ def test_fully_connected_relu():
     layer.kernel.data[:] = [[1, 2, 3],
                             [4, 5, 6],
                             [7, 8, 9]]
-    output = layer.execute([[-1], [1], [-2]], [6, 1, -2])
-    assert(np.array_equal(output, [[1], [0], [0]]))
+    for i in range(get_run_count()):
+        output = layer.execute([[-1], [1], [-2]], [6, 1, -2])
+
+        assert(np.array_equal(output, [[1], [0], [0]]))
diff --git a/tests/test_lenet.py b/tests/test_lenet.py
index 601edde..5d3dd5c 100644
--- a/tests/test_lenet.py
+++ b/tests/test_lenet.py
@@ -7,7 +7,10 @@
 import torch.nn.functional as F
 import torch.optim as optim
 import numpy as np
-from utils import compare
+from utils import compare, get_run_count
+from devito import logger
+
+logger.set_log_noperf()


 # PyTorch class
@@ -145,10 +148,11 @@ def test_forward_pass(net_arguments, mnist):

     images = images.double()

-    outputs = net.forward(images.numpy())
-    pytorch_outputs = pytorch_net(images)
+    for i in range(get_run_count()):
+        outputs = net.forward(images.numpy())
+        pytorch_outputs = pytorch_net(images)

-    compare(outputs, nn.Softmax(dim=1)(pytorch_outputs))
+        compare(outputs, nn.Softmax(dim=1)(pytorch_outputs), 1e-12)


 def test_backward_pass(net_arguments, mnist):
@@ -171,34 +175,34 @@ def loss_grad(layer, b):

     images = images.double()

-    net.forward(images.numpy())
-    net.backward(loss_grad)
+    for i in range(get_run_count()):
+        net.forward(images.numpy())
+        net.backward(loss_grad)

-    criterion = nn.CrossEntropyLoss()
+        criterion = nn.CrossEntropyLoss()

-    pytorch_net.zero_grad()
-    outputs = pytorch_net(images)
-    loss = criterion(outputs, labels)
-    loss.backward()
+        pytorch_net.zero_grad()
+        outputs = pytorch_net(images)
+        loss = criterion(outputs, labels)
+        loss.backward()

-    pytorch_layers = [pytorch_net.conv1, pytorch_net.conv2,
-                      pytorch_net.fc1, pytorch_net.fc2, pytorch_net.fc3]
-    devito_layers = [layers[0], layers[2], layers[5], layers[6], layers[7]]
+        pytorch_layers = [pytorch_net.conv1, pytorch_net.conv2,
+                          pytorch_net.fc1, pytorch_net.fc2, pytorch_net.fc3]
+        devito_layers = [layers[0], layers[2], layers[5], layers[6], layers[7]]

-    for i in range(len(pytorch_layers)):
-        pytorch_layer = pytorch_layers[i]
-        devito_layer = devito_layers[i]
+        for j in range(len(pytorch_layers) - 1, -1, -1):
+            pytorch_layer = pytorch_layers[j]
+            devito_layer = devito_layers[j]

-        compare(devito_layer.kernel_gradients.data,
-                pytorch_layer.weight.grad)
+            compare(devito_layer.kernel_gradients.data,
+                    pytorch_layer.weight.grad, 1e-11)

-        compare(devito_layer.bias_gradients.data,
-                pytorch_layer.bias.grad)
+            compare(devito_layer.bias_gradients.data,
+                    pytorch_layer.bias.grad, 1e-11)


-def test_training_sgd(net_arguments, mnist):
+def run_training(net_arguments, mnist):
     mnist_train, _ = mnist
-    iterations = 100

     net, pytorch_net, layers = net_arguments
@@ -208,41 +212,48 @@ def test_training_sgd(net_arguments, mnist):

     criterion = nn.CrossEntropyLoss()

-    for i, data in enumerate(mnist_train, 0):
-        images, labels = data
+    pytorch_layers = [pytorch_net.conv1, pytorch_net.conv2,
+                      pytorch_net.fc1, pytorch_net.fc2, pytorch_net.fc3]
+    devito_layers = [layers[0], layers[2], layers[5], layers[6], layers[7]]

-        def loss_grad(layer, b):
-            gradients = []
+    images, labels = iter(mnist_train).next()

-            for j in range(10):
-                result = layer.result.data[j, b]
-                if j == labels[b]:
-                    result -= 1
-                gradients.append(result)
+    def loss_grad(layer, b):
+        gradients = []

-            return gradients
+        for j in range(10):
+            result = layer.result.data[j, b]
+            if j == labels[b]:
+                result -= 1
+            gradients.append(result)

-        images = images.double()
+        return gradients

-        net.forward(images.numpy())
-        net.backward(loss_grad, optimizer)
+    images = images.double()

-        pytorch_optimizer.zero_grad()
-        pytorch_outputs = pytorch_net(images)
-        pytorch_loss = criterion(pytorch_outputs, labels)
-        pytorch_loss.backward()
-        pytorch_optimizer.step()
+    outputs = net.forward(images.numpy())

-        if i == iterations - 1:
-            break
+    pytorch_optimizer.zero_grad()
+    pytorch_outputs = pytorch_net(images)

-    pytorch_layers = [pytorch_net.conv1, pytorch_net.conv2,
-                      pytorch_net.fc1, pytorch_net.fc2, pytorch_net.fc3]
-    devito_layers = [layers[0], layers[2], layers[5], layers[6], layers[7]]
+    compare(outputs, nn.Softmax(dim=1)(pytorch_outputs),
+            1e-12)
+
+    net.backward(loss_grad, optimizer)

-    for i in range(len(pytorch_layers)):
-        pytorch_layer = pytorch_layers[i]
-        devito_layer = devito_layers[i]
+    pytorch_loss = criterion(pytorch_outputs, labels)
+    pytorch_loss.backward()
+    pytorch_optimizer.step()

-        compare(devito_layer.kernel.data, pytorch_layer.weight)
-        compare(devito_layer.bias.data, pytorch_layer.bias)
+    for j in range(len(pytorch_layers) - 1, -1, -1):
+        pytorch_layer = pytorch_layers[j]
+        devito_layer = devito_layers[j]
+
+        compare(devito_layer.kernel.data, pytorch_layer.weight,
+                1e-12)
+        compare(devito_layer.bias.data, pytorch_layer.bias,
+                1e-12)
+
+
+def test_training_sgd(net_arguments, mnist):
+    run_training(net_arguments, mnist)
diff --git a/tests/test_simple.py b/tests/test_simple.py
index b543541..8279ef1 100644
--- a/tests/test_simple.py
+++ b/tests/test_simple.py
@@ -5,7 +5,10 @@
 import torch.nn.functional as F
 import numpy as np
 from joey.activation import ReLU
-from utils import compare
+from utils import compare, get_run_count
+from devito import logger
+
+logger.set_log_noperf()


 # PyTorch class
@@ -13,13 +16,15 @@ class Net(nn.Module):
     def __init__(self):
         super(Net, self).__init__()
         self.conv = nn.Conv2d(2, 2, 2)
-        self.fc = nn.Linear(8, 3)
+        self.fc1 = nn.Linear(8, 5)
+        self.fc2 = nn.Linear(5, 3)

     def forward(self, x):
-        x = F.max_pool2d(x, 2, stride=(1, 1))
         x = F.relu(self.conv(x))
+        x = F.max_pool2d(x, 2, stride=(1, 1))
         x = x.view(-1, self.num_flat_features(x))
-        x = self.fc(x)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
         return x

     def num_flat_features(self, x):
@@ -38,20 +43,24 @@ def num_flat_features(self, x):
 def net_arguments():
     np.random.seed(SEED)

-    layer1 = joey.MaxPooling(kernel_size=(2, 2),
-                             input_size=(1, 2, 4, 4),
-                             generate_code=False)
-    layer2 = joey.Conv(kernel_size=(2, 2, 2),
-                       input_size=(1, 2, 3, 3),
+    layer1 = joey.Conv(kernel_size=(2, 2, 2),
+                       input_size=(2, 2, 4, 4),
                        activation=ReLU(),
                        generate_code=False)
-    layer_flat = joey.Flat(input_size=(1, 2, 2, 2),
+    layer2 = joey.MaxPooling(kernel_size=(2, 2),
+                             input_size=(2, 2, 3, 3),
+                             generate_code=False)
+    layer_flat = joey.Flat(input_size=(2, 2, 2, 2),
                            generate_code=False)
-    layer3 = joey.FullyConnectedSoftmax(weight_size=(3, 8),
-                                        input_size=(8, 1),
+    layer3 = joey.FullyConnected(weight_size=(5, 8),
+                                 input_size=(8, 2),
+                                 activation=ReLU(),
+                                 generate_code=False)
+    layer4 = joey.FullyConnectedSoftmax(weight_size=(3, 5),
+                                        input_size=(5, 2),
                                         generate_code=False)
-    layers = [layer1, layer2, layer_flat, layer3]
+    layers = [layer1, layer2, layer_flat, layer3, layer4]

     net = joey.Net(layers)
@@ -59,11 +68,14 @@ def net_arguments():
     pytorch_net.double()

     with torch.no_grad():
-        pytorch_net.conv.weight[:] = torch.from_numpy(layer2.kernel.data)
-        pytorch_net.conv.bias[:] = torch.from_numpy(layer2.bias.data)
+        pytorch_net.conv.weight[:] = torch.from_numpy(layer1.kernel.data)
+        pytorch_net.conv.bias[:] = torch.from_numpy(layer1.bias.data)
+
+        pytorch_net.fc1.weight[:] = torch.from_numpy(layer3.kernel.data)
+        pytorch_net.fc1.bias[:] = torch.from_numpy(layer3.bias.data)

-        pytorch_net.fc.weight[:] = torch.from_numpy(layer3.kernel.data)
-        pytorch_net.fc.bias[:] = torch.from_numpy(layer3.bias.data)
+        pytorch_net.fc2.weight[:] = torch.from_numpy(layer4.kernel.data)
+        pytorch_net.fc2.bias[:] = torch.from_numpy(layer4.bias.data)

     return (net, pytorch_net, layers)
@@ -78,13 +90,22 @@ def test_forward_pass(net_arguments):
                            [[-1, -2, 0, 1],
                             [-2, -3, 1, 2],
                             [3, 4, 2, -1],
-                            [-2, -3, -4, 9]]]],
+                            [-2, -3, -4, 9]]],
+                           [[[5, 6, 7, 8],
+                             [9, 10, 11, 12],
+                             [13, 14, 15, 16],
+                             [17, 18, 19, 20]],
+                            [[1, 2, 0, -1],
+                             [2, 3, -1, -2],
+                             [-3, -4, -2, 1],
+                             [2, 3, 4, -9]]]],
                           dtype=np.float64)

-    outputs = net.forward(input_data)
-    pytorch_outputs = pytorch_net(torch.from_numpy(input_data))
+    for i in range(get_run_count()):
+        outputs = net.forward(input_data)
+        pytorch_outputs = pytorch_net(torch.from_numpy(input_data))

-    compare(outputs, nn.Softmax(dim=1)(pytorch_outputs))
+        compare(outputs, nn.Softmax(dim=1)(pytorch_outputs), 1e-14)


 def test_backward_pass(net_arguments):
@@ -96,40 +117,49 @@ def test_backward_pass(net_arguments):
                            [[-1, -2, 0, 1],
                             [-2, -3, 1, 2],
                             [3, 4, 2, -1],
-                            [-2, -3, -4, 9]]]],
+                            [-2, -3, -4, 9]]],
+                           [[[5, 6, 7, 8],
+                             [9, 10, 11, 12],
+                             [13, 14, 15, 16],
+                             [17, 18, 19, 20]],
+                            [[1, 2, 0, -1],
+                             [2, 3, -1, -2],
+                             [-3, -4, -2, 1],
+                             [2, 3, 4, -9]]]],
                           dtype=np.float64)

-    expected = np.array([2])
+    expected = np.array([2, 1])

     def loss_grad(layer, b):
         gradients = []

         for i in range(3):
-            result = layer.result.data[i]
-            if i == expected[0]:
+            result = layer.result.data[i, b]
+            if i == expected[b]:
                 result -= 1
             gradients.append(result)

         return gradients

-    net.forward(input_data)
-    net.backward(loss_grad)
+    for i in range(get_run_count()):
+        net.forward(input_data)
+        net.backward(loss_grad)

-    criterion = nn.CrossEntropyLoss()
+        criterion = nn.CrossEntropyLoss()

-    pytorch_net.zero_grad()
-    outputs = pytorch_net(torch.from_numpy(input_data))
-    loss = criterion(outputs, torch.from_numpy(expected))
-    loss.backward()
+        pytorch_net.zero_grad()
+        outputs = pytorch_net(torch.from_numpy(input_data))
+        loss = criterion(outputs, torch.from_numpy(expected))
+        loss.backward()

-    pytorch_layers = [pytorch_net.conv, pytorch_net.fc]
-    devito_layers = [layers[1], layers[3]]
+        pytorch_layers = [pytorch_net.conv, pytorch_net.fc1, pytorch_net.fc2]
+        devito_layers = [layers[0], layers[3], layers[4]]

-    for i in range(len(pytorch_layers)):
-        pytorch_layer = pytorch_layers[i]
-        devito_layer = devito_layers[i]
+        for j in range(len(pytorch_layers) - 1, -1, -1):
+            pytorch_layer = pytorch_layers[j]
+            devito_layer = devito_layers[j]

-        compare(devito_layer.kernel_gradients.data,
-                pytorch_layer.weight.grad)
+            compare(devito_layer.kernel_gradients.data,
+                    pytorch_layer.weight.grad, 1e-13)

-        compare(devito_layer.bias_gradients.data,
-                pytorch_layer.bias.grad)
+            compare(devito_layer.bias_gradients.data,
+                    pytorch_layer.bias.grad, 1e-13)
diff --git a/tests/utils.py b/tests/utils.py
index c68639e..002ec72 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,7 +1,8 @@
 import numpy as np
+from os import environ


-def compare(devito, pytorch):
+def compare(devito, pytorch, tolerance):
     pytorch = pytorch.detach().numpy()

     if devito.shape != pytorch.shape:
@@ -10,5 +11,18 @@ def compare(devito, pytorch):
     error = abs(devito - pytorch) / abs(pytorch)
     max_error = np.nanmax(error)

-    if max_error != np.nan:
-        assert(max_error < 10**(-9))
+    assert(np.isnan(max_error) or max_error < tolerance)
+
+
+def running_in_parallel():
+    if 'DEVITO_LANGUAGE' not in environ:
+        return False
+
+    return environ['DEVITO_LANGUAGE'] in ['openmp']
+
+
+def get_run_count():
+    if running_in_parallel():
+        return 1000
+    else:
+        return 1
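
Note (not part of the patch): the new MaxPooling.equations replaces the old symbolic Max(*[...]) reduction with an incremental scan over the pooling window that also records, in self._indices, the flat window position of the maximum; backprop_equations then decodes that flat index with index // 2 and index % 2 to route each result gradient back to the winning input cell. The snippet below replays the same sign/Min bookkeeping in plain NumPy on a single made-up 2x2 window, purely as an illustration of the trick; names and values are invented for the example.

    import numpy as np

    # Made-up 2x2 pooling window; kh * kw doubles as the "invalid index"
    # sentinel and as the penalty that disables the losing branch of Min.
    window = np.array([[3.0, 7.0],
                       [7.0, 1.0]])
    kh, kw = window.shape

    running_max = window[0, 0]   # Eq(R, I[stride * c, stride * d])
    index = kh * kw              # Eq(indices, kernel_height * kernel_width)

    for i in range(kh):
        for j in range(kw):
            old = running_max                              # Eq(old, R)
            running_max = max(running_max, window[i, j])   # Eq(R, Max(R, I))
            # cond1 != 0 exactly when the running max just changed, pushing the
            # previously stored index out of range; cond2 != 0 exactly when
            # (i, j) is not the current maximum, pushing its candidate index
            # out of range. Min keeps whichever candidate survives.
            cond1 = abs(np.sign(running_max - old)) * kh * kw
            cond2 = abs(np.sign(window[i, j] - running_max)) * kh * kw
            index = min(index + cond1, i * kw + j + cond2)

    print(running_max, index)    # 7.0 1.0 -> the max and its flat index 0 * kw + 1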