diff --git a/Dockerfile_CPU b/Dockerfile_CPU
index 370d361..92011c1 100644
--- a/Dockerfile_CPU
+++ b/Dockerfile_CPU
@@ -1,8 +1,9 @@
-FROM python:3
+FROM ubuntu:latest
+RUN apt-get update && apt-get -y upgrade && apt-get -y install python3-pip build-essential
 WORKDIR /usr/src/app
-RUN pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+RUN pip3 install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
 COPY requirements.txt joey/
-RUN pip install -r joey/requirements.txt
+RUN pip3 install -r joey/requirements.txt
 COPY . joey/
-RUN pip install -e joey
+RUN pip3 install -e joey
 WORKDIR /usr/src/app/joey
diff --git a/joey/base.py b/joey/base.py
index f970f78..b0bbcce 100644
--- a/joey/base.py
+++ b/joey/base.py
@@ -47,7 +47,9 @@ def __init__(self, kernel_size,
                        dim_allocator_func)

         if generate_code:
-            self._op = Operator(self.equations())
+            eqs, args = self.equations()
+            self._arg_dict = dict(args)
+            self._op = Operator(eqs)
             self._op.cfunction

     @property
@@ -120,14 +122,14 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,

     @abstractmethod
     def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
-        self._op.apply()
+        self._op.apply(**self._arg_dict)
         return self._R.data

     @abstractmethod
-    def equations(self, input_function=None) -> list:
+    def equations(self) -> (list, list):
         pass

     @abstractmethod
     def backprop_equations(self, prev_layer, next_layer,
-                           batch_constant, backward_arg_dict) -> list:
+                           batch_constant) -> (list, list):
         pass
diff --git a/joey/layers.py b/joey/layers.py
index 4091606..ba48483 100644
--- a/joey/layers.py
+++ b/joey/layers.py
@@ -3,9 +3,9 @@
 from joey import activation
 from joey import default_name_allocator as alloc
 from joey import default_dim_allocator as dim_alloc
-from devito import Grid, Function, Constant, Eq, Inc, Ne, \
+from devito import Grid, Function, Constant, Eq, Inc, \
     ConditionalDimension
-from sympy import exp, Max, And
+from sympy import exp, Max, And, Min, sign
 import numpy as np


@@ -72,13 +72,15 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
         map_width = input_size[3] + 2 * self._padding[1]
         _, _, kernel_height, kernel_width = kernel_size

-        gridK = Grid(shape=kernel_size, dimensions=dim_allocator_func(4))
+        t1, t2, t3, t4, t5, t6, t7, t8, t9, t10 = dim_allocator_func(10)
+
+        gridK = Grid(shape=kernel_size, dimensions=(t1, t2, t3, t4))
         K = Function(name=name_allocator_func(), grid=gridK, space_order=0,
                      dtype=np.float64)

         gridB = Grid(shape=(input_size[0], input_size[1], map_height,
                             map_width),
-                     dimensions=dim_allocator_func(4))
+                     dimensions=(t5, t6, t7, t8))
         B = Function(name=name_allocator_func(), grid=gridB, space_order=0,
                      dtype=np.float64)

@@ -87,12 +89,12 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
                             // self._stride[0],
                             (map_width - kernel_width + self._stride[1])
                             // self._stride[1]),
-                     dimensions=dim_allocator_func(4))
+                     dimensions=(t5, t1, t9, t10))
         R = Function(name=name_allocator_func(), grid=gridR, space_order=0,
                      dtype=np.float64)

         bias_grid = Grid(shape=kernel_size[0],
-                         dimensions=dim_allocator_func(1))
+                         dimensions=(t1,))
         bias = Function(name=name_allocator_func(), grid=bias_grid,
                         space_order=0, dtype=np.float64)

@@ -103,7 +105,7 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
                                grid=Grid(shape=(gridR.shape[1],
                                                 gridR.shape[2],
                                                 gridR.shape[3]),
-                                         dimensions=dim_allocator_func(3)),
+                                         dimensions=(t1, t9, t10)),
                                space_order=0, dtype=np.float64)
         bias_grad = Function(name=name_allocator_func(),
@@ -133,32 +135,28 @@ def execute(self, input_data, bias, kernel_data=None):

         return super().execute()

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
-
+    def equations(self):
         a, b, c, d = self._R.dimensions
         _, _, kernel_height, kernel_width = self._kernel_size
-        batch_size, channels, _, _ = input_function.shape
+        batch_size, channels, _, _ = self._I.shape

         e, f, g, h = self._K.dimensions

-        rhs = sum([self._K[e, f, x, y] *
-                   input_function[a, f, self._stride[0] * c + x,
-                                   self._stride[1] * d + y]
-                   for x in range(kernel_height)
-                   for y in range(kernel_width)])
+        rhs = self._K[b, f, g, h] * \
+            self._I[a, f, self._stride[0] * c + g,
+                    self._stride[1] * d + h]

-        eqs = [Inc(self._R[a, e, c, d], rhs),
-               Inc(self._R[a, e, c, d], self._bias[e])]
+        eqs = [Inc(self._R[a, b, c, d], rhs)]

         if self._activation is not None:
-            eqs.append(Eq(self._R[a, e, c, d],
-                          self._activation(self._R[a, e, c, d])))
+            eqs.append(Eq(self._R[a, b, c, d],
+                          self._activation(self._R[a, b, c, d] +
+                                           self._bias[b])))
+        else:
+            eqs.append(Inc(self._R[a, b, c, d], self._bias[b]))

-        return eqs
+        return (eqs, [])

-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict=None):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         layer = self

         kernel_dims = layer.kernel_gradients.dimensions
@@ -168,7 +166,7 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
         eqs = [Eq(layer.bias_gradients,
                   batch_constant * layer.bias_gradients),
                Inc(layer.bias_gradients[bias_dims[0]],
-                   layer.result_gradients[bias_dims[0], dims[1], dims[2]]),
+                   layer.result_gradients[dims[0], dims[1], dims[2]]),
                Eq(layer.bias_gradients,
                   layer.bias_gradients / (batch_constant + 1))]
@@ -198,9 +196,9 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
                                          kernel_dims[2], kernel_dims[3]],
                    layer.result_gradients[kernel_dims[0],
                                           dims[1], dims[2]] *
-                   next_layer.result[batch_constant, kernel_dims[1],
-                                     kernel_dims[2] + dims[1],
-                                     kernel_dims[3] + dims[2]]),
+                   layer.input[batch_constant, kernel_dims[1],
+                               kernel_dims[2] + dims[1],
+                               kernel_dims[3] + dims[2]]),
                Eq(layer.kernel_gradients,
                   layer.kernel_gradients / (batch_constant + 1)),
                Eq(next_layer.result_gradients, 0),
@@ -232,7 +230,7 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
                Eq(layer.kernel_gradients,
                   layer.kernel_gradients / (batch_constant + 1))]

-        return eqs
+        return (eqs, [])


 class Pooling(Layer):
@@ -292,20 +290,20 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
         map_width = input_size[3] + 2 * self._padding[1]
         kernel_height, kernel_width = kernel_size

-        a, b, c, d = dim_allocator_func(4)
+        t1, t2, t3, t4, t5, t6 = dim_allocator_func(6)
+
         gridB = Grid(shape=(input_size[0], input_size[1], map_height,
                             map_width),
-                     dimensions=(a, b, c, d))
+                     dimensions=(t1, t2, t3, t4))
         B = Function(name=name_allocator_func(), grid=gridB, space_order=0,
                      dtype=np.float64)

-        e, f, g, h = dim_allocator_func(4)
         gridR = Grid(shape=(input_size[0], input_size[1],
                             (map_height - kernel_height + self._stride[0])
                             // self._stride[0],
                             (map_width - kernel_width + self._stride[1])
                             // self._stride[1]),
-                     dimensions=(e, f, g, h))
+                     dimensions=(t1, t2, t5, t6))
         R = Function(name=name_allocator_func(), grid=gridR, space_order=0,
                      dtype=np.float64)

@@ -314,7 +312,7 @@ def _allocate(self, kernel_size, input_size, name_allocator_func,
                                grid=Grid(shape=(gridR.shape[1],
                                                 gridR.shape[2],
                                                 gridR.shape[3]),
-                                         dimensions=dim_allocator_func(3)),
+                                         dimensions=(t2, t5, t6)),
                                space_order=0, dtype=np.float64)

         return (None, B, R, None, None, output_grad, None)
@@ -342,85 +340,91 @@ def execute(self, input_data):

         return super().execute()

     @abstractmethod
-    def equations(self, input_function=None):
+    def equations(self):
         pass

     @abstractmethod
-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         pass


 class MaxPooling(Pooling):
     def __init__(self, *args, **kwargs):
+        self._indices = None
+        self._forward_tmp_constants = None
+        self._backward_tmp_constants = None
         super().__init__(*args, **kwargs)

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
+    def equations(self):
+        if self._forward_tmp_constants is None:
+            self._forward_tmp_constants = \
+                [Constant(name=alloc(), dtype=np.float64)]
+
+        if self._indices is None:
+            self._indices = \
+                Function(name=alloc(),
+                         grid=self._R.grid,
+                         space_order=0,
+                         dtype=np.int32)

         a, b, c, d = self._R.dimensions
         kernel_height, kernel_width = self._kernel_size
-
-        rhs = Max(*[input_function[a, b,
-                                   self._stride[0] * c + i,
-                                   self._stride[1] * d + j]
-                    for i in range(kernel_height)
-                    for j in range(kernel_width)])
+        i, j = dim_alloc(2)
+
+        args = [(i.name + '_M', kernel_height - 1),
+                (j.name + '_M', kernel_width - 1)]
+
+        old = self._forward_tmp_constants[0]
+
+        cond1 = abs(sign(self._R[a, b, c, d] - old)) * kernel_width * \
+            kernel_height
+        cond2 = abs(sign(self._I[a, b, self._stride[0] * c + i,
+                                 self._stride[1] * d + j] -
+                         self._R[a, b, c, d])) * kernel_width * kernel_height
+
+        eqs = [Eq(self._indices, kernel_height * kernel_width),
+               Eq(self._R[a, b, c, d], self._I[a, b,
+                                               self._stride[0] * c,
+                                               self._stride[1] * d]),
+               Eq(old, self._R[a, b, c, d], implicit_dims=(i, j)),
+               Eq(self._R[a, b, c, d], Max(self._R[a, b, c, d],
+                                           self._I[a, b,
+                                                   self._stride[0] * c + i,
+                                                   self._stride[1] * d + j])),
+               Eq(self._indices[a, b, c, d],
+                  Min(self._indices[a, b, c, d] + cond1,
+                      i * kernel_width + j + cond2))]

         if self._activation is not None:
-            rhs = self._activation(rhs)
+            eqs.append(Eq(self._R, self._activation(self._R)))

-        return [Eq(self._R[a, b, c, d], rhs)]
+        return (eqs, args)

-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         if next_layer is None:
-            return []
-
-        layer = self
-
-        a, b = dim_alloc(2)
-        backward_arg_dict[a.name + '_M'] = layer.kernel_size[0] - 1
-        backward_arg_dict[b.name + '_M'] = layer.kernel_size[1] - 1
-        processed = Function(name=alloc(), grid=layer.result.grid,
-                             space_order=0, dtype=np.float64)
-
-        dims = layer.result.dimensions
-
-        # The first dimension corresponding to a batch index must be
-        # discarded here.
-        dims = dims[1:]
-
-        stride_rows, stride_cols = layer.stride
-
-        cd1 = ConditionalDimension(name=alloc(), parent=b,
-                                   condition=And(Ne(processed[batch_constant,
-                                                              dims[0],
-                                                              dims[1],
-                                                              dims[2]], 1),
-                                                 ~Ne(next_layer
-                                                     .result[batch_constant,
-                                                             dims[0],
-                                                             stride_rows *
-                                                             dims[1] + a,
-                                                             stride_cols *
-                                                             dims[2] + b],
-                                                     layer.result[batch_constant,
-                                                                  dims[0],
-                                                                  dims[1],
-                                                                  dims[2]])))
-
-        return [Eq(next_layer.result_gradients, 0),
-                Eq(processed, 0),
-                Eq(next_layer.result_gradients[dims[0], stride_rows * dims[1] +
-                                               a, stride_cols * dims[2] + b],
-                   layer.result_gradients[dims[0], dims[1], dims[2]],
-                   implicit_dims=cd1),
-                Eq(processed[batch_constant, dims[0], dims[1], dims[2]],
-                   1, implicit_dims=(a, b, cd1))] + \
-            next_layer.activation.backprop_eqs(next_layer,
-                                               batch_constant)
+            return ([], [])
+
+        if self._backward_tmp_constants is None:
+            self._backward_tmp_constants = \
+                [Constant(name=alloc(), dtype=np.int32),
+                 Constant(name=alloc(), dtype=np.int32)]
+
+        dims = self._R.dimensions
+        stride_rows, stride_cols = self.stride
+
+        index = self._indices[batch_constant, dims[1], dims[2], dims[3]]
+        a = self._backward_tmp_constants[0]
+        b = self._backward_tmp_constants[1]
+
+        return ([Eq(next_layer.result_gradients, 0),
+                 Eq(a, index // 2),
+                 Eq(b, index % 2),
+                 Inc(next_layer.result_gradients[dims[1],
+                                                 stride_rows * dims[2] + a,
+                                                 stride_cols * dims[3] + b],
+                     self.result_gradients[dims[1], dims[2], dims[3]])] +
+                next_layer.activation.backprop_eqs(next_layer,
+                                                   batch_constant), [])


 class FullyConnected(Layer):
@@ -438,25 +442,25 @@ def _allocate(self, weight_size, input_size, name_allocator_func,
                   dim_allocator_func):
         self._input_is_vector = type(input_size) == int

-        self._dimensions = dim_allocator_func(3)
-        a, b, c = self._dimensions
+        t1, t2, t3 = dim_allocator_func(3)
+        self._dimensions = (t1, t2, t3)

-        gridW = Grid(shape=weight_size, dimensions=(a, b))
+        gridW = Grid(shape=weight_size, dimensions=(t1, t2))
         W = Function(name=name_allocator_func(), grid=gridW, space_order=0,
                      dtype=np.float64)

         if self._input_is_vector:
-            gridV_dimensions = (b,)
-            gridR_dimensions = (a,)
+            gridV_dimensions = (t2,)
+            gridR_dimensions = (t1,)
             gridR_shape = weight_size[0]
             output_grad_grid = Grid(shape=gridR_shape,
-                                    dimensions=dim_allocator_func(1))
+                                    dimensions=gridR_dimensions)
         else:
-            gridV_dimensions = (b, c)
-            gridR_dimensions = (a, c)
+            gridV_dimensions = (t2, t3)
+            gridR_dimensions = (t1, t3)
             gridR_shape = (weight_size[0], input_size[1])
             output_grad_grid = Grid(shape=weight_size[0],
-                                    dimensions=dim_allocator_func(1))
+                                    dimensions=(t1,))

         gridV = Grid(shape=input_size, dimensions=gridV_dimensions)
         V = Function(name=name_allocator_func(), grid=gridV, space_order=0,
@@ -471,7 +475,7 @@ def _allocate(self, weight_size, input_size, name_allocator_func,
                                space_order=0, dtype=np.float64)

         bias_grid = Grid(shape=weight_size[0],
-                         dimensions=dim_allocator_func(1))
+                         dimensions=(t1,))
         bias = Function(name=name_allocator_func(), grid=bias_grid,
                         space_order=0, dtype=np.float64)

@@ -501,26 +505,22 @@ def execute(self, input_data, bias, weight_data=None):

         return super().execute()

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
-
+    def equations(self):
         a, b, c = self._dimensions

         if self._input_is_vector:
-            eqs = [Inc(self._R[a], self._K[a, b] * input_function[b])]
+            eqs = [Inc(self._R[a], self._K[a, b] * self._I[b])]
         else:
-            eqs = [Inc(self._R[a, c], self._K[a, b] * input_function[b, c])]
-
-        eqs.append(Inc(self._R[a, c], self._bias[a]))
+            eqs = [Inc(self._R[a, c],
+                       self._K[a, b] * self._I[b, c])]

         if self._activation is not None:
-            eqs.append(Eq(self._R, self._activation(self._R)))
+            eqs.append(Eq(self._R, self._activation(self._bias[a] + self._R)))
+        else:
+            eqs.append(Inc(self._R[a, c], self._bias[a]))

-        return eqs
+        return (eqs, [])

-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict=None):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         layer = self

         dims = layer.result_gradients.dimensions
@@ -528,41 +528,42 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
         kernel_dims = layer.kernel_gradients.dimensions

         if prev_layer is None:
-            return [Eq(layer.bias_gradients,
-                       layer.bias_gradients * batch_constant),
-                    Inc(layer.bias_gradients[bias_dims[0]],
-                        layer.result_gradients[bias_dims[0]]),
-                    Eq(layer.bias_gradients,
-                       layer.bias_gradients / (batch_constant + 1)),
-                    Eq(layer.kernel_gradients,
-                       layer.kernel_gradients * batch_constant),
-                    Inc(layer.kernel_gradients[kernel_dims[0], kernel_dims[1]],
-                        next_layer.result[kernel_dims[1],
-                                          batch_constant] *
-                        layer.result_gradients[kernel_dims[0]]),
-                    Eq(layer.kernel_gradients,
-                       layer.kernel_gradients / (batch_constant + 1))]
+            return ([Eq(layer.bias_gradients,
+                        layer.bias_gradients * batch_constant),
+                     Inc(layer.bias_gradients[bias_dims[0]],
+                         layer.result_gradients[bias_dims[0]]),
+                     Eq(layer.bias_gradients,
+                        layer.bias_gradients / (batch_constant + 1)),
+                     Eq(layer.kernel_gradients,
+                        layer.kernel_gradients * batch_constant),
+                     Inc(layer.kernel_gradients[kernel_dims[0],
+                                                kernel_dims[1]],
+                         layer.input[kernel_dims[1],
+                                     batch_constant] *
+                         layer.result_gradients[kernel_dims[0]]),
+                     Eq(layer.kernel_gradients,
+                        layer.kernel_gradients / (batch_constant + 1))], [])

         prev_dims = prev_layer.result_gradients.dimensions

-        return [Eq(layer.result_gradients, 0),
-                Inc(layer.result_gradients[dims[0]],
-                    prev_layer.kernel[prev_dims[0], dims[0]] *
-                    prev_layer.result_gradients[prev_dims[0]])] + \
-            layer.activation.backprop_eqs(layer, batch_constant) + \
-            [Eq(layer.bias_gradients,
-                layer.bias_gradients * batch_constant),
-             Inc(layer.bias_gradients[bias_dims[0]],
-                 layer.result_gradients[bias_dims[0]]),
-             Eq(layer.bias_gradients,
-                layer.bias_gradients / (batch_constant + 1)),
-             Eq(layer.kernel_gradients,
-                layer.kernel_gradients * batch_constant),
-             Inc(layer.kernel_gradients[kernel_dims[0], kernel_dims[1]],
-                 next_layer.result[kernel_dims[1], batch_constant] *
-                 layer.result_gradients[kernel_dims[0]]),
-             Eq(layer.kernel_gradients,
-                layer.kernel_gradients / (batch_constant + 1))]
+        return ([Eq(layer.result_gradients, 0),
+                 Inc(layer.result_gradients[dims[0]],
+                     prev_layer.kernel[prev_dims[0], dims[0]] *
+                     prev_layer.result_gradients[prev_dims[0]])] +
+                layer.activation.backprop_eqs(layer, batch_constant) +
+                [Eq(layer.bias_gradients,
+                    layer.bias_gradients * batch_constant),
+                 Inc(layer.bias_gradients[bias_dims[0]],
+                     layer.result_gradients[bias_dims[0]]),
+                 Eq(layer.bias_gradients,
+                    layer.bias_gradients / (batch_constant + 1)),
+                 Eq(layer.kernel_gradients,
+                    layer.kernel_gradients * batch_constant),
+                 Inc(layer.kernel_gradients[kernel_dims[0], kernel_dims[1]],
+                     layer.input[kernel_dims[1], batch_constant] *
+                     layer.result_gradients[kernel_dims[0]]),
+                 Eq(layer.kernel_gradients,
+                    layer.kernel_gradients / (batch_constant + 1))], [])


 class FullyConnectedSoftmax(FullyConnected):
@@ -575,37 +576,34 @@ def __init__(self, weight_size, input_size, name_allocator_func=alloc,
         super().__init__(weight_size, input_size, name_allocator_func,
                          dim_allocator_func, activation.Dummy(), generate_code)

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
-
+    def equations(self):
         if self._input_is_vector:
-            return self._equations_vector(input_function)
+            return (self._equations_vector(), [])
         else:
-            return self._equations_matrix(input_function)
+            return (self._equations_matrix(), [])

-    def _equations_vector(self, input_function):
+    def _equations_vector(self):
         C = Constant(name=self._name_allocator())
         a, b, c = self._dimensions

-        return [Inc(self._T[a], self._K[a, b] * input_function[b]),
+        return [Inc(self._T[a], self._K[a, b] * self._I[b]),
                 Inc(self._T, self._bias),
                 Eq(C, sum([exp(self._T[i])
                            for i in range(self._R.shape[0])])),
                 Eq(self._R, exp(self._T) / C)]

-    def _equations_matrix(self, input_function):
-        gridC = Grid(shape=self._R.shape[1], dimensions=self._dim_allocator(1))
+    def _equations_matrix(self):
+        a, b, c = self._dimensions
+
+        gridC = Grid(shape=self._R.shape[1], dimensions=(c,))
         C = Function(name=self._name_allocator(), grid=gridC, space_order=0,
                      dtype=np.float64)
         M = Function(name=self._name_allocator(), grid=gridC, space_order=0,
                      dtype=np.float64)
-        x = C.dimensions[0]
-        a, b, c = self._dimensions

-        return [Inc(self._T[a, c], self._K[a, b] * input_function[b, c]),
+        return [Inc(self._T[a, c], self._K[a, b] * self._I[b, c]),
                 Inc(self._T[a, c], self._bias[a]),
-                Eq(M[x], Max(*[self._T[i, x]
+                Eq(M[c], Max(*[self._T[i, c]
                                for i in range(self._R.shape[0])])),
-                Eq(C[x], sum([exp(self._T[i, x] - M[x])
+                Eq(C[c], sum([exp(self._T[i, c] - M[c])
                               for i in range(self._R.shape[0])])),
                 Eq(self._R[a, b], exp(self._T[a, b] - M[b]) / C[b]),
                 Eq(self._T, 0)]
@@ -621,19 +619,21 @@ def __init__(self, input_size, name_allocator_func=alloc,

     def _allocate(self, kernel_size, input_size, name_allocator_func,
                   dim_allocator_func):
-        gridI = Grid(shape=input_size, dimensions=dim_allocator_func(4))
+        t1, t2, t3, t4, t5 = dim_allocator_func(5)
+
+        gridI = Grid(shape=input_size, dimensions=(t1, t2, t3, t4))
         I = Function(name=name_allocator_func(), grid=gridI, space_order=0,
                      dtype=np.float64)

         gridR = Grid(shape=(input_size[1]*input_size[2]*input_size[3],
                             input_size[0]),
-                     dimensions=dim_allocator_func(2))
+                     dimensions=(t5, t1))
         R = Function(name=name_allocator_func(), grid=gridR, space_order=0,
                      dtype=np.float64)

         output_grad = Function(name=name_allocator_func(),
                                grid=Grid(shape=gridR.shape[0],
-                                         dimensions=dim_allocator_func(1)),
+                                         dimensions=(t5,)),
                                space_order=0, dtype=np.float64)

         return (None, I, R, None, None, output_grad, None)
@@ -642,18 +642,14 @@ def execute(self, input_data):
         self._I.data[:] = input_data
         return super().execute()

-    def equations(self, input_function=None):
-        if input_function is None:
-            input_function = self._I
-
-        _, b, c, d = input_function.dimensions
-        batch_size, channels, height, width = input_function.shape
+    def equations(self):
+        _, b, c, d = self._I.dimensions
+        batch_size, channels, height, width = self._I.shape

-        return [Eq(self._R[b * height * width + c * height + d, a],
-                   input_function[a, b, c, d]) for a in range(batch_size)]
+        return ([Eq(self._R[b * height * width + c * height + d, a],
+                    self._I[a, b, c, d]) for a in range(batch_size)], [])

-    def backprop_equations(self, prev_layer, next_layer, batch_constant,
-                           backward_arg_dict=None):
+    def backprop_equations(self, prev_layer, next_layer, batch_constant):
         layer = self

         prev_kernel_dims = prev_layer.kernel_gradients.dimensions
@@ -662,13 +658,14 @@ def backprop_equations(self, prev_layer, next_layer, batch_constant,
         _, height, width = next_layer.result_gradients.shape
         next_dims = next_layer.result_gradients.dimensions

-        return [Eq(layer.result_gradients, 0),
-                Inc(layer.result_gradients[dims[0]],
-                    prev_layer.kernel[prev_kernel_dims[0], dims[0]] *
-                    prev_layer.result_gradients[prev_kernel_dims[0]]),
-                Eq(next_layer.result_gradients[next_dims[0], next_dims[1],
-                                               next_dims[2]],
-                   layer.result_gradients[next_dims[0] * height * width +
-                                          next_dims[1] * height +
-                                          next_dims[2]])] + \
-            next_layer.activation.backprop_eqs(next_layer, batch_constant)
+        return ([Eq(layer.result_gradients, 0),
+                 Inc(layer.result_gradients[dims[0]],
+                     prev_layer.kernel[prev_kernel_dims[0], dims[0]] *
+                     prev_layer.result_gradients[prev_kernel_dims[0]]),
+                 Eq(next_layer.result_gradients[next_dims[0], next_dims[1],
+                                                next_dims[2]],
+                    layer.result_gradients[next_dims[0] * height * width +
+                                           next_dims[1] * height +
+                                           next_dims[2]])] +
+                next_layer.activation.backprop_eqs(next_layer, batch_constant),
+                [])
diff --git a/joey/net.py b/joey/net.py
index b69cb89..dc9d033 100644
--- a/joey/net.py
+++ b/joey/net.py
@@ -11,10 +11,17 @@ class Net:
     def __init__(self, layers: list):
         self._layers = layers
         self._batch_constant = Constant(name='batch', dtype=np.int32)
+        self._forward_arg_dict = {}
         self._backward_arg_dict = {}

-        eqs = self._gen_eqs()
-        backprop_eqs = self._gen_backprop_eqs()
+        eqs, args = self._gen_eqs()
+        backprop_eqs, backprop_args = self._gen_backprop_eqs()
+
+        for (key, value) in args:
+            self._forward_arg_dict[key] = value
+
+        for (key, value) in backprop_args:
+            self._backward_arg_dict[key] = value

         parameter_lists = list(map(ml.Layer.pytorch_parameters, self._layers))
         parameters = []
@@ -35,23 +42,34 @@ def __init__(self, layers: list):
     def _init_parameters(self):
         for layer in self._layers:
             if layer.kernel is not None:
-                layer.kernel.data[:] = np.random.rand(*layer.kernel.shape) - 0.5
+                layer.kernel.data[:] = \
+                    np.random.rand(*layer.kernel.shape) - 0.5

             if layer.bias is not None:
                 layer.bias.data[:] = np.random.rand(*layer.bias.shape) - 0.5

     def _gen_eqs(self):
         eqs = []
+        args = []
+
         input_function = None

         for layer in self._layers:
-            eqs += layer.equations(input_function=input_function)
+            if input_function is not None:
+                dims = input_function.dimensions
+                eqs.append(Eq(layer.input[dims], input_function[dims]))
+
+            layer_eqs, layer_args = layer.equations()
+
+            args += layer_args
+            eqs += layer_eqs
             input_function = layer.result

-        return eqs
+        return (eqs, args)

     def _gen_backprop_eqs(self):
         eqs = []
+        args = []

         for i in range(len(self._layers) - 1, -1, -1):
             if i < len(self._layers) - 1:
@@ -64,11 +82,14 @@ def _gen_backprop_eqs(self):
             else:
                 next_layer = None

-            eqs += self._layers[i].backprop_equations(prev_layer, next_layer,
-                                                      self._batch_constant,
-                                                      self._backward_arg_dict)
+            layer_eqs, layer_args = \
+                self._layers[i].backprop_equations(prev_layer, next_layer,
+                                                   self._batch_constant)
+
+            args += layer_args
+            eqs += layer_eqs

-        return eqs
+        return (eqs, args)

     @property
     def pytorch_parameters(self):
@@ -79,7 +100,7 @@ def forward(self, input_data):
             layer.result.data[:] = 0

         self._layers[0].input.data[:] = input_data
-        self._forward_operator.apply()
+        self._forward_operator.apply(**self._forward_arg_dict)
         return self._layers[-1].result.data

     def backward(self, loss_gradient_func, pytorch_optimizer=None):
diff --git a/tests/test_layers.py b/tests/test_layers.py
index 47cf845..cc1a0fe 100644
--- a/tests/test_layers.py
+++ b/tests/test_layers.py
@@ -1,6 +1,10 @@
 import joey
 import numpy as np
 from joey.activation import ReLU
+from devito import logger
+from utils import get_run_count
+
+logger.set_log_noperf()


 def test_conv():
@@ -13,27 +17,29 @@ def test_conv():
                                 [11, 12]],
                                [[13, -14],
                                 [-15, 16]]]]
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[0, 0, 1],
-                                       [0, 1, 0],
-                                       [0, 0, 2]]],
-                                     [[[-1, -2, -3],
-                                       [4, 6, 8],
-                                       [11, 0, 2]],
-                                      [[9, 8, 7],
-                                       [6, 5, 4],
-                                       [3, 2, 1]]]]), [0, 0])
-
-    assert(np.array_equal(output, [[[[37.5, 48],
-                                     [53, 75]],
-                                    [[130.5, 109],
-                                     [171, 253.5]]],
-                                   [[[216.5, 205],
-                                     [123, 69.5]],
-                                    [[100.5, 146],
-                                     [138, 42]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[0, 0, 1],
+                                           [0, 1, 0],
+                                           [0, 0, 2]]],
+                                         [[[-1, -2, -3],
+                                           [4, 6, 8],
+                                           [11, 0, 2]],
+                                          [[9, 8, 7],
+                                           [6, 5, 4],
+                                           [3, 2, 1]]]]), [0, 0])
+
+        assert(np.array_equal(output, [[[[37.5, 48],
+                                         [53, 75]],
+                                        [[130.5, 109],
+                                         [171, 253.5]]],
+                                       [[[216.5, 205],
+                                         [123, 69.5]],
+                                        [[100.5, 146],
+                                         [138, 42]]]]))


 def test_conv_relu():
@@ -47,27 +53,29 @@ def test_conv_relu():
                                 [11, 12]],
                                [[13, -14],
                                 [-15, 16]]]]
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[0, 0, 1],
-                                       [0, 1, 0],
-                                       [0, 0, 2]]],
-                                     [[[-1, -2, -3],
-                                       [4, 6, 8],
-                                       [11, 0, 2]],
-                                      [[9, 8, 7],
-                                       [6, 5, 4],
-                                       [3, 2, 1]]]]), [-50, -79.75])
-
-    assert(np.array_equal(output, [[[[0, 0],
-                                     [3, 25]],
-                                    [[50.75, 29.25],
-                                     [91.25, 173.75]]],
-                                   [[[166.5, 155],
-                                     [73, 19.5]],
-                                    [[20.75, 66.25],
-                                     [58.25, 0]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[0, 0, 1],
+                                           [0, 1, 0],
+                                           [0, 0, 2]]],
+                                         [[[-1, -2, -3],
+                                           [4, 6, 8],
+                                           [11, 0, 2]],
+                                          [[9, 8, 7],
+                                           [6, 5, 4],
+                                           [3, 2, 1]]]]), [-50, -79.75])
+
+        assert(np.array_equal(output, [[[[0, 0],
+                                         [3, 25]],
+                                        [[50.75, 29.25],
+                                         [91.25, 173.75]]],
+                                       [[[166.5, 155],
+                                         [73, 19.5]],
+                                        [[20.75, 66.25],
+                                         [58.25, 0]]]]))


 def test_conv_larger_stride():
@@ -81,135 +89,145 @@ def test_conv_larger_stride():
                                 [11, 12]],
                                [[13, -14],
                                 [-15, 16]]]]
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[0, 0, 1],
-                                       [0, 1, 0],
-                                       [0, 0, 2]]],
-                                     [[[-1, -2, -3],
-                                       [4, 6, 8],
-                                       [11, 0, 2]],
-                                      [[9, 8, 7],
-                                       [6, 5, 4],
-                                       [3, 2, 1]]]]), [0, 0])
-
-    assert(np.array_equal(output, [[[[37.5, 48]],
-                                    [[130.5, 109]]],
-                                   [[[216.5, 205]],
-                                    [[100.5, 146]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[0, 0, 1],
+                                           [0, 1, 0],
+                                           [0, 0, 2]]],
+                                         [[[-1, -2, -3],
+                                           [4, 6, 8],
+                                           [11, 0, 2]],
+                                          [[9, 8, 7],
+                                           [6, 5, 4],
+                                           [3, 2, 1]]]]), [0, 0])
+
+        assert(np.array_equal(output, [[[[37.5, 48]],
+                                        [[130.5, 109]]],
+                                       [[[216.5, 205]],
+                                        [[100.5, 146]]]]))


 def test_max_pooling():
     layer = joey.MaxPooling(kernel_size=(2, 2), input_size=(2, 2, 3, 3))
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[10, 11, 12],
-                                       [13, 14, 15],
-                                       [16, 17, 18]]],
-                                     [[[19, 20, 21],
-                                       [22, 23, 24],
-                                       [25, 26, 27]],
-                                      [[28, 29, 30],
-                                       [31, 32, 33],
-                                       [34, 35, 36]]]]))
-
-    assert(np.array_equal(output, [[[[5, 6],
-                                     [8, 9]],
-                                    [[14, 15],
-                                     [17, 18]]],
-                                   [[[23, 24],
-                                     [26, 27]],
-                                    [[32, 33],
-                                     [35, 36]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[10, 11, 12],
+                                           [13, 14, 15],
+                                           [16, 17, 18]]],
+                                         [[[19, 20, 21],
+                                           [22, 23, 24],
+                                           [25, 26, 27]],
+                                          [[28, 29, 30],
+                                           [31, 32, 33],
+                                           [34, 35, 36]]]]))
+
+        assert(np.array_equal(output, [[[[5, 6],
+                                         [8, 9]],
+                                        [[14, 15],
+                                         [17, 18]]],
+                                       [[[23, 24],
+                                         [26, 27]],
+                                        [[32, 33],
+                                         [35, 36]]]]))


 def test_max_pooling_larger_stride():
     layer = joey.MaxPooling(kernel_size=(2, 2), input_size=(2, 2, 3, 3),
                             stride=(1, 2), strict_stride_check=False)
-    output = layer.execute(np.array([[[[1, 2, 3],
-                                       [4, 5, 6],
-                                       [7, 8, 9]],
-                                      [[10, 11, 12],
-                                       [13, 14, 15],
-                                       [16, 17, 18]]],
-                                     [[[19, 20, 21],
-                                       [22, 23, 24],
-                                       [25, 26, 27]],
-                                      [[28, 29, 30],
-                                       [31, 32, 33],
-                                       [34, 35, 36]]]]))
-
-    assert(np.array_equal(output, [[[[5],
-                                     [8]],
-                                    [[14],
-                                     [17]]],
-                                   [[[23],
-                                     [26]],
-                                    [[32],
-                                     [35]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[1, 2, 3],
+                                           [4, 5, 6],
+                                           [7, 8, 9]],
+                                          [[10, 11, 12],
+                                           [13, 14, 15],
+                                           [16, 17, 18]]],
+                                         [[[19, 20, 21],
+                                           [22, 23, 24],
+                                           [25, 26, 27]],
+                                          [[28, 29, 30],
+                                           [31, 32, 33],
+                                           [34, 35, 36]]]]))
+
+        assert(np.array_equal(output, [[[[5],
+                                         [8]],
+                                        [[14],
+                                         [17]]],
+                                       [[[23],
+                                         [26]],
+                                        [[32],
+                                         [35]]]]))


 def test_max_pooling_relu():
     layer = joey.MaxPooling(kernel_size=(2, 2), input_size=(2, 2, 3, 3),
                             activation=ReLU())
-    output = layer.execute(np.array([[[[-1, -2, 3],
-                                       [-4, -5, 6],
-                                       [7, 8, 9]],
-                                      [[10, 11, 12],
-                                       [13, -14, -15],
-                                       [16, -17, -18]]],
-                                     [[[19, -20, -21],
-                                       [22, -23, -24],
-                                       [25, 26, 27]],
-                                      [[28, 29, 30],
-                                       [31, 32, 33],
-                                       [34, 35, 36]]]]))
-
-    assert(np.array_equal(output, [[[[0, 6],
-                                     [8, 9]],
-                                    [[13, 12],
-                                     [16, 0]]],
-                                   [[[22, 0],
-                                     [26, 27]],
-                                    [[32, 33],
-                                     [35, 36]]]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute(np.array([[[[-1, -2, 3],
+                                           [-4, -5, 6],
+                                           [7, 8, 9]],
+                                          [[10, 11, 12],
+                                           [13, -14, -15],
+                                           [16, -17, -18]]],
+                                         [[[19, -20, -21],
+                                           [22, -23, -24],
+                                           [25, 26, 27]],
+                                          [[28, 29, 30],
+                                           [31, 32, 33],
+                                           [34, 35, 36]]]]))
+
+        assert(np.array_equal(output, [[[[0, 6],
+                                         [8, 9]],
+                                        [[13, 12],
+                                         [16, 0]]],
+                                       [[[22, 0],
+                                         [26, 27]],
+                                        [[32, 33],
+                                         [35, 36]]]]))


 def test_flat():
     layer = joey.Flat(input_size=(2, 2, 3, 3))
-    output = layer.execute([[[[1, 2, 3],
-                              [4, 5, 6],
-                              [7, 8, 9]],
-                             [[10, 11, 12],
-                              [13, 14, 15],
-                              [16, 17, 18]]],
-                            [[[19, 20, 21],
-                              [22, 23, 24],
-                              [25, 26, 27]],
-                             [[28, 29, 30],
-                              [31, 32, 33],
-                              [34, 35, 36]]]])
-
-    assert(np.array_equal(output, [[1, 19],
-                                   [2, 20],
-                                   [3, 21],
-                                   [4, 22],
-                                   [5, 23],
-                                   [6, 24],
-                                   [7, 25],
-                                   [8, 26],
-                                   [9, 27],
-                                   [10, 28],
-                                   [11, 29],
-                                   [12, 30],
-                                   [13, 31],
-                                   [14, 32],
-                                   [15, 33],
-                                   [16, 34],
-                                   [17, 35],
-                                   [18, 36]]))
+
+    for i in range(get_run_count()):
+        output = layer.execute([[[[1, 2, 3],
+                                  [4, 5, 6],
+                                  [7, 8, 9]],
+                                 [[10, 11, 12],
+                                  [13, 14, 15],
+                                  [16, 17, 18]]],
+                                [[[19, 20, 21],
+                                  [22, 23, 24],
+                                  [25, 26, 27]],
+                                 [[28, 29, 30],
+                                  [31, 32, 33],
+                                  [34, 35, 36]]]])
+
+        assert(np.array_equal(output, [[1, 19],
+                                       [2, 20],
+                                       [3, 21],
+                                       [4, 22],
+                                       [5, 23],
+                                       [6, 24],
+                                       [7, 25],
+                                       [8, 26],
+                                       [9, 27],
+                                       [10, 28],
+                                       [11, 29],
+                                       [12, 30],
+                                       [13, 31],
+                                       [14, 32],
+                                       [15, 33],
+                                       [16, 34],
+                                       [17, 35],
+                                       [18, 36]]))


 def test_fully_connected():
@@ -217,9 +235,11 @@ def test_fully_connected():
     layer.kernel.data[:] = [[1, 2, 3],
                             [4, 5, 6],
                             [7, 8, 9]]
-    output = layer.execute([[-1], [1], [-2]], [4, 1, -2])
-    assert(np.array_equal(output, [[-1], [-10], [-19]]))
+    for i in range(get_run_count()):
+        output = layer.execute([[-1], [1], [-2]], [4, 1, -2])
+
+        assert(np.array_equal(output, [[-1], [-10], [-19]]))


 def test_fully_connected_relu():
@@ -228,6 +248,8 @@ def test_fully_connected_relu():
     layer.kernel.data[:] = [[1, 2, 3],
                             [4, 5, 6],
                             [7, 8, 9]]
-    output = layer.execute([[-1], [1], [-2]], [6, 1, -2])
-    assert(np.array_equal(output, [[1], [0], [0]]))
+    for i in range(get_run_count()):
+        output = layer.execute([[-1], [1], [-2]], [6, 1, -2])
+
+        assert(np.array_equal(output, [[1], [0], [0]]))
diff --git a/tests/test_lenet.py b/tests/test_lenet.py
index 601edde..5d3dd5c 100644
--- a/tests/test_lenet.py
+++ b/tests/test_lenet.py
@@ -7,7 +7,10 @@
 import torch.nn.functional as F
 import torch.optim as optim
 import numpy as np
-from utils import compare
+from utils import compare, get_run_count
+from devito import logger
+
+logger.set_log_noperf()


 # PyTorch class
@@ -145,10 +148,11 @@ def test_forward_pass(net_arguments, mnist):

     images = images.double()

-    outputs = net.forward(images.numpy())
-    pytorch_outputs = pytorch_net(images)
+    for i in range(get_run_count()):
+        outputs = net.forward(images.numpy())
+        pytorch_outputs = pytorch_net(images)

-    compare(outputs, nn.Softmax(dim=1)(pytorch_outputs))
+        compare(outputs, nn.Softmax(dim=1)(pytorch_outputs), 1e-12)


 def test_backward_pass(net_arguments, mnist):
@@ -171,34 +175,34 @@ def loss_grad(layer, b):

     images = images.double()

-    net.forward(images.numpy())
-    net.backward(loss_grad)
+    for i in range(get_run_count()):
+        net.forward(images.numpy())
+        net.backward(loss_grad)

-    criterion = nn.CrossEntropyLoss()
+        criterion = nn.CrossEntropyLoss()

-    pytorch_net.zero_grad()
-    outputs = pytorch_net(images)
-    loss = criterion(outputs, labels)
-    loss.backward()
+        pytorch_net.zero_grad()
+        outputs = pytorch_net(images)
+        loss = criterion(outputs, labels)
+        loss.backward()

-    pytorch_layers = [pytorch_net.conv1, pytorch_net.conv2,
-                      pytorch_net.fc1, pytorch_net.fc2, pytorch_net.fc3]
-    devito_layers = [layers[0], layers[2], layers[5], layers[6], layers[7]]
+        pytorch_layers = [pytorch_net.conv1, pytorch_net.conv2,
+                          pytorch_net.fc1, pytorch_net.fc2, pytorch_net.fc3]
+        devito_layers = [layers[0], layers[2], layers[5], layers[6], layers[7]]

-    for i in range(len(pytorch_layers)):
-        pytorch_layer = pytorch_layers[i]
-        devito_layer = devito_layers[i]
+        for j in range(len(pytorch_layers) - 1, -1, -1):
+            pytorch_layer = pytorch_layers[j]
+            devito_layer = devito_layers[j]

-        compare(devito_layer.kernel_gradients.data,
-                pytorch_layer.weight.grad)
+            compare(devito_layer.kernel_gradients.data,
+                    pytorch_layer.weight.grad, 1e-11)

-        compare(devito_layer.bias_gradients.data,
-                pytorch_layer.bias.grad)
+            compare(devito_layer.bias_gradients.data,
+                    pytorch_layer.bias.grad, 1e-11)


-def test_training_sgd(net_arguments, mnist):
+def run_training(net_arguments, mnist):
     mnist_train, _ = mnist
-    iterations = 100

     net, pytorch_net, layers = net_arguments
@@ -208,41 +212,48 @@ def test_training_sgd(net_arguments, mnist):

     criterion = nn.CrossEntropyLoss()

-    for i, data in enumerate(mnist_train, 0):
-        images, labels = data
+    pytorch_layers = [pytorch_net.conv1, pytorch_net.conv2,
+                      pytorch_net.fc1, pytorch_net.fc2, pytorch_net.fc3]
+    devito_layers = [layers[0], layers[2], layers[5], layers[6], layers[7]]

-        def loss_grad(layer, b):
-            gradients = []
+    images, labels = iter(mnist_train).next()

-            for j in range(10):
-                result = layer.result.data[j, b]
-                if j == labels[b]:
-                    result -= 1
-                gradients.append(result)
+    def loss_grad(layer, b):
+        gradients = []

-            return gradients
+        for j in range(10):
+            result = layer.result.data[j, b]
+            if j == labels[b]:
+                result -= 1
+            gradients.append(result)

-        images = images.double()
+        return gradients

-        net.forward(images.numpy())
-        net.backward(loss_grad, optimizer)
+    images = images.double()

-        pytorch_optimizer.zero_grad()
-        pytorch_outputs = pytorch_net(images)
-        pytorch_loss = criterion(pytorch_outputs, labels)
-        pytorch_loss.backward()
-        pytorch_optimizer.step()
+    outputs = net.forward(images.numpy())

-        if i == iterations - 1:
-            break
+    pytorch_optimizer.zero_grad()
+    pytorch_outputs = pytorch_net(images)

-    pytorch_layers = [pytorch_net.conv1, pytorch_net.conv2,
-                      pytorch_net.fc1, pytorch_net.fc2, pytorch_net.fc3]
-    devito_layers = [layers[0], layers[2], layers[5], layers[6], layers[7]]
+    compare(outputs, nn.Softmax(dim=1)(pytorch_outputs),
+            1e-12)
+
+    net.backward(loss_grad, optimizer)

-    for i in range(len(pytorch_layers)):
-        pytorch_layer = pytorch_layers[i]
-        devito_layer = devito_layers[i]
+    pytorch_loss = criterion(pytorch_outputs, labels)
+    pytorch_loss.backward()
+    pytorch_optimizer.step()

-        compare(devito_layer.kernel.data, pytorch_layer.weight)
-        compare(devito_layer.bias.data, pytorch_layer.bias)
+    for j in range(len(pytorch_layers) - 1, -1, -1):
+        pytorch_layer = pytorch_layers[j]
+        devito_layer = devito_layers[j]
+
+        compare(devito_layer.kernel.data, pytorch_layer.weight,
+                1e-12)
+        compare(devito_layer.bias.data, pytorch_layer.bias,
+                1e-12)
+
+
+def test_training_sgd(net_arguments, mnist):
+    run_training(net_arguments, mnist)
diff --git a/tests/test_simple.py b/tests/test_simple.py
index b543541..8279ef1 100644
--- a/tests/test_simple.py
+++ b/tests/test_simple.py
@@ -5,7 +5,10 @@
 import torch.nn.functional as F
 import numpy as np
 from joey.activation import ReLU
-from utils import compare
+from utils import compare, get_run_count
+from devito import logger
+
+logger.set_log_noperf()


 # PyTorch class
@@ -13,13 +16,15 @@ class Net(nn.Module):
     def __init__(self):
         super(Net, self).__init__()
         self.conv = nn.Conv2d(2, 2, 2)
-        self.fc = nn.Linear(8, 3)
+        self.fc1 = nn.Linear(8, 5)
+        self.fc2 = nn.Linear(5, 3)

     def forward(self, x):
-        x = F.max_pool2d(x, 2, stride=(1, 1))
         x = F.relu(self.conv(x))
+        x = F.max_pool2d(x, 2, stride=(1, 1))
         x = x.view(-1, self.num_flat_features(x))
-        x = self.fc(x)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
         return x

     def num_flat_features(self, x):
@@ -38,20 +43,24 @@ def num_flat_features(self, x):
 def net_arguments():
     np.random.seed(SEED)

-    layer1 = joey.MaxPooling(kernel_size=(2, 2),
-                             input_size=(1, 2, 4, 4),
-                             generate_code=False)
-    layer2 = joey.Conv(kernel_size=(2, 2, 2),
-                       input_size=(1, 2, 3, 3),
+    layer1 = joey.Conv(kernel_size=(2, 2, 2),
+                       input_size=(2, 2, 4, 4),
                        activation=ReLU(),
                        generate_code=False)
-    layer_flat = joey.Flat(input_size=(1, 2, 2, 2),
+    layer2 = joey.MaxPooling(kernel_size=(2, 2),
+                             input_size=(2, 2, 3, 3),
+                             generate_code=False)
+    layer_flat = joey.Flat(input_size=(2, 2, 2, 2),
                            generate_code=False)
-    layer3 = joey.FullyConnectedSoftmax(weight_size=(3, 8),
-                                        input_size=(8, 1),
+    layer3 = joey.FullyConnected(weight_size=(5, 8),
+                                 input_size=(8, 2),
+                                 activation=ReLU(),
+                                 generate_code=False)
+    layer4 = joey.FullyConnectedSoftmax(weight_size=(3, 5),
+                                        input_size=(5, 2),
                                         generate_code=False)
-    layers = [layer1, layer2, layer_flat, layer3]
+    layers = [layer1, layer2, layer_flat, layer3, layer4]

     net = joey.Net(layers)
@@ -59,11 +68,14 @@ def net_arguments():
     pytorch_net.double()

     with torch.no_grad():
-        pytorch_net.conv.weight[:] = torch.from_numpy(layer2.kernel.data)
-        pytorch_net.conv.bias[:] = torch.from_numpy(layer2.bias.data)
+        pytorch_net.conv.weight[:] = torch.from_numpy(layer1.kernel.data)
+        pytorch_net.conv.bias[:] = torch.from_numpy(layer1.bias.data)
+
+        pytorch_net.fc1.weight[:] = torch.from_numpy(layer3.kernel.data)
+        pytorch_net.fc1.bias[:] = torch.from_numpy(layer3.bias.data)

-        pytorch_net.fc.weight[:] = torch.from_numpy(layer3.kernel.data)
-        pytorch_net.fc.bias[:] = torch.from_numpy(layer3.bias.data)
+        pytorch_net.fc2.weight[:] = torch.from_numpy(layer4.kernel.data)
+        pytorch_net.fc2.bias[:] = torch.from_numpy(layer4.bias.data)

     return (net, pytorch_net, layers)
@@ -78,13 +90,22 @@ def test_forward_pass(net_arguments):
                            [[-1, -2, 0, 1],
                             [-2, -3, 1, 2],
                             [3, 4, 2, -1],
-                            [-2, -3, -4, 9]]]],
+                            [-2, -3, -4, 9]]],
+                           [[[5, 6, 7, 8],
+                             [9, 10, 11, 12],
+                             [13, 14, 15, 16],
+                             [17, 18, 19, 20]],
+                            [[1, 2, 0, -1],
+                             [2, 3, -1, -2],
+                             [-3, -4, -2, 1],
+                             [2, 3, 4, -9]]]],
                           dtype=np.float64)

-    outputs = net.forward(input_data)
-    pytorch_outputs = pytorch_net(torch.from_numpy(input_data))
+    for i in range(get_run_count()):
+        outputs = net.forward(input_data)
+        pytorch_outputs = pytorch_net(torch.from_numpy(input_data))

-    compare(outputs, nn.Softmax(dim=1)(pytorch_outputs))
+        compare(outputs, nn.Softmax(dim=1)(pytorch_outputs), 1e-14)


 def test_backward_pass(net_arguments):
@@ -96,40 +117,49 @@ def test_backward_pass(net_arguments):
                            [[-1, -2, 0, 1],
                             [-2, -3, 1, 2],
                             [3, 4, 2, -1],
-                            [-2, -3, -4, 9]]]],
+                            [-2, -3, -4, 9]]],
+                           [[[5, 6, 7, 8],
+                             [9, 10, 11, 12],
+                             [13, 14, 15, 16],
+                             [17, 18, 19, 20]],
+                            [[1, 2, 0, -1],
+                             [2, 3, -1, -2],
+                             [-3, -4, -2, 1],
+                             [2, 3, 4, -9]]]],
                           dtype=np.float64)

-    expected = np.array([2])
+    expected = np.array([2, 1])

     def loss_grad(layer, b):
         gradients = []

         for i in range(3):
-            result = layer.result.data[i]
-            if i == expected[0]:
+            result = layer.result.data[i, b]
+            if i == expected[b]:
                 result -= 1
             gradients.append(result)

         return gradients

-    net.forward(input_data)
-    net.backward(loss_grad)
+    for i in range(get_run_count()):
+        net.forward(input_data)
+        net.backward(loss_grad)

-    criterion = nn.CrossEntropyLoss()
+        criterion = nn.CrossEntropyLoss()

-    pytorch_net.zero_grad()
-    outputs = pytorch_net(torch.from_numpy(input_data))
-    loss = criterion(outputs, torch.from_numpy(expected))
-    loss.backward()
+        pytorch_net.zero_grad()
+        outputs = pytorch_net(torch.from_numpy(input_data))
+        loss = criterion(outputs, torch.from_numpy(expected))
+        loss.backward()

-    pytorch_layers = [pytorch_net.conv, pytorch_net.fc]
-    devito_layers = [layers[1], layers[3]]
+        pytorch_layers = [pytorch_net.conv, pytorch_net.fc1, pytorch_net.fc2]
+        devito_layers = [layers[0], layers[3], layers[4]]

-    for i in range(len(pytorch_layers)):
-        pytorch_layer = pytorch_layers[i]
-        devito_layer = devito_layers[i]
+        for j in range(len(pytorch_layers) - 1, -1, -1):
+            pytorch_layer = pytorch_layers[j]
+            devito_layer = devito_layers[j]

-        compare(devito_layer.kernel_gradients.data,
-                pytorch_layer.weight.grad)
+            compare(devito_layer.kernel_gradients.data,
+                    pytorch_layer.weight.grad, 1e-13)

-        compare(devito_layer.bias_gradients.data,
-                pytorch_layer.bias.grad)
+            compare(devito_layer.bias_gradients.data,
+                    pytorch_layer.bias.grad, 1e-13)
diff --git a/tests/utils.py b/tests/utils.py
index c68639e..002ec72 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,7 +1,8 @@
 import numpy as np
+from os import environ


-def compare(devito, pytorch):
+def compare(devito, pytorch, tolerance):
     pytorch = pytorch.detach().numpy()

     if devito.shape != pytorch.shape:
@@ -10,5 +11,18 @@ def compare(devito, pytorch):
     error = abs(devito - pytorch) / abs(pytorch)
     max_error = np.nanmax(error)

-    if max_error != np.nan:
-        assert(max_error < 10**(-9))
+    assert(np.isnan(max_error) or max_error < tolerance)
+
+
+def running_in_parallel():
+    if 'DEVITO_LANGUAGE' not in environ:
+        return False
+
+    return environ['DEVITO_LANGUAGE'] in ['openmp']
+
+
+def get_run_count():
+    if running_in_parallel():
+        return 1000
+    else:
+        return 1
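
Note (not part of the patch): the new MaxPooling.equations replaces the old symbolic Max(*[...]) reduction with an incremental scan over the pooling window that also records, in self._indices, the flat window position of the maximum; backprop_equations then decodes that flat index with index // 2 and index % 2 to route each result gradient back to the winning input cell. The snippet below replays the same sign/Min bookkeeping in plain NumPy on a single made-up 2x2 window, purely as an illustration of the trick; names and values are invented for the example.

    import numpy as np

    # Made-up 2x2 pooling window; kh * kw doubles as the "invalid index"
    # sentinel and as the penalty that disables the losing branch of Min.
    window = np.array([[3.0, 7.0],
                       [7.0, 1.0]])
    kh, kw = window.shape

    running_max = window[0, 0]   # Eq(R, I[stride * c, stride * d])
    index = kh * kw              # Eq(indices, kernel_height * kernel_width)

    for i in range(kh):
        for j in range(kw):
            old = running_max                              # Eq(old, R)
            running_max = max(running_max, window[i, j])   # Eq(R, Max(R, I))
            # cond1 != 0 exactly when the running max just changed, pushing the
            # previously stored index out of range; cond2 != 0 exactly when
            # (i, j) is not the current maximum, pushing its candidate index
            # out of range. Min keeps whichever candidate survives.
            cond1 = abs(np.sign(running_max - old)) * kh * kw
            cond2 = abs(np.sign(window[i, j] - running_max)) * kh * kw
            index = min(index + cond1, i * kw + j + cond2)

    print(running_max, index)    # 7.0 1.0 -> the max and its flat index 0 * kw + 1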