diff --git a/.gitignore b/.gitignore
index 62b2ca8..52d9f37 100644
--- a/.gitignore
+++ b/.gitignore
@@ -189,3 +189,4 @@ dmypy.json
 
 # End of https://www.toptal.com/developers/gitignore/api/python,emacs
 
+data/
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..07b79ea
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,37 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <Languages>
+        <language minSize="75" name="Python" />
+      </Languages>
+    </inspection_tool>
+    <inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="10">
+            <item index="0" class="java.lang.String" itemvalue="torchvision" />
+            <item index="1" class="java.lang.String" itemvalue="pandas" />
+            <item index="2" class="java.lang.String" itemvalue="chardet" />
+            <item index="3" class="java.lang.String" itemvalue="boto3" />
+            <item index="4" class="java.lang.String" itemvalue="pdfminer.six" />
+            <item index="5" class="java.lang.String" itemvalue="botocore" />
+            <item index="6" class="java.lang.String" itemvalue="python-dateutil" />
+            <item index="7" class="java.lang.String" itemvalue="numpy" />
+            <item index="8" class="java.lang.String" itemvalue="pycryptodome" />
+            <item index="9" class="java.lang.String" itemvalue="mysql-connector-python" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N806" />
+          <option value="N803" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
\ No newline at end of file
diff --git a/.idea/joey2.0.iml b/.idea/joey2.0.iml
new file mode 100644
index 0000000..9289075
--- /dev/null
+++ b/.idea/joey2.0.iml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 3.10.6 WSL (Ubuntu): (/usr/bin/python3)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="NUMPY" />
+    <option name="myDocStringFormat" value="NumPy" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="py.test" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..1f79844
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10.6 WSL (Ubuntu): (/usr/bin/python3)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..55c4497
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/joey2.0.iml" filepath="$PROJECT_DIR$/.idea/joey2.0.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/README.md b/README.md
index eac241b..8a72c9e 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ Joey is a machine learning framework running on top of [Devito](https://github.c
 * A backward pass through a neural network with batch processing
 * Producing backpropagation equations automatically based on the list of layers in a neural network (only a loss function must be defined manually by the user)
 * Training a neural network with PyTorch optimizers
+* Instantiating a transformer neural network (ViT) for image classification
 
 Unlike other machine learning frameworks, Joey generates and compiles an optimized low-level code on-the-spot (using Devito) for both standalone layers and proper neural networks.
 
@@ -16,10 +17,23 @@ Unlike other machine learning frameworks, Joey generates and compiles an optimiz
 * 2D max pooling (other types of 2D pooling can be implemented by the user by extending the `Pooling` abstract class)
 * Full connection
 * Flattening (an internal layer turning 2D data with channels into a 1D vector or 2D matrix, depending on the batch size)
+* 3D FullyConnected
+* Einsum function
+* Dropout in 1, 2, 3 and 4 dimensions
+* Norm 2D
+* Norm 3D
+* Softmax 3D and 4D function
+
+## Supported modules
+* MultiHeadAttention
+* VisionEncoder
+
+## Built-in Models
+* ViT (Vision Transformer)
 
 ## Supported activation functions
 * ReLU
-* Softmax (only via the `FullyConnectedSoftmax` class)
+* Softmax (only via the `FullyConnectedSoftmax` class or the `Softmax3d` / `Softmax4d` layers)
 * Dummy (`f(x) = x`)
 
 Other activation functions can be implemented by extending the `Activation` abstract class.
diff --git a/examples/ViT_Running.py b/examples/ViT_Running.py
new file mode 100644
index 0000000..9e5f5f8
--- /dev/null
+++ b/examples/ViT_Running.py
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/examples/resources/model_weights_ViT b/examples/resources/model_weights_ViT
new file mode 100644
index 0000000..53c2b4d
Binary files /dev/null and b/examples/resources/model_weights_ViT differ
diff --git a/joey/base.py b/joey/base.py
index d4db2d5..37dd95d 100644
--- a/joey/base.py
+++ b/joey/base.py
@@ -1,5 +1,7 @@
 from abc import ABC, abstractmethod
-from devito import Operator, Function, dimensions
+
+import numpy as np
+from devito import Operator, Function, dimensions, SpaceDimension
 from joey import Activation
 from joey import activation as activ
 from numpy import array
@@ -8,21 +10,60 @@
 dim_index = 0
 
 
-def default_name_allocator():
+def default_name_allocator(name=''):
     global index
-    name = 'f' + str(index)
+    _name = 'f' + name + str(index)
     index += 1
-    return name
+    return _name
 
 
 def default_dim_allocator(count):
     global dim_index
-    names = ''
+    names = []
     for i in range(count):
-        names += 'd' + str(dim_index) + ' '
+        names.append('d' + str(dim_index))
         dim_index += 1
-    names = names[:-1]
-    return dimensions(names)
+    return [SpaceDimension(n) for n in names]
+
+
+class Module(ABC):
+    @property
+    def input(self):
+        """A Function object corresponding to an input data array."""
+        return self._I
+
+    @property
+    def result(self):
+        """A Function object corresponding to a result array."""
+        return self._R
+
+    @abstractmethod
+    def equations(self) -> (list, list):
+        pass
+
+    def init_params(self):
+        if self.kernel is not None:
+            self.kernel.data[:] = \
+                np.random.rand(*self.kernel.shape) - 0.5
+
+        if self.bias is not None:
+            self.bias.data[:] = np.random.rand(*self.bias.shape) - 0.5
+
+    @abstractmethod
+    def _allocate(self, **kwargs) -> (Function, Function, Function,
+                                      Function, Function, Function,
+                                      Function):
+
+        pass
+
+    def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
+
+        self._op.apply(**self._arg_dict)
+        return self._R.data
+
+    @abstractmethod
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        pass
 
 
 class Layer(ABC):
@@ -62,7 +103,7 @@ def __init__(self, kernel_size,
                  input_size, activation=activ.Dummy(),
                  name_allocator_func=default_name_allocator,
                  dim_allocator_func=default_dim_allocator,
-                 generate_code=False):
+                 generate_code=False, **kwargs):
         if activation is None:
             activation = activ.Dummy()
 
@@ -71,12 +112,14 @@ def __init__(self, kernel_size,
                             "its subclass")
 
         self._activation = activation
-
-        self._K, self._I, self._R, self._bias, self._KG, self._RG, \
-            self._biasG = self._allocate(kernel_size,
-                                         input_size,
-                                         name_allocator_func,
-                                         dim_allocator_func)
+        self.propagate = True
+        self.back_propagate = True
+        self.name = kwargs.get('name', '')
+        self._K, self._I, self._R, self._bias, self._KG, self._RG, self._biasG = self._allocate(kernel_size,
+                                                                                                input_size,
+                                                                                                name_allocator_func,
+                                                                                                dim_allocator_func,
+                                                                                                **kwargs)
 
         if generate_code:
             eqs, args = self.equations()
@@ -89,6 +132,11 @@ def kernel(self):
         """A Function object corresponding to a kernel/weight array."""
         return self._K
 
+    @property
+    def weight(self):
+        """The raw data array of the kernel/weight Function (i.e. ``kernel.data``)."""
+        return self._K.data
+
     @property
     def input(self):
         """A Function object corresponding to an input data array."""
@@ -135,27 +183,35 @@ def pytorch_parameters(self):
         kernel_parameter = None
         bias_parameter = None
 
-        if self._K is not None:
+        if self._K is not None and self.propagate:
             kernel_tensor = from_numpy(self._K.data)
             kernel_parameter = Parameter(kernel_tensor, requires_grad=False)
 
             if self._KG is not None:
                 kernel_parameter.grad = from_numpy(self._KG.data)
 
-        if self._bias is not None:
+        if self._bias is not None and self.propagate:
             bias_tensor = from_numpy(self._bias.data)
             bias_parameter = Parameter(bias_tensor, requires_grad=False)
 
             if self._biasG is not None:
                 bias_parameter.grad = from_numpy(self._biasG.data)
 
-        return (kernel_parameter, bias_parameter)
+        return kernel_parameter, bias_parameter
+
+    def init_params(self):
+        if self.kernel is not None:
+            self.kernel.data[:] = \
+                np.random.rand(*self.kernel.shape) - 0.5
+
+        if self.bias is not None:
+            self.bias.data[:] = np.random.rand(*self.bias.shape) - 0.5
 
     @abstractmethod
     def _allocate(self, kernel_size, input_size, name_allocator_func,
-                  dim_allocator_func) -> (Function, Function, Function,
-                                          Function, Function, Function,
-                                          Function):
+                  dim_allocator_func, **kwargs) -> (Function, Function, Function,
+                                                    Function, Function, Function,
+                                                    Function):
         """
         This method should return a (Function, Function, Function, Function,
         Function, Function, Function) object corresponding to a kernel,
diff --git a/joey/funtional.py b/joey/funtional.py
new file mode 100644
index 0000000..59b24af
--- /dev/null
+++ b/joey/funtional.py
@@ -0,0 +1,310 @@
+from abc import ABC, abstractmethod
+from functools import reduce
+import numpy as np
+from devito import Eq, Function, exp, Inc
+
+from joey import default_name_allocator, Layer, default_dim_allocator
+from joey.utils import get_tensor_2d, get_tensor_1d, get_tensor_3d, get_tensor_4d
+
+
+def kernel_shape(x):
+    return reduce(lambda a, b: a * b, x)
+
+
+class Functional(Layer):
+
+    def _allocate(self, kernel_size, input_size, name_allocator_func, dim_allocator_func, **kwargs) -> (
+            Function, Function, Function,
+            Function, Function, Function,
+            Function):
+        return self._K, self._I, self._R, self.bias, self._KG, self._RG, self._biasG
+
+    def execute(self, kernel_data=None, input_data=None, bias=None) -> np.array:
+        pass
+
+    @abstractmethod
+    def equations(self) -> (list, list):
+        pass
+
+    @abstractmethod
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        pass
+
+    def __init__(self):
+        pass
+
+
+class BaseDropout(Functional):
+    def equations(self) -> (list, list):
+        pass
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        pass
+
+    def init_params(self):
+        K = int(self.N * self.dropout)
+        arr = np.array([0] * K + [1] * (self.N - K))
+        np.random.shuffle(arr)
+
+        self._K.data[:] = arr.reshape(*self.shape)
+
+
+class Dropout1d(BaseDropout):
+    def __init__(self, name, shape, kernel_size, input_size, dropout=0.1, **kwargs):
+        self.N = kernel_shape(shape)
+        self.dropout = dropout
+        self.name = name
+        self.shape = shape
+        self.propagate = False
+
+        self._I = get_tensor_1d(default_name_allocator('input_' + self.name), shape=self.shape)
+        self._K = get_tensor_1d(default_name_allocator('kernel_' + self.name), shape=self.shape,
+                                dims=self._I.dimensions)
+        self._R = get_tensor_1d(default_name_allocator('result_' + self.name), shape=self.shape,
+                                dims=self._I.dimensions)
+
+        self._bias, self._KG, self._RG, self._biasG = None, None, None, None
+
+    def equations(self) -> (list, list):
+        a = self._R.dimensions
+        return [
+                   Eq(self._R[a], self._I[a] * self._K[a])
+               ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+
+class Dropout2d(BaseDropout):
+    def __init__(self, name, shape, kernel_size, input_size, dropout=0.1, **kwargs):
+        self.N = kernel_shape(shape)
+        self.dropout = dropout
+        self.name = name
+        self.shape = shape
+        self.propagate = False
+
+        self._I = get_tensor_2d(default_name_allocator('input_' + self.name), shape=self.shape)
+        self._K = get_tensor_2d(default_name_allocator('kernel_' + self.name), shape=self.shape,
+                                dims=self._I.dimensions)
+        self._R = get_tensor_2d(default_name_allocator('result_' + self.name), shape=self.shape,
+                                dims=self._I.dimensions)
+
+        self._bias, self._KG, self._RG, self._biasG = None, None, None, None
+
+    def equations(self) -> (list, list):
+        a, b = self._R.dimensions
+        return [
+                   Eq(self._R[a, b], self._I[a, b] * self._K[a, b])
+               ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+
+class Dropout3d(BaseDropout):
+    def __init__(self, name, shape, dropout=0.1, **kwargs):
+        self.N = kernel_shape(shape)
+        self.dropout = dropout
+        self.name = name
+        self.shape = shape
+        self.propagate = False
+
+        self._I = get_tensor_3d(default_name_allocator('input_' + self.name), shape=self.shape)
+        self._K = get_tensor_3d(default_name_allocator('kernel_' + self.name), shape=self.shape,
+                                dims=self._I.dimensions)
+        self._R = get_tensor_3d(default_name_allocator('result_' + self.name), shape=self.shape,
+                                dims=self._I.dimensions)
+
+        self._bias, self._KG, self._RG, self._biasG = None, None, None, None
+
+    def equations(self) -> (list, list):
+        a, b, c = self._R.dimensions
+        return [
+                   Eq(self._R[a, b, c], self._I[a, b, c] * self._K[a, b, c])
+               ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+
+class Dropout4d(BaseDropout):
+    def __init__(self, name, shape, dropout=0.1, **kwargs):
+        self.N = kernel_shape(shape)
+        self.dropout = dropout
+        self.name = name
+        self.shape = shape
+        self.propagate = False
+
+        self._I = get_tensor_4d(default_name_allocator('input_' + self.name), shape=self.shape)
+        self._K = get_tensor_4d(default_name_allocator('kernel_' + self.name), shape=self.shape,
+                                dims=self._I.dimensions)
+        self._R = get_tensor_4d(default_name_allocator('result_' + self.name), shape=self.shape,
+                                dims=self._I.dimensions)
+
+        self._bias, self._KG, self._RG, self._biasG = None, None, None, None
+
+    def equations(self) -> (list, list):
+        a, b, c, d = self._R.dimensions
+        return [
+           Eq(self._R[a, b, c, d], self._I[a, b, c, d] * self._K[a, b, c, d])
+        ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+
+class Softmax3d(Functional):
+    def __init__(self, name, shape, **kwargs):
+        self.name = name
+        self.shape = shape
+        self.propagate = False
+
+        self._I = get_tensor_3d(default_name_allocator('input_' + self.name), shape=self.shape)
+        self._R = get_tensor_3d(default_name_allocator('result_' + self.name), shape=self.shape)
+
+        self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None
+
+    def equations(self) -> (list, list):
+        a, b, c = self._I.dimensions
+        x, y, z = self._R.dimensions
+
+        h = default_dim_allocator(1)[0]
+        expon = get_tensor_3d(default_name_allocator('exponential_' + self.name), shape=self.shape, dims=(a, b, c))
+        sum_last_axis = get_tensor_3d(default_name_allocator('sum_all_' + self.name), shape=(self.shape[0:2] + (1,)),
+                                      dims=(a, b, h))
+
+        return [
+            Eq(self.result, 0),
+            Eq(expon[a, b, c], exp(self._I[a, b, c])),
+            Eq(sum_last_axis[a, b, h], 0),
+            Inc(sum_last_axis[a, b, h], expon[a, b, c]),
+            Eq(self.result[x, y, z], expon[x, y, z] / sum_last_axis[x, y, h]),
+        ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+    def init_params(self):
+        pass
+
+
+class Softmax4d(Functional):
+    def __init__(self, name, shape):
+        self.name = name
+        self.shape = shape
+        self.propagate = False
+
+        self._I = get_tensor_4d(default_name_allocator('input_' + self.name), shape=self.shape)
+        self._R = get_tensor_4d(default_name_allocator('result_' + self.name), shape=self.shape)
+
+        self.dimensions = self._R.dimensions
+
+        self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None
+
+    def equations(self) -> (list, list):
+        a, b, c, d = self._I.dimensions
+        x, y, z, w = self._R.dimensions
+
+        h = default_dim_allocator(1)[0]
+        expon = get_tensor_4d(default_name_allocator('exponential_' + self.name), shape=self.shape, dims=(a, b, c, d))
+        sum_last_axis = get_tensor_4d(default_name_allocator('sum_all_' + self.name), shape=(self.shape[0:3] + (1,)),
+                                      dims=(a, b, c, h))
+        eqs = [Eq(self.result, 0)]
+        eqs += [
+            Eq(expon[a, b, c, d], exp(self.input[a, b, c, d])),
+            Eq(sum_last_axis[a, b, c, h], 0),
+            Inc(sum_last_axis[a, b, c, h], expon[a, b, c, d]),
+            Eq(self.result[x, y, z, w], expon[x, y, z, w] / sum_last_axis[x, y, z, h]),
+
+        ]
+
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+    def init_params(self):
+        pass
+
+
+class Expand3to4(Functional):
+
+    def __init__(self, name, shape_in, shape_out):
+        assert shape_in[-1] == (shape_out[-1] * shape_out[-2]), 'The last Input dimension must match the ' \
+                                                                'multiplication ' \
+                                                   'of the 2 last Result dimensions.'
+
+        self.name = name
+        self.propagate = False
+
+        self._I = get_tensor_3d(default_name_allocator('input_' + self.name), shape=shape_in)
+        self._R = get_tensor_4d(default_name_allocator('result_' + self.name), shape=shape_out)
+
+        self.dimensions = self._R.dimensions
+
+        self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None
+
+    def init_params(self):
+        pass
+
+    def equations(self) -> (list, list):
+        a, b, c, d = self._R.dimensions
+        _, _, D = self._I.shape
+        return [
+           Eq(self._R[a, b, c, d], self._I[a, b, (c * D) + d])
+        ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+
+class Contract4to3(Functional):
+
+    def __init__(self, name, shape_in, shape_out):
+        assert shape_out[-1] == (shape_in[-1] * shape_in[-2]), 'The last Result dimension must match the ' \
+                                                                'multiplication ' \
+                                                                'of the 2 last Input dimensions.'
+
+        self.name = name
+        self.propagate = False
+
+        self._I = get_tensor_4d(default_name_allocator('input_' + self.name), shape=shape_in)
+        self._R = get_tensor_3d(default_name_allocator('result_' + self.name), shape=shape_out)
+
+        self.dimensions = self._R.dimensions
+
+        self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None
+
+    def init_params(self):
+        pass
+
+    def equations(self) -> (list, list):
+        a, b, c, d = self._I.dimensions
+        _, _, _, D = self._I.shape
+        return [
+           Eq(self._R[a, b, (c * D + d)], self._I[a, b, c, d])
+        ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+
+class Reduce2ndDimension3d(Functional):
+
+    def __init__(self, name, shape):
+
+        self._I = get_tensor_3d('reduce_input_' + name, shape=shape)
+        self._R = get_tensor_2d('reduce_result_' + name, shape=(shape[0], shape[-1]))
+
+        self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None
+
+    def equations(self) -> (list, list):
+        a, b = self.result.dimensions
+        _, X, _ = self.input.shape
+        return [
+            Eq(self._R[a, b], self._I[a, X-1, b])
+
+        ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
diff --git a/joey/models/ViT.py b/joey/models/ViT.py
new file mode 100644
index 0000000..7fcc74a
--- /dev/null
+++ b/joey/models/ViT.py
@@ -0,0 +1,130 @@
+from joey import Module, default_dim_allocator
+from joey.module.VisionEncoder import VisionEncoder
+from joey.utils import get_tensor_3d
+from joey.new_layers import FullyConnected3d, Norm2d, FullyConnected2d
+from devito import Operator, Inc, Eq, Function
+from scipy.special import log_softmax
+import numpy as np
+
+
+class ViT(Module):
+    r"""Vision Transformer Model
+
+        A transformer model to solve vision tasks by treating images as sequences of tokens.
+
+        Args:
+            image_size      (int): Side length of the input image
+            channel_size    (int): Number of image channels
+            patch_size      (int): Side length of each patch; determines the number of patches and token size
+            embed_size      (int): Embedding size of each token
+            num_heads       (int): Number of heads in Multi-Headed Attention
+            classes         (int): Number of output classes
+            num_layers      (int): Number of stacked VisionEncoder blocks
+            hidden_size     (int): Hidden size passed to each VisionEncoder block
+    """
+
+    def __init__(self, image_size: int, channel_size: int, patch_size: int, embed_size: int, num_heads: int,
+                 classes: int, num_layers: int, hidden_size: int, batch: int = 64, generate_code=False):
+
+        self.p = patch_size
+        self.image_size = image_size
+        self.embed_size = embed_size
+        self.num_patches = (image_size // patch_size) ** 2
+        self.patch_size = channel_size * (patch_size ** 2)
+        self.num_heads = num_heads
+        self.classes = classes
+        self.num_layers = num_layers
+        self.hidden_size = hidden_size
+
+        img_shape = (batch, int((self.image_size / self.p) * (self.image_size / self.p)), self.patch_size)
+
+        self._R = get_tensor_3d('result_1srt', (batch, self.num_patches + 1, self.embed_size))
+
+        d, e = default_dim_allocator(2)
+        x, y, z = self._R.dimensions
+
+        self.embeddings = FullyConnected3d(input_size=img_shape, weight_size=(self.embed_size, self.patch_size))
+        self.class_token = get_tensor_3d('class_token', (1, 1, self.embed_size), dims=(d, e, z))
+        self.positional_encoding = get_tensor_3d('pos_enc', (1, self.num_patches + 1, self.embed_size), dims=(d, y, z))
+
+        self.class_token.data[:] = np.random.rand(*self.class_token.shape)
+        self.positional_encoding.data[:] = np.random.rand(*self.positional_encoding.shape)
+
+        self.encoders = []
+        for layer in range(self.num_layers):
+            vision_encoder = VisionEncoder(
+                embed_size=self.embed_size,
+                num_heads=self.num_heads,
+                batch_size=batch,
+                lines=self.num_patches + 1,
+                hidden_size=self.hidden_size,
+                name='encoder' + str(layer)
+            )
+            self.encoders.append(vision_encoder)
+
+        self.norm = Norm2d(input_size=(batch, self.embed_size), weight_size=(self.embed_size,))
+        self.classifier = FullyConnected2d(input_size=(batch, self.embed_size), weight_size=(self.classes,
+                                                                                             self.embed_size))
+
+        if generate_code:
+            eqs, args = self.equations()
+            self._arg_dict = dict(args)
+            self._op = Operator(eqs)
+            self._op.cfunction
+
+    def equations(self):
+
+        a, b, c = self.embeddings.result.dimensions
+        d, e, _ = self.class_token.dimensions
+        x, y, z = self.result.dimensions
+
+        t0, u0, v0 = self.encoders[0].norm1.input.dimensions
+
+        eqs = [
+            Eq(self.result, 0),
+            *self.embeddings.equations()[0],
+            Eq(self.result[a, b, c], self.embeddings.result[a, b, c]),
+            Eq(self.result[x, self.num_patches, z], self.class_token[0, 0, z]),
+            Inc(self.result[x, y, z], self.positional_encoding[d, y, z]),
+            Eq(self.encoders[0].norm1.input[t0, u0, v0], self.result[t0, u0, v0])
+        ]
+
+        for index, encoder in enumerate(self.encoders):
+            if index > 0:
+                t, u, v = self.encoders[index].input.dimensions
+                eqs.append(
+                    Eq(self.encoders[index].input[t, u, v], self.encoders[index-1].result[t, u, v])
+                )
+            eqs += encoder.equations()[0]
+
+        last_enconder = self.encoders[-1].result
+
+        a, b = self.norm.input.dimensions
+        i, j = self.classifier.input.dimensions
+        _, x, _ = last_enconder.shape
+        eqs += [
+            Eq(self.norm.input[a, b], last_enconder[a, x - 1, b]),
+            *self.norm.equations()[0],
+            Eq(self.classifier.input[i, j], self.norm.result[i, j]),
+            *self.classifier.equations()[0]
+        ]
+
+        return eqs, []
+
+    def _allocate(self, **kwargs) -> (Function, Function, Function,
+                                      Function, Function, Function,
+                                      Function):
+        pass
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []
+
+    def forward(self, x):
+
+        b, c, h, w = x.shape
+        x = x.reshape(b, int((h / self.p) * (w / self.p)), c * self.p * self.p)
+        self.embeddings.input.data[:] = x
+
+        self._op.apply()
+
+        return log_softmax(self.classifier.result.data, axis=-1)
\ No newline at end of file
diff --git a/joey/module/MultiHeadAttention.py b/joey/module/MultiHeadAttention.py
new file mode 100644
index 0000000..4ab86e2
--- /dev/null
+++ b/joey/module/MultiHeadAttention.py
@@ -0,0 +1,152 @@
+import math
+
+from devito import Function, Operator, Eq, Inc, Constant, exp
+
+from joey import Module, default_dim_allocator
+from joey.utils import get_tensor_4d, get_tensor_3d
+from joey.new_layers import FullyConnected3d
+
+from torch import nn
+import torch
+from torch import functional as F
+
+
+class MultiHeadAttention(Module):
+    r"""Multi-headed self-attention expressed as devito equations.
+
+        Query, key and value are all projected from the same input, split into ``num_heads`` heads, combined
+        through a softmax over scaled scores, merged back along the embedding axis and linearly transformed.
+
+        Args:
+            embed_size  (int): Size of the embedding axis; must be divisible by ``num_heads``
+            num_heads   (int): Number of heads in multi-headed attention; Number of splits in the embedding size
+            lines, batch_size (int): Sizes of the second and first axis of the input tensor
+            batch_dim (int, optional): Stored only. generate_code (bool): build the Operator in ``__init__``
+        """
+
+    def __init__(self,
+                 embed_size: int,
+                 num_heads: int,
+                 lines: int,
+                 batch_size: int,
+                 batch_dim: int = 0,
+                 generate_code=False,
+                 name='att'
+                 ):
+        self.name = name
+        self.embed_size = embed_size
+        self.num_heads = num_heads
+        self.batch_dim = batch_dim
+        self.lines = lines
+        self.batch_size = batch_size
+
+        self.head_size = self.embed_size // self.num_heads
+
+        assert self.head_size * self.num_heads == self.embed_size, "Heads cannot split Embedding size equally"
+        # Input tensor of shape (batch_size, lines, embed_size).
+        self._I = get_tensor_3d(name=('input_' + self.name),
+                                shape=(self.batch_size, self.lines, self.embed_size))
+        # Q, K, V and the output projection are four separate embed_size x embed_size layers.
+        self.Q = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size),
+                                  weight_size=(self.embed_size, self.embed_size))
+
+        self.K = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size),
+                                  weight_size=(self.embed_size, self.embed_size))
+
+        self.V = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size),
+                                  weight_size=(self.embed_size, self.embed_size))
+
+        self.linear = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size),
+                                       weight_size=(self.embed_size, self.embed_size))
+        # Shapes: embedding split per head, one softmax denominator per score row, and the score grid itself.
+        reshaped = (self.batch_size, self.lines, self.num_heads, self.head_size)
+        shape_sum = (self.batch_size, self.num_heads, self.lines, 1)
+        shape_scores = (self.batch_size, self.num_heads, self.lines, self.lines)
+
+        self.q_reshaped = get_tensor_4d('q_4d_', reshaped)
+        self.k_reshaped = get_tensor_4d('k_4d_', reshaped)
+        self.v_reshaped = get_tensor_4d('v_4d_', reshaped)
+
+        b, q, k, h, e, h1 = default_dim_allocator(6)
+        # NOTE: the scores are scaled by sqrt(embed_size), not sqrt(head_size).
+        self.sqrt_embeded = Constant(name + 'sqrt_embed', value=math.sqrt(self.embed_size))
+
+        self.scores = get_tensor_4d(name=('bhqk' + self.name), shape=shape_scores, dims=[b, q, k, e])
+        self.scores_result = get_tensor_4d(name=('scores_result' + self.name), shape=shape_scores)
+        self.attention = get_tensor_4d(name=('attention' + self.name), shape=reshaped)
+        self.expon = get_tensor_4d(name=('expon' + self.name), shape=shape_scores, dims=[b, q, k, e])
+        self.sum_all = get_tensor_4d(name=('sum_all' + self.name), shape=shape_sum, dims=[b, q, k, h1])
+
+        self._R = get_tensor_3d(name=('result_' + self.name), shape=(self.batch_size, self.lines, self.embed_size))
+
+        if generate_code:
+            eqs, args = self.equations()
+            self._arg_dict = dict(args)
+            self._op = Operator(eqs)
+            self._op.cfunction  # attribute access only; presumably forces compilation - TODO confirm
+
+    def equations(self) -> (list, list):
+        """Return ``(equations, [])`` for the forward pass; the second list is always empty."""
+        x1, y1, z1, w1 = self.Q._dimensions
+        d1, d2, d3 = self.Q.input.dimensions
+        q_a, q_b, q_c, q_d = self.q_reshaped.dimensions
+        k_a, k_b, k_c, k_d = self.k_reshaped.dimensions
+        v_a, v_b, v_c, v_d = self.v_reshaped.dimensions
+
+        b, q, h, e = self.q_reshaped.dimensions
+        _, k, _, _ = self.k_reshaped.dimensions
+
+        b2, h2, q2, k2 = self.scores.dimensions
+        b3, h3, q3, k3 = self.scores_result.dimensions
+        _, _, _, h1 = self.sum_all.dimensions
+
+        eqs = [
+            Eq(self.Q.input[d1, d2, d3], self.input[d1, d2, d3]),
+            Eq(self.K.input[d1, d2, d3], self.input[d1, d2, d3]),
+            Eq(self.V.input[d1, d2, d3], self.input[d1, d2, d3]),
+            *self.Q.equations(dims=(x1, y1, z1, w1))[0],
+            *self.K.equations(dims=(x1, y1, z1, w1))[0],
+            *self.V.equations(dims=(x1, y1, z1, w1))[0],
+            # Split the embedding axis of Q, K and V into (head, position inside the head)
+            Eq(self.q_reshaped[q_a, q_b, q_c, q_d], self.Q.result[q_a, q_b, (q_c * self.head_size) + q_d]),
+            Eq(self.k_reshaped[k_a, k_b, k_c, k_d], self.K.result[k_a, k_b, (k_c * self.head_size) + k_d]),
+            Eq(self.v_reshaped[v_a, v_b, v_c, v_d], self.V.result[v_a, v_b, (v_c * self.head_size) + v_d]),
+            # Einsum over Query and Key: one Inc per head; ``e`` is absent from the left-hand side
+            Eq(self.scores[b2, h2, q2, k2], 0),
+            *[Inc(self.scores[b, i, q, k], self.q_reshaped[b, q, i, e] * self.k_reshaped[b, k, i, e]) for i in range(
+                self.num_heads
+            )],
+            # Scores divided by sqrt(embed_size)
+            Eq(self.scores[b2, h2, q2, k2], self.scores[b2, h2, q2, k2] / self.sqrt_embeded),
+            # Softmax(scores) over the last axis: exponentiate, sum each row, then divide
+            Eq(self.expon[b2, h2, q2, k2], exp(self.scores[b2, h2, q2, k2])),
+            Eq(self.sum_all[b2, h2, q2, h1], 0),
+            Inc(self.sum_all[b2, h2, q2, h1], self.expon[b2, h2, q2, k2]),
+            Eq(self.scores_result[b3, h3, q3, k3], self.expon[b3, h3, q3, k3] / self.sum_all[b3, h3, q3, h1]),
+        ]
+
+        i, k, j, l = self.attention.dimensions
+        _, _, _, m = self.scores_result.dimensions
+        a, b, c, d = self.attention.shape
+
+        x, y, z = self._R.dimensions
+        # Weighted sum of values per head, heads re-joined along the embedding axis, then the output projection.
+        eqs += [
+            Eq(self.attention[i, k, j, l], 0),
+            *[Inc(self.attention[i, k, z, l], self.scores_result[i, z, k, m] * self.v_reshaped[i, m, z, l]) for z in
+              range(self.num_heads)],  # this ``z`` is local to the comprehension; the dimension ``z`` is untouched
+            Eq(self.linear.input[i, k, (j * d) + l], self.attention[i, k, j, l]),
+            *self.linear.equations()[0],
+            Eq(self.result[x, y, z], self.linear.result[x, y, z])
+        ]
+
+        return eqs, []
+
+    def _allocate(self, **kwargs) -> (
+            Function, Function, Function,
+            Function, Function, Function,
+            Function):
+        pass  # placeholder: nothing is allocated here; the tensors are created in __init__
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []  # no backward equations are generated
diff --git a/joey/module/VisionEncoder.py b/joey/module/VisionEncoder.py
new file mode 100644
index 0000000..2b4ac57
--- /dev/null
+++ b/joey/module/VisionEncoder.py
@@ -0,0 +1,110 @@
+import numpy as np
+import torch
+from devito import Function, Operator, Eq, Inc
+from torch import nn
+
+from joey import Module
+from joey.activation import ReLU
+from joey.module.MultiHeadAttention import MultiHeadAttention, MultiHeadAttentionTorch
+from joey.new_layers import Norm3d, FullyConnected3d
+from joey.utils import get_tensor_3d
+
+
+class VisionEncoder(Module):
+    r"""Vision Encoder Model
+
+           One block: norm1 -> attention (added to the norm1 output) -> norm2 -> 2-layer ReLU MLP (added on).
+
+           Args:
+               embed_size      (int): Embedding Size of Input
+               num_heads       (int): Number of heads in multi-headed attention
+               hidden_size     (int): Stored only; the MLP hidden width is fixed at ``4 * embed_size``
+               dropout         (float, optional): Stored only; no dropout equations are generated here
+               lines, batch_size (int): Sizes of the second and first axis of the input tensor
+       """
+    def __init__(self,
+                 embed_size: int,
+                 num_heads: int,
+                 hidden_size: int,
+                 lines: int,
+                 batch_size: int,
+                 dropout: float = 0.1,
+                 name='vision_encoder',
+                 generate_code=False
+                 ):
+        self.embed_size = embed_size
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.name = name
+        self.batch_size = batch_size
+        self.lines = lines
+
+        self.norm1 = Norm3d(input_size=(self.batch_size, self.lines, self.embed_size),
+                            weight_size=(self.embed_size,),
+                            name='norm_1_' + self.name)
+        self.norm2 = Norm3d(input_size=(self.batch_size, self.lines, self.embed_size),
+                            weight_size=(self.embed_size,),
+                            name='norm_2_' + self.name)
+        # The encoder's input tensor is the very tensor norm1 reads from.
+        self._I = self.norm1.input
+
+        self.attention = MultiHeadAttention(
+            embed_size=self.embed_size,
+            batch_dim=0,
+            num_heads=self.num_heads,
+            lines=self.lines,
+            batch_size=self.batch_size,
+            generate_code=False,
+            name=name + '_att_'
+        )
+
+        first_in = (self.batch_size, self.lines, self.embed_size)
+        second_in = (self.batch_size, self.lines, self.embed_size * 4)
+        # MLP: embed_size -> 4 * embed_size (ReLU) -> embed_size.
+        mlp1 = FullyConnected3d(input_size=first_in,
+                                weight_size=(4 * self.embed_size, self.embed_size),
+                                activation=ReLU())
+        mlp2 = FullyConnected3d(input_size=second_in,
+                                weight_size=(self.embed_size, 4 * self.embed_size,))
+        self.mlp = [mlp1, mlp2]
+
+        self._R = get_tensor_3d('result_encoder_' + self.name, (self.batch_size, self.lines, self.embed_size))
+
+        if generate_code:
+            eqs, args = self.equations()
+            self._arg_dict = dict(args)
+            self._op = Operator(eqs)
+            self._op.cfunction  # attribute access only; presumably forces compilation - TODO confirm
+
+    def equations(self) -> (list, list):
+        """Return ``(equations, [])`` chaining norm1, attention, norm2 and the MLP into ``self.result``."""
+        a, b, c = self.result.dimensions
+        x, y, z = self.attention.result.dimensions
+        u, v, t = self.mlp[0].input.dimensions
+        g, j, k = self.mlp[1].input.dimensions
+        p, q, r = self.result.dimensions
+        return [
+            Eq(self.result[a, b, c], 0),
+            *self.norm1.equations()[0],
+            Eq(self.result[a, b, c], self.norm1.result[a, b, c]),
+            Eq(self.attention.input[a, b, c], self.result[a, b, c]),
+            *self.attention.equations()[0],
+            Inc(self.result[x, y, z], self.attention.result[x, y, z]),
+            Eq(self.norm2.input[x, y, z], self.result[x, y, z]),
+            *self.norm2.equations()[0],
+            Eq(self.mlp[0].input[u, v, t], self.norm2.result[u, v, t]),
+            *self.mlp[0].equations()[0],
+            Eq(self.mlp[1].input[g, j, k], self.mlp[0].result[g, j, k]),
+            *self.mlp[1].equations()[0],
+            Inc(self.result[p, q, r], self.mlp[1].result[p, q, r])
+        ], []
+
+    def _allocate(self, **kwargs) -> (Function, Function, Function,
+                                      Function, Function, Function,
+                                      Function):
+        pass  # placeholder: nothing is allocated here; the tensors are created in __init__
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []  # was ``pass`` (None); now matches the annotation and the sibling modules
+
diff --git a/joey/module/__init__.py b/joey/module/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/joey/net.py b/joey/net.py
index 609af66..f37a618 100644
--- a/joey/net.py
+++ b/joey/net.py
@@ -2,6 +2,8 @@
 import numpy as np
 from devito import Eq, Operator
 
+from joey import Layer
+
 
 class Net:
     """
@@ -55,12 +57,7 @@ def __init__(self, layers: list):
 
     def _init_parameters(self):
         for layer in self._layers:
-            if layer.kernel is not None:
-                layer.kernel.data[:] = \
-                    np.random.rand(*layer.kernel.shape) - 0.5
-
-            if layer.bias is not None:
-                layer.bias.data[:] = np.random.rand(*layer.bias.shape) - 0.5
+            layer.init_params()
 
     def _gen_eqs(self):
         eqs = []
@@ -82,14 +79,16 @@ def _gen_eqs(self):
             eqs += layer_eqs
             input_function = layer.result
 
-        return (eqs, args)
+        print(eqs)
+        return eqs, args
 
     def _gen_backprop_eqs(self):
         eqs = []
         args = []
 
         for i in range(len(self._layers)):
-            layer = self._layers[i]
+
+            layer: Layer = self._layers[i]
 
             if layer.kernel_gradients is not None:
                 eqs.append(Eq(layer.kernel_gradients, 0))
@@ -104,6 +103,8 @@ def _gen_backprop_eqs(self):
         for i in range(len(self._layers) - 1, -1, -1):
             if i < len(self._layers) - 1:
                 prev_layer = self._layers[i + 1]
+                if not prev_layer.propagate:
+                    prev_layer = None
             else:
                 prev_layer = None
 
@@ -129,7 +130,7 @@ def _gen_backprop_eqs(self):
                 eqs.append(Eq(layer.bias_gradients,
                               layer.bias_gradients / batch_size))
 
-        return (eqs, args)
+        return eqs, args
 
     @property
     def pytorch_parameters(self):
diff --git a/joey/new_layers.py b/joey/new_layers.py
new file mode 100644
index 0000000..b8caa6e
--- /dev/null
+++ b/joey/new_layers.py
@@ -0,0 +1,404 @@
+from functools import reduce
+
+import numpy as np
+from devito import Grid, Eq, Inc, Max, Function, exp, sum, Constant, sqrt
+from numpy.core.multiarray import array
+from scipy.special import softmax
+
+from joey import Layer, default_name_allocator, default_dim_allocator
+from joey.funtional import Dropout3d, Softmax3d
+from joey.utils import get_tensor_3d, get_tensor_2d
+
+
+class FullyConnected2d(Layer):
+    """
+    A Layer subclass corresponding to a full connection (FC) layer.
+
+    Parameters
+    ----------
+    weight_size : (int, int)
+        The shape of a weight matrix (represented internally by a NumPy array)
+        expressed as (rows, columns).
+    input_size : (int, int)
+        The shape of input data expressed as (rows, columns).
+    name_allocator_func : zero-argument function, optional
+        See Layer.__doc__.
+    dim_allocator_func : one-argument function, optional
+        See Layer.__doc__.
+    activation : Activation, optional
+        See Layer.__doc__. The actual default value is Dummy.
+    generate_code : bool, optional
+        See Layer.__doc__.
+    """
+
+    def __init__(self, weight_size, input_size, name_allocator_func=default_name_allocator,
+                 dim_allocator_func=default_dim_allocator, activation=None,
+                 generate_code=False):
+        super().__init__(weight_size, input_size, activation,
+                         name_allocator_func, dim_allocator_func,
+                         generate_code)
+
+    def _allocate(self, weight_size, input_size, name_allocator_func, dim_allocator_func, **kwargs):
+        """Allocate the weight, input, result, bias and the three gradient Functions (all float64)."""
+        t1, t2, t3 = dim_allocator_func(3)
+
+        self._dimensions = (t1, t2, t3)
+
+        gridW = Grid(shape=weight_size, dimensions=(t3, t2))  # weights: (output column, input column)
+        W = Function(name=name_allocator_func(), grid=gridW, space_order=0,
+                     dtype=np.float64)
+
+        gridV_dimensions = (t1, t2)
+        gridR_dimensions = (t1, t3)
+        gridR_shape = (input_size[0], weight_size[0])  # one output row per input row
+
+        gridV = Grid(shape=input_size, dimensions=gridV_dimensions)
+        V = Function(name=name_allocator_func(), grid=gridV, space_order=0,
+                     dtype=np.float64)
+
+        gridR = Grid(shape=gridR_shape, dimensions=gridR_dimensions)
+        R = Function(name=name_allocator_func(), grid=gridR, space_order=0,
+                     dtype=np.float64)
+
+        if self._activation is not None:
+            self._T = Function(name=name_allocator_func(), grid=gridR,
+                               space_order=0, dtype=np.float64)
+
+        bias_grid = Grid(shape=weight_size[0],
+                         dimensions=(t3,))
+        bias = Function(name=name_allocator_func(), grid=bias_grid,
+                        space_order=0, dtype=np.float64)
+
+        kernel_grad = Function(name=name_allocator_func(),
+                               grid=gridW, space_order=0, dtype=np.float64)
+
+        output_grad = Function(name=name_allocator_func(),
+                               grid=gridR, space_order=0,
+                               dtype=np.float64)
+
+        bias_grad = Function(name=name_allocator_func(),
+                             grid=bias_grid, space_order=0, dtype=np.float64)
+
+        return W, V, R, bias, kernel_grad, output_grad, bias_grad
+
+    def execute(self, input_data, bias, weight_data=None):
+        if weight_data is not None:  # the current weights are kept when none are supplied
+            self._K.data[:] = weight_data
+
+        self._I.data[:] = input_data
+        self._bias.data[:] = bias
+
+        if self._activation is not None:
+            self._T.data[:] = 0
+
+        self._R.data[:] = 0
+
+        return super().execute()
+
+    def equations(self, dims=None, zero=False):
+        """Return ``(eqs, [])``: zero the result, add kernel * input, then bias / activation. ``zero`` is unused."""
+        a, b, c = dims if dims else self._dimensions
+
+        eqs = [Eq(self.result, 0)]
+        eqs += [Inc(self.result[a, c], self.kernel[c, b] * self.input[a, b])]
+
+        if self._activation is not None:
+            eqs.append(Eq(self.result[a, c], self._activation(self.bias[c] + self.result[a, c])))
+        else:
+            eqs.append(Inc(self.result[a, c], self.bias[c]))
+
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer):
+        layer = self
+        # Without a prev_layer only this layer's own bias and kernel gradients are accumulated.
+        if prev_layer is None:
+            return ([Inc(layer.bias_gradients, layer.result_gradients),
+                     Inc(layer.kernel_gradients,
+                         layer.input * layer.result_gradients)], [])
+
+        return ([Inc(layer.result_gradients,
+                     prev_layer.kernel *
+                     prev_layer.result_gradients)] +
+                layer.activation.backprop_eqs(layer) +
+                [Inc(layer.bias_gradients, layer.result_gradients),
+                 Eq(layer.kernel_gradients,
+                    layer.kernel_gradients + layer.input * layer.result_gradients)
+                 ], [])
+class FullyConnected3d(Layer):
+    """
+    A Layer subclass corresponding to a full connection (FC) layer.
+
+    Parameters
+    ----------
+    weight_size : (int, int)
+        The shape of a weight matrix (represented internally by a NumPy array)
+        expressed as (rows, columns).
+    input_size : (int, int, int)
+        The shape of input data; only its last axis is multiplied with the weights.
+    name_allocator_func : zero-argument function, optional
+        See Layer.__doc__.
+    dim_allocator_func : one-argument function, optional
+        See Layer.__doc__.
+    activation : Activation, optional
+        See Layer.__doc__. The actual default value is Dummy.
+    generate_code : bool, optional
+        See Layer.__doc__.
+    """
+
+    def __init__(self, weight_size, input_size, name_allocator_func=default_name_allocator,
+                 dim_allocator_func=default_dim_allocator, activation=None,
+                 generate_code=False):
+        super().__init__(weight_size, input_size, activation,
+                         name_allocator_func, dim_allocator_func,
+                         generate_code)
+
+    def _allocate(self, weight_size, input_size, name_allocator_func, dim_allocator_func, **kwargs):
+        """Allocate the weight, input, result, bias and the three gradient Functions (all float64)."""
+        t1, t2, t3, t4 = dim_allocator_func(4)
+
+        self._dimensions = (t1, t2, t3, t4)
+
+        gridW = Grid(shape=weight_size, dimensions=(t4, t3))  # weights: (output column, input column)
+        W = Function(name=name_allocator_func(), grid=gridW, space_order=0,
+                     dtype=np.float64)
+
+        gridV_dimensions = (t1, t2, t3)
+        gridR_dimensions = (t1, t2, t4)
+        gridR_shape = (input_size[0], input_size[1], weight_size[0])  # first two input axes are kept
+
+        gridV = Grid(shape=input_size, dimensions=gridV_dimensions)
+        V = Function(name=name_allocator_func(), grid=gridV, space_order=0,
+                     dtype=np.float64)
+
+        gridR = Grid(shape=gridR_shape, dimensions=gridR_dimensions)
+        R = Function(name=name_allocator_func(), grid=gridR, space_order=0,
+                     dtype=np.float64)
+
+        if self._activation is not None:
+            self._T = Function(name=name_allocator_func(), grid=gridR,
+                               space_order=0, dtype=np.float64)
+
+        bias_grid = Grid(shape=weight_size[0],
+                         dimensions=(t4,))
+        bias = Function(name=name_allocator_func(), grid=bias_grid,
+                        space_order=0, dtype=np.float64)
+
+        kernel_grad = Function(name=name_allocator_func(),
+                               grid=gridW, space_order=0, dtype=np.float64)
+
+        output_grad = Function(name=name_allocator_func(),
+                               grid=gridR, space_order=0,
+                               dtype=np.float64)
+
+        bias_grad = Function(name=name_allocator_func(),
+                             grid=bias_grid, space_order=0, dtype=np.float64)
+
+        return W, V, R, bias, kernel_grad, output_grad, bias_grad
+
+    def execute(self, input_data, bias, weight_data=None):
+        if weight_data is not None:  # the current weights are kept when none are supplied
+            self._K.data[:] = weight_data
+
+        self._I.data[:] = input_data
+        self._bias.data[:] = bias
+
+        if self._activation is not None:
+            self._T.data[:] = 0
+
+        self._R.data[:] = 0
+
+        return super().execute()
+
+    def equations(self, dims=None, zero=False):
+        """Return ``(eqs, [])``: zero the result, add kernel * input, then bias / activation. ``zero`` is unused."""
+        a, b, c, d = dims if dims else self._dimensions
+
+        eqs = [Eq(self.result, 0)]
+        eqs += [Inc(self.result[a, b, d], self.kernel[d, c] * self.input[a, b, c])]
+
+        if self._activation is not None:
+            eqs.append(Eq(self.result[a, b, d], self._activation(self.bias[d] + self.result[a, b, d])))
+        else:
+            eqs.append(Inc(self.result[a, b, d], self.bias[d]))
+
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer):
+        layer = self
+        # Without a prev_layer only this layer's own bias and kernel gradients are accumulated.
+        if prev_layer is None:
+            return ([Inc(layer.bias_gradients, layer.result_gradients),
+                     Inc(layer.kernel_gradients,
+                         layer.input * layer.result_gradients)], [])
+
+        return ([Inc(layer.result_gradients,
+                     prev_layer.kernel *
+                     prev_layer.result_gradients)] +
+                layer.activation.backprop_eqs(layer) +
+                [Inc(layer.bias_gradients, layer.result_gradients),
+                 Eq(layer.kernel_gradients,
+                    layer.kernel_gradients + layer.input * layer.result_gradients)
+                 ], [])
+
+
+class Norm3d(Layer):
+    """Normalisation over the last axis of a 3-d input with a per-column scale (kernel) and shift (bias)."""
+    def __init__(self, weight_size, input_size, name_allocator_func=default_name_allocator,
+                 dim_allocator_func=default_dim_allocator, activation=None,
+                 generate_code=False, **kwargs):
+        super().__init__(weight_size, input_size, activation,
+                         name_allocator_func, dim_allocator_func,
+                         generate_code, **kwargs)
+
+    def _allocate(self, weight_size, input_size, name_allocator_func, dim_allocator_func, **kwargs):
+        batch, row, col, col2 = dim_allocator_func(4)
+        # col2 indexes the size-1 trailing axis of the per-row statistics built in equations().
+        self.eps = kwargs.get('eps', 1e-6)
+        self._dimensions = (batch, row, col, col2)
+        self.shape = input_size
+
+        self.N = weight_size[0]
+
+        gridW = Grid(shape=weight_size, dimensions=(col,))
+        W = Function(name=name_allocator_func(), grid=gridW, space_order=0, dtype=np.float64)
+
+        gridV_dimensions = (batch, row, col)
+        gridR_dimensions = (batch, row, col)
+        gridR_shape = input_size
+
+        gridV = Grid(shape=input_size, dimensions=gridV_dimensions)
+        V = Function(name=name_allocator_func(), grid=gridV, space_order=0, dtype=np.float64)
+
+        gridR = Grid(shape=gridR_shape, dimensions=gridR_dimensions)
+        R = Function(name=name_allocator_func(), grid=gridR, space_order=0, dtype=np.float64)
+
+        bias_grid = Grid(shape=weight_size[0], dimensions=(col,))
+        bias = Function(name=name_allocator_func(), grid=bias_grid, space_order=0, dtype=np.float64)
+
+        kernel_grad = Function(name=name_allocator_func(), grid=gridW, space_order=0, dtype=np.float64)
+
+        output_grad = Function(name=name_allocator_func(), grid=gridR, space_order=0, dtype=np.float64)
+
+        bias_grad = Function(name=name_allocator_func(), grid=bias_grid, space_order=0, dtype=np.float64)
+
+        return W, V, R, bias, kernel_grad, output_grad, bias_grad
+
+    def init_params(self):
+        self._K.data[:] = np.ones(self.N)  # scale starts at 1
+        self._bias.data[:] = np.zeros(self.N)  # shift starts at 0
+
+    def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
+        pass  # placeholder: standalone execution is not implemented, returns None
+
+    def equations(self, zero=False) -> (list, list):
+        """Return ``(eqs, [])`` computing kernel * (input - mean) / (std + eps) + bias; ``zero`` is unused."""
+        batch, row, col, col2 = self._dimensions
+        axis = Constant(name=self.name + 'dim_1', value=self._I.shape[-1])
+        eps = Constant(name=self.name + 'eps', value=self.eps)
+        result_sum = get_tensor_3d(default_name_allocator('result_sum_' + self.name),
+                                   shape=(self.shape[0:2] + (1,)),
+                                   dims=(batch, row, col2))
+        result_mean = get_tensor_3d(default_name_allocator('result_mean_' + self.name),
+                                    shape=(self.shape[0:2] + (1,)),
+                                    dims=(batch, row, col2))
+        result_std = get_tensor_3d(default_name_allocator('result_std_' + self.name),
+                                   shape=(self.shape[0:2] + (1,)),
+                                   dims=(batch, row, col2))
+        eqs = [Eq(self.result, 0)]
+        eqs += [
+            Eq(result_sum, 0),
+            Eq(result_std, 0),  # both accumulators must restart from zero on every operator run
+            Inc(result_sum, self.input),
+            Eq(result_mean, result_sum / axis),
+            Inc(result_std, ((self.input - result_mean) ** 2)),
+            Eq(result_std, result_std / axis),
+            Eq(result_std, sqrt(result_std)),
+            Eq(self.result, self.kernel * (self.input - result_mean) / (result_std + eps)),
+            Inc(self.result, self.bias[col])
+        ]
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []  # no backward equations are generated
+
+
+class Norm2d(Layer):
+    """Normalisation over the last axis of a 2-d input with a per-column scale (kernel) and shift (bias)."""
+    def __init__(self, weight_size, input_size, name_allocator_func=default_name_allocator,
+                 dim_allocator_func=default_dim_allocator, activation=None,
+                 generate_code=False, **kwargs):
+        super().__init__(weight_size, input_size, activation,
+                         name_allocator_func, dim_allocator_func,
+                         generate_code, **kwargs)
+
+    def _allocate(self, weight_size, input_size, name_allocator_func, dim_allocator_func, **kwargs):
+        row, col, col2 = dim_allocator_func(3)
+        # col2 indexes the size-1 trailing axis of the per-row statistics built in equations().
+        self.eps = kwargs.get('eps', 1e-6)
+        self._dimensions = (row, col, col2)
+        self.shape = input_size
+
+        self.N = weight_size[0]
+
+        gridW = Grid(shape=weight_size, dimensions=(col,))
+        W = Function(name=name_allocator_func(), grid=gridW, space_order=0, dtype=np.float64)
+
+        gridV_dimensions = (row, col)
+        gridR_dimensions = (row, col)
+        gridR_shape = input_size
+
+        gridV = Grid(shape=input_size, dimensions=gridV_dimensions)
+        V = Function(name=name_allocator_func(), grid=gridV, space_order=0, dtype=np.float64)
+
+        gridR = Grid(shape=gridR_shape, dimensions=gridR_dimensions)
+        R = Function(name=name_allocator_func(), grid=gridR, space_order=0, dtype=np.float64)
+
+        bias_grid = Grid(shape=weight_size[0], dimensions=(col,))
+        bias = Function(name=name_allocator_func(), grid=bias_grid, space_order=0, dtype=np.float64)
+
+        kernel_grad = Function(name=name_allocator_func(), grid=gridW, space_order=0, dtype=np.float64)
+
+        output_grad = Function(name=name_allocator_func(), grid=gridR, space_order=0, dtype=np.float64)
+
+        bias_grad = Function(name=name_allocator_func(), grid=bias_grid, space_order=0, dtype=np.float64)
+
+        return W, V, R, bias, kernel_grad, output_grad, bias_grad
+
+    def init_params(self):
+        self._K.data[:] = np.ones(self.N)  # scale starts at 1
+        self._bias.data[:] = np.zeros(self.N)  # shift starts at 0
+
+    def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
+        pass  # placeholder: standalone execution is not implemented, returns None
+
+    def equations(self, zero=False) -> (list, list):
+        """Return ``(eqs, [])`` computing kernel * (input - mean) / (std + eps) + bias; ``zero`` is unused."""
+        row, col, col2 = self._dimensions
+        axis = Constant(name=self.name + 'dim_1', value=self._I.shape[-1])
+        eps = Constant(name=self.name + 'eps', value=self.eps)
+        result_sum = get_tensor_2d(default_name_allocator('result_sum_' + self.name),
+                                   shape=(self.shape[0:1] + (1,)),
+                                   dims=(row, col2))
+        result_mean = get_tensor_2d(default_name_allocator('result_mean_' + self.name),
+                                    shape=(self.shape[0:1] + (1,)),
+                                    dims=(row, col2))
+        result_std = get_tensor_2d(default_name_allocator('result_std_' + self.name),
+                                   shape=(self.shape[0:1] + (1,)),
+                                   dims=(row, col2))
+        eqs = [Eq(self.result, 0)]
+        eqs += [
+            Eq(result_sum, 0),
+            Eq(result_std, 0),  # both accumulators must restart from zero on every operator run
+            Inc(result_sum, self.input),
+            Eq(result_mean, result_sum / axis),
+            Inc(result_std, ((self.input - result_mean) ** 2)),
+            Eq(result_std, result_std / axis),
+            Eq(result_std, sqrt(result_std)),
+            Eq(self.result, self.kernel * (self.input - result_mean) / (result_std + eps)),
+            Inc(self.result, self.bias[col])
+        ]
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        return [], []  # no backward equations are generated
diff --git a/joey/utils.py b/joey/utils.py
new file mode 100644
index 0000000..686016f
--- /dev/null
+++ b/joey/utils.py
@@ -0,0 +1,33 @@
+from devito import Function, SpaceDimension
+import numpy as np
+
+from joey import default_dim_allocator, default_name_allocator
+
+
+def get_tensor_4d(name, shape, dims=None):
+    """Create a float32 devito ``Function`` with four axes; ``dims`` falls back to ``default_dim_allocator(4)``."""
+    n0, n1, n2, n3 = shape
+    d0, d1, d2, d3 = dims if dims else default_dim_allocator(4)
+    sizes, axes = (n0, n1, n2, n3), (d0, d1, d2, d3)
+    return Function(name=default_name_allocator(name), shape=sizes, dimensions=axes, dtype=np.float32)
+
+
+def get_tensor_3d(name, shape, dims=None):
+    """Create a float32 devito ``Function`` with three axes; ``dims`` falls back to ``default_dim_allocator(3)``."""
+    n0, n1, n2 = shape
+    d0, d1, d2 = dims if dims else default_dim_allocator(3)
+    return Function(name=default_name_allocator(name), shape=(n0, n1, n2), dimensions=(d0, d1, d2), dtype=np.float32)
+
+
+def get_tensor_2d(name, shape, dims=None):
+    """Create a float32 devito ``Function`` with two axes; ``dims`` falls back to ``default_dim_allocator(2)``."""
+    n0, n1 = shape
+    d0, d1 = dims if dims else default_dim_allocator(2)
+    return Function(name=default_name_allocator(name), shape=(n0, n1), dimensions=(d0, d1), dtype=np.float32)
+
+
+def get_tensor_1d(name, shape, dim=None):
+    """Create a float32 devito ``Function`` with one axis; ``shape`` may be an int or a one-element sequence."""
+    (a,) = shape if isinstance(shape, (tuple, list)) else (shape,)  # accept (n,) like the 2d/3d/4d helpers
+    _a = default_dim_allocator(1)[0] if not dim else dim
+    return Function(name=default_name_allocator(name), shape=(a,), dimensions=(_a,), dtype=np.float32)
diff --git a/joey/validate.py b/joey/validate.py
new file mode 100644
index 0000000..f14c0ec
--- /dev/null
+++ b/joey/validate.py
@@ -0,0 +1,74 @@
+import numpy as np
+from torch import optim
+from torchvision import datasets, transforms, models
+import torchvision.transforms as transforms
+from devito import logger
+import torch
+
+from new_layers import x as net
+
+# logger.set_log_level(level='ERROR')
+
+mean, std = (0.5,), (0.5,)
+BATCH_SIZE = 64
+
+transform = transforms.Compose([transforms.ToTensor(),
+                                transforms.Normalize(mean, std)
+                                ])
+
+trainset = datasets.MNIST('../data/MNIST/', download=True, train=True, transform=transform)
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)
+
+testset = datasets.MNIST('../data/MNIST/', download=True, train=False, transform=transform)
+testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)
+
+optimizer = optim.SGD(net.pytorch_parameters, lr=0.001, momentum=0.9)
+criterion = torch.nn.CrossEntropyLoss()
+
+
+def loss_grad(result, expected):
+    """Build the loss-gradient rows handed to ``net.backward``.
+
+    Each row holds 10 entries read from ``result.result.data[i, b]``, with 1
+    subtracted from the entry whose index ``i`` equals ``expected[b]``.
+
+    NOTE(review): ``b`` runs over ``len(result.result.data)`` (the first axis)
+    but indexes the second axis; confirm this matches the layer's layout.
+    """
+    scores = result.result.data
+    gradients = []
+    for b in range(len(scores)):
+        row = []
+        for i in range(10):
+            # Keep the score in its own local so the ``result`` layer argument
+            # is never rebound inside the loop.
+            value = scores[i, b]
+            if i == expected[b]:
+                value -= 1
+            row.append(value)
+        gradients.append(row)
+
+    return gradients
+
+
+for img, label in trainloader:
+    # A trailing batch with fewer than BATCH_SIZE images cannot be reshaped below.
+    if img.size(0) != BATCH_SIZE:
+        continue
+    img = img.reshape(28, 28, BATCH_SIZE)
+
+    # print("Input Image Dimensions: {}".format(img.size()))
+    # print("Label Dimensions: {}".format(label.size()))
+    # print("-" * 100)
+
+    out = net.forward(img.detach().numpy())
+    # loss = criterion(torch.from_numpy(out), label.t())
+    # print(loss)
+    # NOTE(review): random values are passed as the expected labels; `label` is unused.
+    net.backward(np.random.rand(BATCH_SIZE, 10), loss_grad, optimizer)
+
+    # print("Output Dimensions: {}".format(out.shape))
+    # break
+
+
+# sys.exit(0)
diff --git a/requirements.txt b/requirements.txt
index 08a2554..5db82d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,9 @@
-devito
-torch
-torchvision
+devito~=4.8.0
+torch~=1.13.1
+torchvision~=0.14.1
+
+numpy~=1.24.2
+scipy~=1.10.1
+sympy~=1.11.1
+pytest~=7.2.1
+setuptools~=59.6.0
\ No newline at end of file
diff --git a/tests/test_ViT.py b/tests/test_ViT.py
new file mode 100644
index 0000000..bfb16cb
--- /dev/null
+++ b/tests/test_ViT.py
@@ -0,0 +1,69 @@
+import numpy as np
+import torch
+from torchvision import datasets
+import torchvision.transforms as transforms
+from devito import logger
+
+from joey.models.ViT import ViT
+from tests.utils import transfer_weights_ViT
+
+logger.set_log_level(level='ERROR')
+image_size = 28
+channel_size = 1
+patch_size = 7
+embed_size = 512
+num_heads = 8
+classes = 10
+num_layers = 3
+hidden_size = 256
+dropout = 0.2
+
+np.random.seed(0)
+
+BATCH_SIZE = 64
+
+mean, std = (0.5,), (0.5,)
+
+transform = transforms.Compose([transforms.ToTensor(),
+                                transforms.Normalize(mean, std)
+                                ])
+
+trainset = datasets.MNIST('../data/MNIST/', download=True, train=True, transform=transform)
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)
+
+testset = datasets.MNIST('../data/MNIST/', download=True, train=False, transform=transform)
+testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)
+
+model = ViT(image_size, channel_size, patch_size, embed_size, num_heads, classes, num_layers, hidden_size,
+            generate_code=True)
+
+
+def test_eval_model():
+    """Load the pretrained weights into ``model`` and require at least 95% accuracy on MNIST test batches.
+
+    The check is an ``assert`` because pytest ignores a test function's return value.
+    """
+    transfer_weights_ViT(model)
+
+    y_true_test = []
+    y_pred_test = []
+
+    for batch_idx, (img, labels) in enumerate(testloader):
+        # Only full batches are evaluated; a shorter batch is skipped.
+        if img.size(0) != BATCH_SIZE:
+            continue
+        preds = model.forward(img.detach().numpy())
+        y_pred_test.extend(preds.argmax(axis=-1).tolist())
+        y_true_test.extend(labels.detach().tolist())
+        # Evaluation stops after the batch with index 10.
+        if batch_idx == 10:
+            break
+
+    total = len(y_pred_test)
+    assert total > 0, "no full batch was evaluated"
+    total_correct = sum(x == y for x, y in zip(y_pred_test, y_true_test))
+    accuracy = total_correct * 100 / total
+
+    print("Test Accuracy%: ", accuracy, "==", total_correct, "/", total)
+
+    assert accuracy >= 95, f"accuracy {accuracy:.2f}% is below the 95% threshold"
diff --git a/tests/utils.py b/tests/utils.py
index 002ec72..3d72da4 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,6 +1,8 @@
 import numpy as np
 from os import environ
 
+import torch
+
 
 def compare(devito, pytorch, tolerance):
     pytorch = pytorch.detach().numpy()
@@ -26,3 +28,39 @@ def get_run_count():
         return 1000
     else:
         return 1
+
+
+def _copy_layer(weights, prefix, layer):
+    """Copy checkpoint entries ``<prefix>.weight`` and ``<prefix>.bias`` into ``layer``'s kernel and bias."""
+    layer.kernel.data[:] = weights[f'{prefix}.weight'].detach().numpy()
+    layer.bias.data[:] = weights[f'{prefix}.bias'].detach().numpy()
+
+
+def transfer_weights_ViT(model):
+    """Load the pretrained PyTorch ViT weights and copy them into ``model`` in place.
+
+    The checkpoint is read from ``../examples/resources/model_weights_ViT``
+    (relative to the current working directory) and indexed by parameter name.
+    """
+    # Every tensor is converted with ``.numpy()``, which needs CPU tensors, so
+    # map the checkpoint to the CPU regardless of the device it was saved from.
+    weights = torch.load('../examples/resources/model_weights_ViT', map_location='cpu')
+
+    _copy_layer(weights, 'embeddings', model.embeddings)
+    model.class_token.data[:] = weights['class_token'].detach().numpy()
+    model.positional_encoding.data[:] = weights['positional_encoding'].detach().numpy()
+
+    for num, encoder in enumerate(model.encoders):
+        prefix = f'encoders.{num}'
+        _copy_layer(weights, f'{prefix}.norm1', encoder.norm1)
+        _copy_layer(weights, f'{prefix}.norm2', encoder.norm2)
+        _copy_layer(weights, f'{prefix}.attention.Q', encoder.attention.Q)
+        _copy_layer(weights, f'{prefix}.attention.K', encoder.attention.K)
+        _copy_layer(weights, f'{prefix}.attention.V', encoder.attention.V)
+        _copy_layer(weights, f'{prefix}.attention.linear', encoder.attention.linear)
+        # ``mlp[1]`` takes checkpoint entry ``mlp.2``; no ``mlp.1`` entry is read.
+        _copy_layer(weights, f'{prefix}.mlp.0', encoder.mlp[0])
+        _copy_layer(weights, f'{prefix}.mlp.2', encoder.mlp[1])
+
+    _copy_layer(weights, 'norm', model.norm)
+    _copy_layer(weights, 'classifier.0', model.classifier)