diff --git a/.gitignore b/.gitignore
index 62b2ca8..52d9f37 100644
--- a/.gitignore
+++ b/.gitignore
@@ -189,3 +189,4 @@ dmypy.json
# End of https://www.toptal.com/developers/gitignore/api/python,emacs
+data/
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..07b79ea
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,37 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/joey2.0.iml b/.idea/joey2.0.iml
new file mode 100644
index 0000000..9289075
--- /dev/null
+++ b/.idea/joey2.0.iml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..1f79844
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..55c4497
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index eac241b..8a72c9e 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ Joey is a machine learning framework running on top of [Devito](https://github.c
* A backward pass through a neural network with batch processing
* Producing backpropagation equations automatically based on the list of layers in a neural network (only a loss function must be defined manually by the user)
* Training a neural network with PyTorch optimizers
+* Instantiating a Vision Transformer (ViT) network for image classification
Unlike other machine learning frameworks, Joey generates and compiles an optimized low-level code on-the-spot (using Devito) for both standalone layers and proper neural networks.
@@ -16,10 +17,23 @@ Unlike other machine learning frameworks, Joey generates and compiles an optimiz
* 2D max pooling (other types of 2D pooling can be implemented by the user by extending the `Pooling` abstract class)
* Full connection
* Flattening (an internal layer turning 2D data with channels into a 1D vector or 2D matrix, depending on the batch size)
+* 3D FullyConnected
+* Einsum function
+* Dropout (1D, 2D, 3D and 4D)
+* Norm 2D
+* Norm 3D
+* Softmax (3D and 4D)
+
+## Supported modules
+* MultiHeadAttention
+* VisionEncoder
+
+## Built-in Models
+* ViT (Vision Transformer)
## Supported activation functions
* ReLU
-* Softmax (only via the `FullyConnectedSoftmax` class)
+* Softmax (only via the `FullyConnectedSoftmax`, `Softmax3d` or `Softmax4d` classes)
* Dummy (`f(x) = x`)
Other activation functions can be implemented by extending the `Activation` abstract class.
diff --git a/examples/ViT_Running.py b/examples/ViT_Running.py
new file mode 100644
index 0000000..9e5f5f8
--- /dev/null
+++ b/examples/ViT_Running.py
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/examples/resources/model_weights_ViT b/examples/resources/model_weights_ViT
new file mode 100644
index 0000000..53c2b4d
Binary files /dev/null and b/examples/resources/model_weights_ViT differ
diff --git a/joey/base.py b/joey/base.py
index d4db2d5..37dd95d 100644
--- a/joey/base.py
+++ b/joey/base.py
@@ -1,5 +1,7 @@
from abc import ABC, abstractmethod
-from devito import Operator, Function, dimensions
+
+import numpy as np
+from devito import Operator, Function, dimensions, SpaceDimension
from joey import Activation
from joey import activation as activ
from numpy import array
@@ -8,21 +10,60 @@
dim_index = 0
-def default_name_allocator():
+def default_name_allocator(name=''):
global index
- name = 'f' + str(index)
+ _name = 'f' + name + str(index)
index += 1
- return name
+ return _name
def default_dim_allocator(count):
global dim_index
- names = ''
+ names = []
for i in range(count):
- names += 'd' + str(dim_index) + ' '
+ names.append('d' + str(dim_index))
dim_index += 1
- names = names[:-1]
- return dimensions(names)
+ return [SpaceDimension(n) for n in names]
+
+
+class Module(ABC):
+    """Abstract base class for composite blocks built from several layers.
+
+    The concrete subclass is responsible for assigning ``_I`` and ``_R``
+    (exposed through ``input`` and ``result``) and, before ``execute`` is
+    called, ``_op`` and ``_arg_dict``.
+    """
+
+    @property
+    def input(self):
+        """A Function object corresponding to an input data array."""
+        return self._I
+
+    @property
+    def result(self):
+        """A Function object corresponding to a result array."""
+        return self._R
+
+    @abstractmethod
+    def equations(self) -> (list, list):
+        """Return ``(equations, arguments)`` describing the forward pass."""
+        pass
+
+    def init_params(self):
+        """Fill ``kernel`` and ``bias`` with uniform random values in [-0.5, 0.5).
+
+        Neither attribute is defined by this class, so both are looked up
+        with ``getattr``: a module that has no ``kernel``/``bias`` (or has
+        them set to ``None``) is left untouched instead of raising
+        ``AttributeError``.
+        """
+        # The kernel is filled before the bias.
+        for attr in ('kernel', 'bias'):
+            tensor = getattr(self, attr, None)
+            if tensor is not None:
+                tensor.data[:] = np.random.rand(*tensor.shape) - 0.5
+
+    @abstractmethod
+    def _allocate(self, **kwargs) -> (Function, Function, Function,
+                                      Function, Function, Function,
+                                      Function):
+        """Allocate the tensors used by the module (subclass-specific)."""
+        pass
+
+    def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
+        """Run the stored operator and return the result data.
+
+        ``kernel_data``, ``input_data`` and ``bias`` are accepted but not
+        read here; the operator is applied with ``self._arg_dict`` only.
+        """
+        self._op.apply(**self._arg_dict)
+        return self._R.data
+
+    @abstractmethod
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return ``(equations, arguments)`` describing the backward pass."""
+        pass
class Layer(ABC):
@@ -62,7 +103,7 @@ def __init__(self, kernel_size,
input_size, activation=activ.Dummy(),
name_allocator_func=default_name_allocator,
dim_allocator_func=default_dim_allocator,
- generate_code=False):
+ generate_code=False, **kwargs):
if activation is None:
activation = activ.Dummy()
@@ -71,12 +112,14 @@ def __init__(self, kernel_size,
"its subclass")
self._activation = activation
-
- self._K, self._I, self._R, self._bias, self._KG, self._RG, \
- self._biasG = self._allocate(kernel_size,
- input_size,
- name_allocator_func,
- dim_allocator_func)
+ self.propagate = True
+ self.back_propagate = True
+ self.name = kwargs.get('name', '')
+ self._K, self._I, self._R, self._bias, self._KG, self._RG, self._biasG = self._allocate(kernel_size,
+ input_size,
+ name_allocator_func,
+ dim_allocator_func,
+ **kwargs)
if generate_code:
eqs, args = self.equations()
@@ -89,6 +132,11 @@ def kernel(self):
"""A Function object corresponding to a kernel/weight array."""
return self._K
+ @property
+ def weight(self):
+ """A Function object corresponding to a kernel/weight array."""
+ return self._K.data
+
@property
def input(self):
"""A Function object corresponding to an input data array."""
@@ -135,27 +183,35 @@ def pytorch_parameters(self):
kernel_parameter = None
bias_parameter = None
- if self._K is not None:
+ if self._K is not None and self.propagate:
kernel_tensor = from_numpy(self._K.data)
kernel_parameter = Parameter(kernel_tensor, requires_grad=False)
if self._KG is not None:
kernel_parameter.grad = from_numpy(self._KG.data)
- if self._bias is not None:
+ if self._bias is not None and self.propagate:
bias_tensor = from_numpy(self._bias.data)
bias_parameter = Parameter(bias_tensor, requires_grad=False)
if self._biasG is not None:
bias_parameter.grad = from_numpy(self._biasG.data)
- return (kernel_parameter, bias_parameter)
+ return kernel_parameter, bias_parameter
+
+ def init_params(self):
+ if self.kernel is not None:
+ self.kernel.data[:] = \
+ np.random.rand(*self.kernel.shape) - 0.5
+
+ if self.bias is not None:
+ self.bias.data[:] = np.random.rand(*self.bias.shape) - 0.5
@abstractmethod
def _allocate(self, kernel_size, input_size, name_allocator_func,
- dim_allocator_func) -> (Function, Function, Function,
- Function, Function, Function,
- Function):
+ dim_allocator_func, **kwargs) -> (Function, Function, Function,
+ Function, Function, Function,
+ Function):
"""
This method should return a (Function, Function, Function, Function,
Function, Function, Function) object corresponding to a kernel,
diff --git a/joey/funtional.py b/joey/funtional.py
new file mode 100644
index 0000000..59b24af
--- /dev/null
+++ b/joey/funtional.py
@@ -0,0 +1,310 @@
+from abc import ABC, abstractmethod
+from functools import reduce
+import numpy as np
+from devito import Eq, Function, exp, Inc
+
+from joey import default_name_allocator, Layer, default_dim_allocator
+from joey.utils import get_tensor_2d, get_tensor_1d, get_tensor_3d, get_tensor_4d
+
+
+def kernel_shape(x):
+    """Return the product of the entries of ``x`` (the element count of a shape).
+
+    An empty shape yields 1 instead of making ``reduce`` raise ``TypeError``.
+    """
+    return reduce(lambda a, b: a * b, x, 1)
+
+
+class Functional(Layer):
+    """Base class for helper layers that set up their own tensors.
+
+    ``__init__`` does nothing (it does not call ``Layer.__init__``), so each
+    subclass assigns ``_I``, ``_R``, ``_K`` and the remaining attributes itself.
+    """
+
+    def __init__(self):
+        # Intentionally empty: no allocation happens here.
+        return None
+
+    @abstractmethod
+    def equations(self) -> (list, list):
+        """Return the forward equations and their arguments."""
+
+    @abstractmethod
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return the backward equations and their arguments."""
+
+    def execute(self, kernel_data=None, input_data=None, bias=None) -> np.array:
+        """Do nothing and return ``None``; all three arguments are ignored."""
+        return None
+
+    def _allocate(self, kernel_size, input_size, name_allocator_func, dim_allocator_func, **kwargs) -> (
+            Function, Function, Function,
+            Function, Function, Function,
+            Function):
+        """Hand back the tensors already stored on the instance; every argument is ignored."""
+        already_set = (self._K, self._I, self._R, self.bias,
+                       self._KG, self._RG, self._biasG)
+        return already_set
+
+
+class BaseDropout(Functional):
+    """Shared mask initialisation for the dropout layers.
+
+    Reads ``N``, ``dropout``, ``shape`` and ``_K`` from the instance; this
+    class does not assign them itself.
+    """
+
+    def init_params(self):
+        """Write a shuffled 0/1 mask into the kernel.
+
+        ``int(N * dropout)`` entries are 0 and the remaining ones are 1.
+        """
+        dropped = int(self.N * self.dropout)
+        kept = self.N - dropped
+        mask = np.array([0] * dropped + [1] * kept)
+        np.random.shuffle(mask)
+
+        self._K.data[:] = mask.reshape(*self.shape)
+
+    def equations(self) -> (list, list):
+        """Placeholder returning ``None``."""
+        return None
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Placeholder returning ``None``."""
+        return None
+
+
+class Dropout1d(BaseDropout):
+    """Element-wise dropout mask for a 1D tensor: ``result = input * kernel``."""
+
+    def __init__(self, name, shape, kernel_size, input_size, dropout=0.1, **kwargs):
+        """Create the input, mask (kernel) and result tensors on shared dimensions.
+
+        ``kernel_size``, ``input_size`` and ``**kwargs`` are accepted but not used.
+        """
+        self.name = name
+        self.shape = shape
+        self.dropout = dropout
+        self.N = kernel_shape(shape)
+        self.propagate = False
+
+        tag = self.name
+        self._I = get_tensor_1d(default_name_allocator('input_' + tag), shape=self.shape)
+        # Mask and result reuse the input's dimensions.
+        self._K = get_tensor_1d(default_name_allocator('kernel_' + tag), shape=self.shape,
+                                dims=self._I.dimensions)
+        self._R = get_tensor_1d(default_name_allocator('result_' + tag), shape=self.shape,
+                                dims=self._I.dimensions)
+
+        # No bias and no gradient buffers.
+        self._bias = self._KG = self._RG = self._biasG = None
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+    def equations(self) -> (list, list):
+        """Return the single masking equation and an empty argument list."""
+        idx = self._R.dimensions
+        masked = Eq(self._R[idx], self._I[idx] * self._K[idx])
+        return [masked], []
+
+
+class Dropout2d(BaseDropout):
+    """Element-wise dropout mask for a 2D tensor: ``result = input * kernel``."""
+
+    def __init__(self, name, shape, kernel_size, input_size, dropout=0.1, **kwargs):
+        """Create the input, mask (kernel) and result tensors on shared dimensions.
+
+        ``kernel_size``, ``input_size`` and ``**kwargs`` are accepted but not used.
+        """
+        self.name = name
+        self.shape = shape
+        self.dropout = dropout
+        self.N = kernel_shape(shape)
+        self.propagate = False
+
+        tag = self.name
+        self._I = get_tensor_2d(default_name_allocator('input_' + tag), shape=self.shape)
+        # Mask and result reuse the input's dimensions.
+        self._K = get_tensor_2d(default_name_allocator('kernel_' + tag), shape=self.shape,
+                                dims=self._I.dimensions)
+        self._R = get_tensor_2d(default_name_allocator('result_' + tag), shape=self.shape,
+                                dims=self._I.dimensions)
+
+        # No bias and no gradient buffers.
+        self._bias = self._KG = self._RG = self._biasG = None
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+    def equations(self) -> (list, list):
+        """Return the single masking equation and an empty argument list."""
+        row, col = self._R.dimensions
+        masked = Eq(self._R[row, col], self._I[row, col] * self._K[row, col])
+        return [masked], []
+
+
+class Dropout3d(BaseDropout):
+    """Element-wise dropout mask for a 3D tensor: ``result = input * kernel``."""
+
+    def __init__(self, name, shape, dropout=0.1, **kwargs):
+        """Create the input, mask (kernel) and result tensors on shared dimensions.
+
+        ``**kwargs`` is accepted but not used.
+        """
+        self.name = name
+        self.shape = shape
+        self.dropout = dropout
+        self.N = kernel_shape(shape)
+        self.propagate = False
+
+        tag = self.name
+        self._I = get_tensor_3d(default_name_allocator('input_' + tag), shape=self.shape)
+        # Mask and result reuse the input's dimensions.
+        self._K = get_tensor_3d(default_name_allocator('kernel_' + tag), shape=self.shape,
+                                dims=self._I.dimensions)
+        self._R = get_tensor_3d(default_name_allocator('result_' + tag), shape=self.shape,
+                                dims=self._I.dimensions)
+
+        # No bias and no gradient buffers.
+        self._bias = self._KG = self._RG = self._biasG = None
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+    def equations(self) -> (list, list):
+        """Return the single masking equation and an empty argument list."""
+        i, j, k = self._R.dimensions
+        masked = Eq(self._R[i, j, k], self._I[i, j, k] * self._K[i, j, k])
+        return [masked], []
+
+
+class Dropout4d(BaseDropout):
+    """Element-wise dropout mask for a 4D tensor: ``result = input * kernel``."""
+
+    def __init__(self, name, shape, dropout=0.1, **kwargs):
+        """Create the input, mask (kernel) and result tensors on shared dimensions.
+
+        ``**kwargs`` is accepted but not used.
+        """
+        self.name = name
+        self.shape = shape
+        self.dropout = dropout
+        self.N = kernel_shape(shape)
+        self.propagate = False
+
+        tag = self.name
+        self._I = get_tensor_4d(default_name_allocator('input_' + tag), shape=self.shape)
+        # Mask and result reuse the input's dimensions.
+        self._K = get_tensor_4d(default_name_allocator('kernel_' + tag), shape=self.shape,
+                                dims=self._I.dimensions)
+        self._R = get_tensor_4d(default_name_allocator('result_' + tag), shape=self.shape,
+                                dims=self._I.dimensions)
+
+        # No bias and no gradient buffers.
+        self._bias = self._KG = self._RG = self._biasG = None
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+    def equations(self) -> (list, list):
+        """Return the single masking equation and an empty argument list."""
+        i, j, k, m = self._R.dimensions
+        masked = Eq(self._R[i, j, k, m], self._I[i, j, k, m] * self._K[i, j, k, m])
+        return [masked], []
+
+
+class Softmax3d(Functional):
+    """Softmax of a 3D tensor, written as Devito equations.
+
+    The exponentials are accumulated into a tensor whose last axis has
+    length 1, and every exponential is divided by that accumulated value.
+    """
+
+    def __init__(self, name, shape, **kwargs):
+        """Allocate the input and result tensors of ``shape``; ``**kwargs`` is ignored."""
+        self.name = name
+        self.shape = shape
+        self.propagate = False
+
+        in_name = default_name_allocator('input_' + self.name)
+        self._I = get_tensor_3d(in_name, shape=self.shape)
+        out_name = default_name_allocator('result_' + self.name)
+        self._R = get_tensor_3d(out_name, shape=self.shape)
+
+        # No kernel, bias or gradient buffers.
+        self._K = self._bias = self._KG = self._RG = self._biasG = None
+
+    def init_params(self):
+        """Nothing to initialise."""
+        return None
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+    def equations(self) -> (list, list):
+        """Return the forward equations and an empty argument list.
+
+        Every call allocates a fresh dimension and two fresh helper tensors
+        (the exponentials and their accumulated sum).
+        """
+        i0, i1, i2 = self._I.dimensions
+        r0, r1, r2 = self._R.dimensions
+
+        unit = default_dim_allocator(1)[0]
+        exp_name = default_name_allocator('exponential_' + self.name)
+        exponent = get_tensor_3d(exp_name, shape=self.shape, dims=(i0, i1, i2))
+        total_name = default_name_allocator('sum_all_' + self.name)
+        total = get_tensor_3d(total_name, shape=(self.shape[0:2] + (1,)),
+                              dims=(i0, i1, unit))
+
+        forward = [Eq(self.result, 0)]
+        forward.append(Eq(exponent[i0, i1, i2], exp(self._I[i0, i1, i2])))
+        forward.append(Eq(total[i0, i1, unit], 0))
+        forward.append(Inc(total[i0, i1, unit], exponent[i0, i1, i2]))
+        forward.append(Eq(self.result[r0, r1, r2], exponent[r0, r1, r2] / total[r0, r1, unit]))
+        return forward, []
+
+
+class Softmax4d(Functional):
+    """Softmax of a 4D tensor, written as Devito equations.
+
+    The exponentials are accumulated into a tensor whose last axis has
+    length 1, and every exponential is divided by that accumulated value.
+    """
+
+    def __init__(self, name, shape):
+        """Allocate the input and result tensors of ``shape``."""
+        self.name = name
+        self.shape = shape
+        self.propagate = False
+
+        in_name = default_name_allocator('input_' + self.name)
+        self._I = get_tensor_4d(in_name, shape=self.shape)
+        out_name = default_name_allocator('result_' + self.name)
+        self._R = get_tensor_4d(out_name, shape=self.shape)
+
+        self.dimensions = self._R.dimensions
+
+        # No kernel, bias or gradient buffers.
+        self._K = self._bias = self._KG = self._RG = self._biasG = None
+
+    def init_params(self):
+        """Nothing to initialise."""
+        return None
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+    def equations(self) -> (list, list):
+        """Return the forward equations and an empty argument list.
+
+        Every call allocates a fresh dimension and two fresh helper tensors
+        (the exponentials and their accumulated sum).
+        """
+        i0, i1, i2, i3 = self._I.dimensions
+        r0, r1, r2, r3 = self._R.dimensions
+
+        unit = default_dim_allocator(1)[0]
+        exp_name = default_name_allocator('exponential_' + self.name)
+        exponent = get_tensor_4d(exp_name, shape=self.shape, dims=(i0, i1, i2, i3))
+        total_name = default_name_allocator('sum_all_' + self.name)
+        total = get_tensor_4d(total_name, shape=(self.shape[0:3] + (1,)),
+                              dims=(i0, i1, i2, unit))
+
+        forward = [Eq(self.result, 0)]
+        forward.append(Eq(exponent[i0, i1, i2, i3], exp(self.input[i0, i1, i2, i3])))
+        forward.append(Eq(total[i0, i1, i2, unit], 0))
+        forward.append(Inc(total[i0, i1, i2, unit], exponent[i0, i1, i2, i3]))
+        forward.append(
+            Eq(self.result[r0, r1, r2, r3], exponent[r0, r1, r2, r3] / total[r0, r1, r2, unit])
+        )
+        return forward, []
+
+
+class Expand3to4(Functional):
+    """Split the last axis of a 3D tensor into two axes, producing a 4D tensor."""
+
+    def __init__(self, name, shape_in, shape_out):
+        """Allocate the 3D input and the 4D result.
+
+        ``shape_in[-1]`` must equal ``shape_out[-2] * shape_out[-1]``;
+        otherwise the assertion below fails.
+        """
+        assert shape_in[-1] == (shape_out[-1] * shape_out[-2]), 'The last Input dimension must match the ' \
+                                                                'multiplication ' \
+                                                                'of the 2 last Result dimensions.'
+
+        self.name = name
+        self.propagate = False
+
+        self._I = get_tensor_3d(default_name_allocator('input_' + self.name), shape=shape_in)
+        self._R = get_tensor_4d(default_name_allocator('result_' + self.name), shape=shape_out)
+
+        self.dimensions = self._R.dimensions
+
+        self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None
+
+    def init_params(self):
+        """Nothing to initialise."""
+        pass
+
+    def equations(self) -> (list, list):
+        """Return the equation ``R[a, b, c, d] = I[a, b, c * inner + d]`` and no arguments."""
+        a, b, c, d = self._R.dimensions
+        # The stride between consecutive values of ``c`` is the length of the
+        # result's LAST axis, so (c, d) covers exactly [0, shape_in[-1]).
+        # Using the input's last axis as the stride would index past its end
+        # as soon as c >= 1.
+        inner = self._R.shape[-1]
+        return [
+            Eq(self._R[a, b, c, d], self._I[a, b, (c * inner) + d])
+        ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+
+class Contract4to3(Functional):
+    """Merge the last two axes of a 4D tensor into one, producing a 3D tensor."""
+
+    def __init__(self, name, shape_in, shape_out):
+        """Allocate the 4D input and the 3D result.
+
+        ``shape_out[-1]`` must equal ``shape_in[-2] * shape_in[-1]``;
+        otherwise the assertion below fails.
+        """
+        assert shape_out[-1] == (shape_in[-1] * shape_in[-2]), 'The last Result dimension must match the ' \
+                                                               'multiplication ' \
+                                                               'of the 2 last Input dimensions.'
+
+        self.name = name
+        self.propagate = False
+
+        in_name = default_name_allocator('input_' + self.name)
+        self._I = get_tensor_4d(in_name, shape=shape_in)
+        out_name = default_name_allocator('result_' + self.name)
+        self._R = get_tensor_3d(out_name, shape=shape_out)
+
+        self.dimensions = self._R.dimensions
+
+        # No kernel, bias or gradient buffers.
+        self._K = self._bias = self._KG = self._RG = self._biasG = None
+
+    def init_params(self):
+        """Nothing to initialise."""
+        return None
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+    def equations(self) -> (list, list):
+        """Return the equation ``R[a, b, c * inner + d] = I[a, b, c, d]`` and no arguments."""
+        i0, i1, i2, i3 = self._I.dimensions
+        # ``inner`` is the length of the input's last axis.
+        _, _, _, inner = self._I.shape
+        flatten = Eq(self._R[i0, i1, (i2 * inner + i3)], self._I[i0, i1, i2, i3])
+        return [flatten], []
+
+
+class Reduce2ndDimension3d(Functional):
+    """Keep only the last entry along the second axis of a 3D tensor.
+
+    The input has shape ``shape`` and the result has shape
+    ``(shape[0], shape[-1])``.
+    """
+
+    def __init__(self, name, shape):
+        """Allocate the 3D input and the 2D result."""
+        # ``Functional.__init__`` sets nothing, so ``name`` and ``propagate``
+        # must be assigned here, as the other Functional layers do.
+        self.name = name
+        self.propagate = False
+
+        self._I = get_tensor_3d('reduce_input_' + name, shape=shape)
+        self._R = get_tensor_2d('reduce_result_' + name, shape=(shape[0], shape[-1]))
+
+        self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None
+
+    def equations(self) -> (list, list):
+        """Return the equation ``R[a, b] = I[a, X - 1, b]`` and no arguments.
+
+        ``X`` is the length of the input's second axis.
+        """
+        a, b = self.result.dimensions
+        _, X, _ = self.input.shape
+        return [
+            Eq(self._R[a, b], self._I[a, X - 1, b])
+        ], []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
diff --git a/joey/models/ViT.py b/joey/models/ViT.py
new file mode 100644
index 0000000..7fcc74a
--- /dev/null
+++ b/joey/models/ViT.py
@@ -0,0 +1,130 @@
+from joey import Module, default_dim_allocator
+from joey.module.VisionEncoder import VisionEncoder
+from joey.utils import get_tensor_3d
+from joey.new_layers import FullyConnected3d, Norm2d, FullyConnected2d
+from devito import Operator, Inc, Eq, Function
+from scipy.special import log_softmax
+import numpy as np
+
+
+class ViT(Module):
+    r"""Vision Transformer Model
+
+    A transformer model to solve vision tasks by treating images as sequences of tokens.
+
+    Args:
+        image_size (int): Side length of the input image; must be a multiple of ``patch_size``
+        channel_size (int): Number of image channels
+        patch_size (int): Side length of one patch; determines the number of patches and the token size
+        embed_size (int): Embedding size of each token
+        num_heads (int): Number of heads in Multi-Headed Attention
+        classes (int): Number of classes for classification of data
+        num_layers (int): Number of stacked ``VisionEncoder`` blocks
+        hidden_size (int): Forwarded unchanged to every ``VisionEncoder``
+        batch (int, optional): Batch size all tensors are allocated for
+        generate_code (bool, optional): Build the Devito operator in the constructor
+
+    Raises:
+        ValueError: If ``image_size`` is not a multiple of ``patch_size``.
+
+    """
+
+    def __init__(self, image_size: int, channel_size: int, patch_size: int, embed_size: int, num_heads: int,
+                 classes: int, num_layers: int, hidden_size: int, batch: int = 64, generate_code=False):
+        if image_size % patch_size != 0:
+            # With a partial patch the embedding input and the result tensor
+            # would be allocated for different patch counts.
+            raise ValueError('image_size must be a multiple of patch_size')
+
+        self.p = patch_size
+        self.image_size = image_size
+        self.embed_size = embed_size
+        self.num_patches = (image_size // patch_size) ** 2
+        # From here on ``patch_size`` means the flattened token length.
+        self.patch_size = channel_size * (patch_size ** 2)
+        self.num_heads = num_heads
+        self.classes = classes
+        self.num_layers = num_layers
+        self.hidden_size = hidden_size
+
+        img_shape = (batch, self.num_patches, self.patch_size)
+
+        # One extra row holds the class token.
+        self._R = get_tensor_3d('result_1srt', (batch, self.num_patches + 1, self.embed_size))
+
+        d, e = default_dim_allocator(2)
+        x, y, z = self._R.dimensions
+
+        self.embeddings = FullyConnected3d(input_size=img_shape, weight_size=(self.embed_size, self.patch_size))
+        self.class_token = get_tensor_3d('class_token', (1, 1, self.embed_size), dims=(d, e, z))
+        self.positional_encoding = get_tensor_3d('pos_enc', (1, self.num_patches + 1, self.embed_size), dims=(d, y, z))
+
+        self.class_token.data[:] = np.random.rand(*self.class_token.shape)
+        self.positional_encoding.data[:] = np.random.rand(*self.positional_encoding.shape)
+
+        self.encoders = []
+        for layer in range(self.num_layers):
+            vision_encoder = VisionEncoder(
+                embed_size=self.embed_size,
+                num_heads=self.num_heads,
+                batch_size=batch,
+                lines=self.num_patches + 1,
+                hidden_size=self.hidden_size,
+                name='encoder' + str(layer)
+            )
+            self.encoders.append(vision_encoder)
+
+        self.norm = Norm2d(input_size=(batch, self.embed_size), weight_size=(self.embed_size,))
+        self.classifier = FullyConnected2d(input_size=(batch, self.embed_size), weight_size=(self.classes,
+                                                                                              self.embed_size))
+
+        if generate_code:
+            self._build_operator()
+
+    def _build_operator(self):
+        """Assemble the forward equations into a Devito operator stored on the instance."""
+        eqs, args = self.equations()
+        self._arg_dict = dict(args)
+        self._op = Operator(eqs)
+        # Bare attribute access on purpose; presumably it forces compilation
+        # up front -- confirm against the Devito documentation.
+        self._op.cfunction
+
+    def equations(self):
+        """Return the forward equations of the whole network and an empty argument list.
+
+        Order: patch embedding, class token, positional encoding, the encoder
+        stack, then normalisation and classification of the last row.
+        """
+        a, b, c = self.embeddings.result.dimensions
+        d, _, _ = self.class_token.dimensions
+        x, y, z = self.result.dimensions
+
+        t0, u0, v0 = self.encoders[0].norm1.input.dimensions
+
+        eqs = [
+            Eq(self.result, 0),
+            *self.embeddings.equations()[0],
+            Eq(self.result[a, b, c], self.embeddings.result[a, b, c]),
+            # The class token is stored in the last row (index num_patches).
+            Eq(self.result[x, self.num_patches, z], self.class_token[0, 0, z]),
+            Inc(self.result[x, y, z], self.positional_encoding[d, y, z]),
+            Eq(self.encoders[0].norm1.input[t0, u0, v0], self.result[t0, u0, v0])
+        ]
+
+        for index, encoder in enumerate(self.encoders):
+            if index > 0:
+                # Feed the previous encoder's result into this encoder.
+                t, u, v = self.encoders[index].input.dimensions
+                eqs.append(
+                    Eq(self.encoders[index].input[t, u, v], self.encoders[index-1].result[t, u, v])
+                )
+            eqs += encoder.equations()[0]
+
+        last_encoder = self.encoders[-1].result
+
+        a, b = self.norm.input.dimensions
+        i, j = self.classifier.input.dimensions
+        _, x, _ = last_encoder.shape
+        eqs += [
+            # Only the last row (the class-token row) is classified.
+            Eq(self.norm.input[a, b], last_encoder[a, x - 1, b]),
+            *self.norm.equations()[0],
+            Eq(self.classifier.input[i, j], self.norm.result[i, j]),
+            *self.classifier.equations()[0]
+        ]
+
+        return eqs, []
+
+    def _allocate(self, **kwargs) -> (Function, Function, Function,
+                                      Function, Function, Function,
+                                      Function):
+        """Unused: all tensors are created in ``__init__``."""
+        pass
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments."""
+        return [], []
+
+    def forward(self, x):
+        """Run the network on ``x`` and return log-softmax class scores.
+
+        ``x`` is unpacked as ``(batch, channels, height, width)``. The
+        operator is built on first use when the constructor was called with
+        ``generate_code=False``.
+        """
+        if not hasattr(self, '_op'):
+            self._build_operator()
+
+        b, c, h, w = x.shape
+        # NOTE(review): this is a plain reshape; it does not rearrange pixels
+        # into spatial patches -- confirm this matches how the weights were produced.
+        x = x.reshape(b, (h // self.p) * (w // self.p), c * self.p * self.p)
+        self.embeddings.input.data[:] = x
+
+        self._op.apply(**self._arg_dict)
+
+        return log_softmax(self.classifier.result.data, axis=-1)
\ No newline at end of file
diff --git a/joey/module/MultiHeadAttention.py b/joey/module/MultiHeadAttention.py
new file mode 100644
index 0000000..4ab86e2
--- /dev/null
+++ b/joey/module/MultiHeadAttention.py
@@ -0,0 +1,152 @@
+import math
+
+from devito import Function, Operator, Eq, Inc, Constant, exp
+
+from joey import Module, default_dim_allocator
+from joey.utils import get_tensor_4d, get_tensor_3d
+from joey.new_layers import FullyConnected3d
+
+from torch import nn
+import torch
+from torch import functional as F
+
+
+class MultiHeadAttention(Module):
+ r"""Multi-headed Attention for input Query, Key, Value
+
+ Multi-headed Attention is a module for attention mechanisms which runs attention several times in
+ parallel; the outputs of the heads are then concatenated and linearly transformed.
+
+ Args:
+ embed_size (int): Max embedding size; must be divisible by num_heads
+ num_heads (int): Number of heads in multi-headed attention; Number of splits in the embedding size
+ lines (int): Size of the second axis of the input tensor
+ batch_size (int): Size of the first axis of the input tensor
+ batch_dim (int, optional): The dimension in which batch dimensions is; stored but not read by this class
+ generate_code (bool, optional): Build the Devito operator in the constructor
+ name (str, optional): Included in the names of most tensors created here
+
+ """
+
+ def __init__(self,
+ embed_size: int,
+ num_heads: int,
+ lines: int,
+ batch_size: int,
+ batch_dim: int = 0,
+ generate_code=False,
+ name='att'
+ ):
+ self.name = name
+ self.embed_size = embed_size
+ self.num_heads = num_heads
+ self.batch_dim = batch_dim
+ self.lines = lines
+ self.batch_size = batch_size
+
+ self.head_size = self.embed_size // self.num_heads
+
+ assert self.head_size * self.num_heads == self.embed_size, "Heads cannot split Embedding size equally"
+
+ self._I = get_tensor_3d(name=('input_' + self.name),
+ shape=(self.batch_size, self.lines, self.embed_size))
+
+ # Four projections with (embed_size, embed_size) weights: query, key, value and the output layer.
+ self.Q = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size),
+ weight_size=(self.embed_size, self.embed_size))
+
+ self.K = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size),
+ weight_size=(self.embed_size, self.embed_size))
+
+ self.V = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size),
+ weight_size=(self.embed_size, self.embed_size))
+
+ self.linear = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size),
+ weight_size=(self.embed_size, self.embed_size))
+
+ reshaped = (self.batch_size, self.lines, self.num_heads, self.head_size)
+ shape_sum = (self.batch_size, self.num_heads, self.lines, 1)
+ shape_scores = (self.batch_size, self.num_heads, self.lines, self.lines)
+
+ # NOTE(review): unlike the tensors created further down, these three names do not include
+ # self.name -- confirm get_tensor_4d keeps them distinct when several instances share one operator.
+ self.q_reshaped = get_tensor_4d('q_4d_', reshaped)
+ self.k_reshaped = get_tensor_4d('k_4d_', reshaped)
+ self.v_reshaped = get_tensor_4d('v_4d_', reshaped)
+
+ # Six fresh dimensions; only b, q, k, e and h1 are used below.
+ b, q, k, h, e, h1 = default_dim_allocator(6)
+
+ # NOTE(review): the divisor is sqrt(embed_size), not sqrt(head_size) -- confirm this is intended.
+ self.sqrt_embeded = Constant(name + 'sqrt_embed', value=math.sqrt(self.embed_size))
+
+ self.scores = get_tensor_4d(name=('bhqk' + self.name), shape=shape_scores, dims=[b, q, k, e])
+ self.scores_result = get_tensor_4d(name=('scores_result' + self.name), shape=shape_scores)
+ self.attention = get_tensor_4d(name=('attention' + self.name), shape=reshaped)
+ self.expon = get_tensor_4d(name=('expon' + self.name), shape=shape_scores, dims=[b, q, k, e])
+ # Shares its first three dimensions with scores/expon; the last axis has length 1.
+ self.sum_all = get_tensor_4d(name=('sum_all' + self.name), shape=shape_sum, dims=[b, q, k, h1])
+
+ self._R = get_tensor_3d(name=('result_' + self.name), shape=(self.batch_size, self.lines, self.embed_size))
+
+ if generate_code:
+ eqs, args = self.equations()
+ self._arg_dict = dict(args)
+ self._op = Operator(eqs)
+ # Bare attribute access; presumably forces compilation up front -- confirm against the Devito docs.
+ self._op.cfunction
+
+ def equations(self) -> (list, list):
+ # Returns (equations, []) for the forward pass; the argument list is always empty.
+ # The dimensions of Q are passed to the equations() call of Q, K and V alike.
+ x1, y1, z1, w1 = self.Q._dimensions
+ d1, d2, d3 = self.Q.input.dimensions
+
+ q_a, q_b, q_c, q_d = self.q_reshaped.dimensions
+ k_a, k_b, k_c, k_d = self.k_reshaped.dimensions
+ v_a, v_b, v_c, v_d = self.v_reshaped.dimensions
+
+ # b, q, e come from q_reshaped and k from k_reshaped; h is not used afterwards.
+ b, q, h, e = self.q_reshaped.dimensions
+ _, k, _, _ = self.k_reshaped.dimensions
+
+ b2, h2, q2, k2 = self.scores.dimensions
+ b3, h3, q3, k3 = self.scores_result.dimensions
+ _, _, _, h1 = self.sum_all.dimensions
+
+ eqs = [
+ # Copy the module input into the Q, K and V projections, then run them.
+ Eq(self.Q.input[d1, d2, d3], self.input[d1, d2, d3]),
+ Eq(self.K.input[d1, d2, d3], self.input[d1, d2, d3]),
+ Eq(self.V.input[d1, d2, d3], self.input[d1, d2, d3]),
+ *self.Q.equations(dims=(x1, y1, z1, w1))[0],
+ *self.K.equations(dims=(x1, y1, z1, w1))[0],
+ *self.V.equations(dims=(x1, y1, z1, w1))[0],
+ # Forward Equations for Query Key and Value
+ # (split the last axis into (head, head_size) with stride head_size)
+ Eq(self.q_reshaped[q_a, q_b, q_c, q_d], self.Q.result[q_a, q_b, (q_c * self.head_size) + q_d]),
+ Eq(self.k_reshaped[k_a, k_b, k_c, k_d], self.K.result[k_a, k_b, (k_c * self.head_size) + k_d]),
+ Eq(self.v_reshaped[v_a, v_b, v_c, v_d], self.V.result[v_a, v_b, (v_c * self.head_size) + v_d]),
+ # Einsum over Query and Key
+ # (one Inc per head index i; e appears only on the right-hand side, presumably reduced over)
+ Eq(self.scores[b2, h2, q2, k2], 0),
+ *[Inc(self.scores[b, i, q, k], self.q_reshaped[b, q, i, e] * self.k_reshaped[b, k, i, e]) for i in range(
+ self.num_heads
+ )],
+ # Scores divided by sqrt(embed_size)
+ Eq(self.scores[b2, h2, q2, k2], self.scores[b2, h2, q2, k2] / self.sqrt_embeded),
+ # Softmax(scores)
+ Eq(self.expon[b2, h2, q2, k2], exp(self.scores[b2, h2, q2, k2])),
+ Eq(self.sum_all[b2, h2, q2, h1], 0),
+ Inc(self.sum_all[b2, h2, q2, h1], self.expon[b2, h2, q2, k2]),
+ Eq(self.scores_result[b3, h3, q3, k3], self.expon[b3, h3, q3, k3] / self.sum_all[b3, h3, q3, h1]),
+ ]
+
+ # From here on k is rebound to the second dimension of `attention`,
+ # and a, b, c, d are the integer sizes of `attention`, not dimensions.
+ i, k, j, l = self.attention.dimensions
+ _, _, _, m = self.scores_result.dimensions
+ a, b, c, d = self.attention.shape
+
+ x, y, z = self._R.dimensions
+
+ eqs += [
+ Eq(self.attention[i, k, j, l], 0),
+ # One Inc per head; the comprehension's z is a local integer and does not rebind the z above.
+ *[Inc(self.attention[i, k, z, l], self.scores_result[i, z, k, m] * self.v_reshaped[i, m, z, l]) for z in
+ range(self.num_heads)],
+ # Merge (head, head_size) back into one axis (d is head_size) and apply the output projection.
+ Eq(self.linear.input[i, k, (j * d) + l], self.attention[i, k, j, l]),
+ *self.linear.equations()[0],
+ Eq(self.result[x, y, z], self.linear.result[x, y, z])
+ ]
+
+ return eqs, []
+
+ def _allocate(self, **kwargs) -> (
+ Function, Function, Function,
+ Function, Function, Function,
+ Function):
+ # Unused: all tensors are created in __init__.
+ pass
+
+ def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+ # No backward equations are generated for this module.
+ return [], []
diff --git a/joey/module/VisionEncoder.py b/joey/module/VisionEncoder.py
new file mode 100644
index 0000000..2b4ac57
--- /dev/null
+++ b/joey/module/VisionEncoder.py
@@ -0,0 +1,110 @@
+import numpy as np
+import torch
+from devito import Function, Operator, Eq, Inc
+from torch import nn
+
+from joey import Module
+from joey.activation import ReLU
+from joey.module.MultiHeadAttention import MultiHeadAttention, MultiHeadAttentionTorch
+from joey.new_layers import Norm3d, FullyConnected3d
+from joey.utils import get_tensor_3d
+
+
+class VisionEncoder(Module):
+    r"""Vision Encoder Model
+
+    An Encoder Layer with the added functionality to encode important local structures of a tokenized image
+
+    Args:
+        embed_size (int): Embedding Size of Input
+        num_heads (int): Number of heads in multi-headed attention
+        hidden_size (int): Stored on the instance only; the MLP width is fixed at ``4 * embed_size``
+        lines (int): Size of the second axis of the input tensor
+        batch_size (int): Size of the first axis of the input tensor
+        dropout (float, optional): Stored on the instance only; no dropout layer is created here
+        name (str, optional): Included in the names of the tensors and sub-modules created here
+        generate_code (bool, optional): Build the Devito operator in the constructor
+
+    """
+    # NOTE(review): this file's import line also pulls in ``MultiHeadAttentionTorch``,
+    # which this class never uses -- confirm that name exists in
+    # joey.module.MultiHeadAttention, otherwise importing this module fails.
+
+    def __init__(self,
+                 embed_size: int,
+                 num_heads: int,
+                 hidden_size: int,
+                 lines: int,
+                 batch_size: int,
+                 dropout: float = 0.1,
+                 name='vision_encoder',
+                 generate_code=False
+                 ):
+        self.embed_size = embed_size
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.name = name
+        self.batch_size = batch_size
+        self.lines = lines
+
+        self.norm1 = Norm3d(input_size=(self.batch_size, self.lines, self.embed_size),
+                            weight_size=(self.embed_size,),
+                            name='norm_1_' + self.name)
+        self.norm2 = Norm3d(input_size=(self.batch_size, self.lines, self.embed_size),
+                            weight_size=(self.embed_size,),
+                            name='norm_2_' + self.name)
+
+        # The encoder's input is the input tensor of the first normalisation.
+        self._I = self.norm1.input
+
+        self.attention = MultiHeadAttention(
+            embed_size=self.embed_size,
+            batch_dim=0,
+            num_heads=self.num_heads,
+            lines=self.lines,
+            batch_size=self.batch_size,
+            generate_code=False,
+            name=name + '_att_'
+        )
+
+        first_in = (self.batch_size, self.lines, self.embed_size)
+        second_in = (self.batch_size, self.lines, self.embed_size * 4)
+
+        # Two-layer MLP: embed_size -> 4 * embed_size (ReLU) -> embed_size.
+        mlp1 = FullyConnected3d(input_size=first_in,
+                                weight_size=(4 * self.embed_size, self.embed_size),
+                                activation=ReLU())
+        mlp2 = FullyConnected3d(input_size=second_in,
+                                weight_size=(self.embed_size, 4 * self.embed_size,))
+        self.mlp = [mlp1, mlp2]
+
+        self._R = get_tensor_3d('result_encoder_' + self.name, (self.batch_size, self.lines, self.embed_size))
+
+        if generate_code:
+            eqs, args = self.equations()
+            self._arg_dict = dict(args)
+            self._op = Operator(eqs)
+            # Bare attribute access on purpose; presumably it forces
+            # compilation up front -- confirm against the Devito documentation.
+            self._op.cfunction
+
+    def equations(self) -> (list, list):
+        """Return the forward equations of the encoder and an empty argument list.
+
+        ``result`` first receives the output of ``norm1``, then the attention
+        output is added to it; that sum goes through ``norm2`` and the MLP,
+        and the MLP output is added to ``result`` as well.
+        """
+        a, b, c = self.result.dimensions
+        x, y, z = self.attention.result.dimensions
+        u, v, t = self.mlp[0].input.dimensions
+        g, j, k = self.mlp[1].input.dimensions
+        p, q, r = self.result.dimensions
+
+        return [
+            Eq(self.result[a, b, c], 0),
+            *self.norm1.equations()[0],
+            Eq(self.result[a, b, c], self.norm1.result[a, b, c]),
+            Eq(self.attention.input[a, b, c], self.result[a, b, c]),
+            *self.attention.equations()[0],
+            Inc(self.result[x, y, z], self.attention.result[x, y, z]),
+            Eq(self.norm2.input[x, y, z], self.result[x, y, z]),
+            *self.norm2.equations()[0],
+            Eq(self.mlp[0].input[u, v, t], self.norm2.result[u, v, t]),
+            *self.mlp[0].equations()[0],
+            Eq(self.mlp[1].input[g, j, k], self.mlp[0].result[g, j, k]),
+            *self.mlp[1].equations()[0],
+            Inc(self.result[p, q, r], self.mlp[1].result[p, q, r])
+        ], []
+
+    def _allocate(self, **kwargs) -> (Function, Function, Function,
+                                      Function, Function, Function,
+                                      Function):
+        """Unused: all tensors are created in ``__init__``."""
+        pass
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations and no arguments.
+
+        An explicit pair of empty lists honours the ``(list, list)`` contract,
+        so callers that unpack the result do not fail on ``None``.
+        """
+        return [], []
+
diff --git a/joey/module/__init__.py b/joey/module/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/joey/net.py b/joey/net.py
index 609af66..f37a618 100644
--- a/joey/net.py
+++ b/joey/net.py
@@ -2,6 +2,8 @@
import numpy as np
from devito import Eq, Operator
+from joey import Layer
+
class Net:
"""
@@ -55,12 +57,7 @@ def __init__(self, layers: list):
def _init_parameters(self):
for layer in self._layers:
- if layer.kernel is not None:
- layer.kernel.data[:] = \
- np.random.rand(*layer.kernel.shape) - 0.5
-
- if layer.bias is not None:
- layer.bias.data[:] = np.random.rand(*layer.bias.shape) - 0.5
+ layer.init_params()
def _gen_eqs(self):
eqs = []
@@ -82,14 +79,16 @@ def _gen_eqs(self):
eqs += layer_eqs
input_function = layer.result
- return (eqs, args)
+ print(eqs)
+ return eqs, args
def _gen_backprop_eqs(self):
eqs = []
args = []
for i in range(len(self._layers)):
- layer = self._layers[i]
+
+ layer: Layer = self._layers[i]
if layer.kernel_gradients is not None:
eqs.append(Eq(layer.kernel_gradients, 0))
@@ -104,6 +103,8 @@ def _gen_backprop_eqs(self):
for i in range(len(self._layers) - 1, -1, -1):
if i < len(self._layers) - 1:
prev_layer = self._layers[i + 1]
+ if not prev_layer.propagate:
+ prev_layer = None
else:
prev_layer = None
@@ -129,7 +130,7 @@ def _gen_backprop_eqs(self):
eqs.append(Eq(layer.bias_gradients,
layer.bias_gradients / batch_size))
- return (eqs, args)
+ return eqs, args
@property
def pytorch_parameters(self):
diff --git a/joey/new_layers.py b/joey/new_layers.py
new file mode 100644
index 0000000..b8caa6e
--- /dev/null
+++ b/joey/new_layers.py
@@ -0,0 +1,404 @@
+from functools import reduce
+
+import numpy as np
+from devito import Grid, Eq, Inc, Max, Function, exp, sum, Constant, sqrt
+from numpy.core.multiarray import array
+from scipy.special import softmax
+
+from joey import Layer, default_name_allocator, default_dim_allocator
+from joey.funtional import Dropout3d, Softmax3d
+from joey.utils import get_tensor_3d, get_tensor_2d
+
+
+class FullyConnected2d(Layer):
+    """
+    Fully connected (dense) layer applied to a two-dimensional input.
+
+    Parameters
+    ----------
+    weight_size : (int, int)
+        Shape of the weight matrix. Its first entry is also the length of
+        the bias vector and the size of the result's last axis.
+    input_size : (int, int)
+        Shape of the input data. Its first entry is also the size of the
+        result's first axis.
+    name_allocator_func : zero-argument function, optional
+        See Layer.__doc__.
+    dim_allocator_func : one-argument function, optional
+        See Layer.__doc__.
+    activation : Activation, optional
+        See Layer.__doc__.
+    generate_code : bool, optional
+        See Layer.__doc__.
+    """
+
+    def __init__(self, weight_size, input_size, name_allocator_func=default_name_allocator,
+                 dim_allocator_func=default_dim_allocator, activation=None,
+                 generate_code=False):
+        super().__init__(
+            weight_size, input_size, activation,
+            name_allocator_func, dim_allocator_func,
+            generate_code,
+        )
+
+    def _allocate(self, weight_size, input_size, name_allocator_func, dim_allocator_func, **kwargs):
+        """Create the weight, input, result, bias and gradient Functions."""
+        row_dim, in_dim, out_dim = dim_allocator_func(3)
+        self._dimensions = (row_dim, in_dim, out_dim)
+
+        def new_function(grid):
+            # Names are drawn from the allocator in call order, so the
+            # sequence of calls below must not be rearranged.
+            return Function(name=name_allocator_func(), grid=grid,
+                            space_order=0, dtype=np.float64)
+
+        weight_grid = Grid(shape=weight_size, dimensions=(out_dim, in_dim))
+        weights = new_function(weight_grid)
+
+        input_grid = Grid(shape=input_size, dimensions=(row_dim, in_dim))
+        inputs = new_function(input_grid)
+
+        result_grid = Grid(shape=(input_size[0], weight_size[0]),
+                           dimensions=(row_dim, out_dim))
+        outputs = new_function(result_grid)
+
+        # Scratch tensor, only allocated when an activation is attached.
+        if self._activation is not None:
+            self._T = new_function(result_grid)
+
+        bias_grid = Grid(shape=weight_size[0], dimensions=(out_dim,))
+        bias = new_function(bias_grid)
+
+        weight_grad = new_function(weight_grid)
+        output_grad = new_function(result_grid)
+        bias_grad = new_function(bias_grid)
+
+        return weights, inputs, outputs, bias, weight_grad, output_grad, bias_grad
+
+    def execute(self, input_data, bias, weight_data=None):
+        """Load the given data into the layer's tensors and run it."""
+        # Weights are only replaced when the caller supplies them.
+        if weight_data is not None:
+            self._K.data[:] = weight_data
+
+        self._I.data[:] = input_data
+        self._bias.data[:] = bias
+
+        # Reset the scratch and result buffers before running.
+        if self._activation is not None:
+            self._T.data[:] = 0
+        self._R.data[:] = 0
+
+        return super().execute()
+
+    def equations(self, dims=None, zero=False):
+        """
+        Return the forward equations and an empty argument list.
+
+        `dims` optionally overrides the three index dimensions; `zero` is
+        accepted but not used.
+        """
+        a, b, c = dims if dims else self._dimensions
+        out = self.result[a, c]
+
+        eqs = [
+            Eq(self.result, 0),
+            Inc(out, self.kernel[c, b] * self.input[a, b]),
+        ]
+
+        if self._activation is None:
+            eqs.append(Inc(out, self.bias[c]))
+        else:
+            eqs.append(Eq(out, self._activation(self.bias[c] + out)))
+
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer):
+        """Return the backward equations and an empty argument list."""
+        if prev_layer is None:
+            eqs = [
+                Inc(self.bias_gradients, self.result_gradients),
+                Inc(self.kernel_gradients, self.input * self.result_gradients),
+            ]
+            return (eqs, [])
+
+        eqs = [Inc(self.result_gradients,
+                   prev_layer.kernel * prev_layer.result_gradients)]
+        eqs = eqs + self.activation.backprop_eqs(self)
+        eqs = eqs + [
+            Inc(self.bias_gradients, self.result_gradients),
+            Eq(self.kernel_gradients,
+               self.kernel_gradients + self.input * self.result_gradients),
+        ]
+        return (eqs, [])
+class FullyConnected3d(Layer):
+    """
+    Fully connected (dense) layer applied to a three-dimensional input.
+
+    The weight matrix is applied along the last input axis; the first two
+    input axes are carried over to the result unchanged.
+
+    Parameters
+    ----------
+    weight_size : (int, int)
+        Shape of the weight matrix. Its first entry is also the length of
+        the bias vector and the size of the result's last axis.
+    input_size : (int, int, int)
+        Shape of the input data. Its first two entries are also the sizes
+        of the result's first two axes.
+    name_allocator_func : zero-argument function, optional
+        See Layer.__doc__.
+    dim_allocator_func : one-argument function, optional
+        See Layer.__doc__.
+    activation : Activation, optional
+        See Layer.__doc__.
+    generate_code : bool, optional
+        See Layer.__doc__.
+    """
+
+    def __init__(self, weight_size, input_size, name_allocator_func=default_name_allocator,
+                 dim_allocator_func=default_dim_allocator, activation=None,
+                 generate_code=False):
+        super().__init__(
+            weight_size, input_size, activation,
+            name_allocator_func, dim_allocator_func,
+            generate_code,
+        )
+
+    def _allocate(self, weight_size, input_size, name_allocator_func, dim_allocator_func, **kwargs):
+        """Create the weight, input, result, bias and gradient Functions."""
+        d1, d2, in_dim, out_dim = dim_allocator_func(4)
+        self._dimensions = (d1, d2, in_dim, out_dim)
+
+        def new_function(grid):
+            # Names are drawn from the allocator in call order, so the
+            # sequence of calls below must not be rearranged.
+            return Function(name=name_allocator_func(), grid=grid,
+                            space_order=0, dtype=np.float64)
+
+        weight_grid = Grid(shape=weight_size, dimensions=(out_dim, in_dim))
+        weights = new_function(weight_grid)
+
+        input_grid = Grid(shape=input_size, dimensions=(d1, d2, in_dim))
+        inputs = new_function(input_grid)
+
+        result_grid = Grid(shape=(input_size[0], input_size[1], weight_size[0]),
+                           dimensions=(d1, d2, out_dim))
+        outputs = new_function(result_grid)
+
+        # Scratch tensor, only allocated when an activation is attached.
+        if self._activation is not None:
+            self._T = new_function(result_grid)
+
+        bias_grid = Grid(shape=weight_size[0], dimensions=(out_dim,))
+        bias = new_function(bias_grid)
+
+        weight_grad = new_function(weight_grid)
+        output_grad = new_function(result_grid)
+        bias_grad = new_function(bias_grid)
+
+        return weights, inputs, outputs, bias, weight_grad, output_grad, bias_grad
+
+    def execute(self, input_data, bias, weight_data=None):
+        """Load the given data into the layer's tensors and run it."""
+        # Weights are only replaced when the caller supplies them.
+        if weight_data is not None:
+            self._K.data[:] = weight_data
+
+        self._I.data[:] = input_data
+        self._bias.data[:] = bias
+
+        # Reset the scratch and result buffers before running.
+        if self._activation is not None:
+            self._T.data[:] = 0
+        self._R.data[:] = 0
+
+        return super().execute()
+
+    def equations(self, dims=None, zero=False):
+        """
+        Return the forward equations and an empty argument list.
+
+        `dims` optionally overrides the four index dimensions; `zero` is
+        accepted but not used.
+        """
+        a, b, c, d = dims if dims else self._dimensions
+        out = self.result[a, b, d]
+
+        eqs = [
+            Eq(self.result, 0),
+            Inc(out, self.kernel[d, c] * self.input[a, b, c]),
+        ]
+
+        if self._activation is None:
+            eqs.append(Inc(out, self.bias[d]))
+        else:
+            eqs.append(Eq(out, self._activation(self.bias[d] + out)))
+
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer):
+        """Return the backward equations and an empty argument list."""
+        if prev_layer is None:
+            eqs = [
+                Inc(self.bias_gradients, self.result_gradients),
+                Inc(self.kernel_gradients, self.input * self.result_gradients),
+            ]
+            return (eqs, [])
+
+        eqs = [Inc(self.result_gradients,
+                   prev_layer.kernel * prev_layer.result_gradients)]
+        eqs = eqs + self.activation.backprop_eqs(self)
+        eqs = eqs + [
+            Inc(self.bias_gradients, self.result_gradients),
+            Eq(self.kernel_gradients,
+               self.kernel_gradients + self.input * self.result_gradients),
+        ]
+        return (eqs, [])
+
+
+class Norm3d(Layer):
+    """
+    Normalisation layer over the last axis of a three-dimensional input.
+
+    For every position of the first two axes, the input is centred on its
+    mean over the last axis, divided by (standard deviation + eps), scaled
+    by the kernel and shifted by the bias.
+
+    Parameters
+    ----------
+    weight_size : tuple
+        Shape of the scale vector; ``weight_size[0]`` is also the bias length.
+    input_size : (int, int, int)
+        Shape of the input, which is also the shape of the result.
+    name_allocator_func, dim_allocator_func, activation, generate_code
+        See Layer.__doc__.
+    eps : float, optional
+        Keyword argument read in ``_allocate``; defaults to 1e-6.
+    """
+
+    def __init__(self, weight_size, input_size, name_allocator_func=default_name_allocator,
+                 dim_allocator_func=default_dim_allocator, activation=None,
+                 generate_code=False, **kwargs):
+        super().__init__(weight_size, input_size, activation,
+                         name_allocator_func, dim_allocator_func,
+                         generate_code, **kwargs)
+
+    def _allocate(self, weight_size, input_size, name_allocator_func, dim_allocator_func, **kwargs):
+        """Create the scale, input, result, bias and gradient Functions."""
+        batch, row, col, col2 = dim_allocator_func(4)
+
+        self.eps = kwargs.get('eps', 1e-6)
+        # col2 is only used by the reduced tensors built in equations().
+        self._dimensions = (batch, row, col, col2)
+        self.shape = input_size
+
+        self.N = weight_size[0]
+
+        gridW = Grid(shape=weight_size, dimensions=(col,))
+        W = Function(name=name_allocator_func(), grid=gridW, space_order=0, dtype=np.float64)
+
+        gridV_dimensions = (batch, row, col)
+        gridR_dimensions = (batch, row, col)
+        gridR_shape = input_size
+
+        gridV = Grid(shape=input_size, dimensions=gridV_dimensions)
+        V = Function(name=name_allocator_func(), grid=gridV, space_order=0, dtype=np.float64)
+
+        gridR = Grid(shape=gridR_shape, dimensions=gridR_dimensions)
+        R = Function(name=name_allocator_func(), grid=gridR, space_order=0, dtype=np.float64)
+
+        bias_grid = Grid(shape=weight_size[0], dimensions=(col,))
+        bias = Function(name=name_allocator_func(), grid=bias_grid, space_order=0, dtype=np.float64)
+
+        kernel_grad = Function(name=name_allocator_func(), grid=gridW, space_order=0, dtype=np.float64)
+
+        output_grad = Function(name=name_allocator_func(), grid=gridR, space_order=0, dtype=np.float64)
+
+        bias_grad = Function(name=name_allocator_func(), grid=bias_grid, space_order=0, dtype=np.float64)
+
+        return W, V, R, bias, kernel_grad, output_grad, bias_grad
+
+    def init_params(self):
+        """Set the scale to ones and the bias to zeros."""
+        self._K.data[:] = np.ones(self.N)
+        self._bias.data[:] = np.zeros(self.N)
+
+    def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
+        # Placeholder: standalone execution is not implemented; returns None.
+        pass
+
+    def equations(self, zero=False) -> (list, list):
+        """
+        Return the forward equations and an empty argument list.
+
+        `zero` is accepted but not used.
+        """
+        batch, row, col, col2 = self._dimensions
+        axis = Constant(name=self.name + 'dim_1', value=self._I.shape[-1])
+        eps = Constant(name=self.name + 'eps', value=self.eps)
+
+        # Reduced tensors keep the first two axes and have size 1 on col2,
+        # so accumulating the input into them sums over `col`.
+        reduced_shape = tuple(self.shape[0:2]) + (1,)
+        reduced_dims = (batch, row, col2)
+
+        result_sum = get_tensor_3d(default_name_allocator('result_sum_' + self.name),
+                                   shape=reduced_shape,
+                                   dims=reduced_dims)
+        result_mean = get_tensor_3d(default_name_allocator('result_mean_' + self.name),
+                                    shape=reduced_shape,
+                                    dims=reduced_dims)
+        result_std = get_tensor_3d(default_name_allocator('result_std_' + self.name),
+                                   shape=reduced_shape,
+                                   dims=reduced_dims)
+        eqs = [
+            Eq(self.result, 0),
+            Eq(result_sum, 0),
+            Inc(result_sum, self.input),
+            Eq(result_mean, result_sum / axis),
+            # Reset the accumulator so a repeated run of the same operator
+            # does not start from the previous run's standard deviation.
+            Eq(result_std, 0),
+            Inc(result_std, ((self.input - result_mean) ** 2)),
+            Eq(result_std, result_std / axis),
+            Eq(result_std, sqrt(result_std)),
+            # eps is added to the standard deviation, not to the variance.
+            Eq(self.result, self.kernel * (self.input - result_mean) / (result_std + eps)),
+            Inc(self.result, self.bias[col])
+        ]
+
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations (backpropagation is not implemented)."""
+        return [], []
+
+
+class Norm2d(Layer):
+    """
+    Normalisation layer over the last axis of a two-dimensional input.
+
+    For every row, the input is centred on its mean over the last axis,
+    divided by (standard deviation + eps), scaled by the kernel and shifted
+    by the bias.
+
+    Parameters
+    ----------
+    weight_size : tuple
+        Shape of the scale vector; ``weight_size[0]`` is also the bias length.
+    input_size : (int, int)
+        Shape of the input, which is also the shape of the result.
+    name_allocator_func, dim_allocator_func, activation, generate_code
+        See Layer.__doc__.
+    eps : float, optional
+        Keyword argument read in ``_allocate``; defaults to 1e-6.
+    """
+
+    def __init__(self, weight_size, input_size, name_allocator_func=default_name_allocator,
+                 dim_allocator_func=default_dim_allocator, activation=None,
+                 generate_code=False, **kwargs):
+        super().__init__(weight_size, input_size, activation,
+                         name_allocator_func, dim_allocator_func,
+                         generate_code, **kwargs)
+
+    def _allocate(self, weight_size, input_size, name_allocator_func, dim_allocator_func, **kwargs):
+        """Create the scale, input, result, bias and gradient Functions."""
+        row, col, col2 = dim_allocator_func(3)
+
+        self.eps = kwargs.get('eps', 1e-6)
+        # col2 is only used by the reduced tensors built in equations().
+        self._dimensions = (row, col, col2)
+        self.shape = input_size
+
+        self.N = weight_size[0]
+
+        gridW = Grid(shape=weight_size, dimensions=(col,))
+        W = Function(name=name_allocator_func(), grid=gridW, space_order=0, dtype=np.float64)
+
+        gridV_dimensions = (row, col)
+        gridR_dimensions = (row, col)
+        gridR_shape = input_size
+
+        gridV = Grid(shape=input_size, dimensions=gridV_dimensions)
+        V = Function(name=name_allocator_func(), grid=gridV, space_order=0, dtype=np.float64)
+
+        gridR = Grid(shape=gridR_shape, dimensions=gridR_dimensions)
+        R = Function(name=name_allocator_func(), grid=gridR, space_order=0, dtype=np.float64)
+
+        bias_grid = Grid(shape=weight_size[0], dimensions=(col,))
+        bias = Function(name=name_allocator_func(), grid=bias_grid, space_order=0, dtype=np.float64)
+
+        kernel_grad = Function(name=name_allocator_func(), grid=gridW, space_order=0, dtype=np.float64)
+
+        output_grad = Function(name=name_allocator_func(), grid=gridR, space_order=0, dtype=np.float64)
+
+        bias_grad = Function(name=name_allocator_func(), grid=bias_grid, space_order=0, dtype=np.float64)
+
+        return W, V, R, bias, kernel_grad, output_grad, bias_grad
+
+    def init_params(self):
+        """Set the scale to ones and the bias to zeros."""
+        self._K.data[:] = np.ones(self.N)
+        self._bias.data[:] = np.zeros(self.N)
+
+    def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
+        # Placeholder: standalone execution is not implemented; returns None.
+        pass
+
+    def equations(self, zero=False) -> (list, list):
+        """
+        Return the forward equations and an empty argument list.
+
+        `zero` is accepted but not used.
+        """
+        row, col, col2 = self._dimensions
+        axis = Constant(name=self.name + 'dim_1', value=self._I.shape[-1])
+        eps = Constant(name=self.name + 'eps', value=self.eps)
+
+        # Reduced tensors keep the first axis and have size 1 on col2, so
+        # accumulating the input into them sums over `col`.
+        reduced_shape = tuple(self.shape[0:1]) + (1,)
+        reduced_dims = (row, col2)
+
+        result_sum = get_tensor_2d(default_name_allocator('result_sum_' + self.name),
+                                   shape=reduced_shape,
+                                   dims=reduced_dims)
+        result_mean = get_tensor_2d(default_name_allocator('result_mean_' + self.name),
+                                    shape=reduced_shape,
+                                    dims=reduced_dims)
+        result_std = get_tensor_2d(default_name_allocator('result_std_' + self.name),
+                                   shape=reduced_shape,
+                                   dims=reduced_dims)
+        eqs = [
+            Eq(self.result, 0),
+            Eq(result_sum, 0),
+            Inc(result_sum, self.input),
+            Eq(result_mean, result_sum / axis),
+            # Reset the accumulator so a repeated run of the same operator
+            # does not start from the previous run's standard deviation.
+            Eq(result_std, 0),
+            Inc(result_std, ((self.input - result_mean) ** 2)),
+            Eq(result_std, result_std / axis),
+            Eq(result_std, sqrt(result_std)),
+            # eps is added to the standard deviation, not to the variance.
+            Eq(self.result, self.kernel * (self.input - result_mean) / (result_std + eps)),
+            Inc(self.result, self.bias[col])
+        ]
+
+        return eqs, []
+
+    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
+        """Return no backward equations (backpropagation is not implemented)."""
+        return [], []
diff --git a/joey/utils.py b/joey/utils.py
new file mode 100644
index 0000000..686016f
--- /dev/null
+++ b/joey/utils.py
@@ -0,0 +1,33 @@
+from devito import Function, SpaceDimension
+import numpy as np
+
+from joey import default_dim_allocator, default_name_allocator
+
+
+def get_tensor_4d(name, shape, dims=None):
+    """
+    Create a float32 devito Function with four dimensions.
+
+    `shape` must unpack into four sizes. `dims`, when given and non-empty,
+    must unpack into four dimensions; otherwise four are allocated.
+    """
+    a, b, c, d = shape
+    if dims:
+        _a, _b, _c, _d = dims
+    else:
+        _a, _b, _c, _d = default_dim_allocator(4)
+
+    return Function(
+        name=default_name_allocator(name),
+        shape=(a, b, c, d),
+        dimensions=(_a, _b, _c, _d),
+        dtype=np.float32,
+    )
+
+
+def get_tensor_3d(name, shape, dims=None):
+    """
+    Create a float32 devito Function with three dimensions.
+
+    `shape` must unpack into three sizes. `dims`, when given and non-empty,
+    must unpack into three dimensions; otherwise three are allocated.
+    """
+    a, b, c = shape
+    if dims:
+        _a, _b, _c = dims
+    else:
+        _a, _b, _c = default_dim_allocator(3)
+
+    return Function(
+        name=default_name_allocator(name),
+        shape=(a, b, c),
+        dimensions=(_a, _b, _c),
+        dtype=np.float32,
+    )
+
+
+def get_tensor_2d(name, shape, dims=None):
+    """
+    Create a float32 devito Function with two dimensions.
+
+    `shape` must unpack into two sizes. `dims`, when given and non-empty,
+    must unpack into two dimensions; otherwise two are allocated.
+    """
+    a, b = shape
+    if dims:
+        _a, _b = dims
+    else:
+        _a, _b = default_dim_allocator(2)
+
+    return Function(
+        name=default_name_allocator(name),
+        shape=(a, b),
+        dimensions=(_a, _b),
+        dtype=np.float32,
+    )
+
+
+def get_tensor_1d(name, shape, dim=None):
+    """
+    Create a float32 devito Function with one dimension.
+
+    Parameters
+    ----------
+    name : str
+        Base name passed to default_name_allocator.
+    shape : int or 1-element tuple/list
+        Length of the tensor. A bare int is the original calling
+        convention; a 1-element sequence is also accepted so the call
+        mirrors get_tensor_2d/3d/4d.
+    dim : Dimension, optional
+        Dimension to use; one is allocated when omitted.
+
+    Raises
+    ------
+    ValueError
+        If `shape` is a sequence that does not hold exactly one entry.
+    """
+    if isinstance(shape, (tuple, list)):
+        (a,) = shape
+    else:
+        a = shape
+    _a = default_dim_allocator(1)[0] if not dim else dim
+
+    return Function(name=default_name_allocator(name), shape=(a,), dimensions=(_a,), dtype=np.float32)
diff --git a/joey/validate.py b/joey/validate.py
new file mode 100644
index 0000000..f14c0ec
--- /dev/null
+++ b/joey/validate.py
@@ -0,0 +1,58 @@
+import numpy as np
+from torch import optim
+from torchvision import datasets, transforms, models
+import torchvision.transforms as transforms
+from devito import logger
+import torch
+
+from new_layers import x as net
+
+# logger.set_log_level(level='ERROR')
+
+# Normalisation statistics passed to transforms.Normalize (one channel each).
+mean, std = (0.5,), (0.5,)
+BATCH_SIZE = 64
+
+# Convert images to tensors, then normalise with the statistics above.
+transform = transforms.Compose([transforms.ToTensor(),
+ transforms.Normalize(mean, std)
+ ])
+
+# MNIST is downloaded to ../data/MNIST/ if missing; both loaders keep dataset order.
+trainset = datasets.MNIST('../data/MNIST/', download=True, train=True, transform=transform)
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)
+
+testset = datasets.MNIST('../data/MNIST/', download=True, train=False, transform=transform)
+testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)
+
+# SGD over the parameters exposed by the imported `net`.
+optimizer = optim.SGD(net.pytorch_parameters, lr=0.001, momentum=0.9)
+# Only referenced from commented-out code in the training loop below.
+criterion = torch.nn.CrossEntropyLoss()
+
+def loss_grad(result, expected):
+    """
+    Build the loss gradient rows from a layer's output.
+
+    Parameters
+    ----------
+    result : object
+        Layer whose output is read from ``result.result.data``.
+    expected : sequence
+        Expected class index for each ``b``.
+
+    Returns
+    -------
+    list of list
+        ``gradients[b][i]`` is the output at ``[i, b]``, minus 1 when ``i``
+        is the expected class of ``b``.
+    """
+    outputs = result.result.data
+    gradients = []
+    # NOTE(review): `b` ranges over the first axis of `outputs` but is used as
+    # the second index below; confirm the layout of the layer output.
+    for b in range(len(outputs)):
+        row = []
+        for i in range(10):
+            # Read into a separate name: the original rebound `result` to the
+            # list [i, b], which made `result -= 1` raise TypeError.
+            value = outputs[i, b]
+            if i == expected[b]:
+                value -= 1
+            row.append(value)
+        gradients.append(row)
+
+    return gradients
+
+
+for img, label in trainloader:
+    # The MNIST training set (60000 images) is not a multiple of BATCH_SIZE,
+    # so the final batch is short and the fixed-size reshape below would fail.
+    if img.size(0) != BATCH_SIZE:
+        continue
+
+    # NOTE(review): reshape reinterprets the buffer and does not move the
+    # batch axis; confirm the layout `net` expects (a permute may be intended).
+    img = img.reshape(28, 28, BATCH_SIZE)
+
+    # print("Input Image Dimensions: {}".format(img.size()))
+    # print("Label Dimensions: {}".format(label.size()))
+    # print("-" * 100)
+
+    out = net.forward(img.detach().numpy())
+    # loss = criterion(torch.from_numpy(out), label.t())
+    # print(loss)
+    # NOTE(review): random values are passed as the expected output and
+    # `label` is unused; presumably a placeholder.
+    net.backward(np.random.rand(BATCH_SIZE, 10), loss_grad, optimizer)
+
+    # print("Output Dimensions: {}".format(out.shape))
+    # break
+
+
+# sys.exit(0)
diff --git a/requirements.txt b/requirements.txt
index 08a2554..5db82d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,9 @@
-devito
-torch
-torchvision
+devito~=4.8.0
+torch~=1.13.1
+torchvision~=0.14.1
+
+numpy~=1.24.2
+scipy~=1.10.1
+sympy~=1.11.1
+pytest~=7.2.1
+setuptools~=59.6.0
\ No newline at end of file
diff --git a/tests/test_ViT.py b/tests/test_ViT.py
new file mode 100644
index 0000000..bfb16cb
--- /dev/null
+++ b/tests/test_ViT.py
@@ -0,0 +1,63 @@
+import numpy as np
+import torch
+from torchvision import datasets
+import torchvision.transforms as transforms
+from devito import logger
+
+from joey.models.ViT import ViT
+from tests.utils import transfer_weights_ViT
+
+# Only show devito log messages at ERROR level.
+logger.set_log_level(level='ERROR')
+# Model hyper-parameters passed to ViT below.
+image_size = 28
+channel_size = 1
+patch_size = 7
+embed_size = 512
+num_heads = 8
+classes = 10
+num_layers = 3
+hidden_size = 256
+# Defined but not passed to ViT in this file.
+dropout = 0.2
+
+np.random.seed(0)
+
+BATCH_SIZE = 64
+
+# Normalisation statistics passed to transforms.Normalize (one channel each).
+mean, std = (0.5,), (0.5,)
+
+transform = transforms.Compose([transforms.ToTensor(),
+ transforms.Normalize(mean, std)
+ ])
+
+# MNIST is downloaded to ../data/MNIST/ if missing.
+trainset = datasets.MNIST('../data/MNIST/', download=True, train=True, transform=transform)
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)
+
+# The test loader shuffles, so the evaluated batches differ between runs
+# unless the torch RNG is seeded.
+testset = datasets.MNIST('../data/MNIST/', download=True, train=False, transform=transform)
+testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)
+
+# Built once at import time and shared by the tests in this module.
+model = ViT(image_size, channel_size, patch_size, embed_size, num_heads, classes, num_layers, hidden_size,
+ generate_code=True)
+
+
+def test_eval_model():
+    """
+    Check that the model with pretrained weights reaches 95% accuracy.
+
+    Evaluation stops after the batch with index 10; batches that are not
+    exactly BATCH_SIZE long are skipped.
+    """
+    transfer_weights_ViT(model)
+
+    y_true_test = []
+    y_pred_test = []
+
+    for batch_idx, (img, labels) in enumerate(testloader):
+        # The model is built for a fixed batch size.
+        if img.size(0) != BATCH_SIZE:
+            continue
+        preds = model.forward(img.detach().numpy())
+        y_pred_test.extend(preds.argmax(axis=-1).tolist())
+        y_true_test.extend(labels.detach().tolist())
+        if batch_idx == 10:
+            break
+
+    total = len(y_pred_test)
+    # Guard the division below and make an empty evaluation an explicit failure.
+    assert total > 0, "no full batch was evaluated"
+
+    total_correct = sum(pred == true for pred, true in zip(y_pred_test, y_true_test))
+    accuracy = total_correct * 100 / total
+
+    print("Test Accuracy%: ", accuracy, "==", total_correct, "/", total)
+
+    # pytest ignores a test's return value, so the threshold must be asserted.
+    assert accuracy >= 95, f"accuracy {accuracy:.2f}% is below 95%"
+
diff --git a/tests/utils.py b/tests/utils.py
index 002ec72..3d72da4 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,6 +1,8 @@
import numpy as np
from os import environ
+import torch
+
def compare(devito, pytorch, tolerance):
pytorch = pytorch.detach().numpy()
@@ -26,3 +28,47 @@ def get_run_count():
return 1000
else:
return 1
+
+
+def transfer_weights_ViT(model):
+    """
+    Copy pretrained weights from a saved PyTorch state into `model`.
+
+    The state is loaded from '../examples/resources/model_weights_ViT' and
+    each entry is written into the matching tensor's ``data`` in place.
+    """
+    weights_pretained = torch.load('../examples/resources/model_weights_ViT')
+
+    def as_numpy(key):
+        # Every stored tensor is detached and converted the same way.
+        return weights_pretained[key].detach().numpy()
+
+    def equal_layer(num):
+        # Copy the weights of encoder block `num`.
+        encoder = model.encoders[num]
+        attention = encoder.attention
+
+        encoder.norm1.kernel.data[:] = as_numpy(f'encoders.{num}.norm1.weight')
+        encoder.norm1.bias.data[:] = as_numpy(f'encoders.{num}.norm1.bias')
+        encoder.norm2.kernel.data[:] = as_numpy(f'encoders.{num}.norm2.weight')
+        encoder.norm2.bias.data[:] = as_numpy(f'encoders.{num}.norm2.bias')
+
+        attention.Q.kernel.data[:] = as_numpy(f'encoders.{num}.attention.Q.weight')
+        attention.Q.bias.data[:] = as_numpy(f'encoders.{num}.attention.Q.bias')
+        attention.K.kernel.data[:] = as_numpy(f'encoders.{num}.attention.K.weight')
+        attention.K.bias.data[:] = as_numpy(f'encoders.{num}.attention.K.bias')
+        attention.V.kernel.data[:] = as_numpy(f'encoders.{num}.attention.V.weight')
+        attention.V.bias.data[:] = as_numpy(f'encoders.{num}.attention.V.bias')
+        attention.linear.kernel.data[:] = as_numpy(f'encoders.{num}.attention.linear.weight')
+        attention.linear.bias.data[:] = as_numpy(f'encoders.{num}.attention.linear.bias')
+
+        # The second Joey MLP layer is read from index 2 of the saved mlp.
+        encoder.mlp[0].kernel.data[:] = as_numpy(f'encoders.{num}.mlp.0.weight')
+        encoder.mlp[0].bias.data[:] = as_numpy(f'encoders.{num}.mlp.0.bias')
+        encoder.mlp[1].kernel.data[:] = as_numpy(f'encoders.{num}.mlp.2.weight')
+        encoder.mlp[1].bias.data[:] = as_numpy(f'encoders.{num}.mlp.2.bias')
+
+    with torch.no_grad():
+        model.embeddings.kernel.data[:] = as_numpy('embeddings.weight')
+        model.embeddings.bias.data[:] = as_numpy('embeddings.bias')
+        model.class_token.data[:] = as_numpy('class_token')
+        model.positional_encoding.data[:] = as_numpy('positional_encoding')
+
+        for i in range(len(model.encoders)):
+            equal_layer(i)
+
+        model.norm.kernel.data[:] = as_numpy('norm.weight')
+        model.norm.bias.data[:] = as_numpy('norm.bias')
+        model.classifier.kernel.data[:] = as_numpy('classifier.0.weight')
+        model.classifier.bias.data[:] = as_numpy('classifier.0.bias')
+
+