diff --git a/.gitignore b/.gitignore index 62b2ca8..52d9f37 100644 --- a/.gitignore +++ b/.gitignore @@ -189,3 +189,4 @@ dmypy.json # End of https://www.toptal.com/developers/gitignore/api/python,emacs +data/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..07b79ea --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,37 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/joey2.0.iml b/.idea/joey2.0.iml new file mode 100644 index 0000000..9289075 --- /dev/null +++ b/.idea/joey2.0.iml @@ -0,0 +1,15 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..1f79844 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..55c4497 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index eac241b..8a72c9e 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Joey is a machine learning 
framework running on top of [Devito](https://github.c * A backward pass through a neural network with batch processing * Producing backpropagation equations automatically based on the list of layers in a neural network (only a loss function must be defined manually by the user) * Training a neural network with PyTorch optimizers +* Instantiating a transformer neural network for image classification Unlike other machine learning frameworks, Joey generates and compiles an optimized low-level code on-the-spot (using Devito) for both standalone layers and proper neural networks. @@ -16,10 +17,23 @@ Unlike other machine learning frameworks, Joey generates and compiles an optimiz * 2D max pooling (other types of 2D pooling can be implemented by the user by extending the `Pooling` abstract class) * Full connection * Flattening (an internal layer turning 2D data with channels into a 1D vector or 2D matrix, depending on the batch size) +* 3D FullyConnected +* Einsum function +* Dropout in 1, 2, 3 and 4 dimensions +* Norm 2D +* Norm 3D +* Softmax 3D and 4D functions + +## Supported modules +* MultiHeadAttention +* VisionEncoder + +## Built-in Models +* ViT (Vision Transformer) ## Supported activation functions * ReLU -* Softmax (only via the `FullyConnectedSoftmax` class) +* Softmax (only via the `FullyConnectedSoftmax` class or with the `Softmax3d` / `Softmax4d` functions) * Dummy (`f(x) = x`) Other activation functions can be implemented by extending the `Activation` abstract class. 
diff --git a/examples/ViT_Running.py b/examples/ViT_Running.py new file mode 100644 index 0000000..9e5f5f8 --- /dev/null +++ b/examples/ViT_Running.py @@ -0,0 +1,7 @@ + + + + + + + diff --git a/examples/resources/model_weights_ViT b/examples/resources/model_weights_ViT new file mode 100644 index 0000000..53c2b4d Binary files /dev/null and b/examples/resources/model_weights_ViT differ diff --git a/joey/base.py b/joey/base.py index d4db2d5..37dd95d 100644 --- a/joey/base.py +++ b/joey/base.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod -from devito import Operator, Function, dimensions + +import numpy as np +from devito import Operator, Function, dimensions, SpaceDimension from joey import Activation from joey import activation as activ from numpy import array @@ -8,21 +10,60 @@ dim_index = 0 -def default_name_allocator(): +def default_name_allocator(name=''): global index - name = 'f' + str(index) + _name = 'f' + name + str(index) index += 1 - return name + return _name def default_dim_allocator(count): global dim_index - names = '' + names = [] for i in range(count): - names += 'd' + str(dim_index) + ' ' + names.append('d' + str(dim_index)) dim_index += 1 - names = names[:-1] - return dimensions(names) + return [SpaceDimension(n) for n in names] + + +class Module(ABC): + @property + def input(self): + """A Function object corresponding to an input data array.""" + return self._I + + @property + def result(self): + """A Function object corresponding to a result array.""" + return self._R + + @abstractmethod + def equations(self) -> (list, list): + pass + + def init_params(self): + if self.kernel is not None: + self.kernel.data[:] = \ + np.random.rand(*self.kernel.shape) - 0.5 + + if self.bias is not None: + self.bias.data[:] = np.random.rand(*self.bias.shape) - 0.5 + + @abstractmethod + def _allocate(self, **kwargs) -> (Function, Function, Function, + Function, Function, Function, + Function): + + pass + + def execute(self, kernel_data=None, input_data=None, 
bias=None) -> array: + + self._op.apply(**self._arg_dict) + return self._R.data + + @abstractmethod + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + pass class Layer(ABC): @@ -62,7 +103,7 @@ def __init__(self, kernel_size, input_size, activation=activ.Dummy(), name_allocator_func=default_name_allocator, dim_allocator_func=default_dim_allocator, - generate_code=False): + generate_code=False, **kwargs): if activation is None: activation = activ.Dummy() @@ -71,12 +112,14 @@ def __init__(self, kernel_size, "its subclass") self._activation = activation - - self._K, self._I, self._R, self._bias, self._KG, self._RG, \ - self._biasG = self._allocate(kernel_size, - input_size, - name_allocator_func, - dim_allocator_func) + self.propagate = True + self.back_propagate = True + self.name = kwargs.get('name', '') + self._K, self._I, self._R, self._bias, self._KG, self._RG, self._biasG = self._allocate(kernel_size, + input_size, + name_allocator_func, + dim_allocator_func, + **kwargs) if generate_code: eqs, args = self.equations() @@ -89,6 +132,11 @@ def kernel(self): """A Function object corresponding to a kernel/weight array.""" return self._K + @property + def weight(self): + """A Function object corresponding to a kernel/weight array.""" + return self._K.data + @property def input(self): """A Function object corresponding to an input data array.""" @@ -135,27 +183,35 @@ def pytorch_parameters(self): kernel_parameter = None bias_parameter = None - if self._K is not None: + if self._K is not None and self.propagate: kernel_tensor = from_numpy(self._K.data) kernel_parameter = Parameter(kernel_tensor, requires_grad=False) if self._KG is not None: kernel_parameter.grad = from_numpy(self._KG.data) - if self._bias is not None: + if self._bias is not None and self.propagate: bias_tensor = from_numpy(self._bias.data) bias_parameter = Parameter(bias_tensor, requires_grad=False) if self._biasG is not None: bias_parameter.grad = from_numpy(self._biasG.data) - 
return (kernel_parameter, bias_parameter) + return kernel_parameter, bias_parameter + + def init_params(self): + if self.kernel is not None: + self.kernel.data[:] = \ + np.random.rand(*self.kernel.shape) - 0.5 + + if self.bias is not None: + self.bias.data[:] = np.random.rand(*self.bias.shape) - 0.5 @abstractmethod def _allocate(self, kernel_size, input_size, name_allocator_func, - dim_allocator_func) -> (Function, Function, Function, - Function, Function, Function, - Function): + dim_allocator_func, **kwargs) -> (Function, Function, Function, + Function, Function, Function, + Function): """ This method should return a (Function, Function, Function, Function, Function, Function, Function) object corresponding to a kernel, diff --git a/joey/funtional.py b/joey/funtional.py new file mode 100644 index 0000000..59b24af --- /dev/null +++ b/joey/funtional.py @@ -0,0 +1,310 @@ +from abc import ABC, abstractmethod +from functools import reduce +import numpy as np +from devito import Eq, Function, exp, Inc + +from joey import default_name_allocator, Layer, default_dim_allocator +from joey.utils import get_tensor_2d, get_tensor_1d, get_tensor_3d, get_tensor_4d + + +def kernel_shape(x): + return reduce(lambda a, b: a * b, x) + + +class Functional(Layer): + + def _allocate(self, kernel_size, input_size, name_allocator_func, dim_allocator_func, **kwargs) -> ( + Function, Function, Function, + Function, Function, Function, + Function): + return self._K, self._I, self._R, self.bias, self._KG, self._RG, self._biasG + + def execute(self, kernel_data=None, input_data=None, bias=None) -> np.array: + pass + + @abstractmethod + def equations(self) -> (list, list): + pass + + @abstractmethod + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + pass + + def __init__(self): + pass + + +class BaseDropout(Functional): + def equations(self) -> (list, list): + pass + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + pass + + def init_params(self): 
+ K = int(self.N * self.dropout) + arr = np.array([0] * K + [1] * (self.N - K)) + np.random.shuffle(arr) + + self._K.data[:] = arr.reshape(*self.shape) + + +class Dropout1d(BaseDropout): + def __init__(self, name, shape, kernel_size, input_size, dropout=0.1, **kwargs): + self.N = kernel_shape(shape) + self.dropout = dropout + self.name = name + self.shape = shape + self.propagate = False + + self._I = get_tensor_1d(default_name_allocator('input_' + self.name), shape=self.shape) + self._K = get_tensor_1d(default_name_allocator('kernel_' + self.name), shape=self.shape, + dims=self._I.dimensions) + self._R = get_tensor_1d(default_name_allocator('result_' + self.name), shape=self.shape, + dims=self._I.dimensions) + + self._bias, self._KG, self._RG, self._biasG = None, None, None, None + + def equations(self) -> (list, list): + a = self._R.dimensions + return [ + Eq(self._R[a], self._I[a] * self._K[a]) + ], [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + +class Dropout2d(BaseDropout): + def __init__(self, name, shape, kernel_size, input_size, dropout=0.1, **kwargs): + self.N = kernel_shape(shape) + self.dropout = dropout + self.name = name + self.shape = shape + self.propagate = False + + self._I = get_tensor_2d(default_name_allocator('input_' + self.name), shape=self.shape) + self._K = get_tensor_2d(default_name_allocator('kernel_' + self.name), shape=self.shape, + dims=self._I.dimensions) + self._R = get_tensor_2d(default_name_allocator('result_' + self.name), shape=self.shape, + dims=self._I.dimensions) + + self._bias, self._KG, self._RG, self._biasG = None, None, None, None + + def equations(self) -> (list, list): + a, b = self._R.dimensions + return [ + Eq(self._R[a, b], self._I[a, b] * self._K[a, b]) + ], [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + +class Dropout3d(BaseDropout): + def __init__(self, name, shape, dropout=0.1, **kwargs): + self.N = 
kernel_shape(shape) + self.dropout = dropout + self.name = name + self.shape = shape + self.propagate = False + + self._I = get_tensor_3d(default_name_allocator('input_' + self.name), shape=self.shape) + self._K = get_tensor_3d(default_name_allocator('kernel_' + self.name), shape=self.shape, + dims=self._I.dimensions) + self._R = get_tensor_3d(default_name_allocator('result_' + self.name), shape=self.shape, + dims=self._I.dimensions) + + self._bias, self._KG, self._RG, self._biasG = None, None, None, None + + def equations(self) -> (list, list): + a, b, c = self._R.dimensions + return [ + Eq(self._R[a, b, c], self._I[a, b, c] * self._K[a, b, c]) + ], [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + +class Dropout4d(BaseDropout): + def __init__(self, name, shape, dropout=0.1, **kwargs): + self.N = kernel_shape(shape) + self.dropout = dropout + self.name = name + self.shape = shape + self.propagate = False + + self._I = get_tensor_4d(default_name_allocator('input_' + self.name), shape=self.shape) + self._K = get_tensor_4d(default_name_allocator('kernel_' + self.name), shape=self.shape, + dims=self._I.dimensions) + self._R = get_tensor_4d(default_name_allocator('result_' + self.name), shape=self.shape, + dims=self._I.dimensions) + + self._bias, self._KG, self._RG, self._biasG = None, None, None, None + + def equations(self) -> (list, list): + a, b, c, d = self._R.dimensions + return [ + Eq(self._R[a, b, c, d], self._I[a, b, c, d] * self._K[a, b, c, d]) + ], [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + +class Softmax3d(Functional): + def __init__(self, name, shape, **kwargs): + self.name = name + self.shape = shape + self.propagate = False + + self._I = get_tensor_3d(default_name_allocator('input_' + self.name), shape=self.shape) + self._R = get_tensor_3d(default_name_allocator('result_' + self.name), shape=self.shape) + + self._bias, self._KG, self._RG, self._biasG, 
self._K = None, None, None, None, None + + def equations(self) -> (list, list): + a, b, c = self._I.dimensions + x, y, z = self._R.dimensions + + h = default_dim_allocator(1)[0] + expon = get_tensor_3d(default_name_allocator('exponential_' + self.name), shape=self.shape, dims=(a, b, c)) + sum_last_axis = get_tensor_3d(default_name_allocator('sum_all_' + self.name), shape=(self.shape[0:2] + (1,)), + dims=(a, b, h)) + + return [ + Eq(self.result, 0), + Eq(expon[a, b, c], exp(self._I[a, b, c])), + Eq(sum_last_axis[a, b, h], 0), + Inc(sum_last_axis[a, b, h], expon[a, b, c]), + Eq(self.result[x, y, z], expon[x, y, z] / sum_last_axis[x, y, h]), + ], [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + def init_params(self): + pass + + +class Softmax4d(Functional): + def __init__(self, name, shape): + self.name = name + self.shape = shape + self.propagate = False + + self._I = get_tensor_4d(default_name_allocator('input_' + self.name), shape=self.shape) + self._R = get_tensor_4d(default_name_allocator('result_' + self.name), shape=self.shape) + + self.dimensions = self._R.dimensions + + self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None + + def equations(self) -> (list, list): + a, b, c, d = self._I.dimensions + x, y, z, w = self._R.dimensions + + h = default_dim_allocator(1)[0] + expon = get_tensor_4d(default_name_allocator('exponential_' + self.name), shape=self.shape, dims=(a, b, c, d)) + sum_last_axis = get_tensor_4d(default_name_allocator('sum_all_' + self.name), shape=(self.shape[0:3] + (1,)), + dims=(a, b, c, h)) + eqs = [Eq(self.result, 0)] + eqs += [ + Eq(expon[a, b, c, d], exp(self.input[a, b, c, d])), + Eq(sum_last_axis[a, b, c, h], 0), + Inc(sum_last_axis[a, b, c, h], expon[a, b, c, d]), + Eq(self.result[x, y, z, w], expon[x, y, z, w] / sum_last_axis[x, y, z, h]), + + ] + + return eqs, [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + def 
init_params(self): + pass + + +class Expand3to4(Functional): + + def __init__(self, name, shape_in, shape_out): + assert shape_in[-1] == (shape_out[-1] * shape_out[-2]), 'The last Input dimension must match the ' \ + 'multiplication ' \ + 'of the 2 last Result dimensions.' + + self.name = name + self.propagate = False + + self._I = get_tensor_3d(default_name_allocator('input_' + self.name), shape=shape_in) + self._R = get_tensor_4d(default_name_allocator('result_' + self.name), shape=shape_out) + + self.dimensions = self._R.dimensions + + self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None + + def init_params(self): + pass + + def equations(self) -> (list, list): + a, b, c, d = self._R.dimensions + _, _, D = self._I.shape + return [ + Eq(self._R[a, b, c, d], self._I[a, b, (c * D) + d]) + ], [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + +class Contract4to3(Functional): + + def __init__(self, name, shape_in, shape_out): + assert shape_out[-1] == (shape_in[-1] * shape_in[-2]), 'The last Result dimension must match the ' \ + 'multiplication ' \ + 'of the 2 last Input dimensions.' 
+ + self.name = name + self.propagate = False + + self._I = get_tensor_4d(default_name_allocator('input_' + self.name), shape=shape_in) + self._R = get_tensor_3d(default_name_allocator('result_' + self.name), shape=shape_out) + + self.dimensions = self._R.dimensions + + self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None + + def init_params(self): + pass + + def equations(self) -> (list, list): + a, b, c, d = self._I.dimensions + _, _, _, D = self._I.shape + return [ + Eq(self._R[a, b, (c * D + d)], self._I[a, b, c, d]) + ], [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + +class Reduce2ndDimension3d(Functional): + + def __init__(self, name, shape): + + self._I = get_tensor_3d('reduce_input_' + name, shape=shape) + self._R = get_tensor_2d('reduce_result_' + name, shape=(shape[0], shape[-1])) + + self._bias, self._KG, self._RG, self._biasG, self._K = None, None, None, None, None + + def equations(self) -> (list, list): + a, b = self.result.dimensions + _, X, _ = self.input.shape + return [ + Eq(self._R[a, b], self._I[a, X-1, b]) + + ], [] + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] diff --git a/joey/models/ViT.py b/joey/models/ViT.py new file mode 100644 index 0000000..7fcc74a --- /dev/null +++ b/joey/models/ViT.py @@ -0,0 +1,130 @@ +from joey import Module, default_dim_allocator +from joey.module.VisionEncoder import VisionEncoder +from joey.utils import get_tensor_3d +from joey.new_layers import FullyConnected3d, Norm2d, FullyConnected2d +from devito import Operator, Inc, Eq, Function +from scipy.special import log_softmax +import numpy as np + + +class ViT(Module): + r"""Vision Transformer Model + + A transformer model to solve vision tasks by treating images as sequences of tokens. 
+ + Args: + image_size (int): Size of input image + channel_size (int): Size of the channel + patch_size (int): Max patch size, determines number of split images/patches and token size + embed_size (int): Embedding size of input + num_heads (int): Number of heads in Multi-Headed Attention + classes (int): Number of classes for classification of data + hidden_size (int): Number of hidden layers + + """ + + def __init__(self, image_size: int, channel_size: int, patch_size: int, embed_size: int, num_heads: int, + classes: int, num_layers: int, hidden_size: int, batch: int = 64, generate_code=False): + + self.p = patch_size + self.image_size = image_size + self.embed_size = embed_size + self.num_patches = (image_size // patch_size) ** 2 + self.patch_size = channel_size * (patch_size ** 2) + self.num_heads = num_heads + self.classes = classes + self.num_layers = num_layers + self.hidden_size = hidden_size + + img_shape = (batch, int((self.image_size / self.p) * (self.image_size / self.p)), self.patch_size) + + self._R = get_tensor_3d('result_1srt', (batch, self.num_patches + 1, self.embed_size)) + + d, e = default_dim_allocator(2) + x, y, z = self._R.dimensions + + self.embeddings = FullyConnected3d(input_size=img_shape, weight_size=(self.embed_size, self.patch_size)) + self.class_token = get_tensor_3d('class_token', (1, 1, self.embed_size), dims=(d, e, z)) + self.positional_encoding = get_tensor_3d('pos_enc', (1, self.num_patches + 1, self.embed_size), dims=(d, y, z)) + + self.class_token.data[:] = np.random.rand(*self.class_token.shape) + self.positional_encoding.data[:] = np.random.rand(*self.positional_encoding.shape) + + self.encoders = [] + for layer in range(self.num_layers): + vision_encoder = VisionEncoder( + embed_size=self.embed_size, + num_heads=self.num_heads, + batch_size=batch, + lines=self.num_patches + 1, + hidden_size=self.hidden_size, + name='encoder' + str(layer) + ) + self.encoders.append(vision_encoder) + + self.norm = Norm2d(input_size=(batch, 
self.embed_size), weight_size=(self.embed_size,)) + self.classifier = FullyConnected2d(input_size=(batch, self.embed_size), weight_size=(self.classes, + self.embed_size)) + + if generate_code: + eqs, args = self.equations() + self._arg_dict = dict(args) + self._op = Operator(eqs) + self._op.cfunction + + def equations(self): + + a, b, c = self.embeddings.result.dimensions + d, e, _ = self.class_token.dimensions + x, y, z = self.result.dimensions + + t0, u0, v0 = self.encoders[0].norm1.input.dimensions + + eqs = [ + Eq(self.result, 0), + *self.embeddings.equations()[0], + Eq(self.result[a, b, c], self.embeddings.result[a, b, c]), + Eq(self.result[x, self.num_patches, z], self.class_token[0, 0, z]), + Inc(self.result[x, y, z], self.positional_encoding[d, y, z]), + Eq(self.encoders[0].norm1.input[t0, u0, v0], self.result[t0, u0, v0]) + ] + + for index, encoder in enumerate(self.encoders): + if index > 0: + t, u, v = self.encoders[index].input.dimensions + eqs.append( + Eq(self.encoders[index].input[t, u, v], self.encoders[index-1].result[t, u, v]) + ) + eqs += encoder.equations()[0] + + last_enconder = self.encoders[-1].result + + a, b = self.norm.input.dimensions + i, j = self.classifier.input.dimensions + _, x, _ = last_enconder.shape + eqs += [ + Eq(self.norm.input[a, b], last_enconder[a, x - 1, b]), + *self.norm.equations()[0], + Eq(self.classifier.input[i, j], self.norm.result[i, j]), + *self.classifier.equations()[0] + ] + + return eqs, [] + + def _allocate(self, **kwargs) -> (Function, Function, Function, + Function, Function, Function, + Function): + pass + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] + + def forward(self, x): + + b, c, h, w = x.shape + x = x.reshape(b, int((h / self.p) * (w / self.p)), c * self.p * self.p) + self.embeddings.input.data[:] = x + + self._op.apply() + + return log_softmax(self.classifier.result.data, axis=-1) \ No newline at end of file diff --git a/joey/module/MultiHeadAttention.py 
b/joey/module/MultiHeadAttention.py new file mode 100644 index 0000000..4ab86e2 --- /dev/null +++ b/joey/module/MultiHeadAttention.py @@ -0,0 +1,152 @@ +import math + +from devito import Function, Operator, Eq, Inc, Constant, exp + +from joey import Module, default_dim_allocator +from joey.utils import get_tensor_4d, get_tensor_3d +from joey.new_layers import FullyConnected3d + +from torch import nn +import torch +from torch import functional as F + + +class MultiHeadAttention(Module): + r"""Multi-headed Attention for input Query, Key, Value + + Multi-headed Attention is a module for attention mechanisms which runs through attention in several times in + parallel, then the multiple outputs are concatenated and linearly transformed + + Args: + embed_size (int): Max embedding size + num_heads (int): Number of heads in multi-headed attention; Number of splits in the embedding size + batch_dim (int, optional): The dimension in which batch dimensions is + + """ + + def __init__(self, + embed_size: int, + num_heads: int, + lines: int, + batch_size: int, + batch_dim: int = 0, + generate_code=False, + name='att' + ): + self.name = name + self.embed_size = embed_size + self.num_heads = num_heads + self.batch_dim = batch_dim + self.lines = lines + self.batch_size = batch_size + + self.head_size = self.embed_size // self.num_heads + + assert self.head_size * self.num_heads == self.embed_size, "Heads cannot split Embedding size equally" + + self._I = get_tensor_3d(name=('input_' + self.name), + shape=(self.batch_size, self.lines, self.embed_size)) + + self.Q = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size), + weight_size=(self.embed_size, self.embed_size)) + + self.K = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size), + weight_size=(self.embed_size, self.embed_size)) + + self.V = FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size), + weight_size=(self.embed_size, self.embed_size)) + + self.linear = 
FullyConnected3d(input_size=(self.batch_size, self.lines, self.embed_size), + weight_size=(self.embed_size, self.embed_size)) + + reshaped = (self.batch_size, self.lines, self.num_heads, self.head_size) + shape_sum = (self.batch_size, self.num_heads, self.lines, 1) + shape_scores = (self.batch_size, self.num_heads, self.lines, self.lines) + + self.q_reshaped = get_tensor_4d('q_4d_', reshaped) + self.k_reshaped = get_tensor_4d('k_4d_', reshaped) + self.v_reshaped = get_tensor_4d('v_4d_', reshaped) + + b, q, k, h, e, h1 = default_dim_allocator(6) + + self.sqrt_embeded = Constant(name + 'sqrt_embed', value=math.sqrt(self.embed_size)) + + self.scores = get_tensor_4d(name=('bhqk' + self.name), shape=shape_scores, dims=[b, q, k, e]) + self.scores_result = get_tensor_4d(name=('scores_result' + self.name), shape=shape_scores) + self.attention = get_tensor_4d(name=('attention' + self.name), shape=reshaped) + self.expon = get_tensor_4d(name=('expon' + self.name), shape=shape_scores, dims=[b, q, k, e]) + self.sum_all = get_tensor_4d(name=('sum_all' + self.name), shape=shape_sum, dims=[b, q, k, h1]) + + self._R = get_tensor_3d(name=('result_' + self.name), shape=(self.batch_size, self.lines, self.embed_size)) + + if generate_code: + eqs, args = self.equations() + self._arg_dict = dict(args) + self._op = Operator(eqs) + self._op.cfunction + + def equations(self) -> (list, list): + x1, y1, z1, w1 = self.Q._dimensions + d1, d2, d3 = self.Q.input.dimensions + + q_a, q_b, q_c, q_d = self.q_reshaped.dimensions + k_a, k_b, k_c, k_d = self.k_reshaped.dimensions + v_a, v_b, v_c, v_d = self.v_reshaped.dimensions + + b, q, h, e = self.q_reshaped.dimensions + _, k, _, _ = self.k_reshaped.dimensions + + b2, h2, q2, k2 = self.scores.dimensions + b3, h3, q3, k3 = self.scores_result.dimensions + _, _, _, h1 = self.sum_all.dimensions + + eqs = [ + Eq(self.Q.input[d1, d2, d3], self.input[d1, d2, d3]), + Eq(self.K.input[d1, d2, d3], self.input[d1, d2, d3]), + Eq(self.V.input[d1, d2, d3], 
self.input[d1, d2, d3]), + *self.Q.equations(dims=(x1, y1, z1, w1))[0], + *self.K.equations(dims=(x1, y1, z1, w1))[0], + *self.V.equations(dims=(x1, y1, z1, w1))[0], + # Forward Equations for Query Key and Value + Eq(self.q_reshaped[q_a, q_b, q_c, q_d], self.Q.result[q_a, q_b, (q_c * self.head_size) + q_d]), + Eq(self.k_reshaped[k_a, k_b, k_c, k_d], self.K.result[k_a, k_b, (k_c * self.head_size) + k_d]), + Eq(self.v_reshaped[v_a, v_b, v_c, v_d], self.V.result[v_a, v_b, (v_c * self.head_size) + v_d]), + # Einsum over Query and Key + Eq(self.scores[b2, h2, q2, k2], 0), + *[Inc(self.scores[b, i, q, k], self.q_reshaped[b, q, i, e] * self.k_reshaped[b, k, i, e]) for i in range( + self.num_heads + )], + # Scores divided by sqrt(embed_size) + Eq(self.scores[b2, h2, q2, k2], self.scores[b2, h2, q2, k2] / self.sqrt_embeded), + # Sofmax(scores) + Eq(self.expon[b2, h2, q2, k2], exp(self.scores[b2, h2, q2, k2])), + Eq(self.sum_all[b2, h2, q2, h1], 0), + Inc(self.sum_all[b2, h2, q2, h1], self.expon[b2, h2, q2, k2]), + Eq(self.scores_result[b3, h3, q3, k3], self.expon[b3, h3, q3, k3] / self.sum_all[b3, h3, q3, h1]), + ] + + i, k, j, l = self.attention.dimensions + _, _, _, m = self.scores_result.dimensions + a, b, c, d = self.attention.shape + + x, y, z = self._R.dimensions + + eqs += [ + Eq(self.attention[i, k, j, l], 0), + *[Inc(self.attention[i, k, z, l], self.scores_result[i, z, k, m] * self.v_reshaped[i, m, z, l]) for z in + range(self.num_heads)], + Eq(self.linear.input[i, k, (j * d) + l], self.attention[i, k, j, l]), + *self.linear.equations()[0], + Eq(self.result[x, y, z], self.linear.result[x, y, z]) + ] + + return eqs, [] + + def _allocate(self, **kwargs) -> ( + Function, Function, Function, + Function, Function, Function, + Function): + pass + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + return [], [] diff --git a/joey/module/VisionEncoder.py b/joey/module/VisionEncoder.py new file mode 100644 index 0000000..2b4ac57 --- /dev/null +++ 
b/joey/module/VisionEncoder.py @@ -0,0 +1,110 @@ +import numpy as np +import torch +from devito import Function, Operator, Eq, Inc +from torch import nn + +from joey import Module +from joey.activation import ReLU +from joey.module.MultiHeadAttention import MultiHeadAttention, MultiHeadAttentionTorch +from joey.new_layers import Norm3d, FullyConnected3d +from joey.utils import get_tensor_3d + + +class VisionEncoder(Module): + r"""Vision Encoder Model + + An Encoder Layer with the added functionality to encode important local structures of a tokenized image + + Args: + embed_size (int): Embedding Size of Input + num_heads (int): Number of heads in multi-headed attention + hidden_size (int): Number of hidden layers + dropout (float, optional): A probability from 0 to 1 which determines the dropout rate + + """ + def __init__(self, + embed_size: int, + num_heads: int, + hidden_size: int, + lines: int, + batch_size: int, + dropout: float = 0.1, + name='vision_encoder', + generate_code=False + ): + self.embed_size = embed_size + self.num_heads = num_heads + self.hidden_size = hidden_size + self.dropout = dropout + self.name = name + self.batch_size = batch_size + self.lines = lines + + self.norm1 = Norm3d(input_size=(self.batch_size, self.lines, self.embed_size), + weight_size=(self.embed_size,), + name='norm_1_' + self.name) + self.norm2 = Norm3d(input_size=(self.batch_size, self.lines, self.embed_size), + weight_size=(self.embed_size,), + name='norm_2_' + self.name) + + self._I = self.norm1.input + + self.attention = MultiHeadAttention( + embed_size=self.embed_size, + batch_dim=0, + num_heads=self.num_heads, + lines=self.lines, + batch_size=self.batch_size, + generate_code=False, + name=name + '_att_' + ) + + first_in = (self.batch_size, self.lines, self.embed_size) + second_in = (self.batch_size, self.lines, self.embed_size * 4) + + mlp1 = FullyConnected3d(input_size=first_in, + weight_size=(4 * self.embed_size, self.embed_size), + activation=ReLU()) + mlp2 = 
FullyConnected3d(input_size=second_in, + weight_size=(self.embed_size, 4 * self.embed_size,)) + self.mlp = [mlp1, mlp2] + + self._R = get_tensor_3d('result_encoder_' + self.name, (self.batch_size, self.lines, self.embed_size)) + + if generate_code: + eqs, args = self.equations() + self._arg_dict = dict(args) + self._op = Operator(eqs) + self._op.cfunction + + def equations(self) -> (list, list): + a, b, c = self.result.dimensions + x, y, z = self.attention.result.dimensions + u, v, t = self.mlp[0].input.dimensions + g, j, k = self.mlp[1].input.dimensions + p, q, r = self.result.dimensions + + return [ + Eq(self.result[a, b, c], 0), + *self.norm1.equations()[0], + Eq(self.result[a, b, c], self.norm1.result[a, b, c]), + Eq(self.attention.input[a, b, c], self.result[a, b, c]), + *self.attention.equations()[0], + Inc(self.result[x, y, z], self.attention.result[x, y, z]), + Eq(self.norm2.input[x, y, z], self.result[x, y, z]), + *self.norm2.equations()[0], + Eq(self.mlp[0].input[u, v, t], self.norm2.result[u, v, t]), + *self.mlp[0].equations()[0], + Eq(self.mlp[1].input[g, j, k], self.mlp[0].result[g, j, k]), + *self.mlp[1].equations()[0], + Inc(self.result[p, q, r], self.mlp[1].result[p, q, r]) + ], [] + + def _allocate(self, **kwargs) -> (Function, Function, Function, + Function, Function, Function, + Function): + pass + + def backprop_equations(self, prev_layer, next_layer) -> (list, list): + pass + diff --git a/joey/module/__init__.py b/joey/module/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/joey/net.py b/joey/net.py index 609af66..f37a618 100644 --- a/joey/net.py +++ b/joey/net.py @@ -2,6 +2,8 @@ import numpy as np from devito import Eq, Operator +from joey import Layer + class Net: """ @@ -55,12 +57,7 @@ def __init__(self, layers: list): def _init_parameters(self): for layer in self._layers: - if layer.kernel is not None: - layer.kernel.data[:] = \ - np.random.rand(*layer.kernel.shape) - 0.5 - - if layer.bias is not None: - 
from functools import reduce

import numpy as np
from devito import Grid, Eq, Inc, Max, Function, exp, sum, Constant, sqrt
from numpy.core.multiarray import array
from scipy.special import softmax

from joey import Layer, default_name_allocator, default_dim_allocator
from joey.funtional import Dropout3d, Softmax3d
from joey.utils import get_tensor_3d, get_tensor_2d


class FullyConnected2d(Layer):
    """
    A Layer subclass corresponding to a full connection (FC) layer over
    2D input.

    The forward equations accumulate ``R[a, c] += W[c, b] * V[a, b]`` and
    then add the bias (passing the sum through the activation when one is
    set).

    Parameters
    ----------
    weight_size : (int, int)
        The shape of a weight matrix (represented internally by a NumPy array)
        expressed as (rows, columns).
    input_size : (int, int)
        The shape of input data expressed as (rows, columns).
    name_allocator_func : zero-argument function, optional
        See Layer.__doc__.
    dim_allocator_func : one-argument function, optional
        See Layer.__doc__.
    activation : Activation, optional
        See Layer.__doc__. The actual default value is Dummy.
    generate_code : bool, optional
        See Layer.__doc__.
    """

    def __init__(self, weight_size, input_size,
                 name_allocator_func=default_name_allocator,
                 dim_allocator_func=default_dim_allocator,
                 activation=None, generate_code=False):
        super().__init__(weight_size, input_size, activation,
                         name_allocator_func, dim_allocator_func,
                         generate_code)

    def _allocate(self, weight_size, input_size, name_allocator_func,
                  dim_allocator_func, **kwargs):
        """Create the Devito functions backing the layer.

        Returns, in order: weights, input, result, bias, weight gradients,
        result gradients and bias gradients.
        """
        row_dim, in_dim, out_dim = dim_allocator_func(3)
        self._dimensions = (row_dim, in_dim, out_dim)

        def allocate(grid):
            # All buffers share the same settings; only name and grid vary.
            return Function(name=name_allocator_func(), grid=grid,
                            space_order=0, dtype=np.float64)

        weight_grid = Grid(shape=weight_size, dimensions=(out_dim, in_dim))
        weights = allocate(weight_grid)

        input_grid = Grid(shape=input_size, dimensions=(row_dim, in_dim))
        inputs = allocate(input_grid)

        result_grid = Grid(shape=(input_size[0], weight_size[0]),
                           dimensions=(row_dim, out_dim))
        result = allocate(result_grid)

        if self._activation is not None:
            # Extra buffer with the result's shape, zeroed by execute().
            self._T = allocate(result_grid)

        bias_grid = Grid(shape=weight_size[0], dimensions=(out_dim,))
        bias = allocate(bias_grid)

        weight_grad = allocate(weight_grid)
        result_grad = allocate(result_grid)
        bias_grad = allocate(bias_grid)

        return (weights, inputs, result, bias,
                weight_grad, result_grad, bias_grad)

    def execute(self, input_data, bias, weight_data=None):
        """Load the buffers, clear the outputs and run the layer.

        ``weight_data`` is copied into the kernel only when it is given.
        """
        if weight_data is not None:
            self._K.data[:] = weight_data

        self._I.data[:] = input_data
        self._bias.data[:] = bias

        if self._activation is not None:
            self._T.data[:] = 0

        self._R.data[:] = 0

        return super().execute()

    def equations(self, dims=None, zero=False):
        """Return the forward equations and an empty argument list.

        ``dims`` optionally overrides the three dimensions allocated for the
        layer; ``zero`` is accepted but not used.
        """
        row, col, out = dims if dims else self._dimensions

        eqs = [
            Eq(self.result, 0),
            Inc(self.result[row, out],
                self.kernel[out, col] * self.input[row, col]),
        ]

        if self._activation is None:
            eqs.append(Inc(self.result[row, out], self.bias[out]))
        else:
            eqs.append(Eq(self.result[row, out],
                          self._activation(self.bias[out]
                                           + self.result[row, out])))

        return eqs, []

    def backprop_equations(self, prev_layer, next_layer):
        """Return the backward equations and an empty argument list.

        Without a ``prev_layer`` only the bias and kernel gradients are
        accumulated; otherwise the result gradients are first accumulated
        from ``prev_layer`` and passed through the activation's backward
        equations.
        """
        if prev_layer is None:
            eqs = [Inc(self.bias_gradients, self.result_gradients),
                   Inc(self.kernel_gradients,
                       self.input * self.result_gradients)]
            return eqs, []

        eqs = [Inc(self.result_gradients,
                   prev_layer.kernel * prev_layer.result_gradients)]
        eqs = eqs + self.activation.backprop_eqs(self)
        eqs = eqs + [
            Inc(self.bias_gradients, self.result_gradients),
            Eq(self.kernel_gradients,
               self.kernel_gradients + self.input * self.result_gradients),
        ]
        return eqs, []
class FullyConnected3d(Layer):
    """
    A Layer subclass corresponding to a full connection (FC) layer over
    3D input.

    The forward equations accumulate
    ``R[a, b, d] += W[d, c] * V[a, b, c]`` and then add the bias (passing
    the sum through the activation when one is set), i.e. the weight matrix
    is applied along the last axis of the input.

    Parameters
    ----------
    weight_size : (int, int)
        The shape of a weight matrix (represented internally by a NumPy array)
        expressed as (rows, columns).
    input_size : (int, int, int)
        The shape of input data; the last axis is the one contracted with
        the weight columns.
    name_allocator_func : zero-argument function, optional
        See Layer.__doc__.
    dim_allocator_func : one-argument function, optional
        See Layer.__doc__.
    activation : Activation, optional
        See Layer.__doc__. The actual default value is Dummy.
    generate_code : bool, optional
        See Layer.__doc__.
    """

    def __init__(self, weight_size, input_size,
                 name_allocator_func=default_name_allocator,
                 dim_allocator_func=default_dim_allocator,
                 activation=None, generate_code=False):
        super().__init__(weight_size, input_size, activation,
                         name_allocator_func, dim_allocator_func,
                         generate_code)

    def _allocate(self, weight_size, input_size, name_allocator_func,
                  dim_allocator_func, **kwargs):
        """Create the Devito functions backing the layer.

        Returns, in order: weights, input, result, bias, weight gradients,
        result gradients and bias gradients.
        """
        lead_dim, row_dim, in_dim, out_dim = dim_allocator_func(4)
        self._dimensions = (lead_dim, row_dim, in_dim, out_dim)

        def allocate(grid):
            # All buffers share the same settings; only name and grid vary.
            return Function(name=name_allocator_func(), grid=grid,
                            space_order=0, dtype=np.float64)

        weight_grid = Grid(shape=weight_size, dimensions=(out_dim, in_dim))
        weights = allocate(weight_grid)

        input_grid = Grid(shape=input_size,
                          dimensions=(lead_dim, row_dim, in_dim))
        inputs = allocate(input_grid)

        result_grid = Grid(
            shape=(input_size[0], input_size[1], weight_size[0]),
            dimensions=(lead_dim, row_dim, out_dim))
        result = allocate(result_grid)

        if self._activation is not None:
            # Extra buffer with the result's shape, zeroed by execute().
            self._T = allocate(result_grid)

        bias_grid = Grid(shape=weight_size[0], dimensions=(out_dim,))
        bias = allocate(bias_grid)

        weight_grad = allocate(weight_grid)
        result_grad = allocate(result_grid)
        bias_grad = allocate(bias_grid)

        return (weights, inputs, result, bias,
                weight_grad, result_grad, bias_grad)

    def execute(self, input_data, bias, weight_data=None):
        """Load the buffers, clear the outputs and run the layer.

        ``weight_data`` is copied into the kernel only when it is given.
        """
        if weight_data is not None:
            self._K.data[:] = weight_data

        self._I.data[:] = input_data
        self._bias.data[:] = bias

        if self._activation is not None:
            self._T.data[:] = 0

        self._R.data[:] = 0

        return super().execute()

    def equations(self, dims=None, zero=False):
        """Return the forward equations and an empty argument list.

        ``dims`` optionally overrides the four dimensions allocated for the
        layer; ``zero`` is accepted but not used.
        """
        lead, row, col, out = dims if dims else self._dimensions

        eqs = [
            Eq(self.result, 0),
            Inc(self.result[lead, row, out],
                self.kernel[out, col] * self.input[lead, row, col]),
        ]

        if self._activation is None:
            eqs.append(Inc(self.result[lead, row, out], self.bias[out]))
        else:
            eqs.append(Eq(self.result[lead, row, out],
                          self._activation(self.bias[out]
                                           + self.result[lead, row, out])))

        return eqs, []

    def backprop_equations(self, prev_layer, next_layer):
        """Return the backward equations and an empty argument list.

        Without a ``prev_layer`` only the bias and kernel gradients are
        accumulated; otherwise the result gradients are first accumulated
        from ``prev_layer`` and passed through the activation's backward
        equations.
        """
        if prev_layer is None:
            eqs = [Inc(self.bias_gradients, self.result_gradients),
                   Inc(self.kernel_gradients,
                       self.input * self.result_gradients)]
            return eqs, []

        eqs = [Inc(self.result_gradients,
                   prev_layer.kernel * prev_layer.result_gradients)]
        eqs = eqs + self.activation.backprop_eqs(self)
        eqs = eqs + [
            Inc(self.bias_gradients, self.result_gradients),
            Eq(self.kernel_gradients,
               self.kernel_gradients + self.input * self.result_gradients),
        ]
        return eqs, []
class Norm3d(Layer):
    """
    A Layer subclass normalising 3D input along its last axis.

    For every ``(batch, row)`` position the forward equations compute the
    mean and the (population) standard deviation over the last axis and emit
    ``kernel * (input - mean) / (std + eps) + bias``.

    Parameters
    ----------
    weight_size : (int,)
        Shape of the per-column scale (kernel) and shift (bias) vectors.
    input_size : (int, int, int)
        Shape of the input and of the result.
    name_allocator_func : zero-argument function, optional
        See Layer.__doc__.
    dim_allocator_func : one-argument function, optional
        See Layer.__doc__.
    activation : Activation, optional
        See Layer.__doc__.
    generate_code : bool, optional
        See Layer.__doc__.
    **kwargs
        Forwarded to ``Layer.__init__``; ``eps`` (default ``1e-6``) is read
        in ``_allocate`` and added to the standard deviation.
    """

    def __init__(self, weight_size, input_size,
                 name_allocator_func=default_name_allocator,
                 dim_allocator_func=default_dim_allocator,
                 activation=None, generate_code=False, **kwargs):
        super().__init__(weight_size, input_size, activation,
                         name_allocator_func, dim_allocator_func,
                         generate_code, **kwargs)

    def _allocate(self, weight_size, input_size, name_allocator_func,
                  dim_allocator_func, **kwargs):
        """Create the Devito functions backing the layer.

        Returns, in order: kernel, input, result, bias, kernel gradients,
        result gradients and bias gradients.
        """
        # ``col2`` indexes the size-1 last axis of the reduction buffers
        # built in equations().
        batch, row, col, col2 = dim_allocator_func(4)

        self.eps = kwargs.get('eps', 1e-6)
        self._dimensions = (batch, row, col, col2)
        self.shape = input_size

        self.N = weight_size[0]

        def allocate(grid):
            # All buffers share the same settings; only name and grid vary.
            return Function(name=name_allocator_func(), grid=grid,
                            space_order=0, dtype=np.float64)

        gridW = Grid(shape=weight_size, dimensions=(col,))
        W = allocate(gridW)

        gridV = Grid(shape=input_size, dimensions=(batch, row, col))
        V = allocate(gridV)

        gridR = Grid(shape=input_size, dimensions=(batch, row, col))
        R = allocate(gridR)

        bias_grid = Grid(shape=weight_size[0], dimensions=(col,))
        bias = allocate(bias_grid)

        kernel_grad = allocate(gridW)
        output_grad = allocate(gridR)
        bias_grad = allocate(bias_grid)

        return W, V, R, bias, kernel_grad, output_grad, bias_grad

    def init_params(self):
        """Initialise the scale to ones and the shift to zeros."""
        self._K.data[:] = np.ones(self.N)
        self._bias.data[:] = np.zeros(self.N)

    def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
        # Placeholder: stand-alone execution is not implemented.
        pass

    def equations(self, zero=False) -> (list, list):
        """Return the forward equations and an empty argument list.

        ``zero`` is accepted but not used.
        """
        batch, row, col, col2 = self._dimensions
        axis = Constant(name=self.name + 'dim_1', value=self._I.shape[-1])
        eps = Constant(name=self.name + 'eps', value=self.eps)

        # Per-(batch, row) reduction buffers with a trailing axis of size 1.
        reduced_shape = self.shape[0:2] + (1,)
        reduced_dims = (batch, row, col2)
        result_sum = get_tensor_3d(
            default_name_allocator('result_sum_' + self.name),
            shape=reduced_shape, dims=reduced_dims)
        result_mean = get_tensor_3d(
            default_name_allocator('result_mean_' + self.name),
            shape=reduced_shape, dims=reduced_dims)
        result_std = get_tensor_3d(
            default_name_allocator('result_std_' + self.name),
            shape=reduced_shape, dims=reduced_dims)

        eqs = [
            Eq(self.result, 0),
            Eq(result_sum, 0),
            Inc(result_sum, self.input),
            Eq(result_mean, result_sum / axis),
            # The accumulator must be cleared like ``result_sum``; without
            # this a second run of the operator would start from the
            # previous run's standard deviation.
            Eq(result_std, 0),
            Inc(result_std, ((self.input - result_mean) ** 2)),
            Eq(result_std, result_std / axis),
            Eq(result_std, sqrt(result_std)),
            Eq(self.result,
               self.kernel * (self.input - result_mean) / (result_std + eps)),
            Inc(self.result, self.bias[col]),
        ]

        return eqs, []

    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
        """Return no backward equations (none are defined for this layer)."""
        return [], []
class Norm2d(Layer):
    """
    A Layer subclass normalising 2D input along its last axis.

    For every row the forward equations compute the mean and the
    (population) standard deviation over the columns and emit
    ``kernel * (input - mean) / (std + eps) + bias``.

    Parameters
    ----------
    weight_size : (int,)
        Shape of the per-column scale (kernel) and shift (bias) vectors.
    input_size : (int, int)
        Shape of the input and of the result.
    name_allocator_func : zero-argument function, optional
        See Layer.__doc__.
    dim_allocator_func : one-argument function, optional
        See Layer.__doc__.
    activation : Activation, optional
        See Layer.__doc__.
    generate_code : bool, optional
        See Layer.__doc__.
    **kwargs
        Forwarded to ``Layer.__init__``; ``eps`` (default ``1e-6``) is read
        in ``_allocate`` and added to the standard deviation.
    """

    def __init__(self, weight_size, input_size,
                 name_allocator_func=default_name_allocator,
                 dim_allocator_func=default_dim_allocator,
                 activation=None, generate_code=False, **kwargs):
        super().__init__(weight_size, input_size, activation,
                         name_allocator_func, dim_allocator_func,
                         generate_code, **kwargs)

    def _allocate(self, weight_size, input_size, name_allocator_func,
                  dim_allocator_func, **kwargs):
        """Create the Devito functions backing the layer.

        Returns, in order: kernel, input, result, bias, kernel gradients,
        result gradients and bias gradients.
        """
        # ``col2`` indexes the size-1 last axis of the reduction buffers
        # built in equations().
        row, col, col2 = dim_allocator_func(3)

        self.eps = kwargs.get('eps', 1e-6)
        self._dimensions = (row, col, col2)
        self.shape = input_size

        self.N = weight_size[0]

        def allocate(grid):
            # All buffers share the same settings; only name and grid vary.
            return Function(name=name_allocator_func(), grid=grid,
                            space_order=0, dtype=np.float64)

        gridW = Grid(shape=weight_size, dimensions=(col,))
        W = allocate(gridW)

        gridV = Grid(shape=input_size, dimensions=(row, col))
        V = allocate(gridV)

        gridR = Grid(shape=input_size, dimensions=(row, col))
        R = allocate(gridR)

        bias_grid = Grid(shape=weight_size[0], dimensions=(col,))
        bias = allocate(bias_grid)

        kernel_grad = allocate(gridW)
        output_grad = allocate(gridR)
        bias_grad = allocate(bias_grid)

        return W, V, R, bias, kernel_grad, output_grad, bias_grad

    def init_params(self):
        """Initialise the scale to ones and the shift to zeros."""
        self._K.data[:] = np.ones(self.N)
        self._bias.data[:] = np.zeros(self.N)

    def execute(self, kernel_data=None, input_data=None, bias=None) -> array:
        # Placeholder: stand-alone execution is not implemented.
        pass

    def equations(self, zero=False) -> (list, list):
        """Return the forward equations and an empty argument list.

        ``zero`` is accepted but not used.
        """
        row, col, col2 = self._dimensions
        axis = Constant(name=self.name + 'dim_1', value=self._I.shape[-1])
        eps = Constant(name=self.name + 'eps', value=self.eps)

        # Per-row reduction buffers with a trailing axis of size 1.
        reduced_shape = self.shape[0:1] + (1,)
        reduced_dims = (row, col2)
        result_sum = get_tensor_2d(
            default_name_allocator('result_sum_' + self.name),
            shape=reduced_shape, dims=reduced_dims)
        result_mean = get_tensor_2d(
            default_name_allocator('result_mean_' + self.name),
            shape=reduced_shape, dims=reduced_dims)
        result_std = get_tensor_2d(
            default_name_allocator('result_std_' + self.name),
            shape=reduced_shape, dims=reduced_dims)

        eqs = [
            Eq(self.result, 0),
            Eq(result_sum, 0),
            Inc(result_sum, self.input),
            Eq(result_mean, result_sum / axis),
            # The accumulator must be cleared like ``result_sum``; without
            # this a second run of the operator would start from the
            # previous run's standard deviation.
            Eq(result_std, 0),
            Inc(result_std, ((self.input - result_mean) ** 2)),
            Eq(result_std, result_std / axis),
            Eq(result_std, sqrt(result_std)),
            Eq(self.result,
               self.kernel * (self.input - result_mean) / (result_std + eps)),
            Inc(self.result, self.bias[col]),
        ]

        return eqs, []

    def backprop_equations(self, prev_layer, next_layer) -> (list, list):
        """Return no backward equations (none are defined for this layer)."""
        return [], []
from devito import Function, SpaceDimension
import numpy as np

from joey import default_dim_allocator, default_name_allocator


def get_tensor_4d(name, shape, dims=None):
    """Return a float32 Devito ``Function`` with four axes.

    ``shape`` must unpack into exactly four sizes.  ``dims`` supplies the
    four dimensions; when it is omitted (or empty) fresh ones are requested
    from ``default_dim_allocator``.  The function name is obtained by
    passing ``name`` through ``default_name_allocator``.
    """
    n0, n1, n2, n3 = shape
    if dims:
        d0, d1, d2, d3 = dims
    else:
        d0, d1, d2, d3 = default_dim_allocator(4)

    return Function(name=default_name_allocator(name),
                    shape=(n0, n1, n2, n3),
                    dimensions=(d0, d1, d2, d3),
                    dtype=np.float32)


def get_tensor_3d(name, shape, dims=None):
    """Return a float32 Devito ``Function`` with three axes.

    Same contract as ``get_tensor_4d`` with three sizes and dimensions.
    """
    n0, n1, n2 = shape
    if dims:
        d0, d1, d2 = dims
    else:
        d0, d1, d2 = default_dim_allocator(3)

    return Function(name=default_name_allocator(name),
                    shape=(n0, n1, n2),
                    dimensions=(d0, d1, d2),
                    dtype=np.float32)


def get_tensor_2d(name, shape, dims=None):
    """Return a float32 Devito ``Function`` with two axes.

    Same contract as ``get_tensor_4d`` with two sizes and dimensions.
    """
    n0, n1 = shape
    if dims:
        d0, d1 = dims
    else:
        d0, d1 = default_dim_allocator(2)

    return Function(name=default_name_allocator(name),
                    shape=(n0, n1),
                    dimensions=(d0, d1),
                    dtype=np.float32)


def get_tensor_1d(name, shape, dim=None):
    """Return a float32 Devito ``Function`` with a single axis.

    Unlike the multi-axis helpers, ``shape`` is the axis size itself (not a
    sequence) and ``dim`` is a single dimension; when ``dim`` is omitted the
    first dimension returned by ``default_dim_allocator(1)`` is used.
    """
    size = shape
    if dim:
        axis = dim
    else:
        axis = default_dim_allocator(1)[0]

    return Function(name=default_name_allocator(name),
                    shape=(size,),
                    dimensions=(axis,),
                    dtype=np.float32)
"""Scratch script: push MNIST batches through a Joey network and back-propagate."""

BATCH_SIZE = 64
NUM_CLASSES = 10


def loss_grad(result, expected):
    """Return the loss gradients for one batch as a list of rows.

    Parameters
    ----------
    result : object
        Layer-like object whose ``result.data`` is indexed as
        ``[class, sample]``.
    expected : sequence
        Expected class index of every sample in the batch.

    Returns
    -------
    list of list
        One row per sample with ``NUM_CLASSES`` entries: the layer output
        for that class, minus one at the expected class.
    """
    scores = result.result.data
    gradients = []
    # ``b`` indexes ``expected`` and the second axis of ``scores``, so the
    # batch loop is bounded by the number of expected labels.
    for b in range(len(expected)):
        row = []
        for i in range(NUM_CLASSES):
            # Read the score into a fresh scalar; the layer data itself is
            # left untouched by the subtraction below.
            value = scores[i, b]
            if i == expected[b]:
                value -= 1
            row.append(value)
        gradients.append(row)

    return gradients


def main():
    """Run one pass over the MNIST training set through the network."""
    # Imports are kept local so that importing this module (for example to
    # reuse ``loss_grad``) does not require the datasets or the network.
    import numpy as np
    from torch import optim
    from torchvision import datasets, transforms, models
    import torchvision.transforms as transforms
    from devito import logger
    import torch

    from new_layers import x as net

    mean, std = (0.5,), (0.5,)

    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize(mean, std)
                                    ])

    trainset = datasets.MNIST('../data/MNIST/', download=True, train=True,
                              transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE,
                                              shuffle=False)

    testset = datasets.MNIST('../data/MNIST/', download=True, train=False,
                             transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE,
                                             shuffle=False)

    optimizer = optim.SGD(net.pytorch_parameters, lr=0.001, momentum=0.9)
    criterion = torch.nn.CrossEntropyLoss()

    for img, label in trainloader:
        # NOTE(review): assumes every batch holds exactly 64 images; a
        # shorter final batch cannot be reshaped this way - confirm.
        img = img.reshape(28, 28, 64)

        out = net.forward(img.detach().numpy())
        # NOTE(review): random values are passed as the expected output
        # instead of ``label`` - presumably a placeholder; confirm.
        net.backward(np.random.rand(64, 10), loss_grad, optimizer)


if __name__ == "__main__":
    main()
import numpy as np
import torch
from torchvision import datasets
import torchvision.transforms as transforms
from devito import logger

from joey.models.ViT import ViT
from tests.utils import transfer_weights_ViT

logger.set_log_level(level='ERROR')
image_size = 28
channel_size = 1
patch_size = 7
embed_size = 512
num_heads = 8
classes = 10
num_layers = 3
hidden_size = 256
dropout = 0.2

np.random.seed(0)

BATCH_SIZE = 64

mean, std = (0.5,), (0.5,)

transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize(mean, std)
                                ])

trainset = datasets.MNIST('../data/MNIST/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)

testset = datasets.MNIST('../data/MNIST/', download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)

model = ViT(image_size, channel_size, patch_size, embed_size, num_heads, classes, num_layers, hidden_size,
            generate_code=True)


def test_eval_model():
    """Check that the model with pretrained weights reaches 95% accuracy.

    Pretrained weights are copied into ``model`` and the test loader is
    walked up to batch index 10; batches that are not exactly
    ``BATCH_SIZE`` long are skipped because the model was built for a fixed
    batch size.
    """
    transfer_weights_ViT(model)

    y_true_test = []
    y_pred_test = []

    for batch_idx, (img, labels) in enumerate(testloader):
        if img.size(0) != BATCH_SIZE:
            continue
        preds = model.forward(img.detach().numpy())
        y_pred_test.extend(preds.argmax(axis=-1).tolist())
        y_true_test.extend(labels.detach().tolist())
        if batch_idx == 10:
            break

    total_correct = sum(pred == true
                        for pred, true in zip(y_pred_test, y_true_test))
    total = len(y_pred_test)
    assert total > 0, "no full batch was evaluated"
    accuracy = total_correct * 100 / total

    print("Test Accuracy%: ", accuracy, "==", total_correct, "/", total)

    # pytest ignores a test's return value, so the threshold has to be
    # asserted for the test to be able to fail.
    assert accuracy >= 95, f"accuracy {accuracy}% is below 95%"
def transfer_weights_ViT(model):
    """Copy pretrained ViT weights from disk into ``model``.

    The state dictionary is loaded from
    ``'../examples/resources/model_weights_ViT'`` (relative to the current
    working directory) and each tensor is written into the ``data`` of the
    matching buffer of ``model``: the embeddings, class token, positional
    encoding, every encoder (norms, attention projections, feed-forward
    layers), the final norm and the classifier.
    """
    weights_pretained = torch.load('../examples/resources/model_weights_ViT')

    def as_array(key):
        # Stored tensors are detached before being converted to NumPy.
        return weights_pretained[key].detach().numpy()

    def load_layer(layer, prefix):
        # Kernel first, then bias, for the layer stored under ``prefix``.
        layer.kernel.data[:] = as_array(f'{prefix}.weight')
        layer.bias.data[:] = as_array(f'{prefix}.bias')

    def equal_layer(num):
        encoder = model.encoders[num]
        base = f'encoders.{num}'

        load_layer(encoder.norm1, f'{base}.norm1')
        load_layer(encoder.norm2, f'{base}.norm2')
        load_layer(encoder.attention.Q, f'{base}.attention.Q')
        load_layer(encoder.attention.K, f'{base}.attention.K')
        load_layer(encoder.attention.V, f'{base}.attention.V')
        load_layer(encoder.attention.linear, f'{base}.attention.linear')
        load_layer(encoder.mlp[0], f'{base}.mlp.0')
        # The second feed-forward layer is stored under index 2 in the
        # pretrained state dictionary.
        load_layer(encoder.mlp[1], f'{base}.mlp.2')

    with torch.no_grad():
        load_layer(model.embeddings, 'embeddings')
        model.class_token.data[:] = as_array('class_token')
        model.positional_encoding.data[:] = as_array('positional_encoding')

        for index, _ in enumerate(model.encoders):
            equal_layer(index)

        load_layer(model.norm, 'norm')
        load_layer(model.classifier, 'classifier.0')