Fix softmax overflow
Paul Ilioaica committed Sep 26, 2024
1 parent 186792a commit d05a6b6
Showing 4 changed files with 2,730 additions and 154 deletions.
3 changes: 1 addition & 2 deletions src/loss_functions/cce.py
@@ -31,8 +31,7 @@ def backward(self):
             sample_size = len(self.y_pred[batch])
             for sample in range(sample_size):
                 for i in range(len(self.y_pred[batch][sample])):
-                    pred_value = max(self.y_pred[batch][sample][i].value, self.epsilon)
-                    self.y_pred[batch][sample][i].grad = -self.y_true[batch][sample][i].value / pred_value / batch_size
+                    self.y_pred[batch][sample][i].grad = (self.y_pred[batch][sample][i].value - self.y_true[batch][sample][i].value)
                     self.y_pred[batch][sample][i].backward()

     def __repr__(self):
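Note: the new gradient appears to rely on the standard identity for cross-entropy composed with softmax: with p = softmax(z) and L = -sum_i y_i * log(p_i), the gradient with respect to the logits is dL/dz_i = p_i - y_i. A minimal standalone check of that identity, using plain floats rather than this repository's Value/autograd classes (all names below are illustrative only):

import math

def softmax(z):
    m = max(z)                                   # shift for numerical stability
    e = [math.exp(v - m) for v in z]
    s = sum(e)
    return [v / s for v in e]

def cross_entropy(z, y):                         # L = -sum(y_i * log(p_i)) on softmax(z)
    return -sum(t * math.log(q) for t, q in zip(y, softmax(z)))

z = [2.0, -1.0, 0.5]                             # logits
y = [1.0, 0.0, 0.0]                              # one-hot target
analytic = [p - t for p, t in zip(softmax(z), y)]

eps = 1e-6
numeric = []
for i in range(len(z)):
    hi, lo = z[:], z[:]
    hi[i] += eps
    lo[i] -= eps
    numeric.append((cross_entropy(hi, y) - cross_entropy(lo, y)) / (2 * eps))

print(analytic)                                  # the two lists agree to ~1e-9
print(numeric)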
6 changes: 3 additions & 3 deletions src/models/simple_model.py
@@ -1,17 +1,17 @@
from nn.linear_layer import LinearLayer
from activation_functions.relu import ReLUActivation
from activation_functions.sigmoid import SigmoidActivation
from normalization.softmax import Softmax
from nn.module import Module

class SimpleModel(Module):
    def __init__(self, input_size, hidden_size, output_size):
        self.layer1 = LinearLayer(input_size, hidden_size)
        self.layer2 = LinearLayer(hidden_size, output_size)
        self.relu = ReLUActivation()
        self.sigmoid = SigmoidActivation()
        self.softmax = Softmax()

    def __call__(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.sigmoid(x)
        x = self.layer2(x)
        return self.softmax(x)
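For context, a minimal plain-Python sketch of the pipeline this model wires together (linear -> ReLU -> sigmoid -> linear -> softmax); the weights, dimensions, and helper names below are invented for illustration and are not part of the repository:

import math, random

def linear(x, w, b):                             # one dense layer: y = W x + b
    return [sum(wi * xi for wi, xi in zip(row, x)) + bi for row, bi in zip(w, b)]

def relu(x):
    return [max(0.0, v) for v in x]

def sigmoid(x):
    return [1.0 / (1.0 + math.exp(-v)) for v in x]

def softmax(x):
    m = max(x)
    e = [math.exp(v - m) for v in x]
    return [v / sum(e) for v in e]

random.seed(0)
input_size, hidden_size, output_size = 4, 8, 3
w1 = [[random.gauss(0, 1) for _ in range(input_size)] for _ in range(hidden_size)]
b1 = [0.0] * hidden_size
w2 = [[random.gauss(0, 1) for _ in range(hidden_size)] for _ in range(output_size)]
b2 = [0.0] * output_size

x = [0.5, -1.2, 3.0, 0.7]
h = sigmoid(relu(linear(x, w1, b1)))             # hidden activations squashed into (0, 1)
probs = softmax(linear(h, w2, b2))
print(probs, sum(probs))                         # class probabilities, sum is 1.0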
20 changes: 15 additions & 5 deletions src/normalization/softmax.py
@@ -2,17 +2,27 @@
 import math

 class Softmax(NormalizationFunction):
+    def __init__(self, temperature=3, clip_value=50):
+        self.temperature = temperature
+        self.clip_value = clip_value

     def _softmax(self, x):
         max_value = max([i.value for i in x])
-        e_x = [math.exp(i.value - max_value) for i in x]
-        return [i / sum(e_x) for i in e_x]

+        clipped_logits = [(i.value - max_value) / self.temperature for i in x]
+        clipped_logits = [min(self.clip_value, max(-self.clip_value, logit)) for logit in clipped_logits]

+        log_sum_exp = math.log(sum([math.exp(logit) for logit in clipped_logits]))
+        softmax_values = [math.exp(logit - log_sum_exp) for logit in clipped_logits]

+        return softmax_values

     def forward(self, input):
         softmax_values = self._softmax(input)
         return softmax_values

     def _build_backward_function(self, input, out):
         def _backward():
-            input.grad += out.value * (1 - out.value) if out.requires_grad else 0
+            if out.requires_grad:
+                input.grad += out.value * (1 - out.value) * out.grad / self.temperature
         return _backward
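A standalone sketch of the stabilization applied above, using plain floats instead of the repository's Value objects (the default temperature and clip_value mirror the constructor shown; the function name is illustrative). Subtracting the running maximum, dividing by the temperature, clamping to +/- clip_value, and normalizing via log-sum-exp keeps every exponent in a representable range, whereas a naive math.exp(1000) raises OverflowError:

import math

def stable_softmax(values, temperature=3, clip_value=50):
    m = max(values)
    logits = [(v - m) / temperature for v in values]                  # largest logit becomes 0
    logits = [min(clip_value, max(-clip_value, l)) for l in logits]   # clamp the extremes
    log_sum_exp = math.log(sum(math.exp(l) for l in logits))          # always finite
    return [math.exp(l - log_sum_exp) for l in logits]

print(stable_softmax([1000.0, 999.0, 0.0]))                           # finite probabilities, no overflow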
