diff --git a/benchmarks/ex01_xor.nim b/benchmarks/ex01_xor.nim
index e815d97b8..ef1e48aac 100644
--- a/benchmarks/ex01_xor.nim
+++ b/benchmarks/ex01_xor.nim
@@ -1,50 +1,52 @@
 import ../src/arraymancer
 
 # Learning XOR function with a neural network.
-
-# Autograd context / neuralnet graph
-let ctx = newContext Tensor[float32]
-let bsz = 32 # batch size
-
-let x_train_bool = randomTensor([bsz * 100, 2], 1).astype(bool)
-let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
-let x_train = ctx.variable(x_train_bool.astype(float32))
-let y = y_bool.astype(float32)
-
-# We will build the following network:
-# Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
-
-let layer_3neurons = ctx.variable(
-                       randomTensor(3, 2, 2.0f) -. 1.0f,
-                       requires_grad = true
-                     )
-
-let classifier_layer = ctx.variable(
-                         randomTensor(1, 3, 2.0f) -. 1.0f,
-                         requires_grad = true
-                       )
-
-# Stochastic Gradient Descent
-let optim = newSGD[float32](
-    layer_3neurons, classifier_layer, 0.01f
-  )
-
-# Learning loop
-for epoch in 0..10000:
-  for batch_id in 0..<100:
-
-    # minibatch offset in the Tensor
-    let offset = batch_id * 32
-    let x = x_train[offset ..< offset + 32, _]
-    let target = y[offset ..< offset + 32, _]
-
-    # Building the network
-    let n1 = relu linear(x, layer_3neurons)
-    let n2 = linear(n1, classifier_layer)
-    let loss = n2.sigmoid_cross_entropy(target)
-
-    # Compute the gradient (i.e. contribution of each parameter to the loss)
-    loss.backprop()
-
-    # Correct the weights now that we have the gradient information
-    optim.update()
+proc main() =
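+  # Note: Nim generally optimizes proc-local variables better than
+  # top-level globals, so the whole example runs inside `main`.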
+  # Autograd context / neuralnet graph
+  let ctx = newContext Tensor[float32]
+  let bsz = 32 # batch size
+
+  let x_train_bool = randomTensor([bsz * 100, 2], 1).astype(bool)
+  let y_bool = x_train_bool[_,0] xor x_train_bool[_,1]
+  let x_train = ctx.variable(x_train_bool.astype(float32))
+  let y = y_bool.astype(float32)
+
+  # We will build the following network:
+  # Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
+
+  let layer_3neurons = ctx.variable(
+                        randomTensor(3, 2, 2.0f) -. 1.0f,
+                        requires_grad = true
+                      )
+
+  let classifier_layer = ctx.variable(
+                          randomTensor(1, 3, 2.0f) -. 1.0f,
+                          requires_grad = true
+                        )
+
+  # Stochastic Gradient Descent
+  let optim = newSGD[float32](
+      layer_3neurons, classifier_layer, 0.01f
+    )
+
+  # Learning loop
+  for epoch in 0..10000:
+    for batch_id in 0..<100:
+
+      # minibatch offset in the Tensor
+      let offset = batch_id * 32
+      let x = x_train[offset ..< offset + 32, _]
+      let target = y[offset ..< offset + 32, _]
+
+      # Building the network
+      let n1 = relu linear(x, layer_3neurons)
+      let n2 = linear(n1, classifier_layer)
+      let loss = n2.sigmoid_cross_entropy(target)
+
+      # Compute the gradient (i.e. contribution of each parameter to the loss)
+      loss.backprop()
+
+      # Correct the weights now that we have the gradient information
+      optim.update()
+
+main()
diff --git a/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim b/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim
index 74047d2b3..4457aeef7 100644
--- a/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim
+++ b/benchmarks/implementation/softmax_cross_entropy_bench_batch_first.nim
@@ -176,6 +176,7 @@ proc softmax_cross_entropy_backward1[T](
 
   result = zeros_like(cached_tensor)
 
+  # TODO: nested parallelism has suspect performance
   for i in 0||(batch_size-1):
     let (max, sumexp) = cached_tensor[i,_].streaming_max_sumexp
 
diff --git a/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim b/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim
index 54cbd6bc5..acd16871a 100644
--- a/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim
+++ b/benchmarks/implementation/softmax_cross_entropy_bench_batch_last.nim
@@ -176,6 +176,7 @@ proc softmax_cross_entropy_backward1[T](
 
   result = zeros_like(cached_tensor)
 
+  # TODO: nested parallelism has suspect performance
   for i in 0||(batch_size-1): # Can't use OpenMP - SIGSEGV Illegal Address
     let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp
 
diff --git a/examples/ex06_shakespeare_generator.nim b/examples/ex06_shakespeare_generator.nim
index bf179d590..1cdf1e56b 100644
--- a/examples/ex06_shakespeare_generator.nim
+++ b/examples/ex06_shakespeare_generator.nim
@@ -56,7 +56,7 @@ const
 #
 # ################################################################
 
-func strToTensor(str: string|TaintedString): Tensor[PrintableIdx] =
+proc strToTensor(str: string|TaintedString): Tensor[PrintableIdx] =
   result = newTensor[PrintableIdx](str.len)
 
   # For each x in result, map the corresponding char index
diff --git a/src/arraymancer/io/io_image.nim b/src/arraymancer/io/io_image.nim
index e998cd83d..3e211f9e0 100644
--- a/src/arraymancer/io/io_image.nim
+++ b/src/arraymancer/io/io_image.nim
@@ -27,7 +27,7 @@ proc read_image*(filepath: string): Tensor[uint8] =
   ## Usage example with conversion to [0..1] float:
   ## .. code:: nim
   ##   let raw_img = read_image('path/to/image.png')
-  ##   let img = raw_img.map_inline:
-  ##     x.float32 / 255.0
+  ##   var img = newTensor[float32](raw_img.shape)
+  ##   forEach v in img, x in raw_img:
+  ##     v = x.float32 / 255.0
 
   var width, height, channels: int
diff --git a/src/arraymancer/laser/strided_iteration/foreach_common.nim b/src/arraymancer/laser/strided_iteration/foreach_common.nim
index ff46b67dc..06c424040 100644
--- a/src/arraymancer/laser/strided_iteration/foreach_common.nim
+++ b/src/arraymancer/laser/strided_iteration/foreach_common.nim
@@ -4,14 +4,73 @@
 # This file may not be copied, modified, or distributed except according to those terms.
 
 import
-  macros,
+  std/[macros, strutils],
   ../compiler_optim_hints
 
-template isVar[T: object](x: T): bool =
+template isVar[T](x: T): bool =
   ## Workaround due to `is` operator not working for `var`
   ## https://github.com/nim-lang/Nim/issues/9443
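+  ## (`addr` only compiles for mutable l-values, hence `compiles(addr(x))`
+  ## is true exactly when `x` is a `var` location)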
   compiles(addr(x))
 
+proc aliasTensor(id: int, tensor: NimNode): tuple[alias: NimNode, isVar: NimNode] =
+  ## Produce an alias variable for a tensor
+  ## Supports:
+  ## - identifiers
+  ## - dot call for field access
+  ## - foo[1] for array access
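+  ##
+  ## e.g. with id 2, `self.moments[i]` is flattened into the alias
+  ## identifier `self_moments_i_alias2_`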
+  if tensor.kind notin {nnkIdent, nnkSym, nnkDotExpr, nnkBracketExpr, nnkCall}:
+    error "Expected a variable identifier or field access or array indexing but found \"" & tensor.repr() &
+      "\". Please assign to a variable first."
+
+  var t = tensor
+
+  # nnkCall handles the annoying typed AST we get in generics
+  # Call
+  #   OpenSymChoice
+  #     Sym "[]"
+  #     Sym "[]"
+  #     ...
+  #   DotExpr
+  #     Ident "self"
+  #     Ident "first_moments"
+  #   Ident "i"
+  if tensor.kind == nnkCall:
+    tensor[0].expectKind nnkOpenSymChoice
+    tensor[0][0].expectKind nnkSym
+    doAssert tensor[0][0].eqIdent("[]")
+
+    # Rewrite the AST to untyped
+    t = nnkBracketExpr.newTree(
+      tensor[1]
+    )
+    for i in 2 ..< tensor.len:
+      t.add tensor[i]
+
+  let isVar = block:
+    # Handle slicing cases like foo[0..<1, 0..<2]
+    # that do not return `var` but are technically `var`
+    # if `foo` is var
+    if t.kind in {nnkDotExpr, nnkBracketExpr}:
+      let t0 = t[0]
+      quote do: isVar(`t0`)
+    else:
+      quote do: isVar(`t`)
+
+  proc flattenedName(node: NimNode): string =
+    if node.kind in {nnkDotExpr, nnkBracketExpr}:
+      result = flattenedName(node[0])
+      result &= '_'
+      result &= flattenedName(node[1])
+    elif node.kind in {nnkIdent, nnkSym}:
+      result = $node
+    else:
+      error "Expected a field nameor dot expression or array access but found \"" &
+        t.repr()
+
+  let alias = flattenedName(t)
+
+  return (newIdentNode($alias & "_alias" & $id & '_'), isVar)
+
 proc initForEach*(
         params: NimNode,
         values, aliases, raw_ptrs: var NimNode,
@@ -24,7 +83,9 @@ proc initForEach*(
   var tensors = nnkBracket.newTree()
 
   template syntaxError() {.dirty.} =
-    error "Syntax error: argument " & ($arg.kind).substr(3) & " in position #" & $i & " was unexpected."
+    error "Syntax error: argument " &
+      ($arg.kind).substr(3) & " in position #" & $i & " was unexpected." &
+      "\n    " & arg.repr()
 
   for i, arg in params:
     if arg.kind == nnkInfix:
@@ -56,15 +117,15 @@ proc initForEach*(
   aliases_stmt.add newCall(bindSym"withCompilerOptimHints")
 
   for i, tensor in tensors:
-    let alias = newIdentNode($tensor & "_alias" & $i & '_')
+    let (alias, detectVar) = aliasTensor(i, tensor)
     aliases.add alias
     aliases_stmt.add quote do:
-      when isVar(`tensor`):
+      when `detectVar`:
         var `alias`{.align_variable.} = `tensor`
       else:
         let `alias`{.align_variable.} = `tensor`
 
-    let raw_ptr_i = genSym(nskLet, $tensor & "_raw_data" & $i & '_')
+    let raw_ptr_i = genSym(nskLet, $alias & "_raw_data" & $i & '_')
     raw_ptrs_stmt.add quote do:
       let `raw_ptr_i`{.restrict.} = `alias`.unsafe_raw_offset()
     raw_ptrs.add raw_ptr_i
@@ -74,7 +135,9 @@ proc initForEach*(
   for i in 1 ..< aliases.len:
     let alias_i = aliases[i]
     test_shapes.add quote do:
-      assert `alias0`.shape == `alias_i`.shape
+      assert `alias0`.shape == `alias_i`.shape,
+        "\n " & astToStr(`alias0`) & ".shape is " & $`alias0`.shape & "\n" &
+        "\n " & astToStr(`alias_i`) & ".shape is " & $`alias_i`.shape & "\n"
 
 template stridedVarsSetup*(): untyped {.dirty.} =
   for i, alias in aliases:
diff --git a/src/arraymancer/laser/tensor/initialization.nim b/src/arraymancer/laser/tensor/initialization.nim
index 727e70dbc..5d65144ad 100644
--- a/src/arraymancer/laser/tensor/initialization.nim
+++ b/src/arraymancer/laser/tensor/initialization.nim
@@ -136,14 +136,20 @@ proc copyFromRaw*[T](dst: var Tensor[T], buffer: ptr T, len: Natural) =
     withCompilerOptimHints()
     doAssert dst.size == len, "Tensor size and buffer length should be the same"
     let buf{.restrict.} = cast[ptr UncheckedArray[T]](buffer)
-    omp_parallel_chunks(
-            len, chunk_offset, chunk_size,
-            OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
-        copyMem(
-          dst.unsafe_raw_offset[chunk_offset].addr,
-          buf[chunk_offset].unsafeAddr,
-          chunk_size * sizeof(T)
-        )
+    # No OpenMP - unexplained overhead https://github.com/mratsim/Arraymancer/issues/485
+    # omp_parallel_chunks(
+    #         len, chunk_offset, chunk_size,
+    #         OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
+    #     copyMem(
+    #       dst.unsafe_raw_offset[chunk_offset].addr,
+    #       buf[chunk_offset].unsafeAddr,
+    #       chunk_size * sizeof(T)
+    #     )
+    copyMem(
+      dst.unsafe_raw_offset[0].addr,
+      buf[0].unsafeAddr,
+      len * sizeof(T)
+    )
   else:
     {.fatal: "Only non-ref types and types with trivial destructors can be raw copied.".}
 
@@ -166,14 +172,19 @@ proc setZero*[T](t: var Tensor[T], check_contiguous: static bool = true) =
   when not (T is KnownSupportsCopyMem):
     t.storage.raw_buffer.reset()
     t.storage.raw_buffer.setLen(t.size)
-  else:
-    omp_parallel_chunks(
-          t.size, chunk_offset, chunk_size,
-          OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
-      zeroMem(
-        t.unsafe_raw_offset[chunk_offset].addr,
-        chunk_size * sizeof(T)
-      )
+  else: # If setZero or newTensor is called inside an OpenMP parallel region,
+        # making this zeroing parallel as well would kill performance.
+    # omp_parallel_chunks(
+    #       t.size, chunk_offset, chunk_size,
+    #       OMP_MEMORY_BOUND_GRAIN_SIZE * 4):
+    #   zeroMem(
+    #     t.unsafe_raw_offset[chunk_offset].addr,
+    #     chunk_size * sizeof(T)
+    #   )
+    zeroMem(
+      t.unsafe_raw_offset[0].addr,
+      t.size * sizeof(T)
+    )
 
 proc newTensor*[T](shape: varargs[int]): Tensor[T] =
   var size: int
diff --git a/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim b/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim
index 4be57d7d8..0f8f2d232 100644
--- a/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim
+++ b/src/arraymancer/linear_algebra/helpers/least_squares_lapack.nim
@@ -53,11 +53,13 @@ proc gelsd*[T: SomeFloat](
   var b2: Tensor[T]
   b2.newMatrixUninitColMajor(ldb.int, nrhs.int)
 
-  var b2_slice = b2[0 ..< b.shape[0], 0 ..< nrhs] # Workaround because slicing does no produce a var at the moment
-  apply2_inline(b2_slice, b):
-    # paste b in b2.
-    # if b2 is larger, the rest is zeros
-    y
+  var b2_slice = b2[0 ..< b.shape[0], 0 ..< nrhs]
+  if is1d:
+    b2_slice = b2_slice.squeeze(1)
+
+  forEach ib2 in b2_slice,
+          ib in b:
+    ib2 = ib
 
   # Temporary sizes
   const smlsiz = 25'i32 # equal to the maximum size of the subproblems at the bottom of the computation tree
diff --git a/src/arraymancer/ml/dimensionality_reduction/pca.nim b/src/arraymancer/ml/dimensionality_reduction/pca.nim
index 129b47481..bea769cdd 100644
--- a/src/arraymancer/ml/dimensionality_reduction/pca.nim
+++ b/src/arraymancer/ml/dimensionality_reduction/pca.nim
@@ -187,8 +187,10 @@ proc pca_detailed*[T: SomeFloat](
 
   # Variance explained by Singular Values
   let bessel_correction = T(result.n_observations - 1)
-  result.explained_variance = map_inline(S):
-    x * x / bessel_correction
+  result.explained_variance = newTensorUninit[T](S.shape)
+  forEach ev in result.explained_variance,
+          s in S:
+    ev = s*s / bessel_correction
 
   # Since we are using SVD truncated to `n_components` we need to
   # refer back to the original matrix for total variance
diff --git a/src/arraymancer/ml/metrics/common_error_functions.nim b/src/arraymancer/ml/metrics/common_error_functions.nim
index 504421b97..c1534c606 100644
--- a/src/arraymancer/ml/metrics/common_error_functions.nim
+++ b/src/arraymancer/ml/metrics/common_error_functions.nim
@@ -22,7 +22,11 @@ proc squared_error*[T](y, y_true: T): T {.inline.} =
 
 proc squared_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
   ## Element-wise squared error for a tensor, |y_true - y| ^2
-  result = map2_inline(y_true, y, squared_error(x,y))
+  result = newTensorUninit[T](y.shape)
+  forEach r in result,
+          testval in y,
+          truthval in y_true:
+    r = squared_error(testval,truthval)
 
 proc mean_squared_error*[T](y, y_true: Tensor[T]): T =
   ## Also known as MSE or L2 loss, mean squared error between elements:
@@ -48,7 +52,11 @@ proc relative_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
   ## Normally the relative error is defined as |y_true - x| / |y_true|,
   ## but here max is used to make it symmetric and to prevent dividing by zero,
   ## guaranteed to return zero in the case when both values are zero.
-  result = map2_inline(y, y_true, relative_error(x,y))
+  result = newTensorUninit[T](y.shape)
+  forEach r in result,
+          testval in y,
+          truthval in y_true:
+    r = relative_error(testval,truthval)
 
 proc mean_relative_error*[T](y, y_true: Tensor[T]): T =
   ## Mean relative error for Tensor, mean of the element-wise
@@ -67,10 +75,14 @@ proc absolute_error*[T: SomeFloat](y, y_true: T): T {.inline.} =
 
 proc absolute_error*[T](y, y_true: Tensor[T]): Tensor[T] {.noInit.} =
   ## Element-wise absolute error for a tensor
-  result = map2_inline(y, y_true, y.absolute_error(x))
+  result = newTensorUninit[T](y.shape)
+  forEach r in result,
+          testval in y,
+          truthval in y_true:
+    r = absolute_error(testval,truthval)
 
 proc mean_absolute_error*[T](y, y_true: Tensor[T]): T =
   ## Also known as L1 loss, absolute error between elements:
   ## sum(|y_true - y|)/m
   ## where m is the number of elements
-  result = map2_inline(y, y_true, y.absolute_error(x)).mean()
+  result = y.absolute_error(y_true).mean()
diff --git a/src/arraymancer/nn/loss/mean_square_error_loss.nim b/src/arraymancer/nn/loss/mean_square_error_loss.nim
index f59d8730a..1de096e85 100644
--- a/src/arraymancer/nn/loss/mean_square_error_loss.nim
+++ b/src/arraymancer/nn/loss/mean_square_error_loss.nim
@@ -30,8 +30,10 @@ proc mse_backward_ag[TT](self: MSELoss[TT], payload: Payload[TT]): SmallDiffs[TT
                                                   # See also Stanford course: http://theory.stanford.edu/~tim/s15/l/l15.pdf
 
   result = newDiffs[TT](1)
-  result[0] = map2_inline(self.cache.value, self.target):
-    norm * (x - y)
+  forEach r0 in result[0],
+          v in self.cache.value,
+          t in self.target:
+    r0 = norm * (v - t)
 
 proc mse_cache[TT](result: Variable[TT], input: Variable[TT], target: TT) =
   ## We expect input with shape [batch_size, features]
diff --git a/src/arraymancer/nn/optimizers/optimizers.nim b/src/arraymancer/nn/optimizers/optimizers.nim
index f5dd2c7bd..a7caf27c0 100644
--- a/src/arraymancer/nn/optimizers/optimizers.nim
+++ b/src/arraymancer/nn/optimizers/optimizers.nim
@@ -47,8 +47,9 @@ proc update*(self: Sgd) =
   for v in self.params:
     # v.value -= learning rate * grad
     if v.requires_grad:
-      apply2_inline(v.value, v.grad):
-        x - self.lr * y
+      forEach val in v.value,
+              g in v.grad:
+        val -= self.lr * g
       # Zero the gradient
       v.grad = v.value.zeros_like # TODO "setZero" instead of a new allocation
 
@@ -128,19 +129,23 @@ proc update*(self: var SgdMomentum) =
       # This implementation of both kinds of momentum follows that of Tensorflow
       # and closely mirrors the formulation of Sustkever et. al.
       # Update the moments with the previous update.
-      apply2_inline(self.moments[i], v.grad):
-        self.momentum * x - self.lr * y
+      forEach mi in self.moments[i],
+              g in v.grad:
+        mi = self.momentum * mi - self.lr * g
 
       # When momentum = 0 this acts identically to SGD without momentum.
       if self.nesterov:
         # Update the params with formula Value = value - lr * gradient + momentum * v
         # where v = - lr * gradient + momentum * moment
-        apply3_inline(v.value, v.grad, self.moments[i]):
-          x - self.lr * y + self.momentum * z
+        forEach val in v.value,
+                g in v.grad,
+                mi in self.moments[i]:
+          val += -(self.lr * g) + self.momentum * mi
       else:
         # Update the params with formula Value = value - lr * gradient + momentum * moment
-        apply2_inline(v.value, self.moments[i]):
-          x + y
+        forEach val in v.value,
+                mi in self.moments[i]:
+          val += mi
 
       # Zero the gradient
       v.grad = v.value.zeros_like # TODO "setZero" instead of a new allocation
@@ -212,19 +217,21 @@ proc update*(self: var Adam) =
     let v = self.params[i]
     if v.requires_grad:
       # Update biaised first moment estimate
-      apply2_inline(self.first_moments[i], v.grad):
-        self.beta1 * x + (1 - self.beta1) * y
+      forEach fmi in self.first_moments[i], g in v.grad:
+        fmi = self.beta1 * fmi + (1 - self.beta1) * g
       # Update biaised second moment estimate
-      apply2_inline(self.second_moments[i], v.grad):
-        self.beta2 * x + (1 - self.beta2) * y * y
+      forEach smi in self.second_moments[i], g in v.grad:
+        smi = self.beta2 * smi + (1 - self.beta2) * g * g
       # Adjust weight
-      apply3_inline(v.value, self.first_moments[i], self.second_moments[i]):
-        x - lr_t * y / (z.sqrt + self.epsilon)
+      forEach val in v.value,
+              fmi in self.first_moments[i],
+              smi in self.second_moments[i]:
+        val -= lr_t * fmi / (smi.sqrt + self.epsilon)
 
       # Zero the gradient
       v.grad = v.value.zeros_like # TODO "setZero" instead of a new allocation
 
-func optimizerAdam*[M, T](
+proc optimizerAdam*[M, T](
         model: M,
         learning_rate: T = T(0.001),
         beta1 = T(0.9), beta2 = T(0.999),
diff --git a/src/arraymancer/nn_primitives/nnp_activation.nim b/src/arraymancer/nn_primitives/nnp_activation.nim
index a9be939b5..7ee75b37e 100644
--- a/src/arraymancer/nn_primitives/nnp_activation.nim
+++ b/src/arraymancer/nn_primitives/nnp_activation.nim
@@ -29,14 +29,19 @@ proc sigmoid*[T: SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.}=
 
   # stable: proc sigmoid_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T)
 
-  result = map_inline(t):
-    sigmoid(x)
+  result = newTensorUninit[T](t.shape)
+  forEach dst in result, src in t:
+    dst = sigmoid(src)
 
 proc relu*[T](t: Tensor[T]): Tensor[T] {.noInit.}=
-  t.map_inline max(0.T,x)
+  result = newTensorUninit[T](t.shape)
+  forEach dst in result, src in t:
+    dst = max(0.T, src)
 
 proc tanh*[T: SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.}=
-  t.map_inline tanh(x)
+  result = newTensorUninit[T](t.shape)
+  forEach dst in result, src in t:
+    dst = tanh(src)
 
 # ##################################################################################################
 # In-place forward
@@ -46,32 +51,37 @@ proc msigmoid*[T: SomeFloat](t: var Tensor[T]) =
   ## Note: Canonical sigmoid is not stable for large negative value
 
   # stable: proc sigmoid_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T)
-  apply_inline(t):
-    sigmoid(x)
+  forEach x in t:
+    x = sigmoid(x)
 
 proc mrelu*[T](t: var Tensor[T]) =
-  t.apply_inline max(0.T, x)
+  forEach x in t:
+    x = max(0.T, x)
 
 proc mtanh*[T: SomeFloat](t: var Tensor[T]) =
-  t.apply_inline tanh(x)
+  forEach x in t:
+    x = tanh(x)
 
 # ##################################################################################################
 # Backward
 
 proc sigmoid_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.noInit.}=
-  result = map2_inline(cached_tensor, gradient):
-    x * (1 - x) * y
+  result = newTensorUninit[T](gradient.shape)
+  forEach r in result, c in cached_tensor, g in gradient:
+    r = c*(1-c) * g
 
 proc relu_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.noInit.}=
-  result = map2_inline(cached_tensor, gradient):
-    if x <= 0.T:
-      0.T
+  result = newTensorUninit[T](gradient.shape)
+  forEach r in result, c in cached_tensor, g in gradient:
+    if c <= 0.T:
+      r = 0.T
     else:
-      y
+      r = g
 
 proc tanh_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.noInit.}=
-  result = map2_inline(cached_tensor, gradient):
-    y * (1 - x * x)
+  result = newTensorUninit[T](gradient.shape)
+  forEach r in result, c in cached_tensor, g in gradient:
+    r = g * (1 - c * c)
 
 # ####################################################################################################
 # Documentation
diff --git a/src/arraymancer/nn_primitives/nnp_gru.nim b/src/arraymancer/nn_primitives/nnp_gru.nim
index 1453081eb..e154e5717 100644
--- a/src/arraymancer/nn_primitives/nnp_gru.nim
+++ b/src/arraymancer/nn_primitives/nnp_gru.nim
@@ -63,7 +63,6 @@ proc gru_cell_inference*[T: SomeFloat](
     # Slices
     sr = (0 ..< H)|1
     sz = (H ..< 2*H)|1
-    srz = (0 ..< 2*H)|1
     s = (2*H ..< 3*H)|1
 
 
@@ -73,19 +72,29 @@ proc gru_cell_inference*[T: SomeFloat](
   linear(input, W3, bW3, W3x)
   linear(hidden, U3, bU3, U3h)
 
-  # Step 2 - Computing reset (r) and update (z) gate
-  var W2ru = W3x[_, srz] # shape [batch_size, 2*H] - we reuse the previous buffer
-  apply2_inline(W2ru, U3h[_, srz]):
-    sigmoid(x + y)
-
-  # Step 3 - Computing candidate hidden state ñ
-  var n = W3x[_, s] # shape [batch_size, H] - we reuse the previous buffer
-  apply3_inline(n, W2ru[_, sr], U3h[_, s]):
-    tanh(x + y * z)
-
-  # Step 4 - Update the hidden state
-  apply3_inline(hidden, W3x[_, sz], n):
-    (1 - y) * z + y * x
+  # Step 2 - Fused evaluation of the 4 GRU equations
+  # r  =    σ(Wr * x + bWr +       Ur * h + bUr)
+  # z  =    σ(Wz * x + bWz +       Uz * h + bUz)
+  # n  = tanh(W  * x + bW  + r *. (U  * h + bU ))
+  # h' = (1 - z) *. n + z *. h
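+  #
+  # The four element-wise equations above are evaluated in one fused pass
+  # over the W3x/U3h slices and the hidden state, instead of the previous
+  # separate apply2/apply3 sweeps.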
+
+  # shape [batch_size, H] - we reuse the previous buffers
+  forEach wrx in W3x[_, sr], # Wr*x
+          wzx in W3x[_, sz], # Wz*x
+          wx in W3x[_, s],   # W*x
+          urh in U3h[_, sr], # Ur*h
+          uzh in U3h[_, sz], # Uz*h
+          uh in U3h[_, s],   # U*h
+          h in hidden:       # hidden state
+    # Reset (r) gate and Update (z) gate
+    let r = sigmoid(wrx + urh)
+    let z = sigmoid(wzx + uzh)
+
+    # Candidate hidden state ñ
+    let n = tanh(wx + r * uh)
+
+    # h' = (1 - z) *. ñ + z *. h
+    h = (1-z) * n + z*h
 
 proc gru_cell_forward*[T: SomeFloat](
   input,
@@ -124,26 +133,38 @@ proc gru_cell_forward*[T: SomeFloat](
   linear(input, W3, bW3, W3x)
   linear(hidden, U3, bU3, U3h)
 
-  # # Saving for backprop
-  apply2_inline(Uh, U3h[_, s]):
-    y
-
-  # Step 2 - Computing reset (r) and update (z) gate
-  apply3_inline(r, W3x[_, sr], U3h[_, sr]):
-    sigmoid(y + z)
-
-  apply3_inline(z, W3x[_, sz], U3h[_, sz]):
-    sigmoid(y + z)
-
-  # Step 3 - Computing candidate hidden state ñ
-  # TODO: need apply4 / loopfusion for efficient
-  # buffer passing in Stacked GRU implementation
-  n = map3_inline(W3x[_, s], r, U3h[_, s]):
-    tanh(x + y * z)
-
-  # Step 4 - Update the hidden state
-  apply3_inline(hidden, z, n):
-    (1 - y) * z + y * x
+  # Step 2 - Fused evaluation of the 4 GRU equations
+  #          and saving for backprop
+  # r  =    σ(Wr * x + bWr +       Ur * h + bUr)
+  # z  =    σ(Wz * x + bWz +       Uz * h + bUz)
+  # n  = tanh(W  * x + bW  + r *. (U  * h + bU ))
+  # h' = (1 - z) *. n + z *. h
+
+  # shape [batch_size, H] - we reuse the previous buffers
+  forEach wrx in W3x[_, sr], # Wr*x
+          wzx in W3x[_, sz], # Wz*x
+          wx in W3x[_, s],   # W*x
+          urh in U3h[_, sr], # Ur*h
+          uzh in U3h[_, sz], # Uz*h
+          uh in U3h[_, s],   # U*h
+          h in hidden,       # hidden state
+          saveUh in Uh,      # U*h cache for backprop
+          reset in r,        # reset gate cache for backprop
+          update in z,       # update gate cache for backprop
+          candidate in n:    # candidate hidden state cache for backprop
+
+    # Cache for backprop
+    saveUh = uh
+
+    # Reset (r) gate and Update (z) gate
+    reset = sigmoid(wrx + urh)
+    update = sigmoid(wzx + uzh)
+
+    # Candidate hidden state ñ
+    candidate = tanh(wx + reset * uh)
+
+    # h' = (1 - z) *. ñ + z *. h
+    h = (1-update) * candidate + update*h
 
 proc gru_cell_backward*[T: SomeFloat](
   dx, dh, dW3, dU3,          # input and weights gradients
@@ -162,6 +183,9 @@ proc gru_cell_backward*[T: SomeFloat](
   ##   - dnext: gradient flowing back from the next layer
   ##   - x, h, W3, U3: inputs saved from the forward pass
   ##   - r, z, n, Uh: intermediate results saved from the forward pass of shape [batch_size, hidden_size]
+
+  # TODO: fused backprop with forEach
+
   # Backprop of step 4 - z part
   let dz = (h - n) *. dnext
   let dn = (1.0.T -. z) *. dnext
@@ -188,8 +212,8 @@ proc gru_cell_backward*[T: SomeFloat](
   linear_backward(h, U3, dU3h, dh, dU3, dbU3)
 
   # Backprop of step 4 - h part
-  apply3_inline(dh, dnext, z):
-    x + y * z
+  forEach dhi in dh, dni in dnext, zi in z:
+    dhi += dni * zi
 
 proc gru_inference*[T: SomeFloat](
   input: Tensor[T],
@@ -351,10 +375,6 @@ proc gru_forward*[T: SomeFloat](
         n_lts = ns[layer, timestep, _, _].squeeze(0).squeeze(0)
         Uh_lts = Uhs[layer, timestep, _, _].squeeze(0).squeeze(0)
 
-      # TODO: gru_cell_forward will detach `nl``
-      # due to a missing apply4/loop-fusion operation
-      var n_tmp = n_lts
-
       let input_ts = block:
             if layer == 0:
               input[timestep, _, _].squeeze(0)
@@ -364,14 +384,10 @@ proc gru_forward*[T: SomeFloat](
       gru_cell_forward(
         input_ts,
         W3l, U3l, bW3l, bU3l,
-        r_lts, z_lts, n_tmp, Uh_lts,
+        r_lts, z_lts, n_lts, Uh_lts,
         hiddenl
       )
       output[timestep, _, _] = hiddenl.unsqueeze(0)
-      # TODO: apply/loop-fusion
-      # copy n_tmpl back to nl
-      apply2_inline(n_lts, n_tmp):
-        y
 
 proc gru_backward*[T: SomeFloat](
   dInput, dHidden0,                    # Input and starting hidden state gradient
diff --git a/src/arraymancer/nn_primitives/nnp_linear.nim b/src/arraymancer/nn_primitives/nnp_linear.nim
index 098a62bda..e3f211d6b 100644
--- a/src/arraymancer/nn_primitives/nnp_linear.nim
+++ b/src/arraymancer/nn_primitives/nnp_linear.nim
@@ -24,7 +24,6 @@ proc linear*[T](input, weight: Tensor[T], bias: Tensor[T], output: var Tensor[T]
   #   - bias tensor shape [1, out_features]
   # Output does not need to be initialized to 0 or the proper shape, data will be overwritten
   # Output is: Y = x * W.transpose + b
-
   output = input * weight.transpose # TODO: with the transpose the non-matching rows and cols is confusing
   output +.= bias
 
diff --git a/src/arraymancer/nn_primitives/nnp_sigmoid_cross_entropy.nim b/src/arraymancer/nn_primitives/nnp_sigmoid_cross_entropy.nim
index 74a3bf544..0d8247b58 100644
--- a/src/arraymancer/nn_primitives/nnp_sigmoid_cross_entropy.nim
+++ b/src/arraymancer/nn_primitives/nnp_sigmoid_cross_entropy.nim
@@ -41,14 +41,20 @@ proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T =
 
   # ln1p(x) does ln(1 + x) but avoids catastrophic cancellation if x << 1.
 
-  # result = 0.T
-  # for xi, ti in zip(input, target):
-  #   result += (-ti * xi +  max(xi,0) + ln1p(exp(-abs(xi))) ) / T(input.shape[1])
-
-  # We need parallel fused map2 -> reduce for all loss functions
-  result = sum:
-    map2_inline(input, target):
-      -y * x +  max(x,0) + ln1p(exp(-abs(x))) # This leverage the logsumexp trick to improve numerical stability
+  result = 0.T
+  for xi, ti in zip(input, target):
+    result += (-ti * xi +  max(xi,0) + ln1p(exp(-abs(xi))) ) / T(input.shape[1])
+
+  # TODO - Parallel fused map-reduce, openmp issue - https://github.com/mratsim/Arraymancer/issues/485
+  # forEachStaged ii in input, ti in target:
+  #   before_loop:
+  #     var local_sum{.exportc.} = 0.T
+  #   in_loop:
+  #     # This leverage the logsumexp trick to improve numerical stability
+  #     local_sum += -ti * ii +  max(ii,0) + ln1p(exp(-abs(ii)))
+  #   after_loop:
+  #     {.emit: "#pragma omp atomic".}
+  #     {.emit: "`result` += `local_sum`;".}
 
   # Normalize by batch_size
   result /= T(batch_size)
diff --git a/src/arraymancer/nn_primitives/nnp_softmax.nim b/src/arraymancer/nn_primitives/nnp_softmax.nim
index edb54b646..e5f9c1350 100644
--- a/src/arraymancer/nn_primitives/nnp_softmax.nim
+++ b/src/arraymancer/nn_primitives/nnp_softmax.nim
@@ -29,10 +29,11 @@ proc softmax*[T](input: Tensor[T]): Tensor[T] {.noInit.} =
   let batch_size = input.shape[0]
   result = zeros_like(input)
 
+  # TODO: the performance of nested parallel regions is highly suspect
   for i in 0||(batch_size-1):
     let (max, sumexp) = input[i, _].streaming_max_sumexp
 
     var res_slice = result[i, _]
 
-    apply2_inline(res_slice, input[i, _]):
-      stable_softmax(y, max, sumexp)
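+    # Keep the inner loop serial: the enclosing `||` loop is already OpenMP-parallel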
+    forEachSerial r in res_slice, inpi in input[i, _]:
+      r = stable_softmax(inpi, max, sumexp)
diff --git a/src/arraymancer/nn_primitives/nnp_softmax_cross_entropy.nim b/src/arraymancer/nn_primitives/nnp_softmax_cross_entropy.nim
index 1af1b45af..2b8e3d39f 100644
--- a/src/arraymancer/nn_primitives/nnp_softmax_cross_entropy.nim
+++ b/src/arraymancer/nn_primitives/nnp_softmax_cross_entropy.nim
@@ -146,13 +146,14 @@ proc softmax_cross_entropy_backward*[T](
 
   result = zeros_like(cached_tensor)
 
+  # TODO: nested parallelism has suspect performance
   for i in 0||(batch_size-1):
     let (max, sumexp) = cached_tensor[i,_].streaming_max_sumexp
 
     var res_slice = result[i,_]
 
-    apply3_inline(res_slice, cached_tensor[i,_], target[i,_]):
-      grad * (stable_softmax(y, max, sumexp) - z) / T(batch_size)
+    forEachSerial r in res_slice, ci in cached_tensor[i,_], ti in target[i,_]:
+      r = grad * (stable_softmax(ci, max, sumexp) - ti) / T(batch_size)
 
 
 proc sparse_softmax_cross_entropy_backward*[T; Idx: SomeNumber or byte or char or enum](
@@ -185,13 +186,14 @@ proc sparse_softmax_cross_entropy_backward*[T; Idx: SomeNumber or byte or char o
   for i, truth_idx in enumerate(target):
     result[i, int(truth_idx)] = -1
 
+  # TODO: the performance of nested parallel regions is highly suspect
   for i in 0||(batch_size-1):
     let (max, sumexp) = cached_tensor[i, _].streaming_max_sumexp
 
     var res_slice = result[i, _]
 
-    apply2_inline(res_slice, cached_tensor[i, _]):
-      grad * (stable_softmax(y, max, sumexp) + x) / T(batch_size)
+    forEachSerial r in res_slice, ci in cached_tensor[i, _]:
+      r = grad * (stable_softmax(ci, max, sumexp) + r) / T(batch_size)
 
 
 # ################################################
diff --git a/src/arraymancer/stats/kde.nim b/src/arraymancer/stats/kde.nim
index 4746ade56..c5f468559 100644
--- a/src/arraymancer/stats/kde.nim
+++ b/src/arraymancer/stats/kde.nim
@@ -179,7 +179,8 @@ proc kde*[T: SomeNumber; U: int | Tensor[SomeNumber] | openArray[SomeNumber]](
 
   if normalize:
     let normFactor = 1.0 / (result.sum * (maxT - minT) / nsamples.float)
-    result.apply_inline(normFactor * x)
+    forEach x in result:
+      x *= normFactor
 
 proc kde*[T: SomeNumber; U: KernelKind | string; V: int | Tensor[SomeNumber] | openArray[SomeNumber]](
     t: Tensor[T],
diff --git a/src/arraymancer/tensor.nim b/src/arraymancer/tensor.nim
index 7cdf33708..f106e584e 100644
--- a/src/arraymancer/tensor.nim
+++ b/src/arraymancer/tensor.nim
@@ -17,6 +17,7 @@ import nimblas
 export OrderType
 
 import  ./laser/dynamic_stack_arrays,
+        ./laser/strided_iteration/foreach,
         ./tensor/data_structure,
         ./tensor/init_cpu,
         ./tensor/init_copy_cpu,
@@ -45,6 +46,7 @@ import  ./laser/dynamic_stack_arrays,
         ./tensor/exporting
 
 export  dynamic_stack_arrays,
+        foreach,
         data_structure,
         init_cpu,
         init_copy_cpu,
diff --git a/src/arraymancer/tensor/higher_order_applymap.nim b/src/arraymancer/tensor/higher_order_applymap.nim
index 061ce6dcd..e4c81bd17 100644
--- a/src/arraymancer/tensor/higher_order_applymap.nim
+++ b/src/arraymancer/tensor/higher_order_applymap.nim
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import  ./backend/openmp,
+import  ../laser/strided_iteration/[foreach, foreach_staged],
+        ./backend/openmp,
         ./private/p_checks,
         ./data_structure, ./init_cpu, ./accessors, ./accessors_macros_write
 
+export foreach, foreach_staged
 import sugar except enumerate
 
 # ####################################################################
@@ -206,10 +208,8 @@ proc apply*[T: KnownSupportsCopyMem](t: var Tensor[T], f: proc(x:var T)) =
   ##       x += 1
   ##     a.apply(pluseqone) # Apply the in-place function pluseqone
   ## ``apply`` is especially useful to do multiple element-wise operations on a tensor in a single loop over the data.
-
-  omp_parallel_blocks(block_offset, block_size, t.size):
-    for x in t.mitems(block_offset, block_size):
-      f(x)
+  forEach x in t:
+    f(x)
 
 proc map2*[T, U; V: KnownSupportsCopyMem](t1: Tensor[T],
                                           f: (T,U) -> V,
diff --git a/src/arraymancer/tensor/lapack.nim b/src/arraymancer/tensor/lapack.nim
index 1414510d8..30acc564e 100644
--- a/src/arraymancer/tensor/lapack.nim
+++ b/src/arraymancer/tensor/lapack.nim
@@ -17,6 +17,16 @@ import  ./data_structure,
         ./higher_order_applymap
 
 proc frobenius_inner_prod*[T](a,b: Tensor[T]): T =
-  sum:
-    map2_inline(a,b):
-      x * y
+  # sum:
+  #   map2_inline(a,b):
+  #     x * y
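+  # Parallel fused map-reduce: each OpenMP thread accumulates a private
+  # `local_sum` and merges it into `result` with an atomic add after its chunk.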
+  forEachStaged ai in a, bi in b:
+    openmp_config:
+      use_simd: true
+    before_loop:
+      var local_sum{.exportc.} = 0.T
+    in_loop:
+      local_sum += ai * bi
+    after_loop:
+      {.emit: "#pragma omp atomic".}
+      {.emit: "`result` += `local_sum`;".}
diff --git a/src/arraymancer/tensor/math_functions.nim b/src/arraymancer/tensor/math_functions.nim
index 26472fdb1..8169afd89 100644
--- a/src/arraymancer/tensor/math_functions.nim
+++ b/src/arraymancer/tensor/math_functions.nim
@@ -26,7 +26,9 @@ proc elwise_mul*[T](a, b: Tensor[T]): Tensor[T] {.noInit.} =
 
 proc melwise_mul*[T](a: var Tensor[T], b: Tensor[T]) =
   ## Element-wise multiply
-  a.apply2_inline(b, x * y)
+  forEach x in a,
+          y in b:
+    x *= y
 
 proc elwise_div*[T: Someinteger](a, b: Tensor[T]): Tensor[T] {.noInit.} =
   ## Element-wise division
@@ -38,11 +40,15 @@ proc elwise_div*[T: SomeFloat](a, b: Tensor[T]): Tensor[T] {.noInit.} =
 
 proc melwise_div*[T: Someinteger](a: var Tensor[T], b: Tensor[T]) =
   ## Element-wise division (in-place)
-  a.apply2_inline(b, x div y)
+  forEach x in a,
+          y in b:
+    x = x div y
 
 proc melwise_div*[T: SomeFloat](a: var Tensor[T], b: Tensor[T]) =
   ## Element-wise division (in-place)
-  a.apply2_inline(b, x / y)
+  forEach x in a,
+          y in b:
+    x /= y
 
 proc reciprocal*[T: SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.} =
   ## Return a tensor with the reciprocal 1/x of all elements
@@ -50,17 +56,21 @@ proc reciprocal*[T: SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.} =
 
 proc mreciprocal*[T: SomeFloat](t: var Tensor[T]) =
   ## Apply the reciprocal 1/x in-place to all elements of the Tensor
-  t.apply_inline(1.T/x)
+  forEach x in t:
+    x = 1.T/x
 
 proc reciprocal*[T: Complex[float32] or Complex[float64]](t: Tensor[T]): Tensor[T] {.noInit.} =
   ## Return a tensor with the reciprocal 1/x of all elements
   type F = T.T # Get float subtype of Complex[T]
-  t.map_inline(complex(1.F, 0.F)/x)
+  result = newTensorUninit[T](t.shape)
+  forEach x in t, r in result:
+    r = complex(1.F, 0.F)/x
 
 proc mreciprocal*[T: Complex[float32] or Complex[float64]](t: var Tensor[T]) =
   ## Apply the reciprocal 1/x in-place to all elements of the Tensor
   type F = T.T # Get float subtype of Complex[T]
-  t.apply_inline(complex(1.F, 0.F)/x)
+  forEach x in t:
+    x = complex(1.F, 0.F)/x
 
 proc negate*[T: SomeSignedInt|SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.} =
   ## Return a tensor with all elements negated (10 -> -10)
@@ -68,7 +78,8 @@ proc negate*[T: SomeSignedInt|SomeFloat](t: Tensor[T]): Tensor[T] {.noInit.} =
 
 proc mnegate*[T: SomeSignedInt|SomeFloat](t: var Tensor[T]) =
   ## Negate in-place all elements of the tensor (10 -> -10)
-  t.apply_inline(-x)
+  forEach x in t:
+    x = -x
 
 proc `-`*[T: SomeNumber](t: Tensor[T]): Tensor[T] {.noInit.} =
   ## Negate all values of a Tensor
@@ -91,13 +102,15 @@ proc abs*(t: Tensor[Complex[float32]]): Tensor[float32] {.noInit.} =
 proc mabs*[T](t: var Tensor[T]) =
   ## Return a Tensor with absolute values of all elements
   # FIXME: how to inplace convert Tensor[Complex] to Tensor[float]
-  t.apply_inline(abs(x))
+  forEach x in t:
+    x = abs(x)
 
 proc clamp*[T](t: Tensor[T], min, max: T): Tensor[T] {.noInit.} =
   t.map_inline(clamp(x, min, max))
 
 proc mclamp*[T](t: var Tensor[T], min, max: T) =
-  t.apply_inline(clamp(x, min, max))
+  forEach x in t:
+    x = clamp(x, min, max)
 
 proc square*[T](x: T): T {.inline.} =
   ## Return x*x
diff --git a/src/arraymancer/tensor/operators_blas_l1.nim b/src/arraymancer/tensor/operators_blas_l1.nim
index 2dd676399..886653799 100644
--- a/src/arraymancer/tensor/operators_blas_l1.nim
+++ b/src/arraymancer/tensor/operators_blas_l1.nim
@@ -90,16 +90,19 @@ proc `*=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], a:
   ## Element-wise multiplication by a scalar (in-place)
   if t.size == 0:
     return
-  t.apply_inline(x * a)
+  forEach x in t:
+    x *= a
 
 proc `/=`*[T: SomeFloat|Complex[float32]|Complex[float64]](t: var Tensor[T], a: T) =
   ## Element-wise division by a scalar (in-place)
   if t.size == 0:
     return
-  t.apply_inline(x / a)
+  forEach x in t:
+    x /= a
 
 proc `/=`*[T: SomeInteger](t: var Tensor[T], a: T) =
   ## Element-wise division by a scalar (in-place)
   if t.size == 0:
     return
-  t.apply_inline(x div a)
+  forEach x in t:
+    x = x div a
diff --git a/src/arraymancer/tensor/operators_broadcasted.nim b/src/arraymancer/tensor/operators_broadcasted.nim
index 5918c914d..c7f9dfa7a 100644
--- a/src/arraymancer/tensor/operators_broadcasted.nim
+++ b/src/arraymancer/tensor/operators_broadcasted.nim
@@ -145,31 +145,36 @@ proc `+.=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], v
   ## Tensor in-place addition with a broadcasted scalar.
   if t.size == 0:
     return
-  t.apply_inline(x + val)
+  forEach x in t:
+    x += val
 
 proc `-.=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], val: T) =
   ## Tensor in-place substraction with a broadcasted scalar.
   if t.size == 0:
     return
-  t.apply_inline(x - val)
+  forEach x in t:
+    x -= val
 
 proc `^.=`*[T: SomeFloat|Complex[float32]|Complex[float64]](t: var Tensor[T], exponent: T) =
   ## Compute in-place element-wise exponentiation
   if t.size == 0:
     return
-  t.apply_inline pow(x, exponent)
+  forEach x in t:
+    x = x.pow(exponent)
 
 proc `*.=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], val: T) =
   ## Tensor in-place multiplication with a broadcasted scalar.
   if t.size == 0:
     return
-  t.apply_inline(x * val)
+  forEach x in t:
+    x *= val
 
 proc `/.=`*[T: SomeNumber|Complex[float32]|Complex[float64]](t: var Tensor[T], val: T) =
   ## Tensor in-place division with a broadcasted scalar.
   if t.size == 0:
     return
-  t.apply_inline(x / val)
+  forEach x in t:
+    x /= val
 
 
 # ##############################################
diff --git a/tests/manual_checks/optimizers/test_optimizers.nim b/tests/manual_checks/optimizers/test_optimizers.nim
index d8dd643ac..8ef0b82fb 100644
--- a/tests/manual_checks/optimizers/test_optimizers.nim
+++ b/tests/manual_checks/optimizers/test_optimizers.nim
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import
-  ../../src/arraymancer, ../testutils,
+  ../../../src/arraymancer, ../../testutils,
   unittest, random, strformat
 
 # ############################################################
@@ -25,14 +25,14 @@ import
 # We use the Rosenbrock function for testing optimizers
 # https://en.wikipedia.org/wiki/Rosenbrock_function
 
-func rosenbrock[T](x, y: Tensor[T], a: static T = 1, b: static T = 100): Tensor[T] =
+proc rosenbrock[T](x, y: Tensor[T], a: static T = 1, b: static T = 100): Tensor[T] =
   # f(x, y) = (a - x)² + b(y - x²)²
   result = map2_inline(x, y):
     let u = a - x
     let v = y - x*x
     u * u + b * v * v
 
-func drosenbrock[T](x, y: Tensor[T], a: static T = 1, b: static T = 100): tuple[dx, dy: Tensor[T]] =
+proc drosenbrock[T](x, y: Tensor[T], a: static T = 1, b: static T = 100): tuple[dx, dy: Tensor[T]] =
   result.dx = map2_inline(x, y):
     2.T*x - 2.T*a + 4*b*x*x*x - 4*b*x*y
   result.dy = map2_inline(x, y):