
Commit 512ffd0

feat(chainable): caution momentum, not update
1 parent baea766 commit 512ffd0


2 files changed (+21, -11 lines)


heavyball/chainable.py (+10, -5)
@@ -364,10 +364,12 @@ def _update_psgd_cache(cached, Q_cache, q):
     return Q_cache
 
 
-def _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache):
+def _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache, grad):
     if group.get('is_cached', False):
-        return utils.precond_grad_cached_(cache_expr, update, *Q_cache)
-    return utils.psgd_precond_grad(exprs[-1], update, *Q_mat)
+        out = utils.precond_grad_cached_(cache_expr, update, *Q_cache, caution=group['caution'], grad=grad)
+    out = utils.psgd_precond_grad(exprs[-1], update, *Q_mat, caution=group['caution'], grad=grad)
+    group['caution'] = False  # we already cautioned here - shouldn't do it again
+    return out
 
 
 def _fused_cached_psgd_precond_grad(group, grad, param, cache_expr, exprs, update, Q_mat, Q_cache):
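The commit title, "caution momentum, not update", describes what this hunk changes: the caution mask is now applied to the buffer that feeds the preconditioner (the momentum/update passed as `update`), and group['caution'] is cleared so the later parameter write does not apply the mask a second time. Below is a minimal, self-contained sketch of that control flow, assuming the mask simply zeroes entries whose sign disagrees with the gradient; the names are illustrative, not heavyball's API, and a diagonal preconditioner stands in for the PSGD einsum.

    import torch

    def caution_mask(grad: torch.Tensor, buf: torch.Tensor) -> torch.Tensor:
        # zero the entries of buf whose sign disagrees with grad (the usual "cautious" rule)
        return buf * (grad * buf > 0)

    def precondition_sketch(group: dict, momentum: torch.Tensor, grad: torch.Tensor,
                            precond_diag: torch.Tensor) -> torch.Tensor:
        if group['caution']:
            momentum = caution_mask(grad, momentum)  # caution the momentum, not the final update
            group['caution'] = False                 # consumed here; the parameter update must skip it
        return precond_diag * momentum               # stand-in for the cached PSGD einsum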
@@ -387,15 +389,15 @@ def scale_by_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
     Q_mat = _update_psgd_precond(cached, Q_cache, group, param,
                                  update if group['momentum_into_precond_update'] else grad, Q_mat, Q, exprs, prob)
-    return _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache)
+    return _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache, grad)
 
 
 @general_guard("Q", "exprs", ("Q_cache", None), ("cache_expr", None), init_fn=_init_psgd, skip_first=False)
 @no_state_no_foreach
 def scale_by_delayed_psgd(group, update, grad, param, Q, exprs, Q_cache, cache_expr: str, cached: bool = False,
                           prob: Optional[callable] = None):
     Q_mat = utils.line_to_triu(Q) if group['store_triu_as_line'] else Q
-    precond = _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache)
+    precond = _cached_psgd_precond_grad(group, cache_expr, exprs, update, Q_mat, Q_cache, grad)
     _ = _update_psgd_precond(cached, Q_cache, group, param, update if group['momentum_into_precond_update'] else grad,
                              Q_mat, Q, exprs, prob)
     return precond
@@ -467,6 +469,8 @@ def _step(self, group):
                              f'only supported with foreach=True (currently foreach={group["foreach"]}).')
         group['base_lr'] = group['lr']
 
+        caution = group['caution']
+
         vals = list(self.split_p_and_g_in_group(group, should_promote=self.promote, beta1=utils.get_beta1(group)))
 
         if not vals:
@@ -492,6 +496,7 @@ def _step(self, group):
         else:
             chain(self.state_, group, g, p, *self.fns)
 
+        group['caution'] = caution
         group['lr'] = group['prev_lr']
         group['step'] = None
 
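The two _step hunks bracket the per-parameter chain: group['caution'] is snapshotted before split_p_and_g_in_group and restored once chain has run, because the preconditioning step above clears the flag after applying the mask. A small illustration of why the restore matters, using hypothetical names rather than the optimizer's real loop:

    def step_sketch(group: dict, params_and_grads, run_chain) -> None:
        caution = group['caution']      # snapshot the user's setting
        for p, g in params_and_grads:
            run_chain(group, g, p)      # may set group['caution'] = False after cautioning once
        group['caution'] = caution      # restore it so later steps see the original value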
heavyball/utils.py (+11, -6)
@@ -1300,7 +1300,10 @@ def psgd_should_update(group, prob: Union[float, callable], rng: Optional[random
 
 
 @decorator_knowngood
-def precond_grad_cached_(expr: str, ea: Tensor, *cached_q: Tensor, cast: bool = True):
+def precond_grad_cached_(expr: str, ea: Tensor, *cached_q: Tensor, caution: bool = False, grad: Optional[Tensor] = None,
+                         cast: bool = True):
+    if caution:
+        ea = _compilable_cautioning(grad, ea)
     md = min_dtype(list(cached_q) + [ea])
     args = [q.to(md) for q in cached_q]
     args = args + [ea.to(md)]
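Both new parameters are keyword-only with defaults, so existing positional callers keep their behaviour; when caution=True the raw gradient has to be supplied as well, since the mask is computed against it before the dtype promotion and the cached einsum. A hypothetical call under those assumptions (update, raw_grad, and cached_q are placeholder names, not fixed API):

    out = precond_grad_cached_(cache_expr, update, *cached_q, caution=True, grad=raw_grad)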
@@ -1312,8 +1315,8 @@ def precond_grad_cached_(expr: str, ea: Tensor, *cached_q: Tensor, cast: bool =
 
 @decorator_knowngood
 def _compilable_fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, grad, decay, caution, *cached_q: Tensor):
-    precond = precond_grad_cached_(expr, ea, *cached_q, cast=False)
-    update_param_(param, precond, lr, decay, caution=caution, grad=grad)
+    precond = precond_grad_cached_(expr, ea, *cached_q, caution=caution, grad=grad, cast=False)
+    update_param_(param, precond, lr, decay, caution=False)
 
 
 def fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, grad, decay, caution, *cached_q: Tensor):
@@ -1322,7 +1325,9 @@ def fused_precond_grad_cached_(expr: str, ea: Tensor, param, lr, grad, decay, ca
 
 
 @decorator_knowngood
-def psgd_precond_grad(expr: str, ea: Tensor, *preconds: Tensor):
+def psgd_precond_grad(expr: str, ea: Tensor, *preconds: Tensor, caution: bool = False, grad: Optional[Tensor] = None):
+    if caution:
+        ea = _compilable_cautioning(grad, ea)
     md = min_dtype(list(preconds) + [ea])
     args = [q.to(md) for q in preconds]
     args = args + args + [ea.to(md)]
@@ -1332,8 +1337,8 @@ def psgd_precond_grad(expr: str, ea: Tensor, *preconds: Tensor):
 
 @decorator_knowngood
 def _compilable_fused_psgd_precond_grad(expr: str, ea: Tensor, param, lr, grad, decay, caution, *preconds: Tensor):
-    precond = psgd_precond_grad(expr, ea, *preconds)
-    update_param_(param, precond, lr, decay, caution=caution, grad=grad)
+    precond = psgd_precond_grad(expr, ea, *preconds, caution=caution, grad=grad)
+    update_param_(param, precond, lr, decay, caution=False, grad=grad)
 
 
 def fused_psgd_precond_grad(expr: str, ea: Tensor, param, lr, grad, decay, caution, *preconds: Tensor):
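After this change both fused paths follow the same invariant: the mask is applied at most once, inside the preconditioning call, and update_param_ is always invoked with caution=False. A compact restatement of that invariant, assuming the functions are importable from heavyball.utils as defined in this file (a sketch mirroring _compilable_fused_psgd_precond_grad, not heavyball's public API):

    from heavyball.utils import psgd_precond_grad, update_param_

    def fused_step_sketch(expr, ea, param, lr, grad, decay, caution, *preconds):
        precond = psgd_precond_grad(expr, ea, *preconds, caution=caution, grad=grad)  # mask applied here, at most once
        update_param_(param, precond, lr, decay, caution=False, grad=grad)            # never re-applies the mask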
