
Commit

fix: reduce grad accumulation
ex3ndr committed Jul 11, 2024
1 parent f90b836 commit b14fb7a
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion train.py
@@ -39,7 +39,7 @@
# 6k tokens is roughly 3 rows, because a single row is 1500-2500 tokens
# We have MUCH faster GPUs and therefore, instead of gradient accumulation,
# we increase the batch size 4x and cut the number of accumulation steps by 4x
-train_grad_accum_every = 8
+train_grad_accum_every = 2
train_batch_size = 8

# We speculate that the learning rate is given for all GPUs, so we divide it by the number of GPUs
2 changes: 1 addition & 1 deletion train_ar.py
@@ -39,7 +39,7 @@
# 6k tokens is roughly 3 rows, because a single row is 1500-2500 tokens
# We have MUCH faster GPUs and therefore, instead of gradient accumulation,
# we increase the batch size 4x and cut the number of accumulation steps by 4x
-train_grad_accum_every = 8
+train_grad_accum_every = 2
train_batch_size = 8

# We speculate that the learning rate is given for all GPUs, so we divide it by the number of GPUs
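
For context, the trade-off the comments describe works like this: gradients from several micro-batches are summed before a single optimizer step, so the per-GPU effective batch per step is train_batch_size * train_grad_accum_every (8 * 8 = 64 before this commit, 8 * 2 = 16 after it, unless the batch size was raised in an earlier change as the comment suggests). Below is a minimal sketch of such a loop, assuming a PyTorch-style model that returns a scalar loss; train_step, model, optimizer, and loader_iter are hypothetical names, not code from this repository:

train_grad_accum_every = 2   # micro-batches accumulated per optimizer step
train_batch_size = 8         # rows per micro-batch (per GPU)

def train_step(model, optimizer, loader_iter):
    optimizer.zero_grad()
    running_loss = 0.0
    for _ in range(train_grad_accum_every):
        batch = next(loader_iter)                   # one micro-batch of train_batch_size rows
        loss = model(batch)                         # assumed to return a scalar loss tensor
        (loss / train_grad_accum_every).backward()  # scale so the accumulated gradients average
        running_loss += loss.item()
    optimizer.step()                                # one update over the accumulated gradients
    return running_loss / train_grad_accum_every

Fewer accumulation steps mean fewer forward/backward passes between optimizer updates, which is why faster GPUs that can fit a larger per-step batch can afford to lower train_grad_accum_every.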
