[Site-navigation rebuild: every rendered page below receives the same 21-line hunk, a sidebar navigation-list entry linking to the new note "GPU mode Sparsity"; the affected files and insertion counts are listed diffstat-style.]

 000 Zettelkasten/2D Convolutions/index.html | 21 +
 000 Zettelkasten/Ahead-of-Time (AOT) Compilation/index.html | 21 +
 000 Zettelkasten/Are less inductive biases better or worse?/index.html | 21 +
 000 Zettelkasten/Bit Palettization/index.html | 21 +
 000 Zettelkasten/Block Expansion/index.html | 21 +
 000 Zettelkasten/Convergence rate and Hessian spectra/index.html | 21 +
 000 Zettelkasten/Depthwise separable convolutions/index.html | 21 +
 000 Zettelkasten/Do Vision Foundation models exist?/index.html | 21 +
 000 Zettelkasten/Effect of weight symmetries on training dynamics/index.html | 21 +
 000 Zettelkasten/Equivariance Initialization/index.html | 21 +
 000 Zettelkasten/Grokking/index.html | 21 +
 000 Zettelkasten/Group Axioms/index.html | 21 +
 000 Zettelkasten/Group direct product/index.html | 21 +
 000 Zettelkasten/Hardware-specific structured pruning/index.html | 21 +
 000 Zettelkasten/Input-dependent convolutions/index.html | 21 +
 000 Zettelkasten/K-Means-based Quantization/index.html | 21 +
 000 Zettelkasten/KV Cache/index.html | 21 +
 000 Zettelkasten/Linear Quantization/index.html | 21 +
 000 Zettelkasten/LoRa Adapter/index.html | 21 +
 000 Zettelkasten/Masked Image Modelling/index.html | 21 +
 000 Zettelkasten/Maximal pruning and functional recovery/index.html | 21 +
 000 Zettelkasten/Mean Attention Distance/index.html | 21 +
 000 Zettelkasten/Multiple global minima/index.html | 21 +
 000 Zettelkasten/Neural Network Quantization/index.html | 21 +
 000 Zettelkasten/Non-translationally equivariant convolutions/index.html | 21 +
 000 Zettelkasten/Positive Logic Programs/index.html | 21 +
 000 Zettelkasten/Priors over Neural Network weights/index.html | 21 +
 000 Zettelkasten/PyTorch Functionalization/index.html | 21 +
 000 Zettelkasten/PyTorch Quantization for TensorRT/index.html | 21 +
 000 Zettelkasten/Representation (Group Theory)/index.html | 21 +
 000 Zettelkasten/Residual stream/index.html | 21 +
 100 Reference notes/101 Literature/A Brief Review of Hypernetworks in Deep Learning/index.html | 21 +
 100 Reference notes/101 Literature/A ConvNet for the 2020s/index.html | 21 +
 100 Reference notes/101 Literature/A Hierarchy of Graph Neural Networks Based on Learnable Local Features/index.html | 21 +
 100 Reference notes/101 Literature/A Mathematical Framework for Transformer Circuits/index.html | 21 +
 100 Reference notes/101 Literature/A general theory of correct, incorrect, and extrinsic equivariance/index.html | 21 +
 100 Reference notes/101 Literature/A survey of quantization methods for efficient neural network inference/index.html | 21 +
 100 Reference notes/101 Literature/AWQ - Activation-aware Weight Quantization for LLM Compression and Acceleration/index.html | 21 +
 100 Reference notes/101 Literature/Adapting Vision Foundation Models for Plant Phenotyping/index.html | 21 +
 100 Reference notes/101 Literature/An Image is Worth More Than 16x16 Patches - Exploring Transformers on Individual Pixels/index.html | 21 +
 100 Reference notes/101 Literature/An Investigation into Neural Net Optimization via Hessian Eigenvalue Density/index.html | 21 +
 100 Reference notes/101 Literature/An image is worth 16x16 words - Transformers for image recognition at scale/index.html | 21 +
 100 Reference notes/101 Literature/Apple Intelligence Foundation Language Models/index.html | 21 +
 100 Reference notes/101 Literature/Approximately equivariant networks for imperfectly symmetric dynamics/index.html | 21 +
 100 Reference notes/101 Literature/Approximation-Generalization Trade-offs under (Approximate) Group Equivariance/index.html | 21 +
 100 Reference notes/101 Literature/Autoequivariant Network Search via Group Decomposition/index.html | 21 +
 100 Reference notes/101 Literature/Battle of the Backbones - A Large-Scale Comparison of Pretrained Models across Computer Vision Tasks/index.html | 21 +
 100 Reference notes/101 Literature/Block Transformer - Global-to-Local Language Modeling for Fast Inference/index.html | 21 +
 100 Reference notes/101 Literature/BoxeR - Box-Attention for 2D and 3D Transformers/index.html | 21 +
 100 Reference notes/101 Literature/Building on Efficient Foundations - Effectively Training LLMs with Structured Feedforward Layers/index.html | 21 +
 100 Reference notes/101 Literature/CKConv - Continuous Kernel Convolution For Sequential Data/index.html | 21 +
 100 Reference notes/101 Literature/Color Equivariant Convolutional Networks/index.html | 21 +
 100 Reference notes/101 Literature/Color Space Transformation Network/index.html | 21 +
 100 Reference notes/101 Literature/ConViT - Improving Vision Transformers with Soft Convolutional Inductive Biases/index.html | 21 +
 100 Reference notes/101 Literature/DETRs Beat YOLOs on Real-time Object Detection/index.html | 21 +
 100 Reference notes/101 Literature/DETRs with Collaborative Hybrid Assignments Training/index.html | 21 +
 100 Reference notes/101 Literature/DINOv2 - Learning Robust Visual Features without Supervision/index.html | 21 +
 100 Reference notes/101 Literature/Deep Learning Book/index.html | 21 +
 100 Reference notes/101 Literature/DenseNets Reloaded - Paradigm Shift Beyond ResNets and ViTs/index.html | 21 +
 100 Reference notes/101 Literature/Discovering Symmetry Breaking in Physical Systems with Relaxed Group Convolution/index.html | 21 +
 100 Reference notes/101 Literature/EVA-02 - A Visual Representation for Neon Genesis/index.html | 21 +
 100 Reference notes/101 Literature/Early Convolutions Help Transformers See Better/index.html | 21 +
 100 Reference notes/101 Literature/Efficient Equivariant Transfer Learning from Pretrained Models/index.html | 21 +
 100 Reference notes/101 Literature/Efficient Modulation for Vision Networks/index.html | 21 +
 100 Reference notes/101 Literature/EfficientViT-SAM - Accelerated Segment Anything Model Without Accuracy Loss/index.html | 21 +
 100 Reference notes/101 Literature/Emergent Equivariance in Deep Ensembles/index.html | 21 +
 100 Reference notes/101 Literature/Emerging Properties in Self-Supervised Vision Transformers/index.html | 21 +
 100 Reference notes/101 Literature/End-to-End Object Detection with Transformers/index.html | 21 +
 100 Reference notes/101 Literature/Equi-Tuning - Group Equivariant Fine-Tuning of Pretrained Models/index.html | 21 +
 100 Reference notes/101 Literature/Equivariance with Learned Canonicalization Functions/index.html | 21 +
 100 Reference notes/101 Literature/Equivariance-aware architectural optimization of neural networks/index.html | 21 +
 100 Reference notes/101 Literature/Exact Conversion of In-Context Learning to Model Weights in Linearized-Attention Transformers/index.html | 21 +
 100 Reference notes/101 Literature/Exploiting Redundancy - Separable Group Convolutional Networks on Lie Groups/index.html | 21 +
 100 Reference notes/101 Literature/Exploring Plain Vision Transformer Backbones for Object Detection/index.html | 21 +
 100 Reference notes/101 Literature/Fast, Expressive SE(n) Equivariant Networks through Weight-Sharing in Position-Orientation Space/index.html | 21 +
 100 Reference notes/101 Literature/FlexiViT - One Model for All Patch Sizes/index.html | 21 +
 100 Reference notes/101 Literature/G-SGD - Optimizing ReLU Neural Networks in its Positively Scale-Invariant Space/index.html | 21 +
 100 Reference notes/101 Literature/Grokked Transformers are Implicit Reasoners - A Mechanistic Journey to the Edge of Generalization/index.html | 21 +
 100 Reference notes/101 Literature/Harmonics of Learning - Universal Fourier Features Emerge in Invariant Networks/index.html | 21 +
 100 Reference notes/101 Literature/How do vision transformers work?/index.html | 21 +
 100 Reference notes/101 Literature/Hydra - Bidirectional State Space Models Through Generalized Matrix Mixers/index.html | 21 +
 100 Reference notes/101 Literature/Improving Convergence and Generalization Using Parameter Symmetries/index.html | 21 +
 100 Reference notes/101 Literature/In Search of Projectively Equivariant Networks/index.html | 21 +
 100 Reference notes/101 Literature/Knowledge Transfer from Vision Foundation Models for Efficient Training of Small Task-specific Models/index.html | 21 +
 100 Reference notes/101 Literature/LRP-QViT - Mixed-Precision Vision Transformer Quantization via Layer-wise Relevance Propagation/index.html | 21 +
 100 Reference notes/101 Literature/Learned Gridification for Efficient Point Cloud Processing/index.html | 21 +
 100 Reference notes/101 Literature/Learning Partial Equivariances from Data/index.html | 21 +
 100 Reference notes/101 Literature/Learning both Weights and Connections for Efficient Neural Networks/index.html | 21 +
 100 Reference notes/101 Literature/Learning with Unmasked Tokens Drives Stronger Vision Learners/index.html | 21 +
 100 Reference notes/101 Literature/Llama 2 - Open Foundation and Fine-Tuned Chat Models/index.html | 21 +
 100 Reference notes/101 Literature/LoRA - Low-Rank Adaptation of Large Language Models/index.html | 21 +
 100 Reference notes/101 Literature/Mamba - Linear-Time Sequence Modeling with Selective State Spaces/index.html | 21 +
 100 Reference notes/101 Literature/Memorization Through the Lens of Curvature of Loss Function Around Samples/index.html | 21 +
 100 Reference notes/101 Literature/Mixture of LoRa Experts/index.html | 21 +
 100 Reference notes/101 Literature/MobileCLIP - Fast Image-Text Models through Multi-Modal Reinforced Training/index.html | 21 +
 100 Reference notes/101 Literature/MobileViT - light-weight, general-purpose, and mobile-friendly vision transformer/index.html | 21 +
 100 Reference notes/101 Literature/Model Compression in Practice - Lessons Learned from Practitioners Creating On-device Machine Learning Experiences/index.html | 21 +
 100 Reference notes/101 Literature/Neural Mechanics - Symmetry and Broken Conservation Laws in Deep Learning Dynamics/index.html | 21 +
 100 Reference notes/101 Literature/On Good Practices for Task-Specific Distillation of Large Pretrained Visual Models/index.html | 21 +
 100 Reference notes/101 Literature/On the Relationship between Self-Attention and Convolutional Layers/index.html | 21 +
 100 Reference notes/101 Literature/On the Symmetries of Deep Learning Models and their Internal Representations/index.html | 21 +
 100 Reference notes/101 Literature/OpenELM - An Efficient Language Model Family with Open-source Training and Inference Framework/index.html | 21 +
 100 Reference notes/101 Literature/Optimal Brain Damage/index.html | 21 +
 100 Reference notes/101 Literature/Optimization Dynamics of Equivariant and Augmented Neural Networks/index.html | 21 +
 100 Reference notes/101 Literature/Parameter Efficient Fine-tuning of Self-supervised ViTs without Catastrophic Forgetting/index.html | 21 +
 100 Reference notes/101 Literature/Parameter-Efficient Fine-Tuning for Pre-Trained Vision Models - A Survey/index.html | 21 +
 100 Reference notes/101 Literature/Progress measures for grokking via mechanistic interpretability/index.html | 21 +
 100 Reference notes/101 Literature/Provably Strict Generalisation Benefit for Equivariant Models/index.html | 21 +
 100 Reference notes/101 Literature/ProxylessNAS - Direct Neural Architecture Search on Target Task and Hardware/index.html | 21 +
 100 Reference notes/101 Literature/R-MAE - Regions Meet Masked Autoencoders/index.html | 21 +
 100 Reference notes/101 Literature/Refusal in Language Models Is Mediated by a Single Direction/index.html | 21 +
 100 Reference notes/101 Literature/Relaxed Octahedral Group Convolution for Learning Symmetry Breaking in 3D Physical Systems/index.html | 21 +
 100 Reference notes/101 Literature/Relaxing Equivariance Constraints with Non-stationary Continuous Filters/index.html | 21 +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Retrospective - EIE - Efficient Inference Engine onSparse and Compressed Neural Network/index.html b/100 Reference notes/101 Literature/Retrospective - EIE - Efficient Inference Engine onSparse and Compressed Neural Network/index.html index 06d9f5dd..8bbd383d 100644 --- a/100 Reference notes/101 Literature/Retrospective - EIE - Efficient Inference Engine onSparse and Compressed Neural Network/index.html +++ b/100 Reference notes/101 Literature/Retrospective - EIE - Efficient Inference Engine onSparse and Compressed Neural Network/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Revealing the Utilized Rank of Subspaces of Learning in Neural Networks/index.html b/100 Reference notes/101 Literature/Revealing the Utilized Rank of Subspaces of Learning in Neural Networks/index.html index 20d52e60..06335c13 100644 --- a/100 Reference notes/101 Literature/Revealing the Utilized Rank of Subspaces of Learning in Neural Networks/index.html +++ b/100 Reference notes/101 Literature/Revealing the Utilized Rank of Subspaces of Learning in Neural Networks/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Rewrite the Stars/index.html b/100 Reference notes/101 Literature/Rewrite the Stars/index.html index 89cb14c5..427deb5c 100644 --- a/100 Reference notes/101 Literature/Rewrite the Stars/index.html +++ b/100 Reference notes/101 Literature/Rewrite the Stars/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/SAM-CLIP - Merging Vision Foundation Models towards Semantic and Spatial Understanding/index.html b/100 Reference notes/101 Literature/SAM-CLIP - Merging Vision Foundation Models towards Semantic and Spatial Understanding/index.html index bc6697b1..540c0bed 100644 --- a/100 Reference notes/101 Literature/SAM-CLIP - Merging Vision Foundation Models towards Semantic and Spatial Understanding/index.html +++ b/100 Reference notes/101 Literature/SAM-CLIP - Merging Vision Foundation Models towards Semantic and Spatial Understanding/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Scaling (Down) CLIP - A Comprehensive Analysis of Data, Architecture, and Training Strategies/index.html b/100 Reference notes/101 Literature/Scaling (Down) CLIP - A Comprehensive Analysis of Data, Architecture, and Training Strategies/index.html index dbaf474e..95a1f921 100644 --- a/100 Reference notes/101 Literature/Scaling (Down) CLIP - A Comprehensive Analysis of Data, Architecture, and Training Strategies/index.html +++ b/100 Reference notes/101 Literature/Scaling (Down) CLIP - A Comprehensive Analysis of Data, Architecture, and Training Strategies/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Segment Anything/index.html b/100 Reference notes/101 Literature/Segment Anything/index.html index be402197..b85bfb64 100644 --- a/100 Reference notes/101 Literature/Segment Anything/index.html +++ b/100 Reference notes/101 Literature/Segment Anything/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Self-Supervised Detection of Perfect and Partial Input-Dependent Symmetries/index.html b/100 Reference notes/101 Literature/Self-Supervised Detection of Perfect and Partial Input-Dependent Symmetries/index.html index ffad4360..e5870cd1 100644 --- a/100 Reference notes/101 Literature/Self-Supervised Detection of Perfect and Partial Input-Dependent Symmetries/index.html +++ b/100 Reference notes/101 Literature/Self-Supervised Detection of Perfect and Partial Input-Dependent Symmetries/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation/index.html b/100 Reference notes/101 Literature/SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation/index.html index c8f39067..7c9f9e18 100644 --- a/100 Reference notes/101 Literature/SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation/index.html +++ b/100 Reference notes/101 Literature/SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation/index.html @@ -7066,6 +7066,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Simultaneous linear connectivity of neural networks modulo permutation/index.html b/100 Reference notes/101 Literature/Simultaneous linear connectivity of neural networks modulo permutation/index.html index 6e84e867..11728fa1 100644 --- a/100 Reference notes/101 Literature/Simultaneous linear connectivity of neural networks modulo permutation/index.html +++ b/100 Reference notes/101 Literature/Simultaneous linear connectivity of neural networks modulo permutation/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Stand-Alone Self-Attention in Vision Models/index.html b/100 Reference notes/101 Literature/Stand-Alone Self-Attention in Vision Models/index.html index fbc4f74d..2839c5d5 100644 --- a/100 Reference notes/101 Literature/Stand-Alone Self-Attention in Vision Models/index.html +++ b/100 Reference notes/101 Literature/Stand-Alone Self-Attention in Vision Models/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Surgical Fine-Tuning Improves Adaptation to Distribution Shifts/index.html b/100 Reference notes/101 Literature/Surgical Fine-Tuning Improves Adaptation to Distribution Shifts/index.html index ea7e50f9..69d30d88 100644 --- a/100 Reference notes/101 Literature/Surgical Fine-Tuning Improves Adaptation to Distribution Shifts/index.html +++ b/100 Reference notes/101 Literature/Surgical Fine-Tuning Improves Adaptation to Distribution Shifts/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Surgical-DINO - Adapter Learning of Foundation Models for Depth Estimation in Endoscopic Surgery/index.html b/100 Reference notes/101 Literature/Surgical-DINO - Adapter Learning of Foundation Models for Depth Estimation in Endoscopic Surgery/index.html index 6db134b5..e98d244e 100644 --- a/100 Reference notes/101 Literature/Surgical-DINO - Adapter Learning of Foundation Models for Depth Estimation in Endoscopic Surgery/index.html +++ b/100 Reference notes/101 Literature/Surgical-DINO - Adapter Learning of Foundation Models for Depth Estimation in Endoscopic Surgery/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Symmetries in Overparametrized Neural Networks - A Mean-Field View/index.html b/100 Reference notes/101 Literature/Symmetries in Overparametrized Neural Networks - A Mean-Field View/index.html index c9526e9c..e217f40d 100644 --- a/100 Reference notes/101 Literature/Symmetries in Overparametrized Neural Networks - A Mean-Field View/index.html +++ b/100 Reference notes/101 Literature/Symmetries in Overparametrized Neural Networks - A Mean-Field View/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Talaria - Interactively Optimizing Machine Learning Models for Efficient Inference/index.html b/100 Reference notes/101 Literature/Talaria - Interactively Optimizing Machine Learning Models for Efficient Inference/index.html index 72df769a..cce586a9 100644 --- a/100 Reference notes/101 Literature/Talaria - Interactively Optimizing Machine Learning Models for Efficient Inference/index.html +++ b/100 Reference notes/101 Literature/Talaria - Interactively Optimizing Machine Learning Models for Efficient Inference/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/The Empirical Impact of Neural Parameter Symmetries, or Lack Thereof/index.html b/100 Reference notes/101 Literature/The Empirical Impact of Neural Parameter Symmetries, or Lack Thereof/index.html index 88918db5..0a84edbb 100644 --- a/100 Reference notes/101 Literature/The Empirical Impact of Neural Parameter Symmetries, or Lack Thereof/index.html +++ b/100 Reference notes/101 Literature/The Empirical Impact of Neural Parameter Symmetries, or Lack Thereof/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/The Lie derivative for measuring learned equivariance/index.html b/100 Reference notes/101 Literature/The Lie derivative for measuring learned equivariance/index.html index 7b52442c..ee462899 100644 --- a/100 Reference notes/101 Literature/The Lie derivative for measuring learned equivariance/index.html +++ b/100 Reference notes/101 Literature/The Lie derivative for measuring learned equivariance/index.html @@ -7066,6 +7066,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/The Unreasonable Ineffectiveness of the Deeper Layers/index.html b/100 Reference notes/101 Literature/The Unreasonable Ineffectiveness of the Deeper Layers/index.html index c83e6ef3..0a78d547 100644 --- a/100 Reference notes/101 Literature/The Unreasonable Ineffectiveness of the Deeper Layers/index.html +++ b/100 Reference notes/101 Literature/The Unreasonable Ineffectiveness of the Deeper Layers/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/TiC-CLIP - Continual Training of CLIP models/index.html b/100 Reference notes/101 Literature/TiC-CLIP - Continual Training of CLIP models/index.html index c6da8a6c..cf1ebc5a 100644 --- a/100 Reference notes/101 Literature/TiC-CLIP - Continual Training of CLIP models/index.html +++ b/100 Reference notes/101 Literature/TiC-CLIP - Continual Training of CLIP models/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Training quantized nets - A deeper understanding/index.html b/100 Reference notes/101 Literature/Training quantized nets - A deeper understanding/index.html index 4b6f7aee..c331afa5 100644 --- a/100 Reference notes/101 Literature/Training quantized nets - A deeper understanding/index.html +++ b/100 Reference notes/101 Literature/Training quantized nets - A deeper understanding/index.html @@ -7066,6 +7066,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Understanding Deep Learning - Chapter 10/index.html b/100 Reference notes/101 Literature/Understanding Deep Learning - Chapter 10/index.html index e4bc036c..4398a9f7 100644 --- a/100 Reference notes/101 Literature/Understanding Deep Learning - Chapter 10/index.html +++ b/100 Reference notes/101 Literature/Understanding Deep Learning - Chapter 10/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Understanding Deep Learning - Chapter 20/index.html b/100 Reference notes/101 Literature/Understanding Deep Learning - Chapter 20/index.html index 23fd7198..6f9981ff 100644 --- a/100 Reference notes/101 Literature/Understanding Deep Learning - Chapter 20/index.html +++ b/100 Reference notes/101 Literature/Understanding Deep Learning - Chapter 20/index.html @@ -7031,6 +7031,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Understanding symmetries in deep networks/index.html b/100 Reference notes/101 Literature/Understanding symmetries in deep networks/index.html index fc13cfb0..259a2f7d 100644 --- a/100 Reference notes/101 Literature/Understanding symmetries in deep networks/index.html +++ b/100 Reference notes/101 Literature/Understanding symmetries in deep networks/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Using Degeneracy in the Loss Landscape for Mechanistic Interpretability/index.html b/100 Reference notes/101 Literature/Using Degeneracy in the Loss Landscape for Mechanistic Interpretability/index.html index 1b9af335..2519a9ff 100644 --- a/100 Reference notes/101 Literature/Using Degeneracy in the Loss Landscape for Mechanistic Interpretability/index.html +++ b/100 Reference notes/101 Literature/Using Degeneracy in the Loss Landscape for Mechanistic Interpretability/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/ViDT - An Efficient and Effective Fully Transformer-based Object Detector/index.html b/100 Reference notes/101 Literature/ViDT - An Efficient and Effective Fully Transformer-based Object Detector/index.html index 25f44696..21e6ce28 100644 --- a/100 Reference notes/101 Literature/ViDT - An Efficient and Effective Fully Transformer-based Object Detector/index.html +++ b/100 Reference notes/101 Literature/ViDT - An Efficient and Effective Fully Transformer-based Object Detector/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Vision Mamba - Efficient Visual Representation Learning with Bidirectional State Space Model/index.html b/100 Reference notes/101 Literature/Vision Mamba - Efficient Visual Representation Learning with Bidirectional State Space Model/index.html index 2e261fdf..de8b0115 100644 --- a/100 Reference notes/101 Literature/Vision Mamba - Efficient Visual Representation Learning with Bidirectional State Space Model/index.html +++ b/100 Reference notes/101 Literature/Vision Mamba - Efficient Visual Representation Learning with Bidirectional State Space Model/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/Vision Transformers Need Registers/index.html b/100 Reference notes/101 Literature/Vision Transformers Need Registers/index.html index 37eb0698..89ada49b 100644 --- a/100 Reference notes/101 Literature/Vision Transformers Need Registers/index.html +++ b/100 Reference notes/101 Literature/Vision Transformers Need Registers/index.html @@ -7066,6 +7066,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/101 Literature/What Do Self-Supervised Vision Transformers Learn?/index.html b/100 Reference notes/101 Literature/What Do Self-Supervised Vision Transformers Learn?/index.html index 7a21a28b..fa299767 100644 --- a/100 Reference notes/101 Literature/What Do Self-Supervised Vision Transformers Learn?/index.html +++ b/100 Reference notes/101 Literature/What Do Self-Supervised Vision Transformers Learn?/index.html @@ -7066,6 +7066,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Albert Gu/index.html b/100 Reference notes/102 Authors/Albert Gu/index.html index 0b328975..9616d2e3 100644 --- a/100 Reference notes/102 Authors/Albert Gu/index.html +++ b/100 Reference notes/102 Authors/Albert Gu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Alex Flinth/index.html b/100 Reference notes/102 Authors/Alex Flinth/index.html index cab54736..445d894d 100644 --- a/100 Reference notes/102 Authors/Alex Flinth/index.html +++ b/100 Reference notes/102 Authors/Alex Flinth/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Alexander Kirillov/index.html b/100 Reference notes/102 Authors/Alexander Kirillov/index.html index 97d237b7..da21c0a0 100644 --- a/100 Reference notes/102 Authors/Alexander Kirillov/index.html +++ b/100 Reference notes/102 Authors/Alexander Kirillov/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Alexey Dosovitskiy/index.html b/100 Reference notes/102 Authors/Alexey Dosovitskiy/index.html index 82722b2b..717c9ab9 100644 --- a/100 Reference notes/102 Authors/Alexey Dosovitskiy/index.html +++ b/100 Reference notes/102 Authors/Alexey Dosovitskiy/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Ananya Kumar/index.html b/100 Reference notes/102 Authors/Ananya Kumar/index.html index ba4b0453..977231a1 100644 --- a/100 Reference notes/102 Authors/Ananya Kumar/index.html +++ b/100 Reference notes/102 Authors/Ananya Kumar/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Andreas Loukas/index.html b/100 Reference notes/102 Authors/Andreas Loukas/index.html index 55adc1a7..081df542 100644 --- a/100 Reference notes/102 Authors/Andreas Loukas/index.html +++ b/100 Reference notes/102 Authors/Andreas Loukas/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Andreas Savakis/index.html b/100 Reference notes/102 Authors/Andreas Savakis/index.html index d1b008b6..0c1bd356 100644 --- a/100 Reference notes/102 Authors/Andreas Savakis/index.html +++ b/100 Reference notes/102 Authors/Andreas Savakis/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Angela Fan/index.html b/100 Reference notes/102 Authors/Angela Fan/index.html index 12886cae..74528e43 100644 --- a/100 Reference notes/102 Authors/Angela Fan/index.html +++ b/100 Reference notes/102 Authors/Angela Fan/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Annie S. Chen/index.html b/100 Reference notes/102 Authors/Annie S. Chen/index.html index e8cecd80..554f2959 100644 --- a/100 Reference notes/102 Authors/Annie S. Chen/index.html +++ b/100 Reference notes/102 Authors/Annie S. Chen/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Antonio Orvieto/index.html b/100 Reference notes/102 Authors/Antonio Orvieto/index.html index e26b25b4..94e1b68e 100644 --- a/100 Reference notes/102 Authors/Antonio Orvieto/index.html +++ b/100 Reference notes/102 Authors/Antonio Orvieto/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Ardavan Pedram/index.html b/100 Reference notes/102 Authors/Ardavan Pedram/index.html index 0fb0e91c..a0e72866 100644 --- a/100 Reference notes/102 Authors/Ardavan Pedram/index.html +++ b/100 Reference notes/102 Authors/Ardavan Pedram/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Armand Joulin/index.html b/100 Reference notes/102 Authors/Armand Joulin/index.html index 7b5eb137..23ec2437 100644 --- a/100 Reference notes/102 Authors/Armand Joulin/index.html +++ b/100 Reference notes/102 Authors/Armand Joulin/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Attila Lengyel/index.html b/100 Reference notes/102 Authors/Attila Lengyel/index.html index f5954a30..c9957259 100644 --- a/100 Reference notes/102 Authors/Attila Lengyel/index.html +++ b/100 Reference notes/102 Authors/Attila Lengyel/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Boshi Wang/index.html b/100 Reference notes/102 Authors/Boshi Wang/index.html index 75f0b60e..6f37fee1 100644 --- a/100 Reference notes/102 Authors/Boshi Wang/index.html +++ b/100 Reference notes/102 Authors/Boshi Wang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Byeongho Heo/index.html b/100 Reference notes/102 Authors/Byeongho Heo/index.html index 5d3db156..ec655b92 100644 --- a/100 Reference notes/102 Authors/Byeongho Heo/index.html +++ b/100 Reference notes/102 Authors/Byeongho Heo/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Caglar Gulcehre/index.html b/100 Reference notes/102 Authors/Caglar Gulcehre/index.html index debe434c..655953fc 100644 --- a/100 Reference notes/102 Authors/Caglar Gulcehre/index.html +++ b/100 Reference notes/102 Authors/Caglar Gulcehre/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Carmen Amo Alonso/index.html b/100 Reference notes/102 Authors/Carmen Amo Alonso/index.html index 0bda908d..4ba3f407 100644 --- a/100 Reference notes/102 Authors/Carmen Amo Alonso/index.html +++ b/100 Reference notes/102 Authors/Carmen Amo Alonso/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Cees G. M. Snoek/index.html b/100 Reference notes/102 Authors/Cees G. M. Snoek/index.html index fb12729e..76aab9b9 100644 --- a/100 Reference notes/102 Authors/Cees G. M. Snoek/index.html +++ b/100 Reference notes/102 Authors/Cees G. M. Snoek/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Chelsea Finn/index.html b/100 Reference notes/102 Authors/Chelsea Finn/index.html index 09fa2319..5f1b613f 100644 --- a/100 Reference notes/102 Authors/Chelsea Finn/index.html +++ b/100 Reference notes/102 Authors/Chelsea Finn/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Chong Wang/index.html b/100 Reference notes/102 Authors/Chong Wang/index.html index 68e433d1..ed90bf55 100644 --- a/100 Reference notes/102 Authors/Chong Wang/index.html +++ b/100 Reference notes/102 Authors/Chong Wang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Christopher Olah/index.html b/100 Reference notes/102 Authors/Christopher Olah/index.html index 6c10061f..2ea296d9 100644 --- a/100 Reference notes/102 Authors/Christopher Olah/index.html +++ b/100 Reference notes/102 Authors/Christopher Olah/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Daniel M. Roy/index.html b/100 Reference notes/102 Authors/Daniel M. Roy/index.html index 85fc7fb2..6e310b79 100644 --- a/100 Reference notes/102 Authors/Daniel M. Roy/index.html +++ b/100 Reference notes/102 Authors/Daniel M. Roy/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Daniel Ulbricht/index.html b/100 Reference notes/102 Authors/Daniel Ulbricht/index.html index 81aa6acf..a05cbdfa 100644 --- a/100 Reference notes/102 Authors/Daniel Ulbricht/index.html +++ b/100 Reference notes/102 Authors/Daniel Ulbricht/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/David M. Knigge/index.html b/100 Reference notes/102 Authors/David M. Knigge/index.html index cc9b59d1..cc96ad92 100644 --- a/100 Reference notes/102 Authors/David M. Knigge/index.html +++ b/100 Reference notes/102 Authors/David M. Knigge/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/David W. Romero/index.html b/100 Reference notes/102 Authors/David W. Romero/index.html index f0b70547..afcd7cd6 100644 --- a/100 Reference notes/102 Authors/David W. Romero/index.html +++ b/100 Reference notes/102 Authors/David W. Romero/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Diane Larlus/index.html b/100 Reference notes/102 Authors/Diane Larlus/index.html index bc91b18b..166c4f03 100644 --- a/100 Reference notes/102 Authors/Diane Larlus/index.html +++ b/100 Reference notes/102 Authors/Diane Larlus/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Donghyun Kim/index.html b/100 Reference notes/102 Authors/Donghyun Kim/index.html index 72b97a4d..37f2411d 100644 --- a/100 Reference notes/102 Authors/Donghyun Kim/index.html +++ b/100 Reference notes/102 Authors/Donghyun Kim/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Dongyoon Han/index.html b/100 Reference notes/102 Authors/Dongyoon Han/index.html index 006ca0bd..b6d31b21 100644 --- a/100 Reference notes/102 Authors/Dongyoon Han/index.html +++ b/100 Reference notes/102 Authors/Dongyoon Han/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Duy-Kien Nguyen/index.html b/100 Reference notes/102 Authors/Duy-Kien Nguyen/index.html index 76dfebfd..b14f6b9d 100644 --- a/100 Reference notes/102 Authors/Duy-Kien Nguyen/index.html +++ b/100 Reference notes/102 Authors/Duy-Kien Nguyen/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Edward J. Hu/index.html b/100 Reference notes/102 Authors/Edward J. Hu/index.html index 04d5ea26..bbdb2087 100644 --- a/100 Reference notes/102 Authors/Edward J. Hu/index.html +++ b/100 Reference notes/102 Authors/Edward J. Hu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Edward Z. Yang/index.html b/100 Reference notes/102 Authors/Edward Z. Yang/index.html index b4d0b62f..f00750c5 100644 --- a/100 Reference notes/102 Authors/Edward Z. Yang/index.html +++ b/100 Reference notes/102 Authors/Edward Z. Yang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Eric Mintun/index.html b/100 Reference notes/102 Authors/Eric Mintun/index.html index 14e8afaa..74b0b84c 100644 --- a/100 Reference notes/102 Authors/Eric Mintun/index.html +++ b/100 Reference notes/102 Authors/Eric Mintun/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Erik J. Bekkers/index.html b/100 Reference notes/102 Authors/Erik J. Bekkers/index.html index 604caacc..6265dbe1 100644 --- a/100 Reference notes/102 Authors/Erik J. Bekkers/index.html +++ b/100 Reference notes/102 Authors/Erik J. Bekkers/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Eshan Verma/index.html b/100 Reference notes/102 Authors/Eshan Verma/index.html index b585ee17..78a88893 100644 --- a/100 Reference notes/102 Authors/Eshan Verma/index.html +++ b/100 Reference notes/102 Authors/Eshan Verma/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Fahim Tajwar/index.html b/100 Reference notes/102 Authors/Fahim Tajwar/index.html index 2ae8cc55..06342149 100644 --- a/100 Reference notes/102 Authors/Fahim Tajwar/index.html +++ b/100 Reference notes/102 Authors/Fahim Tajwar/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Fartash Faghri/index.html b/100 Reference notes/102 Authors/Fartash Faghri/index.html index 8e355d9b..49c93a81 100644 --- a/100 Reference notes/102 Authors/Fartash Faghri/index.html +++ b/100 Reference notes/102 Authors/Fartash Faghri/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Francisco Massa/index.html b/100 Reference notes/102 Authors/Francisco Massa/index.html index 1cc4cb5a..cbafc4f6 100644 --- a/100 Reference notes/102 Authors/Francisco Massa/index.html +++ b/100 Reference notes/102 Authors/Francisco Massa/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Fred Hohman/index.html b/100 Reference notes/102 Authors/Fred Hohman/index.html index eb4d79b7..5c4638e3 100644 --- a/100 Reference notes/102 Authors/Fred Hohman/index.html +++ b/100 Reference notes/102 Authors/Fred Hohman/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Furu Wei/index.html b/100 Reference notes/102 Authors/Furu Wei/index.html index 01b8e94b..adef121a 100644 --- a/100 Reference notes/102 Authors/Furu Wei/index.html +++ b/100 Reference notes/102 Authors/Furu Wei/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Gabriel Synnaeve/index.html b/100 Reference notes/102 Authors/Gabriel Synnaeve/index.html index 60e07b0a..d170c35f 100644 --- a/100 Reference notes/102 Authors/Gabriel Synnaeve/index.html +++ b/100 Reference notes/102 Authors/Gabriel Synnaeve/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Gintare Karolina Dziugaite/index.html b/100 Reference notes/102 Authors/Gintare Karolina Dziugaite/index.html index bba76af2..43888103 100644 --- a/100 Reference notes/102 Authors/Gintare Karolina Dziugaite/index.html +++ b/100 Reference notes/102 Authors/Gintare Karolina Dziugaite/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Hadi Pouransari/index.html b/100 Reference notes/102 Authors/Hadi Pouransari/index.html index 8d3b26ca..9decc6c7 100644 --- a/100 Reference notes/102 Authors/Hadi Pouransari/index.html +++ b/100 Reference notes/102 Authors/Hadi Pouransari/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Han Cai/index.html b/100 Reference notes/102 Authors/Han Cai/index.html index 0130afd8..6b43a340 100644 --- a/100 Reference notes/102 Authors/Han Cai/index.html +++ b/100 Reference notes/102 Authors/Han Cai/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Hanzi Mao/index.html b/100 Reference notes/102 Authors/Hanzi Mao/index.html index cef83044..78da9577 100644 --- a/100 Reference notes/102 Authors/Hanzi Mao/index.html +++ b/100 Reference notes/102 Authors/Hanzi Mao/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Haoxiang Wang/index.html b/100 Reference notes/102 Authors/Haoxiang Wang/index.html index 1a8d31a2..018d7d40 100644 --- a/100 Reference notes/102 Authors/Haoxiang Wang/index.html +++ b/100 Reference notes/102 Authors/Haoxiang Wang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git "a/100 Reference notes/102 Authors/Herv\303\251 Jegou/index.html" "b/100 Reference notes/102 Authors/Herv\303\251 Jegou/index.html" index 957f562c..d8e21f8b 100644 --- "a/100 Reference notes/102 Authors/Herv\303\251 Jegou/index.html" +++ "b/100 Reference notes/102 Authors/Herv\303\251 Jegou/index.html" @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Huaxiu Yao/index.html b/100 Reference notes/102 Authors/Huaxiu Yao/index.html index e91e83f6..4ae7ab83 100644 --- a/100 Reference notes/102 Authors/Huaxiu Yao/index.html +++ b/100 Reference notes/102 Authors/Huaxiu Yao/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Hugo Touvron/index.html b/100 Reference notes/102 Authors/Hugo Touvron/index.html index ea9ffeee..92b9854c 100644 --- a/100 Reference notes/102 Authors/Hugo Touvron/index.html +++ b/100 Reference notes/102 Authors/Hugo Touvron/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Huizi Mao/index.html b/100 Reference notes/102 Authors/Huizi Mao/index.html index b083a486..1e2ded5e 100644 --- a/100 Reference notes/102 Authors/Huizi Mao/index.html +++ b/100 Reference notes/102 Authors/Huizi Mao/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Isha Garg/index.html b/100 Reference notes/102 Authors/Isha Garg/index.html index 2f01639b..60d55922 100644 --- a/100 Reference notes/102 Authors/Isha Garg/index.html +++ b/100 Reference notes/102 Authors/Isha Garg/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Ishan Misra/index.html b/100 Reference notes/102 Authors/Ishan Misra/index.html index 269f5218..eecff373 100644 --- a/100 Reference notes/102 Authors/Ishan Misra/index.html +++ b/100 Reference notes/102 Authors/Ishan Misra/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Jan E. Gerken/index.html b/100 Reference notes/102 Authors/Jan E. Gerken/index.html index f96a0079..af7ccc8e 100644 --- a/100 Reference notes/102 Authors/Jan E. Gerken/index.html +++ b/100 Reference notes/102 Authors/Jan E. Gerken/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Javier Maass Martinez/index.html b/100 Reference notes/102 Authors/Javier Maass Martinez/index.html index 32e981ff..2801bd0e 100644 --- a/100 Reference notes/102 Authors/Javier Maass Martinez/index.html +++ b/100 Reference notes/102 Authors/Javier Maass Martinez/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Jean-Baptiste Cordonnier/index.html b/100 Reference notes/102 Authors/Jean-Baptiste Cordonnier/index.html index cc53370a..c12b977e 100644 --- a/100 Reference notes/102 Authors/Jean-Baptiste Cordonnier/index.html +++ b/100 Reference notes/102 Authors/Jean-Baptiste Cordonnier/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Jeff Pool/index.html b/100 Reference notes/102 Authors/Jeff Pool/index.html index d93e575f..60480edf 100644 --- a/100 Reference notes/102 Authors/Jeff Pool/index.html +++ b/100 Reference notes/102 Authors/Jeff Pool/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Jesse Cai/index.html b/100 Reference notes/102 Authors/Jesse Cai/index.html index 8c95172c..0473b25b 100644 --- a/100 Reference notes/102 Authors/Jesse Cai/index.html +++ b/100 Reference notes/102 Authors/Jesse Cai/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Jing Pu/index.html b/100 Reference notes/102 Authors/Jing Pu/index.html index 0f7ce914..56594e8b 100644 --- a/100 Reference notes/102 Authors/Jing Pu/index.html +++ b/100 Reference notes/102 Authors/Jing Pu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Joaquin Fontbona/index.html b/100 Reference notes/102 Authors/Joaquin Fontbona/index.html index ddc5beaa..5ae37f89 100644 --- a/100 Reference notes/102 Authors/Joaquin Fontbona/index.html +++ b/100 Reference notes/102 Authors/Joaquin Fontbona/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/John Denker/index.html b/100 Reference notes/102 Authors/John Denker/index.html index 0efa58f4..ecc031a3 100644 --- a/100 Reference notes/102 Authors/John Denker/index.html +++ b/100 Reference notes/102 Authors/John Denker/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/John Tran/index.html b/100 Reference notes/102 Authors/John Tran/index.html index 6cec1522..87b2afff 100644 --- a/100 Reference notes/102 Authors/John Tran/index.html +++ b/100 Reference notes/102 Authors/John Tran/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Julien Mairal/index.html b/100 Reference notes/102 Authors/Julien Mairal/index.html index d86d7417..cb0beca3 100644 --- a/100 Reference notes/102 Authors/Julien Mairal/index.html +++ b/100 Reference notes/102 Authors/Julien Mairal/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Juliette Marrie/index.html b/100 Reference notes/102 Authors/Juliette Marrie/index.html index 8021d6f7..ec0d6c60 100644 --- a/100 Reference notes/102 Authors/Juliette Marrie/index.html +++ b/100 Reference notes/102 Authors/Juliette Marrie/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Kaiming He/index.html b/100 Reference notes/102 Authors/Kaiming He/index.html index feceaa9a..bcae5394 100644 --- a/100 Reference notes/102 Authors/Kaiming He/index.html +++ b/100 Reference notes/102 Authors/Kaiming He/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Kamyar Azizzadenesheli/index.html b/100 Reference notes/102 Authors/Kamyar Azizzadenesheli/index.html index 481d37d6..b4160554 100644 --- a/100 Reference notes/102 Authors/Kamyar Azizzadenesheli/index.html +++ b/100 Reference notes/102 Authors/Kamyar Azizzadenesheli/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Kaushik Roy/index.html b/100 Reference notes/102 Authors/Kaushik Roy/index.html index 7beb4b0a..0939969c 100644 --- a/100 Reference notes/102 Authors/Kaushik Roy/index.html +++ b/100 Reference notes/102 Authors/Kaushik Roy/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Lawrence Chan/index.html b/100 Reference notes/102 Authors/Lawrence Chan/index.html index c3279a4c..122a2129 100644 --- a/100 Reference notes/102 Authors/Lawrence Chan/index.html +++ b/100 Reference notes/102 Authors/Lawrence Chan/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Lucius Bushnaq/index.html b/100 Reference notes/102 Authors/Lucius Bushnaq/index.html index 30c97baf..a936e86b 100644 --- a/100 Reference notes/102 Authors/Lucius Bushnaq/index.html +++ b/100 Reference notes/102 Authors/Lucius Bushnaq/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git "a/100 Reference notes/102 Authors/Maciej Wo\305\202czyk/index.html" "b/100 Reference notes/102 Authors/Maciej Wo\305\202czyk/index.html" index b51177e6..982e1073 100644 --- "a/100 Reference notes/102 Authors/Maciej Wo\305\202czyk/index.html" +++ "b/100 Reference notes/102 Authors/Maciej Wo\305\202czyk/index.html" @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Mahmoud Assran/index.html b/100 Reference notes/102 Authors/Mahmoud Assran/index.html index 2598a991..5661ade9 100644 --- a/100 Reference notes/102 Authors/Mahmoud Assran/index.html +++ b/100 Reference notes/102 Authors/Mahmoud Assran/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Marc Finzi/index.html b/100 Reference notes/102 Authors/Marc Finzi/index.html index 13445fe0..cad8b934 100644 --- a/100 Reference notes/102 Authors/Marc Finzi/index.html +++ b/100 Reference notes/102 Authors/Marc Finzi/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Mark A. Horowitz/index.html b/100 Reference notes/102 Authors/Mark A. Horowitz/index.html index 336a0e92..a9ac5491 100644 --- a/100 Reference notes/102 Authors/Mark A. Horowitz/index.html +++ b/100 Reference notes/102 Authors/Mark A. Horowitz/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Martin Jaggi/index.html b/100 Reference notes/102 Authors/Martin Jaggi/index.html index 231a42d5..86597d1b 100644 --- a/100 Reference notes/102 Authors/Martin Jaggi/index.html +++ b/100 Reference notes/102 Authors/Martin Jaggi/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Martin R. Oswald/index.html b/100 Reference notes/102 Authors/Martin R. Oswald/index.html index 50b25c03..b14a93e4 100644 --- a/100 Reference notes/102 Authors/Martin R. Oswald/index.html +++ b/100 Reference notes/102 Authors/Martin R. Oswald/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Mathilde Caron/index.html b/100 Reference notes/102 Authors/Mathilde Caron/index.html index 6437faee..ad1b1d71 100644 --- a/100 Reference notes/102 Authors/Mathilde Caron/index.html +++ b/100 Reference notes/102 Authors/Mathilde Caron/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Maxime Oquab/index.html b/100 Reference notes/102 Authors/Maxime Oquab/index.html index f9e9b4e0..ff9e0eb8 100644 --- a/100 Reference notes/102 Authors/Maxime Oquab/index.html +++ b/100 Reference notes/102 Authors/Maxime Oquab/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Mehrdad Farajtabar/index.html b/100 Reference notes/102 Authors/Mehrdad Farajtabar/index.html index e91f547b..23bce2c2 100644 --- a/100 Reference notes/102 Authors/Mehrdad Farajtabar/index.html +++ b/100 Reference notes/102 Authors/Mehrdad Farajtabar/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Michael Arbel/index.html b/100 Reference notes/102 Authors/Michael Arbel/index.html index 7eb62fda..5a7675bc 100644 --- a/100 Reference notes/102 Authors/Michael Arbel/index.html +++ b/100 Reference notes/102 Authors/Michael Arbel/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Mohammad Rastegari/index.html b/100 Reference notes/102 Authors/Mohammad Rastegari/index.html index 4a5711d3..cc7413b8 100644 --- a/100 Reference notes/102 Authors/Mohammad Rastegari/index.html +++ b/100 Reference notes/102 Authors/Mohammad Rastegari/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Namuk Park/index.html b/100 Reference notes/102 Authors/Namuk Park/index.html index 12606097..ab4e29c9 100644 --- a/100 Reference notes/102 Authors/Namuk Park/index.html +++ b/100 Reference notes/102 Authors/Namuk Park/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Navin Ranjan/index.html b/100 Reference notes/102 Authors/Navin Ranjan/index.html index 2919501d..156e817f 100644 --- a/100 Reference notes/102 Authors/Navin Ranjan/index.html +++ b/100 Reference notes/102 Authors/Navin Ranjan/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Neel Nanda/index.html b/100 Reference notes/102 Authors/Neel Nanda/index.html index 903fab7a..d932b7fb 100644 --- a/100 Reference notes/102 Authors/Neel Nanda/index.html +++ b/100 Reference notes/102 Authors/Neel Nanda/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Nicolas Carion/index.html b/100 Reference notes/102 Authors/Nicolas Carion/index.html index d09748a3..b28036b6 100644 --- a/100 Reference notes/102 Authors/Nicolas Carion/index.html +++ b/100 Reference notes/102 Authors/Nicolas Carion/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Nicolas Usunier/index.html b/100 Reference notes/102 Authors/Nicolas Usunier/index.html index b125e3f3..38edbd5d 100644 --- a/100 Reference notes/102 Authors/Nicolas Usunier/index.html +++ b/100 Reference notes/102 Authors/Nicolas Usunier/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Oncel Tuzel/index.html b/100 Reference notes/102 Authors/Oncel Tuzel/index.html index 21f2217c..7ff01952 100644 --- a/100 Reference notes/102 Authors/Oncel Tuzel/index.html +++ b/100 Reference notes/102 Authors/Oncel Tuzel/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git "a/100 Reference notes/102 Authors/Patrick Forr\303\251/index.html" "b/100 Reference notes/102 Authors/Patrick Forr\303\251/index.html" index b50f1451..b4596d45 100644 --- "a/100 Reference notes/102 Authors/Patrick Forr\303\251/index.html" +++ "b/100 Reference notes/102 Authors/Patrick Forr\303\251/index.html" @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Pavan Kumar Anasosalu Vasu/index.html b/100 Reference notes/102 Authors/Pavan Kumar Anasosalu Vasu/index.html index 45f94d7b..3222e3ad 100644 --- a/100 Reference notes/102 Authors/Pavan Kumar Anasosalu Vasu/index.html +++ b/100 Reference notes/102 Authors/Pavan Kumar Anasosalu Vasu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Percy Liang/index.html b/100 Reference notes/102 Authors/Percy Liang/index.html index 4b6a93d0..7a9ebd3a 100644 --- a/100 Reference notes/102 Authors/Percy Liang/index.html +++ b/100 Reference notes/102 Authors/Percy Liang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Piotr Bojanowski/index.html b/100 Reference notes/102 Authors/Piotr Bojanowski/index.html index 059e33ee..d82a1012 100644 --- a/100 Reference notes/102 Authors/Piotr Bojanowski/index.html +++ b/100 Reference notes/102 Authors/Piotr Bojanowski/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Raviteja Vemulapalli/index.html b/100 Reference notes/102 Authors/Raviteja Vemulapalli/index.html index 19ed6b1f..59f131a8 100644 --- a/100 Reference notes/102 Authors/Raviteja Vemulapalli/index.html +++ b/100 Reference notes/102 Authors/Raviteja Vemulapalli/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Razvan Pascanu/index.html b/100 Reference notes/102 Authors/Razvan Pascanu/index.html index a88eea35..c32b113a 100644 --- a/100 Reference notes/102 Authors/Razvan Pascanu/index.html +++ b/100 Reference notes/102 Authors/Razvan Pascanu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Robin Walters/index.html b/100 Reference notes/102 Authors/Robin Walters/index.html index 5e7cee7d..3d4abb72 100644 --- a/100 Reference notes/102 Authors/Robin Walters/index.html +++ b/100 Reference notes/102 Authors/Robin Walters/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Rose Yu/index.html b/100 Reference notes/102 Authors/Rose Yu/index.html index f7848663..04777d4e 100644 --- a/100 Reference notes/102 Authors/Rose Yu/index.html +++ b/100 Reference notes/102 Authors/Rose Yu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Ross Girshick/index.html b/100 Reference notes/102 Authors/Ross Girshick/index.html index 70d99bb1..309876ab 100644 --- a/100 Reference notes/102 Authors/Ross Girshick/index.html +++ b/100 Reference notes/102 Authors/Ross Girshick/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Rui Wang/index.html b/100 Reference notes/102 Authors/Rui Wang/index.html index 26332656..779108be 100644 --- a/100 Reference notes/102 Authors/Rui Wang/index.html +++ b/100 Reference notes/102 Authors/Rui Wang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Ruoming Pang/index.html b/100 Reference notes/102 Authors/Ruoming Pang/index.html index cb27b2d4..180de093 100644 --- a/100 Reference notes/102 Authors/Ruoming Pang/index.html +++ b/100 Reference notes/102 Authors/Ruoming Pang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Sachin Mehta/index.html b/100 Reference notes/102 Authors/Sachin Mehta/index.html index b399838e..674eec0f 100644 --- a/100 Reference notes/102 Authors/Sachin Mehta/index.html +++ b/100 Reference notes/102 Authors/Sachin Mehta/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Sangdoo Yun/index.html b/100 Reference notes/102 Authors/Sangdoo Yun/index.html index 36cd4633..fa3ef3ee 100644 --- a/100 Reference notes/102 Authors/Sangdoo Yun/index.html +++ b/100 Reference notes/102 Authors/Sangdoo Yun/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Sanghyuk Chun/index.html b/100 Reference notes/102 Authors/Sanghyuk Chun/index.html index 2f052de6..09ac7f36 100644 --- a/100 Reference notes/102 Authors/Sanghyuk Chun/index.html +++ b/100 Reference notes/102 Authors/Sanghyuk Chun/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Sara Solla/index.html b/100 Reference notes/102 Authors/Sara Solla/index.html index c77e6635..786b99ef 100644 --- a/100 Reference notes/102 Authors/Sara Solla/index.html +++ b/100 Reference notes/102 Authors/Sara Solla/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Sergey Zagoruyko/index.html b/100 Reference notes/102 Authors/Sergey Zagoruyko/index.html index cbd5d2ff..b177f609 100644 --- a/100 Reference notes/102 Authors/Sergey Zagoruyko/index.html +++ b/100 Reference notes/102 Authors/Sergey Zagoruyko/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Shaohan Huang/index.html b/100 Reference notes/102 Authors/Shaohan Huang/index.html index d1dac55f..d021b0c5 100644 --- a/100 Reference notes/102 Authors/Shaohan Huang/index.html +++ b/100 Reference notes/102 Authors/Shaohan Huang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Simon J.D. Prince/index.html b/100 Reference notes/102 Authors/Simon J.D. Prince/index.html index 62b061ff..3596b2cd 100644 --- a/100 Reference notes/102 Authors/Simon J.D. Prince/index.html +++ b/100 Reference notes/102 Authors/Simon J.D. Prince/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Skander Moalla/index.html b/100 Reference notes/102 Authors/Skander Moalla/index.html index 4d3216b2..09d44839 100644 --- a/100 Reference notes/102 Authors/Skander Moalla/index.html +++ b/100 Reference notes/102 Authors/Skander Moalla/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Soham De/index.html b/100 Reference notes/102 Authors/Soham De/index.html index a577e182..3f812681 100644 --- a/100 Reference notes/102 Authors/Soham De/index.html +++ b/100 Reference notes/102 Authors/Soham De/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Song Han/index.html b/100 Reference notes/102 Authors/Song Han/index.html index da91c153..a49b99fb 100644 --- a/100 Reference notes/102 Authors/Song Han/index.html +++ b/100 Reference notes/102 Authors/Song Han/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Songkuk Kim/index.html b/100 Reference notes/102 Authors/Songkuk Kim/index.html index 9297d07c..3546425e 100644 --- a/100 Reference notes/102 Authors/Songkuk Kim/index.html +++ b/100 Reference notes/102 Authors/Songkuk Kim/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Sourya Basu/index.html b/100 Reference notes/102 Authors/Sourya Basu/index.html index b4d8dea8..a5eeaa5e 100644 --- a/100 Reference notes/102 Authors/Sourya Basu/index.html +++ b/100 Reference notes/102 Authors/Sourya Basu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git "a/100 Reference notes/102 Authors/St\303\251phane d'Ascoli/index.html" "b/100 Reference notes/102 Authors/St\303\251phane d'Ascoli/index.html" index f202c02f..5255869e 100644 --- "a/100 Reference notes/102 Authors/St\303\251phane d'Ascoli/index.html" +++ "b/100 Reference notes/102 Authors/St\303\251phane d'Ascoli/index.html" @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Sukjun Hwang/index.html b/100 Reference notes/102 Authors/Sukjun Hwang/index.html index 092ec620..b47c2fb7 100644 --- a/100 Reference notes/102 Authors/Sukjun Hwang/index.html +++ b/100 Reference notes/102 Authors/Sukjun Hwang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Taekyung Kim/index.html b/100 Reference notes/102 Authors/Taekyung Kim/index.html index 7cc8acfd..53a7fedb 100644 --- a/100 Reference notes/102 Authors/Taekyung Kim/index.html +++ b/100 Reference notes/102 Authors/Taekyung Kim/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Tete Xiao/index.html b/100 Reference notes/102 Authors/Tete Xiao/index.html index 0740ea4f..5ff19014 100644 --- a/100 Reference notes/102 Authors/Tete Xiao/index.html +++ b/100 Reference notes/102 Authors/Tete Xiao/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Tom Gunter/index.html b/100 Reference notes/102 Authors/Tom Gunter/index.html index 445f9f84..854098d8 100644 --- a/100 Reference notes/102 Authors/Tom Gunter/index.html +++ b/100 Reference notes/102 Authors/Tom Gunter/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Tom Lieberum/index.html b/100 Reference notes/102 Authors/Tom Lieberum/index.html index 0eec60b1..f8a6ffa6 100644 --- a/100 Reference notes/102 Authors/Tom Lieberum/index.html +++ b/100 Reference notes/102 Authors/Tom Lieberum/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Vaibhav Aggarwal/index.html b/100 Reference notes/102 Authors/Vaibhav Aggarwal/index.html index cefd9e85..8c3c350c 100644 --- a/100 Reference notes/102 Authors/Vaibhav Aggarwal/index.html +++ b/100 Reference notes/102 Authors/Vaibhav Aggarwal/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/William J. Dally/index.html b/100 Reference notes/102 Authors/William J. Dally/index.html index 81ae5939..9c067b9d 100644 --- a/100 Reference notes/102 Authors/William J. Dally/index.html +++ b/100 Reference notes/102 Authors/William J. Dally/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Wonjae Kim/index.html b/100 Reference notes/102 Authors/Wonjae Kim/index.html index 160fba8b..10105f6b 100644 --- a/100 Reference notes/102 Authors/Wonjae Kim/index.html +++ b/100 Reference notes/102 Authors/Wonjae Kim/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Xiang Yue/index.html b/100 Reference notes/102 Authors/Xiang Yue/index.html index 827bc374..38f8f8f5 100644 --- a/100 Reference notes/102 Authors/Xiang Yue/index.html +++ b/100 Reference notes/102 Authors/Xiang Yue/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Xingyu Liu/index.html b/100 Reference notes/102 Authors/Xingyu Liu/index.html index a65b1abd..40bdabe4 100644 --- a/100 Reference notes/102 Authors/Xingyu Liu/index.html +++ b/100 Reference notes/102 Authors/Xingyu Liu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Xinlei Chen/index.html b/100 Reference notes/102 Authors/Xinlei Chen/index.html index 6bdfbfab..38cbfd2d 100644 --- a/100 Reference notes/102 Authors/Xinlei Chen/index.html +++ b/100 Reference notes/102 Authors/Xinlei Chen/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Xiuying Wei/index.html b/100 Reference notes/102 Authors/Xiuying Wei/index.html index 6dc0579f..d020e51d 100644 --- a/100 Reference notes/102 Authors/Xiuying Wei/index.html +++ b/100 Reference notes/102 Authors/Xiuying Wei/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Xu Ma/index.html b/100 Reference notes/102 Authors/Xu Ma/index.html index 21edb622..cb5dee77 100644 --- a/100 Reference notes/102 Authors/Xu Ma/index.html +++ b/100 Reference notes/102 Authors/Xu Ma/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Xun Wu/index.html b/100 Reference notes/102 Authors/Xun Wu/index.html index 23869b6e..a1bc72b6 100644 --- a/100 Reference notes/102 Authors/Xun Wu/index.html +++ b/100 Reference notes/102 Authors/Xun Wu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Yanghao Li/index.html b/100 Reference notes/102 Authors/Yanghao Li/index.html index 2b3121c8..7a439bf5 100644 --- a/100 Reference notes/102 Authors/Yanghao Li/index.html +++ b/100 Reference notes/102 Authors/Yanghao Li/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Yann LeCun/index.html b/100 Reference notes/102 Authors/Yann LeCun/index.html index 1ddc509b..972b4e99 100644 --- a/100 Reference notes/102 Authors/Yann LeCun/index.html +++ b/100 Reference notes/102 Authors/Yann LeCun/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Yelong Shen/index.html b/100 Reference notes/102 Authors/Yelong Shen/index.html index f95f837c..5c3e64f7 100644 --- a/100 Reference notes/102 Authors/Yelong Shen/index.html +++ b/100 Reference notes/102 Authors/Yelong Shen/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Yoonho Lee/index.html b/100 Reference notes/102 Authors/Yoonho Lee/index.html index 0ac5636d..11da03f0 100644 --- a/100 Reference notes/102 Authors/Yoonho Lee/index.html +++ b/100 Reference notes/102 Authors/Yoonho Lee/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Zeyuan Allen-Zhu/index.html b/100 Reference notes/102 Authors/Zeyuan Allen-Zhu/index.html index 5adcac77..18e06acb 100644 --- a/100 Reference notes/102 Authors/Zeyuan Allen-Zhu/index.html +++ b/100 Reference notes/102 Authors/Zeyuan Allen-Zhu/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Zhuoyang Zhang/index.html b/100 Reference notes/102 Authors/Zhuoyang Zhang/index.html index 65da6c33..68c301ae 100644 --- a/100 Reference notes/102 Authors/Zhuoyang Zhang/index.html +++ b/100 Reference notes/102 Authors/Zhuoyang Zhang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Ziaoyi Zhang/index.html b/100 Reference notes/102 Authors/Ziaoyi Zhang/index.html index dc375a9f..f9c3b2c2 100644 --- a/100 Reference notes/102 Authors/Ziaoyi Zhang/index.html +++ b/100 Reference notes/102 Authors/Ziaoyi Zhang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/102 Authors/Zirui Wang/index.html b/100 Reference notes/102 Authors/Zirui Wang/index.html index 5aa5cec2..4b2e6707 100644 --- a/100 Reference notes/102 Authors/Zirui Wang/index.html +++ b/100 Reference notes/102 Authors/Zirui Wang/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Anthropic/index.html b/100 Reference notes/103 Affiliations/Anthropic/index.html index ac1bce9b..a64ae259 100644 --- a/100 Reference notes/103 Affiliations/Anthropic/index.html +++ b/100 Reference notes/103 Affiliations/Anthropic/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Apollo Research/index.html b/100 Reference notes/103 Affiliations/Apollo Research/index.html index b5170650..a31c03e3 100644 --- a/100 Reference notes/103 Affiliations/Apollo Research/index.html +++ b/100 Reference notes/103 Affiliations/Apollo Research/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Apple/index.html b/100 Reference notes/103 Affiliations/Apple/index.html index ae93bb0e..600df7f2 100644 --- a/100 Reference notes/103 Affiliations/Apple/index.html +++ b/100 Reference notes/103 Affiliations/Apple/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/CLAIRE/index.html b/100 Reference notes/103 Affiliations/CLAIRE/index.html index 1c324acd..a45bf80c 100644 --- a/100 Reference notes/103 Affiliations/CLAIRE/index.html +++ b/100 Reference notes/103 Affiliations/CLAIRE/index.html @@ -7075,6 +7075,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Carnegie Mellon University/index.html b/100 Reference notes/103 Affiliations/Carnegie Mellon University/index.html index eeae00f6..7619614b 100644 --- a/100 Reference notes/103 Affiliations/Carnegie Mellon University/index.html +++ b/100 Reference notes/103 Affiliations/Carnegie Mellon University/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Chalmers University of Technology/index.html b/100 Reference notes/103 Affiliations/Chalmers University of Technology/index.html index 3c11f85b..c81c0f7c 100644 --- a/100 Reference notes/103 Affiliations/Chalmers University of Technology/index.html +++ b/100 Reference notes/103 Affiliations/Chalmers University of Technology/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/EPFL/index.html b/100 Reference notes/103 Affiliations/EPFL/index.html index 78f5ffb3..5be6d301 100644 --- a/100 Reference notes/103 Affiliations/EPFL/index.html +++ b/100 Reference notes/103 Affiliations/EPFL/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/FAIR/index.html b/100 Reference notes/103 Affiliations/FAIR/index.html index f3c689ee..6b11d9db 100644 --- a/100 Reference notes/103 Affiliations/FAIR/index.html +++ b/100 Reference notes/103 Affiliations/FAIR/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Google DeepMind/index.html b/100 Reference notes/103 Affiliations/Google DeepMind/index.html index 3cafa68e..8135014a 100644 --- a/100 Reference notes/103 Affiliations/Google DeepMind/index.html +++ b/100 Reference notes/103 Affiliations/Google DeepMind/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Google/index.html b/100 Reference notes/103 Affiliations/Google/index.html index 453f2036..b500688c 100644 --- a/100 Reference notes/103 Affiliations/Google/index.html +++ b/100 Reference notes/103 Affiliations/Google/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/IBM Research/index.html b/100 Reference notes/103 Affiliations/IBM Research/index.html index e2910500..9da762f0 100644 --- a/100 Reference notes/103 Affiliations/IBM Research/index.html +++ b/100 Reference notes/103 Affiliations/IBM Research/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/INRIA/index.html b/100 Reference notes/103 Affiliations/INRIA/index.html index d8b5c945..9f2c45f2 100644 --- a/100 Reference notes/103 Affiliations/INRIA/index.html +++ b/100 Reference notes/103 Affiliations/INRIA/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/MIT/index.html b/100 Reference notes/103 Affiliations/MIT/index.html index 2aa38103..5549a338 100644 --- a/100 Reference notes/103 Affiliations/MIT/index.html +++ b/100 Reference notes/103 Affiliations/MIT/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/McGill University/index.html b/100 Reference notes/103 Affiliations/McGill University/index.html index d9ef6a89..7b2ca697 100644 --- a/100 Reference notes/103 Affiliations/McGill University/index.html +++ b/100 Reference notes/103 Affiliations/McGill University/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Meta/index.html b/100 Reference notes/103 Affiliations/Meta/index.html index d6dba8b7..c7be9b70 100644 --- a/100 Reference notes/103 Affiliations/Meta/index.html +++ b/100 Reference notes/103 Affiliations/Meta/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Microsoft/index.html b/100 Reference notes/103 Affiliations/Microsoft/index.html index 53b70c75..a8e78b5a 100644 --- a/100 Reference notes/103 Affiliations/Microsoft/index.html +++ b/100 Reference notes/103 Affiliations/Microsoft/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Mila Quebec AI Institute/index.html b/100 Reference notes/103 Affiliations/Mila Quebec AI Institute/index.html index 78608f09..82dfc1db 100644 --- a/100 Reference notes/103 Affiliations/Mila Quebec AI Institute/index.html +++ b/100 Reference notes/103 Affiliations/Mila Quebec AI Institute/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/NVIDIA/index.html b/100 Reference notes/103 Affiliations/NVIDIA/index.html index 11570f0a..fe7867fc 100644 --- a/100 Reference notes/103 Affiliations/NVIDIA/index.html +++ b/100 Reference notes/103 Affiliations/NVIDIA/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Naver AI Lab/index.html b/100 Reference notes/103 Affiliations/Naver AI Lab/index.html index 6bd19bea..29acbdc1 100644 --- a/100 Reference notes/103 Affiliations/Naver AI Lab/index.html +++ b/100 Reference notes/103 Affiliations/Naver AI Lab/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Naver Cloud AI/index.html b/100 Reference notes/103 Affiliations/Naver Cloud AI/index.html index 89f7c493..8ef0fadd 100644 --- a/100 Reference notes/103 Affiliations/Naver Cloud AI/index.html +++ b/100 Reference notes/103 Affiliations/Naver Cloud AI/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Naver Labs Europe/index.html b/100 Reference notes/103 Affiliations/Naver Labs Europe/index.html index 6694085e..6e84dcc4 100644 --- a/100 Reference notes/103 Affiliations/Naver Labs Europe/index.html +++ b/100 Reference notes/103 Affiliations/Naver Labs Europe/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/New York University/index.html b/100 Reference notes/103 Affiliations/New York University/index.html index 3957e5b3..7f9d00b5 100644 --- a/100 Reference notes/103 Affiliations/New York University/index.html +++ b/100 Reference notes/103 Affiliations/New York University/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Northeastern University/index.html b/100 Reference notes/103 Affiliations/Northeastern University/index.html index 8201334b..c6341a00 100644 --- a/100 Reference notes/103 Affiliations/Northeastern University/index.html +++ b/100 Reference notes/103 Affiliations/Northeastern University/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/OpenAI/index.html b/100 Reference notes/103 Affiliations/OpenAI/index.html index e68213b7..9aae3314 100644 --- a/100 Reference notes/103 Affiliations/OpenAI/index.html +++ b/100 Reference notes/103 Affiliations/OpenAI/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Princeton University/index.html b/100 Reference notes/103 Affiliations/Princeton University/index.html index 12aabf88..0d3f8961 100644 --- a/100 Reference notes/103 Affiliations/Princeton University/index.html +++ b/100 Reference notes/103 Affiliations/Princeton University/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/PyTorch/index.html b/100 Reference notes/103 Affiliations/PyTorch/index.html index 13f0706f..ed0295be 100644 --- a/100 Reference notes/103 Affiliations/PyTorch/index.html +++ b/100 Reference notes/103 Affiliations/PyTorch/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Rochester Institute of Technology/index.html b/100 Reference notes/103 Affiliations/Rochester Institute of Technology/index.html index 8a704505..b0e0e34e 100644 --- a/100 Reference notes/103 Affiliations/Rochester Institute of Technology/index.html +++ b/100 Reference notes/103 Affiliations/Rochester Institute of Technology/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Stanford/index.html b/100 Reference notes/103 Affiliations/Stanford/index.html index b1990192..6c022261 100644 --- a/100 Reference notes/103 Affiliations/Stanford/index.html +++ b/100 Reference notes/103 Affiliations/Stanford/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/TU Delft/index.html b/100 Reference notes/103 Affiliations/TU Delft/index.html index e0ac9be5..8e99072d 100644 --- a/100 Reference notes/103 Affiliations/TU Delft/index.html +++ b/100 Reference notes/103 Affiliations/TU Delft/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Tsinghua University/index.html b/100 Reference notes/103 Affiliations/Tsinghua University/index.html index b72e453d..c0a48cf0 100644 --- a/100 Reference notes/103 Affiliations/Tsinghua University/index.html +++ b/100 Reference notes/103 Affiliations/Tsinghua University/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/UC Berkeley/index.html b/100 Reference notes/103 Affiliations/UC Berkeley/index.html index 65eaf594..8bb13336 100644 --- a/100 Reference notes/103 Affiliations/UC Berkeley/index.html +++ b/100 Reference notes/103 Affiliations/UC Berkeley/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/UC San Diego/index.html b/100 Reference notes/103 Affiliations/UC San Diego/index.html index cc5b6d80..48acba26 100644 --- a/100 Reference notes/103 Affiliations/UC San Diego/index.html +++ b/100 Reference notes/103 Affiliations/UC San Diego/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/UC Santa Barbara/index.html b/100 Reference notes/103 Affiliations/UC Santa Barbara/index.html index 8efded3a..21e570a3 100644 --- a/100 Reference notes/103 Affiliations/UC Santa Barbara/index.html +++ b/100 Reference notes/103 Affiliations/UC Santa Barbara/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/UCLA/index.html b/100 Reference notes/103 Affiliations/UCLA/index.html index f4a82f90..e7b5814c 100644 --- a/100 Reference notes/103 Affiliations/UCLA/index.html +++ b/100 Reference notes/103 Affiliations/UCLA/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/University of Amsterdam/index.html b/100 Reference notes/103 Affiliations/University of Amsterdam/index.html index 7eeb271a..d8445caa 100644 --- a/100 Reference notes/103 Affiliations/University of Amsterdam/index.html +++ b/100 Reference notes/103 Affiliations/University of Amsterdam/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/University of Chile/index.html b/100 Reference notes/103 Affiliations/University of Chile/index.html index 578130d6..9d22ae56 100644 --- a/100 Reference notes/103 Affiliations/University of Chile/index.html +++ b/100 Reference notes/103 Affiliations/University of Chile/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/University of Illinois at Urbana-Champaign/index.html b/100 Reference notes/103 Affiliations/University of Illinois at Urbana-Champaign/index.html index 3f0bb0c9..fb190db8 100644 --- a/100 Reference notes/103 Affiliations/University of Illinois at Urbana-Champaign/index.html +++ b/100 Reference notes/103 Affiliations/University of Illinois at Urbana-Champaign/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/University of Oxford/index.html b/100 Reference notes/103 Affiliations/University of Oxford/index.html index 4be48ad3..9193ac33 100644 --- a/100 Reference notes/103 Affiliations/University of Oxford/index.html +++ b/100 Reference notes/103 Affiliations/University of Oxford/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Vector Institute/index.html b/100 Reference notes/103 Affiliations/Vector Institute/index.html index bc78a635..779502ef 100644 --- a/100 Reference notes/103 Affiliations/Vector Institute/index.html +++ b/100 Reference notes/103 Affiliations/Vector Institute/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Vrije Universiteit Amsterdam/index.html b/100 Reference notes/103 Affiliations/Vrije Universiteit Amsterdam/index.html index c780983d..a9336dc1 100644 --- a/100 Reference notes/103 Affiliations/Vrije Universiteit Amsterdam/index.html +++ b/100 Reference notes/103 Affiliations/Vrije Universiteit Amsterdam/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/103 Affiliations/Yonsei University/index.html b/100 Reference notes/103 Affiliations/Yonsei University/index.html index 50c65972..c2c19b4b 100644 --- a/100 Reference notes/103 Affiliations/Yonsei University/index.html +++ b/100 Reference notes/103 Affiliations/Yonsei University/index.html @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/EPFL-CS439 - Optimization for Machine Learning/index.html b/100 Reference notes/104 Other/EPFL-CS439 - Optimization for Machine Learning/index.html index 947407b8..34d1768f 100644 --- a/100 Reference notes/104 Other/EPFL-CS439 - Optimization for Machine Learning/index.html +++ b/100 Reference notes/104 Other/EPFL-CS439 - Optimization for Machine Learning/index.html @@ -14,7 +14,7 @@ - + @@ -7024,6 +7024,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/GPU mode - Sparsity/index.html b/100 Reference notes/104 Other/GPU mode - Sparsity/index.html new file mode 100644 index 00000000..adf2022e --- /dev/null +++ b/100 Reference notes/104 Other/GPU mode - Sparsity/index.html @@ -0,0 +1,7952 @@ + + + + + + + + + + + + + + + + + + + + + + + + + GPU mode Sparsity - Second Brain + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git "a/100 Reference notes/104 Other/Introducing Apple\342\200\231s On-Device and Server Foundation Models/index.html" "b/100 Reference notes/104 Other/Introducing Apple\342\200\231s On-Device and Server Foundation Models/index.html" index ec2dcad7..85604fe7 100644 --- "a/100 Reference notes/104 Other/Introducing Apple\342\200\231s On-Device and Server Foundation Models/index.html" +++ "b/100 Reference notes/104 Other/Introducing Apple\342\200\231s On-Device and Server Foundation Models/index.html" @@ -11,7 +11,7 @@ - + @@ -7019,6 +7019,27 @@ + + +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + diff --git a/100 Reference notes/104 Other/Introduction to Quantization on PyTorch/index.html b/100 Reference notes/104 Other/Introduction to Quantization on PyTorch/index.html index 5c9bf918..2696d70b 100644 --- a/100 Reference notes/104 Other/Introduction to Quantization on PyTorch/index.html +++ b/100 Reference notes/104 Other/Introduction to Quantization on PyTorch/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Let's talk about the Python Dispatcher/index.html b/100 Reference notes/104 Other/Let's talk about the Python Dispatcher/index.html index ccc81feb..6542b648 100644 --- a/100 Reference notes/104 Other/Let's talk about the Python Dispatcher/index.html +++ b/100 Reference notes/104 Other/Let's talk about the Python Dispatcher/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/MIT-65940 - TinyML and Efficient Deep Learning Computing/index.html b/100 Reference notes/104 Other/MIT-65940 - TinyML and Efficient Deep Learning Computing/index.html index ba48060c..243b5972 100644 --- a/100 Reference notes/104 Other/MIT-65940 - TinyML and Efficient Deep Learning Computing/index.html +++ b/100 Reference notes/104 Other/MIT-65940 - TinyML and Efficient Deep Learning Computing/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Optimizing Vision Transformer Model for Deployment/index.html b/100 Reference notes/104 Other/Optimizing Vision Transformer Model for Deployment/index.html index 59fc07f1..78ca7158 100644 --- a/100 Reference notes/104 Other/Optimizing Vision Transformer Model for Deployment/index.html +++ b/100 Reference notes/104 Other/Optimizing Vision Transformer Model for Deployment/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch - ExecuTorch - Export IR Specification/index.html b/100 Reference notes/104 Other/PyTorch - ExecuTorch - Export IR Specification/index.html index 2cb8241d..10d105f9 100644 --- a/100 Reference notes/104 Other/PyTorch - ExecuTorch - Export IR Specification/index.html +++ b/100 Reference notes/104 Other/PyTorch - ExecuTorch - Export IR Specification/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch - ExecuTorch - How ExecuTorch works?/index.html b/100 Reference notes/104 Other/PyTorch - ExecuTorch - How ExecuTorch works?/index.html index d7603beb..f2e30bdd 100644 --- a/100 Reference notes/104 Other/PyTorch - ExecuTorch - How ExecuTorch works?/index.html +++ b/100 Reference notes/104 Other/PyTorch - ExecuTorch - How ExecuTorch works?/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch - ExecuTorch - Quantization Overview/index.html b/100 Reference notes/104 Other/PyTorch - ExecuTorch - Quantization Overview/index.html index 83730b3d..10eced29 100644 --- a/100 Reference notes/104 Other/PyTorch - ExecuTorch - Quantization Overview/index.html +++ b/100 Reference notes/104 Other/PyTorch - ExecuTorch - Quantization Overview/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch - Functionalization in PyTorch - Everything you need to know/index.html b/100 Reference notes/104 Other/PyTorch - Functionalization in PyTorch - Everything you need to know/index.html index 227cd6a7..0f3539eb 100644 --- a/100 Reference notes/104 Other/PyTorch - Functionalization in PyTorch - Everything you need to know/index.html +++ b/100 Reference notes/104 Other/PyTorch - Functionalization in PyTorch - Everything you need to know/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch - PyTorch 2 Export Post Training Quantization/index.html b/100 Reference notes/104 Other/PyTorch - PyTorch 2 Export Post Training Quantization/index.html index f39a3c0a..ec12227f 100644 --- a/100 Reference notes/104 Other/PyTorch - PyTorch 2 Export Post Training Quantization/index.html +++ b/100 Reference notes/104 Other/PyTorch - PyTorch 2 Export Post Training Quantization/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch - Quantization/index.html b/100 Reference notes/104 Other/PyTorch - Quantization/index.html index 086568a9..d7d35593 100644 --- a/100 Reference notes/104 Other/PyTorch - Quantization/index.html +++ b/100 Reference notes/104 Other/PyTorch - Quantization/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch Compilers - What makes PyTorch beloved makes it hard to compile/index.html b/100 Reference notes/104 Other/PyTorch Compilers - What makes PyTorch beloved makes it hard to compile/index.html index 11494558..68bd4986 100644 --- a/100 Reference notes/104 Other/PyTorch Compilers - What makes PyTorch beloved makes it hard to compile/index.html +++ b/100 Reference notes/104 Other/PyTorch Compilers - What makes PyTorch beloved makes it hard to compile/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch Conference 2024 - Fast Sparse Vision Transformers with minimal accuracy loss/index.html b/100 Reference notes/104 Other/PyTorch Conference 2024 - Fast Sparse Vision Transformers with minimal accuracy loss/index.html index 13ca8912..93af396d 100644 --- a/100 Reference notes/104 Other/PyTorch Conference 2024 - Fast Sparse Vision Transformers with minimal accuracy loss/index.html +++ b/100 Reference notes/104 Other/PyTorch Conference 2024 - Fast Sparse Vision Transformers with minimal accuracy loss/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git "a/100 Reference notes/104 Other/PyTorch Conference 2024 - What\342\200\231s new in torch.export?/index.html" "b/100 Reference notes/104 Other/PyTorch Conference 2024 - What\342\200\231s new in torch.export?/index.html" index c9875256..5bce6ca4 100644 --- "a/100 Reference notes/104 Other/PyTorch Conference 2024 - What\342\200\231s new in torch.export?/index.html" +++ "b/100 Reference notes/104 Other/PyTorch Conference 2024 - What\342\200\231s new in torch.export?/index.html" @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch Conference 2024/index.html b/100 Reference notes/104 Other/PyTorch Conference 2024/index.html index 54adb508..cd14e920 100644 --- a/100 Reference notes/104 Other/PyTorch Conference 2024/index.html +++ b/100 Reference notes/104 Other/PyTorch Conference 2024/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch Eager Mode Quantization TensorRT Acceleration/index.html b/100 Reference notes/104 Other/PyTorch Eager Mode Quantization TensorRT Acceleration/index.html index 20aaaa45..6e18bed2 100644 --- a/100 Reference notes/104 Other/PyTorch Eager Mode Quantization TensorRT Acceleration/index.html +++ b/100 Reference notes/104 Other/PyTorch Eager Mode Quantization TensorRT Acceleration/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/PyTorch internals/index.html b/100 Reference notes/104 Other/PyTorch internals/index.html index 3ba94e00..09ca8d57 100644 --- a/100 Reference notes/104 Other/PyTorch internals/index.html +++ b/100 Reference notes/104 Other/PyTorch internals/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Quantized Transfer Learning for Computer Vision Tutorial/index.html b/100 Reference notes/104 Other/Quantized Transfer Learning for Computer Vision Tutorial/index.html index 0584e812..1d09d942 100644 --- a/100 Reference notes/104 Other/Quantized Transfer Learning for Computer Vision Tutorial/index.html +++ b/100 Reference notes/104 Other/Quantized Transfer Learning for Computer Vision Tutorial/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 11/index.html b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 11/index.html index 8dd7fb55..cb5c064f 100644 --- a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 11/index.html +++ b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 11/index.html @@ -76,7 +76,7 @@
    - + Skip to content @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • @@ -7442,12 +7463,45 @@ @@ -7772,12 +7826,45 @@ @@ -7806,8 +7893,6 @@ -

    Reinforcement Learning An Introduction Chapter 11

    - @@ -7826,6 +7911,34 @@

    Reinforcement Learning An Introduction Chapter 11

    +

    11.1 Semi-gradient Methods

    +
    +

    Equation 11.1: Per-step importance sampling ratio

    +
\[
\rho_t \doteq \rho_{t:t} = \frac{\pi(A_t \mid S_t)}{b(A_t \mid S_t)}
\]
    +
    +

    todo

    +

    11.4 Linear Value-function Geometry

    +

    TODO:
    +- [x] 11.11 mu norm equation ✅ 2024-10-01
    +- [x] 11.17 and 11.18 bellman error ✅ 2024-10-01
    +- [ ] 11.19 mean square bellman error

    +
    +

    Equation 11.11: \(\mu\)-norm

    +
\[
||\mathbf{v}||^2_\mu \doteq \sum_{s \in \mathcal{S}} \mu(s) v(s)^2
\]
    +
    +
    +

    Equation 11.17 and 11.18: Bellman error

    +
\[
\begin{align}
\bar{\delta}_{\mathbf{w}}(s) &\doteq \left( \sum_a \pi(a \mid s) \sum_{s', r} p(s', r \mid s, a)[r + \gamma v_{\mathbf{w}}(s')] \right) - v_{\mathbf{w}}(s) \tag{11.17} \\
&= \mathbb{E}_\pi[R_{t+1} + \gamma v_{\mathbf{w}}(S_{t+1}) - v_{\mathbf{w}}(S_{t}) \mid S_t = s, A_t \sim \pi] \tag{11.18}
\end{align}
\]
    +

    11.5 Gradient Descent in the Bellman Error

    Mean-squared temporal difference error

    @@ -7867,7 +7980,7 @@

    11.5 Gradient Descent in the - 2024-10-01 + 2024-10-01 diff --git a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 3/index.html b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 3/index.html index 63611df0..616fefce 100644 --- a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 3/index.html +++ b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 3/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 4/index.html b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 4/index.html index 35cc3722..307920b2 100644 --- a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 4/index.html +++ b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 4/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 5/index.html b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 5/index.html index 6967fe1f..41b1bb85 100644 --- a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 5/index.html +++ b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 5/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 6/index.html b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 6/index.html index a56b5204..e6d9c17f 100644 --- a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 6/index.html +++ b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 6/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 7/index.html b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 7/index.html index 9b49f317..ff76cb47 100644 --- a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 7/index.html +++ b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 7/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 9/index.html b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 9/index.html index 629e1b78..fcfaa312 100644 --- a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 9/index.html +++ b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction - Chapter 9/index.html @@ -7021,6 +7021,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction/index.html b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction/index.html index 32c60574..c9278b08 100644 --- a/100 Reference notes/104 Other/Reinforcement Learning - An Introduction/index.html +++ b/100 Reference notes/104 Other/Reinforcement Learning - An Introduction/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 12/index.html b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 12/index.html index a5732220..6284ac73 100644 --- a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 12/index.html +++ b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 12/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 3/index.html b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 3/index.html index 7d96046f..46089296 100644 --- a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 3/index.html +++ b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 3/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 5/index.html b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 5/index.html index 8d39a9f2..9471c9c3 100644 --- a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 5/index.html +++ b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 5/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 6/index.html b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 6/index.html index 640910b3..9232c77a 100644 --- a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 6/index.html +++ b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing - Lecture 6/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing/index.html b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing/index.html index f4f02f06..30cfdc69 100644 --- a/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing/index.html +++ b/100 Reference notes/104 Other/TinyML and Efficient Deep Learning Computing/index.html @@ -7016,6 +7016,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/100 Reference notes/104 Other/Tweet - Stable Diffusion XL on iPhone with Core ML!/index.html b/100 Reference notes/104 Other/Tweet - Stable Diffusion XL on iPhone with Core ML!/index.html index c0423668..3b28288b 100644 --- a/100 Reference notes/104 Other/Tweet - Stable Diffusion XL on iPhone with Core ML!/index.html +++ b/100 Reference notes/104 Other/Tweet - Stable Diffusion XL on iPhone with Core ML!/index.html @@ -7014,6 +7014,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/404.html b/404.html index b6b59dd5..a3297162 100644 --- a/404.html +++ b/404.html @@ -7006,6 +7006,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/index.html b/index.html index 7f7ffcbd..524dc16d 100644 --- a/index.html +++ b/index.html @@ -7025,6 +7025,27 @@ +
  • + + + + + GPU mode Sparsity + + + + +
  • + + + + + + + + + +
  • diff --git a/search/search_index.json b/search/search_index.json index 78b8deec..f6a45af1 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"

    {{ blog_content }}

    "},{"location":"000%20Zettelkasten/2D%20Convolutions/","title":"2D Convolutions","text":"

    Fully comprehensive resource with animations: Conv2d

    ","tags":["cnn"]},{"location":"000%20Zettelkasten/Ahead-of-Time%20%28AOT%29%20Compilation/","title":"Ahead of Time (AOT) Compilation","text":"

    Generally: Compilation that occurs before the program is executed.

    Specifically to ML (PyTorch): - When a model is AOT compiled (using torch.jit.script(or trace) or torch.export), the entire program is translated from python into an intermediate representation that is independent of it. That is, you don't need a python interpreter to run that IR. - Note: torchscript is AOT in the sense that it requires to capture the whole graph before runtime but it performs further optimizations just-in-time.
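As a rough sketch of the two capture paths mentioned above (assuming a recent PyTorch 2.x install; `TinyNet` is a made-up toy module used only for illustration):

```python
import torch
import torch.nn as nn


class TinyNet(nn.Module):
    """Toy module used only to illustrate the two AOT capture paths."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(8, 4)

    def forward(self, x):
        return torch.relu(self.fc(x))


model = TinyNet().eval()
example = torch.randn(2, 8)

# TorchScript: captures the whole graph ahead of time, but still applies
# some optimizations just-in-time when the scripted module is run.
scripted = torch.jit.script(model)
scripted.save("tiny_net.pt")  # can be loaded and run without the Python source

# torch.export (PyTorch 2.x): produces an ExportedProgram, an IR that does not
# need a Python interpreter to be consumed.
exported = torch.export.export(model, (example,))
print(exported.graph)  # FX graph of the captured program
```

The scripted module can be loaded from a Python-free runtime, while the `ExportedProgram` is the kind of IR consumed by downstream AOT toolchains such as ExecuTorch.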

    ","tags":["compilers","pytorch","optimization"]},{"location":"000%20Zettelkasten/Are%20less%20inductive%20biases%20better%20or%20worse%3F/","title":"Are less inductive biases better or worse?","text":"

There's a general consensus that less inductive biases are better, intuitively because they help optimization and allow for more hardware-friendly architectures, among other reasons.

    First, An image is worth 16x16 words - Transformers for image recognition at scale shows that ViTs, with minimal inductive biases, outperform ConvNets. ViTs have: - No translational equivariance baked in - No locality inductive bias enforced - Although positional encodings exist and fixed sinusoidal encodings can be used, they are mostly learned and randomly/zero initialized. They show that Vision Transformers scale better than ConvNets and Mixed Architectures (Convolutional stems + Transformer).

A ConvNet for the 2020s argues that ResNets are outdated and modernizes the architecture with recent advances to match ViTs' performance.

The Lie derivative for measuring learned equivariance shows a surprising result: ViTs exhibit more translational equivariance after training than ConvNets, as measured by their Lie derivative.

An Image is Worth More Than 16x16 Patches - Exploring Transformers on Individual Pixels tackles the toy question of dropping the convolutional stem that does the patchification in ViTs, with the intention of further reducing inductive biases. They show that the resulting model, although too computationally intensive to be used in practice, competes with ViTs.

How do vision transformers work? argues that the benefit of Vision Transformers is not that they have less inductive biases, but that their operations are input dependent (see Input-dependent convolutions) and that Self Attention acts as a smoothing mechanism (which helps with better training dynamics in large data regimes). They ablate this by constraining ViTs' attention to be local, outperforming ViTs with global attention in both small and large data regimes. This is a strong indication that locality constraints are useful.

    Learning with Unmasked Tokens Drives Stronger Vision Learners implicitly counter-argues How do vision transformers work? by noticing that MIM-trained ViTs exhibit localized attention maps and \"fixing\" it. Their approach outperforms other MIM-trained ViTs, so locality as good inductive bias is not definitely answered.

    ","tags":["dl_theory","question"]},{"location":"000%20Zettelkasten/Are%20less%20inductive%20biases%20better%20or%20worse%3F/#vits-vs-dense-prediction-tasks","title":"ViTs vs Dense prediction tasks","text":"

    A ConvNet for the 2020s mentions that ViTs struggle on dense prediction tasks and they require hierarchical architectural choices (Swin Transformer) to do well. These choices re-introduce inductive biases.

    However, there's recent promising work that is (I think) successfully dropping these constraints: - Exploring Plain Vision Transformer Backbones for Object Detection - SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation

    ","tags":["dl_theory","question"]},{"location":"000%20Zettelkasten/Bit%20Palettization/","title":"Bit Palettization","text":"

    Seems to be similar to K-Means-based Quantization.

    [...] we use 6-bit palettization, a type of quantization that compresses model weights from a 16-bit floating-point representation to just 6 bits per parameter. The name \u201cpalettization\u201d refers to a technique similar to the one used in computer graphics to work with a limited set of colors: the color table (or \u201cpalette\u201d) contains a fixed number of colors, and the colors in the image are replaced with the indexes of the closest colors available in the palette. This immediately provides the benefit of drastically reducing storage size, and thus reducing download time and on-device disk use.
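As a rough illustration of the palettization idea (a toy Lloyd's k-means over a weight matrix in NumPy; this is a sketch of the concept, not the Core ML implementation):

```python
import numpy as np


def palettize(weights: np.ndarray, n_bits: int = 6, n_iters: int = 20):
    """Cluster weights into 2**n_bits centroids (the "palette") and store
    per-weight indices into that palette. Toy Lloyd's k-means."""
    flat = weights.reshape(-1)
    k = 2 ** n_bits
    # Initialize centroids spread over the weight range.
    palette = np.linspace(flat.min(), flat.max(), k)
    for _ in range(n_iters):
        idx = np.abs(flat[:, None] - palette[None, :]).argmin(axis=1)
        for c in range(k):
            members = flat[idx == c]
            if members.size:
                palette[c] = members.mean()
    idx = np.abs(flat[:, None] - palette[None, :]).argmin(axis=1)
    return palette, idx.reshape(weights.shape).astype(np.uint8)


def depalettize(palette: np.ndarray, idx: np.ndarray) -> np.ndarray:
    # Reconstruct approximate weights by looking up each index in the palette.
    return palette[idx]


w = np.random.randn(64, 64).astype(np.float32)
palette, idx = palettize(w, n_bits=6)  # 64 centroids + a 6-bit index per weight
w_hat = depalettize(palette, idx)
print("mean reconstruction error:", np.abs(w - w_hat).mean())
```

Storage drops from 16 bits to roughly 6 bits per parameter (plus the small palette), which is the benefit the quote above describes.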

    References: - https://huggingface.co/blog/stable-diffusion-xl-coreml#what-is-mixed-bit-palettization - https://huggingface.co/blog/fast-diffusers-coreml

Notes: - Multiplying by this weight matrix intuitively should be slower; it would be interesting to see what the speed vs. memory tradeoff is. This tweet Tweet - Stable Diffusion XL on iPhone with Core ML! suggests that it runs faster than the non-quantized alternative.

    ","tags":["efficient_dl","transformers"]},{"location":"000%20Zettelkasten/Block%20Expansion/","title":"Block Expansion","text":"

Key idea: - Introduce an extra transformer block that is initialized to be the identity function and train only that.

    From Parameter Efficient Fine-tuning of Self-supervised ViTs without Catastrophic Forgetting

    We introduce the concept of Block Expansion for fine-tuning pre-trained ViTs, building upon an idea that was recently proposed for language models\u00a0[27]\u00a0but has yet to be explored in vision. This technique is used to augment the capacity of a model without altering its initial output. In a ViT model comprised of sequential transformer blocks\u00a0(\\(\\phi_0,\\phi_1,\u2026,\\phi_N\\)), Block Expansion adds an identity block\u00a0(\\(\\phi_{id}\\))\u00a0after a set of transformer blocks such that\u00a0\\(\\phi_{id}(x)=x\\), meaning it returns the input as its output, ensuring the model\u2019s output remains unchanged immediately after expansion. To expand a model from\u00a0\ud835\udc41\u00a0to\u00a0\ud835\udc41\u2032\u00a0blocks, the original blocks are first grouped into sets containing\u00a0\ud835\udc40\u00a0blocks each. Within each set, an identity copy of the topmost block is created and placed on top, effectively increasing the model\u2019s depth without initially changing its behavior. In each newly expanded block, two linear layers are zero-initialized to enable identity mapping, as shown in Figure\u00a01\u00a0(c). These newly added blocks are only fine-tuned with the new data while the remaining blocks are frozen.

    ","tags":["efficient_dl","transformers"]},{"location":"000%20Zettelkasten/Convergence%20rate%20and%20Hessian%20spectra/","title":"Convergence rate and Hessian spectra","text":"
• Remember: if a Hessian matrix is positive definite everywhere, then the function is convex => negative eigenvalues of the Hessian are a bad sign
• Large eigenvalues of the Hessian
• Metrics for flatness: some metrics, such as the maximum Hessian eigenvalue, measure the worst-case loss increase under an adversarial perturbation to the weights [10, 16], while other proposed metrics, such as the Hessian trace, measure the expected loss increase under random perturbations to the weights (see the power-iteration sketch below for estimating the maximum eigenvalue).
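A hedged sketch of how the maximum Hessian eigenvalue is commonly estimated in practice: power iteration on Hessian-vector products via double backprop (the tiny quadratic loss here is only for illustration):

```python
import torch

# Toy setup: a loss whose Hessian is diag(2, 20, 200), so the top eigenvalue is 200.
w = torch.zeros(3, requires_grad=True)
scales = torch.tensor([1.0, 10.0, 100.0])
loss = (scales * w ** 2).sum()

# First-order gradient, kept in the graph so we can differentiate through it again.
grad = torch.autograd.grad(loss, w, create_graph=True)[0]

v = torch.randn(3)
v /= v.norm()
for _ in range(50):
    # Hessian-vector product: d/dw (grad . v), no explicit Hessian needed.
    hv = torch.autograd.grad(grad, w, grad_outputs=v, retain_graph=True)[0]
    eig = torch.dot(v, hv)  # Rayleigh quotient estimate of the top eigenvalue
    v = hv / hv.norm()

print(eig.item())  # ~200 for this toy loss
```

The Hessian trace can be estimated similarly with Hutchinson's estimator (averaging v·Hv over random vectors v), which is why these flatness metrics are tractable for large networks.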
    ","tags":["optimizability"]},{"location":"000%20Zettelkasten/Depthwise%20separable%20convolutions/","title":"Depthwise separable convolutions","text":"

    Splits the computation into two steps:\u00a0depthwise convolution\u00a0applies a single convolutional filter per each input channel and\u00a0pointwise convolution\u00a0is used to create a linear combination of the output of the depthwise convolution.

    Related ideas are often used to reduce the size/complexity of convolutional layers. It reduces the expressivity of convolutions but uses fewer parameters. For example: Exploiting Redundancy - Separable Group Convolutional Networks on Lie Groups

    Also used in (ConvNext) A ConvNet for the 2020s

    ","tags":["cnn"]},{"location":"000%20Zettelkasten/Do%20Vision%20Foundation%20models%20exist%3F/","title":"Do Vision Foundation models exist?","text":"","tags":["question","foundation_models","computer_vision"]},{"location":"000%20Zettelkasten/Do%20Vision%20Foundation%20models%20exist%3F/#object-detection","title":"Object detection","text":"

    Research using DINOv2 as a backbone for object detection:

    DINOv2 \u274c - Poor Object Detection Performance with DINOv2 Backbone and Faster R-CNN Head on Cityscapes Dataset - Uses a Mask R-CNN head, but still relevant; maybe DINOv2 is not a good object detection backbone?

    DINOv2 \u2705

    \"NVIDIA has also released a foundational model called NV-Dinov2, which is available through the NVIDIA AI Enterprise program. NV-Dinov2 is a visual foundational model trained on an NVIDIA proprietary large scale dataset.\" NV-DINOv2 - NVIDIA provides CLIP VIT and DINO VIT backbones for object detection and segmentation (closed source) - This signals that it is not only possible but actually useful in production (the tao toolkit specifically markets to providing enterprise-ready vision transformers) - However it also very specifically states the inferior performance of vits compared with specifically trained dense-prediction networks: > \"To mitigate the inferior performance of a standard vision transformer (ViT) on dense prediction tasks, TAO supports the\u00a0ViT-Adapter_\u00a0architecture. This allows a powerful ViT that has learned rich semantic representations from a large corpus of data to achieve comparable performance to vision-specific transformers on dense prediction tasks.\"

    • Exploring Plain Vision Transformer Backbones for Object Detection

      • VitDET with DINO backbone gh issue
        • There are some caveats, but they are fixable
    • SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation

      • Improves over ViTDet
    ","tags":["question","foundation_models","computer_vision"]},{"location":"000%20Zettelkasten/Equivariance%20Initialization/","title":"Equivariance Initialization","text":"

    Related: - Priors over Neural Network weights

    ","tags":["dl_theory"]},{"location":"000%20Zettelkasten/Group%20Axioms/","title":"Group Axioms","text":"

    A group is a non-empty set \(G\) together with a binary operation \(\cdot\) on \(G\) that fulfills the following axioms:
    1. Associativity: for all \(a, b, c \in G\), one has \((a \cdot b) \cdot c = a \cdot (b \cdot c)\)
    2. Identity element: there exists an element \(e\in G\) such that, for every \(a \in G\), \(e \cdot a = a\) and \(a \cdot e = a\)
    3. Inverse element: for each \(a\in G\), there exists a unique element \(b\in G\) such that \(a \cdot b = e\) and \(b \cdot a = e\), where \(e\) is the identity element. The inverse of \(a\) is denoted \(a^{-1}\)
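
    A small brute-force sanity check of these axioms for a finite example, the integers modulo n under addition:

    from itertools import product

    def is_group(elements, op):
        elements = list(elements)
        associative = all(op(op(a, b), c) == op(a, op(b, c))
                          for a, b, c in product(elements, repeat=3))
        identities = [e for e in elements
                      if all(op(e, a) == a and op(a, e) == a for a in elements)]
        has_inverses = bool(identities) and all(
            any(op(a, b) == identities[0] and op(b, a) == identities[0] for b in elements)
            for a in elements)
        return associative and bool(identities) and has_inverses

    n = 5
    print(is_group(range(n), lambda a, b: (a + b) % n))  # True: (Z_n, +) is a group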

    ","tags":["math"]},{"location":"000%20Zettelkasten/Group%20direct%20product/","title":"Group direct product","text":"

    Given groups \(G\) (with operation *) and \(H\) (with operation \(\Delta\)), the direct product \(G \times H\) is defined as follows:
    1. The underlying set is the Cartesian product \(G \times H\), that is, the ordered pairs \((g, h)\) where \(g \in G\) and \(h \in H\).
    2. The binary operation on \(G \times H\) is defined component-wise:

    \\[ (g_1, h_1) \\cdot (g_2, h_2) = (g_1 * g_2, h_1 \\Delta h_2) \\]

    The resulting algebraic object satisfies the Group Axioms.

    ","tags":["math"]},{"location":"000%20Zettelkasten/Hardware-specific%20structured%20pruning/","title":"Hardware specific structured pruning","text":"

    Key Idea

    Some GPU architectures can take advantage of specific sparsity patterns.

    According to this, the training procedure would look as follows:

    NVIDIA has developed a simple and universal recipe for sparsifying deep neural networks for inference\u00a0using this 2:4 structured sparsity pattern. The network is first trained using dense weights, then fine-grained structured pruning is applied, and finally the remaining non-zero weights are fine-tuned with additional training steps. This method results in virtually no loss in inferencing accuracy based on evaluation across dozens of networks spanning vision, object detection, segmentation, natural language modeling, and translation.
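
    A minimal sketch of the pruning step: keep the two largest-magnitude weights in every group of four along the input dimension (in practice you would use NVIDIA's tooling, e.g. ASP/TensorRT, and fine-tune afterwards):

    import torch

    def prune_2_of_4(weight: torch.Tensor) -> torch.Tensor:
        """Zero the 2 smallest-magnitude entries in every group of 4 along the last dim."""
        out_features, in_features = weight.shape
        assert in_features % 4 == 0
        w = weight.reshape(out_features, in_features // 4, 4)
        keep = w.abs().topk(2, dim=-1).indices
        mask = torch.zeros_like(w).scatter_(-1, keep, 1.0)
        return (w * mask).reshape(out_features, in_features)

    w = torch.randn(8, 16)
    w_sparse = prune_2_of_4(w)
    assert (w_sparse.reshape(8, -1, 4) != 0).sum(-1).max() <= 2   # at most 2 non-zeros per group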

    References: - TinyML and Efficient Deep Learning Computing - Lecture 3 - https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/ - https://developer.nvidia.com/blog/structured-sparsity-in-the-nvidia-ampere-architecture-and-applications-in-search-engines/

    ","tags":["efficient_dl","hardware_aware_dl"]},{"location":"000%20Zettelkasten/Input-dependent%20convolutions/","title":"Input dependent convolutions","text":"
    • How do vision transformers work? states that the key advantage of Self Attention over Convolutions is not the long range dependencies (global attention) but rather its data specificity (aka input dependency)
    • This is related to Mamba - Linear-Time Sequence Modeling with Selective State Spaces's insight :
      • \"We identify a key limitation of prior models: the ability to efficiently select data in an input-dependent manner (i.e. focus on or ignore particular inputs).\"

    There is most likely existing work on input-dependent convolutions: - [ ] CKConv - Continuous Kernel Convolution For Sequential Data is probably related, but I haven't read it in full. Check this. - [ ] Review literature on input-dependent convolutions

    ","tags":["cnn","theory"]},{"location":"000%20Zettelkasten/K-Means-based%20Quantization/","title":"K Means based Quantization","text":"

    Perform clustering on the weights, and replace the weight matrix with an integer index matrix (indicating which cluster each weight entry belongs to) and a list of float centroids.

    Storing integers consumes less memory, while the float centroids keep full precision (although you still lose precision overall, because a centroid does not necessarily correspond to an actual value in the original weight matrix).
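
    A minimal sketch with scikit-learn (16 clusters, i.e. 4-bit indices):

    import numpy as np
    from sklearn.cluster import KMeans

    def kmeans_quantize(weight: np.ndarray, n_clusters: int = 16):
        km = KMeans(n_clusters=n_clusters, n_init=10).fit(weight.reshape(-1, 1))
        centroids = km.cluster_centers_.ravel()                       # small float codebook
        indices = km.labels_.astype(np.uint8).reshape(weight.shape)   # int index matrix
        return centroids, indices

    def dequantize(centroids, indices):
        return centroids[indices]

    w = np.random.randn(64, 64).astype(np.float32)
    centroids, idx = kmeans_quantize(w)
    print(np.abs(w - dequantize(centroids, idx)).mean())  # average quantization error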

    Resources: - https://scikit-learn.org/stable/auto_examples/cluster/plot_color_quantization.html - TinyML and Efficient Deep Learning Computing - Lecture 5

    ","tags":["efficient_dl"]},{"location":"000%20Zettelkasten/KV%20Cache/","title":"KV Cache","text":"

    From: TinyML and Efficient Deep Learning Computing - Lecture 12

    ","tags":["efficient_dl","transformers"]},{"location":"000%20Zettelkasten/Linear%20Quantization/","title":"Linear Quantization","text":"

    Visualization

    Then, for each layer in your network (linear, conv, etc.), you represent the matrices involved as in the previous formulation, work out which terms can be precomputed or zeroed out, and voil\u00e1.
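
    A minimal sketch of asymmetric linear (affine) quantization of a single tensor, x is approximately scale * (q - zero_point); the per-layer arithmetic mentioned above then amounts to expanding the matrix products in terms of q, scale and zero_point and precomputing the constant terms:

    import numpy as np

    def linear_quantize(x: np.ndarray, n_bits: int = 8):
        qmin, qmax = 0, 2 ** n_bits - 1
        scale = (x.max() - x.min()) / (qmax - qmin)
        zero_point = int(round(qmin - x.min() / scale))
        q = np.clip(np.round(x / scale) + zero_point, qmin, qmax).astype(np.uint8)
        return q, scale, zero_point

    def linear_dequantize(q, scale, zero_point):
        return scale * (q.astype(np.float32) - zero_point)

    x = np.random.randn(4, 4).astype(np.float32)
    q, s, z = linear_quantize(x)
    print(np.abs(x - linear_dequantize(q, s, z)).max())   # small reconstruction error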

    ","tags":["efficient_dl"]},{"location":"000%20Zettelkasten/LoRa%20Adapter/","title":"LoRa Adapter","text":"

    Image source: https://medium.com/@bnjmn_marie/lora-load-and-merge-your-adapters-with-care-3204119f0426

    ","tags":["efficient_dl","transformers"]},{"location":"000%20Zettelkasten/Masked%20Image%20Modelling/","title":"Masked Image Modelling","text":"

    It seems like MIM is becoming a strong learning objective for vision foundation models. Right now it seems to be the closest answer to: Do Vision Foundation models exist?

    However, intuitively it seems like a somewhat weak signal, as it focuses on individual patches/pixels without much consideration of semantic information. This is echoed in Learning with Unmasked Tokens Drives Stronger Vision Learners:

    However, MIM strategies often encounter challenges, such as local dependency on attention to understand entire context of an image. For example, liu\u00a0et al.\u00a0[36]\u00a0revealed that MAE\u00a0[22], a state-of-the-art MIM method, exhibits shorter average attention distances. Furthermore, we observe that attention map patterns by MAE substantiate extremely local behavior (See Fig.\u00a01) indeed. In other words, the MAE-trained attention mechanism less integrates information across the entire image pixels and tends to focus on specific input regions. This is presumably attributed to MIM-pretraining, primarily dedicated to predicting low-level pixel details (e.g., color or texture) without a comprehensive understanding of less-regional information (e.g., the input structure or shape).

    Related papers: - Learning with Unmasked Tokens Drives Stronger Vision Learners - DINOv2 - Learning Robust Visual Features without Supervision - What Do Self-Supervised Vision Transformers Learn? \ud83d\udea8

    ","tags":["foundation_models","computer_vision"]},{"location":"000%20Zettelkasten/Maximal%20pruning%20and%20functional%20recovery/","title":"Maximal pruning and functional recovery","text":"

    Key Idea

    You can iteratively prune and finetune the network weights and still maintain performance up to some pruning ratio.
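
    A minimal sketch of the iterative prune-and-finetune loop using torch.nn.utils.prune (train_one_epoch is an assumed placeholder for your own fine-tuning routine):

    import torch
    import torch.nn.utils.prune as prune

    def iterative_prune_and_finetune(model, train_one_epoch, sparsity_steps=(0.5, 0.7, 0.9)):
        to_prune = [(m, "weight") for m in model.modules()
                    if isinstance(m, (torch.nn.Linear, torch.nn.Conv2d))]
        for amount in sparsity_steps:
            # Global magnitude pruning: zero the smallest-magnitude weights.
            prune.global_unstructured(to_prune, pruning_method=prune.L1Unstructured, amount=amount)
            train_one_epoch(model)        # fine-tune with the pruning masks applied
        for module, name in to_prune:
            prune.remove(module, name)    # bake the masks into the weights
        return model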

    Reference: - TinyML and Efficient Deep Learning Computing - Lecture 3 - Learning both Weights and Connections for Efficient Neural Networks

    ","tags":["dl_theory","efficient_dl"]},{"location":"000%20Zettelkasten/Mean%20Attention%20Distance/","title":"Mean Attention Distance","text":"

    Introduced in An image is worth 16x16 words - Transformers for image recognition at scale.

    From What Do Self-Supervised Vision Transformers Learn?

    \u201cAttention distance is defined as the average distance between the query tokens and key tokens considering their self-attention weights. Therefore, it conceptually corresponds to the size of the receptive fields in CNNs.\u201d (Park et al., 2023, p. 3)

    Key Observation

    Can be used to measure how much local vs. global information a transformer is using (a minimal computation sketch follows below). See What Do Self-Supervised Vision Transformers Learn?.
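
    A minimal sketch of the computation for one image, assuming attn holds post-softmax attention weights of shape (heads, N, N) over N = grid_size**2 patch tokens (CLS token removed):

    import torch

    def mean_attention_distance(attn: torch.Tensor, grid_size: int, patch_size: int = 16) -> torch.Tensor:
        coords = torch.stack(torch.meshgrid(torch.arange(grid_size), torch.arange(grid_size),
                                            indexing="ij"), dim=-1)
        coords = coords.reshape(-1, 2).float() * patch_size   # patch positions in pixels, (N, 2)
        dist = torch.cdist(coords, coords)                    # pairwise query-key distances, (N, N)
        # Weight each query-key distance by its attention weight, then average over queries.
        return (attn * dist).sum(dim=-1).mean(dim=-1)         # one value per head, in pixels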

    ","tags":["dl_theory","transformers"]},{"location":"000%20Zettelkasten/Multiple%20global%20minima/","title":"Multiple global minima","text":"

    We expect loss functions for deep networks to have a large family of equivalent global minima.

    • Fully connected networks: permutation of the hidden units
    • Convolutional networks: permuting the channels and convolution kernels appropriately.
    • ...

    The above modifications all produce the same output for every input. However, the global minimum only depends on the output at the training data points.

    In overparameterized networks, there will also be families of solutions that behave identically at the data points but differently between them. All of these are also global minima.
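
    A quick check of the first bullet above, permuting the hidden units of a small MLP and verifying that the outputs are unchanged:

    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    mlp = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
    x = torch.randn(32, 8)
    y = mlp(x)

    perm = torch.randperm(16)
    with torch.no_grad():
        mlp[0].weight.copy_(mlp[0].weight[perm])      # permute rows of the first layer
        mlp[0].bias.copy_(mlp[0].bias[perm])
        mlp[2].weight.copy_(mlp[2].weight[:, perm])   # permute columns of the second layer

    print(torch.allclose(y, mlp(x), atol=1e-6))       # True: different weights, same function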

    References: - Understanding Deep Learning - Chapter 20 (20.3.1)

    ","tags":["optimizability","dl_theory"]},{"location":"000%20Zettelkasten/Neural%20Network%20Quantization/","title":"Neural Network Quantization","text":"

    Related: - HuggingFace Docs - A survey of quantization methods for efficient neural network inference - A recent (2024) work by Han et al: AWQ - Activation-aware Weight Quantization for LLM Compression and Acceleration

    ","tags":["quantization","efficient_dl"]},{"location":"000%20Zettelkasten/Non-translationally%20equivariant%20convolutions/","title":"Non translationally equivariant convolutions","text":"

    I'm not sure if this makes sense at all, just tracking paper ideas lmao

    See: - Input-dependent convolutions - How do vision transformers work?

    ","tags":["cnn","convolutions","equivariance","partial_equivariance"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/","title":"Positive Logic Programs","text":"","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/#positive-logic-programs","title":"Positive logic programs","text":"

    Two components:
    1. Facts, e.g. a.
    2. Rules: a :- b, c, d, which is the same as b \u2227 c \u2227 d \u2192 a

    This is a positive logic program:

    rainy(amsterdam).\nrainy(vienna).\nwet(X) :- rainy(X). % eq: \u2200x. (Rainy(x) \u2192 Wet(x))\n

    ","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/#database-semantics","title":"Database semantics","text":"

    Assumptions:
    1. Domain closure: the objects mentioned are the only objects.
    2. Unique-names assumption: two distinct names can't refer to the same object.
    3. Closed-world assumption: whatever we don't know to be true is false.

    What does the database semantics allow us to do?
    1. We can specify a relation by the set of inputs that are true
    2. We can specify objects simply by the terms that point to them
    3. We don't have to explicitly define what function symbols mean

    Thus, an interpretation is a set that defines which atoms are true. The remainder are false.

    ","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/#models","title":"Models","text":"What is a model?

    A model is an interpretation which makes all rules of a program true.

    However, we're not interested in all models; we want the highest expressivity at the lowest amount of information.

    What is the definition of a minimal model?

    A model is minimal if no strict subset of it is also a model.

    How do you construct a minimal model?

    Start with the facts and repeatedly add literals that appear as the head (lhs) of a rule whose entire body is already in M.

    M = {f for f in facts}\nchanged = True\nwhile changed:\n    changed = False\n    for head, body in rules:\n        if head not in M and all(l in M for l in body):\n            M.add(head)\n            changed = True\n

    What is the definition of a supported model?

    A model is supported if all its atoms are supported. An atom of a model is supported if it appears as the head of a rule whose body is true.

    What properties do minimal models and supported models have for positive logic programs?

    For positive logic programs:

    • Minimal models are unique
    • A minimal model is also a supported model (but not necessarily vice versa)
    ","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/#normal-logic-programs","title":"Normal logic programs","text":"

    Now we allow negation.

    a :- b_1, ..., b_n, not c_1, ..., not c_m.\n
    Do properties of minimal models for PL still hold for NL? Why?

    No, negation allows for non-uniqueness of minimal models.

    ","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Priors%20over%20Neural%20Network%20weights/","title":"Priors over Neural Network weights","text":"

    From Understanding Deep Learning - Chapter 10: 1D convolutions can be represented as MLP weight matrices with a specific prior where the entries along each diagonal are the same (d).
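
    A small numpy check of that claim: a 1D convolution (kernel size 3, no padding) written as a fully connected weight matrix whose diagonals repeat the kernel values:

    import numpy as np

    kernel = np.array([1.0, -2.0, 0.5])
    n_in, k = 8, len(kernel)
    W = np.zeros((n_in - k + 1, n_in))
    for i in range(n_in - k + 1):
        W[i, i:i + k] = kernel          # same values repeated along the diagonals

    x = np.random.randn(n_in)
    # Cross-correlation (the deep learning "convolution") agrees with W @ x.
    print(np.allclose(W @ x, np.convolve(x, kernel[::-1], mode="valid")))  # True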

    Rotationally equivariant convolutions can be implemented by isotropic filters (a prior on the conv2d weight):

    ","tags":["dl_theory","equivariance"]},{"location":"000%20Zettelkasten/PyTorch%20Functionalization/","title":"PyTorch Functionalization","text":"

    Given a program/function of PyTorch operators, functionalization will return a new function that:
    1. Has the same semantics as the old function
    2. Has no mutations in it

    Functionalization operates at the level of our ATen API.
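
    A hand-written illustration of the kind of rewrite functionalization performs; the real pass operates on ATen-level traces rather than Python source:

    import torch

    def with_mutations(x: torch.Tensor) -> torch.Tensor:
        tmp = torch.zeros_like(x)
        tmp.add_(x)            # in-place mutation
        tmp.mul_(2)            # in-place mutation
        return tmp

    def functionalized(x: torch.Tensor) -> torch.Tensor:
        # Same semantics, with every mutation replaced by its out-of-place variant.
        tmp = torch.zeros_like(x)
        tmp = tmp.add(x)
        tmp = tmp.mul(2)
        return tmp

    x = torch.randn(3)
    print(torch.allclose(with_mutations(x), functionalized(x)))  # True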

    More info on PyTorch - Functionalization in PyTorch - Everything you need to know

    ","tags":["pytorch","compilers"]},{"location":"000%20Zettelkasten/PyTorch%20Quantization%20for%20TensorRT/","title":"PyTorch Quantization for TensorRT","text":"

    There seem to be quite a few possible ways to do this:
    • PyTorch Eager Mode Quantization TensorRT Acceleration, which seems a bit cumbersome:
      1. torchao quantization
      2. ONNX conversion
      3. Graph surgery (changing some ops in the ONNX graph)
      4. TensorRT conversion
    • Not sure if it works, but this would be ideal:
      1. torch.export
      2. torchao quantization
      3. TensorRT conversion
    • Less ideal would be:
      1. torchao quantization
      2. torch.export
      3. TensorRT conversion
      • I've already sort of tried this using the VGG PTQ example from TensorRT, but torch.export complained that it couldn't translate the quantized operations

    ","tags":["quantization","efficient_dl"]},{"location":"000%20Zettelkasten/Representation%20%28Group%20Theory%29/","title":"Representation (Group Theory)","text":"

    Property required:

    \\[ p(g)p(h) = p(g \\cdot h) \\]

    A representation of a group element can be a linear operator, e.g. for a rotation by angle \(\theta\):

    \[ p(\theta) = \begin{bmatrix} \cos(\theta) & -\sin(\theta) \\\\ \sin(\theta) & \cos(\theta) \end{bmatrix} \]","tags":["math","group_theory"]},{"location":"000%20Zettelkasten/Residual%20stream/","title":"Residual stream","text":"

    \"A transformer\u00a0starts with a token embedding, followed by a series of \u201cresidual blocks\u201d, and finally a token unembedding. Each residual block consists of an attention layer, followed by an MLP layer. Both the attention and MLP layers each \u201cread\u201d their input from the residual stream (by performing a linear projection), and then \u201cwrite\u201d their result to the residual stream by adding a linear projection back in.\u00a0Each attention layer consists of multiple heads, which operate in parallel.\" A Mathematical Framework for Transformer Circuits

    ","tags":["mechinterp","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/A%20Brief%20Review%20of%20Hypernetworks%20in%20Deep%20Learning/","title":"A Brief Review of Hypernetworks in Deep Learning","text":"Properties authors Vinod Kumar Chahuan, Jiandong Zhou, Ping Lu, Soheila Molaei, David A. Clifton year 2023 url https://arxiv.org/abs/2306.06955

    Abstract

    Hypernetworks, or hypernets in short, are neural networks that generate weights for another neural network, known as the target network. They have emerged as a powerful deep learning technique that allows for greater flexibility, adaptability, dynamism, faster training, information sharing, and model compression etc. Hypernets have shown promising results in a variety of deep learning problems, including continual learning, causal inference, transfer learning, weight pruning, uncertainty quantification, zero-shot learning, natural language processing, and reinforcement learning etc. Despite their success across different problem settings, currently, there is no review available to inform the researchers about the developments and to help in utilizing hypernets. To fill this gap, we review the progress in hypernets. We present an illustrative example to train deep neural networks using hypernets and propose categorizing hypernets based on five design criteria as inputs, outputs, variability of inputs and outputs, and architecture of hypernets. We also review applications of hypernets across different deep learning problem settings, followed by a discussion of general scenarios where hypernets can be effectively employed. Finally, we discuss the challenges and future directions that remain under-explored in the field of hypernets. We believe that hypernetworks have the potential to revolutionize the field of deep learning. They offer a new way to design and train neural networks, and they have the potential to improve the performance of deep learning models on a variety of tasks. Through this review, we aim to inspire further advancements in deep learning through hypernetworks.

    ","tags":["paper","hypernetworks"]},{"location":"100%20Reference%20notes/101%20Literature/A%20ConvNet%20for%20the%202020s/","title":"A ConvNet for the 2020s","text":"Properties authors Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie year 2022 url https://arxiv.org/abs/2201.03545

    Abstract

    The \"Roaring 20s\" of visual recognition began with the introduction of Vision Transformers (ViTs), which quickly superseded ConvNets as the state-of-the-art image classification model. A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks. However, the effectiveness of such hybrid approaches is still largely credited to the intrinsic superiority of Transformers, rather than the inherent inductive biases of convolutions. In this work, we reexamine the design spaces and test the limits of what a pure ConvNet can achieve. We gradually \"modernize\" a standard ResNet toward the design of a vision Transformer, and discover several key components that contribute to the performance difference along the way. The outcome of this exploration is a family of pure ConvNet models dubbed ConvNeXt. Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets.

    ","tags":["cnn","foundation_models","computer_vision","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/A%20ConvNet%20for%20the%202020s/#notes","title":"Notes","text":"

    Authors modernize ConvNets with SOTA architectural choices and training recipes to achieve SOTA ViT performance on dense prediction tasks (Object Detection, etc).

    Important limitation: scaling laws for ConvNeXt are not shown to be as good as those of ViTs, although the authors mention that they are promising:

    These findings are encouraging but not yet completely convincing \u2014 our exploration thus far has been limited to a small scale, but vision Transformers\u2019 scaling behavior is what truly distinguishes them.

    Table 1. Classification accuracy on ImageNet-1K. Similar to Transformers, ConvNeXt also shows promising scaling behavior with higher-capacity models and a larger (pre-training) dataset.

    • What are the follow-ups to this paper regarding scaling laws of modern ConvNets compared to ViTs?

    One of the main motivations of this paper is that ViTs were not very good at dense prediction tasks such as object detection:

    A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks.

    ","tags":["cnn","foundation_models","computer_vision","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/A%20Hierarchy%20of%20Graph%20Neural%20Networks%20Based%20on%20Learnable%20Local%20Features/","title":"A Hierarchy of Graph Neural Networks Based on Learnable Local Features","text":"Properties authors Michael Linghzhi Li, Meng Dong, Jiawei Zhou, Alexander M. Rush year 2019 url https://arxiv.org/abs/1911.05256

    Abstract

    Graph neural networks (GNNs) are a powerful tool to learn representations on graphs by iteratively aggregating features from node neighbourhoods. Many variant models have been proposed, but there is limited understanding on both how to compare different architectures and how to construct GNNs systematically. Here, we propose a hierarchy of GNNs based on their aggregation regions. We derive theoretical results about the discriminative power and feature representation capabilities of each class. Then, we show how this framework can be utilized to systematically construct arbitrarily powerful GNNs. As an example, we construct a simple architecture that exceeds the expressiveness of the Weisfeiler-Lehman graph isomorphism test. We empirically validate our theory on both synthetic and real-world benchmarks, and demonstrate our example's theoretical power translates to strong results on node classification, graph classification, and graph regression tasks.

    Interesting insight: - \u201cUsing this hierarchy, we can derive theoretical results which provide insight into GNNs. For example, we show that no matter how many layers are added, networks which only aggregate over immediate neighbors cannot learn the number of triangles in a node\u2019s neighbourhood\u201d (Li et al., 2019, p. 1)

    HOWEVER: - you can bypass this by encoding geometric information like position and orientation, see Fast, Expressive SE(n) Equivariant Networks through Weight-Sharing in Position-Orientation Space slides


    ","tags":["gcn","graphs","gnn","paper"]},{"location":"100%20Reference%20notes/101%20Literature/A%20Mathematical%20Framework%20for%20Transformer%20Circuits/","title":"A Mathematical Framework for Transformer Circuits","text":"Properties authors Nelson Elhage, Neel Nanda, Catherine Olsson, Tom Henighan, Nicholas Joseph, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Nova DasSarma, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, Christopher Olah year 2021 url https://transformer-circuits.pub/2021/framework/index.html

    Abstract

    Transformer [1] language models are an emerging technology that is gaining increasingly broad real-world use, for example in systems like GPT-3 [2], LaMDA\u00a0[3], Codex\u00a0[4], Meena\u00a0[5], Gopher\u00a0[6], and similar models. \u00a0However, as these models scale, their open-endedness and high capacity creates an increasing scope for unexpected and sometimes harmful behaviors. \u00a0Even years after a large model is trained, both creators and users routinely discover model capabilities \u2013 including problematic behaviors \u2013 they were previously unaware of.

    One avenue for addressing these issues is\u00a0mechanistic interpretability, attempting to reverse engineer the detailed computations performed by transformers, similar to how a programmer might try to reverse engineer complicated binaries into human-readable source code. \u00a0If this were possible, it could potentially provide a more systematic approach to explaining current safety problems, identifying new ones, and perhaps even anticipating the safety problems of powerful future models that have not yet been built. \u00a0A previous project, the\u00a0Distill\u00a0Circuits\u00a0thread\u00a0[7], has attempted to reverse engineer vision models, but so far there hasn\u2019t been a comparable project for transformers or language models.

    In this paper, we attempt to take initial, very preliminary steps towards reverse-engineering transformers. \u00a0Given the incredible complexity and size of modern language models, we have found it most fruitful to start with the simplest possible models and work our way up from there. \u00a0Our aim is to discover simple algorithmic patterns, motifs, or frameworks that can subsequently be applied to larger and more complex models. \u00a0Specifically, in this paper we will study\u00a0transformers with two layers or less which have only attention blocks\u00a0\u2013 this is in contrast to a large, modern transformer like GPT-3, which has 96 layers and alternates attention blocks with MLP blocks.

    We find that by conceptualizing the operation of transformers in a new but mathematically equivalent way, we are able to make sense of these small models and gain significant understanding of how they operate internally. \u00a0Of particular note, we find that specific attention heads that we term \u201cinduction heads\u201d can explain in-context learning in these small models, and that these heads only develop in models with at least two attention layers. \u00a0We also go through some examples of these heads operating in action on specific data.

    We don\u2019t attempt to apply to our insights to larger models in this first paper, but in a\u00a0forthcoming paper, we will show that both our mathematical framework for understanding transformers, and the concept of induction heads, continues to be at least partially relevant for much larger and more realistic models \u2013 though we remain a very long way from being able to fully reverse engineer such models.

    ","tags":["paper","mechinterp","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/A%20general%20theory%20of%20correct%2C%20incorrect%2C%20and%20extrinsic%20equivariance/","title":"A general theory of correct, incorrect, and extrinsic equivariance","text":"Properties authors Dian Wang, Xupeng Zhu, Jung Yeon Park, Mingxi Jia, Guanang Su, Robert Platt, Robin Walters year 2024 url https://proceedings.neurips.cc/paper_files/paper/2023/hash/7dc7793c89b93887e126a86f22ef63c6-Abstract-Conference.html

    Abstract

    Although equivariant machine learning has proven effective at many tasks, success depends heavily on the assumption that the ground truth function is symmetric over the entire domain matching the symmetry in an equivariant neural network. A missing piece in the equivariant learning literature is the analysis of equivariant networks when symmetry exists only partially in the domain. In this work, we present a general theory for such a situation. We propose pointwise definitions of correct, incorrect, and extrinsic equivariance, which allow us to quantify continuously the degree of each type of equivariance a function displays. We then study the impact of various degrees of incorrect or extrinsic symmetry on model error. We prove error lower bounds for invariant or equivariant networks in classification or regression settings with partially incorrect symmetry. We also analyze the potentially harmful effects of extrinsic equivariance. Experiments validate these results in three different environments.

    ","tags":["equivariance","relaxed_equivariance","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/A%20survey%20of%20quantization%20methods%20for%20efficient%20neural%20network%20inference/","title":"A survey of quantization methods for efficient neural network inference","text":"Properties authors Amir Gholami, Sehoon Kim, Zhen Dong, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer year 2021 url https://arxiv.org/abs/2103.13630

    Abstract

    As soon as abstract mathematical computations were adapted to computation on digital computers, the problem of efficient representation, manipulation, and communication of the numerical values in those computations arose. Strongly related to the problem of numerical representation is the problem of quantization: in what manner should a set of continuous real-valued numbers be distributed over a fixed discrete set of numbers to minimize the number of bits required and also to maximize the accuracy of the attendant computations? This perennial problem of quantization is particularly relevant whenever memory and/or computational resources are severely restricted, and it has come to the forefront in recent years due to the remarkable performance of Neural Network models in computer vision, natural language processing, and related areas. Moving from floating-point representations to low-precision fixed integer values represented in four bits or less holds the potential to reduce the memory footprint and latency by a factor of 16x; and, in fact, reductions of 4x to 8x are often realized in practice in these applications. Thus, it is not surprising that quantization has emerged recently as an important and very active sub-area of research in the efficient implementation of computations associated with Neural Networks. In this article, we survey approaches to the problem of quantizing the numerical values in deep Neural Network computations, covering the advantages/disadvantages of current methods. With this survey and its organization, we hope to have presented a useful snapshot of the current research in quantization for Neural Networks and to have given an intelligent organization to ease the evaluation of future research in this area.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/AWQ%20-%20Activation-aware%20Weight%20Quantization%20for%20LLM%20Compression%20and%20Acceleration/","title":"AWQ Activation aware Weight Quantization for LLM Compression and Acceleration","text":"Properties authors Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, Song Han year 2023 url https://arxiv.org/abs/2306.00978

    Abstract

    Large language models (LLMs) have fundamentally transformed the capabilities of numerous applications, from natural language processing to more intricate domain-specific tasks in robotics and autonomous driving. Moreover, the importance of on-device LLMs has grown significantly in the recent years. Running LLMs on edge devices not only promises reduced latency and improved user experience but also aligns with the increasing need for user privacy, as data processing can occur locally. However, the astronomical model sizes of modern LLMs and constraints of the edge devices, primarily in terms of memory size and bandwidth, pose significant deployment challenges. In this paper, we propose Activation-aware Weight Quantization (AWQ), a hardware-friendly approach for LLM low-bit weight-only quantization. Our method is based on the observation that weights are not equally important: protecting only 1% of salient weights can greatly reduce quantization error. We then propose to search for the optimal per-channel scaling that protects the salient weights by observing the activation, not weights. AWQ does not rely on any backpropagation or reconstruction, so it can well preserve LLMs' generalization ability on different domains and modalities, without overfitting to the calibration set. AWQ outperforms existing work on various language modeling and domain-specific benchmarks (coding and math). Thanks to better generalization, it achieves excellent quantization performance for instruction-tuned LMs and, for the first time, multi-modal LMs. Alongside AWQ, we implement TinyChat, an efficient and flexible inference framework tailored for on-device LLM/VLMs, offering more than 3x speedup over the Huggingface FP16 implementation on both desktop and mobile GPUs. It also democratizes the deployment of the 70B Llama-2 model on mobile GPUs.

    ","tags":["paper","efficient_dl","quantization"]},{"location":"100%20Reference%20notes/101%20Literature/Adapting%20Vision%20Foundation%20Models%20for%20Plant%20Phenotyping/","title":"Adapting Vision Foundation Models for Plant Phenotyping","text":"Properties authors Feng Chen, Mario Valerio Giuffrida, Sotirios A. Tsaftaris year 2023 url https://openaccess.thecvf.com/content/ICCV2023W/CVPPA/html/Chen_Adapting_Vision_Foundation_Models_for_Plant_Phenotyping_ICCVW_2023_paper.html

    Abstract

    Foundation models are large models pre-trained on tremendous amount of data. They can be typically adapted to diverse downstream tasks with minimal effort. However, as foundation models are usually pre-trained on images or texts sourced from the Internet, their performance in specialized domains, such as plant phenotyping, comes into question. In addition, fully fine-tuning foundation models is time-consuming and requires high computational power. This paper investigates the efficient adaptation of foundation models for plant phenotyping settings and tasks. We perform extensive experiments on fine-tuning three foundation models, MAE, DINO, and DINOv2 on three essential plant phenotyping tasks: leaf counting, instance segmentation, and disease classification. In particular, the pre-trained backbones are kept frozen, while two distinct fine-tuning methods are evaluated, namely adapter tuning (using LoRA) and decoder tuning. The experimental results show that a foundation model can be efficiently adapted to multiple plant phenotyping tasks, yielding similar performance as the state-of-the-art (SoTA) models specifically designed or trained for each task. Despite exhibiting great transferability over different tasks, the fine-tuned foundation models perform slightly worse than the SoTA task-specific models in some scenarios, which requires further investigation.

    ","tags":["paper","peft","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/Adapting%20Vision%20Foundation%20Models%20for%20Plant%20Phenotyping/#notes","title":"Notes","text":"

    Motivation / Problem

    Foundation models struggle with specialized data (e.g., plant phenotyping, cancer prediction)

    Research question

    Which efficient fine-tuning technique is most promising for adapting foundation models (MAE, DINO, DINOv2) in specialized data?

    Methods

    Benchmarked fine-tuning methods include decoder fine-tuning (aka linear probing) and adapter tuning (linear probing + LoRa)

    Results

    1. LoRa consistently beats DT
    2. VFMs w/ LoRa are often competitive with fully-trained/finetuned SOTA models
    3. It's not clear that one vfm beats another, each model (DINO, DINOv2, MAE) have metrics and tasks where they shine
    4. LoRa can help dampen issues of data scarcity, domain shifts and class imbalance
    ","tags":["paper","peft","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/An%20Image%20is%20Worth%20More%20Than%2016x16%20Patches%20-%20Exploring%20Transformers%20on%20Individual%20Pixels/","title":"An Image is Worth More Than 16x16 Patches Exploring Transformers on Individual Pixels","text":"Properties authors Duy-Kien Nguyen, Mahmoud Assran, Unnat Jain, Martin R. Oswald, Cees G. M. Snoek, Xinlei Chen year 2024 url https://arxiv.org/abs/2406.09415v1

    Abstract

    This work does not introduce a new method. Instead, we present an interesting finding that questions the necessity of the inductive bias -- locality in modern computer vision architectures. Concretely, we find that vanilla Transformers can operate by directly treating each individual pixel as a token and achieve highly performant results. This is substantially different from the popular design in Vision Transformer, which maintains the inductive bias from ConvNets towards local neighborhoods (e.g. by treating each 16x16 patch as a token). We mainly showcase the effectiveness of pixels-as-tokens across three well-studied tasks in computer vision: supervised learning for object classification, self-supervised learning via masked autoencoding, and image generation with diffusion models. Although directly operating on individual pixels is less computationally practical, we believe the community must be aware of this surprising piece of knowledge when devising the next generation of neural architectures for computer vision.

    Comments: - Seems to contradict How do vision transformers work? in its position that inductive biases do improve ViTs. - [ ] Might be useful to check this.

    ","tags":["paper","dl_theory","vit"]},{"location":"100%20Reference%20notes/101%20Literature/An%20Investigation%20into%20Neural%20Net%20Optimization%20via%20Hessian%20Eigenvalue%20Density/","title":"An Investigation into Neural Net Optimization via Hessian Eigenvalue Density","text":"Properties authors Behrooz Ghorbani, Shankar Krishnan, Ying Xiao

    Abstract

    To understand the dynamics of optimization in deep neural networks, we develop a tool to study the evolution of the entire Hessian spectrum throughout the optimization process. Using this, we study a number of hypotheses concerning smoothness, curvature, and sharpness in the deep learning literature. We then thoroughly analyze a crucial structural feature of the spectra: in nonbatch normalized networks, we observe the rapid appearance of large isolated eigenvalues in the spectrum, along with a surprising concentration of the gradient in the corresponding eigenspaces. In batch normalized networks, these two effects are almost absent. We characterize these effects, and explain how they affect optimization speed through both theory and experiments. As part of this work, we adapt advanced tools from numerical linear algebra that allow scalable and accurate estimation of the entire Hessian spectrum of ImageNet-scale neural networks; this technique may be of independent interest in other applications

    ","tags":["dl_theory","optimizability","optimization","paper"]},{"location":"100%20Reference%20notes/101%20Literature/An%20image%20is%20worth%2016x16%20words%20-%20Transformers%20for%20image%20recognition%20at%20scale/","title":"An image is worth 16x16 words Transformers for image recognition at scale","text":"Properties authors Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby url https://arxiv.org/abs/2010.11929 year 2020

    Abstract

    While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.

    ","tags":["vit","transformers","paper"]},{"location":"100%20Reference%20notes/101%20Literature/An%20image%20is%20worth%2016x16%20words%20-%20Transformers%20for%20image%20recognition%20at%20scale/#notes","title":"Notes","text":"","tags":["vit","transformers","paper"]},{"location":"100%20Reference%20notes/101%20Literature/An%20image%20is%20worth%2016x16%20words%20-%20Transformers%20for%20image%20recognition%20at%20scale/#regarding-inductive-biases","title":"Regarding inductive biases","text":"

    Inductive bias. We note that Vision Transformer has much less image-specific inductive bias than CNNs. In CNNs, locality, two-dimensional neighborhood structure, and translation equivariance are baked into each layer throughout the whole model. In ViT, only MLP layers are local and translationally equivariant, while the self-attention layers are global. The two-dimensional neighborhood structure is used very sparingly: in the beginning of the model by cutting the image into patches and at fine-tuning time for adjusting the position embeddings for images of different resolution (as described below). Other than that, the position embeddings at initialization time carry no information about the 2D positions of the patches and all spatial relations between the patches have to be learned from scratch.

    Interesting insight about Hybrid ViTs (40 conv layers + transformer blocks): - They are better in small data regimes but show no improvement in large data regimes.

    ","tags":["vit","transformers","paper"]},{"location":"100%20Reference%20notes/101%20Literature/Apple%20Intelligence%20Foundation%20Language%20Models/","title":"Apple Intelligence Foundation Language Models","text":"Properties authors Tom Gunter, Zirui Wang, Chong Wang, Ruoming Pang, Andy Narayanan, Aonan Zhang, Bowen Zhang, Chen Chen, Chung-Cheng Chiu, David Qiu, Deepak Gopinath, Dian Ang Yap, Dong Yin, Feng Nan, Floris Weers, Guoli Yin, Haoshuo Huang, Jianyu Wang, Jiarui Lu, John Peebles, Ke Ye, Mark Lee, Nan Du, Qibin Chen, Quentin Keunebroek, Sam Wiseman, Syd Evans, Tao Lei, Vivek Rathod, Xiang Kong, Xianzhi Du, Yanghao Li, Yongqiang Wang, Yuan Gao, Zaid Ahmed, Zhaoyang Xu, Zhiyun Lu, Al Rashid, Albin Madappally Jose, Alec Doane, Alfredo Bencomo, Allison Vanderby, Andrew Hansen, Ankur Jain, Anupama Mann Anupama, Areeba Kamal, Bugu Wu, Carolina Brum, Charlie Maalouf, Chinguun Erdenebileg, Chris Dulhanty, Dominik Moritz, Doug Kang, Eduardo Jimenez, Evan Ladd, Fangping Shi, Felix Bai, Frank Chu, Fred Hohman, Hadas Kotek, Hannah Gillis Coleman, Jane Li, Jeffrey Bigham, Jeffery Cao, Jeff Lai, Jessica Cheung, Jiulong Shan, Joe Zhou, John Li, Jun Qin, Karanjeet Singh, Karla Vega, Kelvin Zou, Laura Heckman, Lauren Gardiner, Margit Bowler, Maria Cordell, Meng Cao, Nicole Hay, Nilesh Shahdadpuri, Otto Godwin, Pranay Dighe, Pushyami Rachapudi, Ramsey Tantawi, Roman Frigg, Sam Davarnia, Sanskruti Shah, Saptarshi Guha, Sasha Sirovica, Shen Ma, Shuang Ma, Simon Wang, Sulgi Kim, Suma Jayaram, Vaishaal Shankar, Varsha Paidi, Vivek Kumar, Xin Wang, Xin Zheng, Walker Cheng , Yael Shrager, Yang Ye, Yasu Tanaka, Yihao Guo, Yunsong Meng, Zhao Tang Luo, Zhi Ouyang, Alp Aygar, Alvin Wan, Andrew Walkingshaw, Andy Narayanan, Antonie Lin, Arsalan Farooq, Brent Ramerth, Colorado Reed, Chris Bartels, Chris Chaney, David Riazati, Eric Liang Yang, Erin Feldman, Gabriel Hochstrasser, Guillaume Seguin, Irina Belousova, Joris Pelemans, Karen Yang, Keivan Alizadeh Vahid, Liangliang Cao, Mahyar Najibi, Marco Zuliani, Max Horton, Minsik Cho, Nikhil Bhendawade, Patrick Dong, Piotr Maj, Pulkit Agrawal, Qi Shan, Qichen Fu, Regan Poston, Sam Xu, Shuangning Liu, Sushma Rao, Tashweena Heeramun, Thomas Merth, Uday Rayala, Victor Cui, Vivek Rangarajan Sridhar, Wencong Zhang, Wenqi Zhang, Wentao Wu, Xingyu Zhou, Xinwen Liu, Yang Zhao, Yin Xia, Zhile Ren, Zhongzheng Ren year 2024 url https://arxiv.org/abs/2407.21075

    Abstract

    We present foundation language models developed to power Apple Intelligence features, including a ~3 billion parameter model designed to run efficiently on devices and a large server-based language model designed for Private Cloud Compute. These models are designed to perform a wide range of tasks efficiently, accurately, and responsibly. This report describes the model architecture, the data used to train the model, the training process, how the models are optimized for inference, and the evaluation results. We highlight our focus on Responsible AI and how the principles are applied throughout the model development.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Apple%20Intelligence%20Foundation%20Language%20Models/#notes","title":"Notes","text":"

    \u201cA shared input/output embedding matrix [Press and Wolf, 2016] to reduce memory usage for parameters.\u201d (Gunter et al., 2024, p. 2)

    This reminds me of the Residual stream interpretation of transformers.

    \u201cThe model is compressed and quantized, on average under 4-bit-perweight, after the post-training stages (details of the quantization scheme will be discussed later). The quantized model often shows a moderate level of quality loss. Therefore, instead of directly passing the quantized model to application teams for feature development, we attach a set of parameter-efficient LoRa Adapters for quality recovery. We make sure that these LoRA adapters training recipes are consistent with pre-training and post-training processes. Then, products will fine-tune their own feature-specific LoRA adapters by initializing the adapter weights from the accuracy-recovery adapters, while keeping the quantized base model frozen.\u201d (Gunter et al., 2024, p. 16)

    So the recipe is:
    • Pre-training/Post-training
    • Compression? and quantization (leads to accuracy loss)
    • LoRa fine-tuning to recover accuracy (call it LoRa Recovery; I'll assume this)
    • For a specific task, initialize the task LoRa adapter from the LoRa Recovery adapter (a sketch of the adapter structure follows below)
    Some details:
    • Rank 16 LoRa
    • Does each LoRa adapter also share the same precision as the underlying weight block/matrix? I suppose so
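
    A minimal sketch of that adapter structure (rank-16 LoRA on a frozen base layer); this is not Apple's implementation, and the frozen base would in practice be a quantized layer:

    import torch
    import torch.nn as nn

    class LoRALinear(nn.Module):
        def __init__(self, base: nn.Linear, r: int = 16, alpha: float = 16.0):
            super().__init__()
            self.base = base
            self.base.requires_grad_(False)                            # frozen (quantized) base weights
            self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
            self.B = nn.Parameter(torch.zeros(base.out_features, r))   # zero-init: adapter starts as a no-op
            self.scaling = alpha / r

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.base(x) + self.scaling * (x @ self.A.T) @ self.B.T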

    \u201cSpecifically, our AFM-on-device model running on Apple Neural Engine (ANE) uses Bit Palettization: for projection weights, every 16 columns/rows share the same quantization constants (i.e., lookup tables) and are quantized using K-means with 16 unique values (4-bit).\u201d (Gunter et al., 2024, p. 17)

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Approximately%20equivariant%20networks%20for%20imperfectly%20symmetric%20dynamics/","title":"Approximately equivariant networks for imperfectly symmetric dynamics","text":"Properties authors Rui Wang, Robin Walters, Rose Yu year 2022 url https://proceedings.mlr.press/v162/wang22aa.html

    Abstract

    Incorporating symmetry as an inductive bias into neural network architecture has led to improvements in generalization, data efficiency, and physical consistency in dynamics modeling. Methods such as CNNs or equivariant neural networks use weight tying to enforce symmetries such as shift invariance or rotational equivariance. However, despite the fact that physical laws obey many symmetries, real-world dynamical data rarely conforms to strict mathematical symmetry either due to noisy or incomplete data or to symmetry breaking features in the underlying dynamical system. We explore approximately equivariant networks which are biased towards preserving symmetry but are not strictly constrained to do so. By relaxing equivariance constraints, we find that our models can outperform both baselines with no symmetry bias and baselines with overly strict symmetry in both simulated turbulence domains and real-world multi-stream jet flow.

    ","tags":["relaxed_equivariance","equivariance","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/Approximation-Generalization%20Trade-offs%20under%20%28Approximate%29%20Group%20Equivariance/","title":"Approximation Generalization Trade offs under (Approximate) Group Equivariance","text":"Properties authors Mircea Petrache, Shubhendu Trivedi","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Autoequivariant%20Network%20Search%20via%20Group%20Decomposition/","title":"Autoequivariant Network Search via Group Decomposition","text":"Properties authors Sourya Basu","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Battle%20of%20the%20Backbones%20-%20A%20Large-Scale%20Comparison%20of%20Pretrained%20Models%20across%20Computer%20Vision%20Tasks/","title":"Battle of the Backbones A Large Scale Comparison of Pretrained Models across Computer Vision Tasks","text":"Properties authors Micah Goldblum, Hossein Souri, Renkun Ni, Manli Shu, Viraj Prabhu, Gowthami Somepalli, Prithvijt Chattopadhyay, Mark Ibrahim, Adrien Bardes, Judy Hoffman, Rama Chellappa, Andrew Gordon Wilson, Tom Goldstein year 2023 url https://arxiv.org/abs/2310.19909

    Abstract

    Neural network based computer vision systems are typically built on a backbone, a pretrained or randomly initialized feature extractor. Several years ago, the default option was an ImageNet-trained convolutional neural network. However, the recent past has seen the emergence of countless backbones pretrained using various algorithms and datasets. While this abundance of choice has led to performance increases for a range of systems, it is difficult for practitioners to make informed decisions about which backbone to choose. Battle of the Backbones (BoB) makes this choice easier by benchmarking a diverse suite of pretrained models, including vision-language models, those trained via self-supervised learning, and the Stable Diffusion backbone, across a diverse set of computer vision tasks ranging from classification to object detection to OOD generalization and more. Furthermore, BoB sheds light on promising directions for the research community to advance computer vision by illuminating strengths and weakness of existing approaches through a comprehensive analysis conducted on more than 1500 training runs. While vision transformers (ViTs) and self-supervised learning (SSL) are increasingly popular, we find that convolutional neural networks pretrained in a supervised fashion on large training sets still perform best on most tasks among the models we consider. Moreover, in apples-to-apples comparisons on the same architectures and similarly sized pretraining datasets, we find that SSL backbones are highly competitive, indicating that future works should perform SSL pretraining with advanced architectures and larger pretraining datasets. We release the raw results of our experiments along with code that allows researchers to put their own backbones through the gauntlet here:\u00a0this https URL

    ","tags":["paper","foundation_models","computer_vision","vit","transformers","cnn"]},{"location":"100%20Reference%20notes/101%20Literature/Battle%20of%20the%20Backbones%20-%20A%20Large-Scale%20Comparison%20of%20Pretrained%20Models%20across%20Computer%20Vision%20Tasks/#notes","title":"Notes","text":"

    It would be nice to see an update with DINOv2 - Learning Robust Visual Features without Supervision and EVA-02 - A Visual Representation for Neon Genesis.

    A performance comparison of ViTs and CNNs. Modern architectures strongly outperform vanilla ViTs. We see in Table 2 that the best performing backbone (ConvNeXt-Base) is convolutional, with a hierarchical transformer (SwinV2-Base) being a close second. The latter transformer architecture incorporates a strong spatial inductive bias. These findings suggest that the community should move past vanilla ViTs which are still used frequently. As a caveat, we do not evaluate very large models, and it is possible that ViTs might outperform their more advanced variants or convolutional networks at larger scales.

    Battle of the \u201csmall\u201d backbones. Keeping limited resources in mind, we also compare the \u201csmall\u201d subset of backbones in BoB (< 30M parameters) \u2013 with ViT-Small, ConvNeXt-Tiny, Swin-Tiny and ResNet-50 architectures. Overall, we find Supervised ConvNeXt-T trained on IN-1k to be the best, followed by Supervised SwinV2-T trained on IN-1k and DINO ViT-S trained on IN-1k. Interestingly, supervised learning again dominates, and backbones pretrained on just IN-1k outperform ones trained on a considerably more diverse and larger dataset (MiDaS).

    Object Detection & Segmentation. For object detection and instance segmentation, we find \u201cSupervised ConvNeXt-Base trained on IN-21K\u201d > \u201cSupervised SwinV2-Base trained on IN-21k (finetuned on IN-1k)\u201d > \u201cSupervised ConvNeXt-Base trained on IN-1k\u201d.

    These results are probably outdated, since many foundation models already beat SwinV2: - SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation - Exploring Plain Vision Transformer Backbones for Object Detection

    ","tags":["paper","foundation_models","computer_vision","vit","transformers","cnn"]},{"location":"100%20Reference%20notes/101%20Literature/Block%20Transformer%20-%20Global-to-Local%20Language%20Modeling%20for%20Fast%20Inference/","title":"Block Transformer Global to Local Language Modeling for Fast Inference","text":"Properties authors Namgyu Ho, Sangmin Bae, Taehyeon Kim, Hyunjik Jo, Yireun Kim, Tal Schuster, Adam Fisch, James Thorne, Se-Young Yun year 2024 url https://arxiv.org/abs/2406.02657

    Abstract

    This paper presents the Block Transformer architecture which adopts hierarchical global-to-local modeling to autoregressive transformers to mitigate the inference bottlenecks of self-attention. To apply self-attention, the key-value (KV) cache of all previous sequences must be retrieved from memory at every decoding step. Thereby, this KV cache IO becomes a significant bottleneck in batch inference. We notice that these costs stem from applying self-attention on the global context, therefore we isolate the expensive bottlenecks of global modeling to lower layers and apply fast local modeling in upper layers. To mitigate the remaining costs in the lower layers, we aggregate input tokens into fixed size blocks and then apply self-attention at this coarse level. Context information is aggregated into a single embedding to enable upper layers to decode the next block of tokens, without global attention. Free of global attention bottlenecks, the upper layers can fully utilize the compute hardware to maximize inference throughput. By leveraging global and local modules, the Block Transformer architecture demonstrates 10-20x gains in inference throughput compared to vanilla transformers with equivalent perplexity. Our work introduces a new approach to optimize language model inference through novel application of global-to-local modeling. Code is available at https://github.com/itsnamgyu/block-transformer.

    ","tags":["efficient_dl","transformers","paper"]},{"location":"100%20Reference%20notes/101%20Literature/BoxeR%20-%20Box-Attention%20for%202D%20and%203D%20Transformers/","title":"BoxeR Box Attention for 2D and 3D Transformers","text":"Properties authors Duy-Kien Nguyen, Jihong Ju, Olaf Booij, Martin R. Oswald, Cees G. M. Snoek year 2021 url https://arxiv.org/abs/2111.13087

    Abstract

    In this paper, we propose a simple attention mechanism, we call box-attention. It enables spatial interaction between grid features, as sampled from boxes of interest, and improves the learning capability of transformers for several vision tasks. Specifically, we present BoxeR, short for Box Transformer, which attends to a set of boxes by predicting their transformation from a reference window on an input feature map. The BoxeR computes attention weights on these boxes by considering its grid structure. Notably, BoxeR-2D naturally reasons about box information within its attention module, making it suitable for end-to-end instance detection and segmentation tasks. By learning invariance to rotation in the box-attention module, BoxeR-3D is capable of generating discriminative information from a bird's-eye view plane for 3D end-to-end object detection. Our experiments demonstrate that the proposed BoxeR-2D achieves state-of-the-art results on COCO detection and instance segmentation. Besides, BoxeR-3D improves over the end-to-end 3D object detection baseline and already obtains a compelling performance for the vehicle category of Waymo Open, without any class-specific optimization. Code is available at\u00a0this https URL.

    ","tags":["paper","transformers","object_detection"]},{"location":"100%20Reference%20notes/101%20Literature/Building%20on%20Efficient%20Foundations%20-%20Effectively%20Training%20LLMs%20with%20Structured%20Feedforward%20Layers/","title":"Building on Efficient Foundations Effectively Training LLMs with Structured Feedforward Layers","text":"Properties authors Xiuying Wei, Skander Moalla, Razvan Pascanu, Caglar Gulcehre year 2024 url https://arxiv.org/abs/2406.16450v1

    Abstract

    State-of-the-art results in large language models (LLMs) often rely on scale, which becomes computationally expensive. This has sparked a research agenda to reduce these models' parameter count and computational costs without significantly impacting their performance. Our study focuses on transformer-based LLMs, specifically targeting the computationally intensive feedforward networks (FFN), which are less studied than attention blocks. We consider three candidate linear layer approximations in the FFN by combining efficient low-rank and block-diagonal matrices. In contrast to many previous works that examined these approximations, our study i) explores these structures from the training-from-scratch perspective, ii) scales up to 1.3B parameters, and iii) is conducted within recent Transformer-based LLMs rather than convolutional architectures. We first demonstrate they can lead to actual computational gains in various scenarios, including online decoding when using a pre-merge technique. Additionally, we propose a novel training regime, called \\textit{self-guided training}, aimed at improving the poor training dynamics that these approximations exhibit when used from initialization. Experiments on the large RefinedWeb dataset show that our methods are both efficient and effective for training and inference. Interestingly, these structured FFNs exhibit steeper scaling curves than the original models. Further applying self-guided training to the structured matrices with 32\\% FFN parameters and 2.5\u00d7\u00a0speed-up enables only a 0.4 perplexity increase under the same training FLOPs. Finally, we develop the wide and structured networks surpassing the current medium-sized and large-sized Transformer in perplexity and throughput performance. Our code is available at \\url{this https URL}.
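
    As a reminder of what such a structured FFN looks like, here is a minimal sketch (mine, not the paper's code) of one candidate structure: replacing each dense FFN projection with a low-rank factorization, which cuts parameters and matmul FLOPs; a block-diagonal matrix could be swapped in the same way.

    ```python
    import torch
    import torch.nn as nn

    class LowRankFFN(nn.Module):
        def __init__(self, dim=512, hidden=2048, rank=128):
            super().__init__()
            # each dense projection W is replaced by a product of two thin matrices
            self.up = nn.Sequential(nn.Linear(dim, rank, bias=False), nn.Linear(rank, hidden))
            self.down = nn.Sequential(nn.Linear(hidden, rank, bias=False), nn.Linear(rank, dim))
            self.act = nn.GELU()

        def forward(self, x):
            return self.down(self.act(self.up(x)))

    dense_params = 512 * 2048 * 2                        # the two dense projections (no biases)
    lowrank_params = sum(p.numel() for p in LowRankFFN().parameters())
    print(dense_params, lowrank_params)                  # the low-rank FFN is much smaller
    ```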

    ","tags":["paper","efficient_dl","llm","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Building%20on%20Efficient%20Foundations%20-%20Effectively%20Training%20LLMs%20with%20Structured%20Feedforward%20Layers/#notes","title":"Notes","text":"
    • Note to self: Read this in depth ⏫ #personal
    ","tags":["paper","efficient_dl","llm","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/CKConv%20-%20Continuous%20Kernel%20Convolution%20For%20Sequential%20Data/","title":"CKConv Continuous Kernel Convolution For Sequential Data","text":"Properties authors David W. Romero, Anna Kuzina, Erik J. Bekkers, Jakub M. Tomczak, Mark Hoogendoorn year 2021 url https://arxiv.org/abs/2102.02611

    Abstract

    Conventional neural architectures for sequential data present important limitations. Recurrent networks suffer from exploding and vanishing gradients, small effective memory horizons, and must be trained sequentially. Convolutional networks are unable to handle sequences of unknown size and their memory horizon must be defined a priori. In this work, we show that all these problems can be solved by formulating convolutional kernels in CNNs as continuous functions. The resulting Continuous Kernel Convolution (CKConv) allows us to model arbitrarily long sequences in a parallel manner, within a single operation, and without relying on any form of recurrence. We show that Continuous Kernel Convolutional Networks (CKCNNs) obtain state-of-the-art results in multiple datasets, e.g., permuted MNIST, and, thanks to their continuous nature, are able to handle non-uniformly sampled datasets and irregularly-sampled data natively. CKCNNs match or perform better than neural ODEs designed for these purposes in a faster and simpler manner.
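
    A minimal sketch of the core idea (assumptions mine; the paper uses a SIREN-style kernel network and causal convolutions): the convolution kernel is a continuous function of relative position produced by a small MLP, so a kernel of any length, up to the full sequence, can be materialized at call time.

    ```python
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ContinuousKernelConv1d(nn.Module):
        def __init__(self, in_ch, out_ch, hidden=32):
            super().__init__()
            self.in_ch, self.out_ch = in_ch, out_ch
            self.kernel_net = nn.Sequential(             # relative position -> kernel values
                nn.Linear(1, hidden), nn.GELU(), nn.Linear(hidden, in_ch * out_ch))

        def forward(self, x):                            # x: (batch, in_ch, length)
            length = x.shape[-1]
            rel_pos = torch.linspace(-1.0, 1.0, length).unsqueeze(-1)   # (length, 1)
            kernel = self.kernel_net(rel_pos)                           # (length, in*out)
            kernel = kernel.t().reshape(self.out_ch, self.in_ch, length)
            return F.conv1d(x, kernel, padding=length - 1)[..., :length]

    x = torch.randn(2, 3, 50)                            # any sequence length works: 50, 100, ...
    print(ContinuousKernelConv1d(3, 8)(x).shape)         # torch.Size([2, 8, 50])
    ```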

    ","tags":["paper","convolutions","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Color%20Equivariant%20Convolutional%20Networks/","title":"Color Equivariant Convolutional Networks","text":"Properties authors Attila Lengyel, Ombretta Strafforello, Robert-Jan Bruintjes, Alexander Gielisse, Jan van Gemert

    References: - Learning Partial Equivariances from Data

    ","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Color%20Space%20Transformation%20Network/","title":"Color Space Transformation Network","text":"Properties authors Alexandros Karargyris year 2015 url https://arxiv.org/abs/1511.01064

    Abstract

    Deep networks have become very popular over the past few years. The main reason for this widespread use is their excellent ability to learn and predict knowledge in a very easy and efficient way. Convolutional neural networks and auto-encoders have become the normal in the area of imaging and computer vision achieving unprecedented accuracy levels in many applications. The most common strategy is to build and train networks with many layers by tuning their hyper-parameters. While this approach has proven to be a successful way to build robust deep learning schemes it suffers from high complexity. In this paper we introduce a module that learns color space transformations within a network. Given a large dataset of colored images the color space transformation module tries to learn color space transformations that increase overall classification accuracy. This module has shown to increase overall accuracy for the same network design and to achieve faster convergence. It is part of a broader family of image transformations (e.g. spatial transformer network).
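
    My hedged reading of the module as a sketch: a small learnable color-mixing block applied to the RGB input (a 3×3 linear map such as RGB→YUV is just a 1×1 convolution), initialized to the identity and trained jointly with the rest of the network.

    ```python
    import torch
    import torch.nn as nn

    class ColorTransform(nn.Module):
        def __init__(self):
            super().__init__()
            self.mix = nn.Conv2d(3, 3, kernel_size=1)    # learnable linear color mixing
            with torch.no_grad():                        # start as the identity map (keep RGB)
                self.mix.weight.copy_(torch.eye(3).view(3, 3, 1, 1))
                self.mix.bias.zero_()

        def forward(self, x):                            # x: (batch, 3, H, W)
            return self.mix(x)

    img = torch.rand(1, 3, 32, 32)
    print(torch.allclose(ColorTransform()(img), img))    # True at initialization
    ```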

    ","tags":["cnn","paper"]},{"location":"100%20Reference%20notes/101%20Literature/ConViT%20-%20Improving%20Vision%20Transformers%20with%20Soft%20Convolutional%20Inductive%20Biases/","title":"ConViT Improving Vision Transformers with Soft Convolutional Inductive Biases","text":"Properties authors St\u00e9phane d'Ascoli, Hugo Touvron, Matthew L. Leavitt, Ari S. Morcos, Giulio Biroli, Levent Sagun

    Abstract

    TODO: - [ ] Read paper - [ ] Add main text summary

    From Early Convolutions Help Transformers See Better, where [9] is this paper:

    We did not observe evidence that the hard locality constraint in early layers hampers the representational capacity of the network, as might be feared [9]. [...] This perspective resonates with the findings of [9], who observe that early transformer blocks prefer to learn more local attention patterns than later blocks.

    This is contrary to How do vision transformers work?, which claims that the locality constraint is beneficial to ViTs.

    Haven't fully read this paper, so the above contradiction might be incorrect.

    ","tags":["vit","computer_vision","cnn","transformers","inductive_bias","paper"]},{"location":"100%20Reference%20notes/101%20Literature/DETRs%20Beat%20YOLOs%20on%20Real-time%20Object%20Detection/","title":"DETRs Beat YOLOs on Real time Object Detection","text":"Properties authors Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen year 2023 url https://arxiv.org/abs/2304.08069v3

    Abstract

    The YOLO series has become the most popular framework for real-time object detection due to its reasonable trade-off between speed and accuracy. However, we observe that the speed and accuracy of YOLOs are negatively affected by the NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an alternative to eliminating NMS. Nevertheless, the high computational cost limits their practicality and hinders them from fully exploiting the advantage of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge that addresses the above dilemma. We build RT-DETR in two steps, drawing on the advanced DETR: first we focus on maintaining accuracy while improving speed, followed by maintaining speed while improving accuracy. Specifically, we design an efficient hybrid encoder to expeditiously process multi-scale features by decoupling intra-scale interaction and cross-scale fusion to improve speed. Then, we propose the uncertainty-minimal query selection to provide high-quality initial queries to the decoder, thereby improving accuracy. In addition, RT-DETR supports flexible speed tuning by adjusting the number of decoder layers to adapt to various scenarios without retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 GPU, outperforming previously advanced YOLOs in both speed and accuracy. We also develop scaled RT-DETRs that outperform the lighter YOLO detectors (S and M models). Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy and about 21 times in FPS. After pre-training with Objects365, RT-DETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page:\u00a0this https URL.

    ","tags":["paper","computer_vision","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/DETRs%20with%20Collaborative%20Hybrid%20Assignments%20Training/","title":"DETRs with Collaborative Hybrid Assignments Training","text":"Properties authors Zhuofan Zong, Guanglu Song, Yu Liu year 2023 url https://arxiv.org/abs/2211.12860v5

    Abstract

    In this paper, we provide the observation that too few queries assigned as positive samples in DETR with one-to-one set matching leads to sparse supervision on the encoder's output which considerably hurt the discriminative feature learning of the encoder and vice visa for attention learning in the decoder. To alleviate this, we present a novel collaborative hybrid assignments training scheme, namely\u00a0\ue22fo-DETR, to learn more efficient and effective DETR-based detectors from versatile label assignment manners. This new training scheme can easily enhance the encoder's learning ability in end-to-end detectors by training the multiple parallel auxiliary heads supervised by one-to-many label assignments such as ATSS and Faster RCNN. In addition, we conduct extra customized positive queries by extracting the positive coordinates from these auxiliary heads to improve the training efficiency of positive samples in the decoder. In inference, these auxiliary heads are discarded and thus our method introduces no additional parameters and computational cost to the original detector while requiring no hand-crafted non-maximum suppression (NMS). We conduct extensive experiments to evaluate the effectiveness of the proposed approach on DETR variants, including DAB-DETR, Deformable-DETR, and DINO-Deformable-DETR. The state-of-the-art DINO-Deformable-DETR with Swin-L can be improved from 58.5% to 59.5% AP on COCO val. Surprisingly, incorporated with ViT-L backbone, we achieve 66.0% AP on COCO test-dev and 67.9% AP on LVIS val, outperforming previous methods by clear margins with much fewer model sizes. Codes are available at \\url{this https URL}.

    ","tags":["paper","object_detection","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/DETRs%20with%20Collaborative%20Hybrid%20Assignments%20Training/#notes","title":"Notes","text":"

    Beats EVA-02 - A Visual Representation for Neon Genesis on object detection.

    Weights for CO-DINO Swin-L (64.1 box AP on COCO val): https://github.com/Sense-X/Co-DETR?tab=readme-ov-file

    ","tags":["paper","object_detection","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/DINOv2%20-%20Learning%20Robust%20Visual%20Features%20without%20Supervision/","title":"DINOv2 Learning Robust Visual Features without Supervision","text":"Properties authors Maxime Oquab, Timoth\u00e9e Darcet, Th\u00e9o Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Rusell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Herv\u00e9 Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski year 2023 url https://arxiv.org/abs/2304.07193

    Abstract

    The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques to scale our pretraining in terms of data and model size. Most of the technical contributions aim at accelerating and stabilizing the training at scale. In terms of data, we propose an automatic pipeline to build a dedicated, diverse, and curated image dataset instead of uncurated data, as typically done in the self-supervised literature. In terms of models, we train a ViT model (Dosovitskiy et al., 2020) with 1B parameters and distill it into a series of smaller models that surpass the best available all-purpose features, OpenCLIP (Ilharco et al., 2021) on most of the benchmarks at image and pixel levels.

    ","tags":["paper","foundation_models","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Deep%20Learning%20Book/","title":"Deep Learning Book","text":"Properties authors Ian Goodfellow, Yoshua Bengio, Aaron Courville year 2016 url https://www.deeplearningbook.org/","tags":["dl_theory","textbook"]},{"location":"100%20Reference%20notes/101%20Literature/DenseNets%20Reloaded%20-%20Paradigm%20Shift%20Beyond%20ResNets%20and%20ViTs/","title":"DenseNets Reloaded Paradigm Shift Beyond ResNets and ViTs","text":"Properties authors Donghyun Kim, Byeongho Heo, Dongyoon Han year 2024 url https://arxiv.org/abs/2403.19588

    Abstract

    This paper revives Densely Connected Convolutional Networks (DenseNets) and reveals the underrated effectiveness over predominant ResNet-style architectures. We believe DenseNets' potential was overlooked due to untouched training methods and traditional design elements not fully revealing their capabilities. Our pilot study shows dense connections through concatenation are strong, demonstrating that DenseNets can be revitalized to compete with modern architectures. We methodically refine suboptimal components - architectural adjustments, block redesign, and improved training recipes towards widening DenseNets and boosting memory efficiency while keeping concatenation shortcuts. Our models, employing simple architectural elements, ultimately surpass Swin Transformer, ConvNeXt, and DeiT-III - key architectures in the residual learning lineage. Furthermore, our models exhibit near state-of-the-art performance on ImageNet-1K, competing with the very recent models and downstream tasks, ADE20k semantic segmentation, and COCO object detection/instance segmentation. Finally, we provide empirical analyses that uncover the merits of the concatenation over additive shortcuts, steering a renewed preference towards DenseNet-style designs. Our code is available at\u00a0this https URL.

    ","tags":["cnn","dl_theory","optimizability","paper"]},{"location":"100%20Reference%20notes/101%20Literature/Discovering%20Symmetry%20Breaking%20in%20Physical%20Systems%20with%20Relaxed%20Group%20Convolution/","title":"Discovering Symmetry Breaking in Physical Systems with Relaxed Group Convolution","text":"Properties authors Rui Wang, Elyssa Hofgard, Han Gao, Robin Walters, Tess E Smidt year 2024 url https://arxiv.org/abs/2310.02299

    Abstract

    Modeling symmetry breaking is essential for understanding the fundamental changes in the behaviors and properties of physical systems, from microscopic particle interactions to macroscopic phenomena like fluid dynamics and cosmic structures. Thus, identifying sources of asymmetry is an important tool for understanding physical systems. In this paper, we focus on learning asymmetries of data using relaxed group convolutions. We provide both theoretical and empirical evidence that this flexible convolution technique allows the model to maintain the highest level of equivariance that is consistent with data and discover the subtle symmetry-breaking factors in various physical systems. We employ various relaxed group convolution architectures to uncover various symmetry-breaking factors that are interpretable and physically meaningful in different physical systems, including the phase transition of crystal structure, the isotropy and homogeneity breaking in turbulent flow, and the time-reversal symmetry breaking in pendulum systems.

    Observations: - \"In the relaxed group convolution, the initial relaxed (equivariant) weights\u00a0{\ud835\udc64\ud835\udc59\u2062(\u210e)}\u00a0in each layer are set to be the same for all\u00a0\u210e, ensuring that the model exhibits equivariance prior to being trained. [...] we prove that these relaxed weights only deviate from being equal when the symmetries of the input and the output are lower than that of the model.\" (Related to Equivariance Initialization)

    ","tags":["equivariance","relaxed_equivariance","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/EVA-02%20-%20A%20Visual%20Representation%20for%20Neon%20Genesis/","title":"EVA 02 A Visual Representation for Neon Genesis","text":"Properties authors Yuxin Fang, Quan Sun, Xinggang Wang, Tiejun Huang, Xinlong Wang, Yue Cao year 2023 url https://arxiv.org/abs/2303.11331

    Abstract

    We launch EVA-02, a next-generation Transformer-based visual representation pre-trained to reconstruct strong and robust language-aligned vision features via masked image modeling. With an updated plain Transformer architecture as well as extensive pre-training from an open & accessible giant CLIP vision encoder, EVA-02 demonstrates superior performance compared to prior state-of-the-art approaches across various representative vision tasks, while utilizing significantly fewer parameters and compute budgets. Notably, using exclusively publicly accessible training data, EVA-02 with only 304M parameters achieves a phenomenal 90.0 fine-tuning top-1 accuracy on ImageNet-1K val set. Additionally, our EVA-02-CLIP can reach up to 80.4 zero-shot top-1 on ImageNet-1K, outperforming the previous largest & best open-sourced CLIP with only ~1/6 parameters and ~1/6 image-text training data. We offer four EVA-02 variants in various model sizes, ranging from 6M to 304M parameters, all with impressive performance. To facilitate open access and open research, we release the complete suite of EVA-02 to the community at\u00a0this https URL.

    ","tags":["paper","foundation_models","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Early%20Convolutions%20Help%20Transformers%20See%20Better/","title":"Early Convolutions Help Transformers See Better","text":"Properties authors Tete Xiao, Mannat Singh, Eric Mintun, Trevor Darrell, Piotr Doll\u00e1r, Ross Girshick

    Hypothesis

    ViT's patchify stem (a large-kernel, large-stride convolution) is contrary to the standard early layers of CNNs. Maybe that's the cause of ViT's optimization difficulties?

    Main idea

    Replace patchify convolution with a small number of convolutional layers and drop one transformer block to make comparison fair.

    Notes for myself: - Interesting experimentation regarding #optimizability; maybe take it into account in the Hessian analysis

    ","tags":["cnn","transformers","vit","optimizability","paper"]},{"location":"100%20Reference%20notes/101%20Literature/Efficient%20Equivariant%20Transfer%20Learning%20from%20Pretrained%20Models/","title":"Efficient Equivariant Transfer Learning from Pretrained Models","text":"Properties authors Sourya Basu

    Builds on top of Equi-Tuning - Group Equivariant Fine-Tuning of Pretrained Models and Equivariance with Learned Canonicalization Functions

    Hypothesis

    Pretrained models provide better-quality features for some group transformations than for others, so simply averaging the predictions over all transformations is suboptimal.

    Main idea

    Lambda-Equitune: Weighted average with learned weights, \\(\\lambda\\).

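    A minimal sketch of the λ-weighted group averaging defined by the formula below (the toy model and λ-network are my own; G is the group of 90° rotations acting on images):

    ```python
    import torch
    import torch.nn as nn

    M = nn.Conv2d(3, 3, 3, padding=1)                       # pretrained, non-equivariant model
    lam = nn.Sequential(nn.Flatten(), nn.Linear(3 * 8 * 8, 1), nn.Softplus())

    def lambda_equitune(x):                                 # x: (1, 3, 8, 8)
        num, den = 0.0, 0.0
        for g in range(4):
            gx = torch.rot90(x, g, dims=(-2, -1))
            w = lam(gx)                                     # lambda(g x), a positive scalar
            pred = torch.rot90(M(gx), -g, dims=(-2, -1))    # g^{-1} M(g x)
            num = num + w.view(-1, 1, 1, 1) * pred
            den = den + w.view(-1, 1, 1, 1)
        return num / den

    print(lambda_equitune(torch.randn(1, 3, 8, 8)).shape)   # torch.Size([1, 3, 8, 8])
    ```
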
    \\[ M_G^\\lambda(x) = \\frac{1}{\\sum_{g \\in G} \\lambda(gx)} \\sum_{g \\in G} \\lambda(gx) g^{-1} M(gx) \\]","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Efficient%20Modulation%20for%20Vision%20Networks/","title":"Efficient Modulation for Vision Networks","text":"Properties authors Xu Ma, Xiyang Dai, Jianwei Yang, Bin Xiao, Yinpeng Chen, Yun Fu, Lu Yuan year 2024 url https://arxiv.org/abs/2403.19963

    Abstract

    In this work, we present efficient modulation, a novel design for efficient vision networks. We revisit the modulation mechanism, which operates input through convolutional context modeling and feature projection layers, and fuses features via element-wise multiplication and an MLP block. We demonstrate that the modulation mechanism is particularly well suited for efficient networks and further tailor the modulation design by proposing the efficient modulation (EfficientMod) block, which is considered the essential building block for our networks. Benefiting from the prominent representational ability of modulation mechanism and the proposed efficient design, our network can accomplish better trade-offs between accuracy and efficiency and set new state-of-the-art performance in the zoo of efficient networks. When integrating EfficientMod with the vanilla self-attention block, we obtain the hybrid architecture which further improves the performance without loss of efficiency. We carry out comprehensive experiments to verify EfficientMod's performance. With fewer parameters, our EfficientMod-s performs 0.6 top-1 accuracy better than EfficientFormerV2-s2 and is 25% faster on GPU, and 2.9 better than MobileViTv2-1.0 at the same GPU latency. Additionally, our method presents a notable improvement in downstream tasks, outperforming EfficientFormerV2-s by 3.6 mIoU on the ADE20K benchmark. Code and checkpoints are available at\u00a0this https URL.

    ","tags":["efficient_dl","computer_vision"]},{"location":"100%20Reference%20notes/101%20Literature/EfficientViT-SAM%20-%20Accelerated%20Segment%20Anything%20Model%20Without%20Accuracy%20Loss/","title":"EfficientViT SAM Accelerated Segment Anything Model Without Accuracy Loss","text":"Properties authors Zhuoyang Zhang, Han Cai, Song Han year 2024 url https://arxiv.org/abs/2402.05008

    Abstract

    We present EfficientViT-SAM, a new family of accelerated segment anything models. We retain SAM's lightweight prompt encoder and mask decoder while replacing the heavy image encoder with EfficientViT. For the training, we begin with the knowledge distillation from the SAM-ViT-H image encoder to EfficientViT. Subsequently, we conduct end-to-end training on the SA-1B dataset. Benefiting from EfficientViT's efficiency and capacity, EfficientViT-SAM delivers 48.9x measured TensorRT speedup on A100 GPU over SAM-ViT-H without sacrificing performance. Our code and pre-trained models are released at\u00a0this https URL.

    ","tags":["paper","efficient_dl","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Emergent%20Equivariance%20in%20Deep%20Ensembles/","title":"Emergent Equivariance in Deep Ensembles","text":"Properties authors Jan E. Gerken, Pan Kessel year 2024 url https://arxiv.org/abs/2403.03103

    Abstract

    We demonstrate that deep ensembles are secretly equivariant models. More precisely, we show that deep ensembles become equivariant for all inputs and at all training times by simply using data augmentation. Crucially, equivariance holds off-manifold and for any architecture in the infinite width limit. The equivariance is emergent in the sense that predictions of individual ensemble members are not equivariant but their collective prediction is. Neural tangent kernel theory is used to derive this result and we verify our theoretical insights using detailed numerical experiments.

    ","tags":["equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Emerging%20Properties%20in%20Self-Supervised%20Vision%20Transformers/","title":"Emerging Properties in Self Supervised Vision Transformers","text":"Properties authors Mathilde Caron, Hugo Touvron, Ishan Misra, Herv\u00e9 Jegou, Julien Mairal, Piotr Bojanowski, Armand Joulin year 2021 url https://arxiv.org/abs/2104.14294

    Abstract

    In this paper, we question if self-supervised learning provides new properties to Vision Transformer (ViT) that stand out compared to convolutional networks (convnets). Beyond the fact that adapting self-supervised methods to this architecture works particularly well, we make the following observations: first, self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets. Second, these features are also excellent k-NN classifiers, reaching 78.3% top-1 on ImageNet with a small ViT. Our study also underlines the importance of momentum encoder, multi-crop training, and the use of small patches with ViTs. We implement our findings into a simple self-supervised method, called DINO, which we interpret as a form of self-distillation with no labels. We show the synergy between DINO and ViTs by achieving 80.1% top-1 on ImageNet in linear evaluation with ViT-Base.

    ","tags":["paper","foundation_models","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/End-to-End%20Object%20Detection%20with%20Transformers/","title":"End to End Object Detection with Transformers","text":"Properties authors Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko year 2020 url https://arxiv.org/abs/2005.12872

    Abstract

    We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries, DETR reasons about the relations of the objects and the global image context to directly output the final set of predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive baselines. Training code and pretrained models are available at\u00a0this https URL.

    ","tags":["paper","computer_vision","object_detection","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Equi-Tuning%20-%20Group%20Equivariant%20Fine-Tuning%20of%20Pretrained%20Models/","title":"Equi Tuning Group Equivariant Fine Tuning of Pretrained Models","text":"Properties authors Sourya Basu

    Main idea

    Given a non-equivariant pre-trained model \\(M(x)\\), define an equivariant model \\(M_G(x)\\) as the average of the inverse-transformed predictions over all group actions applied to the input \\(x\\):

    \\[ M_G(x) = \\frac{1}{|G|} \\sum_{g \\in G} g^{-1} M(g x) \\]
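
    A minimal sketch of this group averaging for G = C4 rotations and a toy image-to-image model (my own example, not the paper's code); the averaged model is exactly equivariant even though \\(M\\) is not:

    ```python
    import torch
    import torch.nn as nn

    M = nn.Conv2d(3, 3, 3, padding=1)                       # non-equivariant "pretrained" model

    def equitune(x):                                        # M_G(x) = 1/|G| sum_g g^{-1} M(g x)
        preds = [torch.rot90(M(torch.rot90(x, g, dims=(-2, -1))), -g, dims=(-2, -1))
                 for g in range(4)]
        return torch.stack(preds).mean(dim=0)

    x = torch.randn(1, 3, 8, 8)
    rot = lambda t: torch.rot90(t, 1, dims=(-2, -1))
    print(torch.allclose(equitune(rot(x)), rot(equitune(x)), atol=1e-5))   # True: equivariant
    ```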

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Equivariance%20with%20Learned%20Canonicalization%20Functions/","title":"Equivariance with Learned Canonicalization Functions","text":"Properties authors S\u00e9kou-Oumar Kaba, Arnab Kumar Mondal, Yan Zhang, Yoshua Bengio, Siamak Ravanbakhsh

    Main idea

    We learn a canonicalization function \\(h\\), parametrized either by a neural network or by an optimization procedure. $$ \\phi(x) = h'(x) f(h(x)^{-1} x) $$
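
    A rough sketch of the mechanism (mine, with a simple argmax canonicalizer over C4 rotations; the paper learns \\(h\\) with an equivariant network and treats the output representation more carefully):

    ```python
    import torch
    import torch.nn as nn

    f = nn.Conv2d(3, 3, 3, padding=1)                 # the main (unconstrained) network
    scorer = nn.Sequential(nn.Flatten(), nn.Linear(3 * 8 * 8, 4))   # h: scores one element of C4

    def canonicalized(x):                             # x: (1, 3, 8, 8)
        g = int(scorer(x).argmax(dim=-1))             # predicted group element h(x)
        x_canon = torch.rot90(x, -g, dims=(-2, -1))   # h(x)^{-1} x : map input to canonical pose
        y = f(x_canon)
        return torch.rot90(y, g, dims=(-2, -1))       # apply h(x) back on the output

    print(canonicalized(torch.randn(1, 3, 8, 8)).shape)   # torch.Size([1, 3, 8, 8])
    ```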

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Equivariance-aware%20architectural%20optimization%20of%20neural%20networks/","title":"Equivariance aware architectural optimization of neural networks","text":"Properties authors Kaitlin Maile, Dennis G. Wilson, Patrick Forr\u00e9

    References: - Learning Partial Equivariances from Data

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Exact%20Conversion%20of%20In-Context%20Learning%20to%20Model%20Weights%20in%20Linearized-Attention%20Transformers/","title":"Exact Conversion of In Context Learning to Model Weights in Linearized Attention Transformers","text":"Properties authors Brian K Chen, Tianyang Hu, Hui Jin, Hwee Kuan Lee, Kenji Kawaguchi year 2024 url https://arxiv.org/abs/2406.02847

    Abstract

    In-Context Learning (ICL) has been a powerful emergent property of large language models that has attracted increasing attention in recent years. In contrast to regular gradient-based learning, ICL is highly interpretable and does not require parameter updates. In this paper, we show that, for linearized transformer networks, ICL can be made explicit and permanent through the inclusion of bias terms. We mathematically demonstrate the equivalence between a model with ICL demonstration prompts and the same model with the additional bias terms. Our algorithm (ICLCA) allows for exact conversion in an inexpensive manner. Existing methods are not exact and require expensive parameter updates. We demonstrate the efficacy of our approach through experiments that show the exact incorporation of ICL tokens into a linear transformer. We further suggest how our method can be adapted to achieve cheap approximate conversion of ICL tokens, even in regular transformer networks that are not linearized. Our experiments on GPT-2 show that, even though the conversion is only approximate, the model still gains valuable context from the included bias terms.

    ","tags":["paper","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Exploiting%20Redundancy%20-%20Separable%20Group%20Convolutional%20Networks%20on%20Lie%20Groups/","title":"Exploiting Redundancy Separable Group Convolutional Networks on Lie Groups","text":"Properties authors David M. Knigge, David W. Romero, Erik J. Bekkers

    Abstract

    In this work, we investigate the properties of representations learned by regular G-CNNs, and show considerable parameter redundancy in group convolution kernels. This finding motivates further weight-tying by sharing convolution kernels over subgroups. To this end, we introduce convolution kernels that are separable over the subgroup and channel dimensions.

    Interesting because it reduces the total parameter count by separating group convolution kernels. This also has a regularisation effect.

    Citations: - Relaxing Equivariance Constraints with Non-stationary Continuous Filters

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Exploring%20Plain%20Vision%20Transformer%20Backbones%20for%20Object%20Detection/","title":"Exploring Plain Vision Transformer Backbones for Object Detection","text":"Properties authors Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He year 2023 url https://arxiv.org/abs/2203.16527

    Abstract

    We explore the plain, non-hierarchical Vision Transformer (ViT) as a backbone network for object detection. This design enables the original ViT architecture to be fine-tuned for object detection without needing to redesign a hierarchical backbone for pre-training. With minimal adaptations for fine-tuning, our plain-backbone detector can achieve competitive results. Surprisingly, we observe: (i) it is sufficient to build a simple feature pyramid from a single-scale feature map (without the common FPN design) and (ii) it is sufficient to use window attention (without shifting) aided with very few cross-window propagation blocks. With plain ViT backbones pre-trained as Masked Autoencoders (MAE), our detector, named ViTDet, can compete with the previous leading methods that were all based on hierarchical backbones, reaching up to 61.3 AP_box on the COCO dataset using only ImageNet-1K pre-training. We hope our study will draw attention to research on plain-backbone detectors. Code for ViTDet is available in Detectron2.

    ","tags":["paper","computer_vision","object_detection","transformers","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Exploring%20Plain%20Vision%20Transformer%20Backbones%20for%20Object%20Detection/#notes","title":"Notes","text":"
    • It effectively adapts a pre-trained plain vision transformer into a detection backbone by adding a minimal set of layers (a simple feature pyramid and a few cross-window propagation blocks) between the pre-trained ViT and the detection head
    • Requires full fine-tuning
    • Ranks #16 on https://paperswithcode.com/sota/object-detection-on-coco-minival, ~4 box AP points below the top entry

    Code and weights at: https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet

    ","tags":["paper","computer_vision","object_detection","transformers","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Fast%2C%20Expressive%20SE%28n%29%20Equivariant%20Networks%20through%20Weight-Sharing%20in%20Position-Orientation%20Space/","title":"Fast, Expressive SE(n) Equivariant Networks through Weight Sharing in Position Orientation Space","text":"Properties authors Erik J. Bekkers, Sharvaree Vadgama, Rob D. Hesselink, Putri A. van der Linden, David W. Romero

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/FlexiViT%20-%20One%20Model%20for%20All%20Patch%20Sizes/","title":"FlexiViT One Model for All Patch Sizes","text":"Properties authors Lucas Beyer, Pavel Izmailov, Alexander Kolesnikov, Mathilde Caron, Simon Kornblith, Xiaohua Zhai, Matthias Minderer, Michael Tschannen, Ibrahim Alabdulmoshin, Filip Pavetic year 2022 url https://arxiv.org/abs/2212.08013

    Abstract

    Vision Transformers convert images to sequences by slicing them into patches. The size of these patches controls a speed/accuracy tradeoff, with smaller patches leading to higher accuracy at greater computational cost, but changing the patch size typically requires retraining the model. In this paper, we demonstrate that simply randomizing the patch size at training time leads to a single set of weights that performs well across a wide range of patch sizes, making it possible to tailor the model to different compute budgets at deployment time. We extensively evaluate the resulting model, which we call FlexiViT, on a wide range of tasks, including classification, image-text retrieval, open-world detection, panoptic segmentation, and semantic segmentation, concluding that it usually matches, and sometimes outperforms, standard ViT models trained at a single patch size in an otherwise identical setup. Hence, FlexiViT training is a simple drop-in improvement for ViT that makes it easy to add compute-adaptive capabilities to most models relying on a ViT backbone architecture. Code and pre-trained models are available at\u00a0this https URL

    ","tags":["paper","foundation_models","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/FlexiViT%20-%20One%20Model%20for%20All%20Patch%20Sizes/#notes","title":"Notes","text":"
    • Read in depth, seems very promising
    • Google already filed a patent for this: https://patents.google.com/patent/US20240169715A1/en
    ","tags":["paper","foundation_models","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/G-SGD%20-%20Optimizing%20ReLU%20Neural%20Networks%20in%20its%20Positively%20Scale-Invariant%20Space/","title":"G SGD Optimizing ReLU Neural Networks in its Positively Scale Invariant Space","text":"Properties authors Qi Meng, Shuxin Zheng, Huishuai Zhang, Wei Chen, Zhi-Ming Ma, Tie-Yan Liu year 2018 url https://arxiv.org/abs/1802.03713

    Abstract

    It is well known that neural networks with rectified linear units (ReLU) activation functions are positively scale-invariant. Conventional algorithms like stochastic gradient descent optimize the neural networks in the vector space of weights, which is, however, not positively scale-invariant. This mismatch may lead to problems during the optimization process. Then, a natural question is: \\emph{can we construct a new vector space that is positively scale-invariant and sufficient to represent ReLU neural networks so as to better facilitate the optimization process }? In this paper, we provide our positive answer to this question. First, we conduct a formal study on the positive scaling operators which forms a transformation group, denoted as\u00a0\ue233. We show that the value of a path (i.e. the product of the weights along the path) in the neural network is invariant to positive scaling and prove that the value vector of all the paths is sufficient to represent the neural networks under mild conditions. Second, we show that one can identify some basis paths out of all the paths and prove that the linear span of their value vectors (denoted as\u00a0\ue233-space) is an invariant space with lower dimension under the positive scaling group. Finally, we design stochastic gradient descent algorithm in\u00a0\ue233-space (abbreviated as\u00a0\ue233-SGD) to optimize the value vector of the basis paths of neural networks with little extra cost by leveraging back-propagation. Our experiments show that\u00a0\ue233-SGD significantly outperforms the conventional SGD algorithm in optimizing ReLU networks on benchmark datasets.
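
    A tiny worked example (mine) of the positive scale-invariance the paper builds on: rescaling an incoming ReLU weight by \\(c > 0\\) and the outgoing weight by \\(1/c\\) leaves the network function unchanged, and the path value (the product of weights along the path) is invariant as well.

    ```python
    import numpy as np

    def net(x, w1, w2):
        return w2 * np.maximum(w1 * x, 0.0)           # 1-hidden-unit ReLU network

    x, w1, w2, c = 1.7, 0.8, -1.3, 5.0
    print(net(x, w1, w2), net(x, c * w1, w2 / c))     # identical outputs
    print(w1 * w2, (c * w1) * (w2 / c))               # identical path values
    ```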

    ","tags":["dl_theory","dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Grokked%20Transformers%20are%20Implicit%20Reasoners%20-%20A%20Mechanistic%20Journey%20to%20the%20Edge%20of%20Generalization/","title":"Grokked Transformers are Implicit Reasoners A Mechanistic Journey to the Edge of Generalization","text":"Properties authors Boshi Wang, Xiang Yue, Yu Su, Huan Sun year 2024 url https://arxiv.org/abs/2405.15071

    Abstract

    We study whether transformers can learn to implicitly reason over parametric knowledge, a skill that even the most capable language models struggle with. Focusing on two representative reasoning types, composition and comparison, we consistently find that transformers can learn implicit reasoning, but only through grokking, i.e., extended training far beyond overfitting. The levels of generalization also vary across reasoning types: when faced with out-of-distribution examples, transformers fail to systematically generalize for composition but succeed for comparison. We delve into the model's internals throughout training, conducting analytical experiments that reveal: 1) the mechanism behind grokking, such as the formation of the generalizing circuit and its relation to the relative efficiency of generalizing and memorizing circuits, and 2) the connection between systematicity and the configuration of the generalizing circuit. Our findings guide data and training setup to better induce implicit reasoning and suggest potential improvements to the transformer architecture, such as encouraging cross-layer knowledge sharing. Furthermore, we demonstrate that for a challenging reasoning task with a large search space, GPT-4-Turbo and Gemini-1.5-Pro based on non-parametric memory fail badly regardless of prompting styles or retrieval augmentation, while a fully grokked transformer can achieve near-perfect accuracy, showcasing the power of parametric memory for complex reasoning.

    ","tags":["paper","transformers","mechinterp"]},{"location":"100%20Reference%20notes/101%20Literature/Harmonics%20of%20Learning%20-%20Universal%20Fourier%20Features%20Emerge%20in%20Invariant%20Networks/","title":"Harmonics of Learning Universal Fourier Features Emerge in Invariant Networks","text":"Properties authors Giovanni Luca Marchetti, Christopher Hillar, Danica Kragic, Sophia Sanborn year 2023 url https://arxiv.org/abs/2312.08550

    Abstract

    In this work, we formally prove that, under certain conditions, if a neural network is invariant to a finite group then its weights recover the Fourier transform on that group. This provides a mathematical explanation for the emergence of Fourier features -- a ubiquitous phenomenon in both biological and artificial learning systems. The results hold even for non-commutative groups, in which case the Fourier transform encodes all the irreducible unitary group representations. Our findings have consequences for the problem of symmetry discovery. Specifically, we demonstrate that the algebraic structure of an unknown group can be recovered from the weights of a network that is at least approximately invariant within certain bounds. Overall, this work contributes to a foundation for an algebraic learning theory of invariant neural network representations.

    ","tags":["theory","equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/How%20do%20vision%20transformers%20work%3F/","title":"How do vision transformers work?","text":"Properties authors Namuk Park, Songkuk Kim year 2022 url https://arxiv.org/abs/2202.06709

    Abstract

    The success of multi-head self-attentions (MSAs) for computer vision is now indisputable. However, little is known about how MSAs work. We present fundamental explanations to help better understand the nature of MSAs. In particular, we demonstrate the following properties of MSAs and Vision Transformers (ViTs): (1) MSAs improve not only accuracy but also generalization by flattening the loss landscapes. Such improvement is primarily attributable to their data specificity, not long-range dependency. On the other hand, ViTs suffer from non-convex losses. Large datasets and loss landscape smoothing methods alleviate this problem; (2) MSAs and Convs exhibit opposite behaviors. For example, MSAs are low-pass filters, but Convs are high-pass filters. Therefore, MSAs and Convs are complementary; (3) Multi-stage neural networks behave like a series connection of small individual models. In addition, MSAs at the end of a stage play a key role in prediction. Based on these insights, we propose AlterNet, a model in which Conv blocks at the end of a stage are replaced with MSA blocks. AlterNet outperforms CNNs not only in large data regimes but also in small data regimes. The code is available at\u00a0this https URL.

    ","tags":["vit","computer_vision","cnn","optimizability"]},{"location":"100%20Reference%20notes/101%20Literature/How%20do%20vision%20transformers%20work%3F/#notes","title":"Notes","text":"","tags":["vit","computer_vision","cnn","optimizability"]},{"location":"100%20Reference%20notes/101%20Literature/How%20do%20vision%20transformers%20work%3F/#the-question-of-inductive-biases","title":"The question of inductive biases","text":"

    Contrary to our expectations, experimental results show that the stronger the inductive bias, the lower both the test error and the training NLL. This indicates that ViT does not overfit training datasets. In addition, appropriate inductive biases, such as locality constraints for MSAs, helps NNs learn strong representations. We also observe these phenomena on CIFAR-10 and ImageNet as shown in Fig. C.1. Figure C.2 also supports that weak inductive biases disrupt NN training. In this experiment, extremely small patch sizes for the embedding hurt the predictive performance of ViT.

    Long-range (global) attention is worse than local attention. MSAs are beneficial because they smooth the loss landscape and are input-dependent.

    What properties of MSAs do we need to improve optimization? We present various evidences to support that MSA is generalized spatial smoothing. It means that MSAs improve performance because their formulation\u2014Eq. (1)\u2014is an appropriate inductive bias. Their weak inductive bias disrupts NN training. In particular, a key feature of MSAs is their data specificity, not long-range dependency. As an extreme example, local MSAs with a 3 \u00d7 3 receptive field outperforms global MSA because they reduce unnecessary degrees of freedom.

    As far as I understand, local MSA is not translation-equivariant because it is still input-dependent. So local MSA has a locality inductive bias but not translation equivariance. This is interesting: standard ConvNets get their locality inductive bias through translation equivariance, and it is not straightforward to remove their translation equivariance while keeping locality. Tracking at Input-dependent convolutions and Non-translationally equivariant convolutions.

    Locality inductive biases help with more stable training dynamics

    ","tags":["vit","computer_vision","cnn","optimizability"]},{"location":"100%20Reference%20notes/101%20Literature/How%20do%20vision%20transformers%20work%3F/#hessian-spectra","title":"Hessian Spectra","text":"

    Legend: ViT (red), CNN (blue) - ViT: Hessian eigenvalues have small magnitude and include negative values - CNN: Hessian eigenvalues have large magnitude and are positive

    ","tags":["vit","computer_vision","cnn","optimizability"]},{"location":"100%20Reference%20notes/101%20Literature/Hydra%20-%20Bidirectional%20State%20Space%20Models%20Through%20Generalized%20Matrix%20Mixers/","title":"Hydra Bidirectional State Space Models Through Generalized Matrix Mixers","text":"Properties authors Sukjun Hwang, Aakash Lahoti, Tri Dao, Albert Gu year 2024 url https://arxiv.org/abs/2407.09941

    Abstract

    A wide array of sequence models are built on a framework modeled after Transformers, comprising alternating sequence mixer and channel mixer layers. This paper studies a unifying matrix mixer view of sequence mixers that can be conceptualized as a linear map on the input sequence. This framework encompasses a broad range of well-known sequence models, including the self-attention of Transformers as well as recent strong alternatives such as structured state space models (SSMs), and allows understanding downstream characteristics such as efficiency and expressivity through properties of their structured matrix class. We identify a key axis of matrix parameterizations termed sequence alignment, which increases the flexibility and performance of matrix mixers, providing insights into the strong performance of Transformers and recent SSMs such as Mamba. Furthermore, the matrix mixer framework offers a systematic approach to developing sequence mixers with desired properties, allowing us to develop several new sub-quadratic sequence models. In particular, we propose a natural bidirectional extension of the Mamba model (Hydra), parameterized as a quasiseparable matrix mixer, which demonstrates superior performance over other sequence models including Transformers on non-causal tasks. As a drop-in replacement for attention layers, Hydra outperforms BERT by 0.8 points on the GLUE benchmark and ViT by 2% Top-1 accuracy on ImageNet.
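
    A small sketch (mine) of the matrix mixer view: a sequence mixer is a linear map \\(y = M x\\) along the sequence dimension; self-attention corresponds to a dense, input-dependent \\(M = \\text{softmax}(QK^T/\\sqrt{d})\\), while other sequence models swap in differently structured matrices.

    ```python
    import numpy as np

    def softmax(a, axis=-1):
        e = np.exp(a - a.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)

    L, d = 6, 8
    rng = np.random.default_rng(0)
    x = rng.standard_normal((L, d))
    Wq, Wk = rng.standard_normal((d, d)), rng.standard_normal((d, d))

    M_attn = softmax((x @ Wq) @ (x @ Wk).T / np.sqrt(d))   # attention as a dense L x L mixer
    M_banded = np.triu(np.tril(np.ones((L, L))), -2)       # e.g. a banded (Toeplitz) mixer
    print((M_attn @ x).shape, (M_banded @ x).shape)        # both mix along the sequence axis
    ```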

    ","tags":["paper","sequence_models"]},{"location":"100%20Reference%20notes/101%20Literature/Improving%20Convergence%20and%20Generalization%20Using%20Parameter%20Symmetries/","title":"Improving Convergence and Generalization Using Parameter Symmetries","text":"Properties authors Bo Zhao, Robert M Gower, Robin Walters, Rose Yu year 2023 url https://arxiv.org/abs/2305.13404

    Abstract

    In overparametrized models, different values of the parameters may result in the same loss value. Parameter space symmetries are transformations that change the model parameters but leave the loss invariant. Teleportation applies such transformations to accelerate optimization. However, the exact mechanism behind this algorithm's success is not well understood. In this paper, we show that teleportation not only speeds up optimization in the short-term, but gives overall faster time to convergence. Additionally, we show that teleporting to minima with different curvatures improves generalization and provide insights on the connection between the curvature of the minima and generalization ability. Finally, we show that integrating teleportation into a wide range of optimization algorithms and optimization-based meta-learning improves convergence.
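
    A tiny illustration (mine) of the parameter-space symmetry that teleportation exploits, in the simplest linear-layer case: mapping \\((W_1, W_2) \\to (G W_1, W_2 G^{-1})\\) with any invertible \\(G\\) changes the parameters (and hence gradients and curvature) while leaving the loss untouched.

    ```python
    import numpy as np

    rng = np.random.default_rng(0)
    X, Y = rng.standard_normal((32, 5)), rng.standard_normal((32, 3))
    W1, W2 = rng.standard_normal((4, 5)), rng.standard_normal((3, 4))

    def loss(W1, W2):
        return np.mean((X @ W1.T @ W2.T - Y) ** 2)

    G = rng.standard_normal((4, 4)) + 4 * np.eye(4)        # some invertible matrix
    print(loss(W1, W2), loss(G @ W1, W2 @ np.linalg.inv(G)))   # same loss, different params
    ```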

    ","tags":["equivariance","relaxed_equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/In%20Search%20of%20Projectively%20Equivariant%20Networks/","title":"In Search of Projectively Equivariant Networks","text":"Properties authors Georg Bokman, Axel Flinth, Fredrik Kahl year 2022 url https://arxiv.org/abs/2209.14719

    Abstract

    Equivariance of linear neural network layers is well studied. In this work, we relax the equivariance condition to only be true in a projective sense. We propose a way to construct a projectively equivariant neural network through building a standard equivariant network where the linear group representations acting on each intermediate feature space are\"multiplicatively modified lifts\"of projective group representations. By theoretically studying the relation of projectively and linearly equivariant linear layers, we show that our approach is the most general possible when building a network out of linear layers. The theory is showcased in two simple experiments.

    ","tags":["equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Knowledge%20Transfer%20from%20Vision%20Foundation%20Models%20for%20Efficient%20Training%20of%20Small%20Task-specific%20Models/","title":"Knowledge Transfer from Vision Foundation Models for Efficient Training of Small Task specific Models","text":"Properties authors Raviteja Vemulapalli, Hadi Pouransari, Fartash Faghri, Sachin Mehta, Mehrdad Farajtabar, Mohammad Rastegari, Oncel Tuzel year 2023 url https://arxiv.org/abs/2311.18237

    Abstract

    Vision Foundation Models (VFMs) pretrained on massive datasets exhibit impressive performance on various downstream tasks, especially with limited labeled target data. However, due to their high inference compute cost, these models cannot be deployed for many real-world applications. Motivated by this, we ask the following important question, \"How can we leverage the knowledge from a large VFM to train a small task-specific model for a new target task with limited labeled training data?\", and propose a simple task-oriented knowledge transfer approach as a highly effective solution to this problem. Our experimental results on five target tasks show that the proposed approach outperforms task-agnostic VFM distillation, web-scale CLIP pretraining, supervised ImageNet pretraining, and self-supervised DINO pretraining by up to 11.6%, 22.1%, 13.7%, and 29.8%, respectively. Furthermore, the proposed approach also demonstrates up to 9x, 4x and 15x reduction in pretraining compute cost when compared to task-agnostic VFM distillation, ImageNet pretraining and DINO pretraining, respectively, while outperforming them. We also show that the dataset used for transferring knowledge has a significant effect on the final target task performance, and introduce a retrieval-augmented knowledge transfer strategy that uses web-scale image retrieval to curate effective transfer sets.

    ","tags":["efficient_dl","paper","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/LRP-QViT%20-%20Mixed-Precision%20Vision%20Transformer%20Quantization%20via%20Layer-wise%20Relevance%20Propagation/","title":"LRP QViT Mixed Precision Vision Transformer Quantization via Layer wise Relevance Propagation","text":"Properties authors Navin Ranjan, Andreas Savakis year 2024 url https://arxiv.org/abs/2401.11243

    Abstract

    Vision transformers (ViTs) have demonstrated remarkable performance across various visual tasks. However, ViT models suffer from substantial computational and memory requirements, making it challenging to deploy them on resource-constrained platforms. Quantization is a popular approach for reducing model size, but most studies mainly focus on equal bit-width quantization for the entire network, resulting in sub-optimal solutions. While there are few works on mixed precision quantization (MPQ) for ViTs, they typically rely on search space-based methods or employ mixed precision arbitrarily. In this paper, we introduce LRP-QViT, an explainability-based method for assigning mixed-precision bit allocations to different layers based on their importance during classification. Specifically, to measure the contribution score of each layer in predicting the target class, we employ the Layer-wise Relevance Propagation (LRP) method. LRP assigns local relevance at the output layer and propagates it through all layers, distributing the relevance until it reaches the input layers. These relevance scores serve as indicators for computing the layer contribution score. Additionally, we have introduced a clipped channel-wise quantization aimed at eliminating outliers from post-LayerNorm activations to alleviate severe inter-channel variations. To validate and assess our approach, we employ LRP-QViT across ViT, DeiT, and Swin transformer models on various datasets. Our experimental findings demonstrate that both our fixed-bit and mixed-bit post-training quantization methods surpass existing models in the context of 4-bit and 6-bit quantization.

    ","tags":["paper","vit","computer_vision","peft","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Learned%20Gridification%20for%20Efficient%20Point%20Cloud%20Processing/","title":"Learned Gridification for Efficient Point Cloud Processing","text":"Properties authors Putri A. van der Linden, Erik J. Bekkers, David W. Romero

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Learning%20Partial%20Equivariances%20from%20Data/","title":"Learning Partial Equivariances from Data","text":"Properties authors David W. Romero, Suhas Lohit year 2021 url https://arxiv.org/abs/2110.10211

    Monte Carlo Approximation of Group Convolutions

    We can approximate group convolutions in expectation by uniformly sampling group actions \\(v_j\\): $$ (\\psi \\hat{*} f)(u_i) = \\sum_j \\psi (v_j^{-1} u_i)f(v_j) \\bar{\\mu}_{\\mathcal{G}} (v_j) $$
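
    A small numerical sketch (mine) of this Monte Carlo view for the cyclic translation group \\(\\mathbb{Z}_N\\) acting on 1D signals: sampling all \\(N\\) group elements recovers the exact (circular) group convolution, while sampling fewer elements approximates it.

    ```python
    import numpy as np

    rng = np.random.default_rng(0)
    N = 16
    f, psi = rng.standard_normal(N), rng.standard_normal(N)

    def mc_group_conv(f, psi, samples):
        # (psi * f)(u) ~= sum_j psi(v_j^{-1} u) f(v_j) * mu(v_j),  mu = N / num_samples
        out = np.zeros(N)
        for u in range(N):
            out[u] = sum(psi[(u - v) % N] * f[v] for v in samples) * (N / len(samples))
        return out

    exact = mc_group_conv(f, psi, range(N))                  # all group elements
    approx = mc_group_conv(f, psi, rng.choice(N, 8, replace=False))
    print(np.abs(exact - approx).mean())                     # MC error, shrinks with more samples
    ```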

    Main idea

    1. Prioritize sampling of specific group elements during the group convolution by learning a probability distribution over them.
    2. 1D continuous groups: use the reparametrization trick to learn a distribution on the Lie algebra of the group that is uniform over a connected set of group elements and zero elsewhere. \\(\\to\\) Partial Equivariance
    3. 1D discrete groups: Bernoulli Distribution over all possible element combinations

    Citations: - Self-Supervised Detection of Perfect and Partial Input-Dependent Symmetries - Color Equivariant Convolutional Networks - Equivariance-aware architectural optimization of neural networks - Approximation-Generalization Trade-offs under (Approximate) Group Equivariance

    ","tags":["dl2","equivariance","partial_equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/Learning%20both%20Weights%20and%20Connections%20for%20Efficient%20Neural%20Networks/","title":"Learning both Weights and Connections for Efficient Neural Networks","text":"Properties authors Song Han, Jeff Pool, John Tran, William J. Dally year 2015 url https://arxiv.org/abs/1506.02626

    Abstract

    Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems. Also, conventional networks fix the architecture before training starts; as a result, training cannot improve the architecture. To address these limitations, we describe a method to reduce the storage and computation required by neural networks by an order of magnitude without affecting their accuracy by learning only the important connections. Our method prunes redundant connections using a three-step method. First, we train the network to learn which connections are important. Next, we prune the unimportant connections. Finally, we retrain the network to fine tune the weights of the remaining connections. On the ImageNet dataset, our method reduced the number of parameters of AlexNet by a factor of 9x, from 61 million to 6.7 million, without incurring accuracy loss. Similar experiments with VGG-16 found that the number of parameters can be reduced by 13x, from 138 million to 10.3 million, again with no loss of accuracy.
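
    A minimal sketch of the three-step recipe (train, prune by magnitude, retrain) using PyTorch's built-in pruning utilities; this is my own illustration, not the authors' implementation, and the training loops are elided:

    ```python
    import torch
    import torch.nn as nn
    import torch.nn.utils.prune as prune

    model = nn.Sequential(nn.Linear(784, 300), nn.ReLU(), nn.Linear(300, 10))

    # 1) train the dense network as usual (training loop omitted)
    # 2) prune the lowest-magnitude connections, e.g. 90% of each linear layer's weights
    for m in model.modules():
        if isinstance(m, nn.Linear):
            prune.l1_unstructured(m, name="weight", amount=0.9)
    # 3) retrain: the pruning masks keep removed connections at zero during fine-tuning
    total = sum(b.numel() for n, b in model.named_buffers() if n.endswith("weight_mask"))
    kept = sum(int(b.sum()) for n, b in model.named_buffers() if n.endswith("weight_mask"))
    print(f"kept {kept}/{total} weights")
    ```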

    ","tags":["paper","efficient_dl","pruning","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Learning%20with%20Unmasked%20Tokens%20Drives%20Stronger%20Vision%20Learners/","title":"Learning with Unmasked Tokens Drives Stronger Vision Learners","text":"Properties authors Taekyung Kim, Sanghyuk Chun, Byeongho Heo, Dongyoon Han year 2024 url https://arxiv.org/abs/2310.13593

    Abstract

    Masked image modeling (MIM) has become a leading self-supervised learning strategy. MIMs such as Masked Autoencoder (MAE) learn strong representations by randomly masking input tokens for the encoder to process, with the decoder reconstructing the masked tokens to the input. However, MIM pre-trained encoders often exhibit a limited attention span, attributed to MIM's sole focus on regressing masked tokens only, which may impede the encoder's broader context learning. To tackle the limitation, we improve MIM by explicitly incorporating unmasked tokens into the training process. Specifically, our method enables the encoder to learn from broader context supervision, allowing unmasked tokens to experience broader contexts while the decoder reconstructs masked tokens. Thus, the encoded unmasked tokens are equipped with extensive contextual information, empowering masked tokens to leverage the enhanced unmasked tokens for MIM. As a result, our simple remedy trains more discriminative representations revealed by achieving 84.2% top-1 accuracy with ViT-B on ImageNet-1K with 0.6%p gain. We attribute the success to the enhanced pre-training method, as evidenced by the singular value spectrum and attention analyses. Finally, our models achieve significant performance gains at the downstream semantic segmentation and fine-grained visual classification tasks; and on diverse robust evaluation metrics. Code is available at\u00a0this https URL

    ","tags":["paper","foundation_models","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Learning%20with%20Unmasked%20Tokens%20Drives%20Stronger%20Vision%20Learners/#notes","title":"Notes","text":"

    Some notes regarding MIM as a good objective are on Masked Image Modelling.

    However, MIM strategies often encounter challenges, such as local dependency on attention to understand entire context of an image. For example, liu\u00a0et al.\u00a0[36]\u00a0revealed that MAE\u00a0[22], a state-of-the-art MIM method, exhibits shorter average attention distances. Furthermore, we observe that attention map patterns by MAE substantiate extremely local behavior (See Fig.\u00a01) indeed. In other words, the MAE-trained attention mechanism less integrates information across the entire image pixels and tends to focus on specific input regions. This is presumably attributed to MIM-pretraining, primarily dedicated to predicting low-level pixel details (e.g., color or texture) without a comprehensive understanding of less-regional information (e.g., the input structure or shape).

    This maybe should not really be an issue: How do vision transformers work? explicitly constrains ViTs to use only local attention and performance improves. So maybe this is an advantage? See Are less inductive biases better or worse?.

    ","tags":["paper","foundation_models","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Llama%202%20-%20Open%20Foundation%20and%20Fine-Tuned%20Chat%20Models/","title":"Llama 2 Open Foundation and Fine Tuned Chat Models","text":"Properties authors Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom year 2023 url https://arxiv.org/abs/2307.09288

    Abstract

    In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs.

    ","tags":["paper","foundation_models","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/LoRA%20-%20Low-Rank%20Adaptation%20of%20Large%20Language%20Models/","title":"LoRA Low Rank Adaptation of Large Language Models","text":"Properties authors Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen year 2021 url https://arxiv.org/abs/2106.09685

    Abstract

    An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains. As we pre-train larger models, full fine-tuning, which retrains all model parameters, becomes less feasible. Using GPT-3 175B as an example -- deploying independent instances of fine-tuned models, each with 175B parameters, is prohibitively expensive. We propose Low-Rank Adaptation, or LoRA, which freezes the pre-trained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks. Compared to GPT-3 175B fine-tuned with Adam, LoRA can reduce the number of trainable parameters by 10,000 times and the GPU memory requirement by 3 times. LoRA performs on-par or better than fine-tuning in model quality on RoBERTa, DeBERTa, GPT-2, and GPT-3, despite having fewer trainable parameters, a higher training throughput, and, unlike adapters, no additional inference latency. We also provide an empirical investigation into rank-deficiency in language model adaptation, which sheds light on the efficacy of LoRA. We release a package that facilitates the integration of LoRA with PyTorch models and provide our implementations and model checkpoints for RoBERTa, DeBERTa, and GPT-2 at\u00a0this https URL.
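
    A minimal sketch of the idea (my own illustration, not the released package): freeze the pretrained weight and learn a low-rank update \\(BA\\), so only \\(r(d_{in} + d_{out})\\) parameters per layer are trained:

    ```python
    import torch
    import torch.nn as nn

    class LoRALinear(nn.Module):
        def __init__(self, base: nn.Linear, r: int = 8, alpha: float = 16.0):
            super().__init__()
            self.base = base
            for p in self.base.parameters():
                p.requires_grad = False                                # frozen pretrained weights
            self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
            self.B = nn.Parameter(torch.zeros(base.out_features, r))   # zero init: no change at start
            self.scale = alpha / r

        def forward(self, x):
            return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

    layer = LoRALinear(nn.Linear(768, 768), r=8)
    y = layer(torch.randn(2, 768))   # only A and B receive gradients
    ```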

    ","tags":["paper","efficient_dl","peft"]},{"location":"100%20Reference%20notes/101%20Literature/Mamba%20-%20Linear-Time%20Sequence%20Modeling%20with%20Selective%20State%20Spaces/","title":"Mamba Linear Time Sequence Modeling with Selective State Spaces","text":"Properties authors Albert Gu, Tri Dao year 2023

    Abstract

    Foundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformers' computational inefficiency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. Second, even though this change prevents the use of efficient convolutions, we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simplified end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5\u00d7\u00a0higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.

    ","tags":["foundation_models","convolutions"]},{"location":"100%20Reference%20notes/101%20Literature/Memorization%20Through%20the%20Lens%20of%20Curvature%20of%20Loss%20Function%20Around%20Samples/","title":"Memorization Through the Lens of Curvature of Loss Function Around Samples","text":"Properties authors Isha Garg, Deepak Ravikumar, Kaushik Roy year 2024 url https://openreview.net/forum?id=WQbDS9RydY

    Abstract

    Deep neural networks are over-parameterized and easily overfit to and memorize the datasets that they train on. In the extreme case, it has been shown that networks can memorize a randomly labeled dataset. In this paper, we propose using the curvature of the loss function around each training sample, averaged over training epochs, as a measure of memorization of a sample. We show that this curvature metric effectively captures memorization statistics, both qualitatively and quantitatively in popular image datasets. We provide quantitative validation of the proposed metric against memorization scores released by Feldman & Zhang (2020). Further, experiments on mislabeled data detection show that corrupted samples are learned with high curvature and using curvature for identifying mislabelled examples outperforms existing approaches. Qualitatively, we find that high curvature samples correspond to long-tailed, mislabeled, or conflicting instances, indicating a likelihood of memorization. Notably, this analysis helps us find, to the best of our knowledge, a novel failure mode on the CIFAR100 and ImageNet datasets: that of duplicated images with differing labels.
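
    A hedged sketch of one way such a curvature score can be estimated (a Hutchinson-style estimate of the input-Hessian trace via finite differences of gradients); the authors' exact recipe, e.g. how scores are averaged over training epochs, may differ:

    ```python
    import torch

    def grad_wrt_input(model, loss_fn, x, y):
        x = x.clone().requires_grad_(True)
        return torch.autograd.grad(loss_fn(model(x), y), x)[0]

    def input_curvature(model, loss_fn, x, y, eps=1e-3, n_probes=4):
        est = 0.0
        for _ in range(n_probes):
            v = (torch.randint(0, 2, x.shape) * 2 - 1).to(x.dtype)       # Rademacher probe
            g_plus = grad_wrt_input(model, loss_fn, x + eps * v, y)
            g_minus = grad_wrt_input(model, loss_fn, x - eps * v, y)
            est = est + ((g_plus - g_minus) * v).sum() / (2 * eps)       # ~ v^T H v
        return est / n_probes                                            # ~ trace of the input Hessian

    model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(28 * 28, 10))
    x, y = torch.randn(1, 1, 28, 28), torch.tensor([3])
    print(input_curvature(model, torch.nn.functional.cross_entropy, x, y))
    ```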

    ","tags":["paper","dl_theory","llm"]},{"location":"100%20Reference%20notes/101%20Literature/Mixture%20of%20LoRa%20Experts/","title":"Mixture of LoRa Experts","text":"Properties authors Xun Wu, Shaohan Huang, Furu Wei year 2024 url https://arxiv.org/abs/2404.13628

    Abstract

    LoRA has gained widespread acceptance in the fine-tuning of large pre-trained models to cater to a diverse array of downstream tasks, showcasing notable effectiveness and efficiency, thereby solidifying its position as one of the most prevalent fine-tuning techniques. Due to the modular nature of LoRA's plug-and-play plugins, researchers have delved into the amalgamation of multiple LoRAs to empower models to excel across various downstream tasks. Nonetheless, extant approaches for LoRA fusion grapple with inherent challenges. Direct arithmetic merging may result in the loss of the original pre-trained model's generative capabilities or the distinct identity of LoRAs, thereby yielding suboptimal outcomes. On the other hand, Reference tuning-based fusion exhibits limitations concerning the requisite flexibility for the effective combination of multiple LoRAs. In response to these challenges, this paper introduces the Mixture of LoRA Experts (MoLE) approach, which harnesses hierarchical control and unfettered branch selection. The MoLE approach not only achieves superior LoRA fusion performance in comparison to direct arithmetic merging but also retains the crucial flexibility for combining LoRAs effectively. Extensive experimental evaluations conducted in both the Natural Language Processing (NLP) and Vision & Language (V&L) domains substantiate the efficacy of MoLE.

    ","tags":["paper","peft","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/MobileCLIP%20-%20Fast%20Image-Text%20Models%20through%20Multi-Modal%20Reinforced%20Training/","title":"MobileCLIP Fast Image Text Models through Multi Modal Reinforced Training","text":"Properties authors Pavan Kumar Anasosalu Vasu, Hadi Pouransari, Fartash Faghri, Raviteja Vemulapalli, Oncel Tuzel year 2023 url https://arxiv.org/abs/2311.17049

    Abstract

    Contrastive pretraining of image-text foundation models, such as CLIP, demonstrated excellent zero-shot performance and improved robustness on a wide range of downstream tasks. However, these models utilize large transformer-based encoders with significant memory and latency overhead which pose challenges for deployment on mobile devices. In this work, we introduce MobileCLIP -- a new family of efficient image-text models optimized for runtime performance along with a novel and efficient training approach, namely multi-modal reinforced training. The proposed training approach leverages knowledge transfer from an image captioning model and an ensemble of strong CLIP encoders to improve the accuracy of efficient models. Our approach avoids train-time compute overhead by storing the additional knowledge in a reinforced dataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for zero-shot classification and retrieval tasks on several datasets. Our MobileCLIP-S2 variant is 2.3\u00d7\u00a0faster while more accurate compared to previous best CLIP model based on ViT-B/16. We further demonstrate the effectiveness of our multi-modal reinforced training by training a CLIP model based on ViT-B/16 image backbone and achieving +2.9% average performance improvement on 38 evaluation benchmarks compared to the previous best. Moreover, we show that the proposed approach achieves 10\u00d7-1000\u00d7\u00a0improved learning efficiency when compared with non-reinforced CLIP training. Code and models are available at\u00a0this https URL\u00a0.

    ","tags":["paper","efficient_dl","efficient_vision","computer_vision","multimodal"]},{"location":"100%20Reference%20notes/101%20Literature/MobileViT%20-%20light-weight%2C%20general-purpose%2C%20and%20mobile-friendly%20vision%20transformer/","title":"MobileViT light weight, general purpose, and mobile friendly vision transformer","text":"Properties authors Sachin Mehta, Mohammad Rastegari year 2022 url https://arxiv.org/abs/2110.02178

    Abstract

    Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision transformers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce MobileViT, a light-weight and general-purpose vision transformer for mobile devices. MobileViT presents a different perspective for the global processing of information with transformers, i.e., transformers as convolutions. Our results show that MobileViT significantly outperforms CNN- and ViT-based networks across different tasks and datasets. On the ImageNet-1k dataset, MobileViT achieves top-1 accuracy of 78.4% with about 6 million parameters, which is 3.2% and 6.2% more accurate than MobileNetv3 (CNN-based) and DeIT (ViT-based) for a similar number of parameters. On the MS-COCO object detection task, MobileViT is 5.7% more accurate than MobileNetv3 for a similar number of parameters.

    Our source code is open-source and available at:\u00a0this https URL

    ","tags":["paper","efficient_dl","efficient_vision"]},{"location":"100%20Reference%20notes/101%20Literature/Model%20Compression%20in%20Practice%20-%20Lessons%20Learned%20from%20Practitioners%20Creating%20On-device%20Machine%20Learning%20Experiences/","title":"Model Compression in Practice Lessons Learned from Practitioners Creating On device Machine Learning Experiences","text":"Properties authors Fred Hohman, Mary Beth Kery, Donghao Ren, Dominik Moritz year 2024 url https://arxiv.org/abs/2310.04621

    Abstract

    On-device machine learning (ML) promises to improve the privacy, responsiveness, and proliferation of new, intelligent user experiences by moving ML computation onto everyday personal devices. However, today's large ML models must be drastically compressed to run efficiently on-device, a hurdle that requires deep, yet currently niche expertise. To engage the broader human-centered ML community in on-device ML experiences, we present the results from an interview study with 30 experts at Apple that specialize in producing efficient models. We compile tacit knowledge that experts have developed through practical experience with model compression across different hardware platforms. Our findings offer pragmatic considerations missing from prior work, covering the design process, trade-offs, and technical strategies that go into creating efficient models. Finally, we distill design recommendations for tooling to help ease the difficulty of this work and bring on-device ML into more widespread practice.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Model%20Compression%20in%20Practice%20-%20Lessons%20Learned%20from%20Practitioners%20Creating%20On-device%20Machine%20Learning%20Experiences/#notes","title":"Notes","text":"

    Specific techniques on model weights help reduce size, but an efficient model ultimately comes from more careful design of the loss function, the overall system, and which parts should and should not be modeled with ML. - [ ] How does the design of a loss function affect a model's efficiency? Note to myself to look into this in the future.

    Although posttraining quantization is considered \u201ceasy\u201d [E9] as far as ML compression techniques go, practitioners emphasized that it still often takes complex code to implement and there are many algorithm variations A survey of quantization methods for efficient neural network inference to experiment with [T5]. For models that need high accuracy, post-training quantization may not be enough to hit budget without unacceptable accuracy degradation [E9, E4, E13, E5].

    • Okay, so it's important to try a bunch of quantization techniques. Got it.

    \u201cIf you want to go to lower bit quantization, such as 4 or below, it\u2019s almost impossible to use post-training quantization because the difference in accuracy gets way too big. So for this level of compression you need to do training-aware compression.\u201d \u2014 E9

    • Cool, I didn't know that training-aware compression was such an important thing to consider from an industry perspective, not just a research one.

    Although training-aware compression is considered the best form of optimization A survey of quantization methods for efficient neural network inference, a major drawback is that it must be included in initial model training: \u201cNot starting early with compression is a dead end,\u201d [E3].

    • Why is that though? Why should compression-aware training happen from the start and not in the middle of training or even in finetuning? #rq

    [...] practitioners suggest estimating how much compression will be feasible with simple post-training quantization. To estimate quantization savings before training a model, first initialize the ML model architecture with random weights, then quantize, and test the model\u2019s speed and size on-device
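
    A minimal sketch of that tip (my own illustration, using PyTorch's dynamic post-training quantization as a stand-in; on-device speed would still be measured separately):

    ```python
    import os, torch

    def size_mb(model, path="tmp_state.pt"):
        torch.save(model.state_dict(), path)
        mb = os.path.getsize(path) / 1e6
        os.remove(path)
        return mb

    # randomly initialized architecture, no training needed for a size estimate
    model = torch.nn.Sequential(torch.nn.Linear(1024, 1024), torch.nn.ReLU(), torch.nn.Linear(1024, 10))
    quantized = torch.ao.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
    print(f"fp32: {size_mb(model):.2f} MB, int8: {size_mb(quantized):.2f} MB")
    ```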

    Strategy #6: Compression can degrade the accuracy of a model and change its behavior in unpredictable ways. It is essential to create a robust evaluation pipeline (e.g., defining metrics, curating test sets) before you start optimizing your model, so that you can reliably observe shifts in model error afterwards. To prevent degradation from a failed optimization, compare optimized models with varying amounts of compression to your original model, inspecting the metrics, subpopulation behaviors, and internals, such as weights and activations, to ensure they are within expected ranges.

    Okay, a robust evaluation pipeline is fundamental: create unit tests, and for quantization specifically check that the distributions of weights (obviously) and activations (less obviously) are within the expected ranges. The latter matters because of compounding degradation: errors introduced by quantization in early layers can compound into later layers in unexpected ways.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Neural%20Mechanics%20-%20Symmetry%20and%20Broken%20Conservation%20Laws%20in%20Deep%20Learning%20Dynamics/","title":"Neural Mechanics Symmetry and Broken Conservation Laws in Deep Learning Dynamics","text":"Properties authors Daniel Kunin, Javier Sagastuy-Brena, Surya Ganguli, Daniel L.K. Yamins, Hidenori Tanaka year 2020 url https://arxiv.org/abs/2012.04728

    Abstract

    Understanding the dynamics of neural network parameters during training is one of the key challenges in building a theoretical foundation for deep learning. A central obstacle is that the motion of a network in high-dimensional parameter space undergoes discrete finite steps along complex stochastic gradients derived from real-world datasets. We circumvent this obstacle through a unifying theoretical framework based on intrinsic symmetries embedded in a network's architecture that are present for any dataset. We show that any such symmetry imposes stringent geometric constraints on gradients and Hessians, leading to an associated conservation law in the continuous-time limit of stochastic gradient descent (SGD), akin to Noether's theorem in physics. We further show that finite learning rates used in practice can actually break these symmetry induced conservation laws. We apply tools from finite difference methods to derive modified gradient flow, a differential equation that better approximates the numerical trajectory taken by SGD at finite learning rates. We combine modified gradient flow with our framework of symmetries to derive exact integral expressions for the dynamics of certain parameter combinations. We empirically validate our analytic expressions for learning dynamics on VGG-16 trained on Tiny ImageNet. Overall, by exploiting symmetry, our work demonstrates that we can analytically describe the learning dynamics of various parameter combinations at finite learning rates and batch sizes for state of the art architectures trained on any dataset.
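
    A concrete worked instance of such a conservation law (my own example, not quoted from the paper): if the loss is invariant under rescaling of a group of parameters, \\(L(\\alpha w) = L(w)\\) for all \\(\\alpha > 0\\), then differentiating at \\(\\alpha = 1\\) gives \\(\\langle w, \\nabla_w L \\rangle = 0\\), so under gradient flow $$ \\frac{d}{dt} \\frac{1}{2}\\|w\\|^2 = -\\langle w, \\nabla_w L \\rangle = 0, $$ i.e. the norm of the scale-invariant parameters is conserved; finite learning rates break this conservation, exactly as the abstract describes.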

    ","tags":["dl2","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/On%20Good%20Practices%20for%20Task-Specific%20Distillation%20of%20Large%20Pretrained%20Visual%20Models/","title":"On Good Practices for Task Specific Distillation of Large Pretrained Visual Models","text":"Properties authors Juliette Marrie, Michael Arbel, Julien Mairal, Diane Larlus year 2024 url https://arxiv.org/abs/2402.11305

    Abstract

    Large pretrained visual models exhibit remarkable generalization across diverse recognition tasks. Yet, real-world applications often demand compact models tailored to specific problems. Variants of knowledge distillation have been devised for such a purpose, enabling task-specific compact models (the students) to learn from a generic large pretrained one (the teacher). In this paper, we show that the excellent robustness and versatility of recent pretrained models challenge common practices established in the literature, calling for a new set of optimal guidelines for task-specific distillation. To address the lack of samples in downstream tasks, we also show that a variant of Mixup based on stable diffusion complements standard data augmentation. This strategy eliminates the need for engineered text prompts and improves distillation of generic models into streamlined specialized networks.

    ","tags":["paper","distillation","foundation_models","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/On%20Good%20Practices%20for%20Task-Specific%20Distillation%20of%20Large%20Pretrained%20Visual%20Models/#notes","title":"Notes","text":"","tags":["paper","distillation","foundation_models","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/On%20the%20Relationship%20between%20Self-Attention%20and%20Convolutional%20Layers/","title":"On the Relationship between Self Attention and Convolutional Layers","text":"Properties authors Jean-Baptiste Cordonnier, Andreas Loukas, Martin Jaggi year 2020 url https://arxiv.org/abs/1911.03584

    Abstract

    Recent trends of incorporating attention mechanisms in vision have led researchers to reconsider the supremacy of convolutional layers as a primary building block. Beyond helping CNNs to handle long-range dependencies, Stand-Alone Self-Attention in Vision Models showed that attention can completely replace convolution and achieve state-of-the-art performance on vision tasks. This raises the question: do learned attention layers operate similarly to convolutional layers? This work provides evidence that attention layers can perform convolution and, indeed, they often learn to do so in practice. Specifically, we prove that a multi-head self-attention layer with sufficient number of heads is at least as expressive as any convolutional layer. Our numerical experiments then show that self-attention layers attend to pixel-grid patterns similarly to CNN layers, corroborating our analysis.

    ","tags":["transformers","convolutions","theory"]},{"location":"100%20Reference%20notes/101%20Literature/On%20the%20Relationship%20between%20Self-Attention%20and%20Convolutional%20Layers/#notes","title":"Notes","text":"
    • Note to self: fully read article, it looks fun \u23eb #personal
    ","tags":["transformers","convolutions","theory"]},{"location":"100%20Reference%20notes/101%20Literature/On%20the%20Symmetries%20of%20Deep%20Learning%20Models%20and%20their%20Internal%20Representations/","title":"On the Symmetries of Deep Learning Models and their Internal Representations","text":"Properties authors Charles Godfrey, Davis Brown, Tegan Emerson, Henry Kvnige year 2022 url https://arxiv.org/abs/2205.14258

    Abstract

    Symmetry is a fundamental tool in the exploration of a broad range of complex systems. In machine learning symmetry has been explored in both models and data. In this paper we seek to connect the symmetries arising from the architecture of a family of models with the symmetries of that family's internal representation of data. We do this by calculating a set of fundamental symmetry groups, which we call the intertwiner groups of the model. We connect intertwiner groups to a model's internal representations of data through a range of experiments that probe similarities between hidden states across models with the same architecture. Our work suggests that the symmetries of a network are propagated into the symmetries in that network's representation of data, providing us with a better understanding of how architecture affects the learning and prediction process. Finally, we speculate that for ReLU networks, the intertwiner groups may provide a justification for the common practice of concentrating model interpretability exploration on the activation basis in hidden layers rather than arbitrary linear combinations thereof.

    Notes: - The following papers study the effect of weight space symmetries on training dynamics: - Neural Mechanics - Symmetry and Broken Conservation Laws in Deep Learning Dynamics - Understanding symmetries in deep networks - G-SGD - Optimizing ReLU Neural Networks in its Positively Scale-Invariant Space - Deep Learning Book

    ","tags":["dl_theory","dl2"]},{"location":"100%20Reference%20notes/101%20Literature/OpenELM%20-%20An%20Efficient%20Language%20Model%20Family%20with%20Open-source%20Training%20and%20Inference%20Framework/","title":"OpenELM An Efficient Language Model Family with Open source Training and Inference Framework","text":"Properties authors Sachin Mehta, Mohammad Hossein Sekhavat, Qingqing Cao, Maxwell Horton, Yanzi Jin, Chenfan Sun, Iman Mirzadeh, Dmitry Belenko, Peter Zatloukal, Mohammad Rastegari year 2024 url https://arxiv.org/abs/2404.14619

    Abstract

    The reproducibility and transparency of large language models are crucial for advancing open research, ensuring the trustworthiness of results, and enabling investigations into data and model biases, as well as potential risks. To this end, we release OpenELM, a state-of-the-art open language model. OpenELM uses a layer-wise scaling strategy to efficiently allocate parameters within each layer of the transformer model, leading to enhanced accuracy. For example, with a parameter budget of approximately one billion parameters, OpenELM exhibits a 2.36% improvement in accuracy compared to OLMo while requiring\u00a02\u00d7\u00a0fewer pre-training tokens. Diverging from prior practices that only provide model weights and inference code, and pre-train on private datasets, our release includes the complete framework for training and evaluation of the language model on publicly available datasets, including training logs, multiple checkpoints, and pre-training configurations. We also release code to convert models to MLX library for inference and fine-tuning on Apple devices. This comprehensive release aims to empower and strengthen the open research community, paving the way for future open research endeavors. Our source code along with pre-trained model weights and training recipes is available at \\url{this https URL}. Additionally, \\model models can be found on HuggingFace at: \\url{this https URL}.

    ","tags":["llm","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/Optimal%20Brain%20Damage/","title":"Optimal Brain Damage","text":"Properties authors John Denker, Sara Solla, Yann LeCun year 1989 url https://proceedings.neurips.cc/paper/1989/hash/6c9882bbac1c7093bd25041881277658-Abstract.html

    Abstract

    We have used information-theoretic ideas to derive a class of practical and nearly optimal schemes for adapting the size of a neural network. By removing unimportant weights from a network, several improvements can be expected: better generalization, fewer training examples required, and improved speed of learning and/or classification. The basic idea is to use second-derivative information to make a tradeoff between network complexity and training set error. Experiments confirm the usefulness of the methods on a real-world application.

    OBD Pruning Algorithm

    Use a saliency measure based on the Hessian of the loss w.r.t. the parameters to pick which parameters to prune; fine-tune afterwards.
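
    A hedged summary of the standard OBD saliency derivation (assuming training has converged so first-order terms vanish, and a diagonal Hessian approximation): a second-order expansion of the loss under a weight perturbation \\(\\delta w\\) gives $$ \\delta L \\approx \\frac{1}{2} \\sum_k h_{kk}\\, \\delta w_k^2, \\qquad h_{kk} = \\frac{\\partial^2 L}{\\partial w_k^2}, $$ so removing weight \\(w_k\\) (setting \\(\\delta w_k = -w_k\\)) costs \\(s_k = \\frac{1}{2} h_{kk} w_k^2\\); weights with the smallest saliency \\(s_k\\) are pruned first.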

    ","tags":["paper","efficient_vision","efficient_dl","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Optimization%20Dynamics%20of%20Equivariant%20and%20Augmented%20Neural%20Networks/","title":"Optimization Dynamics of Equivariant and Augmented Neural Networks","text":"Properties authors Alex Flinth, Fredrik Ohlsson year 2023 url https://arxiv.org/abs/2303.13458

    Abstract

    We investigate the optimization of multilayer perceptrons on symmetric data. We compare the strategy of constraining the architecture to be equivariant to that of using augmentation. We show that, under natural assumptions on the loss and non-linearities, the sets of equivariant stationary points are identical for the two strategies, and that the set of equivariant layers is invariant under the gradient flow for augmented models. Finally, we show that stationary points may be unstable for augmented training although they are stable for the equivariant models.

    Main observations: 1. They show that if the augmented model is equivariantly initialized, it will remain equivariant during training (See Equivariance Initialization) 2. Compared to the equivariant approach, augmentation introduces no new equivariant stationary points, nor does it exclude existing ones. (See Multiple global minima) 3. The existence of a stable equivariant minimum is not guaranteed by augmentation. (See Multiple global minima)

    Regarding Equivariance Initialization in this work:

    We initialize \u03a6A with equivariant layers A0 \u2208 E by drawing matrices randomly from a standard Gaussian distribution, and then projecting them orthogonally onto E. We train the network on (finite) datasets D using gradient descent in three different ways.

    My intuition is that they do something like the isotropic convolution from Priors over Neural Network weights

    ","tags":["dl_theory","equivariance","optimization"]},{"location":"100%20Reference%20notes/101%20Literature/Parameter%20Efficient%20Fine-tuning%20of%20Self-supervised%20ViTs%20without%20Catastrophic%20Forgetting/","title":"Parameter Efficient Fine tuning of Self supervised ViTs without Catastrophic Forgetting","text":"Properties authors Reza Akbarian Bafghi, Nidhin Harilal, Claire Monteleoni, Maziar Raissi year 2024 url https://arxiv.org/abs/2404.17245

    Abstract

    Artificial neural networks often suffer from catastrophic forgetting, where learning new concepts leads to a complete loss of previously acquired knowledge. We observe that this issue is particularly magnified in vision transformers (ViTs), where post-pre-training and fine-tuning on new tasks can significantly degrade the model's original general abilities. For instance, a DINO ViT-Base/16 pre-trained on ImageNet-1k loses over 70% accuracy on ImageNet-1k after just 10 iterations of fine-tuning on CIFAR-100. Overcoming this stability-plasticity dilemma is crucial for enabling ViTs to continuously learn and adapt to new domains while preserving their initial knowledge. In this work, we study two new parameter-efficient fine-tuning strategies: (1)~Block Expansion, and (2) Low-rank adaptation (LoRA). Our experiments reveal that using either Block Expansion or LoRA on self-supervised pre-trained ViTs surpass fully fine-tuned ViTs in new domains while offering significantly greater parameter efficiency. Notably, we find that Block Expansion experiences only a minimal performance drop in the pre-training domain, thereby effectively mitigating catastrophic forgetting in pre-trained ViTs.

    ","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Parameter%20Efficient%20Fine-tuning%20of%20Self-supervised%20ViTs%20without%20Catastrophic%20Forgetting/#paper-results","title":"Paper Results","text":"Model N. params CIFAR-100 IN-1K Mean Standard Fine-tuning All 85.9 M 88.13 25.24 56.69 Top-3 21.3 M 84.56 74.15 79.36 Linear 76.9 K 80.57 76.11 78.34 LoRA \ud835\udc5f=4 301 K 87.91 66.82 77.37 \ud835\udc5f=8 448 K 88.27 65.99 77.13 \ud835\udc5f=16 743 K 87.84 65.06 76.45 Block Expansion \ud835\udc5d=1 7.2 M 82.72 75.75 79.24 \ud835\udc5d=2 14.3 M 86.70 75.54 81.12 \ud835\udc5d=3 21.3 M 88.58 74.61 81.60 \ud835\udc5d=4 28.4 M 89.09 72.28 80.69","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Parameter%20Efficient%20Fine-tuning%20of%20Self-supervised%20ViTs%20without%20Catastrophic%20Forgetting/#observations","title":"Observations","text":"
    • Linear-only fine-tuning does pretty well, kinda surprising.
    • It's kind of surprising that LoRA adapters do badly, but does it matter? What is the purpose of making LoRA resistant to catastrophic forgetting if the whole point of it is to be able to hot-swap modules depending on the task?
    • Also worth pointing out that Block Expansion requires training parameters on the order of millions while LoRA only requires hundreds of thousands.
    ","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Parameter-Efficient%20Fine-Tuning%20for%20Pre-Trained%20Vision%20Models%20-%20A%20Survey/","title":"Parameter Efficient Fine Tuning for Pre Trained Vision Models A Survey","text":"Properties authors Yi Xin, Siqi Luo, Haodi Zhou, Junlong Du, Xiaohong Liu, Yue Fan, Qing Li, Yuntao Du year 2024 url https://arxiv.org/abs/2402.02242

    Abstract

    Large-scale pre-trained vision models (PVMs) have shown great potential for adaptability across various downstream vision tasks. However, with state-of-the-art PVMs growing to billions or even trillions of parameters, the standard full fine-tuning paradigm is becoming unsustainable due to high computational and storage demands. In response, researchers are exploring parameter-efficient fine-tuning (PEFT), which seeks to exceed the performance of full fine-tuning with minimal parameter modifications. This survey provides a comprehensive overview and future directions for visual PEFT, offering a systematic review of the latest advancements. First, we provide a formal definition of PEFT and discuss model pre-training methods. We then categorize existing methods into three categories: addition-based, partial-based, and unified-based. Finally, we introduce the commonly used datasets and applications and suggest potential future research challenges. A comprehensive collection of resources is available at\u00a0this https URL.

    ","tags":["paper","efficient_dl","efficient_vision","transformers","peft"]},{"location":"100%20Reference%20notes/101%20Literature/Progress%20measures%20for%20grokking%20via%20mechanistic%20interpretability/","title":"Progress measures for grokking via mechanistic interpretability","text":"Properties authors Neel Nanda, Lawrence Chan, Tom Lieberum, Jess Smith, Jacob Steinhardt year 2023 url https://arxiv.org/abs/2301.05217

    Abstract

    Neural networks often exhibit emergent behavior, where qualitatively new capabilities arise from scaling up the amount of parameters, training data, or training steps. One approach to understanding emergence is to find continuous \\textit{progress measures} that underlie the seemingly discontinuous qualitative changes. We argue that progress measures can be found via mechanistic interpretability: reverse-engineering learned behaviors into their individual components. As a case study, we investigate the recently-discovered phenomenon of ``grokking'' exhibited by small transformers trained on modular addition tasks. We fully reverse engineer the algorithm learned by these networks, which uses discrete Fourier transforms and trigonometric identities to convert addition to rotation about a circle. We confirm the algorithm by analyzing the activations and weights and by performing ablations in Fourier space. Based on this understanding, we define progress measures that allow us to study the dynamics of training and split training into three continuous phases: memorization, circuit formation, and cleanup. Our results show that grokking, rather than being a sudden shift, arises from the gradual amplification of structured mechanisms encoded in the weights, followed by the later removal of memorizing components.

    Related - Grokking

    ","tags":["paper","interpretability","mechinterp"]},{"location":"100%20Reference%20notes/101%20Literature/Provably%20Strict%20Generalisation%20Benefit%20for%20Equivariant%20Models/","title":"Provably Strict Generalisation Benefit for Equivariant Models","text":"Properties authors Bryn Elesedy, Sheheryar Zaidi year 2021 url https://arxiv.org/abs/2102.10333

    Abstract

    It is widely believed that engineering a model to be invariant/equivariant improves generalisation. Despite the growing popularity of this approach, a precise characterisation of the generalisation benefit is lacking. By considering the simplest case of linear models, this paper provides the first provably non-zero improvement in generalisation for invariant/equivariant models when the target distribution is invariant/equivariant with respect to a compact group. Moreover, our work reveals an interesting relationship between generalisation, the number of training examples and properties of the group action. Our results rest on an observation of the structure of function spaces under averaging operators which, along with its consequences for feature averaging, may be of independent interest.

    ","tags":["dl_theory","equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/ProxylessNAS%20-%20Direct%20Neural%20Architecture%20Search%20on%20Target%20Task%20and%20Hardware/","title":"ProxylessNAS Direct Neural Architecture Search on Target Task and Hardware","text":"Properties authors Han Cai, Ligeng Zhu, Song Han year 2019 url https://arxiv.org/abs/1812.00332

    Abstract

    Neural architecture search (NAS) has a great impact by automatically designing effective neural network architectures. However, the prohibitive computational demand of conventional NAS algorithms (e.g.\u00a0104\u00a0GPU hours) makes it difficult to \\emph{directly} search the architectures on large-scale tasks (e.g. ImageNet). Differentiable NAS can reduce the cost of GPU hours via a continuous representation of network architecture but suffers from the high GPU memory consumption issue (grow linearly w.r.t. candidate set size). As a result, they need to utilize~\\emph{proxy} tasks, such as training on a smaller dataset, or learning with only a few blocks, or training just for a few epochs. These architectures optimized on proxy tasks are not guaranteed to be optimal on the target task. In this paper, we present \\emph{ProxylessNAS} that can \\emph{directly} learn the architectures for large-scale target tasks and target hardware platforms. We address the high memory consumption issue of differentiable NAS and reduce the computational cost (GPU hours and GPU memory) to the same level of regular training while still allowing a large candidate set. Experiments on CIFAR-10 and ImageNet demonstrate the effectiveness of directness and specialization. On CIFAR-10, our model achieves 2.08\\% test error with only 5.7M parameters, better than the previous state-of-the-art architecture AmoebaNet-B, while using 6\u00d7\u00a0fewer parameters. On ImageNet, our model achieves 3.1\\% better top-1 accuracy than MobileNetV2, while being 1.2\u00d7\u00a0faster with measured GPU latency. We also apply ProxylessNAS to specialize neural architectures for hardware with direct hardware metrics (e.g. latency) and provide insights for efficient CNN architecture design.

    ","tags":["paper","efficient_dl","nas"]},{"location":"100%20Reference%20notes/101%20Literature/ProxylessNAS%20-%20Direct%20Neural%20Architecture%20Search%20on%20Target%20Task%20and%20Hardware/#notes","title":"Notes","text":"
    • To avoid measuring performance on the target device, they learn a latency model.
      1. They take multiple measurements of a device with different architectures.
      2. They train a model to predict the latency given the architecture (a minimal sketch follows below).
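
    A minimal sketch of such a latency predictor (my own illustration; the architecture encoding, predictor size, and data here are placeholders, not ProxylessNAS's actual design):

    ```python
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    n_layers, n_ops = 20, 6
    encode = lambda arch: F.one_hot(arch, n_ops).float().flatten()   # per-layer one-hot of the chosen op

    predictor = nn.Sequential(nn.Linear(n_layers * n_ops, 64), nn.ReLU(), nn.Linear(64, 1))
    opt = torch.optim.Adam(predictor.parameters(), lr=1e-3)

    # (architecture, measured latency) pairs would come from profiling sampled nets on the target device
    archs = torch.randint(0, n_ops, (128, n_layers))
    latency_ms = torch.rand(128, 1) * 30                             # placeholder measurements

    for _ in range(200):
        opt.zero_grad()
        pred = torch.stack([predictor(encode(a)) for a in archs])
        F.mse_loss(pred, latency_ms).backward()
        opt.step()
    ```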
    ","tags":["paper","efficient_dl","nas"]},{"location":"100%20Reference%20notes/101%20Literature/R-MAE%20-%20Regions%20Meet%20Masked%20Autoencoders/","title":"R MAE Regions Meet Masked Autoencoders","text":"Properties authors Duy-Kien Nguyen, Vaibhav Aggarwal, Yanghao Li, Martin R. Oswald, Alexander Kirillov, Cees G. M. Snoek, Xinlei Chen year 2023 url https://arxiv.org/abs/2306.05411

    Abstract

    In this work, we explore regions as a potential visual analogue of words for self-supervised image representation learning. Inspired by Masked Autoencoding (MAE), a generative pre-training baseline, we propose masked region autoencoding to learn from groups of pixels or regions. Specifically, we design an architecture which efficiently addresses the one-to-many mapping between images and regions, while being highly effective especially with high-quality regions. When integrated with MAE, our approach (R-MAE) demonstrates consistent improvements across various pre-training datasets and downstream detection and segmentation benchmarks, with negligible computational overheads. Beyond the quantitative evaluation, our analysis indicates the models pre-trained with masked region autoencoding unlock the potential for interactive segmentation. The code is provided at\u00a0this https URL.

    ","tags":["paper","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/R-MAE%20-%20Regions%20Meet%20Masked%20Autoencoders/#note","title":"Note","text":"
    • Note to self: Read in depth
    ","tags":["paper","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/Refusal%20in%20Language%20Models%20Is%20Mediated%20by%20a%20Single%20Direction/","title":"Refusal in Language Models Is Mediated by a Single Direction","text":"Properties authors Andy Arditi, Oscar Obeso, Aaquib Syed, Daniel Paleka, Nina Rimsky, Wes Gurnee, Neel Nanda year 2024 url https://arxiv.org/abs/2406.11717

    Abstract

    Conversational large language models are fine-tuned for both instruction-following and safety, resulting in models that obey benign requests but refuse harmful ones. While this refusal behavior is widespread across chat models, its underlying mechanisms remain poorly understood. In this work, we show that refusal is mediated by a one-dimensional subspace, across 13 popular open-source chat models up to 72B parameters in size. Specifically, for each model, we find a single direction such that erasing this direction from the model's residual stream activations prevents it from refusing harmful instructions, while adding this direction elicits refusal on even harmless instructions. Leveraging this insight, we propose a novel white-box jailbreak method that surgically disables refusal with minimal effect on other capabilities. Finally, we mechanistically analyze how adversarial suffixes suppress propagation of the refusal-mediating direction. Our findings underscore the brittleness of current safety fine-tuning methods. More broadly, our work showcases how an understanding of model internals can be leveraged to develop practical methods for controlling model behavior.
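
    A hedged sketch of the "erase a direction" operation (directional ablation by projection); how the direction is found, e.g. as a difference of mean activations over harmful vs. harmless prompts, is my assumption here rather than a detail quoted from the abstract:

    ```python
    import torch

    def ablate_direction(acts: torch.Tensor, direction: torch.Tensor) -> torch.Tensor:
        r = direction / direction.norm()
        return acts - (acts @ r)[..., None] * r            # remove the component along r

    # placeholder statistics standing in for mean residual-stream activations
    harmful_mean, harmless_mean = torch.randn(4096), torch.randn(4096)
    refusal_dir = harmful_mean - harmless_mean             # assumed difference-of-means direction
    cleaned = ablate_direction(torch.randn(8, 4096), refusal_dir)
    ```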

    ","tags":["paper","transformers","mechinterp","interpretability"]},{"location":"100%20Reference%20notes/101%20Literature/Relaxed%20Octahedral%20Group%20Convolution%20for%20Learning%20Symmetry%20Breaking%20in%203D%20Physical%20Systems/","title":"Relaxed Octahedral Group Convolution for Learning Symmetry Breaking in 3D Physical Systems","text":"Properties authors Rui Wang, Robin Walters, Tess E Smidt year 2023 url https://arxiv.org/abs/2310.02299

    Abstract

    Deep equivariant models use symmetries to improve sample efficiency and generalization. However, the assumption of perfect symmetry in many of these models can sometimes be restrictive, especially when the data does not perfectly align with such symmetries. Thus, we introduce relaxed octahedral group convolution for modeling 3D physical systems in this paper. This flexible convolution technique provably allows the model to both maintain the highest level of equivariance that is consistent with data and discover the subtle symmetry-breaking factors in the physical systems. Empirical results validate that our approach can not only provide insights into the symmetry-breaking factors in phase transitions but also achieves superior performance in fluid super-resolution tasks.

    ","tags":["relaxed_equivariance","equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/Relaxing%20Equivariance%20Constraints%20with%20Non-stationary%20Continuous%20Filters/","title":"Relaxing Equivariance Constraints with Non stationary Continuous Filters","text":"Properties authors David W. Romero

    Abstract

    ","tags":["dl2","equivariance","partial_equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/Retrospective%20-%20EIE%20-%20Efficient%20Inference%20Engine%20onSparse%20and%20Compressed%20Neural%20Network/","title":"Retrospective EIE Efficient Inference Engine onSparse and Compressed Neural Network","text":"Properties authors Song Han, Xingyu Liu, Huizi Mao, Jing Pu, Ardavan Pedram, Mark A. Horowitz, William J. Dally year 2023 url https://arxiv.org/abs/2306.09552

    Abstract

    EIE proposed to accelerate pruned and compressed neural networks, exploiting weight sparsity, activation sparsity, and 4-bit weight-sharing in neural network accelerators. Since published in ISCA'16, it opened a new design space to accelerate pruned and sparse neural networks and spawned many algorithm-hardware co-designs for model compression and acceleration, both in academia and commercial AI chips. In retrospect, we review the background of this project, summarize the pros and cons, and discuss new opportunities where pruning, sparsity, and low precision can accelerate emerging deep learning workloads.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Revealing%20the%20Utilized%20Rank%20of%20Subspaces%20of%20Learning%20in%20Neural%20Networks/","title":"Revealing the Utilized Rank of Subspaces of Learning in Neural Networks","text":"Properties authors Isha Garg, Christian Koguchi, Eshan Verma, Daniel Ulbricht year 2024 url https://arxiv.org/abs/2407.04797

    Abstract

    In this work, we study how well the learned weights of a neural network utilize the space available to them. This notion is related to capacity, but additionally incorporates the interaction of the network architecture with the dataset. Most learned weights appear to be full rank, and are therefore not amenable to low rank decomposition. This deceptively implies that the weights are utilizing the entire space available to them. We propose a simple data-driven transformation that projects the weights onto the subspace where the data and the weight interact. This preserves the functional mapping of the layer and reveals its low rank structure. In our findings, we conclude that most models utilize a fraction of the available space. For instance, for ViTB-16 and ViTL-16 trained on ImageNet, the mean layer utilization is 35% and 20% respectively. Our transformation results in reducing the parameters to 50% and 25% respectively, while resulting in less than 0.2% accuracy drop after fine-tuning. We also show that self-supervised pre-training drives this utilization up to 70%, justifying its suitability for downstream tasks.
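
    A heavily hedged sketch of my reading of this transformation (the authors' actual method may differ): project the weight onto the subspace spanned by the layer's inputs, \\(W_{proj} = W V_k V_k^\\top\\), which leaves \\(Wx\\) unchanged for inputs in that subspace and exposes how much of the available space the layer really uses:

    ```python
    import torch

    def project_onto_data_subspace(W, X, energy=0.99):
        """W: (d_out, d_in) layer weight; X: (n_samples, d_in) inputs seen by the layer."""
        _, S, Vh = torch.linalg.svd(X, full_matrices=False)
        k = int((torch.cumsum(S**2, 0) / (S**2).sum() < energy).sum()) + 1   # keep `energy` of the data variance
        Vk = Vh[:k].T                                                        # (d_in, k) basis of the data subspace
        return W @ Vk @ Vk.T, k

    W = torch.randn(256, 512)
    X = torch.randn(1000, 64) @ torch.randn(64, 512)      # synthetic low-rank data
    W_proj, used_rank = project_onto_data_subspace(W, X)
    print(used_rank)                                      # ~64 even though W itself is full rank
    ```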

    ","tags":["paper","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Rewrite%20the%20Stars/","title":"Rewrite the Stars","text":"Properties authors Xu Ma, Xiyang Dai, Yue Bai, Yizhou Wang, Yun Fu year 2024 url https://arxiv.org/abs/2403.19967

    Abstract

    Recent studies have drawn attention to the untapped potential of the \"star operation\" (element-wise multiplication) in network design. While intuitive explanations abound, the foundational rationale behind its application remains largely unexplored. Our study attempts to reveal the star operation's ability to map inputs into high-dimensional, non-linear feature spaces -- akin to kernel tricks -- without widening the network. We further introduce StarNet, a simple yet powerful prototype, demonstrating impressive performance and low latency under compact network structure and efficient budget. Like stars in the sky, the star operation appears unremarkable but holds a vast universe of potential. Our work encourages further exploration across tasks, with codes available at\u00a0this https URL.
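
    A minimal sketch of the star operation the paper studies, i.e. the element-wise product of two linear branches \\((W_1 x) * (W_2 x)\\) (my own illustration of the block shape, not StarNet itself):

    ```python
    import torch
    import torch.nn as nn

    class StarBlock(nn.Module):
        def __init__(self, dim: int, hidden: int):
            super().__init__()
            self.f1 = nn.Linear(dim, hidden)
            self.f2 = nn.Linear(dim, hidden)
            self.out = nn.Linear(hidden, dim)

        def forward(self, x):
            return self.out(self.f1(x) * self.f2(x))   # the "star" (element-wise multiplication)

    y = StarBlock(64, 128)(torch.randn(2, 64))
    ```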

    ","tags":["dl_theory","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/SAM-CLIP%20-%20Merging%20Vision%20Foundation%20Models%20towards%20Semantic%20and%20Spatial%20Understanding/","title":"SAM CLIP Merging Vision Foundation Models towards Semantic and Spatial Understanding","text":"Properties authors Haoxiang Wang, Fartash Faghri, Raviteja Vemulapalli, Mehrdad Farajtabar, Sachin Mehta, Mohammad Rastegari, Oncel Tuzel, Hadi Pouransari, Pavan Kumar Anasosalu Vasu year 2024 url https://arxiv.org/abs/2310.15308

    Abstract

    The landscape of publicly available vision foundation models (VFMs), such as CLIP and Segment Anything Model (SAM), is expanding rapidly. VFMs are endowed with distinct capabilities stemming from their pre-training objectives. For instance, CLIP excels in semantic understanding, while SAM specializes in spatial understanding for segmentation. In this work, we introduce a simple recipe to efficiently merge VFMs into a unified model that absorbs their expertise. Our method integrates techniques of multi-task learning, continual learning, and distillation. Further, it demands significantly less computational cost compared to traditional multi-task training from scratch, and it only needs a small fraction of the pre-training datasets that were initially used to train individual models. By applying our method to SAM and CLIP, we obtain SAM-CLIP: a unified model that combines the capabilities of SAM and CLIP into a single vision transformer. Compared with deploying SAM and CLIP independently, our merged model, SAM-CLIP, reduces storage and compute costs for inference, making it well-suited for edge device applications. We show that SAM-CLIP not only retains the foundational strengths of SAM and CLIP, but also introduces synergistic functionalities, notably in zero-shot semantic segmentation, where SAM-CLIP establishes new state-of-the-art results on 5 benchmarks. It outperforms previous models that are specifically designed for this task by a large margin, including +6.8% and +5.9% mean IoU improvement on Pascal-VOC and COCO-Stuff datasets, respectively.

    ","tags":["paper","efficient_dl","efficient_vision"]},{"location":"100%20Reference%20notes/101%20Literature/Scaling%20%28Down%29%20CLIP%20-%20A%20Comprehensive%20Analysis%20of%20Data%2C%20Architecture%2C%20and%20Training%20Strategies/","title":"Scaling (Down) CLIP A Comprehensive Analysis of Data, Architecture, and Training Strategies","text":"Properties authors Zichao Li, Cihang Xie, Ekin Dogus Cubuk year 2024 url https://arxiv.org/abs/2404.08197

    Abstract

    This paper investigates the performance of the Contrastive Language-Image Pre-training (CLIP) when scaled down to limited computation budgets. We explore CLIP along three dimensions: data, architecture, and training strategies. With regards to data, we demonstrate the significance of high-quality training data and show that a smaller dataset of high-quality data can outperform a larger dataset with lower quality. We also examine how model performance varies with different dataset sizes, suggesting that smaller ViT models are better suited for smaller datasets, while larger models perform better on larger datasets with fixed compute. Additionally, we provide guidance on when to choose a CNN-based architecture or a ViT-based architecture for CLIP training. We compare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data Augmentation - and show that the choice of training strategy depends on the available compute resource. Our analysis reveals that CLIP+Data Augmentation can achieve comparable performance to CLIP using only half of the training data. This work provides practical insights into how to effectively train and deploy CLIP models, making them more accessible and affordable for practical use in various applications.

    ","tags":["efficient_dl","vit","cnn"]},{"location":"100%20Reference%20notes/101%20Literature/Segment%20Anything/","title":"Segment Anything","text":"Properties authors Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r, Ross Girshick year 2023 url https://arxiv.org/abs/2304.02643

    Abstract

    We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at\u00a0this https URL\u00a0to foster research into foundation models for computer vision.

    ","tags":["paper","segmentation","computer_vision","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/Self-Supervised%20Detection%20of%20Perfect%20and%20Partial%20Input-Dependent%20Symmetries/","title":"Self Supervised Detection of Perfect and Partial Input Dependent Symmetries","text":"Properties authors David W. Romero, Alonso Urbano","tags":["dl2","equivariance","partial_equivariance","inductive_bias"]},{"location":"100%20Reference%20notes/101%20Literature/SimPLR%20-%20A%20Simple%20and%20Plain%20Transformer%20for%20Scaling-Efficient%20Object%20Detection%20and%20Segmentation/","title":"SimPLR A Simple and Plain Transformer for Scaling Efficient Object Detection and Segmentation","text":"Properties authors Duy-Kien Nguyen, Martin R. Oswald, Cees G. M. Snoek year 2024 url https://arxiv.org/abs/2310.05920

    Abstract

    The ability to detect objects in images at varying scales has played a pivotal role in the design of modern object detectors. Despite considerable progress in removing hand-crafted components and simplifying the architecture with transformers, multi-scale feature maps and/or pyramid design remain a key factor for their empirical success. In this paper, we show that this reliance on either feature pyramids or an hierarchical backbone is unnecessary and a transformer-based detector with scale-aware attention enables the plain detector 'SimPLR' whose backbone and detection head are both non-hierarchical and operate on single-scale features. We find through our experiments that SimPLR with scale-aware attention is plain and simple, yet competitive with multi-scale vision transformer alternatives. Compared to the multi-scale and single-scale state-of-the-art, our model scales much better with bigger capacity (self-supervised) models and more pre-training data, allowing us to report a consistently better accuracy and faster runtime for object detection, instance segmentation as well as panoptic segmentation. Code will be released.

    ","tags":["paper","object_detection","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/SimPLR%20-%20A%20Simple%20and%20Plain%20Transformer%20for%20Scaling-Efficient%20Object%20Detection%20and%20Segmentation/#notes","title":"Notes","text":"

    \u201cDespite enabling plain-backbone detectors, feature pyramids are still an important factor in ViTDet to detect objects at various scales\u201d (Nguyen et al., 2024, p. 4)

    Not really an issue as far as I understand, but in the spirit of less inductive biases it makes sense. Feature pyramids intuitively hardcode scale information.

    \u201cMost recently, Lin et al. [35] introduce the transformer-based detector, PlainDETR, which also removes the multi-scale input. However, it still relies on multi-scale features to generate the object proposals for its decoder.\u201d (Nguyen et al., 2024, p. 4)

    I don't quite understand this; does this still allow arbitrary ViTs? - [ ] Read PlainDETR \ud83d\udd3d

    ","tags":["paper","object_detection","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Simultaneous%20linear%20connectivity%20of%20neural%20networks%20modulo%20permutation/","title":"Simultaneous linear connectivity of neural networks modulo permutation","text":"Properties authors Ekansh Sharma, Devin Kwok, Tom Denton, Daniel M. Roy, David Rolnick, Gintare Karolina Dziugaite year 2024 url https://arxiv.org/abs/2404.06498

    Abstract

    Neural networks typically exhibit permutation symmetries which contribute to the non-convexity of the networks' loss landscapes, since linearly interpolating between two permuted versions of a trained network tends to encounter a high loss barrier. Recent work has argued that permutation symmetries are the only sources of non-convexity, meaning there are essentially no such barriers between trained networks if they are permuted appropriately. In this work, we refine these arguments into three distinct claims of increasing strength. We show that existing evidence only supports \"weak linear connectivity\"-that for each pair of networks belonging to a set of SGD solutions, there exist (multiple) permutations that linearly connect it with the other networks. In contrast, the claim \"strong linear connectivity\"-that for each network, there exists one permutation that simultaneously connects it with the other networks-is both intuitively and practically more desirable. This stronger claim would imply that the loss landscape is convex after accounting for permutation, and enable linear interpolation between three or more independently trained models without increased loss. In this work, we introduce an intermediate claim-that for certain sequences of networks, there exists one permutation that simultaneously aligns matching pairs of networks from these sequences. Specifically, we discover that a single permutation aligns sequences of iteratively trained as well as iteratively pruned networks, meaning that two networks exhibit low loss barriers at each step of their optimization and sparsification trajectories respectively. Finally, we provide the first evidence that strong linear connectivity may be possible under certain conditions, by showing that barriers decrease with increasing network width when interpolating among three networks.
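
    To make the notion of a loss barrier concrete, here is a minimal sketch (my own illustration, not code from the paper) that linearly interpolates two state dicts and measures the barrier along the path; in the modulo-permutation setting one network's weights would first be permuted into alignment:

    ```python
    import torch

    def interpolate_state_dicts(sd_a, sd_b, alpha):
        # convex combination of two sets of weights with identical architecture
        return {k: (1 - alpha) * sd_a[k] + alpha * sd_b[k] for k in sd_a}

    @torch.no_grad()
    def loss_barrier(model, sd_a, sd_b, loss_fn, batch, steps=11):
        # barrier = max loss on the linear path minus the mean of the endpoint losses
        x, y = batch
        losses = []
        for alpha in torch.linspace(0, 1, steps):
            model.load_state_dict(interpolate_state_dicts(sd_a, sd_b, alpha.item()))
            losses.append(loss_fn(model(x), y).item())
        return max(losses) - 0.5 * (losses[0] + losses[-1])
    ```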

    ","tags":["dl_theory","linear_connectivity","network_permutation_symmetries"]},{"location":"100%20Reference%20notes/101%20Literature/Stand-Alone%20Self-Attention%20in%20Vision%20Models/","title":"Stand Alone Self Attention in Vision Models","text":"Properties authors Prajit Ramachandran, Niki Parmar, Ashish Vaswani, Irwan Bello, Anselm Levskaya, Jonathon Shlens year 2019

    Abstract

    Convolutions are a fundamental building block of modern computer vision systems. Recent approaches have argued for going beyond convolutions in order to capture long-range dependencies. These efforts focus on augmenting convolutional models with content-based interactions, such as self-attention and non-local means, to achieve gains on a number of vision tasks. The natural question that arises is whether attention can be a stand-alone primitive for vision models instead of serving as just an augmentation on top of convolutions. In developing and testing a pure self-attention vision model, we verify that self-attention can indeed be an effective stand-alone layer. A simple procedure of replacing all instances of spatial convolutions with a form of self-attention applied to ResNet model produces a fully self-attentional model that outperforms the baseline on ImageNet classification with 12% fewer FLOPS and 29% fewer parameters. On COCO object detection, a pure self-attention model matches the mAP of a baseline RetinaNet while having 39% fewer FLOPS and 34% fewer parameters. Detailed ablation studies demonstrate that self-attention is especially impactful when used in later layers. These results establish that stand-alone self-attention is an important addition to the vision practitioner's toolbox.

    ","tags":["vit","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Surgical%20Fine-Tuning%20Improves%20Adaptation%20to%20Distribution%20Shifts/","title":"Surgical Fine Tuning Improves Adaptation to Distribution Shifts","text":"Properties authors Yoonho Lee, Annie S. Chen, Fahim Tajwar, Huaxiu Yao, Percy Liang, Chelsea Finn, Ananya Kumar year 2022 url https://arxiv.org/abs/2210.11466

    Abstract

    A common approach to transfer learning under distribution shift is to fine-tune the last few layers of a pre-trained model, preserving learned features while also adapting to the new task. This paper shows that in such settings, selectively fine-tuning a subset of layers (which we term surgical fine-tuning) matches or outperforms commonly used fine-tuning approaches. Moreover, the type of distribution shift influences which subset is more effective to tune: for example, for image corruptions, fine-tuning only the first few layers works best. We validate our findings systematically across seven real-world data tasks spanning three types of distribution shifts. Theoretically, we prove that for two-layer neural networks in an idealized setting, first-layer tuning can outperform fine-tuning all layers. Intuitively, fine-tuning more parameters on a small target dataset can cause information learned during pre-training to be forgotten, and the relevant information depends on the type of shift.

    Notes: - The paper notes that which subset of layers (parameters) is best to fine-tune depends on the type of distribution shift. - They provide an automatic procedure for selecting those layers that beats full fine-tuning but is suboptimal compared to expert/surgical fine-tuning; they suggest future work in this direction. See the sketch below.
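
    A minimal sketch of the surgical idea under assumed choices (the model, the unfrozen stage, and the hyperparameters are mine for illustration; the paper also proposes an automatic selection procedure):

    ```python
    import torch
    from torchvision.models import resnet18

    model = resnet18(weights='IMAGENET1K_V1')

    # Freeze everything, then unfreeze only the chosen subset of layers.
    # For image-corruption shifts the paper reports that tuning the first layers works best.
    for p in model.parameters():
        p.requires_grad = False
    for p in model.layer1.parameters():  # surgical choice: first residual stage only
        p.requires_grad = True

    optimizer = torch.optim.SGD(
        [p for p in model.parameters() if p.requires_grad], lr=1e-3, momentum=0.9
    )
    ```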

    ","tags":["paper","peft","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Surgical-DINO%20-%20Adapter%20Learning%20of%20Foundation%20Models%20for%20Depth%20Estimation%20in%20Endoscopic%20Surgery/","title":"Surgical DINO Adapter Learning of Foundation Models for Depth Estimation in Endoscopic Surgery","text":"Properties authors Beilei Cui, Mobarakol Islam, Long Bai, Hongliang Ren year 2024 url https://arxiv.org/abs/2401.06013

    Abstract

    Purpose: Depth estimation in robotic surgery is vital in 3D reconstruction, surgical navigation and augmented reality visualization. Although the foundation model exhibits outstanding performance in many vision tasks, including depth estimation (e.g., DINOv2), recent works observed its limitations in medical and surgical domain-specific applications. This work presents a low-ranked adaptation (LoRA) of the foundation model for surgical depth estimation. Methods: We design a foundation model-based depth estimation method, referred to as Surgical-DINO, a low-rank adaptation of the DINOv2 for depth estimation in endoscopic surgery. We build LoRA layers and integrate them into DINO to adapt with surgery-specific domain knowledge instead of conventional fine-tuning. During training, we freeze the DINO image encoder, which shows excellent visual representation capacity, and only optimize the LoRA layers and depth decoder to integrate features from the surgical scene. Results: Our model is extensively validated on a MICCAI challenge dataset of SCARED, which is collected from da Vinci Xi endoscope surgery. We empirically show that Surgical-DINO significantly outperforms all the state-of-the-art models in endoscopic depth estimation tasks. The analysis with ablation studies has shown evidence of the remarkable effect of our LoRA layers and adaptation. Conclusion: Surgical-DINO shed some light on the successful adaptation of the foundation models into the surgical domain for depth estimation. There is clear evidence in the results that zero-shot prediction on pre-trained weights in computer vision datasets or naive fine-tuning is not sufficient to use the foundation model in the surgical domain directly. Code is available at\u00a0this https URL.

    References: - LoRA - Low-Rank Adaptation of Large Language Models

    Keywords: - LoRA Adapter

    ","tags":["paper","efficient_dl","efficient_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Symmetries%20in%20Overparametrized%20Neural%20Networks%20-%20A%20Mean-Field%20View/","title":"Symmetries in Overparametrized Neural Networks A Mean Field View","text":"Properties authors Javier Maass Martinez, Joaquin Fontbona year 2024 url https://arxiv.org/abs/2405.19995

    Abstract

    We develop a Mean-Field (MF) view of the learning dynamics of overparametrized Artificial Neural Networks (NN) under data symmetric in law wrt the action of a general compact group\u00a0G. We consider for this a class of generalized shallow NNs given by an ensemble of\u00a0N\u00a0multi-layer units, jointly trained using stochastic gradient descent (SGD) and possibly symmetry-leveraging (SL) techniques, such as Data Augmentation (DA), Feature Averaging (FA) or Equivariant Architectures (EA). We introduce the notions of weakly and strongly invariant laws (WI and SI) on the parameter space of each single unit, corresponding, respectively, to\u00a0G-invariant distributions, and to distributions supported on parameters fixed by the group action (which encode EA). This allows us to define symmetric models compatible with taking\u00a0N\u2192\u221e\u00a0and give an interpretation of the asymptotic dynamics of DA, FA and EA in terms of Wasserstein Gradient Flows describing their MF limits. When activations respect the group action, we show that, for symmetric data, DA, FA and freely-trained models obey the exact same MF dynamic, which stays in the space of WI laws and minimizes therein the population risk. We also give a counterexample to the general attainability of an optimum over SI laws. Despite this, quite remarkably, we show that the set of SI laws is also preserved by the MF dynamics even when freely trained. This sharply contrasts the finite-N\u00a0setting, in which EAs are generally not preserved by unconstrained SGD. We illustrate the validity of our findings as\u00a0N\u00a0gets larger in a teacher-student experimental setting, training a student NN to learn from a WI, SI or arbitrary teacher model through various SL schemes. We last deduce a data-driven heuristic to discover the largest subspace of parameters supporting SI distributions for a problem, that could be used for designing EA with minimal generalization error.

    ","tags":["dl_theory","equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/Talaria%20-%20Interactively%20Optimizing%20Machine%20Learning%20Models%20for%20Efficient%20Inference/","title":"Talaria Interactively Optimizing Machine Learning Models for Efficient Inference","text":"Properties authors Fred Hohman, Chaoqun Wang, Jinmook Lee, Jochen G\u00f6rtler, Dominik Moritz, Jeffrey P Bigham, Zhile Ren, Cecile Foret, Qi Shan, Ziaoyi Zhang year 2024 url https://arxiv.org/abs/2404.03085

    Abstract

    On-device machine learning (ML) moves computation from the cloud to personal devices, protecting user privacy and enabling intelligent user experiences. However, fitting models on devices with limited resources presents a major technical challenge: practitioners need to optimize models and balance hardware metrics such as model size, latency, and power. To help practitioners create efficient ML models, we designed and developed Talaria: a model visualization and optimization system. Talaria enables practitioners to compile models to hardware, interactively visualize model statistics, and simulate optimizations to test the impact on inference metrics. Since its internal deployment two years ago, we have evaluated Talaria using three methodologies: (1) a log analysis highlighting its growth of 800+ practitioners submitting 3,600+ models; (2) a usability survey with 26 users assessing the utility of 20 Talaria features; and (3) a qualitative interview with the 7 most active users about their experience using Talaria.

    ","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/The%20Empirical%20Impact%20of%20Neural%20Parameter%20Symmetries%2C%20or%20Lack%20Thereof/","title":"The Empirical Impact of Neural Parameter Symmetries, or Lack Thereof","text":"Properties authors Derek Lim, Moe Putterman, Robin Walters, Haggai Maron, Stefanie Jegelka year 2024 url https://arxiv.org/abs/2405.20231

    Abstract

    Many algorithms and observed phenomena in deep learning appear to be affected by parameter symmetries -- transformations of neural network parameters that do not change the underlying neural network function. These include linear mode connectivity, model merging, Bayesian neural network inference, metanetworks, and several other characteristics of optimization or loss-landscapes. However, theoretical analysis of the relationship between parameter space symmetries and these phenomena is difficult. In this work, we empirically investigate the impact of neural parameter symmetries by introducing new neural network architectures that have reduced parameter space symmetries. We develop two methods, with some provable guarantees, of modifying standard neural networks to reduce parameter space symmetries. With these new methods, we conduct a comprehensive experimental study consisting of multiple tasks aimed at assessing the effect of removing parameter symmetries. Our experiments reveal several interesting observations on the empirical impact of parameter symmetries; for instance, we observe linear mode connectivity between our networks without alignment of weight spaces, and we find that our networks allow for faster and more effective Bayesian neural network training.

    ","tags":["equivariance","relaxed_equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/The%20Lie%20derivative%20for%20measuring%20learned%20equivariance/","title":"The Lie derivative for measuring learned equivariance","text":"Properties authors Nate Gruver, Marc Finzi, Micah Goldblum, Andrew Gordon Wilson year 2022 url https://arxiv.org/abs/2210.02984

    Abstract

    The Lie derivative is introduced, a method for measuring equivariance with strong mathematical foundations and minimal hyperparameters that finds that transformers can be more equivariant than convolutional neural networks after training, and that as models get larger and more accurate they tend to display more equivariance, regardless of architecture.

    ","tags":["equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/The%20Lie%20derivative%20for%20measuring%20learned%20equivariance/#notes","title":"Notes","text":"","tags":["equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/The%20Unreasonable%20Ineffectiveness%20of%20the%20Deeper%20Layers/","title":"The Unreasonable Ineffectiveness of the Deeper Layers","text":"Properties authors Andrey Gromov, Kushal Tirumala, Hassan Shapourian, Paolo Glorioso, Daniel A. Roberts year 2024 url https://arxiv.org/abs/2403.17887

    Abstract

    We empirically study a simple layer-pruning strategy for popular families of open-weight pretrained LLMs, finding minimal degradation of performance on different question-answering benchmarks until after a large fraction (up to half) of the layers are removed. To prune these models, we identify the optimal block of layers to prune by considering similarity across layers; then, to \"heal\" the damage, we perform a small amount of finetuning. In particular, we use parameter-efficient finetuning (PEFT) methods, specifically quantization and Low Rank Adapters (QLoRA), such that each of our experiments can be performed on a single A100 GPU. From a practical perspective, these results suggest that layer pruning methods can complement other PEFT strategies to further reduce computational resources of finetuning on the one hand, and can improve the memory and latency of inference on the other hand. From a scientific perspective, the robustness of these LLMs to the deletion of layers implies either that current pretraining methods are not properly leveraging the parameters in the deeper layers of the network or that the shallow layers play a critical role in storing knowledge.
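
    As a rough illustration of the block-selection step (a sketch of my own; the paper measures similarity with an angular distance between layer representations, replaced here by cosine similarity):

    ```python
    import torch
    import torch.nn.functional as F

    def most_prunable_block(hidden_states, n):
        # hidden_states: list of [batch, seq, dim] activations, one entry per layer.
        # Returns the start index l of the n-layer block whose input and output
        # representations are most similar, i.e. the block that changes the stream the least.
        best_l, best_sim = 0, -1.0
        for l in range(len(hidden_states) - n):
            a = hidden_states[l].flatten(1)
            b = hidden_states[l + n].flatten(1)
            sim = F.cosine_similarity(a, b, dim=-1).mean().item()
            if sim > best_sim:
                best_l, best_sim = l, sim
        return best_l  # candidate layers to drop: best_l .. best_l + n - 1
    ```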

    ","tags":["transformers","efficient_dl","pruning","quantization"]},{"location":"100%20Reference%20notes/101%20Literature/TiC-CLIP%20-%20Continual%20Training%20of%20CLIP%20models/","title":"TiC CLIP Continual Training of CLIP models","text":"Properties authors Saurabh Garg, Mehrdad Farajtabar, Hadi Pouransari, Raviteja Vemulapalli, Sachin Mehta, Oncel Tuzel, Vaishaal Shankar, Fartash Faghri year 2024 url https://arxiv.org/abs/2310.16226

    Abstract

    Keeping large foundation models up to date on latest data is inherently expensive. To avoid the prohibitive costs of constantly retraining, it is imperative to continually train these models. This problem is exacerbated by the lack of any large scale continual learning benchmarks or baselines. We introduce the first set of web-scale Time-Continual (TiC) benchmarks for training vision-language models: TiC-DataComp, TiC-YFCC, and TiC-Redcaps. TiC-DataComp, our largest dataset, contains over 12.7B timestamped image-text pairs spanning 9 years (2014-2022). We first use our benchmarks to curate various dynamic evaluations to measure temporal robustness of existing models. We show OpenAI's CLIP (trained on data up to 2020) loses\u00a0\u22488%\u00a0zero-shot accuracy on our curated retrieval task from 2021-2022 compared with more recently trained models in OpenCLIP repository. We then study how to efficiently train models on time-continuous data. We demonstrate that a simple rehearsal-based approach that continues training from the last checkpoint and replays old data reduces compute by\u00a02.5\u00d7\u00a0when compared to the standard practice of retraining from scratch. Code is available at\u00a0this https URL.

    ","tags":["paper","continual_learning","multimodal"]},{"location":"100%20Reference%20notes/101%20Literature/Training%20quantized%20nets%20-%20A%20deeper%20understanding/","title":"Training quantized nets A deeper understanding","text":"Properties authors Hao Li, Soham De, Zheng Xu, Christoph Studer, Hanan Samet, Tom Goldstein year 2017 url https://arxiv.org/abs/1706.02379

    Abstract

    Currently, deep neural networks are deployed on low-power portable devices by first training a full-precision model using powerful hardware, and then deriving a corresponding low-precision model for efficient inference on such systems. However, training models directly with coarsely quantized weights is a key step towards learning on embedded platforms that have limited computing resources, memory capacity, and power consumption. Numerous recent publications have studied methods for training quantized networks, but these studies have mostly been empirical. In this work, we investigate training methods for quantized neural networks from a theoretical viewpoint. We first explore accuracy guarantees for training methods under convexity assumptions. We then look at the behavior of these algorithms for non-convex problems, and show that training algorithms that exploit high-precision representations have an important greedy search phase that purely quantized training methods lack, which explains the difficulty of training using low-precision arithmetic.

    ","tags":["paper","quantization"]},{"location":"100%20Reference%20notes/101%20Literature/Training%20quantized%20nets%20-%20A%20deeper%20understanding/#notes","title":"Notes","text":"
    • Read paper
    ","tags":["paper","quantization"]},{"location":"100%20Reference%20notes/101%20Literature/Understanding%20Deep%20Learning%20-%20Chapter%2010/","title":"Understanding Deep Learning Chapter 10","text":"Properties authors Simon J.D. Prince year 2023 url https://udlbook.github.io/udlbook/","tags":["textbook","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Understanding%20Deep%20Learning%20-%20Chapter%2020/","title":"Understanding Deep Learning Chapter 20","text":"Properties authors Simon J.D. Prince year 2023 url https://udlbook.github.io/udlbook/","tags":["textbook","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Understanding%20Deep%20Learning%20-%20Chapter%2020/#chapter-20-why-does-deep-learning-work","title":"Chapter 20: Why does deep learning work?","text":"

    Contents

    • 20.1 The case against deep learning
    • 20.2 Factors that influence fitting performance
    • 20.3 Properties of loss functions
    • 20.4 Factors that determine generalization
    • 20.5 Do we need so many parameters?
    • 20.6 Do networks have to be deep?
    • 20.7 Summary
    ","tags":["textbook","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Understanding%20symmetries%20in%20deep%20networks/","title":"Understanding symmetries in deep networks","text":"Properties authors Vijay Badrinarayanan, Bamdev Mishra, Roberto Cipolla year 2015 url https://arxiv.org/abs/1511.01029

    Abstract

    Recent works have highlighted scale invariance or symmetry present in the weight space of a typical deep network and the adverse effect it has on the Euclidean gradient based stochastic gradient descent optimization. In this work, we show that a commonly used deep network, which uses convolution, batch normalization, reLU, max-pooling, and sub-sampling pipeline, possess more complex forms of symmetry arising from scaling-based reparameterization of the network weights. We propose to tackle the issue of the weight space symmetry by constraining the filters to lie on the unit-norm manifold. Consequently, training the network boils down to using stochastic gradient descent updates on the unit-norm manifold. Our empirical evidence based on the MNIST dataset shows that the proposed updates improve the test performance beyond what is achieved with batch normalization and without sacrificing the computational efficiency of the weight updates.

    ","tags":["dl_theory","dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Using%20Degeneracy%20in%20the%20Loss%20Landscape%20for%20Mechanistic%20Interpretability/","title":"Using Degeneracy in the Loss Landscape for Mechanistic Interpretability","text":"Properties authors Lucius Bushnaq, Jake Mendel, Stefan Heimersheim, Dan Braun, Nicholas Goldowsky-Dill, Kaarel H\u00e4nni, Cindy Wu, Marius Hobbhahn year 2024 url https://arxiv.org/abs/2405.10927

    Abstract

    Mechanistic Interpretability aims to reverse engineer the algorithms implemented by neural networks by studying their weights and activations. An obstacle to reverse engineering neural networks is that many of the parameters inside a network are not involved in the computation being implemented by the network. These degenerate parameters may obfuscate internal structure. Singular learning theory teaches us that neural network parameterizations are biased towards being more degenerate, and parameterizations with more degeneracy are likely to generalize further. We identify 3 ways that network parameters can be degenerate: linear dependence between activations in a layer; linear dependence between gradients passed back to a layer; ReLUs which fire on the same subset of datapoints. We also present a heuristic argument that modular networks are likely to be more degenerate, and we develop a metric for identifying modules in a network that is based on this argument. We propose that if we can represent a neural network in a way that is invariant to reparameterizations that exploit the degeneracies, then this representation is likely to be more interpretable, and we provide some evidence that such a representation is likely to have sparser interactions. We introduce the Interaction Basis, a tractable technique to obtain a representation that is invariant to degeneracies from linear dependence of activations or Jacobians.

    ","tags":["paper","dl_theory","mechinterp","optimization"]},{"location":"100%20Reference%20notes/101%20Literature/ViDT%20-%20An%20Efficient%20and%20Effective%20Fully%20Transformer-based%20Object%20Detector/","title":"ViDT An Efficient and Effective Fully Transformer based Object Detector","text":"Properties authors Hwanjun Song, Deqing Sun, Sanghyuk Chun, Varun Jampani, Dongyoon Han, Byeongho Heo, Wonjae Kim, Ming-Hsuan Yang year 2021 url https://arxiv.org/abs/2110.03921

    Abstract

    Transformers are transforming the landscape of computer vision, especially for recognition tasks. Detection transformers are the first fully end-to-end learning systems for object detection, while vision transformers are the first fully transformer-based architecture for image classification. In this paper, we integrate Vision and Detection Transformers (ViDT) to build an effective and efficient object detector. ViDT introduces a reconfigured attention module to extend the recent Swin Transformer to be a standalone object detector, followed by a computationally efficient transformer decoder that exploits multi-scale features and auxiliary techniques essential to boost the detection performance without much increase in computational load. Extensive evaluation results on the Microsoft COCO benchmark dataset demonstrate that ViDT obtains the best AP and latency trade-off among existing fully transformer-based object detectors, and achieves 49.2AP owing to its high scalability for large models. We will release the code and trained models at\u00a0this https URL

    ","tags":["paper","object_detection","vit","computer_vision"]},{"location":"100%20Reference%20notes/101%20Literature/Vision%20Mamba%20-%20Efficient%20Visual%20Representation%20Learning%20with%20Bidirectional%20State%20Space%20Model/","title":"Vision Mamba Efficient Visual Representation Learning with Bidirectional State Space Model","text":"Properties authors Lianghui Zhu, Bencheng Liao, Qian Zhang, Xinlong Wang, Wenyu Liu, Xinggang Wang year 2024 url https://arxiv.org/abs/2401.09417

    Abstract

    Recently the state space models (SSMs) with efficient hardware-aware designs, i.e., the Mamba deep learning model, have shown great potential for long sequence modeling. Meanwhile building efficient and generic vision backbones purely upon SSMs is an appealing direction. However, representing visual data is challenging for SSMs due to the position-sensitivity of visual data and the requirement of global context for visual understanding. In this paper, we show that the reliance on self-attention for visual representation learning is not necessary and propose a new generic vision backbone with bidirectional Mamba blocks (Vim), which marks the image sequences with position embeddings and compresses the visual representation with bidirectional state space models. On ImageNet classification, COCO object detection, and ADE20k semantic segmentation tasks, Vim achieves higher performance compared to well-established vision transformers like DeiT, while also demonstrating significantly improved computation & memory efficiency. For example, Vim is 2.8\u00d7\u00a0faster than DeiT and saves 86.8% GPU memory when performing batch inference to extract features on images with a resolution of 1248\u00d71248. The results demonstrate that Vim is capable of overcoming the computation & memory constraints on performing Transformer-style understanding for high-resolution images and it has great potential to be the next-generation backbone for vision foundation models. Code is available at\u00a0this https URL.

    ","tags":["transformers","mamba","ssm","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Vision%20Transformers%20Need%20Registers/","title":"Vision Transformers Need Registers","text":"Properties authors Timoth\u00e9e Darcet, Maxime Oquab, Julien Mairal, Piotr Bojanowski year 2023 url https://arxiv.org/pdf/2309.16588

    Abstract

    Transformers have recently emerged as a powerful tool for learning visual representations. In this paper, we identify and characterize artifacts in feature maps of both supervised and self-supervised ViT networks. The artifacts correspond to high-norm tokens appearing during inference primarily in low-informative background areas of images, that are repurposed for internal computations. We propose a simple yet effective solution based on providing additional tokens to the input sequence of the Vision Transformer to fill that role. We show that this solution fixes that problem entirely for both supervised and self-supervised models, sets a new state of the art for self-supervised visual models on dense visual prediction tasks, enables object discovery methods with larger models, and most importantly leads to smoother feature maps and attention maps for downstream visual processing.
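
    A minimal sketch of the mechanism as I understand it (the wrapper and its names are mine): a few learned register tokens are appended to the token sequence before the encoder and simply discarded afterwards.

    ```python
    import torch
    import torch.nn as nn

    class WithRegisters(nn.Module):
        def __init__(self, encoder, dim, n_registers=4):
            super().__init__()
            self.encoder = encoder  # any ViT-style encoder acting on [batch, tokens, dim]
            self.registers = nn.Parameter(torch.zeros(1, n_registers, dim))
            self.n_registers = n_registers

        def forward(self, tokens):
            reg = self.registers.expand(tokens.shape[0], -1, -1)
            out = self.encoder(torch.cat([tokens, reg], dim=1))
            return out[:, :-self.n_registers]  # drop the register outputs
    ```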

    ","tags":["paper","vit","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Vision%20Transformers%20Need%20Registers/#note","title":"Note","text":"
    • note to myself:
      • Read paper in depth #personal \ud83d\udd3c
    ","tags":["paper","vit","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/What%20Do%20Self-Supervised%20Vision%20Transformers%20Learn%3F/","title":"What Do Self Supervised Vision Transformers Learn?","text":"Properties authors Namuk Park, Wonjae Kim, Byeongho Heo, Taekyung Kim, Sangdoo Yun year 2023 url https://arxiv.org/abs/2305.00729

    Abstract

    We present a comparative study on how and why contrastive learning (CL) and masked image modeling (MIM) differ in their representations and in their performance of downstream tasks. In particular, we demonstrate that self-supervised Vision Transformers (ViTs) have the following properties: (1) CL trains self-attentions to capture longer-range global patterns than MIM, such as the shape of an object, especially in the later layers of the ViT architecture. This CL property helps ViTs linearly separate images in their representation spaces. However, it also makes the self-attentions collapse into homogeneity for all query tokens and heads. Such homogeneity of self-attention reduces the diversity of representations, worsening scalability and dense prediction performance. (2) CL utilizes the low-frequency signals of the representations, but MIM utilizes high-frequencies. Since low- and high-frequency information respectively represent shapes and textures, CL is more shape-oriented and MIM more texture-oriented. (3) CL plays a crucial role in the later layers, while MIM mainly focuses on the early layers. Upon these analyses, we find that CL and MIM can complement each other and observe that even the simplest harmonization can help leverage the advantages of both methods. The code is available at\u00a0this https URL.

    ","tags":["paper","dl_theory","vit","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/What%20Do%20Self-Supervised%20Vision%20Transformers%20Learn%3F/#notes","title":"Notes","text":"

    Another certified banger\u2122 by Naver AI Lab. Also check How do vision transformers work? (the link might not be working because of the question mark in the name, will fix later).

    • Add annotations from Zotero \ud83d\udd3d
    ","tags":["paper","dl_theory","vit","transformers"]},{"location":"100%20Reference%20notes/102%20Authors/Albert%20Gu/","title":"Albert Gu","text":"Properties affiliation Carnegie Mellon University"},{"location":"100%20Reference%20notes/102%20Authors/Alex%20Flinth/","title":"Alex Flinth","text":"Properties affiliation Umea University"},{"location":"100%20Reference%20notes/102%20Authors/Alexander%20Kirillov/","title":"Alexander Kirillov","text":"Properties affiliation OpenAI, FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Alexey%20Dosovitskiy/","title":"Alexey Dosovitskiy","text":"Properties affiliation Google"},{"location":"100%20Reference%20notes/102%20Authors/Ananya%20Kumar/","title":"Ananya Kumar","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Andreas%20Loukas/","title":"Andreas Loukas","text":"Properties affiliation EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Andreas%20Savakis/","title":"Andreas Savakis","text":"Properties affiliation Rochester Institute of Technology"},{"location":"100%20Reference%20notes/102%20Authors/Angela%20Fan/","title":"Angela Fan","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Annie%20S.%20Chen/","title":"Annie S. Chen","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Antonio%20Orvieto/","title":"Antonio Orvieto","text":"Properties affiliation Max Planck Institute for Intelligent Systems"},{"location":"100%20Reference%20notes/102%20Authors/Ardavan%20Pedram/","title":"Ardavan Pedram","text":"Properties affiliation Stanford, Samsung"},{"location":"100%20Reference%20notes/102%20Authors/Armand%20Joulin/","title":"Armand Joulin","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Attila%20Lengyel/","title":"Attila Lengyel","text":"Properties affiliation TU Delft"},{"location":"100%20Reference%20notes/102%20Authors/Boshi%20Wang/","title":"Boshi Wang","text":"Properties affiliation The Ohio State University"},{"location":"100%20Reference%20notes/102%20Authors/Byeongho%20Heo/","title":"Byeongho Heo","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Caglar%20Gulcehre/","title":"Caglar Gulcehre","text":"Properties affiliation CLAIRE, EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Carmen%20Amo%20Alonso/","title":"Carmen Amo Alonso","text":"Properties affiliation ETH Zurich"},{"location":"100%20Reference%20notes/102%20Authors/Cees%20G.%20M.%20Snoek/","title":"Cees G. M. Snoek","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Chelsea%20Finn/","title":"Chelsea Finn","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Chong%20Wang/","title":"Chong Wang","text":"Properties affiliation Apple, Princeton University"},{"location":"100%20Reference%20notes/102%20Authors/Christopher%20Olah/","title":"Christopher Olah","text":"Properties affiliation Anthropic"},{"location":"100%20Reference%20notes/102%20Authors/Daniel%20M.%20Roy/","title":"Daniel M. Roy","text":"Properties affiliation Vector Institute"},{"location":"100%20Reference%20notes/102%20Authors/Daniel%20Ulbricht/","title":"Daniel Ulbricht","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/David%20M.%20Knigge/","title":"David M. 
Knigge","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/David%20W.%20Romero/","title":"David W. Romero","text":"Properties affiliation Vrije Universiteit Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Diane%20Larlus/","title":"Diane Larlus","text":"Properties affiliation Naver Labs Europe"},{"location":"100%20Reference%20notes/102%20Authors/Donghyun%20Kim/","title":"Donghyun Kim","text":"Properties affiliation Naver Cloud AI"},{"location":"100%20Reference%20notes/102%20Authors/Dongyoon%20Han/","title":"Dongyoon Han","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Duy-Kien%20Nguyen/","title":"Duy Kien Nguyen","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Edward%20J.%20Hu/","title":"Edward J. Hu","text":"Properties affiliation Microsoft"},{"location":"100%20Reference%20notes/102%20Authors/Edward%20Z.%20Yang/","title":"Edward Z. Yang","text":"Properties affiliation FAIR, Stanford, MIT, PyTorch

    Notes: - He has a pretty cool YouTube channel where he shares (bi-weekly) PyTorch meetings - For me, it's a nice source for getting more involved with PyTorch compiler-ish libraries/tools like [[ExecuTorch|ExecuTorch]] and [[torch.export|torch.export]] - It is also interesting to see how the engineers interact

    "},{"location":"100%20Reference%20notes/102%20Authors/Eric%20Mintun/","title":"Eric Mintun","text":"Properties affiliation FAIR, UC Santa Barbara"},{"location":"100%20Reference%20notes/102%20Authors/Erik%20J.%20Bekkers/","title":"Erik J. Bekkers","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Eshan%20Verma/","title":"Eshan Verma","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Fahim%20Tajwar/","title":"Fahim Tajwar","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Fartash%20Faghri/","title":"Fartash Faghri","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Francisco%20Massa/","title":"Francisco Massa","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Fred%20Hohman/","title":"Fred Hohman","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Furu%20Wei/","title":"Furu Wei","text":"Properties affiliation Microsoft"},{"location":"100%20Reference%20notes/102%20Authors/Gabriel%20Synnaeve/","title":"Gabriel Synnaeve","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Gintare%20Karolina%20Dziugaite/","title":"Gintare Karolina Dziugaite","text":"Properties affiliation Google DeepMind"},{"location":"100%20Reference%20notes/102%20Authors/Hadi%20Pouransari/","title":"Hadi Pouransari","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Han%20Cai/","title":"Han Cai","text":"Properties affiliation MIT, Shanghai Jiao Tong University"},{"location":"100%20Reference%20notes/102%20Authors/Hanzi%20Mao/","title":"Hanzi Mao","text":"Properties affiliation FAIR, NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Haoxiang%20Wang/","title":"Haoxiang Wang","text":"Properties affiliation Apple, University of Illinois at Urbana-Champaign"},{"location":"100%20Reference%20notes/102%20Authors/Herv%C3%A9%20Jegou/","title":"Herv\u00e9 Jegou","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Huaxiu%20Yao/","title":"Huaxiu Yao","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Hugo%20Touvron/","title":"Hugo Touvron","text":"Properties affiliation FAIR, Sorbonne University"},{"location":"100%20Reference%20notes/102%20Authors/Huizi%20Mao/","title":"Huizi Mao","text":"Properties affiliation NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Isha%20Garg/","title":"Isha Garg","text":"Properties affiliation Purdue University, Apple"},{"location":"100%20Reference%20notes/102%20Authors/Ishan%20Misra/","title":"Ishan Misra","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Jan%20E.%20Gerken/","title":"Jan E. 
Gerken","text":"Properties affiliation Chalmers University of Technology"},{"location":"100%20Reference%20notes/102%20Authors/Javier%20Maass%20Martinez/","title":"Javier Maass Martinez","text":"Properties affiliation University of Chile"},{"location":"100%20Reference%20notes/102%20Authors/Jean-Baptiste%20Cordonnier/","title":"Jean Baptiste Cordonnier","text":"Properties affiliation EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Jeff%20Pool/","title":"Jeff Pool","text":"Properties affiliation NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Jesse%20Cai/","title":"Jesse Cai","text":"Properties affiliation Meta, UCLA, PyTorch"},{"location":"100%20Reference%20notes/102%20Authors/Jing%20Pu/","title":"Jing Pu","text":"Properties affiliation Google, Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Joaquin%20Fontbona/","title":"Joaquin Fontbona","text":"Properties affiliation University of Chile"},{"location":"100%20Reference%20notes/102%20Authors/John%20Denker/","title":"John Denker","text":"Properties affiliation Nokia Bell Labs"},{"location":"100%20Reference%20notes/102%20Authors/John%20Tran/","title":"John Tran","text":"Properties affiliation NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Julien%20Mairal/","title":"Julien Mairal","text":"Properties affiliation INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Juliette%20Marrie/","title":"Juliette Marrie","text":"Properties affiliation Naver Labs Europe, INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Kaiming%20He/","title":"Kaiming He","text":"Properties affiliation FAIR, MIT"},{"location":"100%20Reference%20notes/102%20Authors/Kamyar%20Azizzadenesheli/","title":"Kamyar Azizzadenesheli","text":"Properties affiliation NVIDIA, Purdue University"},{"location":"100%20Reference%20notes/102%20Authors/Kaushik%20Roy/","title":"Kaushik Roy","text":"Properties affiliation Purdue University"},{"location":"100%20Reference%20notes/102%20Authors/Lawrence%20Chan/","title":"Lawrence Chan","text":"Properties affiliation UC Berkeley"},{"location":"100%20Reference%20notes/102%20Authors/Lucius%20Bushnaq/","title":"Lucius Bushnaq","text":"Properties affiliation Apollo Research"},{"location":"100%20Reference%20notes/102%20Authors/Maciej%20Wo%C5%82czyk/","title":"Maciej Wo\u0142czyk","text":"Properties affiliation IDEAS NCBR"},{"location":"100%20Reference%20notes/102%20Authors/Mahmoud%20Assran/","title":"Mahmoud Assran","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Marc%20Finzi/","title":"Marc Finzi","text":"Properties affiliation New York University"},{"location":"100%20Reference%20notes/102%20Authors/Mark%20A.%20Horowitz/","title":"Mark A. Horowitz","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Martin%20Jaggi/","title":"Martin Jaggi","text":"Properties affiliation EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Martin%20R.%20Oswald/","title":"Martin R. 
Oswald","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Mathilde%20Caron/","title":"Mathilde Caron","text":"Properties affiliation FAIR, INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Maxime%20Oquab/","title":"Maxime Oquab","text":"Properties affiliation FAIR, INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Mehrdad%20Farajtabar/","title":"Mehrdad Farajtabar","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Michael%20Arbel/","title":"Michael Arbel","text":"Properties affiliation INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Mohammad%20Rastegari/","title":"Mohammad Rastegari","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Namuk%20Park/","title":"Namuk Park","text":"Properties affiliation Naver AI Lab, Prescient Design, Genentech"},{"location":"100%20Reference%20notes/102%20Authors/Navin%20Ranjan/","title":"Navin Ranjan","text":"Properties affiliation Rochester Institute of Technology"},{"location":"100%20Reference%20notes/102%20Authors/Neel%20Nanda/","title":"Neel Nanda","text":"Properties affiliation Google DeepMind, Anthropic"},{"location":"100%20Reference%20notes/102%20Authors/Nicolas%20Carion/","title":"Nicolas Carion","text":"Properties affiliation New York University"},{"location":"100%20Reference%20notes/102%20Authors/Nicolas%20Usunier/","title":"Nicolas Usunier","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Oncel%20Tuzel/","title":"Oncel Tuzel","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Patrick%20Forr%C3%A9/","title":"Patrick Forr\u00e9","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Pavan%20Kumar%20Anasosalu%20Vasu/","title":"Pavan Kumar Anasosalu Vasu","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Percy%20Liang/","title":"Percy Liang","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Piotr%20Bojanowski/","title":"Piotr Bojanowski","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Raviteja%20Vemulapalli/","title":"Raviteja Vemulapalli","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Razvan%20Pascanu/","title":"Razvan Pascanu","text":"Properties affiliation Google DeepMind"},{"location":"100%20Reference%20notes/102%20Authors/Robin%20Walters/","title":"Robin Walters","text":"Properties affiliation Northeastern University"},{"location":"100%20Reference%20notes/102%20Authors/Rose%20Yu/","title":"Rose Yu","text":"Properties affiliation UC San Diego"},{"location":"100%20Reference%20notes/102%20Authors/Ross%20Girshick/","title":"Ross Girshick","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Rui%20Wang/","title":"Rui Wang","text":"Properties affiliation MIT, UC San Diego"},{"location":"100%20Reference%20notes/102%20Authors/Ruoming%20Pang/","title":"Ruoming Pang","text":"Properties affiliation Apple, Princeton University"},{"location":"100%20Reference%20notes/102%20Authors/Sachin%20Mehta/","title":"Sachin Mehta","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Sangdoo%20Yun/","title":"Sangdoo Yun","text":"Properties affiliation Naver AI 
Lab"},{"location":"100%20Reference%20notes/102%20Authors/Sanghyuk%20Chun/","title":"Sanghyuk Chun","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Sara%20Solla/","title":"Sara Solla","text":"Properties affiliation Northwestern University"},{"location":"100%20Reference%20notes/102%20Authors/Sergey%20Zagoruyko/","title":"Sergey Zagoruyko","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Shaohan%20Huang/","title":"Shaohan Huang","text":"Properties affiliation Microsoft"},{"location":"100%20Reference%20notes/102%20Authors/Simon%20J.D.%20Prince/","title":"Simon J.D. Prince","text":"Properties affiliation University of Bath"},{"location":"100%20Reference%20notes/102%20Authors/Skander%20Moalla/","title":"Skander Moalla","text":"Properties affiliation CLAIRE, EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Soham%20De/","title":"Soham De","text":"Properties affiliation Google DeepMind, University of Maryland"},{"location":"100%20Reference%20notes/102%20Authors/Song%20Han/","title":"Song Han","text":"Properties affiliation MIT"},{"location":"100%20Reference%20notes/102%20Authors/Songkuk%20Kim/","title":"Songkuk Kim","text":"Properties affiliation Yonsei University"},{"location":"100%20Reference%20notes/102%20Authors/Sourya%20Basu/","title":"Sourya Basu","text":"Properties affiliation University of Illinois at Urbana-Champaign, IBM Research"},{"location":"100%20Reference%20notes/102%20Authors/St%C3%A9phane%20d%27Ascoli/","title":"St\u00e9phane d'Ascoli","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Sukjun%20Hwang/","title":"Sukjun Hwang","text":"Properties affiliation Carnegie Mellon University"},{"location":"100%20Reference%20notes/102%20Authors/Taekyung%20Kim/","title":"Taekyung Kim","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Tete%20Xiao/","title":"Tete Xiao","text":"Properties affiliation FAIR

    Associations: FAIR, UC Berkeley

    "},{"location":"100%20Reference%20notes/102%20Authors/Tom%20Gunter/","title":"Tom Gunter","text":"Properties affiliation Apple, University of Oxford"},{"location":"100%20Reference%20notes/102%20Authors/Tom%20Lieberum/","title":"Tom Lieberum","text":"Properties affiliation University of Amsterdam, Google DeepMind"},{"location":"100%20Reference%20notes/102%20Authors/Vaibhav%20Aggarwal/","title":"Vaibhav Aggarwal","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/William%20J.%20Dally/","title":"William J. Dally","text":"Properties affiliation Stanford, NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Wonjae%20Kim/","title":"Wonjae Kim","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Xiang%20Yue/","title":"Xiang Yue","text":"Properties affiliation Carnegie Mellon University"},{"location":"100%20Reference%20notes/102%20Authors/Xingyu%20Liu/","title":"Xingyu Liu","text":"Properties affiliation Carnegie Mellon University"},{"location":"100%20Reference%20notes/102%20Authors/Xinlei%20Chen/","title":"Xinlei Chen","text":"Properties affiliation FAIR, Zhejiang University, Carnegie Mellon University, Zhejiang University"},{"location":"100%20Reference%20notes/102%20Authors/Xiuying%20Wei/","title":"Xiuying Wei","text":"Properties affiliation EPFL, CLAIRE"},{"location":"100%20Reference%20notes/102%20Authors/Xu%20Ma/","title":"Xu Ma","text":"Properties affiliation Northeastern University"},{"location":"100%20Reference%20notes/102%20Authors/Xun%20Wu/","title":"Xun Wu","text":"Properties affiliation Microsoft, Tsinghua University"},{"location":"100%20Reference%20notes/102%20Authors/Yanghao%20Li/","title":"Yanghao Li","text":"Properties affiliation FAIR, Apple"},{"location":"100%20Reference%20notes/102%20Authors/Yann%20LeCun/","title":"Yann LeCun","text":"Properties affiliation FAIR, New York University"},{"location":"100%20Reference%20notes/102%20Authors/Yelong%20Shen/","title":"Yelong Shen","text":"Properties affiliation Microsoft"},{"location":"100%20Reference%20notes/102%20Authors/Yoonho%20Lee/","title":"Yoonho Lee","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Zeyuan%20Allen-Zhu/","title":"Zeyuan Allen Zhu","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Zhuoyang%20Zhang/","title":"Zhuoyang Zhang","text":"Properties affiliation NVIDIA, Tsinghua University"},{"location":"100%20Reference%20notes/102%20Authors/Ziaoyi%20Zhang/","title":"Ziaoyi Zhang","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Zirui%20Wang/","title":"Zirui Wang","text":"Properties affiliation Apple, Google, Carnegie Mellon University"},{"location":"100%20Reference%20notes/103%20Affiliations/CLAIRE/","title":"CLAIRE","text":""},{"location":"100%20Reference%20notes/103%20Affiliations/CLAIRE/#three-essential-pilars-of-the-lab","title":"Three essential pilars of the lab","text":"

    Efficient deep learning algorithms

    • Efficient RL
    • Sample efficient learning algorithms
    • Model Recycling
    • Efficient sequence models

    Robust, safe and responsible algorithms

    • RLHF/Alignment
    • Uncertainty aware/Bayesian algorithms
    • Offline RL
    • Active learning/Human in the loop algorithms
    • Better evaluations

    Improving reasoning: Moving from system 1 to system 2 level thinking

    • Improving reasoning
    • Creativity
    • Deliberation
    • Causality
    • Imagination
    • Planning
    "},{"location":"100%20Reference%20notes/103%20Affiliations/CLAIRE/#notes","title":"Notes","text":"
    • omg, this is amazing
    • Note to self: Look at CLAIRE's research \u23eb
    "},{"location":"100%20Reference%20notes/103%20Affiliations/FAIR/","title":"FAIR","text":"

    Related: FAIR

    "},{"location":"100%20Reference%20notes/103%20Affiliations/Naver%20Labs%20Europe/","title":"Naver Labs Europe","text":"

    Related to Naver AI Lab

    "},{"location":"100%20Reference%20notes/104%20Other/EPFL-CS439%20-%20Optimization%20for%20Machine%20Learning/","title":"EPFL CS439 Optimization for Machine Learning","text":"Properties authors Martin Jaggi, Nicolas Flammarion year 2024 url https://github.com/epfml/OptML_course/tree/master

    Abstract

    This course teaches an overview of modern mathematical optimization methods, for applications in machine learning and data science. In particular, scalability of algorithms to large datasets will be discussed in theory and in implementation.

    Topics

    Convexity, Gradient Methods, Proximal algorithms, Subgradient Methods, Stochastic and Online Variants of mentioned methods, Coordinate Descent, Frank-Wolfe, Accelerated Methods, Primal-Dual context and certificates, Lagrange and Fenchel Duality, Second-Order Methods including Quasi-Newton Methods, Derivative-Free Optimization.

    ","tags":["course","optimization"]},{"location":"100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/","title":"Introducing Apple\u2019s On Device and Server Foundation Models","text":"Properties year 2024 url https://machinelearning.apple.com/research/introducing-apple-foundation-models","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/#pre-training","title":"## Pre-Training","text":"

    Our foundation models are trained on\u00a0Apple's AXLearn framework, an open-source project we released in 2023. It builds on top of JAX and XLA, and allows us to train the models with high efficiency and scalability on various training hardware and cloud platforms, including TPUs and both cloud and on-premise GPUs. We used a combination of data parallelism, tensor parallelism, sequence parallelism, and Fully Sharded Data Parallel (FSDP) to scale training along multiple dimensions such as data, model, and sequence length.

    ","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/#optimization","title":"Optimization","text":"

    In addition to ensuring our generative models are highly capable, we have used a range of innovative techniques to optimize them on-device and on our private cloud for speed and efficiency. We have applied an extensive set of optimizations for both first token and extended token inference performance.

    Both the on-device and server models use grouped-query-attention. We use shared input and output vocab embedding tables to reduce memory requirements and inference cost. These shared embedding tensors are mapped without duplications. The on-device model uses a vocab size of 49K, while the server model uses a vocab size of 100K, which includes additional language and technical tokens.

    For on-device inference, we use low-Bit Palettization, a critical optimization technique that achieves the necessary memory, power, and performance requirements. To maintain model quality, we developed a new framework using LoRA adapters that incorporates a mixed 2-bit and 4-bit configuration strategy \u2014 averaging 3.5 bits-per-weight \u2014 to achieve the same accuracy as the uncompressed models.

    Additionally, we use an interactive model latency and power analysis tool,\u00a0Talaria, to better guide the bit rate selection for each operation. We also utilize activation quantization and embedding quantization, and have developed an approach to enable efficient Key-Value (KV) cache update on our neural engines.

References: Talaria - Interactively Optimizing Machine Learning Models for Efficient Inference
Notes:
• Might be useful to look at KV Cache hardware-dependency

    With this set of optimizations, on iPhone 15 Pro we are able to reach time-to-first-token latency of about 0.6 millisecond per prompt token, and a generation rate of 30 tokens per second. Notably, this performance is attained before employing token speculation techniques, from which we see further enhancement on the token generation rate.

    ","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/#model-adaptation","title":"Model Adaptation","text":"

    Our foundation models are fine-tuned for users\u2019 everyday activities, and can dynamically specialize themselves on-the-fly for the task at hand. We utilize adapters, small neural network modules that can be plugged into various layers of the pre-trained model, to fine-tune our models for specific tasks. For our models we adapt the attention matrices, the attention projection matrix, and the fully connected layers in the point-wise feedforward networks for a suitable set of the decoding layers of the transformer architecture.

Notes:
• How do you adapt the attention matrices? Is it like a bias? `A[i][j] += lora[i][j]`
• The attention projection matrix, I suppose, refers to the projection matrices \\(W_Q, W_K, W_V\\)

    By fine-tuning only the adapter layers, the original parameters of the base pre-trained model remain unchanged, preserving the general knowledge of the model while tailoring the adapter layers to support specific tasks.

    We represent the values of the adapter parameters using 16 bits, and for the ~3 billion parameter on-device model, the parameters for a rank 16 adapter typically require 10s of megabytes. The adapter models can be dynamically loaded, temporarily cached in memory, and swapped \u2014 giving our foundation model the ability to specialize itself on the fly for the task at hand while efficiently managing memory and guaranteeing the operating system's responsiveness.

    To facilitate the training of the adapters, we created an efficient infrastructure that allows us to rapidly retrain, test, and deploy adapters when either the base model or the training data gets updated. The adapter parameters are initialized using\u00a0the accuracy-recovery adapter introduced in the Optimization section.

    ","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/104%20Other/Introduction%20to%20Quantization%20on%20PyTorch/","title":"Introduction to Quantization on PyTorch","text":"Properties authors Raghuraman Krishnamoorthi, James Reed, Min Ni, Chris Gottbrath, Seth Weidman year 2020 url https://pytorch.org/blog/introduction-to-quantization-on-pytorch/","tags":["website","efficient_dl","quantization"]},{"location":"100%20Reference%20notes/104%20Other/Introduction%20to%20Quantization%20on%20PyTorch/#notes","title":"Notes","text":"

    Quantization aware training is typically only used in CNN models when post training static or dynamic quantization doesn\u2019t yield sufficient accuracy. This can occur with models that are highly optimized to achieve small size (such as Mobilenet).

Currently, operator coverage is limited and may restrict the available choices; the table below provides a guideline.

Model Type | Preferred scheme | Why
LSTM/RNN | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights
BERT/Transformer | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights
CNN | Static Quantization | Throughput limited by memory bandwidth for activations
CNN | Quantization Aware Training | In the case where accuracy can't be achieved with static quantization

Does the Transformer row also apply to vision transformers, given that the number of tokens is quite large?

Model | Float Latency (ms) | Quantized Latency (ms) | Inference Performance Gain | Device | Notes
BERT | 581 | 313 | 1.8x | Xeon-D2191 (1.6GHz) | Batch size = 1, maximum sequence length = 128, single thread, x86-64, dynamic quantization
Resnet-50 | 214 | 103 | 2x | Xeon-D2191 (1.6GHz) | Single thread, x86-64, static quantization
Mobilenet-v2 | 97 | 17 | 5.7x | Samsung S9 | Static quantization, floating point numbers are based on Caffe2 run-time and are not optimized

    So I should expect something around ~2x latency improvement with dynamic quantization

    ","tags":["website","efficient_dl","quantization"]},{"location":"100%20Reference%20notes/104%20Other/Let%27s%20talk%20about%20the%20Python%20Dispatcher/","title":"Let's talk about the Python Dispatcher","text":"Properties authors Edward Z. Yang year 2020 url http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/","tags":["blog"]},{"location":"100%20Reference%20notes/104%20Other/MIT-65940%20-%20TinyML%20and%20Efficient%20Deep%20Learning%20Computing/","title":"MIT 65940 TinyML and Efficient Deep Learning Computing","text":"Properties authors Song Han year 2023 url https://hanlab.mit.edu/courses/2023-fall-65940","tags":["course"]},{"location":"100%20Reference%20notes/104%20Other/Optimizing%20Vision%20Transformer%20Model%20for%20Deployment/","title":"Optimizing Vision Transformer Model for Deployment","text":"Properties authors Jeff Tang, Geeta Chauhan year 2021 url https://pytorch.org/tutorials/beginner/vt_tutorial.html","tags":["website"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20Export%20IR%20Specification/","title":"PyTorch ExecuTorch Export IR Specification","text":"Properties authors PyTorch - Functionalization in PyTorch - Everything you need to know year 2024 url https://pytorch.org/executorch/main/ir-exir.html

    The Exported IR is a specification that consists of the following parts:

    1. A definition of computation graph model.
    2. Set of operators allowed in the graph.

A dialect also provides further constraints meant for a specific purpose or stage in some compilation phase. Some dialects are:
• aten dialect
• edge dialect
• backend dialect

ExecuTorch compilation first exports to the aten dialect, then lowers to the edge dialect, and finally to a backend dialect.
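A minimal sketch of that flow, assuming the executorch Python package is installed; exact module paths and backend APIs have shifted across releases, so treat the imports as indicative rather than authoritative:

```python
import torch
from executorch.exir import to_edge  # assumption: executorch is installed

class Net(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x) * 2

example_inputs = (torch.randn(4),)

# 1. Export to the aten dialect (a functionalized ATen graph)
aten_program = torch.export.export(Net(), example_inputs)

# 2. Lower to the edge dialect
edge_program = to_edge(aten_program)

# 3. Produce the final program for the ExecuTorch runtime (backend lowering would happen here)
et_program = edge_program.to_executorch()
with open('net.pte', 'wb') as f:
    f.write(et_program.buffer)
```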

    ","tags":["paper"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20Export%20IR%20Specification/#aten-dialect","title":"Aten Dialect","text":"
    • PyTorch Functionalization is performed, removing any tensor aliases and mutations, and allowing for more flexible graph transformations to be made.
    ","tags":["paper"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/","title":"PyTorch ExecuTorch How ExecuTorch works?","text":"Properties authors PyTorch Quantization for TensorRT year 2024 url https://pytorch.org/executorch/main/intro-how-it-works","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#what-are-the-steps-to-run-a-model-with-executorch","title":"What are the steps to run a model with ExecuTorch?","text":"","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#1-export-the-model","title":"1. Export the model","text":"
    • Capture the pytorch program as a graph
    ","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#2-compile-the-exported-model-to-an-executorch-program","title":"2. Compile the exported model to an ExecuTorch program","text":"

    Captured Graph -> ExecuTorch program

Possible optimizations:
• Compressing the model (e.g., quantization)
• Lowering subgraphs to on-device specialized hardware accelerators to improve latency
• Memory planning, i.e. efficiently planning the location of intermediate tensors to reduce the runtime memory footprint

    ","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#3-run-the-executorch-program-to-a-target-device","title":"3. Run the ExecuTorch program to a target device","text":"
    • Light runtime with memory planning for fast inference :)
    ","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#key-benefits","title":"Key Benefits","text":"
    • Export that is robust and powerful
    • Operator Standardization
    • Standardization for compiler interfaces (aka delegates) and the OSS ecosystem
    • First-party SDK and toolchain
    • Ease of customization
    • Low overhead runtime and execution
    ","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20Quantization%20Overview/","title":"PyTorch ExecuTorch Quantization Overview","text":"Properties authors PyTorch Quantization for TensorRT year 2024 url https://pytorch.org/executorch/main/quantization-overview.html


    Quantization is usually tied to execution backends that have quantized operators implemented. Thus each backend is opinionated about how the model should be quantized, expressed in a backend specific\u00a0Quantizer\u00a0class.

    ","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20Functionalization%20in%20PyTorch%20-%20Everything%20you%20need%20to%20know/","title":"PyTorch Functionalization in PyTorch Everything you need to know","text":"Properties authors Brian Hirsh year 2023 url https://dev-discuss.pytorch.org/t/functionalization-in-pytorch-everything-you-wanted-to-know/965

Given a program/function of PyTorch operators, functionalization will return a new function that:
1. Has the same semantics as the old function
2. Has no mutations in it

    Exposed in functorch API.

    Functionalization operates at the level of our ATen API.

    Why? - Compilers don't like mutations: Graph partitioning is harder if nodes have side effects, etc.

    Notes: - PyTorch Functionalization
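A minimal sketch of what this looks like in practice, assuming a recent PyTorch where the functorch API lives under torch.func:

```python
import torch
from torch.func import functionalize
from torch.fx.experimental.proxy_tensor import make_fx

def f(x):
    y = x.clone()
    y.add_(1)   # in-place mutation on an intermediate tensor
    return y

x = torch.zeros(3)
g = functionalize(f)            # same semantics, but no mutations inside
assert torch.equal(f(x), g(x))

# Tracing the functionalized version yields a graph of out-of-place ATen ops
# (aten.add instead of aten.add_), which is friendlier to graph transformations.
print(make_fx(functionalize(f))(x).code)
```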

    ","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20PyTorch%202%20Export%20Post%20Training%20Quantization/","title":"PyTorch PyTorch 2 Export Post Training Quantization","text":"Properties authors Jerry Zhang year 2024 url https://pytorch.org/tutorials/prototype/pt2e_quant_ptq.html

    Uses prepare_pt2e and convert_pt2e.

    float_model(Python)                          Example Input\n    \\                                              /\n     \\                                            /\n\u2014-------------------------------------------------------\n|                        export                        |\n\u2014-------------------------------------------------------\n                            |\n                    FX Graph in ATen     Backend Specific Quantizer\n                            |                       /\n\u2014--------------------------------------------------------\n|                     prepare_pt2e                      |\n\u2014--------------------------------------------------------\n                            |\n                     Calibrate/Train\n                            |\n\u2014--------------------------------------------------------\n|                    convert_pt2e                       |\n\u2014--------------------------------------------------------\n                            |\n                    Quantized Model\n                            |\n\u2014--------------------------------------------------------\n|                       Lowering                        |\n\u2014--------------------------------------------------------\n                            |\n        Executorch, Inductor or <Other Backends>\n
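A minimal sketch of the flow in the diagram above, using the XNNPACK quantizer as the backend-specific quantizer; the exact export entry point has changed across PyTorch versions, so the torch.export call here is an assumption:

```python
import torch
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 3, 32, 32),)

# export: capture an FX graph in ATen IR
exported = torch.export.export(model, example_inputs).module()

# prepare_pt2e: insert observers according to the backend-specific quantizer
quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
prepared = prepare_pt2e(exported, quantizer)

# calibrate with representative data
for _ in range(8):
    prepared(torch.randn(1, 3, 32, 32))

# convert_pt2e: produce the quantized model, ready for lowering (Inductor, ExecuTorch, ...)
quantized = convert_pt2e(prepared)
```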
    ","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20Quantization/","title":"PyTorch Quantization","text":"Properties authors PyTorch Quantization for TensorRT year 2024 url https://pytorch.org/docs/main/quantization.html#prototype-pytorch-2-export-quantization","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20Quantization/#backendhardware-support","title":"Backend/Hardware Support","text":"Hardware Kernel Library Eager Mode Quantization FX Graph Mode Quantization Quantization Mode Support server CPU fbgemm/onednn Supported All Supported mobile CPU qnnpack/xnnpack server GPU TensorRT (early prototype) Not support this it requires a graph Supported Static Quantization

    Today, PyTorch supports the following backends for running quantized operators efficiently:

    • x86 CPUs with AVX2 support or higher (without AVX2 some operations have inefficient implementations), via\u00a0x86\u00a0optimized by\u00a0fbgemm\u00a0and\u00a0onednn\u00a0(see the details at\u00a0RFC)
    • ARM CPUs (typically found in mobile/embedded devices), via\u00a0qnnpack
    • (early prototype) support for NVidia GPU via\u00a0TensorRT\u00a0through\u00a0fx2trt\u00a0(to be open sourced)

    Note: - This is a bit old, as fx2trt is already available in torch-tensorrt. However, there

    ","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Compilers%20-%20What%20makes%20PyTorch%20beloved%20makes%20it%20hard%20to%20compile/","title":"PyTorch Compilers What makes PyTorch beloved makes it hard to compile","text":"Properties authors Peng Wu year 2022 url https://chips-compilers-mlsys-22.github.io/assets/slides/PyTorch%20Compilers%20(Compiler%20&%20Chips%20Symposium%202022).pdf

Multiple PyTorch compilers:
• TorchScript (torch.jit.script, torch.jit.trace): supports a Python subset; full graph capture = Ahead-of-Time (AOT) Compilation; executed by the TS interpreter (nnc, nvfuser)
• torch.fx
• torch.package, torch.deploy
• torch-mlir
• TorchDynamo, TorchInductor: TorchDynamo captures partial graphs (if strict=False) and falls back to eager.

What makes TorchDynamo graph capture sound and out-of-the-box?
• Partial graph capture: ability to skip unwanted parts of eager
• Guarded graphs: ability to check if a captured graph is valid for execution
  • Note: basically, it inserts assertions/runtime checks to verify at runtime that the partial graph is sound; if not, it JIT-recompiles.
• Just-in-time recapture: recapture a graph if the captured graph is invalid for execution

Dynamo workflow:
• Captures an FX Graph
• Sends the FX Graph to a compiler hook to compile (which can be another compiler like TRT or TorchScript)
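The compiler hook is just a callable that receives the captured FX graph; a minimal sketch with a custom backend:

```python
import torch

def my_backend(gm: torch.fx.GraphModule, example_inputs):
    # Dynamo hands us the captured FX graph plus example inputs;
    # a real backend would lower/compile it, here we just inspect it.
    gm.graph.print_tabular()
    return gm.forward   # return a callable that runs the (sub)graph

@torch.compile(backend=my_backend)
def f(x):
    return torch.sin(x) + torch.cos(x)

f(torch.randn(8))
```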


    Note: tbh this seems like an arbitrary separation, because torchdynamo also is meant for inference (torch.export), but this is probably because this tutorial is 2 years old

    ","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024%20-%20Fast%20Sparse%20Vision%20Transformers%20with%20minimal%20accuracy%20loss/","title":"PyTorch Conference 2024 Fast Sparse Vision Transformers with minimal accuracy loss","text":"Properties authors Jesse Cai year 2024 url https://static.sched.com/hosted_files/pytorch2024/c6/Sparsifying%20ViT%20lightning%20talk%20slides.pdf?_gl=119zah9b_gcl_auMTk3MjgxODE5OC4xNzI3MjU4NDM2FPAU*MTk3MjgxODE5OC4xNzI3MjU4NDM2

    Nice, it is on torchao

Notes: - Don't quite understand what Core or AO mean in this context, but at least torch.compile is acknowledged :p

    ","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024%20-%20What%E2%80%99s%20new%20in%20torch.export%3F/","title":"PyTorch Conference 2024 What\u2019s new in torch.export?","text":"Properties authors Avik Chaudhuri year 2024 url https://static.sched.com/hosted_files/pytorch2024/6b/What%E2%80%99s%20new%20in%20torch.export_.pptx.pdf?_gl=11s5cwnu_gcl_au*MTk3MjgxODE5OC4xNzI3MjU4NDM2","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024%20-%20What%E2%80%99s%20new%20in%20torch.export%3F/#recap-what-is-torchexport-and-why","title":"[Recap] What is torch.export and why?","text":"
    • \"Sound\", whole-graph capture of pytorch models
    • Emits \"IR\": backend-agnostic
    • For easier backend-specific lowering (trt, etc)
    • For python-free environments
    ","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024%20-%20What%E2%80%99s%20new%20in%20torch.export%3F/#composable-apis","title":"Composable APIs","text":"
    • Useful: torch.export.export_for_inference
    ","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024/","title":"PyTorch Conference 2024","text":"Properties year 2024

Some interesting talks for #efficient_dl :
• PyTorch Conference 2024 - What\u2019s new in torch.export?
• PyTorch Conference 2024 - Fast Sparse Vision Transformers with minimal accuracy loss

    ","tags":["conference"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Eager%20Mode%20Quantization%20TensorRT%20Acceleration/","title":"PyTorch Eager Mode Quantization TensorRT Acceleration","text":"Properties authors Lei Mao year 2024 url https://leimao.github.io/blog/PyTorch-Eager-Mode-Quantization-TensorRT-Acceleration/

    Abstract

    The TensorRT acceleration for the quantized PyTorch model from the PyTorch eager mode quantization interface involves three steps:

    1. Perform PyTorch eager mode quantization on the floating-point PyTorch model in PyTorch and export the quantized PyTorch model to ONNX.
    2. Fix the quantized ONNX model graph so that it can be parsed by the TensorRT parser.
3. Build the quantized ONNX model into a TensorRT engine, profile the performance, and verify the accuracy (a sketch of step 1 follows below).

    The source code for this post can be found on GitHub .
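A minimal sketch of step 1 (eager mode post-training static quantization followed by ONNX export); module and qconfig names follow the torch.ao.quantization eager API, and the ONNX opset needed for the resulting QDQ graph is an assumption:

```python
import torch
import torch.ao.quantization as tq

class SmallNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = tq.QuantStub()
        self.conv = torch.nn.Conv2d(3, 8, 3)
        self.relu = torch.nn.ReLU()
        self.dequant = tq.DeQuantStub()

    def forward(self, x):
        return self.dequant(self.relu(self.conv(self.quant(x))))

model = SmallNet().eval()
model.qconfig = tq.get_default_qconfig('fbgemm')
prepared = tq.prepare(model)

for _ in range(8):                      # calibration with representative inputs
    prepared(torch.randn(1, 3, 32, 32))

quantized = tq.convert(prepared)

# Step 1 output: an ONNX graph that step 2 may still need to fix up for the TensorRT parser
torch.onnx.export(quantized, torch.randn(1, 3, 32, 32), 'quantized.onnx', opset_version=13)
```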

    ","tags":["website","paper"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20internals/","title":"PyTorch internals","text":"Properties authors Edward Z. Yang year 2019 url http://blog.ezyang.com/2019/05/pytorch-internals/

Depending on tensor metadata (whether it's CUDA, sparse, etc.), an operator call is dispatched to different implementations.

    ","tags":["blog"]},{"location":"100%20Reference%20notes/104%20Other/Quantized%20Transfer%20Learning%20for%20Computer%20Vision%20Tutorial/","title":"Quantized Transfer Learning for Computer Vision Tutorial","text":"Properties authors Zafar Takhirov url https://pytorch.org/tutorials/intermediate/quantized_transfer_learning_tutorial.html","tags":["website"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%2011/","title":"Reinforcement Learning An Introduction Chapter 11","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%2011/#115-gradient-descent-in-the-bellman-error","title":"11.5 Gradient Descent in the Bellman Error","text":"

    Mean-squared temporal difference error

    \\[ \\begin{align} \\overline{TDE}(\\mathbf{w}) &= \\sum_{s \\in \\mathcal{S}} \\mu(s) \\mathbb{E}\\left[\\delta_t^2 \\mid S_t = s, A_t \\sim \\pi \\right] \\\\ &= \\sum_{s \\in \\mathcal{S}} \\mu(s) \\mathbb{E}\\left[\\rho_t \\delta_t^2 \\mid S_t = s, A_t \\sim b \\right] \\\\ &= \\mathbb{E}_b\\left[\\rho_t \\delta_t^2 \\right] \\end{align} \\]

Equation 11.23: Weight update of the naive residual-gradient algorithm

    \\[ \\begin{align} \\mathbf{w}_{t+1} &= \\mathbf{w}_t - \\frac{1}{2} \\alpha \\nabla(\\rho_t \\delta_t^2) \\\\ &= \\mathbf{w}_t - \\alpha \\rho_t \\delta_t \\nabla(\\delta_t) \\\\ &= \\mathbf{w}_t - \\alpha \\rho_t \\delta_t (\\nabla \\hat{v}(S_t, \\mathbf{w}_t) - \\gamma \\nabla \\hat{v}(S_{t+1}, \\mathbf{w}_t)) \\tag{11.23} \\\\ \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/","title":"Reinforcement Learning An Introduction Chapter 3","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#31-the-agent-environment-interface","title":"3.1 The Agent-Environment Interface","text":"

    Equation 3.1: Trajectory

    \\[ S_0,A_0,R_1,S_1,A_1,R_2,S_2,A_2,R_3, \\dots \\tag{3.1} \\]

    Equation 3.2: MDP dynamics

    \\[ p(s', r \\mid s, a) \\doteq \\Pr \\{ S_t = s', R_t = r \\mid S_{t-1} = s, A_{t-1} = a \\} \\tag{3.2} \\]

From the MDP dynamics you can obtain the state-transition probabilities via the law of total probability, and you can also obtain the expected reward.

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#32-goals-and-rewards","title":"3.2 Goals and Rewards","text":"What is the reward hypothesis?

    The reward hypothesis is the idea that all of what we mean by goals and purposes can be well thought of as the maximization of the expected value of the cumulative sum of a received scalar signal (called reward).

• The reward signal is your way of communicating to the agent what you want it to achieve, not how you want it to achieve it.
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#33-returns-and-episodes","title":"3.3 Returns and Episodes","text":"

    Equation 3.7: Undiscounted return

    \\[ G_t \\doteq R_{t+1} + R_{t+2} + R_{t+3} + \\dots + R_T \\tag{3.7} \\]

    Equation 3.8: Discounted return

    \\[ G_t \\doteq R_{t+1} + \\gamma R_{t+2} + \\gamma^2 R_{t+3} + \\dots = \\sum_{k=0}^{\\infty} \\gamma^k R_{t+k+1} \\tag{3.8} \\]

    Where \\(\\gamma\\) is the discount rate.

    Equation 3.9: Recursive definition of return

You can rewrite Eq 3.8 as a recursive definition of the return.

    \\[ G_t \\doteq R_{t+1} + \\gamma G_{t+1} \\tag{3.9} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#34-unified-notation-for-episodic-and-continuing-tasks","title":"3.4 Unified Notation for Episodic and Continuing Tasks","text":"","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#35-policies-and-value-functions","title":"3.5 Policies and Value Functions","text":"

    A policy \\(\\pi(a \\mid s)\\) is a probability distribution over actions given states.

    Equation 3.12: State-value function

    $$ v_{\\pi}(s) \\doteq \\mathbb{E}_{\\pi}[G_t \\mid S_t = s] \\;\\; \\forall s \\in \\mathcal{S} \\tag{3.12}

    $$

    Equation 3.13: Action-value function

    \\[ q_{\\pi}(s, a) \\doteq \\mathbb{E}_{\\pi}[G_t \\mid S_t = s, A_t = a] \\;\\; \\forall s \\in \\mathcal{S}, a \\in \\mathcal{A} \\tag{3.13} \\]

    Equation 3.14: Bellman equation for \\(v_{\\pi}\\)

    \\[ \\begin{align} v_\\pi(s) &\\doteq \\mathbb{E}_{\\pi}[G_t \\mid S_t = s] \\\\ &= \\mathbb{E}_{\\pi}[R_{t+1} + \\gamma G_{t+1} \\mid S_t = s] \\tag{by (3.9)} \\\\ &= \\sum_{a} \\pi(a \\mid s) \\sum_{s', r} p(s', r \\mid s, a) \\left[r + \\gamma \\mathbb{E}_{\\pi}\\left[G_{t+1} \\mid S_{t+1} = s'\\right]\\right] \\\\ &= \\sum_{a} \\pi(a \\mid s) \\sum_{s', r} p(s', r \\mid s, a) [r + \\gamma v_\\pi(s')] \\tag{3.14} \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#36-optimal-policies-and-optimal-value-functions","title":"3.6 Optimal Policies and Optimal Value Functions","text":"

    Equation 3.15: Optimal state-value function

    \\[ v_*(s) \\doteq \\max_{\\pi} v_{\\pi}(s) \\tag{3.15} \\]

    Equation 3.16: Optimal action-value function

    \\[ q_*(s, a) \\doteq \\max_{\\pi} q_{\\pi}(s, a) \\tag{3.16} \\]

    Equation 3.17

    \\[ q_*(s, a) = \\mathbb{E}[R_{t+1} + \\gamma v_*(S_{t+1}) \\mid S_t = s, A_t = a] \\tag{3.17} \\]

    Equation 3.18 and 3.19: Bellman optimality equations for \\(v_*\\)

    \\[ \\begin{align} v_*(s) &= \\max_{a \\in \\mathcal{A}(s)} q_{\\pi_*}(s, a) \\\\ &= \\max_{a} \\mathbb{E}_{\\pi_*}[G_t \\mid S_t = s, A_t = a] \\tag{by (3.9)}\\\\ &= \\max_{a} \\mathbb{E}_{\\pi_*}[R_{t+1} + \\gamma G_{t+1} \\mid S_t = s, A_t = a] \\\\ &= \\max_{a} \\mathbb{E}[R_{t+1} + \\gamma v_*(S_{t+1}) \\mid S_t = s, A_t = a] \\tag{3.18} \\\\ &= \\max_{a} \\sum_{s', r} p(s', r \\mid s, a) [r + \\gamma v_*(s')] \\tag{3.19} \\\\ \\end{align} \\]

    Equation 3.20: Bellman optimality equation for \\(q_*\\)

    \\[ \\begin{align} q_*(s, a) &= \\mathbb{E}[R_{t+1} + \\gamma \\max_{a'} q_*(S_{t+1}, a') \\mid S_t = s, A_t = a] \\\\ &= \\sum_{s', r} p(s', r \\mid s, a) [r + \\gamma \\max_{a'} q_*(s', a')] \\tag{3.20} \\end{align} \\]

    Any policy that is greedy with respect to the optimal evaluation function \\(v_*\\) is an optimal policy.

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/","title":"Reinforcement Learning An Introduction Chapter 4","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#41-policy-evaluation","title":"4.1 Policy evaluation","text":"

    Equations 4.3 and 4.4

    \\[ \\begin{align} v_{\\pi}(s) &\\doteq \\mathbb{E}_{\\pi}[G_t \\mid S_t = s] \\\\ &= \\mathbb{E}_{\\pi}[R_{t+1} + \\gamma G_{t+1} \\mid S_t = s] && (\\text{from (3.9)})\\\\ &= \\mathbb{E}_{\\pi}[R_{t+1} + \\gamma v_{\\pi}(S_{t+1}) \\mid S_t = s] && (4.3)\\\\ &= \\sum_a \\pi(a \\mid s) \\sum_{s',r} p(s', r \\mid s, a) \\left[ r + \\gamma v_{\\pi}(s') \\right] && (4.4), \\end{align} \\]

    Equation 4.5

    \\[ \\begin{align} v_{k+1}(s) &\\doteq \\mathbb{E}_{\\pi} [ R_{t+1} + \\gamma v_k(S_{t+1}) \\mid S_t = s ] \\\\ & = \\sum_a \\pi(a \\mid s) \\sum_{s', r} p(s', r \\mid s, a) \\left[ r + \\gamma v_k(s') \\right] && (4.5), \\end{align} \\]


    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#42-policy-improvement","title":"4.2 Policy Improvement","text":"

    Equation 4.6

    \\[ \\begin{align} q_\\pi(s, a) &\\doteq \\mathbb{E}[R_{t+1} + \\gamma v_\\pi(S_{t+1}) \\mid S_t = s, A_t = a] && (4.6)\\\\ &= \\sum_{s', r}p(s', r \\mid s, a)[r + \\gamma v_\\pi(s')] \\\\ \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#43-policy-iteration","title":"4.3 Policy Iteration","text":"


    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#44-value-iteration","title":"4.4 Value Iteration","text":"

    \"This algorithm is called value iteration. It can be written as a particularly simple update operation that combines the policy improvement and truncated policy evaluation steps.\"

    Equation 4.10

    \\[ \\begin{align} v_{k+1} &\\doteq \\max_{a} \\mathbb{E} [R_{t+1} + \\gamma v_k(S_{t+1}) \\mid S_t =s, A = a] \\\\ &= \\max_{a} \\sum_{s', r}p(s', r \\mid s, a)[r + \\gamma v_k(s')] && (4.10) \\\\ \\end{align} \\]


    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#45-asynchronous-dynamic-programming","title":"4.5 Asynchronous Dynamic Programming","text":"

    \"These algorithms update the values of states in any order whatsoever, using whatever values of other states happen to be available. [...] To converge correctly, however, an asynchronous algorithm must continue to update the values of all the states: it can\u2019t ignore any state after some point in the computation. Asynchronous DP algorithms allow great flexibility in selecting states to update.\"

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/","title":"Reinforcement Learning An Introduction Chapter 5","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#51-monte-carlo-prediction","title":"5.1 Monte Carlo prediction","text":"

• first-visit MC: independence assumptions, easier theoretically
• every-visit MC

    • TODO: finish notes
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#54-monte-carlo-control-without-exploring-starts","title":"5.4 Monte Carlo Control without Exploring Starts","text":"
    • \\(\\epsilon-\\)greedy policy

      • All non-greedy actions have minimum probability of \\(\\frac{\\epsilon}{|\\mathcal{A}|}\\)
      • Greedy action has probability \\((1 - \\epsilon) + \\frac{\\epsilon}{|\\mathcal{A}|}\\)
    • TODO: finish notes

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#55-off-policy-prediction-via-importance-sampling","title":"5.5 Off-policy Prediction via Importance Sampling","text":"

    Given a starting state \\(S_t\\), the probability of the subsequent state-action trajectory, \\(A_t, S_{t+1}, A_{t+1}, \\dots, S_T\\), under the policy \\(\\pi\\) is given by:

    \\[ \\begin{align} Pr\\{A_t, S_{t+1}, A_{t+1}, \\dots, S_T \\mid S_t, A_{t:T-1} \\sim \\pi\\} & = \\prod_{k=t}^{T-1} \\pi(A_k \\mid S_k) p(S_{k+1} \\mid S_k, A_k) \\end{align} \\]

Equation 5.3: Importance sampling ratio

    \\[ \\rho_{t:T-1} \\doteq \\frac{\\prod_{k=t}^{T-1} \\pi(A_k \\mid S_k) p(S_{k+1} \\mid S_k, A_k)}{\\prod_{k=t}^{T-1} b(A_k \\mid S_k) p(S_{k+1} \\mid S_k, A_k)} = \\prod_{k=t}^{T-1} \\frac{\\pi(A_k \\mid S_k)}{b(A_k \\mid S_k)} \\tag{5.3} \\]

Equation 5.4: Value function for target policy \(\pi\) under behavior policy \(b\)

    The importance sampling ratio allows us to compute the correct expected value to compute \\(v_\\pi\\):

    \\[ \\begin{align} v_\\pi(s) &\\doteq \\mathbb{E}_b[\\rho_{t:T - 1}G_t \\mid S_t = s] \\tag{5.4} \\\\ \\end{align} \\]

    Equation 5.5: Ordinary importance sampling

    \\[ V(s) \\doteq \\frac{\\sum_{t \\in \\mathcal{T}(s)} \\rho_{t:T-1} G_t}{|\\mathcal{T}(s)|} \\tag{5.5} \\]

    Equation 5.6: Weighted importance sampling

    \\[ V(s) \\doteq \\frac{\\sum_{t \\in \\mathcal{T}(s)} \\rho_{t:T-1} G_t}{\\sum_{t \\in \\mathcal{T}(s)} \\rho_{t:T-1}} \\tag{5.6} \\]

    In practice, weighted importance sampling has much lower error at the beginning.
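A small sketch of both estimators (Eqs. 5.5 and 5.6), given hypothetical first-visit returns and their importance-sampling ratios:

```python
import numpy as np

def is_estimates(returns, rhos):
    # Ordinary (Eq. 5.5) and weighted (Eq. 5.6) importance-sampling estimates of V(s)
    returns = np.asarray(returns, dtype=float)
    rhos = np.asarray(rhos, dtype=float)
    ordinary = (rhos * returns).sum() / len(returns)
    weighted = (rhos * returns).sum() / rhos.sum() if rhos.sum() > 0 else 0.0
    return ordinary, weighted

# Hypothetical returns G_t and ratios rho_{t:T-1} for three first visits to a state s
print(is_estimates(returns=[1.0, 0.0, 2.0], rhos=[0.5, 2.0, 1.0]))
```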

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#56-incremental-implementation","title":"5.6 Incremental Implementation","text":"","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#todo","title":"todo","text":"","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/","title":"Reinforcement Learning An Introduction Chapter 6","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#61-td-prediction","title":"6.1 TD Prediction","text":"

    Equation 6.2: TD(0) update

    \\[ \\begin{align} V(S_t) &\\leftarrow V(S_t) + \\alpha \\left[ R_{t+1} + \\gamma V(S_{t+1}) - V(S_t) \\right] \\tag{6.2} \\\\ \\end{align} \\]

    Equations 6.3 and 6.4: Relationship between TD(0), MC and DP

    \\[ \\begin{align} v_\\pi(s) &\\doteq \\mathbb{E}_\\pi[G_t \\mid S_t = s] \\tag{6.3} \\\\ &= \\mathbb{E}_\\pi[R_{t+1} + \\gamma G_{t+1} \\mid S_t = s] \\tag{from (3.9)} \\\\ &= \\mathbb{E}_\\pi[R_{t+1} + \\gamma v_\\pi(S_{t+1}) \\mid S_t = s] \\tag{6.4} \\\\ \\end{align} \\] Why is (6.3) called the Monte Carlo estimate?

    Because the expected value is not known, and sampled returns are used in its place.

    Why is (6.4) called the Dynamic Programming estimate?

    Although the expectation is known, the value function is not, as we use the estimate \\(V(S_t)\\).

    By looking at the previous two answers, what does TD(0) estimate and how does that differ from the previous methods?

TD(0) both maintains an estimate of the value function and uses a sampled reward as the estimate of the expectation.

    Equation 6.5: TD error

    \\[ \\begin{align} \\delta_t &\\doteq R_{t+1} + \\gamma V(S_{t+1}) - V(S_t) \\tag{6.5} \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#64-sarsa-on-policy-td-control","title":"6.4 Sarsa: On-policy TD Control","text":"

    Equation 6.7

    \\[ \\begin{align} Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\alpha \\left[ R_{t+1} + \\gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t) \\right] \\end{align} \\]


    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#65-q-learning-off-policy-td-control","title":"6.5 Q-learning: Off-policy TD Control","text":"

    Equation 6.8

    \\[ \\begin{align} Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\alpha \\left[ R_{t+1} + \\gamma \\max_a Q(S_{t+1}, a) - Q(S_t, A_t) \\right] \\end{align} \\]
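A tabular sketch of the update in Eq. 6.8, together with a hypothetical epsilon-greedy behavior policy:

```python
import numpy as np

def q_learning_update(Q, s, a, r, s_next, alpha, gamma, done):
    # Eq. 6.8: bootstrap with max_a' Q(S_{t+1}, a'), regardless of the action actually taken next
    target = r if done else r + gamma * Q[s_next].max()
    Q[s, a] += alpha * (target - Q[s, a])

def epsilon_greedy(Q, s, epsilon, rng):
    # behavior policy: explore with probability epsilon, otherwise act greedily
    if rng.random() < epsilon:
        return int(rng.integers(Q.shape[1]))
    return int(Q[s].argmax())

Q = np.zeros((10, 4))   # 10 states, 4 actions (hypothetical sizes)
rng = np.random.default_rng(0)
a = epsilon_greedy(Q, s=0, epsilon=0.1, rng=rng)
q_learning_update(Q, s=0, a=a, r=1.0, s_next=3, alpha=0.5, gamma=0.99, done=False)
```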


    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#66-expected-sarsa","title":"6.6 Expected SARSA","text":"

    Equation 6.9

    \\[ \\begin{align} Q(S_t, A_t) &\\leftarrow Q(S_t, A_t) + \\alpha \\left[ R_{t+1} + \\gamma \\mathbb{E}_\\pi [Q(S_{t+1}, A_{t+1}) \\mid S_{t+1}] - Q(S_t, A_t) \\right] \\\\ &= Q(S_t, A_t) + \\alpha \\left[ R_{t+1} + \\gamma \\sum_a \\pi(a \\mid S_{t+1}) Q(S_{t+1}, a) - Q(S_t, A_t) \\right] && (6.9) \\end{align} \\]

It's more computationally demanding, but it's more stable and fares better than Q-learning and Sarsa.

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#67-maximization-bias-and-double-learning","title":"6.7 Maximization Bias and Double Learning","text":"

    \"All the control algorithms that we have discussed so far involve maximization in the construction of their target policies\"

This causes maximization bias:
• Think of estimating the mean of N(-0.1, 1)
• This estimate might at some point be 0.1, while the other option might correctly be 0
• The optimal choice is to pick 0, but because we take the max of an estimate, we positively bias ourselves

    The general way to solve it is to estimate two different value functions, one for getting the value (\\(Q_2\\)) and the other for obtaining the best action \\(Q_1\\).

    \\[ \\begin{align} A^* &= \\text{argmax}_a Q_1(a) \\\\ Q_2(A^*) &= Q_2(\\text{argmax}_a Q_1(a)) \\end{align} \\]

    This effectively debiases the estimate \\(\\mathbb{E}[Q_2(A^*)] = q(A^*)\\)
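A tiny numerical sketch of why the single-estimator max is biased while the double estimator is not (all true action values are 0 here):

```python
import numpy as np

rng = np.random.default_rng(0)
n_actions, n_samples = 5, 1000

# All true action values are 0, so max_a q(a) = 0
rewards = rng.normal(loc=0.0, scale=1.0, size=(n_samples, n_actions))

Q1 = rewards[: n_samples // 2].mean(axis=0)   # estimate from the first half of the data
Q2 = rewards[n_samples // 2 :].mean(axis=0)   # independent estimate from the second half

single = Q1.max()              # biased upward: E[max_a Q1(a)] > 0
a_star = int(Q1.argmax())      # select the action with Q1 ...
double = Q2[a_star]            # ... evaluate it with Q2: unbiased for q(a_star)

print(f'single-estimator max: {single:.3f}, double estimate: {double:.3f}')
```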

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%207/","title":"Reinforcement Learning An Introduction Chapter 7","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%207/#71-n-step-td-prediction","title":"7.1 \\(n\\)-step TD prediction","text":"

    One-step return:

    \\[ G_{t:t+1} \\doteq R_{t+1} + \\gamma V_t(S_{t+1}) \\]

    Equation 7.1: \\(n\\)-step return

    \\[ G_{t:t+n} \\doteq R_{t+1} + \\gamma R_{t+2} + \\dots + \\gamma^{n-1} R_{t+n} + \\gamma^n V_{t + n - 1}(S_{t+n}) \\tag{7.1} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/","title":"Reinforcement Learning An Introduction Chapter 9","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#9-on-policy-prediction-with-approximation","title":"9. On-policy prediction with approximation","text":"

Problem setting: In most real scenarios, the number of states is too large for tabular learning algorithms, so we approximate the value function by a learned, parametrized function: \(\hat{v}(s, \mathbf{w}) \approx v_\pi(s)\)
• Examples of possible modelling choices for this function: linear functions, non-linear functions, neural networks, etc.
• \(\mathbf{w} \in R^d\) , \(d \ll |\mathcal{S}|\) , which means that updating one state affects multiple states: generalization
• This formulation allows for partially observable states.
• Side note: not all convergence proofs apply to all function classes (for more info see UCL x DeepMind 7/13)

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#91-value-function-approximation","title":"9.1 Value-function approximation","text":"

    New notation! (\\(s\\to u\\) is an update rule for \\(v(s)\\) using new expression \\(u\\))

    How does the learning setting differ between neural networks (supervised) and reinforcement learning?

    RL requires modeling to allow:

    • online learning (while interacting with environment), incrementally acquire data
      • Remember that supervised learning suffers from catastrophic forgetting
    • Non-stationary target functions

    Supervised Learning assumes iid sampling from a fixed but unknown data distribution

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#92-the-prediction-objective-overlineve","title":"9.2 The Prediction Objective (\\(\\overline{VE}\\))","text":"Why do we need a prediction objective now? What has changed?

    In the tabular setting we had two nice properties:

    • the learned value function could actually converge exactly to the true value function
    • the value of a state was decoupled from other states

    Without these two, we must say which states are most important to us.

    Equation 9.1

    \\[ \\begin{align} \\overline{VE}(\\mathbf{w}) &\\doteq \\sum_{s \\in \\mathcal{S}} \\mu(s) \\left[v_{\\pi}(s) - \\hat{v}(s, \\mathbf{w})\\right]^2 && \\tag{9.1} \\end{align} \\]

    Where: - \\(\\mu(s)\\) is the state distribution (reminder: non-negative, sums to one)

    For on-policy episodic tasks, \\(\\mu(s)\\) is called the on-policy distribution, which can be defined as follows:

    Equations 9.2 and 9.3

    \\[ \\begin{align} \\eta(s) = h(s) + \\sum_{\\bar{s}} \\eta(\\bar{s}) \\sum_a \\pi(a \\mid \\bar{s})p(s \\mid \\bar{s}, a), && \\text{for all } s \\in S && \\tag{9.2} \\end{align} \\] \\[ \\begin{align} \\mu(s) = \\frac{\\eta(s)}{\\sum_{s'}\\eta(s')} && \\tag{9.3} \\end{align} \\]

    Where: - \\(h(s)\\) is the probability that an episode begins in a state \\(s\\). - \\(\\eta(s)\\) is the number of time steps spent on average in a state \\(s\\) for a single episode. - Interpretation of 2 terms: Time is spent in a \\(s\\) if an episode starts in \\(s\\) or if another state transitions into \\(s\\).

    • \\(\\overline{VE}\\) only guarantees local optimality.
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#93-stochastic-gradient-and-semi-gradient-methods","title":"9.3 Stochastic-gradient and Semi-gradient Methods","text":"

    Equations 9.4 and 9.5

    \\[ \\begin{align} \\mathbf{w}_{t+1} &= \\mathbf{w}_t - \\frac{1}{2} \\alpha \\nabla \\left[v_{\\pi}(S_t) - \\hat{v}(S_t, \\mathbf{w}_t) \\right] && \\tag{9.4} \\\\ &= \\mathbf{w}_t + \\alpha \\left[v_{\\pi}(S_t) - \\hat{v}(S_t, \\mathbf{w}_t) \\right] \\nabla \\hat{v}(S_t, \\mathbf{w}_t) && \\tag{9.5} \\end{align} \\]

    However, since we don't know the true \\(v_\\pi(s)\\), we can replace it with the target output \\(U_t\\):

    Equation 9.7

    \\[ \\begin{align} \\mathbf{w}_{t+1} &= \\mathbf{w}_t + \\alpha \\left[U_t - \\hat{v}(S_t, \\mathbf{w}_t) \\right] \\nabla \\hat{v}(S_t, \\mathbf{w}_t) && \\tag{9.7} \\end{align} \\]

    Where: - \\(U_t\\) should be an unbiased estimate of \\(v_\\pi(s)\\), that is: - \\(\\mathbb{E}[U_t \\mid S_t=s] = v_\\pi(s)\\) - With local optimum convergence guarantees.

    Examples of \\(U_t\\): - Monte Carlo target: \\(U_t = G_t\\) (that is, the reward achieved until the end of the episode), unbiased. - Bootstrapping targets are biased because they depend on \\(\\mathbf{w}\\) through \\(\\hat{v}(S_t, \\mathbf{w})\\) . - To make them unbiased, you can treat the dependent expressions as constants (stop the gradient flow). This yields semi-gradient methods.

Semi-gradient methods:
• Do not converge as robustly as gradient methods, aside from the linear case.
• Faster, enable online/continual learning.
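A minimal sketch of a semi-gradient TD(0) update with the linear value function of Eq. 9.8, where the bootstrap part of the target is treated as a constant:

```python
import numpy as np

def semi_gradient_td0_update(w, x_s, r, x_s_next, alpha, gamma, done):
    # v_hat(s, w) = w @ x(s)  (Eq. 9.8), so the gradient is just x(s)
    v = w @ x_s
    v_next = 0.0 if done else w @ x_s_next
    target = r + gamma * v_next              # U_t, treated as a constant (semi-gradient)
    return w + alpha * (target - v) * x_s    # Eq. 9.7

w = np.zeros(8)
x_s, x_s_next = np.random.rand(8), np.random.rand(8)   # feature vectors x(S_t), x(S_{t+1})
w = semi_gradient_td0_update(w, x_s, r=1.0, x_s_next=x_s_next, alpha=0.1, gamma=0.9, done=False)
```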

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#94-linear-methods","title":"9.4 Linear Methods","text":"

    Equation 9.8

    \\[ \\begin{align} \\hat{v}(s, \\mathbf{w}) \\doteq \\mathbf{w}^\\intercal \\mathbf{x}(s) = \\sum_{i=1}^d w_i x_i(s) && \\tag{9.8} \\end{align} \\]

    Where:

    • \\(\\mathbf{x}(s) = \\left(x_1(s), \\dots, x_d(s)\\right)^\\intercal\\)
    • The gradient Monte Carlo algorithm converges to the global optimum of the VE under linear function approximation if \\(\\alpha\\) is reduced over time according to the usual conditions.
• Chapter also explores the convergence of TD(0) with SGD and linear approximation and finds it converges to the TD fixed point (Eqs. 9.11, 9.12), \(\mathbf{w}_{TD}\). This is not the global optimum, but a point near it.

    Equation 9.14

    Interpretation: The asymptotic error of the TD method is no more than \\(\\frac{1}{1-\\gamma}\\) times the smallest possible error.

    \\[ \\begin{align} \\overline{VE}(\\mathbf{w}_{TD}) & \\leq \\frac{1}{1-\\gamma} \\min_{\\mathbf{w}} \\overline{VE}(\\mathbf{w}) \\tag{9.14} \\end{align} \\]

    Equation 9.15

    \\[ \\mathbf{w}_{t+n} \\doteq \\mathbf{w}_{t+n-1} + \\alpha \\left[ G_{t:t+n} - \\hat{v}(S_t, \\mathbf{w}_{t+n-1}) \\right] \\nabla \\hat{v}(S_t, \\mathbf{w}_{t+n-1}), \\quad 0 \\leq t < T, \\tag{9.15} \\]

    Equation 9.16

    \\[ G_{t:t+n} \\doteq R_{t+1} + \\gamma R_{t+2} + \\cdots + \\gamma^{n-1} R_{t+n} + \\gamma^n \\hat{v}(S_{t+n}, \\mathbf{w}_{t+n-1}), \\quad 0 \\leq t \\leq T - n. \\tag{9.16} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#95-feature-construction-for-linear-methods","title":"9.5 Feature Construction for Linear Methods","text":"
    • 9.5.1 Polynomials
    • 9.5.2 Fourier Basis
    • 9.5.3 Coarse coding
    • 9.5.4 Tile Coding
    • 9.5.5 Radial Basis Functions
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#96-selecting-step-size-parameters-manually","title":"9.6 Selecting Step-Size Parameters Manually","text":"

    Equation 9.19

    Suppose you wanted to learn in about \\(\\tau\\) experiences with substantially the same feature vector. A good rule of thumb for setting the step-size parameter of linear SGD methods is:

    \\[ \\begin{align} \\alpha \\doteq \\left(\\tau \\mathbb{E}\\left[\\mathbf{x}^\\intercal\\mathbf{x}\\right]\\right)^{-1} \\tag{9.19} \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction/","title":"Reinforcement Learning An Introduction","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018
    • Reinforcement Learning - An Introduction - Chapter 3
    • Reinforcement Learning - An Introduction - Chapter 4
    • Reinforcement Learning - An Introduction - Chapter 6
    • Reinforcement Learning - An Introduction - Chapter 9
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing%20-%20Lecture%2012/","title":"TinyML and Efficient Deep Learning Computing Lecture 12","text":"Properties authors Song Han year 2023 url https://www.dropbox.com/scl/fi/spgvr9owflz6s1lt5po17/lec12.pdf?rlkey=cwqpteopgvsdgnxd8xtcniadr&e=2&dl=0","tags":["lecture"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing%20-%20Lecture%203/","title":"TinyML and Efficient Deep Learning Computing Lecture 3","text":"Properties authors Song Han year 2023 url https://www.dropbox.com/scl/fi/2oxmtvoeccyuw47yfambb/lec03.pdf?rlkey=3ykm0g21ibsoqn7xnw43v7aaw&e=1&dl=0","tags":["lecture"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing%20-%20Lecture%205/","title":"TinyML and Efficient Deep Learning Computing Lecture 5","text":"Properties authors Song Han year 2023 url https://www.dropbox.com/scl/fi/eos92o2fgys6gk0gizogl/lec05.pdf?rlkey=2hohvi8jcvjw3f8m8vugfa2mz&e=1&dl=0

Content:
1. Reviews numeric datatypes (floating point, etc.)
2. Covers the basic concepts of quantization
3. Introduces three common types of neural network quantization:
   • K-Means-based Quantization
   • Linear Quantization
   • Binary and Ternary Quantization (will be covered in Lecture 6)

    ","tags":["lecture"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing%20-%20Lecture%206/","title":"TinyML and Efficient Deep Learning Computing Lecture 6","text":"Properties authors Song Han year 2023 url https://www.dropbox.com/scl/fi/1mo0umu0qtq7uxap2l5m3/lec06.pdf?rlkey=bdl2mgusgajddjuvjxb0fot36&e=2&dl=0

Content:
1. Quantization Granularity
   1. Per-tensor quantization: the same quantization parameters for the entire matrix
   2. Per-channel quantization: channels sometimes have considerably different weight distributions, so use different quantization parameters per channel/row
   3. Group quantization: similar idea
2. Dynamic Range Clipping
   • To quantize activations, we must keep track of activation statistics
   • Use KL divergence to measure information loss
   • Allocating dynamic range to outliers hurts representation ability
3. Rounding

Quantization-Aware Training:
• To minimize the loss of accuracy, especially for aggressive quantization at 4 bits and lower bit widths, the neural network is trained/fine-tuned with quantized weights and activations.
• Usually, fine-tuning a pre-trained floating-point model provides better accuracy than training from scratch.
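A small sketch contrasting per-tensor and per-channel (per-row) symmetric linear quantization of a weight matrix, as in the granularity discussion above:

```python
import torch

def quantize_symmetric(w: torch.Tensor, n_bits: int = 8, per_channel: bool = False):
    qmax = 2 ** (n_bits - 1) - 1
    if per_channel:
        scale = w.abs().amax(dim=1, keepdim=True) / qmax   # one scale per output channel/row
    else:
        scale = w.abs().max() / qmax                        # a single scale for the whole tensor
    q = torch.clamp(torch.round(w / scale), -qmax - 1, qmax)
    return q, scale

w = torch.randn(16, 64)
q_t, s_t = quantize_symmetric(w, per_channel=False)
q_c, s_c = quantize_symmetric(w, per_channel=True)
print('per-tensor error:', (w - q_t * s_t).abs().mean().item())
print('per-channel error:', (w - q_c * s_c).abs().mean().item())
```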

    ","tags":["lecture"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing/","title":"TinyML and Efficient Deep Learning Computing","text":"Properties authors Song Han year 2023 url https://hanlab.mit.edu/courses/2023-fall-65940","tags":["course"]},{"location":"100%20Reference%20notes/104%20Other/Tweet%20-%20Stable%20Diffusion%20XL%20on%20iPhone%20with%20Core%20ML%21/","title":"Tweet Stable Diffusion XL on iPhone with Core ML!","text":"Properties authors Atila Orhon year 2023 url https://x.com/atiorh/status/1707402410870862002

    We compressed the diffusion model using our Mixed-Bit Palettization technique (described in https://huggingface.co/blog/stable-diffusion-xl-coreml\u2026) which yields an average of 4.04-bits (5.2GB -> 1.3GB) while maintaining higher accuracy than linear 8-bit quantization. Compressed model runs faster too

Notes:
• 4 times smaller memory footprint
• Better than linear 8-bit quantization
• Faster inference time

    ","tags":["efficient_dl","tweet"]}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"

    {{ blog_content }}

    "},{"location":"000%20Zettelkasten/2D%20Convolutions/","title":"2D Convolutions","text":"

    Fully comprehensive resource with animations: Conv2d

    ","tags":["cnn"]},{"location":"000%20Zettelkasten/Ahead-of-Time%20%28AOT%29%20Compilation/","title":"Ahead of Time (AOT) Compilation","text":"

    Generally: Compilation that occurs before the program is executed.

Specifically for ML (PyTorch):
• When a model is AOT compiled (using torch.jit.script, torch.jit.trace, or torch.export), the entire program is translated from Python into an intermediate representation that is independent of it. That is, you don't need a Python interpreter to run that IR.
• Note: TorchScript is AOT in the sense that it requires capturing the whole graph before runtime, but it performs further optimizations just-in-time.
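A minimal sketch of the AOT path with TorchScript (torch.export works similarly, but produces an ExportedProgram instead of a scriptable module):

```python
import torch

class Net(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

# Whole-program capture ahead of time: the model is translated to TorchScript IR
scripted = torch.jit.script(Net())
print(scripted.graph)

# The saved artifact can be loaded and run without a Python interpreter (e.g., from libtorch/C++)
scripted.save('net_scripted.pt')
```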

    ","tags":["compilers","pytorch","optimization"]},{"location":"000%20Zettelkasten/Are%20less%20inductive%20biases%20better%20or%20worse%3F/","title":"Are less inductive biases better or worse?","text":"

There's a general consensus that fewer inductive biases are better, intuitively because weaker constraints ease optimization and allow for more hardware-friendly architectures, among other benefits.

First, An image is worth 16x16 words - Transformers for image recognition at scale shows that ViTs, with minimal inductive biases, outperform ConvNets. ViTs have:
• No translational equivariance baked in
• No locality inductive bias enforced
• Positional encodings that are mostly learned and randomly/zero initialized, although fixed sinusoidal encodings can be used
They show that Vision Transformers scale better than ConvNets and mixed architectures (convolutional stems + Transformer).

    A ConvNet for the 2020s proves that ResNets are outdated and improves the network with recent advances to match ViTs performance.

The Lie derivative for measuring learned equivariance shows a surprising result: ViTs exhibit more translational equivariance after training than ConvNets, as measured by their Lie derivative.

    An Image is Worth More Than 16x16 Patches - Exploring Transformers on Individual Pixels tackles the toy question of dropping the convolutional stem that does the patchification in ViTs, with the intention of further reducing inductive biases. They prove that the resulting model (although too computationally intensive to be used in practice), competes with ViTs.

How do vision transformers work? argues that the benefit of Vision Transformers is not that they have fewer inductive biases, but that their operations are input-dependent (see Input-dependent convolutions) and that Self Attention acts as a smoothing mechanism (which helps with better training dynamics in large data regimes). They ablate this by constraining ViTs' attention to be local, outperforming ViTs with global attention in both small and large data regimes. This is a strong indication that locality constraints are useful.

Learning with Unmasked Tokens Drives Stronger Vision Learners implicitly counter-argues How do vision transformers work? by noticing that MIM-trained ViTs exhibit localized attention maps and \"fixing\" it. Their approach outperforms other MIM-trained ViTs, so whether locality is a good inductive bias is not definitively answered.

    ","tags":["dl_theory","question"]},{"location":"000%20Zettelkasten/Are%20less%20inductive%20biases%20better%20or%20worse%3F/#vits-vs-dense-prediction-tasks","title":"ViTs vs Dense prediction tasks","text":"

    A ConvNet for the 2020s mentions that ViTs struggle on dense prediction tasks and they require hierarchical architectural choices (Swin Transformer) to do well. These choices re-introduce inductive biases.

However, there's recent promising work that is (I think) successfully dropping these constraints:
• Exploring Plain Vision Transformer Backbones for Object Detection
• SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation

    ","tags":["dl_theory","question"]},{"location":"000%20Zettelkasten/Bit%20Palettization/","title":"Bit Palettization","text":"

    Seems to be similar to K-Means-based Quantization.

    [...] we use 6-bit palettization, a type of quantization that compresses model weights from a 16-bit floating-point representation to just 6 bits per parameter. The name \u201cpalettization\u201d refers to a technique similar to the one used in computer graphics to work with a limited set of colors: the color table (or \u201cpalette\u201d) contains a fixed number of colors, and the colors in the image are replaced with the indexes of the closest colors available in the palette. This immediately provides the benefit of drastically reducing storage size, and thus reducing download time and on-device disk use.

References:
• https://huggingface.co/blog/stable-diffusion-xl-coreml#what-is-mixed-bit-palettization
• https://huggingface.co/blog/fast-diffusers-coreml

Notes:
• Multiplying by this weight matrix should intuitively be slower; it would be interesting to see the speed vs. memory tradeoff. Tweet - Stable Diffusion XL on iPhone with Core ML! suggests that it actually runs faster than the non-quantized alternative.
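A toy sketch of the idea (cluster the weights, then store a small palette plus low-bit indices); scikit-learn's KMeans is assumed here purely for illustration:

```python
import numpy as np
from sklearn.cluster import KMeans  # assumption: scikit-learn is available

def palettize(weights: np.ndarray, n_bits: int = 6):
    # Cluster the weights into 2**n_bits centroids (the 'palette') and
    # keep only per-weight indices into that palette.
    km = KMeans(n_clusters=2 ** n_bits, n_init=4, random_state=0)
    km.fit(weights.reshape(-1, 1))
    palette = km.cluster_centers_.ravel()
    indices = km.labels_.reshape(weights.shape)   # would be packed to n_bits per weight
    return palette, indices

def depalettize(palette, indices):
    return palette[indices]   # reconstruct a dense matrix before (or while) doing the matmul

w = np.random.randn(256, 64).astype(np.float32)
palette, idx = palettize(w)
print('mean abs error:', np.abs(w - depalettize(palette, idx)).mean())
```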

    ","tags":["efficient_dl","transformers"]},{"location":"000%20Zettelkasten/Block%20Expansion/","title":"Block Expansion","text":"

Key idea: introduce an extra transformer block that is initialized to be the identity function and train only that block.

    From Parameter Efficient Fine-tuning of Self-supervised ViTs without Catastrophic Forgetting

    We introduce the concept of Block Expansion for fine-tuning pre-trained ViTs, building upon an idea that was recently proposed for language models\u00a0[27]\u00a0but has yet to be explored in vision. This technique is used to augment the capacity of a model without altering its initial output. In a ViT model comprised of sequential transformer blocks\u00a0(\\(\\phi_0,\\phi_1,\u2026,\\phi_N\\)), Block Expansion adds an identity block\u00a0(\\(\\phi_{id}\\))\u00a0after a set of transformer blocks such that\u00a0\\(\\phi_{id}(x)=x\\), meaning it returns the input as its output, ensuring the model\u2019s output remains unchanged immediately after expansion. To expand a model from\u00a0\ud835\udc41\u00a0to\u00a0\ud835\udc41\u2032\u00a0blocks, the original blocks are first grouped into sets containing\u00a0\ud835\udc40\u00a0blocks each. Within each set, an identity copy of the topmost block is created and placed on top, effectively increasing the model\u2019s depth without initially changing its behavior. In each newly expanded block, two linear layers are zero-initialized to enable identity mapping, as shown in Figure\u00a01\u00a0(c). These newly added blocks are only fine-tuned with the new data while the remaining blocks are frozen.
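A rough sketch of the expansion step, under the assumption of timm-style pre-norm ViT blocks where `attn.proj` and `mlp.fc2` are the two output linear layers of the residual branches (these attribute names are assumptions, not the paper's code):

```python
import copy
import torch.nn as nn

def block_expansion(blocks: nn.ModuleList, group_size: int) -> nn.ModuleList:
    # After every group of `group_size` blocks, append an identity copy of the topmost
    # block whose two output linear layers are zero-initialized, so its residual
    # branches output zero and the expanded model initially computes the same function.
    expanded = []
    for i, block in enumerate(blocks):
        for p in block.parameters():
            p.requires_grad_(False)            # original blocks stay frozen
        expanded.append(block)
        if (i + 1) % group_size == 0:
            new_block = copy.deepcopy(block)
            for layer in (new_block.attn.proj, new_block.mlp.fc2):  # assumed attribute names
                nn.init.zeros_(layer.weight)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)
            for p in new_block.parameters():
                p.requires_grad_(True)         # only the new blocks are fine-tuned
            expanded.append(new_block)
    return nn.ModuleList(expanded)
```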

    ","tags":["efficient_dl","transformers"]},{"location":"000%20Zettelkasten/Convergence%20rate%20and%20Hessian%20spectra/","title":"Convergence rate and Hessian spectra","text":"
    • Remember: if the Hessian is positive definite everywhere, the function is convex; negative eigenvalues therefore signal non-convex directions.
    • Large Hessian eigenvalues connect to Metrics for flatness: some metrics, such as the maximum Hessian eigenvalue, measure the worst-case loss increase under an adversarial perturbation to the weights [10, 16], while other proposed metrics, such as the Hessian trace, measure the expected loss increase under random perturbations to the weights.
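
    For reference, the maximum Hessian eigenvalue is typically estimated with power iteration on Hessian-vector products; a minimal sketch (the loss and parameter list are placeholders I'm assuming):

    import torch\n\ndef max_hessian_eigenvalue(loss, params, iters=20):\n    # power iteration with Hessian-vector products via double backprop\n    grads = torch.autograd.grad(loss, params, create_graph=True)\n    v = [torch.randn_like(p) for p in params]\n    for _ in range(iters):\n        hv = torch.autograd.grad(grads, params, grad_outputs=v, retain_graph=True)\n        norm = torch.sqrt(sum((h * h).sum() for h in hv))\n        v = [h / norm for h in hv]\n    hv = torch.autograd.grad(grads, params, grad_outputs=v, retain_graph=True)\n    return sum((h * u).sum() for h, u in zip(hv, v))  # Rayleigh quotient, converges to the largest-magnitude eigenvalue\n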
    ","tags":["optimizability"]},{"location":"000%20Zettelkasten/Depthwise%20separable%20convolutions/","title":"Depthwise separable convolutions","text":"

    Splits the computation into two steps:\u00a0depthwise convolution\u00a0applies a single convolutional filter per each input channel and\u00a0pointwise convolution\u00a0is used to create a linear combination of the output of the depthwise convolution.
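
    A minimal PyTorch sketch of the factorization (channel counts are arbitrary):

    import torch.nn as nn\n\nc_in, c_out, k = 64, 128, 3\ndepthwise = nn.Conv2d(c_in, c_in, kernel_size=k, padding=k // 2, groups=c_in)  # one k x k filter per input channel\npointwise = nn.Conv2d(c_in, c_out, kernel_size=1)  # 1x1 conv linearly combines the channels\nseparable = nn.Sequential(depthwise, pointwise)\n# parameters: c_in*k*k + c_in*c_out, versus c_in*c_out*k*k for a standard convolution\n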

    Related ideas are often used to reduce the size/complexity of convolutional layers. It reduces the expressivity of convolutions but uses fewer parameters. For example, Exploiting Redundancy - Separable Group Convolutional Networks on Lie Groups

    Also used in (ConvNext) A ConvNet for the 2020s

    ","tags":["cnn"]},{"location":"000%20Zettelkasten/Do%20Vision%20Foundation%20models%20exist%3F/","title":"Do Vision Foundation models exist?","text":"","tags":["question","foundation_models","computer_vision"]},{"location":"000%20Zettelkasten/Do%20Vision%20Foundation%20models%20exist%3F/#object-detection","title":"Object detection","text":"

    Research using DINOv2 as a backbone for object detection:

    DINOv2 \u274c - Poor Object Detection Performance with DINOv2 Backbone and Faster R-CNN Head on Cityscapes Dataset - Uses a Mask R-CNN head but is still relevant; maybe DINOv2 is not a good object detection backbone?

    DINOv2 \u2705

    \"NVIDIA has also released a foundational model called NV-Dinov2, which is available through the NVIDIA AI Enterprise program. NV-Dinov2 is a visual foundational model trained on an NVIDIA proprietary large scale dataset.\" NV-DINOv2 - NVIDIA provides CLIP VIT and DINO VIT backbones for object detection and segmentation (closed source) - This signals that it is not only possible but actually useful in production (the tao toolkit specifically markets to providing enterprise-ready vision transformers) - However it also very specifically states the inferior performance of vits compared with specifically trained dense-prediction networks: > \"To mitigate the inferior performance of a standard vision transformer (ViT) on dense prediction tasks, TAO supports the\u00a0ViT-Adapter_\u00a0architecture. This allows a powerful ViT that has learned rich semantic representations from a large corpus of data to achieve comparable performance to vision-specific transformers on dense prediction tasks.\"

    • Exploring Plain Vision Transformer Backbones for Object Detection

      • VitDET with DINO backbone gh issue
        • There are some caveats, but they are fixable
    • SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation

      • Improves over ViTDet
    ","tags":["question","foundation_models","computer_vision"]},{"location":"000%20Zettelkasten/Equivariance%20Initialization/","title":"Equivariance Initialization","text":"

    Related: - Priors over Neural Network weights

    ","tags":["dl_theory"]},{"location":"000%20Zettelkasten/Group%20Axioms/","title":"Group Axioms","text":"

    A group is a non-empty set \\(G\\) together with a binary operation on \\(G\\) (\\(\\cdot\\)), that fulfills the following axioms: 1. Associativity: For all \\(a, b, c \\in G\\), one has \\((a \\cdot b) \\cdot c = a \\cdot (b \\cdot c)\\) 2. Identity element: There exists an element \\(e\\in G\\) such that, for every \\(a \\in G\\), \\(e \\cdot a = a\\) and \\(a \\cdot e = a\\) 3. Inverse element: For each \\(a\\in G\\), there exists a unique element \\(b\\in G\\) such that \\(a \\cdot b = e\\) and \\(b \\cdot a = e\\), where \\(e\\) is the identity element. The inverse of \\(a\\) is denoted as \\(a^{-1}\\)
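
    A standard sanity-check example (mine, not from the note): the integers \\(\mathbb{Z}\\) under addition form a group, with identity \\(e = 0\\) and inverse \\(a^{-1} = -a\\); the natural numbers under addition do not, because inverses are missing.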

    ","tags":["math"]},{"location":"000%20Zettelkasten/Group%20direct%20product/","title":"Group direct product","text":"

    Given groups \\(G\\) (with operation *) and \\(H\\) (with operation \\(\\Delta\\)), the direct product \\(G \\times H\\) is defined as follows: 1. The underlying set is the Cartesian product, \\(G \\times H\\). That is, the ordered pairs \\((g, h)\\), where \\(g \\in G\\) and \\(h \\in H\\). 2. The binary operation on \\(G \\times H\\) is defined component-wise.

    \\[ (g_1, h_1) \\cdot (g_2, h_2) = (g_1 * g_2, h_1 \\Delta h_2) \\]

    The resulting algebraic object satisfies the Group Axioms.
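
    A quick example (mine): \\(\mathbb{Z}_2 \times \mathbb{Z}_2\\) with component-wise addition modulo 2 is the Klein four-group; the identity is \\((0, 0)\\) and every element is its own inverse.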

    ","tags":["math"]},{"location":"000%20Zettelkasten/Hardware-specific%20structured%20pruning/","title":"Hardware specific structured pruning","text":"

    Key Idea

    Some GPU architectures can take advantage of specific sparsity patterns.

    According to this the training procedure would look as follows:

    NVIDIA has developed a simple and universal recipe for sparsifying deep neural networks for inference\u00a0using this 2:4 structured sparsity pattern. The network is first trained using dense weights, then fine-grained structured pruning is applied, and finally the remaining non-zero weights are fine-tuned with additional training steps. This method results in virtually no loss in inferencing accuracy based on evaluation across dozens of networks spanning vision, object detection, segmentation, natural language modeling, and translation.
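
    A rough sketch of the fine-grained pruning step (keep the 2 largest-magnitude weights in every group of 4); this is my own illustration, not NVIDIA's ASP tooling:

    import torch\n\ndef prune_2_of_4(w):\n    # w: 2D weight matrix whose number of elements is divisible by 4\n    groups = w.reshape(-1, 4)\n    idx = groups.abs().topk(2, dim=1).indices  # indices of the 2 largest |w| per group of 4\n    mask = torch.zeros_like(groups).scatter_(1, idx, 1.0)\n    return (groups * mask).reshape(w.shape)  # zero the other 2; fine-tune the survivors afterwards\n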

    References: - TinyML and Efficient Deep Learning Computing - Lecture 3 - https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/ - https://developer.nvidia.com/blog/structured-sparsity-in-the-nvidia-ampere-architecture-and-applications-in-search-engines/

    ","tags":["efficient_dl","hardware_aware_dl"]},{"location":"000%20Zettelkasten/Input-dependent%20convolutions/","title":"Input dependent convolutions","text":"
    • How do vision transformers work? states that the key advantage of Self Attention over Convolutions is not the long range dependencies (global attention) but rather its data specificity (aka input dependency)
    • This is related to Mamba - Linear-Time Sequence Modeling with Selective State Spaces's insight :
      • \"We identify a key limitation of prior models: the ability to efficiently select data in an input-dependent manner (i.e. focus on or ignore particular inputs).\"

    There most likely is work on input-dependent convolutions: - [ ] CKConv - Continuous Kernel Convolution For Sequential Data is probably related, but haven't read it in full. Check this. - [ ] Review literature on input-dependent convolutions

    ","tags":["cnn","theory"]},{"location":"000%20Zettelkasten/K-Means-based%20Quantization/","title":"K Means based Quantization","text":"

    Perform clustering on the weights, then replace the weight matrix with an integer index matrix (recording which cluster each weight entry belongs to) plus a small list of float centroids.

    Storing integers consumes less memory while keeping full precision on the float centroids (although you still lose precision, because a centroid does not necessarily correspond to an actual value in the original weight matrix).
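
    A minimal sketch with scikit-learn (16 clusters, i.e. 4-bit indices, is an arbitrary choice of mine):

    import numpy as np\nfrom sklearn.cluster import KMeans\n\nw = np.random.randn(512, 512).astype(np.float32)  # stand-in weight matrix\nkm = KMeans(n_clusters=16, n_init=10).fit(w.reshape(-1, 1))\nindices = km.labels_.astype(np.uint8).reshape(w.shape)  # 4-bit codes (held in uint8 here)\ncentroids = km.cluster_centers_.astype(np.float32).ravel()  # 16 float centroids\nw_hat = centroids[indices]  # dequantized (approximate) weights\n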

    Resources: - https://scikit-learn.org/stable/auto_examples/cluster/plot_color_quantization.html - TinyML and Efficient Deep Learning Computing - Lecture 5

    ","tags":["efficient_dl"]},{"location":"000%20Zettelkasten/KV%20Cache/","title":"KV Cache","text":"

    From: TinyML and Efficient Deep Learning Computing - Lecture 12
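
    The note is only a pointer, so as a reminder of the mechanism (my summary): during autoregressive decoding, the keys and values of all previously processed tokens are cached per layer, so each new token only computes its query against the cache instead of re-running attention over the whole prefix. A single-head, unbatched sketch (the projection matrices and cache dict are my placeholders):

    import torch\n\ndef decode_step(x_new, w_q, w_k, w_v, cache):\n    # x_new: (1, d) embedding of the newest token; cache holds K and V of all previous tokens\n    q = x_new @ w_q\n    cache['k'] = torch.cat([cache['k'], x_new @ w_k])  # append this token's key\n    cache['v'] = torch.cat([cache['v'], x_new @ w_v])  # append this token's value\n    attn = torch.softmax(q @ cache['k'].T / cache['k'].shape[-1] ** 0.5, dim=-1)\n    return attn @ cache['v']  # (1, d) attention output\n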

    ","tags":["efficient_dl","transformers"]},{"location":"000%20Zettelkasten/Linear%20Quantization/","title":"Linear Quantization","text":"

    Visualization

    Then, for each layer in your network (linear, conv, etc.), you represent the matrices involved as in the formulation above (integer values plus a scale and zero point), work out which terms can be precomputed or zeroed out, and voil\u00e1.
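
    A minimal sketch of per-tensor affine (linear) quantization, where x is approximated by scale * (q - zero_point) (my own implementation of the textbook formulation):

    import torch\n\ndef linear_quantize(x, n_bits=8):\n    # asymmetric linear quantization: x ~= scale * (q - zero_point)\n    qmin, qmax = 0, 2 ** n_bits - 1\n    scale = (x.max() - x.min()) / (qmax - qmin)\n    zero_point = (qmin - torch.round(x.min() / scale)).clamp(qmin, qmax)\n    q = (torch.round(x / scale) + zero_point).clamp(qmin, qmax)\n    return q.to(torch.uint8), scale, zero_point\n\ndef linear_dequantize(q, scale, zero_point):\n    return scale * (q.float() - zero_point)\n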

    ","tags":["efficient_dl"]},{"location":"000%20Zettelkasten/LoRa%20Adapter/","title":"LoRa Adapter","text":"

    Image source: https://medium.com/@bnjmn_marie/lora-load-and-merge-your-adapters-with-care-3204119f0426
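
    Since the note is just an image pointer, a quick reminder of the mechanism (my sketch, not any particular library's API): the pretrained weight is frozen and a low-rank update \\(BA\\) is learned, with \\(B\\) zero-initialized so the adapter is a no-op at the start.

    import torch\nimport torch.nn as nn\n\nclass LoRALinear(nn.Module):\n    def __init__(self, linear, r=8, alpha=16):\n        super().__init__()\n        self.base = linear  # frozen pretrained layer\n        self.base.weight.requires_grad_(False)\n        self.A = nn.Parameter(torch.randn(r, linear.in_features) * 0.01)\n        self.B = nn.Parameter(torch.zeros(linear.out_features, r))  # zero-init: output unchanged at start\n        self.scale = alpha / r\n\n    def forward(self, x):\n        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)\n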

    ","tags":["efficient_dl","transformers"]},{"location":"000%20Zettelkasten/Masked%20Image%20Modelling/","title":"Masked Image Modelling","text":"

    It seems like MIM objectives are becoming a strong learning objective for vision foundation models. Right now it seems to be the closest answer to: Do Vision Foundation models exist?

    However, intuitively it seems like a somewhat weak signal, as it focuses on individual patches/pixels without much consideration of semantic information. This is echoed in Learning with Unmasked Tokens Drives Stronger Vision Learners:

    However, MIM strategies often encounter challenges, such as local dependency on attention to understand entire context of an image. For example, liu\u00a0et al.\u00a0[36]\u00a0revealed that MAE\u00a0[22], a state-of-the-art MIM method, exhibits shorter average attention distances. Furthermore, we observe that attention map patterns by MAE substantiate extremely local behavior (See Fig.\u00a01) indeed. In other words, the MAE-trained attention mechanism less integrates information across the entire image pixels and tends to focus on specific input regions. This is presumably attributed to MIM-pretraining, primarily dedicated to predicting low-level pixel details (e.g., color or texture) without a comprehensive understanding of less-regional information (e.g., the input structure or shape).

    Related papers: - Learning with Unmasked Tokens Drives Stronger Vision Learners - DINOv2 - Learning Robust Visual Features without Supervision - What Do Self-Supervised Vision Transformers Learn? \ud83d\udea8

    ","tags":["foundation_models","computer_vision"]},{"location":"000%20Zettelkasten/Maximal%20pruning%20and%20functional%20recovery/","title":"Maximal pruning and functional recovery","text":"

    Key Idea

    You can iteratively prune and finetune the network weights and still maintain performance up to some pruning ratio.

    Reference: - TinyML and Efficient Deep Learning Computing - Lecture 3 - Learning both Weights and Connections for Efficient Neural Networks

    ","tags":["dl_theory","efficient_dl"]},{"location":"000%20Zettelkasten/Mean%20Attention%20Distance/","title":"Mean Attention Distance","text":"

    Introduced in An image is worth 16x16 words - Transformers for image recognition at scale.

    From What Do Self-Supervised Vision Transformers Learn?

    \u201cAttention distance is defined as the average distance between the query tokens and key tokens considering their self-attention weights. Therefore, it conceptually corresponds to the size of the receptive fields in CNNs.\u201d (Park et al., 2023, p. 3)

    Key Observation

    Can be used to measure how much local versus global information a transformer is using. See What Do Self-Supervised Vision Transformers Learn?.
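
    A rough sketch of how it could be computed for one head over a square ViT patch grid (my own reading of the definition above, so treat details such as the distance metric as assumptions):

    import torch\n\ndef mean_attention_distance(attn, grid):\n    # attn: (tokens, tokens) attention weights of one head, tokens = grid * grid patches (no CLS token)\n    coords = torch.stack(torch.meshgrid(torch.arange(grid), torch.arange(grid), indexing='ij'), dim=-1)\n    coords = coords.reshape(-1, 2).float()  # (tokens, 2) patch-grid positions\n    dist = torch.cdist(coords, coords)  # pairwise distances between query and key patches\n    return (attn * dist).sum(dim=-1).mean()  # attention-weighted distance, averaged over queries\n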

    ","tags":["dl_theory","transformers"]},{"location":"000%20Zettelkasten/Multiple%20global%20minima/","title":"Multiple global minima","text":"

    We expect loss functions for deep networks to have a large family of equivalent global minima.

    • Fully connected networks: permutation of the hidden units
    • Convolutional networks: permuting the channels and convolution kernels appropriately.
    • ...

    The above modifications all produce the same output for every input. However, the global minimum only depends on the output at the training data points.

    In overparameterized networks, there will also be families of solutions that behave identically at the data points but differently between them. All of these are also global minima.

    References: - Understanding Deep Learning - Chapter 20 (20.3.1)

    ","tags":["optimizability","dl_theory"]},{"location":"000%20Zettelkasten/Neural%20Network%20Quantization/","title":"Neural Network Quantization","text":"

    Related: - HuggingFace Docs - A survey of quantization methods for efficient neural network inference - A recent (2024) work by Han et al: AWQ - Activation-aware Weight Quantization for LLM Compression and Acceleration

    ","tags":["quantization","efficient_dl"]},{"location":"000%20Zettelkasten/Non-translationally%20equivariant%20convolutions/","title":"Non translationally equivariant convolutions","text":"

    I'm not sure if this makes sense at all, just tracking paper ideas lmao

    See: - Input-dependent convolutions - How do vision transformers work?

    ","tags":["cnn","convolutions","equivariance","partial_equivariance"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/","title":"Positive Logic Programs","text":"","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/#positive-logic-programs","title":"Positive logic programs","text":"

    Two components: 1. Facts, e.g. a. 2. Rules, e.g. a :- b, c, d, which is the same as b \u2227 c \u2227 d \u2192 a

    This is a positive logic program:

    rainy(amsterdam).\nrainy(vienna).\nwet(X) :- rainy(X). # eq: \u2200x. (Rainy(x) \u2192 Wet(x))\n

    ","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/#database-semantics","title":"Database semantics","text":"

    Assumptions 1. Domain closure: the objects mentioned are the only objects. 2. Unique-names assumption: two distinct names can't refer to the same object. 3. Closed-world assumption: whatever we don't know to be true is false.

    What does the database semantics allow us to do?
    1. We can specify a relation by the set of inputs that are true
    2. We can specify objects simply by the terms that point to them
    3. We don't have to explicitly define what function symbols mean

    Thus, an interpretation is a set that defines which atoms are true. The remainder are false.

    ","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/#models","title":"Models","text":"What is a model?

    A model is an interpretation which makes all rules of a program true.

    However, we're not interested in all models; we want the highest expressivity at the lowest information, which is what motivates minimal models.

    What is the definition of a minimal model?

    A model is minimal if no strict subset of it is also a model.

    How do you construct a minimal model?

    Start with the facts and repeatedly add the head of any rule whose entire body is already in M, until nothing changes.

    M = set(facts)\nchanged = True\nwhile changed:  # iterate to a fixpoint\n    changed = False\n    for head, body in rules:\n        if head not in M and all(l in M for l in body):\n            M.add(head)  # add the head, not a body literal\n            changed = True\n
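
    For instance, for the rainy/wet program above (after grounding wet(X) :- rainy(X) over the two constants; the string encoding is my own convention):

    facts = ['rainy(amsterdam)', 'rainy(vienna)']\nrules = [('wet(amsterdam)', ['rainy(amsterdam)']), ('wet(vienna)', ['rainy(vienna)'])]\n# the loop above yields the minimal model:\n# {'rainy(amsterdam)', 'rainy(vienna)', 'wet(amsterdam)', 'wet(vienna)'}\n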

    What is the definition of a supported model?

    A model is supported if all its atoms are supported. An atom of a model is supported if it appears as the head of a rule whose body is true in the model.

    What properties does minimal models and supported models have for positive logic programs?

    For positive logic programs:

    • Minimal models are unique
    • A minimal model is also a supported model (but not necessarily vice versa)
    ","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Positive%20Logic%20Programs/#normal-logic-programs","title":"Normal logic programs","text":"

    Now we allow negation.

    a :- b_1, ..., b_n, not c_1, ..., not c_m.\n
    Do properties of minimal models for PL still hold for NL? Why?

    No, negation allows for non-uniqueness of minimal models.
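
    The standard two-rule counterexample:

    a :- not b.\nb :- not a.\n

    has two minimal models, {a} and {b}, so minimality no longer picks out a unique model.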

    ","tags":["knowledge_representation"]},{"location":"000%20Zettelkasten/Priors%20over%20Neural%20Network%20weights/","title":"Priors over Neural Network weights","text":"

    From Understanding Deep Learning - Chapter 10, 1d convolutions can be represented as weight matrices of an MLP with a specific prior where the diagonals share the same weight (d).
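
    A small numerical check of that claim (my own illustration): a 1D convolution with a length-3 kernel equals multiplication by a banded matrix whose diagonals all share the same kernel weights.

    import torch\nimport torch.nn.functional as F\n\nx = torch.randn(8)\nw = torch.randn(3)\nconv = F.conv1d(x.view(1, 1, -1), w.view(1, 1, -1), padding=1).flatten()\n\nW = torch.zeros(8, 8)\nfor i in range(8):  # each row is the kernel shifted by one position\n    for j, k in enumerate(range(i - 1, i + 2)):\n        if 0 <= k < 8:\n            W[i, k] = w[j]\nprint(torch.allclose(conv, W @ x, atol=1e-6))  # True\n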

    Rotationally equivariant convolutions can be implemented by isotropic filters (a prior on the conv2d weight):

    ","tags":["dl_theory","equivariance"]},{"location":"000%20Zettelkasten/PyTorch%20Functionalization/","title":"PyTorch Functionalization","text":"

    Given a program/function of PyTorch operators, functionalization will return a new function that: 1. Has the same semantics as the old function 2. Has no mutations in it

    Functionalization operates at the level of our ATen API.

    More info on PyTorch - Functionalization in PyTorch - Everything you need to know
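
    A tiny sketch with the user-facing torch.func.functionalize wrapper, which I believe exposes the same mechanism (treat the exact API details as assumptions):

    import torch\nfrom torch.func import functionalize\nfrom torch.fx.experimental.proxy_tensor import make_fx\n\ndef f(x):\n    y = x.clone()\n    y.add_(1)  # in-place mutation\n    return y\n\n# tracing the functionalized version yields a graph with out-of-place ops only\ngraph = make_fx(functionalize(f))(torch.randn(3))\nprint(graph.code)  # shows aten.add instead of aten.add_\n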

    ","tags":["pytorch","compilers"]},{"location":"000%20Zettelkasten/PyTorch%20Quantization%20for%20TensorRT/","title":"PyTorch Quantization for TensorRT","text":"

    There seem to be quite a few possible ways to do this: - PyTorch Eager Mode Quantization TensorRT Acceleration, which seems a bit cumbersome: 1. torchao quantization 2. ONNX conversion 3. graph surgery (changing some ops in the ONNX graph) 4. TensorRT conversion - Not sure if it works, but this would be ideal: 1. torch.export 2. torchao quantization 3. TensorRT conversion - Less ideal would be: 1. torchao quantization 2. torch.export 3. TensorRT conversion - I've already sort of tried this using the vgg ptq example from TensorRT, but torch.export complained that it couldn't translate the quantized operations

    ","tags":["quantization","efficient_dl"]},{"location":"000%20Zettelkasten/Representation%20%28Group%20Theory%29/","title":"Representation (Group Theory)","text":"

    Property required:

    \\[ p(g)p(h) = p(g \\cdot h) \\]

    A representation of a group action can be a linear operator like:

    \\[ p(\\theta) = \\begin{bmatrix} \\cos\\theta & -\\sin\\theta \\\\ \\sin\\theta & \\cos\\theta \\end{bmatrix} \\]","tags":["math","group_theory"]},{"location":"000%20Zettelkasten/Residual%20stream/","title":"Residual stream","text":"

    \"A transformer\u00a0starts with a token embedding, followed by a series of \u201cresidual blocks\u201d, and finally a token unembedding. Each residual block consists of an attention layer, followed by an MLP layer. Both the attention and MLP layers each \u201cread\u201d their input from the residual stream (by performing a linear projection), and then \u201cwrite\u201d their result to the residual stream by adding a linear projection back in.\u00a0Each attention layer consists of multiple heads, which operate in parallel.\" A Mathematical Framework for Transformer Circuits

    ","tags":["mechinterp","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/A%20Brief%20Review%20of%20Hypernetworks%20in%20Deep%20Learning/","title":"A Brief Review of Hypernetworks in Deep Learning","text":"Properties authors Vinod Kumar Chahuan, Jiandong Zhou, Ping Lu, Soheila Molaei, David A. Clifton year 2023 url https://arxiv.org/abs/2306.06955

    Abstract

    Hypernetworks, or hypernets in short, are neural networks that generate weights for another neural network, known as the target network. They have emerged as a powerful deep learning technique that allows for greater flexibility, adaptability, dynamism, faster training, information sharing, and model compression etc. Hypernets have shown promising results in a variety of deep learning problems, including continual learning, causal inference, transfer learning, weight pruning, uncertainty quantification, zero-shot learning, natural language processing, and reinforcement learning etc. Despite their success across different problem settings, currently, there is no review available to inform the researchers about the developments and to help in utilizing hypernets. To fill this gap, we review the progress in hypernets. We present an illustrative example to train deep neural networks using hypernets and propose categorizing hypernets based on five design criteria as inputs, outputs, variability of inputs and outputs, and architecture of hypernets. We also review applications of hypernets across different deep learning problem settings, followed by a discussion of general scenarios where hypernets can be effectively employed. Finally, we discuss the challenges and future directions that remain under-explored in the field of hypernets. We believe that hypernetworks have the potential to revolutionize the field of deep learning. They offer a new way to design and train neural networks, and they have the potential to improve the performance of deep learning models on a variety of tasks. Through this review, we aim to inspire further advancements in deep learning through hypernetworks.

    ","tags":["paper","hypernetworks"]},{"location":"100%20Reference%20notes/101%20Literature/A%20ConvNet%20for%20the%202020s/","title":"A ConvNet for the 2020s","text":"Properties authors Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie year 2022 url https://arxiv.org/abs/2201.03545

    Abstract

    The \"Roaring 20s\" of visual recognition began with the introduction of Vision Transformers (ViTs), which quickly superseded ConvNets as the state-of-the-art image classification model. A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks. However, the effectiveness of such hybrid approaches is still largely credited to the intrinsic superiority of Transformers, rather than the inherent inductive biases of convolutions. In this work, we reexamine the design spaces and test the limits of what a pure ConvNet can achieve. We gradually \"modernize\" a standard ResNet toward the design of a vision Transformer, and discover several key components that contribute to the performance difference along the way. The outcome of this exploration is a family of pure ConvNet models dubbed ConvNeXt. Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets.

    ","tags":["cnn","foundation_models","computer_vision","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/A%20ConvNet%20for%20the%202020s/#notes","title":"Notes","text":"

    Authors modernize ConvNets with SOTA architectural choices and training recipes to achieve SOTA ViT performance on dense prediction tasks (Object Detection, etc).

    Important limitation: scaling laws for ConvNeXt are not shown to be as good as those of ViTs, although the authors mention that they are promising:

    These findings are encouraging but not yet completely convincing \u2014 our exploration thus far has been limited to a small scale, but vision Transformers\u2019 scaling behavior is what truly distinguishes them.

    Table 1. Classification accuracy on ImageNet-1K. Similar to Transformers, ConvNeXt also shows promising scaling behavior with higher-capacity models and a larger (pre-training) dataset.

    • What are the follow ups for this paper regarding scaling laws of modern convnets when compared to vits?

    One of the main motivations of this paper is that ViTs were not very good at dense prediction tasks such as object detection:

    A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks.

    ","tags":["cnn","foundation_models","computer_vision","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/A%20Hierarchy%20of%20Graph%20Neural%20Networks%20Based%20on%20Learnable%20Local%20Features/","title":"A Hierarchy of Graph Neural Networks Based on Learnable Local Features","text":"Properties authors Michael Linghzhi Li, Meng Dong, Jiawei Zhou, Alexander M. Rush year 2019 url https://arxiv.org/abs/1911.05256

    Abstract

    Graph neural networks (GNNs) are a powerful tool to learn representations on graphs by iteratively aggregating features from node neighbourhoods. Many variant models have been proposed, but there is limited understanding on both how to compare different architectures and how to construct GNNs systematically. Here, we propose a hierarchy of GNNs based on their aggregation regions. We derive theoretical results about the discriminative power and feature representation capabilities of each class. Then, we show how this framework can be utilized to systematically construct arbitrarily powerful GNNs. As an example, we construct a simple architecture that exceeds the expressiveness of the Weisfeiler-Lehman graph isomorphism test. We empirically validate our theory on both synthetic and real-world benchmarks, and demonstrate our example's theoretical power translates to strong results on node classification, graph classification, and graph regression tasks.

    Interesting insight: - \u201cUsing this hierarchy, we can derive theoretical results which provide insight into GNNs. For example, we show that no matter how many layers are added, networks which only aggregate over immediate neighbors cannot learn the number of triangles in a node\u2019s neighbourhood\u201d (Li et al., 2019, p. 1)

    HOWEVER: - you can bypass this by encoding geometric information like position and orientation, see Fast, Expressive SE(n) Equivariant Networks through Weight-Sharing in Position-Orientation Space slides


    ","tags":["gcn","graphs","gnn","paper"]},{"location":"100%20Reference%20notes/101%20Literature/A%20Mathematical%20Framework%20for%20Transformer%20Circuits/","title":"A Mathematical Framework for Transformer Circuits","text":"Properties authors Nelson Elhage, Neel Nanda, Catherine Olsson, Tom Henighan, Nicholas Joseph, Ben Mann, Amanda Askell, Yuntao Bai, Anna Chen, Tom Conerly, Nova DasSarma, Dawn Drain, Deep Ganguli, Zac Hatfield-Dodds, Danny Hernandez, Andy Jones, Jackson Kernion, Liane Lovitt, Kamal Ndousse, Dario Amodei, Tom Brown, Jack Clark, Jared Kaplan, Sam McCandlish, Christopher Olah year 2021 url https://transformer-circuits.pub/2021/framework/index.html

    Abstract

    Transformer [1] language models are an emerging technology that is gaining increasingly broad real-world use, for example in systems like GPT-3 [2], LaMDA\u00a0[3], Codex\u00a0[4], Meena\u00a0[5], Gopher\u00a0[6], and similar models. \u00a0However, as these models scale, their open-endedness and high capacity creates an increasing scope for unexpected and sometimes harmful behaviors. \u00a0Even years after a large model is trained, both creators and users routinely discover model capabilities \u2013 including problematic behaviors \u2013 they were previously unaware of.

    One avenue for addressing these issues is\u00a0mechanistic interpretability, attempting to reverse engineer the detailed computations performed by transformers, similar to how a programmer might try to reverse engineer complicated binaries into human-readable source code. \u00a0If this were possible, it could potentially provide a more systematic approach to explaining current safety problems, identifying new ones, and perhaps even anticipating the safety problems of powerful future models that have not yet been built. \u00a0A previous project, the\u00a0Distill\u00a0Circuits\u00a0thread\u00a0[7], has attempted to reverse engineer vision models, but so far there hasn\u2019t been a comparable project for transformers or language models.

    In this paper, we attempt to take initial, very preliminary steps towards reverse-engineering transformers. \u00a0Given the incredible complexity and size of modern language models, we have found it most fruitful to start with the simplest possible models and work our way up from there. \u00a0Our aim is to discover simple algorithmic patterns, motifs, or frameworks that can subsequently be applied to larger and more complex models. \u00a0Specifically, in this paper we will study\u00a0transformers with two layers or less which have only attention blocks\u00a0\u2013 this is in contrast to a large, modern transformer like GPT-3, which has 96 layers and alternates attention blocks with MLP blocks.

    We find that by conceptualizing the operation of transformers in a new but mathematically equivalent way, we are able to make sense of these small models and gain significant understanding of how they operate internally. \u00a0Of particular note, we find that specific attention heads that we term \u201cinduction heads\u201d can explain in-context learning in these small models, and that these heads only develop in models with at least two attention layers. \u00a0We also go through some examples of these heads operating in action on specific data.

    We don\u2019t attempt to apply to our insights to larger models in this first paper, but in a\u00a0forthcoming paper, we will show that both our mathematical framework for understanding transformers, and the concept of induction heads, continues to be at least partially relevant for much larger and more realistic models \u2013 though we remain a very long way from being able to fully reverse engineer such models.

    ","tags":["paper","mechinterp","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/A%20general%20theory%20of%20correct%2C%20incorrect%2C%20and%20extrinsic%20equivariance/","title":"A general theory of correct, incorrect, and extrinsic equivariance","text":"Properties authors Dian Wang, Xupeng Zhu, Jung Yeon Park, Mingxi Jia, Guanang Su, Robert Platt, Robin Walters year 2024 url https://proceedings.neurips.cc/paper_files/paper/2023/hash/7dc7793c89b93887e126a86f22ef63c6-Abstract-Conference.html

    Abstract

    Although equivariant machine learning has proven effective at many tasks, success depends heavily on the assumption that the ground truth function is symmetric over the entire domain matching the symmetry in an equivariant neural network. A missing piece in the equivariant learning literature is the analysis of equivariant networks when symmetry exists only partially in the domain. In this work, we present a general theory for such a situation. We propose pointwise definitions of correct, incorrect, and extrinsic equivariance, which allow us to quantify continuously the degree of each type of equivariance a function displays. We then study the impact of various degrees of incorrect or extrinsic symmetry on model error. We prove error lower bounds for invariant or equivariant networks in classification or regression settings with partially incorrect symmetry. We also analyze the potentially harmful effects of extrinsic equivariance. Experiments validate these results in three different environments.

    ","tags":["equivariance","relaxed_equivariance","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/A%20survey%20of%20quantization%20methods%20for%20efficient%20neural%20network%20inference/","title":"A survey of quantization methods for efficient neural network inference","text":"Properties authors Amir Gholami, Sehoon Kim, Zhen Dong, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer year 2021 url https://arxiv.org/abs/2103.13630

    Abstract

    As soon as abstract mathematical computations were adapted to computation on digital computers, the problem of efficient representation, manipulation, and communication of the numerical values in those computations arose. Strongly related to the problem of numerical representation is the problem of quantization: in what manner should a set of continuous real-valued numbers be distributed over a fixed discrete set of numbers to minimize the number of bits required and also to maximize the accuracy of the attendant computations? This perennial problem of quantization is particularly relevant whenever memory and/or computational resources are severely restricted, and it has come to the forefront in recent years due to the remarkable performance of Neural Network models in computer vision, natural language processing, and related areas. Moving from floating-point representations to low-precision fixed integer values represented in four bits or less holds the potential to reduce the memory footprint and latency by a factor of 16x; and, in fact, reductions of 4x to 8x are often realized in practice in these applications. Thus, it is not surprising that quantization has emerged recently as an important and very active sub-area of research in the efficient implementation of computations associated with Neural Networks. In this article, we survey approaches to the problem of quantizing the numerical values in deep Neural Network computations, covering the advantages/disadvantages of current methods. With this survey and its organization, we hope to have presented a useful snapshot of the current research in quantization for Neural Networks and to have given an intelligent organization to ease the evaluation of future research in this area.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/AWQ%20-%20Activation-aware%20Weight%20Quantization%20for%20LLM%20Compression%20and%20Acceleration/","title":"AWQ Activation aware Weight Quantization for LLM Compression and Acceleration","text":"Properties authors Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Wei-Ming Chen, Wei-Chen Wang, Guangxuan Xiao, Xingyu Dang, Chuang Gan, Song Han year 2023 url https://arxiv.org/abs/2306.00978

    Abstract

    Large language models (LLMs) have fundamentally transformed the capabilities of numerous applications, from natural language processing to more intricate domain-specific tasks in robotics and autonomous driving. Moreover, the importance of on-device LLMs has grown significantly in the recent years. Running LLMs on edge devices not only promises reduced latency and improved user experience but also aligns with the increasing need for user privacy, as data processing can occur locally. However, the astronomical model sizes of modern LLMs and constraints of the edge devices, primarily in terms of memory size and bandwidth, pose significant deployment challenges. In this paper, we propose Activation-aware Weight Quantization (AWQ), a hardware-friendly approach for LLM low-bit weight-only quantization. Our method is based on the observation that weights are not equally important: protecting only 1% of salient weights can greatly reduce quantization error. We then propose to search for the optimal per-channel scaling that protects the salient weights by observing the activation, not weights. AWQ does not rely on any backpropagation or reconstruction, so it can well preserve LLMs' generalization ability on different domains and modalities, without overfitting to the calibration set. AWQ outperforms existing work on various language modeling and domain-specific benchmarks (coding and math). Thanks to better generalization, it achieves excellent quantization performance for instruction-tuned LMs and, for the first time, multi-modal LMs. Alongside AWQ, we implement TinyChat, an efficient and flexible inference framework tailored for on-device LLM/VLMs, offering more than 3x speedup over the Huggingface FP16 implementation on both desktop and mobile GPUs. It also democratizes the deployment of the 70B Llama-2 model on mobile GPUs.

    ","tags":["paper","efficient_dl","quantization"]},{"location":"100%20Reference%20notes/101%20Literature/Adapting%20Vision%20Foundation%20Models%20for%20Plant%20Phenotyping/","title":"Adapting Vision Foundation Models for Plant Phenotyping","text":"Properties authors Feng Chen, Mario Valerio Giuffrida, Sotirios A. Tsaftaris year 2023 url https://openaccess.thecvf.com/content/ICCV2023W/CVPPA/html/Chen_Adapting_Vision_Foundation_Models_for_Plant_Phenotyping_ICCVW_2023_paper.html

    Abstract

    Foundation models are large models pre-trained on tremendous amount of data. They can be typically adapted to diverse downstream tasks with minimal effort. However, as foundation models are usually pre-trained on images or texts sourced from the Internet, their performance in specialized domains, such as plant phenotyping, comes into question. In addition, fully fine-tuning foundation models is time-consuming and requires high computational power. This paper investigates the efficient adaptation of foundation models for plant phenotyping settings and tasks. We perform extensive experiments on fine-tuning three foundation models, MAE, DINO, and DINOv2 on three essential plant phenotyping tasks: leaf counting, instance segmentation, and disease classification. In particular, the pre-trained backbones are kept frozen, while two distinct fine-tuning methods are evaluated, namely adapter tuning (using LoRA) and decoder tuning. The experimental results show that a foundation model can be efficiently adapted to multiple plant phenotyping tasks, yielding similar performance as the state-of-the-art (SoTA) models specifically designed or trained for each task. Despite exhibiting great transferability over different tasks, the fine-tuned foundation models perform slightly worse than the SoTA task-specific models in some scenarios, which requires further investigation.

    ","tags":["paper","peft","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/Adapting%20Vision%20Foundation%20Models%20for%20Plant%20Phenotyping/#notes","title":"Notes","text":"

    Motivation / Problem

    Foundation models struggle with specialized data like (plant phenotyping, cancer predictions)

    Research question

    Which efficient fine-tuning technique is most promising for adapting foundation models (MAE, DINO, DINOv2) in specialized data?

    Methods

    Benchmarked fine-tuning methods include decoder fine-tuning (aka linear probing) and adapter tuning (linear probing + LoRa)

    Results

    1. LoRa consistently beats DT
    2. VFMs w/ LoRa are often competitive with fully-trained/fine-tuned SOTA
    3. It's not clear that one VFM beats another; each model (DINO, DINOv2, MAE) has metrics and tasks where it shines
    4. LoRa can help dampen issues of data scarcity, domain shifts and class imbalance
    ","tags":["paper","peft","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/An%20Image%20is%20Worth%20More%20Than%2016x16%20Patches%20-%20Exploring%20Transformers%20on%20Individual%20Pixels/","title":"An Image is Worth More Than 16x16 Patches Exploring Transformers on Individual Pixels","text":"Properties authors Duy-Kien Nguyen, Mahmoud Assran, Unnat Jain, Martin R. Oswald, Cees G. M. Snoek, Xinlei Chen year 2024 url https://arxiv.org/abs/2406.09415v1

    Abstract

    This work does not introduce a new method. Instead, we present an interesting finding that questions the necessity of the inductive bias -- locality in modern computer vision architectures. Concretely, we find that vanilla Transformers can operate by directly treating each individual pixel as a token and achieve highly performant results. This is substantially different from the popular design in Vision Transformer, which maintains the inductive bias from ConvNets towards local neighborhoods (e.g. by treating each 16x16 patch as a token). We mainly showcase the effectiveness of pixels-as-tokens across three well-studied tasks in computer vision: supervised learning for object classification, self-supervised learning via masked autoencoding, and image generation with diffusion models. Although directly operating on individual pixels is less computationally practical, we believe the community must be aware of this surprising piece of knowledge when devising the next generation of neural architectures for computer vision.

    Comments: - Seems to contradict How do vision transformers work? in its position that inductive biases do improve ViTs. - [ ] Might be useful to check this.

    ","tags":["paper","dl_theory","vit"]},{"location":"100%20Reference%20notes/101%20Literature/An%20Investigation%20into%20Neural%20Net%20Optimization%20via%20Hessian%20Eigenvalue%20Density/","title":"An Investigation into Neural Net Optimization via Hessian Eigenvalue Density","text":"Properties authors Behrooz Ghorbani, Shankar Krishnan, Ying Xiao

    Abstract

    To understand the dynamics of optimization in deep neural networks, we develop a tool to study the evolution of the entire Hessian spectrum throughout the optimization process. Using this, we study a number of hypotheses concerning smoothness, curvature, and sharpness in the deep learning literature. We then thoroughly analyze a crucial structural feature of the spectra: in nonbatch normalized networks, we observe the rapid appearance of large isolated eigenvalues in the spectrum, along with a surprising concentration of the gradient in the corresponding eigenspaces. In batch normalized networks, these two effects are almost absent. We characterize these effects, and explain how they affect optimization speed through both theory and experiments. As part of this work, we adapt advanced tools from numerical linear algebra that allow scalable and accurate estimation of the entire Hessian spectrum of ImageNet-scale neural networks; this technique may be of independent interest in other applications

    ","tags":["dl_theory","optimizability","optimization","paper"]},{"location":"100%20Reference%20notes/101%20Literature/An%20image%20is%20worth%2016x16%20words%20-%20Transformers%20for%20image%20recognition%20at%20scale/","title":"An image is worth 16x16 words Transformers for image recognition at scale","text":"Properties authors Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby url https://arxiv.org/abs/2010.11929 year 2020

    Abstract

    While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.

    ","tags":["vit","transformers","paper"]},{"location":"100%20Reference%20notes/101%20Literature/An%20image%20is%20worth%2016x16%20words%20-%20Transformers%20for%20image%20recognition%20at%20scale/#notes","title":"Notes","text":"","tags":["vit","transformers","paper"]},{"location":"100%20Reference%20notes/101%20Literature/An%20image%20is%20worth%2016x16%20words%20-%20Transformers%20for%20image%20recognition%20at%20scale/#regarding-inductive-biases","title":"Regarding inductive biases","text":"

    Inductive bias. We note that Vision Transformer has much less image-specific inductive bias than CNNs. In CNNs, locality, two-dimensional neighborhood structure, and translation equivariance are baked into each layer throughout the whole model. In ViT, only MLP layers are local and translationally equivariant, while the self-attention layers are global. The two-dimensional neighborhood structure is used very sparingly: in the beginning of the model by cutting the image into patches and at fine-tuning time for adjusting the position embeddings for images of different resolution (as described below). Other than that, the position embeddings at initialization time carry no information about the 2D positions of the patches and all spatial relations between the patches have to be learned from scratch.

    Interesting insight about Hybrid ViTs (40 conv layers + transformer blocks): - It is better on small data regimes but shows no improvement on large data regimes.

    ","tags":["vit","transformers","paper"]},{"location":"100%20Reference%20notes/101%20Literature/Apple%20Intelligence%20Foundation%20Language%20Models/","title":"Apple Intelligence Foundation Language Models","text":"Properties authors Tom Gunter, Zirui Wang, Chong Wang, Ruoming Pang, Andy Narayanan, Aonan Zhang, Bowen Zhang, Chen Chen, Chung-Cheng Chiu, David Qiu, Deepak Gopinath, Dian Ang Yap, Dong Yin, Feng Nan, Floris Weers, Guoli Yin, Haoshuo Huang, Jianyu Wang, Jiarui Lu, John Peebles, Ke Ye, Mark Lee, Nan Du, Qibin Chen, Quentin Keunebroek, Sam Wiseman, Syd Evans, Tao Lei, Vivek Rathod, Xiang Kong, Xianzhi Du, Yanghao Li, Yongqiang Wang, Yuan Gao, Zaid Ahmed, Zhaoyang Xu, Zhiyun Lu, Al Rashid, Albin Madappally Jose, Alec Doane, Alfredo Bencomo, Allison Vanderby, Andrew Hansen, Ankur Jain, Anupama Mann Anupama, Areeba Kamal, Bugu Wu, Carolina Brum, Charlie Maalouf, Chinguun Erdenebileg, Chris Dulhanty, Dominik Moritz, Doug Kang, Eduardo Jimenez, Evan Ladd, Fangping Shi, Felix Bai, Frank Chu, Fred Hohman, Hadas Kotek, Hannah Gillis Coleman, Jane Li, Jeffrey Bigham, Jeffery Cao, Jeff Lai, Jessica Cheung, Jiulong Shan, Joe Zhou, John Li, Jun Qin, Karanjeet Singh, Karla Vega, Kelvin Zou, Laura Heckman, Lauren Gardiner, Margit Bowler, Maria Cordell, Meng Cao, Nicole Hay, Nilesh Shahdadpuri, Otto Godwin, Pranay Dighe, Pushyami Rachapudi, Ramsey Tantawi, Roman Frigg, Sam Davarnia, Sanskruti Shah, Saptarshi Guha, Sasha Sirovica, Shen Ma, Shuang Ma, Simon Wang, Sulgi Kim, Suma Jayaram, Vaishaal Shankar, Varsha Paidi, Vivek Kumar, Xin Wang, Xin Zheng, Walker Cheng , Yael Shrager, Yang Ye, Yasu Tanaka, Yihao Guo, Yunsong Meng, Zhao Tang Luo, Zhi Ouyang, Alp Aygar, Alvin Wan, Andrew Walkingshaw, Andy Narayanan, Antonie Lin, Arsalan Farooq, Brent Ramerth, Colorado Reed, Chris Bartels, Chris Chaney, David Riazati, Eric Liang Yang, Erin Feldman, Gabriel Hochstrasser, Guillaume Seguin, Irina Belousova, Joris Pelemans, Karen Yang, Keivan Alizadeh Vahid, Liangliang Cao, Mahyar Najibi, Marco Zuliani, Max Horton, Minsik Cho, Nikhil Bhendawade, Patrick Dong, Piotr Maj, Pulkit Agrawal, Qi Shan, Qichen Fu, Regan Poston, Sam Xu, Shuangning Liu, Sushma Rao, Tashweena Heeramun, Thomas Merth, Uday Rayala, Victor Cui, Vivek Rangarajan Sridhar, Wencong Zhang, Wenqi Zhang, Wentao Wu, Xingyu Zhou, Xinwen Liu, Yang Zhao, Yin Xia, Zhile Ren, Zhongzheng Ren year 2024 url https://arxiv.org/abs/2407.21075

    Abstract

    We present foundation language models developed to power Apple Intelligence features, including a ~3 billion parameter model designed to run efficiently on devices and a large server-based language model designed for Private Cloud Compute. These models are designed to perform a wide range of tasks efficiently, accurately, and responsibly. This report describes the model architecture, the data used to train the model, the training process, how the models are optimized for inference, and the evaluation results. We highlight our focus on Responsible AI and how the principles are applied throughout the model development.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Apple%20Intelligence%20Foundation%20Language%20Models/#notes","title":"Notes","text":"

    \u201cA shared input/output embedding matrix [Press and Wolf, 2016] to reduce memory usage for parameters.\u201d (Gunter et al., 2024, p. 2)

    This reminds me of the Residual stream interpretation of transformers.

    \u201cThe model is compressed and quantized, on average under 4-bit-perweight, after the post-training stages (details of the quantization scheme will be discussed later). The quantized model often shows a moderate level of quality loss. Therefore, instead of directly passing the quantized model to application teams for feature development, we attach a set of parameter-efficient LoRa Adapters for quality recovery. We make sure that these LoRA adapters training recipes are consistent with pre-training and post-training processes. Then, products will fine-tune their own feature-specific LoRA adapters by initializing the adapter weights from the accuracy-recovery adapters, while keeping the quantized base model frozen.\u201d (Gunter et al., 2024, p. 16)

    So the recipe is: - Pre-training/post-training - Compression (?) and quantization (leads to accuracy loss) - LoRa fine-tuning to recover accuracy (call it the LoRa Recovery adapter; this naming is my assumption) - For a specific task, initialize the LoRa adapter from the LoRa Recovery adapter. Some details: - Rank-16 LoRa - Does each LoRa adapter also share the same precision as the underlying weight block/matrix? I suppose so

    \u201cSpecifically, our AFM-on-device model running on Apple Neural Engine (ANE) uses Bit Palettization: for projection weights, every 16 columns/rows share the same quantization constants (i.e., lookup tables) and are quantized using K-means with 16 unique values (4-bit).\u201d (Gunter et al., 2024, p. 17)

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Approximately%20equivariant%20networks%20for%20imperfectly%20symmetric%20dynamics/","title":"Approximately equivariant networks for imperfectly symmetric dynamics","text":"Properties authors Rui Wang, Robin Walters, Rose Yu year 2022 url https://proceedings.mlr.press/v162/wang22aa.html

    Abstract

    Incorporating symmetry as an inductive bias into neural network architecture has led to improvements in generalization, data efficiency, and physical consistency in dynamics modeling. Methods such as CNNs or equivariant neural networks use weight tying to enforce symmetries such as shift invariance or rotational equivariance. However, despite the fact that physical laws obey many symmetries, real-world dynamical data rarely conforms to strict mathematical symmetry either due to noisy or incomplete data or to symmetry breaking features in the underlying dynamical system. We explore approximately equivariant networks which are biased towards preserving symmetry but are not strictly constrained to do so. By relaxing equivariance constraints, we find that our models can outperform both baselines with no symmetry bias and baselines with overly strict symmetry in both simulated turbulence domains and real-world multi-stream jet flow.

    ","tags":["relaxed_equivariance","equivariance","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/Approximation-Generalization%20Trade-offs%20under%20%28Approximate%29%20Group%20Equivariance/","title":"Approximation Generalization Trade offs under (Approximate) Group Equivariance","text":"Properties authors Mircea Petrache, Shubhendu Trivedi","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Autoequivariant%20Network%20Search%20via%20Group%20Decomposition/","title":"Autoequivariant Network Search via Group Decomposition","text":"Properties authors Sourya Basu","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Battle%20of%20the%20Backbones%20-%20A%20Large-Scale%20Comparison%20of%20Pretrained%20Models%20across%20Computer%20Vision%20Tasks/","title":"Battle of the Backbones A Large Scale Comparison of Pretrained Models across Computer Vision Tasks","text":"Properties authors Micah Goldblum, Hossein Souri, Renkun Ni, Manli Shu, Viraj Prabhu, Gowthami Somepalli, Prithvijt Chattopadhyay, Mark Ibrahim, Adrien Bardes, Judy Hoffman, Rama Chellappa, Andrew Gordon Wilson, Tom Goldstein year 2023 url https://arxiv.org/abs/2310.19909

    Abstract

    Neural network based computer vision systems are typically built on a backbone, a pretrained or randomly initialized feature extractor. Several years ago, the default option was an ImageNet-trained convolutional neural network. However, the recent past has seen the emergence of countless backbones pretrained using various algorithms and datasets. While this abundance of choice has led to performance increases for a range of systems, it is difficult for practitioners to make informed decisions about which backbone to choose. Battle of the Backbones (BoB) makes this choice easier by benchmarking a diverse suite of pretrained models, including vision-language models, those trained via self-supervised learning, and the Stable Diffusion backbone, across a diverse set of computer vision tasks ranging from classification to object detection to OOD generalization and more. Furthermore, BoB sheds light on promising directions for the research community to advance computer vision by illuminating strengths and weakness of existing approaches through a comprehensive analysis conducted on more than 1500 training runs. While vision transformers (ViTs) and self-supervised learning (SSL) are increasingly popular, we find that convolutional neural networks pretrained in a supervised fashion on large training sets still perform best on most tasks among the models we consider. Moreover, in apples-to-apples comparisons on the same architectures and similarly sized pretraining datasets, we find that SSL backbones are highly competitive, indicating that future works should perform SSL pretraining with advanced architectures and larger pretraining datasets. We release the raw results of our experiments along with code that allows researchers to put their own backbones through the gauntlet here:\u00a0this https URL

    ","tags":["paper","foundation_models","computer_vision","vit","transformers","cnn"]},{"location":"100%20Reference%20notes/101%20Literature/Battle%20of%20the%20Backbones%20-%20A%20Large-Scale%20Comparison%20of%20Pretrained%20Models%20across%20Computer%20Vision%20Tasks/#notes","title":"Notes","text":"

    It would be nice to see an update with DINOv2 - Learning Robust Visual Features without Supervision and EVA-02 - A Visual Representation for Neon Genesis.

    A performance comparison of ViTs and CNNs. Modern architectures strongly outperform vanilla ViTs. We see in Table 2 that the best performing backbone (ConvNeXt-Base) is convolutional, with a hierarchical transformer (SwinV2-Base) being a close second. The latter transformer architecture incorporates a strong spatial inductive bias. These findings suggest that the community should move past vanilla ViTs which are still used frequently. As a caveat, we do not evaluate very large models, and it is possible that ViTs might outperform their more advanced variants or convolutional networks at larger scales.

    Battle of the \u201csmall\u201d backbones. Keeping limited resources in mind, we also compare the \u201csmall\u201d subset of backbones in BoB (< 30M parameters) \u2013 with ViT-Small, ConvNeXt-Tiny, Swin-Tiny and ResNet-50 architectures. Overall, we find Supervised ConvNeXt-T trained on IN-1k to be the best, followed by Supervised SwinV2-T trained on IN-1k and DINO ViT-S trained on IN-1k. Interestingly, supervised learning again dominates, and backbones pretrained on just IN-1k outperform ones trained on a considerably more diverse and larger dataset (MiDaS).

    Object Detection & Segmentation. For object detection and instance segmentation, we find \u201cSupervised ConvNeXt-Base trained on IN-21K\u201d > \u201cSupervised SwinV2-Base trained on IN-21k (finetuned on IN-1k)\u201d > \u201cSupervised ConvNeXt-Base trained on IN-1k\u201d.

    These results are probably outdated since many foundation models already beat Swinv2 - SimPLR - A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation - Exploring Plain Vision Transformer Backbones for Object Detection

    ","tags":["paper","foundation_models","computer_vision","vit","transformers","cnn"]},{"location":"100%20Reference%20notes/101%20Literature/Block%20Transformer%20-%20Global-to-Local%20Language%20Modeling%20for%20Fast%20Inference/","title":"Block Transformer Global to Local Language Modeling for Fast Inference","text":"Properties authors Namgyu Ho, Sangmin Bae, Taehyeon Kim, Hyunjik Jo, Yireun Kim, Tal Schuster, Adam Fisch, James Thorne, Se-Young Yun year 2024 url https://arxiv.org/abs/2406.02657

    Abstract

    This paper presents the Block Transformer architecture which adopts hierarchical global-to-local modeling to autoregressive transformers to mitigate the inference bottlenecks of self-attention. To apply self-attention, the key-value (KV) cache of all previous sequences must be retrieved from memory at every decoding step. Thereby, this KV cache IO becomes a significant bottleneck in batch inference. We notice that these costs stem from applying self-attention on the global context, therefore we isolate the expensive bottlenecks of global modeling to lower layers and apply fast local modeling in upper layers. To mitigate the remaining costs in the lower layers, we aggregate input tokens into fixed size blocks and then apply self-attention at this coarse level. Context information is aggregated into a single embedding to enable upper layers to decode the next block of tokens, without global attention. Free of global attention bottlenecks, the upper layers can fully utilize the compute hardware to maximize inference throughput. By leveraging global and local modules, the Block Transformer architecture demonstrates 10-20x gains in inference throughput compared to vanilla transformers with equivalent perplexity. Our work introduces a new approach to optimize language model inference through novel application of global-to-local modeling. Code is available at https://github.com/itsnamgyu/block-transformer.

    ","tags":["efficient_dl","transformers","paper"]},{"location":"100%20Reference%20notes/101%20Literature/BoxeR%20-%20Box-Attention%20for%202D%20and%203D%20Transformers/","title":"BoxeR Box Attention for 2D and 3D Transformers","text":"Properties authors Duy-Kien Nguyen, Jihong Ju, Olaf Booij, Martin R. Oswald, Cees G. M. Snoek year 2021 url https://arxiv.org/abs/2111.13087

    Abstract

    In this paper, we propose a simple attention mechanism, we call box-attention. It enables spatial interaction between grid features, as sampled from boxes of interest, and improves the learning capability of transformers for several vision tasks. Specifically, we present BoxeR, short for Box Transformer, which attends to a set of boxes by predicting their transformation from a reference window on an input feature map. The BoxeR computes attention weights on these boxes by considering its grid structure. Notably, BoxeR-2D naturally reasons about box information within its attention module, making it suitable for end-to-end instance detection and segmentation tasks. By learning invariance to rotation in the box-attention module, BoxeR-3D is capable of generating discriminative information from a bird's-eye view plane for 3D end-to-end object detection. Our experiments demonstrate that the proposed BoxeR-2D achieves state-of-the-art results on COCO detection and instance segmentation. Besides, BoxeR-3D improves over the end-to-end 3D object detection baseline and already obtains a compelling performance for the vehicle category of Waymo Open, without any class-specific optimization. Code is available at\u00a0this https URL.

    ","tags":["paper","transformers","object_detection"]},{"location":"100%20Reference%20notes/101%20Literature/Building%20on%20Efficient%20Foundations%20-%20Effectively%20Training%20LLMs%20with%20Structured%20Feedforward%20Layers/","title":"Building on Efficient Foundations Effectively Training LLMs with Structured Feedforward Layers","text":"Properties authors Xiuying Wei, Skander Moalla, Razvan Pascanu, Caglar Gulcehre year 2024 url https://arxiv.org/abs/2406.16450v1

    Abstract

    State-of-the-art results in large language models (LLMs) often rely on scale, which becomes computationally expensive. This has sparked a research agenda to reduce these models' parameter count and computational costs without significantly impacting their performance. Our study focuses on transformer-based LLMs, specifically targeting the computationally intensive feedforward networks (FFN), which are less studied than attention blocks. We consider three candidate linear layer approximations in the FFN by combining efficient low-rank and block-diagonal matrices. In contrast to many previous works that examined these approximations, our study i) explores these structures from the training-from-scratch perspective, ii) scales up to 1.3B parameters, and iii) is conducted within recent Transformer-based LLMs rather than convolutional architectures. We first demonstrate they can lead to actual computational gains in various scenarios, including online decoding when using a pre-merge technique. Additionally, we propose a novel training regime, called self-guided training, aimed at improving the poor training dynamics that these approximations exhibit when used from initialization. Experiments on the large RefinedWeb dataset show that our methods are both efficient and effective for training and inference. Interestingly, these structured FFNs exhibit steeper scaling curves than the original models. Further applying self-guided training to the structured matrices with 32% FFN parameters and 2.5× speed-up enables only a 0.4 perplexity increase under the same training FLOPs. Finally, we develop the wide and structured networks surpassing the current medium-sized and large-sized Transformer in perplexity and throughput performance. Our code is available at this https URL.

    ","tags":["paper","efficient_dl","llm","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Building%20on%20Efficient%20Foundations%20-%20Effectively%20Training%20LLMs%20with%20Structured%20Feedforward%20Layers/#notes","title":"Notes","text":"
    • Note to self: Read this in depth \u23eb #personal
    ","tags":["paper","efficient_dl","llm","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/CKConv%20-%20Continuous%20Kernel%20Convolution%20For%20Sequential%20Data/","title":"CKConv Continuous Kernel Convolution For Sequential Data","text":"Properties authors David W. Romero, Anna Kuzina, Erik J. Bekkers, Jakub M. Tomczak, Mark Hoogendoorn year 2021 url https://arxiv.org/abs/2102.02611

    Abstract

    Conventional neural architectures for sequential data present important limitations. Recurrent networks suffer from exploding and vanishing gradients, small effective memory horizons, and must be trained sequentially. Convolutional networks are unable to handle sequences of unknown size and their memory horizon must be defined a priori. In this work, we show that all these problems can be solved by formulating convolutional kernels in CNNs as continuous functions. The resulting Continuous Kernel Convolution (CKConv) allows us to model arbitrarily long sequences in a parallel manner, within a single operation, and without relying on any form of recurrence. We show that Continuous Kernel Convolutional Networks (CKCNNs) obtain state-of-the-art results in multiple datasets, e.g., permuted MNIST, and, thanks to their continuous nature, are able to handle non-uniformly sampled datasets and irregularly-sampled data natively. CKCNNs match or perform better than neural ODEs designed for these purposes in a faster and simpler manner.

    ","tags":["paper","convolutions","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Color%20Equivariant%20Convolutional%20Networks/","title":"Color Equivariant Convolutional Networks","text":"Properties authors Attila Lengyel, Ombretta Strafforello, Robert-Jan Bruintjes, Alexander Gielisse, Jan van Gemert

    References: - Learning Partial Equivariances from Data

    ","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Color%20Space%20Transformation%20Network/","title":"Color Space Transformation Network","text":"Properties authors Alexandros Karargyris year 2015 url https://arxiv.org/abs/1511.01064

    Abstract

    Deep networks have become very popular over the past few years. The main reason for this widespread use is their excellent ability to learn and predict knowledge in a very easy and efficient way. Convolutional neural networks and auto-encoders have become the normal in the area of imaging and computer vision achieving unprecedented accuracy levels in many applications. The most common strategy is to build and train networks with many layers by tuning their hyper-parameters. While this approach has proven to be a successful way to build robust deep learning schemes it suffers from high complexity. In this paper we introduce a module that learns color space transformations within a network. Given a large dataset of colored images the color space transformation module tries to learn color space transformations that increase overall classification accuracy. This module has shown to increase overall accuracy for the same network design and to achieve faster convergence. It is part of a broader family of image transformations (e.g. spatial transformer network).

    ","tags":["cnn","paper"]},{"location":"100%20Reference%20notes/101%20Literature/ConViT%20-%20Improving%20Vision%20Transformers%20with%20Soft%20Convolutional%20Inductive%20Biases/","title":"ConViT Improving Vision Transformers with Soft Convolutional Inductive Biases","text":"Properties authors St\u00e9phane d'Ascoli, Hugo Touvron, Matthew L. Leavitt, Ari S. Morcos, Giulio Biroli, Levent Sagun

    Abstract

    TODO: - [ ] Read paper - [ ] Add main text summary

    From Early Convolutions Help Transformers See Better, where [9] is this paper:

    We did not observe evidence that the hard locality constraint in early layers hampers the representational capacity of the network, as might be feared [9]. [...] This perspective resonates with the findings of [9], who observe that early transformer blocks prefer to learn more local attention patterns than later blocks.

    This is contrary to How do vision transformers work?, which claims that the locality constraint is beneficial to ViTs.

    Haven't fully read this paper, so the above contradiction might be incorrect.

    ","tags":["vit","computer_vision","cnn","transformers","inductive_bias","paper"]},{"location":"100%20Reference%20notes/101%20Literature/DETRs%20Beat%20YOLOs%20on%20Real-time%20Object%20Detection/","title":"DETRs Beat YOLOs on Real time Object Detection","text":"Properties authors Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen year 2023 url https://arxiv.org/abs/2304.08069v3

    Abstract

    The YOLO series has become the most popular framework for real-time object detection due to its reasonable trade-off between speed and accuracy. However, we observe that the speed and accuracy of YOLOs are negatively affected by the NMS. Recently, end-to-end Transformer-based detectors (DETRs) have provided an alternative to eliminating NMS. Nevertheless, the high computational cost limits their practicality and hinders them from fully exploiting the advantage of excluding NMS. In this paper, we propose the Real-Time DEtection TRansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge that addresses the above dilemma. We build RT-DETR in two steps, drawing on the advanced DETR: first we focus on maintaining accuracy while improving speed, followed by maintaining speed while improving accuracy. Specifically, we design an efficient hybrid encoder to expeditiously process multi-scale features by decoupling intra-scale interaction and cross-scale fusion to improve speed. Then, we propose the uncertainty-minimal query selection to provide high-quality initial queries to the decoder, thereby improving accuracy. In addition, RT-DETR supports flexible speed tuning by adjusting the number of decoder layers to adapt to various scenarios without retraining. Our RT-DETR-R50 / R101 achieves 53.1% / 54.3% AP on COCO and 108 / 74 FPS on T4 GPU, outperforming previously advanced YOLOs in both speed and accuracy. We also develop scaled RT-DETRs that outperform the lighter YOLO detectors (S and M models). Furthermore, RT-DETR-R50 outperforms DINO-R50 by 2.2% AP in accuracy and about 21 times in FPS. After pre-training with Objects365, RT-DETR-R50 / R101 achieves 55.3% / 56.2% AP. The project page:\u00a0this https URL.

    ","tags":["paper","computer_vision","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/DETRs%20with%20Collaborative%20Hybrid%20Assignments%20Training/","title":"DETRs with Collaborative Hybrid Assignments Training","text":"Properties authors Zhuofan Zong, Guanglu Song, Yu Liu year 2023 url https://arxiv.org/abs/2211.12860v5

    Abstract

    In this paper, we provide the observation that too few queries assigned as positive samples in DETR with one-to-one set matching leads to sparse supervision on the encoder's output, which considerably hurts the discriminative feature learning of the encoder and vice versa for attention learning in the decoder. To alleviate this, we present a novel collaborative hybrid assignments training scheme, namely Co-DETR, to learn more efficient and effective DETR-based detectors from versatile label assignment manners. This new training scheme can easily enhance the encoder's learning ability in end-to-end detectors by training the multiple parallel auxiliary heads supervised by one-to-many label assignments such as ATSS and Faster RCNN. In addition, we conduct extra customized positive queries by extracting the positive coordinates from these auxiliary heads to improve the training efficiency of positive samples in the decoder. In inference, these auxiliary heads are discarded and thus our method introduces no additional parameters and computational cost to the original detector while requiring no hand-crafted non-maximum suppression (NMS). We conduct extensive experiments to evaluate the effectiveness of the proposed approach on DETR variants, including DAB-DETR, Deformable-DETR, and DINO-Deformable-DETR. The state-of-the-art DINO-Deformable-DETR with Swin-L can be improved from 58.5% to 59.5% AP on COCO val. Surprisingly, incorporated with ViT-L backbone, we achieve 66.0% AP on COCO test-dev and 67.9% AP on LVIS val, outperforming previous methods by clear margins with much fewer model sizes. Codes are available at this https URL.

    ","tags":["paper","object_detection","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/DETRs%20with%20Collaborative%20Hybrid%20Assignments%20Training/#notes","title":"Notes","text":"

    Beats EVA-02 - A Visual Representation for Neon Genesis on object detection.

    Weights for CO-DINO Swin-L (64.1 box AP on COCO val): https://github.com/Sense-X/Co-DETR?tab=readme-ov-file

    ","tags":["paper","object_detection","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/DINOv2%20-%20Learning%20Robust%20Visual%20Features%20without%20Supervision/","title":"DINOv2 Learning Robust Visual Features without Supervision","text":"Properties authors Maxime Oquab, Timoth\u00e9e Darcet, Th\u00e9o Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Rusell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Herv\u00e9 Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski year 2023 url https://arxiv.org/abs/2304.07193

    Abstract

    The recent breakthroughs in natural language processing for model pretraining on large quantities of data have opened the way for similar foundation models in computer vision. These models could greatly simplify the use of images in any system by producing all-purpose visual features, i.e., features that work across image distributions and tasks without finetuning. This work shows that existing pretraining methods, especially self-supervised methods, can produce such features if trained on enough curated data from diverse sources. We revisit existing approaches and combine different techniques to scale our pretraining in terms of data and model size. Most of the technical contributions aim at accelerating and stabilizing the training at scale. In terms of data, we propose an automatic pipeline to build a dedicated, diverse, and curated image dataset instead of uncurated data, as typically done in the self-supervised literature. In terms of models, we train a ViT model (Dosovitskiy et al., 2020) with 1B parameters and distill it into a series of smaller models that surpass the best available all-purpose features, OpenCLIP (Ilharco et al., 2021) on most of the benchmarks at image and pixel levels.

    ","tags":["paper","foundation_models","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Deep%20Learning%20Book/","title":"Deep Learning Book","text":"Properties authors Ian Goodfellow, Yoshua Bengio, Aaron Courville year 2016 url https://www.deeplearningbook.org/","tags":["dl_theory","textbook"]},{"location":"100%20Reference%20notes/101%20Literature/DenseNets%20Reloaded%20-%20Paradigm%20Shift%20Beyond%20ResNets%20and%20ViTs/","title":"DenseNets Reloaded Paradigm Shift Beyond ResNets and ViTs","text":"Properties authors Donghyun Kim, Byeongho Heo, Dongyoon Han year 2024 url https://arxiv.org/abs/2403.19588

    Abstract

    This paper revives Densely Connected Convolutional Networks (DenseNets) and reveals the underrated effectiveness over predominant ResNet-style architectures. We believe DenseNets' potential was overlooked due to untouched training methods and traditional design elements not fully revealing their capabilities. Our pilot study shows dense connections through concatenation are strong, demonstrating that DenseNets can be revitalized to compete with modern architectures. We methodically refine suboptimal components - architectural adjustments, block redesign, and improved training recipes towards widening DenseNets and boosting memory efficiency while keeping concatenation shortcuts. Our models, employing simple architectural elements, ultimately surpass Swin Transformer, ConvNeXt, and DeiT-III - key architectures in the residual learning lineage. Furthermore, our models exhibit near state-of-the-art performance on ImageNet-1K, competing with the very recent models and downstream tasks, ADE20k semantic segmentation, and COCO object detection/instance segmentation. Finally, we provide empirical analyses that uncover the merits of the concatenation over additive shortcuts, steering a renewed preference towards DenseNet-style designs. Our code is available at\u00a0this https URL.

    ","tags":["cnn","dl_theory","optimizability","paper"]},{"location":"100%20Reference%20notes/101%20Literature/Discovering%20Symmetry%20Breaking%20in%20Physical%20Systems%20with%20Relaxed%20Group%20Convolution/","title":"Discovering Symmetry Breaking in Physical Systems with Relaxed Group Convolution","text":"Properties authors Rui Wang, Elyssa Hofgard, Han Gao, Robin Walters, Tess E Smidt year 2024 url https://arxiv.org/abs/2310.02299

    Abstract

    Modeling symmetry breaking is essential for understanding the fundamental changes in the behaviors and properties of physical systems, from microscopic particle interactions to macroscopic phenomena like fluid dynamics and cosmic structures. Thus, identifying sources of asymmetry is an important tool for understanding physical systems. In this paper, we focus on learning asymmetries of data using relaxed group convolutions. We provide both theoretical and empirical evidence that this flexible convolution technique allows the model to maintain the highest level of equivariance that is consistent with data and discover the subtle symmetry-breaking factors in various physical systems. We employ various relaxed group convolution architectures to uncover various symmetry-breaking factors that are interpretable and physically meaningful in different physical systems, including the phase transition of crystal structure, the isotropy and homogeneity breaking in turbulent flow, and the time-reversal symmetry breaking in pendulum systems.

    Observations: - \"In the relaxed group convolution, the initial relaxed (equivariant) weights\u00a0{\ud835\udc64\ud835\udc59\u2062(\u210e)}\u00a0in each layer are set to be the same for all\u00a0\u210e, ensuring that the model exhibits equivariance prior to being trained. [...] we prove that these relaxed weights only deviate from being equal when the symmetries of the input and the output are lower than that of the model.\" (Related to Equivariance Initialization)

    ","tags":["equivariance","relaxed_equivariance","dl_theory","paper"]},{"location":"100%20Reference%20notes/101%20Literature/EVA-02%20-%20A%20Visual%20Representation%20for%20Neon%20Genesis/","title":"EVA 02 A Visual Representation for Neon Genesis","text":"Properties authors Yuxin Fang, Quan Sun, Xinggang Wang, Tiejun Huang, Xinlong Wang, Yue Cao year 2023 url https://arxiv.org/abs/2303.11331

    Abstract

    We launch EVA-02, a next-generation Transformer-based visual representation pre-trained to reconstruct strong and robust language-aligned vision features via masked image modeling. With an updated plain Transformer architecture as well as extensive pre-training from an open & accessible giant CLIP vision encoder, EVA-02 demonstrates superior performance compared to prior state-of-the-art approaches across various representative vision tasks, while utilizing significantly fewer parameters and compute budgets. Notably, using exclusively publicly accessible training data, EVA-02 with only 304M parameters achieves a phenomenal 90.0 fine-tuning top-1 accuracy on ImageNet-1K val set. Additionally, our EVA-02-CLIP can reach up to 80.4 zero-shot top-1 on ImageNet-1K, outperforming the previous largest & best open-sourced CLIP with only ~1/6 parameters and ~1/6 image-text training data. We offer four EVA-02 variants in various model sizes, ranging from 6M to 304M parameters, all with impressive performance. To facilitate open access and open research, we release the complete suite of EVA-02 to the community at\u00a0this https URL.

    ","tags":["paper","foundation_models","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Early%20Convolutions%20Help%20Transformers%20See%20Better/","title":"Early Convolutions Help Transformers See Better","text":"Properties authors Tete Xiao, Mannat Singh, Eric Mintun, Trevor Darrell, Piotr Doll\u00e1r, Ross Girshick

    Hypothesis

    ViT's patchify convolution is contrary to the standard early layers of CNNs. Maybe that is the cause of ViTs' optimization difficulties?

    Main idea

    Replace patchify convolution with a small number of convolutional layers and drop one transformer block to make comparison fair.
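    A minimal sketch of the two stems being compared, assuming a 224×224 input and a ViT-Small-like embedding dimension of 384; the channel schedule below is illustrative rather than the paper's exact configuration:

    ```python
    import torch.nn as nn

    embed_dim = 384

    # Standard ViT patchify stem: a single 16x16 convolution with stride 16.
    patchify_stem = nn.Conv2d(3, embed_dim, kernel_size=16, stride=16)

    # Convolutional stem: a few 3x3 stride-2 convolutions reaching the same 16x
    # downsampling, followed by a 1x1 projection to the embedding dimension.
    conv_stem = nn.Sequential(
        nn.Conv2d(3, 48, 3, stride=2, padding=1), nn.BatchNorm2d(48), nn.ReLU(),
        nn.Conv2d(48, 96, 3, stride=2, padding=1), nn.BatchNorm2d(96), nn.ReLU(),
        nn.Conv2d(96, 192, 3, stride=2, padding=1), nn.BatchNorm2d(192), nn.ReLU(),
        nn.Conv2d(192, 384, 3, stride=2, padding=1), nn.BatchNorm2d(384), nn.ReLU(),
        nn.Conv2d(384, embed_dim, kernel_size=1),
    )

    # Both produce a (B, embed_dim, 14, 14) feature map for a (B, 3, 224, 224) input;
    # in the paper one transformer block is dropped to keep the comparison fair.
    ```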

    Notes for myself: - Interesting experimentation regarding #optimizability , maybe take into account into hessian analysis

    ","tags":["cnn","transformers","vit","optimizability","paper"]},{"location":"100%20Reference%20notes/101%20Literature/Efficient%20Equivariant%20Transfer%20Learning%20from%20Pretrained%20Models/","title":"Efficient Equivariant Transfer Learning from Pretrained Models","text":"Properties authors Sourya Basu

    Builds on top of Equi-Tuning - Group Equivariant Fine-Tuning of Pretrained Models and Equivariance with Learned Canonicalization Functions

    Hypothesis

    Pretrained models provide better-quality features for some group transformations of the input than for others, so simply averaging the transformed predictions (as in equi-tuning) is suboptimal.

    Main idea

    Lambda-Equitune: Weighted average with learned weights, \\(\\lambda\\).
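    A minimal sketch of this \\(\\lambda\\)-weighted group averaging (the exact expression is the formula at the end of this note), assuming a finite group given as paired input actions g and output actions g^{-1}, and a small network lam that assigns one positive scalar weight to each transformed input; all names are illustrative:

    ```python
    import torch

    def lambda_equitune(model, lam, group_actions, inverse_actions, x):
        # group_actions[i] applies g_i to the input, inverse_actions[i] applies g_i^{-1}
        # to the model output; lam maps a transformed input to one positive scalar weight.
        outs, ws = [], []
        for g, g_inv in zip(group_actions, inverse_actions):
            gx = g(x)
            outs.append(g_inv(model(gx)))
            ws.append(lam(gx))
        outs = torch.stack(outs)                                  # (|G|, *output_shape)
        ws = torch.stack(ws).view(-1, *([1] * (outs.dim() - 1)))  # broadcast over outputs
        return (ws * outs).sum(dim=0) / ws.sum(dim=0)             # lambda-weighted average
    ```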

    \\[ M_G^\\lambda(x) = \\frac{1}{\\sum_{g \\in G} \\lambda(gx)} \\sum_{g \\in G} \\lambda(gx) g^{-1} M(gx) \\]","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Efficient%20Modulation%20for%20Vision%20Networks/","title":"Efficient Modulation for Vision Networks","text":"Properties authors Xu Ma, Xiyang Dai, Jianwei Yang, Bin Xiao, Yinpeng Chen, Yun Fu, Lu Yuan year 2024 url https://arxiv.org/abs/2403.19963

    Abstract

    In this work, we present efficient modulation, a novel design for efficient vision networks. We revisit the modulation mechanism, which operates input through convolutional context modeling and feature projection layers, and fuses features via element-wise multiplication and an MLP block. We demonstrate that the modulation mechanism is particularly well suited for efficient networks and further tailor the modulation design by proposing the efficient modulation (EfficientMod) block, which is considered the essential building block for our networks. Benefiting from the prominent representational ability of modulation mechanism and the proposed efficient design, our network can accomplish better trade-offs between accuracy and efficiency and set new state-of-the-art performance in the zoo of efficient networks. When integrating EfficientMod with the vanilla self-attention block, we obtain the hybrid architecture which further improves the performance without loss of efficiency. We carry out comprehensive experiments to verify EfficientMod's performance. With fewer parameters, our EfficientMod-s performs 0.6 top-1 accuracy better than EfficientFormerV2-s2 and is 25% faster on GPU, and 2.9 better than MobileViTv2-1.0 at the same GPU latency. Additionally, our method presents a notable improvement in downstream tasks, outperforming EfficientFormerV2-s by 3.6 mIoU on the ADE20K benchmark. Code and checkpoints are available at\u00a0this https URL.

    ","tags":["efficient_dl","computer_vision"]},{"location":"100%20Reference%20notes/101%20Literature/EfficientViT-SAM%20-%20Accelerated%20Segment%20Anything%20Model%20Without%20Accuracy%20Loss/","title":"EfficientViT SAM Accelerated Segment Anything Model Without Accuracy Loss","text":"Properties authors Zhuoyang Zhang, Han Cai, Song Han year 2024 url https://arxiv.org/abs/2402.05008

    Abstract

    We present EfficientViT-SAM, a new family of accelerated segment anything models. We retain SAM's lightweight prompt encoder and mask decoder while replacing the heavy image encoder with EfficientViT. For the training, we begin with the knowledge distillation from the SAM-ViT-H image encoder to EfficientViT. Subsequently, we conduct end-to-end training on the SA-1B dataset. Benefiting from EfficientViT's efficiency and capacity, EfficientViT-SAM delivers 48.9x measured TensorRT speedup on A100 GPU over SAM-ViT-H without sacrificing performance. Our code and pre-trained models are released at\u00a0this https URL.

    ","tags":["paper","efficient_dl","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Emergent%20Equivariance%20in%20Deep%20Ensembles/","title":"Emergent Equivariance in Deep Ensembles","text":"Properties authors Jan E. Gerken, Pan Kessel year 2024 url https://arxiv.org/abs/2403.03103

    Abstract

    We demonstrate that deep ensembles are secretly equivariant models. More precisely, we show that deep ensembles become equivariant for all inputs and at all training times by simply using data augmentation. Crucially, equivariance holds off-manifold and for any architecture in the infinite width limit. The equivariance is emergent in the sense that predictions of individual ensemble members are not equivariant but their collective prediction is. Neural tangent kernel theory is used to derive this result and we verify our theoretical insights using detailed numerical experiments.

    ","tags":["equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Emerging%20Properties%20in%20Self-Supervised%20Vision%20Transformers/","title":"Emerging Properties in Self Supervised Vision Transformers","text":"Properties authors Mathilde Caron, Hugo Touvron, Ishan Misra, Herv\u00e9 Jegou, Julien Mairal, Piotr Bojanowski, Armand Joulin year 2021 url https://arxiv.org/abs/2104.14294

    Abstract

    In this paper, we question if self-supervised learning provides new properties to Vision Transformer (ViT) that stand out compared to convolutional networks (convnets). Beyond the fact that adapting self-supervised methods to this architecture works particularly well, we make the following observations: first, self-supervised ViT features contain explicit information about the semantic segmentation of an image, which does not emerge as clearly with supervised ViTs, nor with convnets. Second, these features are also excellent k-NN classifiers, reaching 78.3% top-1 on ImageNet with a small ViT. Our study also underlines the importance of momentum encoder, multi-crop training, and the use of small patches with ViTs. We implement our findings into a simple self-supervised method, called DINO, which we interpret as a form of self-distillation with no labels. We show the synergy between DINO and ViTs by achieving 80.1% top-1 on ImageNet in linear evaluation with ViT-Base.

    ","tags":["paper","foundation_models","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/End-to-End%20Object%20Detection%20with%20Transformers/","title":"End to End Object Detection with Transformers","text":"Properties authors Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko year 2020 url https://arxiv.org/abs/2005.12872

    Abstract

    We present a new method that views object detection as a direct set prediction problem. Our approach streamlines the detection pipeline, effectively removing the need for many hand-designed components like a non-maximum suppression procedure or anchor generation that explicitly encode our prior knowledge about the task. The main ingredients of the new framework, called DEtection TRansformer or DETR, are a set-based global loss that forces unique predictions via bipartite matching, and a transformer encoder-decoder architecture. Given a fixed small set of learned object queries, DETR reasons about the relations of the objects and the global image context to directly output the final set of predictions in parallel. The new model is conceptually simple and does not require a specialized library, unlike many other modern detectors. DETR demonstrates accuracy and run-time performance on par with the well-established and highly-optimized Faster RCNN baseline on the challenging COCO object detection dataset. Moreover, DETR can be easily generalized to produce panoptic segmentation in a unified manner. We show that it significantly outperforms competitive baselines. Training code and pretrained models are available at\u00a0this https URL.

    ","tags":["paper","computer_vision","object_detection","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Equi-Tuning%20-%20Group%20Equivariant%20Fine-Tuning%20of%20Pretrained%20Models/","title":"Equi Tuning Group Equivariant Fine Tuning of Pretrained Models","text":"Properties authors Sourya Basu

    Main idea

    Given a non-equivariant pre-trained model \\(M(x)\\), define the equivariant model \\(M_G(x)\\) as the average of the inverse-transformed predictions over all group actions on the input \\(x\\):

    \\[ M_G(x) = \\frac{1}{|G|} \\sum_{g \\in G} g^{-1} M(g x) \\]
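    A minimal sketch for the C4 rotation group acting on images, assuming the model output is spatial (e.g. a segmentation map) so that the inverse rotation acts on it; for an invariant output such as class logits the inverse action would simply be the identity:

    ```python
    import torch

    def equitune_c4(model, x):
        # x: (B, C, H, W); average the inverse-rotated predictions over all four rotations.
        preds = []
        for k in range(4):                                      # g = rotation by k * 90 degrees
            gx = torch.rot90(x, k, dims=(-2, -1))               # act on the input with g
            out = model(gx)
            preds.append(torch.rot90(out, -k, dims=(-2, -1)))   # undo with g^{-1} on the output
        return torch.stack(preds).mean(dim=0)                   # uniform average over the group
    ```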

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Equivariance%20with%20Learned%20Canonicalization%20Functions/","title":"Equivariance with Learned Canonicalization Functions","text":"Properties authors S\u00e9kou-Oumar Kaba, Arnab Kumar Mondal, Yan Zhang, Yoshua Bengio, Siamak Ravanbakhsh

    Main idea

    We learn a canonicalization function \\(h\\) either by a neural network or an optimization procedure. $$ \\phi(x) = h'(x) f(h(x)^{-1} x) $$
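    A minimal sketch for planar rotations, assuming a small network canonicalizer that predicts one angle per image and a backbone f with a spatial output so that the predicted rotation can be re-applied; all names are illustrative and the differentiable rotation is only one possible choice:

    ```python
    import torch
    import torch.nn.functional as F

    def rotate(x, theta):
        # Differentiable rotation of a batch of (B, C, H, W) maps by angles theta (radians).
        cos, sin = torch.cos(theta), torch.sin(theta)
        zero = torch.zeros_like(cos)
        mat = torch.stack([torch.stack([cos, -sin, zero], dim=-1),
                           torch.stack([sin, cos, zero], dim=-1)], dim=-2)  # (B, 2, 3)
        grid = F.affine_grid(mat, list(x.shape), align_corners=False)
        return F.grid_sample(x, grid, align_corners=False)

    def canonicalized_forward(canonicalizer, f, x):
        theta = canonicalizer(x)         # predicted canonicalizing rotation h(x), shape (B,)
        x_canon = rotate(x, -theta)      # apply h(x)^{-1} to bring the input to canonical pose
        out = f(x_canon)                 # run the backbone on the canonicalized input
        return rotate(out, theta)        # re-apply h(x) to the output
    ```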

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Equivariance-aware%20architectural%20optimization%20of%20neural%20networks/","title":"Equivariance aware architectural optimization of neural networks","text":"Properties authors Kaitlin Maile, Dennis G. Wilson, Patrick Forr\u00e9

    References: - Learning Partial Equivariances from Data

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Exact%20Conversion%20of%20In-Context%20Learning%20to%20Model%20Weights%20in%20Linearized-Attention%20Transformers/","title":"Exact Conversion of In Context Learning to Model Weights in Linearized Attention Transformers","text":"Properties authors Brian K Chen, Tianyang Hu, Hui Jin, Hwee Kuan Lee, Kenji Kawaguchi year 2024 url https://arxiv.org/abs/2406.02847

    Abstract

    In-Context Learning (ICL) has been a powerful emergent property of large language models that has attracted increasing attention in recent years. In contrast to regular gradient-based learning, ICL is highly interpretable and does not require parameter updates. In this paper, we show that, for linearized transformer networks, ICL can be made explicit and permanent through the inclusion of bias terms. We mathematically demonstrate the equivalence between a model with ICL demonstration prompts and the same model with the additional bias terms. Our algorithm (ICLCA) allows for exact conversion in an inexpensive manner. Existing methods are not exact and require expensive parameter updates. We demonstrate the efficacy of our approach through experiments that show the exact incorporation of ICL tokens into a linear transformer. We further suggest how our method can be adapted to achieve cheap approximate conversion of ICL tokens, even in regular transformer networks that are not linearized. Our experiments on GPT-2 show that, even though the conversion is only approximate, the model still gains valuable context from the included bias terms.

    ","tags":["paper","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Exploiting%20Redundancy%20-%20Separable%20Group%20Convolutional%20Networks%20on%20Lie%20Groups/","title":"Exploiting Redundancy Separable Group Convolutional Networks on Lie Groups","text":"Properties authors David M. Knigge, David W. Romero, Erik J. Bekkers

    Abstract

    In this work, we investigate the properties of representations learned by regular G-CNNs, and show considerable parameter redundancy in group convolution kernels. This finding motivates further weight-tying by sharing convolution kernels over subgroups. To this end, we introduce convolution kernels that are separable over the subgroup and channel dimensions.

    Interesting because it reduces the total parameter count by separating group convolution kernels. This also has a regularisation effect.

    Citations: - Relaxing Equivariance Constraints with Non-stationary Continuous Filters

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Exploring%20Plain%20Vision%20Transformer%20Backbones%20for%20Object%20Detection/","title":"Exploring Plain Vision Transformer Backbones for Object Detection","text":"Properties authors Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He year 2023 url https://arxiv.org/abs/2203.16527

    Abstract

    We explore the plain, non-hierarchical Vision Transformer (ViT) as a backbone network for object detection. This design enables the original ViT architecture to be fine-tuned for object detection without needing to redesign a hierarchical backbone for pre-training. With minimal adaptations for fine-tuning, our plain-backbone detector can achieve competitive results. Surprisingly, we observe: (i) it is sufficient to build a simple feature pyramid from a single-scale feature map (without the common FPN design) and (ii) it is sufficient to use window attention (without shifting) aided with very few cross-window propagation blocks. With plain ViT backbones pre-trained as Masked Autoencoders (MAE), our detector, named ViTDet, can compete with the previous leading methods that were all based on hierarchical backbones, reaching up to 61.3 AP_box on the COCO dataset using only ImageNet-1K pre-training. We hope our study will draw attention to research on plain-backbone detectors. Code for ViTDet is available in Detectron2.

    ","tags":["paper","computer_vision","object_detection","transformers","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Exploring%20Plain%20Vision%20Transformer%20Backbones%20for%20Object%20Detection/#notes","title":"Notes","text":"
    • It effectively adapts a plain pre-trained vision transformer as a detection backbone by adding minimal layers (a simple feature pyramid and windowed attention with a few cross-window propagation blocks) between the backbone and the detection heads
    • Requires full fine-tuning
    • Ranks #16 on https://paperswithcode.com/sota/object-detection-on-coco-minival, ~4 box mAP points below the top entry

    Code and weights at: https://github.com/facebookresearch/detectron2/tree/main/projects/ViTDet

    ","tags":["paper","computer_vision","object_detection","transformers","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Fast%2C%20Expressive%20SE%28n%29%20Equivariant%20Networks%20through%20Weight-Sharing%20in%20Position-Orientation%20Space/","title":"Fast, Expressive SE(n) Equivariant Networks through Weight Sharing in Position Orientation Space","text":"Properties authors Erik J. Bekkers, Sharvaree Vadgama, Rob D. Hesselink, Putri A. van der Linden, David W. Romero

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/FlexiViT%20-%20One%20Model%20for%20All%20Patch%20Sizes/","title":"FlexiViT One Model for All Patch Sizes","text":"Properties authors Lucas Beyer, Pavel Izmailov, Alexander Kolesnikov, Mathilde Caron, Simon Kornblith, Xiaohua Zhai, Matthias Minderer, Michael Tschannen, Ibrahim Alabdulmoshin, Filip Pavetic year 2022 url https://arxiv.org/abs/2212.08013

    Abstract

    Vision Transformers convert images to sequences by slicing them into patches. The size of these patches controls a speed/accuracy tradeoff, with smaller patches leading to higher accuracy at greater computational cost, but changing the patch size typically requires retraining the model. In this paper, we demonstrate that simply randomizing the patch size at training time leads to a single set of weights that performs well across a wide range of patch sizes, making it possible to tailor the model to different compute budgets at deployment time. We extensively evaluate the resulting model, which we call FlexiViT, on a wide range of tasks, including classification, image-text retrieval, open-world detection, panoptic segmentation, and semantic segmentation, concluding that it usually matches, and sometimes outperforms, standard ViT models trained at a single patch size in an otherwise identical setup. Hence, FlexiViT training is a simple drop-in improvement for ViT that makes it easy to add compute-adaptive capabilities to most models relying on a ViT backbone architecture. Code and pre-trained models are available at\u00a0this https URL

    ","tags":["paper","foundation_models","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/FlexiViT%20-%20One%20Model%20for%20All%20Patch%20Sizes/#notes","title":"Notes","text":"
    • Read in depth, seems very promising
    • Google already filed a patent for this: https://patents.google.com/patent/US20240169715A1/en
    ","tags":["paper","foundation_models","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/G-SGD%20-%20Optimizing%20ReLU%20Neural%20Networks%20in%20its%20Positively%20Scale-Invariant%20Space/","title":"G SGD Optimizing ReLU Neural Networks in its Positively Scale Invariant Space","text":"Properties authors Qi Meng, Shuxin Zheng, Huishuai Zhang, Wei Chen, Zhi-Ming Ma, Tie-Yan Liu year 2018 url https://arxiv.org/abs/1802.03713

    Abstract

    It is well known that neural networks with rectified linear units (ReLU) activation functions are positively scale-invariant. Conventional algorithms like stochastic gradient descent optimize the neural networks in the vector space of weights, which is, however, not positively scale-invariant. This mismatch may lead to problems during the optimization process. Then, a natural question is: can we construct a new vector space that is positively scale-invariant and sufficient to represent ReLU neural networks so as to better facilitate the optimization process? In this paper, we provide our positive answer to this question. First, we conduct a formal study on the positive scaling operators, which form a transformation group, denoted as G. We show that the value of a path (i.e. the product of the weights along the path) in the neural network is invariant to positive scaling and prove that the value vector of all the paths is sufficient to represent the neural networks under mild conditions. Second, we show that one can identify some basis paths out of all the paths and prove that the linear span of their value vectors (denoted as G-space) is an invariant space with lower dimension under the positive scaling group. Finally, we design a stochastic gradient descent algorithm in G-space (abbreviated as G-SGD) to optimize the value vector of the basis paths of neural networks with little extra cost by leveraging back-propagation. Our experiments show that G-SGD significantly outperforms the conventional SGD algorithm in optimizing ReLU networks on benchmark datasets.
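    A minimal numeric sketch of the positive scale invariance the paper builds on, assuming a two-layer ReLU network: rescaling one hidden unit's incoming weights by c > 0 and its outgoing weights by 1/c changes the weight vector but leaves both the network output and every path value (the product of weights along an input-hidden-output path) unchanged:

    ```python
    import torch

    torch.manual_seed(0)
    x = torch.randn(5, 4)
    W1, W2 = torch.randn(4, 8), torch.randn(8, 3)

    def net(W1, W2, x):
        return torch.relu(x @ W1) @ W2

    c = 3.7
    W1_s = W1.clone(); W1_s[:, 0] *= c   # scale incoming weights of hidden unit 0 by c
    W2_s = W2.clone(); W2_s[0, :] /= c   # rescale its outgoing weights by 1/c

    # The output is unchanged (ReLU commutes with positive scaling) ...
    print(torch.allclose(net(W1, W2, x), net(W1_s, W2_s, x), atol=1e-5))       # True
    # ... and so is every path value through hidden unit 0: W1[i, 0] * W2[0, k].
    print(torch.allclose(torch.outer(W1[:, 0], W2[0, :]),
                         torch.outer(W1_s[:, 0], W2_s[0, :]), atol=1e-6))      # True
    ```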

    ","tags":["dl_theory","dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Grokked%20Transformers%20are%20Implicit%20Reasoners%20-%20A%20Mechanistic%20Journey%20to%20the%20Edge%20of%20Generalization/","title":"Grokked Transformers are Implicit Reasoners A Mechanistic Journey to the Edge of Generalization","text":"Properties authors Boshi Wang, Xiang Yue, Yu Su, Huan Sun year 2024 url https://arxiv.org/abs/2405.15071

    Abstract

    We study whether transformers can learn to implicitly reason over parametric knowledge, a skill that even the most capable language models struggle with. Focusing on two representative reasoning types, composition and comparison, we consistently find that transformers can learn implicit reasoning, but only through grokking, i.e., extended training far beyond overfitting. The levels of generalization also vary across reasoning types: when faced with out-of-distribution examples, transformers fail to systematically generalize for composition but succeed for comparison. We delve into the model's internals throughout training, conducting analytical experiments that reveal: 1) the mechanism behind grokking, such as the formation of the generalizing circuit and its relation to the relative efficiency of generalizing and memorizing circuits, and 2) the connection between systematicity and the configuration of the generalizing circuit. Our findings guide data and training setup to better induce implicit reasoning and suggest potential improvements to the transformer architecture, such as encouraging cross-layer knowledge sharing. Furthermore, we demonstrate that for a challenging reasoning task with a large search space, GPT-4-Turbo and Gemini-1.5-Pro based on non-parametric memory fail badly regardless of prompting styles or retrieval augmentation, while a fully grokked transformer can achieve near-perfect accuracy, showcasing the power of parametric memory for complex reasoning.

    ","tags":["paper","transformers","mechinterp"]},{"location":"100%20Reference%20notes/101%20Literature/Harmonics%20of%20Learning%20-%20Universal%20Fourier%20Features%20Emerge%20in%20Invariant%20Networks/","title":"Harmonics of Learning Universal Fourier Features Emerge in Invariant Networks","text":"Properties authors Giovanni Luca Marchetti, Christopher Hillar, Danica Kragic, Sophia Sanborn year 2023 url https://arxiv.org/abs/2312.08550

    Abstract

    In this work, we formally prove that, under certain conditions, if a neural network is invariant to a finite group then its weights recover the Fourier transform on that group. This provides a mathematical explanation for the emergence of Fourier features -- a ubiquitous phenomenon in both biological and artificial learning systems. The results hold even for non-commutative groups, in which case the Fourier transform encodes all the irreducible unitary group representations. Our findings have consequences for the problem of symmetry discovery. Specifically, we demonstrate that the algebraic structure of an unknown group can be recovered from the weights of a network that is at least approximately invariant within certain bounds. Overall, this work contributes to a foundation for an algebraic learning theory of invariant neural network representations.

    ","tags":["theory","equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/How%20do%20vision%20transformers%20work%3F/","title":"How do vision transformers work?","text":"Properties authors Namuk Park, Songkuk Kim year 2022 url https://arxiv.org/abs/2202.06709

    Abstract

    The success of multi-head self-attentions (MSAs) for computer vision is now indisputable. However, little is known about how MSAs work. We present fundamental explanations to help better understand the nature of MSAs. In particular, we demonstrate the following properties of MSAs and Vision Transformers (ViTs): (1) MSAs improve not only accuracy but also generalization by flattening the loss landscapes. Such improvement is primarily attributable to their data specificity, not long-range dependency. On the other hand, ViTs suffer from non-convex losses. Large datasets and loss landscape smoothing methods alleviate this problem; (2) MSAs and Convs exhibit opposite behaviors. For example, MSAs are low-pass filters, but Convs are high-pass filters. Therefore, MSAs and Convs are complementary; (3) Multi-stage neural networks behave like a series connection of small individual models. In addition, MSAs at the end of a stage play a key role in prediction. Based on these insights, we propose AlterNet, a model in which Conv blocks at the end of a stage are replaced with MSA blocks. AlterNet outperforms CNNs not only in large data regimes but also in small data regimes. The code is available at\u00a0this https URL.

    ","tags":["vit","computer_vision","cnn","optimizability"]},{"location":"100%20Reference%20notes/101%20Literature/How%20do%20vision%20transformers%20work%3F/#notes","title":"Notes","text":"","tags":["vit","computer_vision","cnn","optimizability"]},{"location":"100%20Reference%20notes/101%20Literature/How%20do%20vision%20transformers%20work%3F/#the-question-of-inductive-biases","title":"The question of inductive biases","text":"

    Contrary to our expectations, experimental results show that the stronger the inductive bias, the lower both the test error and the training NLL. This indicates that ViT does not overfit training datasets. In addition, appropriate inductive biases, such as locality constraints for MSAs, helps NNs learn strong representations. We also observe these phenomena on CIFAR-10 and ImageNet as shown in Fig. C.1. Figure C.2 also supports that weak inductive biases disrupt NN training. In this experiment, extremely small patch sizes for the embedding hurt the predictive performance of ViT.

    Long-range (global) attention is worse than local attention. MSAs help because they smooth the loss landscape and are input dependent.

    What properties of MSAs do we need to improve optimization? We present various evidences to support that MSA is generalized spatial smoothing. It means that MSAs improve performance because their formulation—Eq. (1)—is an appropriate inductive bias. Their weak inductive bias disrupts NN training. In particular, a key feature of MSAs is their data specificity, not long-range dependency. As an extreme example, local MSAs with a 3 × 3 receptive field outperforms global MSA because they reduce unnecessary degrees of freedom.

    As far as my understanding goes, local MSA is not translation equivariant because it is still input dependent. So local MSA has a locality inductive bias but not translation equivariance. This is interesting: standard ConvNets get their locality inductive bias through translation equivariance, and it is not straightforward to remove their translation equivariance while keeping locality. Tracking at Input-dependent convolutions and Non-translationally equivariant convolutions.

    Locality inductive biases help with more stable training dynamics

    ","tags":["vit","computer_vision","cnn","optimizability"]},{"location":"100%20Reference%20notes/101%20Literature/How%20do%20vision%20transformers%20work%3F/#hessian-spectra","title":"Hessian Spectra","text":"

    Legend: ViT (red), CNN (blue). ViT Hessian eigenvalues have small magnitude and include negative values; CNN Hessian eigenvalues have large magnitude and are positive.

    ","tags":["vit","computer_vision","cnn","optimizability"]},{"location":"100%20Reference%20notes/101%20Literature/Hydra%20-%20Bidirectional%20State%20Space%20Models%20Through%20Generalized%20Matrix%20Mixers/","title":"Hydra Bidirectional State Space Models Through Generalized Matrix Mixers","text":"Properties authors Sukjun Hwang, Aakash Lahoti, Tri Dao, Albert Gu year 2024 url https://arxiv.org/abs/2407.09941

    Abstract

    A wide array of sequence models are built on a framework modeled after Transformers, comprising alternating sequence mixer and channel mixer layers. This paper studies a unifying matrix mixer view of sequence mixers that can be conceptualized as a linear map on the input sequence. This framework encompasses a broad range of well-known sequence models, including the self-attention of Transformers as well as recent strong alternatives such as structured state space models (SSMs), and allows understanding downstream characteristics such as efficiency and expressivity through properties of their structured matrix class. We identify a key axis of matrix parameterizations termed sequence alignment, which increases the flexibility and performance of matrix mixers, providing insights into the strong performance of Transformers and recent SSMs such as Mamba. Furthermore, the matrix mixer framework offers a systematic approach to developing sequence mixers with desired properties, allowing us to develop several new sub-quadratic sequence models. In particular, we propose a natural bidirectional extension of the Mamba model (Hydra), parameterized as a quasiseparable matrix mixer, which demonstrates superior performance over other sequence models including Transformers on non-causal tasks. As a drop-in replacement for attention layers, Hydra outperforms BERT by 0.8 points on the GLUE benchmark and ViT by 2% Top-1 accuracy on ImageNet.

    ","tags":["paper","sequence_models"]},{"location":"100%20Reference%20notes/101%20Literature/Improving%20Convergence%20and%20Generalization%20Using%20Parameter%20Symmetries/","title":"Improving Convergence and Generalization Using Parameter Symmetries","text":"Properties authors Bo Zhao, Robert M Gower, Robin Walters, Rose Yu year 2023 url https://arxiv.org/abs/2305.13404

    Abstract

    In overparametrized models, different values of the parameters may result in the same loss value. Parameter space symmetries are transformations that change the model parameters but leave the loss invariant. Teleportation applies such transformations to accelerate optimization. However, the exact mechanism behind this algorithm's success is not well understood. In this paper, we show that teleportation not only speeds up optimization in the short-term, but gives overall faster time to convergence. Additionally, we show that teleporting to minima with different curvatures improves generalization and provide insights on the connection between the curvature of the minima and generalization ability. Finally, we show that integrating teleportation into a wide range of optimization algorithms and optimization-based meta-learning improves convergence.

    ","tags":["equivariance","relaxed_equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/In%20Search%20of%20Projectively%20Equivariant%20Networks/","title":"In Search of Projectively Equivariant Networks","text":"Properties authors Georg Bokman, Axel Flinth, Fredrik Kahl year 2022 url https://arxiv.org/abs/2209.14719

    Abstract

    Equivariance of linear neural network layers is well studied. In this work, we relax the equivariance condition to only be true in a projective sense. We propose a way to construct a projectively equivariant neural network through building a standard equivariant network where the linear group representations acting on each intermediate feature space are \"multiplicatively modified lifts\" of projective group representations. By theoretically studying the relation of projectively and linearly equivariant linear layers, we show that our approach is the most general possible when building a network out of linear layers. The theory is showcased in two simple experiments.

    ","tags":["equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Knowledge%20Transfer%20from%20Vision%20Foundation%20Models%20for%20Efficient%20Training%20of%20Small%20Task-specific%20Models/","title":"Knowledge Transfer from Vision Foundation Models for Efficient Training of Small Task specific Models","text":"Properties authors Raviteja Vemulapalli, Hadi Pouransari, Fartash Faghri, Sachin Mehta, Mehrdad Farajtabar, Mohammad Rastegari, Oncel Tuzel year 2023 url https://arxiv.org/abs/2311.18237

    Abstract

    Vision Foundation Models (VFMs) pretrained on massive datasets exhibit impressive performance on various downstream tasks, especially with limited labeled target data. However, due to their high inference compute cost, these models cannot be deployed for many real-world applications. Motivated by this, we ask the following important question, \"How can we leverage the knowledge from a large VFM to train a small task-specific model for a new target task with limited labeled training data?\", and propose a simple task-oriented knowledge transfer approach as a highly effective solution to this problem. Our experimental results on five target tasks show that the proposed approach outperforms task-agnostic VFM distillation, web-scale CLIP pretraining, supervised ImageNet pretraining, and self-supervised DINO pretraining by up to 11.6%, 22.1%, 13.7%, and 29.8%, respectively. Furthermore, the proposed approach also demonstrates up to 9x, 4x and 15x reduction in pretraining compute cost when compared to task-agnostic VFM distillation, ImageNet pretraining and DINO pretraining, respectively, while outperforming them. We also show that the dataset used for transferring knowledge has a significant effect on the final target task performance, and introduce a retrieval-augmented knowledge transfer strategy that uses web-scale image retrieval to curate effective transfer sets.

    ","tags":["efficient_dl","paper","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/LRP-QViT%20-%20Mixed-Precision%20Vision%20Transformer%20Quantization%20via%20Layer-wise%20Relevance%20Propagation/","title":"LRP QViT Mixed Precision Vision Transformer Quantization via Layer wise Relevance Propagation","text":"Properties authors Navin Ranjan, Andreas Savakis year 2024 url https://arxiv.org/abs/2401.11243

    Abstract

    Vision transformers (ViTs) have demonstrated remarkable performance across various visual tasks. However, ViT models suffer from substantial computational and memory requirements, making it challenging to deploy them on resource-constrained platforms. Quantization is a popular approach for reducing model size, but most studies mainly focus on equal bit-width quantization for the entire network, resulting in sub-optimal solutions. While there are few works on mixed precision quantization (MPQ) for ViTs, they typically rely on search space-based methods or employ mixed precision arbitrarily. In this paper, we introduce LRP-QViT, an explainability-based method for assigning mixed-precision bit allocations to different layers based on their importance during classification. Specifically, to measure the contribution score of each layer in predicting the target class, we employ the Layer-wise Relevance Propagation (LRP) method. LRP assigns local relevance at the output layer and propagates it through all layers, distributing the relevance until it reaches the input layers. These relevance scores serve as indicators for computing the layer contribution score. Additionally, we have introduced a clipped channel-wise quantization aimed at eliminating outliers from post-LayerNorm activations to alleviate severe inter-channel variations. To validate and assess our approach, we employ LRP-QViT across ViT, DeiT, and Swin transformer models on various datasets. Our experimental findings demonstrate that both our fixed-bit and mixed-bit post-training quantization methods surpass existing models in the context of 4-bit and 6-bit quantization.

    ","tags":["paper","vit","computer_vision","peft","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Learned%20Gridification%20for%20Efficient%20Point%20Cloud%20Processing/","title":"Learned Gridification for Efficient Point Cloud Processing","text":"Properties authors Putri A. van der Linden, Erik J. Bekkers, David W. Romero

    Abstract

    ","tags":["dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Learning%20Partial%20Equivariances%20from%20Data/","title":"Learning Partial Equivariances from Data","text":"Properties authors David W. Romero, Suhas Lohit year 2021 url https://arxiv.org/abs/2110.10211

    Monte Carlo Approximation of Group Convolutions

    We can approximate Group Convolutions in expectation by uniformly sampling group actions \\(v_j\\): $$ (\\psi \\hat{*} f)(u_i) = \\sum_j \\psi (v_j^{-1} u_i)f(v_j) \\bar{\\mu}_{\\mathcal{G}} (v_j) $$
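    A minimal sketch of this Monte Carlo approximation for the rotation group SO(2), assuming a continuous kernel network psi_net that maps a relative group element (an angle difference) to a filter response and a lifted scalar signal f sampled at angles v; names and the single-channel setting are illustrative:

    ```python
    import torch

    def mc_group_conv(psi_net, f_vals, v, u):
        # f_vals: (S,) signal values at uniformly sampled group elements v: (S,) angles.
        # u: (U,) angles at which the output is evaluated.
        diff = u[:, None] - v[None, :]                 # v_j^{-1} u_i is an angle difference for SO(2)
        psi = psi_net(diff.unsqueeze(-1)).squeeze(-1)  # (U, S) continuous kernel evaluations
        # Uniform sampling: the Haar measure weight reduces to an average over the S samples.
        return (psi * f_vals[None, :]).mean(dim=1)
    ```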

    Main idea

    1. Prioritize sampling of specific group elements during the group convolution by learning a probability distribution over them.
    2. 1D continuous groups: use the reparametrization trick on the Lie algebra of the group, with a learned distribution that is uniform over a connected set of group elements and zero otherwise. \\(\\to\\) Partial Equivariance
    3. 1D discrete groups: Bernoulli Distribution over all possible element combinations

    Citations: - Self-Supervised Detection of Perfect and Partial Input-Dependent Symmetries - Color Equivariant Convolutional Networks - Equivariance-aware architectural optimization of neural networks - Approximation-Generalization Trade-offs under (Approximate) Group Equivariance

    ","tags":["dl2","equivariance","partial_equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/Learning%20both%20Weights%20and%20Connections%20for%20Efficient%20Neural%20Networks/","title":"Learning both Weights and Connections for Efficient Neural Networks","text":"Properties authors Song Han, Jeff Pool, John Tran, William J. Dally year 2015 url https://arxiv.org/abs/1506.02626

    Abstract

    Neural networks are both computationally intensive and memory intensive, making them difficult to deploy on embedded systems. Also, conventional networks fix the architecture before training starts; as a result, training cannot improve the architecture. To address these limitations, we describe a method to reduce the storage and computation required by neural networks by an order of magnitude without affecting their accuracy by learning only the important connections. Our method prunes redundant connections using a three-step method. First, we train the network to learn which connections are important. Next, we prune the unimportant connections. Finally, we retrain the network to fine tune the weights of the remaining connections. On the ImageNet dataset, our method reduced the number of parameters of AlexNet by a factor of 9x, from 61 million to 6.7 million, without incurring accuracy loss. Similar experiments with VGG-16 found that the number of parameters can be reduced by 13x, from 138 million to 10.3 million, again with no loss of accuracy.
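
    A minimal PyTorch sketch of the train → prune → retrain recipe described above, using one-shot magnitude pruning with fixed masks (the sparsity level, model, and update loop are placeholders, not the paper's exact setup):

    ```python
    import torch
    import torch.nn as nn

    def magnitude_prune(model: nn.Module, sparsity: float = 0.9):
        """Zero out the smallest-magnitude weights and return boolean keep-masks."""
        masks = {}
        for name, p in model.named_parameters():
            if p.dim() < 2:                      # skip biases / norm parameters
                continue
            k = max(1, int(sparsity * p.numel()))
            threshold = p.detach().abs().flatten().kthvalue(k).values
            masks[name] = p.detach().abs() > threshold
            p.data.mul_(masks[name])             # prune the unimportant connections
        return masks

    def retrain_step(model, masks, loss_fn, x, y, lr=1e-3):
        """Fine-tune the remaining connections; gradients of pruned weights are masked out."""
        loss = loss_fn(model(x), y)
        loss.backward()
        with torch.no_grad():
            for name, p in model.named_parameters():
                if p.grad is None:
                    continue
                if name in masks:
                    p.grad.mul_(masks[name])
                p -= lr * p.grad
                p.grad.zero_()
        return loss.item()
    ```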

    ","tags":["paper","efficient_dl","pruning","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Learning%20with%20Unmasked%20Tokens%20Drives%20Stronger%20Vision%20Learners/","title":"Learning with Unmasked Tokens Drives Stronger Vision Learners","text":"Properties authors Taekyung Kim, Sanghyuk Chun, Byeongho Heo, Dongyoon Han year 2024 url https://arxiv.org/abs/2310.13593

    Abstract

    Masked image modeling (MIM) has become a leading self-supervised learning strategy. MIMs such as Masked Autoencoder (MAE) learn strong representations by randomly masking input tokens for the encoder to process, with the decoder reconstructing the masked tokens to the input. However, MIM pre-trained encoders often exhibit a limited attention span, attributed to MIM's sole focus on regressing masked tokens only, which may impede the encoder's broader context learning. To tackle the limitation, we improve MIM by explicitly incorporating unmasked tokens into the training process. Specifically, our method enables the encoder to learn from broader context supervision, allowing unmasked tokens to experience broader contexts while the decoder reconstructs masked tokens. Thus, the encoded unmasked tokens are equipped with extensive contextual information, empowering masked tokens to leverage the enhanced unmasked tokens for MIM. As a result, our simple remedy trains more discriminative representations revealed by achieving 84.2% top-1 accuracy with ViT-B on ImageNet-1K with 0.6%p gain. We attribute the success to the enhanced pre-training method, as evidenced by the singular value spectrum and attention analyses. Finally, our models achieve significant performance gains at the downstream semantic segmentation and fine-grained visual classification tasks; and on diverse robust evaluation metrics. Code is available at\u00a0this https URL

    ","tags":["paper","foundation_models","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Learning%20with%20Unmasked%20Tokens%20Drives%20Stronger%20Vision%20Learners/#notes","title":"Notes","text":"

    Some notes regarding MIM as a good objective are on Masked Image Modelling.

    However, MIM strategies often encounter challenges, such as local dependency on attention to understand entire context of an image. For example, liu\u00a0et al.\u00a0[36]\u00a0revealed that MAE\u00a0[22], a state-of-the-art MIM method, exhibits shorter average attention distances. Furthermore, we observe that attention map patterns by MAE substantiate extremely local behavior (See Fig.\u00a01) indeed. In other words, the MAE-trained attention mechanism less integrates information across the entire image pixels and tends to focus on specific input regions. This is presumably attributed to MIM-pretraining, primarily dedicated to predicting low-level pixel details (e.g., color or texture) without a comprehensive understanding of less-regional information (e.g., the input structure or shape).

    This may not really be an issue: How do vision transformers work? explicitly constrains ViTs to use only local attention and finds that this improves performance. So maybe this is an advantage? See Are less inductive biases better or worse?.

    ","tags":["paper","foundation_models","computer_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Llama%202%20-%20Open%20Foundation%20and%20Fine-Tuned%20Chat%20Models/","title":"Llama 2 Open Foundation and Fine Tuned Chat Models","text":"Properties authors Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom year 2023 url https://arxiv.org/abs/2307.09288

    Abstract

    In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs.

    ","tags":["paper","foundation_models","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/LoRA%20-%20Low-Rank%20Adaptation%20of%20Large%20Language%20Models/","title":"LoRA Low Rank Adaptation of Large Language Models","text":"Properties authors Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen year 2021 url https://arxiv.org/abs/2106.09685

    Abstract

    An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains. As we pre-train larger models, full fine-tuning, which retrains all model parameters, becomes less feasible. Using GPT-3 175B as an example -- deploying independent instances of fine-tuned models, each with 175B parameters, is prohibitively expensive. We propose Low-Rank Adaptation, or LoRA, which freezes the pre-trained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks. Compared to GPT-3 175B fine-tuned with Adam, LoRA can reduce the number of trainable parameters by 10,000 times and the GPU memory requirement by 3 times. LoRA performs on-par or better than fine-tuning in model quality on RoBERTa, DeBERTa, GPT-2, and GPT-3, despite having fewer trainable parameters, a higher training throughput, and, unlike adapters, no additional inference latency. We also provide an empirical investigation into rank-deficiency in language model adaptation, which sheds light on the efficacy of LoRA. We release a package that facilitates the integration of LoRA with PyTorch models and provide our implementations and model checkpoints for RoBERTa, DeBERTa, and GPT-2 at\u00a0this https URL.
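
    A minimal sketch of the core idea: keep the pretrained weight frozen and add a trainable low-rank update \\(BA\\) on top of it (a toy wrapper of my own, not the released LoRA package):

    ```python
    import torch
    import torch.nn as nn

    class LoRALinear(nn.Module):
        def __init__(self, base: nn.Linear, r: int = 8, alpha: float = 16.0):
            super().__init__()
            self.base = base
            for p in self.base.parameters():      # freeze the pretrained weights
                p.requires_grad = False
            self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
            self.B = nn.Parameter(torch.zeros(base.out_features, r))  # BA = 0 at init
            self.scale = alpha / r

        def forward(self, x):
            return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

    layer = LoRALinear(nn.Linear(768, 768), r=8)
    print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # only A and B train
    ```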

    ","tags":["paper","efficient_dl","peft"]},{"location":"100%20Reference%20notes/101%20Literature/Mamba%20-%20Linear-Time%20Sequence%20Modeling%20with%20Selective%20State%20Spaces/","title":"Mamba Linear Time Sequence Modeling with Selective State Spaces","text":"Properties authors Albert Gu, Tri Dao year 2023

    Abstract

    Foundation models, now powering most of the exciting applications in deep learning, are almost universally based on the Transformer architecture and its core attention module. Many subquadratic-time architectures such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs) have been developed to address Transformers' computational inefficiency on long sequences, but they have not performed as well as attention on important modalities such as language. We identify that a key weakness of such models is their inability to perform content-based reasoning, and make several improvements. First, simply letting the SSM parameters be functions of the input addresses their weakness with discrete modalities, allowing the model to selectively propagate or forget information along the sequence length dimension depending on the current token. Second, even though this change prevents the use of efficient convolutions, we design a hardware-aware parallel algorithm in recurrent mode. We integrate these selective SSMs into a simplified end-to-end neural network architecture without attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5\u00d7\u00a0higher throughput than Transformers) and linear scaling in sequence length, and its performance improves on real data up to million-length sequences. As a general sequence model backbone, Mamba achieves state-of-the-art performance across several modalities such as language, audio, and genomics. On language modeling, our Mamba-3B model outperforms Transformers of the same size and matches Transformers twice its size, both in pretraining and downstream evaluation.

    ","tags":["foundation_models","convolutions"]},{"location":"100%20Reference%20notes/101%20Literature/Memorization%20Through%20the%20Lens%20of%20Curvature%20of%20Loss%20Function%20Around%20Samples/","title":"Memorization Through the Lens of Curvature of Loss Function Around Samples","text":"Properties authors Isha Garg, Deepak Ravikumar, Kaushik Roy year 2024 url https://openreview.net/forum?id=WQbDS9RydY

    Abstract

    Deep neural networks are over-parameterized and easily overfit to and memorize the datasets that they train on. In the extreme case, it has been shown that networks can memorize a randomly labeled dataset. In this paper, we propose using the curvature of the loss function around each training sample, averaged over training epochs, as a measure of memorization of a sample. We show that this curvature metric effectively captures memorization statistics, both qualitatively and quantitatively in popular image datasets. We provide quantitative validation of the proposed metric against memorization scores released by Feldman & Zhang (2020). Further, experiments on mislabeled data detection show that corrupted samples are learned with high curvature and using curvature for identifying mislabelled examples outperforms existing approaches. Qualitatively, we find that high curvature samples correspond to long-tailed, mislabeled, or conflicting instances, indicating a likelihood of memorization. Notably, this analysis helps us find, to the best of our knowledge, a novel failure mode on the CIFAR100 and ImageNet datasets: that of duplicated images with differing labels.

    ","tags":["paper","dl_theory","llm"]},{"location":"100%20Reference%20notes/101%20Literature/Mixture%20of%20LoRa%20Experts/","title":"Mixture of LoRa Experts","text":"Properties authors Xun Wu, Shaohan Huang, Furu Wei year 2024 url https://arxiv.org/abs/2404.13628

    Abstract

    LoRA has gained widespread acceptance in the fine-tuning of large pre-trained models to cater to a diverse array of downstream tasks, showcasing notable effectiveness and efficiency, thereby solidifying its position as one of the most prevalent fine-tuning techniques. Due to the modular nature of LoRA's plug-and-play plugins, researchers have delved into the amalgamation of multiple LoRAs to empower models to excel across various downstream tasks. Nonetheless, extant approaches for LoRA fusion grapple with inherent challenges. Direct arithmetic merging may result in the loss of the original pre-trained model's generative capabilities or the distinct identity of LoRAs, thereby yielding suboptimal outcomes. On the other hand, Reference tuning-based fusion exhibits limitations concerning the requisite flexibility for the effective combination of multiple LoRAs. In response to these challenges, this paper introduces the Mixture of LoRA Experts (MoLE) approach, which harnesses hierarchical control and unfettered branch selection. The MoLE approach not only achieves superior LoRA fusion performance in comparison to direct arithmetic merging but also retains the crucial flexibility for combining LoRAs effectively. Extensive experimental evaluations conducted in both the Natural Language Processing (NLP) and Vision & Language (V&L) domains substantiate the efficacy of MoLE.

    ","tags":["paper","peft","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/MobileCLIP%20-%20Fast%20Image-Text%20Models%20through%20Multi-Modal%20Reinforced%20Training/","title":"MobileCLIP Fast Image Text Models through Multi Modal Reinforced Training","text":"Properties authors Pavan Kumar Anasosalu Vasu, Hadi Pouransari, Fartash Faghri, Raviteja Vemulapalli, Oncel Tuzel year 2023 url https://arxiv.org/abs/2311.17049

    Abstract

    Contrastive pretraining of image-text foundation models, such as CLIP, demonstrated excellent zero-shot performance and improved robustness on a wide range of downstream tasks. However, these models utilize large transformer-based encoders with significant memory and latency overhead which pose challenges for deployment on mobile devices. In this work, we introduce MobileCLIP -- a new family of efficient image-text models optimized for runtime performance along with a novel and efficient training approach, namely multi-modal reinforced training. The proposed training approach leverages knowledge transfer from an image captioning model and an ensemble of strong CLIP encoders to improve the accuracy of efficient models. Our approach avoids train-time compute overhead by storing the additional knowledge in a reinforced dataset. MobileCLIP sets a new state-of-the-art latency-accuracy tradeoff for zero-shot classification and retrieval tasks on several datasets. Our MobileCLIP-S2 variant is 2.3\u00d7\u00a0faster while more accurate compared to previous best CLIP model based on ViT-B/16. We further demonstrate the effectiveness of our multi-modal reinforced training by training a CLIP model based on ViT-B/16 image backbone and achieving +2.9% average performance improvement on 38 evaluation benchmarks compared to the previous best. Moreover, we show that the proposed approach achieves 10\u00d7-1000\u00d7\u00a0improved learning efficiency when compared with non-reinforced CLIP training. Code and models are available at\u00a0this https URL\u00a0.

    ","tags":["paper","efficient_dl","efficient_vision","computer_vision","multimodal"]},{"location":"100%20Reference%20notes/101%20Literature/MobileViT%20-%20light-weight%2C%20general-purpose%2C%20and%20mobile-friendly%20vision%20transformer/","title":"MobileViT light weight, general purpose, and mobile friendly vision transformer","text":"Properties authors Sachin Mehta, Mohammad Rastegari year 2022 url https://arxiv.org/abs/2110.02178

    Abstract

    Light-weight convolutional neural networks (CNNs) are the de-facto for mobile vision tasks. Their spatial inductive biases allow them to learn representations with fewer parameters across different vision tasks. However, these networks are spatially local. To learn global representations, self-attention-based vision trans-formers (ViTs) have been adopted. Unlike CNNs, ViTs are heavy-weight. In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? Towards this end, we introduce MobileViT, a light-weight and general-purpose vision transformer for mobile devices. MobileViT presents a different perspective for the global processing of information with transformers, i.e., transformers as convolutions. Our results show that MobileViT significantly outperforms CNN- and ViT-based networks across different tasks and datasets. On the ImageNet-1k dataset, MobileViT achieves top-1 accuracy of 78.4% with about 6 million parameters, which is 3.2% and 6.2% more accurate than MobileNetv3 (CNN-based) and DeIT (ViT-based) for a similar number of parameters. On the MS-COCO object detection task, MobileViT is 5.7% more accurate than MobileNetv3 for a similar number of parameters.

    Our source code is open-source and available at:\u00a0this https URL

    ","tags":["paper","efficient_dl","efficient_vision"]},{"location":"100%20Reference%20notes/101%20Literature/Model%20Compression%20in%20Practice%20-%20Lessons%20Learned%20from%20Practitioners%20Creating%20On-device%20Machine%20Learning%20Experiences/","title":"Model Compression in Practice Lessons Learned from Practitioners Creating On device Machine Learning Experiences","text":"Properties authors Fred Hohman, Mary Beth Kery, Donghao Ren, Dominik Moritz year 2024 url https://arxiv.org/abs/2310.04621

    Abstract

    On-device machine learning (ML) promises to improve the privacy, responsiveness, and proliferation of new, intelligent user experiences by moving ML computation onto everyday personal devices. However, today's large ML models must be drastically compressed to run efficiently on-device, a hurtle that requires deep, yet currently niche expertise. To engage the broader human-centered ML community in on-device ML experiences, we present the results from an interview study with 30 experts at Apple that specialize in producing efficient models. We compile tacit knowledge that experts have developed through practical experience with model compression across different hardware platforms. Our findings offer pragmatic considerations missing from prior work, covering the design process, trade-offs, and technical strategies that go into creating efficient models. Finally, we distill design recommendations for tooling to help ease the difficulty of this work and bring on-device ML into to more widespread practice.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Model%20Compression%20in%20Practice%20-%20Lessons%20Learned%20from%20Practitioners%20Creating%20On-device%20Machine%20Learning%20Experiences/#notes","title":"Notes","text":"

    Specific techniques on model weights help reduce size, but an efficient model comes from more careful design of the loss function, the system, and which parts should and should not be modeled with ML.

    - [ ] How does the design of a loss function affect a model's efficiency? Note to myself to look into this in the future.

    Although posttraining quantization is considered \u201ceasy\u201d [E9] as far as ML compression techniques go, practitioners emphasized that it still often takes complex code to implement and there are many algorithm variations A survey of quantization methods for efficient neural network inference to experiment with [T5]. For models that need high accuracy, post-training quantization may not be enough to hit budget without unacceptable accuracy degradation [E9, E4, E13, E5].

    • Okay, so it's important to try a bunch of quantization techniques. Got it.

    \u201cIf you want to go to lower bit quantization, such as 4 or below, it\u2019s almost impossible to use post-training quantization because the difference in accuracy gets way too big. So for this level of compression you need to do training-aware compression.\u201d \u2014 E9

    • Cool, I didn't know that training-aware compression was such an important thing to consider from an industry perspective, not just a research one.

    Although training-aware compression is considered the best form of optimization A survey of quantization methods for efficient neural network inference, a major drawback is that it must be included in initial model training: \u201cNot starting early with compression is a dead end,\u201d [E3].

    • Why is that though? Why should compression-aware training happen from the start and not in the middle of training or even in finetuning? #rq

    [...] practitioners suggest estimating how much compression will be feasible with simple post-training quantization. To estimate quantization savings before training a model, first initialize the ML model architecture with random weights, then quantize, and test the model\u2019s speed and size on-device
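
    A rough sketch of that estimate using PyTorch's dynamic post-training quantization on a randomly initialized stand-in architecture (sizes are indicative only; on-device speed still has to be measured on the target hardware):

    ```python
    import os
    import tempfile
    import torch
    import torch.nn as nn

    def size_on_disk_mb(model: nn.Module) -> float:
        """Serialize the state dict and report its size in MB."""
        with tempfile.NamedTemporaryFile(suffix=".pt", delete=False) as f:
            torch.save(model.state_dict(), f.name)
            size = os.path.getsize(f.name) / 1e6
        os.remove(f.name)
        return size

    # Randomly initialized stand-in for the architecture under consideration.
    model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 10))
    quantized = torch.ao.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

    print(f"fp32: {size_on_disk_mb(model):.1f} MB, int8: {size_on_disk_mb(quantized):.1f} MB")
    ```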

    Strategy #6: Compression can degrade the accuracy of a model and change its behavior in unpredictable ways. It is essential to create a robust evaluation pipeline (e.g., defining metrics, curating test sets) before you start optimizing your model, so that you can reliably observe shifts in model error afterwards. To prevent degradation from a failed optimization, compare optimized models with varying amounts of compression to your original model, inspecting the metrics, subpopulation behaviors, and internals, such as weights and activations, to ensure they are within expected ranges.

    Okay, a robust evaluation pipeline is fundamental: we need to create unit tests, and for quantization specifically, check that the distributions of weights (obviously) and activations (less obviously) are within the expected ranges. The latter matters because of compounding degradation: errors in early layers caused by quantization might compound into later layers in unexpected ways.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Neural%20Mechanics%20-%20Symmetry%20and%20Broken%20Conservation%20Laws%20in%20Deep%20Learning%20Dynamics/","title":"Neural Mechanics Symmetry and Broken Conservation Laws in Deep Learning Dynamics","text":"Properties authors Daniel Kunin, Javier Sagastuy-Brena, Surya Ganguli, Daniel L.K. Yamins, Hidenori Tanaka year 2020 url https://arxiv.org/abs/2012.04728

    Abstract

    Understanding the dynamics of neural network parameters during training is one of the key challenges in building a theoretical foundation for deep learning. A central obstacle is that the motion of a network in high-dimensional parameter space undergoes discrete finite steps along complex stochastic gradients derived from real-world datasets. We circumvent this obstacle through a unifying theoretical framework based on intrinsic symmetries embedded in a network's architecture that are present for any dataset. We show that any such symmetry imposes stringent geometric constraints on gradients and Hessians, leading to an associated conservation law in the continuous-time limit of stochastic gradient descent (SGD), akin to Noether's theorem in physics. We further show that finite learning rates used in practice can actually break these symmetry induced conservation laws. We apply tools from finite difference methods to derive modified gradient flow, a differential equation that better approximates the numerical trajectory taken by SGD at finite learning rates. We combine modified gradient flow with our framework of symmetries to derive exact integral expressions for the dynamics of certain parameter combinations. We empirically validate our analytic expressions for learning dynamics on VGG-16 trained on Tiny ImageNet. Overall, by exploiting symmetry, our work demonstrates that we can analytically describe the learning dynamics of various parameter combinations at finite learning rates and batch sizes for state of the art architectures trained on any dataset.

    ","tags":["dl2","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/On%20Good%20Practices%20for%20Task-Specific%20Distillation%20of%20Large%20Pretrained%20Visual%20Models/","title":"On Good Practices for Task Specific Distillation of Large Pretrained Visual Models","text":"Properties authors Juliette Marrie, Michael Arbel, Julien Mairal, Diane Larlus year 2024 url https://arxiv.org/abs/2402.11305

    Abstract

    Large pretrained visual models exhibit remarkable generalization across diverse recognition tasks. Yet, real-world applications often demand compact models tailored to specific problems. Variants of knowledge distillation have been devised for such a purpose, enabling task-specific compact models (the students) to learn from a generic large pretrained one (the teacher). In this paper, we show that the excellent robustness and versatility of recent pretrained models challenge common practices established in the literature, calling for a new set of optimal guidelines for task-specific distillation. To address the lack of samples in downstream tasks, we also show that a variant of Mixup based on stable diffusion complements standard data augmentation. This strategy eliminates the need for engineered text prompts and improves distillation of generic models into streamlined specialized networks.

    ","tags":["paper","distillation","foundation_models","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/On%20Good%20Practices%20for%20Task-Specific%20Distillation%20of%20Large%20Pretrained%20Visual%20Models/#notes","title":"Notes","text":"","tags":["paper","distillation","foundation_models","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/On%20the%20Relationship%20between%20Self-Attention%20and%20Convolutional%20Layers/","title":"On the Relationship between Self Attention and Convolutional Layers","text":"Properties authors Jean-Baptiste Cordonnier, Andreas Loukas, Martin Jaggi year 2020 url https://arxiv.org/abs/1911.03584

    Abstract

    Recent trends of incorporating attention mechanisms in vision have led researchers to reconsider the supremacy of convolutional layers as a primary building block. Beyond helping CNNs to handle long-range dependencies, Stand-Alone Self-Attention in Vision Models showed that attention can completely replace convolution and achieve state-of-the-art performance on vision tasks. This raises the question: do learned attention layers operate similarly to convolutional layers? This work provides evidence that attention layers can perform convolution and, indeed, they often learn to do so in practice. Specifically, we prove that a multi-head self-attention layer with sufficient number of heads is at least as expressive as any convolutional layer. Our numerical experiments then show that self-attention layers attend to pixel-grid patterns similarly to CNN layers, corroborating our analysis.

    ","tags":["transformers","convolutions","theory"]},{"location":"100%20Reference%20notes/101%20Literature/On%20the%20Relationship%20between%20Self-Attention%20and%20Convolutional%20Layers/#notes","title":"Notes","text":"
    • Note to self: fully read article, it looks fun \u23eb #personal
    ","tags":["transformers","convolutions","theory"]},{"location":"100%20Reference%20notes/101%20Literature/On%20the%20Symmetries%20of%20Deep%20Learning%20Models%20and%20their%20Internal%20Representations/","title":"On the Symmetries of Deep Learning Models and their Internal Representations","text":"Properties authors Charles Godfrey, Davis Brown, Tegan Emerson, Henry Kvnige year 2022 url https://arxiv.org/abs/2205.14258

    Abstract

    Symmetry is a fundamental tool in the exploration of a broad range of complex systems. In machine learning symmetry has been explored in both models and data. In this paper we seek to connect the symmetries arising from the architecture of a family of models with the symmetries of that family's internal representation of data. We do this by calculating a set of fundamental symmetry groups, which we call the intertwiner groups of the model. We connect intertwiner groups to a model's internal representations of data through a range of experiments that probe similarities between hidden states across models with the same architecture. Our work suggests that the symmetries of a network are propagated into the symmetries in that network's representation of data, providing us with a better understanding of how architecture affects the learning and prediction process. Finally, we speculate that for ReLU networks, the intertwiner groups may provide a justification for the common practice of concentrating model interpretability exploration on the activation basis in hidden layers rather than arbitrary linear combinations thereof.

    Notes:
    - The following papers study the effect of weight space symmetries on training dynamics:
        - Neural Mechanics - Symmetry and Broken Conservation Laws in Deep Learning Dynamics
        - Understanding symmetries in deep networks
        - G-SGD - Optimizing ReLU Neural Networks in its Positively Scale-Invariant Space
        - Deep Learning Book

    ","tags":["dl_theory","dl2"]},{"location":"100%20Reference%20notes/101%20Literature/OpenELM%20-%20An%20Efficient%20Language%20Model%20Family%20with%20Open-source%20Training%20and%20Inference%20Framework/","title":"OpenELM An Efficient Language Model Family with Open source Training and Inference Framework","text":"Properties authors Sachin Mehta, Mohammad Hossein Sekhavat, Qingqing Cao, Maxwell Horton, Yanzi Jin, Chenfan Sun, Iman Mirzadeh, Dmitry Belenko, Peter Zatloukal, Mohammad Rastegari year 2024 url https://arxiv.org/abs/2404.14619

    Abstract

    The reproducibility and transparency of large language models are crucial for advancing open research, ensuring the trustworthiness of results, and enabling investigations into data and model biases, as well as potential risks. To this end, we release OpenELM, a state-of-the-art open language model. OpenELM uses a layer-wise scaling strategy to efficiently allocate parameters within each layer of the transformer model, leading to enhanced accuracy. For example, with a parameter budget of approximately one billion parameters, OpenELM exhibits a 2.36% improvement in accuracy compared to OLMo while requiring\u00a02\u00d7\u00a0fewer pre-training tokens. Diverging from prior practices that only provide model weights and inference code, and pre-train on private datasets, our release includes the complete framework for training and evaluation of the language model on publicly available datasets, including training logs, multiple checkpoints, and pre-training configurations. We also release code to convert models to MLX library for inference and fine-tuning on Apple devices. This comprehensive release aims to empower and strengthen the open research community, paving the way for future open research endeavors. Our source code along with pre-trained model weights and training recipes is available at \\url{this https URL}. Additionally, \\model models can be found on HuggingFace at: \\url{this https URL}.

    ","tags":["llm","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/Optimal%20Brain%20Damage/","title":"Optimal Brain Damage","text":"Properties authors John Denker, Sara Solla, Yann LeCun year 1989 url https://proceedings.neurips.cc/paper/1989/hash/6c9882bbac1c7093bd25041881277658-Abstract.html

    Abstract

    We have used information-theoretic ideas to derive a class of practical and nearly optimal schemes for adapting the size of a neural network. By removing unimportant weights from a network, several improvements can be expected: better generalization, fewer training examples required, and improved speed of learning and/or classification. The basic idea is to use second-derivative information to make a tradeoff between network complexity and training set error. Experiments confirm the usefulness of the methods on a real-world application.

    OBD Pruning Algorithm

    Use saliency measure based on Hessian (loss wrt parameters) to pick which parameters to prune. Finetune afterwards.
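
    A minimal sketch of the saliency computation \\(s_i = \\frac{1}{2} h_{ii} w_i^2\\), with the Hessian diagonal estimated via a Hutchinson-style trick rather than the paper's original backprop procedure (toy model and data are placeholders):

    ```python
    import torch
    import torch.nn as nn

    # Tiny model and loss; OBD saliency is s_i = 0.5 * h_ii * w_i^2 with a diagonal Hessian.
    model = nn.Linear(4, 1)
    x, y = torch.randn(32, 4), torch.randn(32, 1)
    loss = nn.functional.mse_loss(model(x), y)

    params = list(model.parameters())
    grads = torch.autograd.grad(loss, params, create_graph=True)

    # Hutchinson estimate of the Hessian diagonal: diag(H) ~= E_v[v * (H v)], v Rademacher.
    diag = [torch.zeros_like(p) for p in params]
    num_samples = 10
    for _ in range(num_samples):
        vs = [torch.randint(0, 2, p.shape).float() * 2 - 1 for p in params]
        hvs = torch.autograd.grad(grads, params, grad_outputs=vs, retain_graph=True)
        for d, v, hv in zip(diag, vs, hvs):
            d += v * hv / num_samples

    saliency = [0.5 * d * p.detach() ** 2 for d, p in zip(diag, params)]
    print(saliency[0])   # low-saliency weights are candidates for pruning
    ```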

    ","tags":["paper","efficient_vision","efficient_dl","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Optimization%20Dynamics%20of%20Equivariant%20and%20Augmented%20Neural%20Networks/","title":"Optimization Dynamics of Equivariant and Augmented Neural Networks","text":"Properties authors Alex Flinth, Fredrik Ohlsson year 2023 url https://arxiv.org/abs/2303.13458

    Abstract

    We investigate the optimization of multilayer perceptrons on symmetric data. We compare the strategy of constraining the architecture to be equivariant to that of using augmentation. We show that, under natural assumptions on the loss and non-linearities, the sets of equivariant stationary points are identical for the two strategies, and that the set of equivariant layers is invariant under the gradient flow for augmented models. Finally, we show that stationary points may be unstable for augmented training although they are stable for the equivariant models.

    Main observations:
    1. They show that if the augmented model is equivariantly initialized, it will remain equivariant during training (See Equivariance Initialization)
    2. Compared to the equivariant approach, augmentation introduces no new equivariant stationary points, nor does it exclude existing ones. (See Multiple global minima)
    3. The existence of a stable equivariant minimum is not guaranteed by augmentation. (See Multiple global minima)

    Regarding Equivariance Initialization in this work:

    We initialize \u03a6A with equivariant layers A0 \u2208 E by drawing matrices randomly from a standard Gaussian distribution, and then projecting them orthogonally onto E. We train the network on (finite) datasets D using gradient descent in three different ways.

    My intuition is that they do something like the isotropic convolution from Priors over Neural Network weights.
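
    A minimal sketch of the quoted projection for a toy case, a cyclic permutation group acting on \\(\\mathbb{R}^4\\), where the orthogonal projection onto the equivariant subspace is just group averaging (my own illustration, not the paper's code):

    ```python
    import numpy as np

    # Cyclic shift by k positions: a permutation representation of C4 on R^4.
    def rho(k, n=4):
        return np.roll(np.eye(n), shift=k, axis=0)

    def project_to_equivariant(W, n=4):
        """Orthogonal projection onto {W : rho(g) W = W rho(g)} via group averaging."""
        return sum(rho(k, n).T @ W @ rho(k, n) for k in range(n)) / n

    rng = np.random.default_rng(0)
    W0 = rng.standard_normal((4, 4))        # random Gaussian init, as in the quote
    W_eq = project_to_equivariant(W0)       # a circulant (shift-equivariant) matrix

    # Check equivariance: rho(1) W_eq == W_eq rho(1).
    print(np.allclose(rho(1) @ W_eq, W_eq @ rho(1)))   # True
    ```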

    ","tags":["dl_theory","equivariance","optimization"]},{"location":"100%20Reference%20notes/101%20Literature/Parameter%20Efficient%20Fine-tuning%20of%20Self-supervised%20ViTs%20without%20Catastrophic%20Forgetting/","title":"Parameter Efficient Fine tuning of Self supervised ViTs without Catastrophic Forgetting","text":"Properties authors Reza Akbarian Bafghi, Nidhin Harilal, Claire Monteleoni, Maziar Raissi year 2024 url https://arxiv.org/abs/2404.17245

    Abstract

    Artificial neural networks often suffer from catastrophic forgetting, where learning new concepts leads to a complete loss of previously acquired knowledge. We observe that this issue is particularly magnified in vision transformers (ViTs), where post-pre-training and fine-tuning on new tasks can significantly degrade the model's original general abilities. For instance, a DINO ViT-Base/16 pre-trained on ImageNet-1k loses over 70% accuracy on ImageNet-1k after just 10 iterations of fine-tuning on CIFAR-100. Overcoming this stability-plasticity dilemma is crucial for enabling ViTs to continuously learn and adapt to new domains while preserving their initial knowledge. In this work, we study two new parameter-efficient fine-tuning strategies: (1)~Block Expansion, and (2) Low-rank adaptation (LoRA). Our experiments reveal that using either Block Expansion or LoRA on self-supervised pre-trained ViTs surpass fully fine-tuned ViTs in new domains while offering significantly greater parameter efficiency. Notably, we find that Block Expansion experiences only a minimal performance drop in the pre-training domain, thereby effectively mitigating catastrophic forgetting in pre-trained ViTs.

    ","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Parameter%20Efficient%20Fine-tuning%20of%20Self-supervised%20ViTs%20without%20Catastrophic%20Forgetting/#paper-results","title":"Paper Results","text":"Model N. params CIFAR-100 IN-1K Mean Standard Fine-tuning All 85.9 M 88.13 25.24 56.69 Top-3 21.3 M 84.56 74.15 79.36 Linear 76.9 K 80.57 76.11 78.34 LoRA \ud835\udc5f=4 301 K 87.91 66.82 77.37 \ud835\udc5f=8 448 K 88.27 65.99 77.13 \ud835\udc5f=16 743 K 87.84 65.06 76.45 Block Expansion \ud835\udc5d=1 7.2 M 82.72 75.75 79.24 \ud835\udc5d=2 14.3 M 86.70 75.54 81.12 \ud835\udc5d=3 21.3 M 88.58 74.61 81.60 \ud835\udc5d=4 28.4 M 89.09 72.28 80.69","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Parameter%20Efficient%20Fine-tuning%20of%20Self-supervised%20ViTs%20without%20Catastrophic%20Forgetting/#observations","title":"Observations","text":"
    • Linear-only fine-tuning does pretty well, kinda surprising.
    • It's kind of surprising that LoRA adapters do badly here, but does it matter? What is the purpose of making LoRA resistant to catastrophic forgetting if the whole point of it is to be able to hot-swap modules depending on the task?
    • Also worth pointing out that Block Expansion requires training parameters on the order of millions, while LoRA only requires hundreds of thousands.
    ","tags":["paper"]},{"location":"100%20Reference%20notes/101%20Literature/Parameter-Efficient%20Fine-Tuning%20for%20Pre-Trained%20Vision%20Models%20-%20A%20Survey/","title":"Parameter Efficient Fine Tuning for Pre Trained Vision Models A Survey","text":"Properties authors Yi Xin, Siqi Luo, Haodi Zhou, Junlong Du, Xiaohong Liu, Yue Fan, Qing Li, Yuntao Du year 2024 url https://arxiv.org/abs/2402.02242

    Abstract

    Large-scale pre-trained vision models (PVMs) have shown great potential for adaptability across various downstream vision tasks. However, with state-of-the-art PVMs growing to billions or even trillions of parameters, the standard full fine-tuning paradigm is becoming unsustainable due to high computational and storage demands. In response, researchers are exploring parameter-efficient fine-tuning (PEFT), which seeks to exceed the performance of full fine-tuning with minimal parameter modifications. This survey provides a comprehensive overview and future directions for visual PEFT, offering a systematic review of the latest advancements. First, we provide a formal definition of PEFT and discuss model pre-training methods. We then categorize existing methods into three categories: addition-based, partial-based, and unified-based. Finally, we introduce the commonly used datasets and applications and suggest potential future research challenges. A comprehensive collection of resources is available at\u00a0this https URL.

    ","tags":["paper","efficient_dl","efficient_vision","transformers","peft"]},{"location":"100%20Reference%20notes/101%20Literature/Progress%20measures%20for%20grokking%20via%20mechanistic%20interpretability/","title":"Progress measures for grokking via mechanistic interpretability","text":"Properties authors Neel Nanda, Lawrence Chan, Tom Lieberum, Jess Smith, Jacob Steinhardt year 2023 url https://arxiv.org/abs/2301.05217

    Abstract

    Neural networks often exhibit emergent behavior, where qualitatively new capabilities arise from scaling up the amount of parameters, training data, or training steps. One approach to understanding emergence is to find continuous \\textit{progress measures} that underlie the seemingly discontinuous qualitative changes. We argue that progress measures can be found via mechanistic interpretability: reverse-engineering learned behaviors into their individual components. As a case study, we investigate the recently-discovered phenomenon of ``grokking'' exhibited by small transformers trained on modular addition tasks. We fully reverse engineer the algorithm learned by these networks, which uses discrete Fourier transforms and trigonometric identities to convert addition to rotation about a circle. We confirm the algorithm by analyzing the activations and weights and by performing ablations in Fourier space. Based on this understanding, we define progress measures that allow us to study the dynamics of training and split training into three continuous phases: memorization, circuit formation, and cleanup. Our results show that grokking, rather than being a sudden shift, arises from the gradual amplification of structured mechanisms encoded in the weights, followed by the later removal of memorizing components.

    Related - Grokking

    ","tags":["paper","interpretability","mechinterp"]},{"location":"100%20Reference%20notes/101%20Literature/Provably%20Strict%20Generalisation%20Benefit%20for%20Equivariant%20Models/","title":"Provably Strict Generalisation Benefit for Equivariant Models","text":"Properties authors Bryn Elesedy, Sheheryar Zaidi year 2021 url https://arxiv.org/abs/2102.10333

    Abstract

    It is widely believed that engineering a model to be invariant/equivariant improves generalisation. Despite the growing popularity of this approach, a precise characterisation of the generalisation benefit is lacking. By considering the simplest case of linear models, this paper provides the first provably non-zero improvement in generalisation for invariant/equivariant models when the target distribution is invariant/equivariant with respect to a compact group. Moreover, our work reveals an interesting relationship between generalisation, the number of training examples and properties of the group action. Our results rest on an observation of the structure of function spaces under averaging operators which, along with its consequences for feature averaging, may be of independent interest.

    ","tags":["dl_theory","equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/ProxylessNAS%20-%20Direct%20Neural%20Architecture%20Search%20on%20Target%20Task%20and%20Hardware/","title":"ProxylessNAS Direct Neural Architecture Search on Target Task and Hardware","text":"Properties authors Han Cai, Ligeng Zhu, Song Han year 2019 url https://arxiv.org/abs/1812.00332

    Abstract

    Neural architecture search (NAS) has a great impact by automatically designing effective neural network architectures. However, the prohibitive computational demand of conventional NAS algorithms (e.g.\u00a0104\u00a0GPU hours) makes it difficult to \\emph{directly} search the architectures on large-scale tasks (e.g. ImageNet). Differentiable NAS can reduce the cost of GPU hours via a continuous representation of network architecture but suffers from the high GPU memory consumption issue (grow linearly w.r.t. candidate set size). As a result, they need to utilize~\\emph{proxy} tasks, such as training on a smaller dataset, or learning with only a few blocks, or training just for a few epochs. These architectures optimized on proxy tasks are not guaranteed to be optimal on the target task. In this paper, we present \\emph{ProxylessNAS} that can \\emph{directly} learn the architectures for large-scale target tasks and target hardware platforms. We address the high memory consumption issue of differentiable NAS and reduce the computational cost (GPU hours and GPU memory) to the same level of regular training while still allowing a large candidate set. Experiments on CIFAR-10 and ImageNet demonstrate the effectiveness of directness and specialization. On CIFAR-10, our model achieves 2.08\\% test error with only 5.7M parameters, better than the previous state-of-the-art architecture AmoebaNet-B, while using 6\u00d7\u00a0fewer parameters. On ImageNet, our model achieves 3.1\\% better top-1 accuracy than MobileNetV2, while being 1.2\u00d7\u00a0faster with measured GPU latency. We also apply ProxylessNAS to specialize neural architectures for hardware with direct hardware metrics (e.g. latency) and provide insights for efficient CNN architecture design.

    ","tags":["paper","efficient_dl","nas"]},{"location":"100%20Reference%20notes/101%20Literature/ProxylessNAS%20-%20Direct%20Neural%20Architecture%20Search%20on%20Target%20Task%20and%20Hardware/#notes","title":"Notes","text":"
    • To avoid measuring performance on the target device, they learn a latency model.
      1. They take multiple measurements of a device with different architectures.
      2. They train a model to predict the latency given the architecture.
    ","tags":["paper","efficient_dl","nas"]},{"location":"100%20Reference%20notes/101%20Literature/R-MAE%20-%20Regions%20Meet%20Masked%20Autoencoders/","title":"R MAE Regions Meet Masked Autoencoders","text":"Properties authors Duy-Kien Nguyen, Vaibhav Aggarwal, Yanghao Li, Martin R. Oswald, Alexander Kirillov, Cees G. M. Snoek, Xinlei Chen year 2023 url https://arxiv.org/abs/2306.05411

    Abstract

    In this work, we explore regions as a potential visual analogue of words for self-supervised image representation learning. Inspired by Masked Autoencoding (MAE), a generative pre-training baseline, we propose masked region autoencoding to learn from groups of pixels or regions. Specifically, we design an architecture which efficiently addresses the one-to-many mapping between images and regions, while being highly effective especially with high-quality regions. When integrated with MAE, our approach (R-MAE) demonstrates consistent improvements across various pre-training datasets and downstream detection and segmentation benchmarks, with negligible computational overheads. Beyond the quantitative evaluation, our analysis indicates the models pre-trained with masked region autoencoding unlock the potential for interactive segmentation. The code is provided at\u00a0this https URL.

    ","tags":["paper","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/R-MAE%20-%20Regions%20Meet%20Masked%20Autoencoders/#note","title":"Note","text":"
    • Note to self: Read in depth
    ","tags":["paper","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/Refusal%20in%20Language%20Models%20Is%20Mediated%20by%20a%20Single%20Direction/","title":"Refusal in Language Models Is Mediated by a Single Direction","text":"Properties authors Andy Arditi, Oscar Obeso, Aaquib Syed, Daniel Paleka, Nina Rimsky, Wes Gurnee, Neel Nanda year 2024 url https://arxiv.org/abs/2406.11717

    Abstract

    Conversational large language models are fine-tuned for both instruction-following and safety, resulting in models that obey benign requests but refuse harmful ones. While this refusal behavior is widespread across chat models, its underlying mechanisms remain poorly understood. In this work, we show that refusal is mediated by a one-dimensional subspace, across 13 popular open-source chat models up to 72B parameters in size. Specifically, for each model, we find a single direction such that erasing this direction from the model's residual stream activations prevents it from refusing harmful instructions, while adding this direction elicits refusal on even harmless instructions. Leveraging this insight, we propose a novel white-box jailbreak method that surgically disables refusal with minimal effect on other capabilities. Finally, we mechanistically analyze how adversarial suffixes suppress propagation of the refusal-mediating direction. Our findings underscore the brittleness of current safety fine-tuning methods. More broadly, our work showcases how an understanding of model internals can be leveraged to develop practical methods for controlling model behavior.
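
    A minimal sketch of the directional ablation described in the abstract: remove the component of residual-stream activations along a unit "refusal direction" (the tensors below are random stand-ins; in the paper the direction is estimated from contrastive harmful/harmless prompts):

    ```python
    import torch

    def ablate_direction(activations: torch.Tensor, direction: torch.Tensor) -> torch.Tensor:
        """Project out a single direction from residual-stream activations.

        activations: (..., d_model), direction: (d_model,)
        """
        d_hat = direction / direction.norm()
        coeff = activations @ d_hat                  # component along the direction
        return activations - coeff.unsqueeze(-1) * d_hat

    x = torch.randn(2, 5, 768)                       # (batch, seq, d_model) toy activations
    refusal_dir = torch.randn(768)                   # stand-in for the estimated refusal direction
    x_ablated = ablate_direction(x, refusal_dir)
    print((x_ablated @ (refusal_dir / refusal_dir.norm())).abs().max())  # ~0 after ablation
    ```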

    ","tags":["paper","transformers","mechinterp","interpretability"]},{"location":"100%20Reference%20notes/101%20Literature/Relaxed%20Octahedral%20Group%20Convolution%20for%20Learning%20Symmetry%20Breaking%20in%203D%20Physical%20Systems/","title":"Relaxed Octahedral Group Convolution for Learning Symmetry Breaking in 3D Physical Systems","text":"Properties authors Rui Wang, Robin Walters, Tess E Smidt year 2023 url https://arxiv.org/abs/2310.02299

    Abstract

    Deep equivariant models use symmetries to improve sample efficiency and generalization. However, the assumption of perfect symmetry in many of these models can sometimes be restrictive, especially when the data does not perfectly align with such symmetries. Thus, we introduce relaxed octahedral group convolution for modeling 3D physical systems in this paper. This flexible convolution technique provably allows the model to both maintain the highest level of equivariance that is consistent with data and discover the subtle symmetry-breaking factors in the physical systems. Empirical results validate that our approach can not only provide insights into the symmetry-breaking factors in phase transitions but also achieves superior performance in fluid super-resolution tasks.

    ","tags":["relaxed_equivariance","equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/Relaxing%20Equivariance%20Constraints%20with%20Non-stationary%20Continuous%20Filters/","title":"Relaxing Equivariance Constraints with Non stationary Continuous Filters","text":"Properties authors David W. Romero

    Abstract

    ","tags":["dl2","equivariance","partial_equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/Retrospective%20-%20EIE%20-%20Efficient%20Inference%20Engine%20onSparse%20and%20Compressed%20Neural%20Network/","title":"Retrospective EIE Efficient Inference Engine onSparse and Compressed Neural Network","text":"Properties authors Song Han, Xingyu Liu, Huizi Mao, Jing Pu, Ardavan Pedram, Mark A. Horowitz, William J. Dally year 2023 url https://arxiv.org/abs/2306.09552

    Abstract

    EIE proposed to accelerate pruned and compressed neural networks, exploiting weight sparsity, activation sparsity, and 4-bit weight-sharing in neural network accelerators. Since published in ISCA'16, it opened a new design space to accelerate pruned and sparse neural networks and spawned many algorithm-hardware co-designs for model compression and acceleration, both in academia and commercial AI chips. In retrospect, we review the background of this project, summarize the pros and cons, and discuss new opportunities where pruning, sparsity, and low precision can accelerate emerging deep learning workloads.

    ","tags":["paper","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Revealing%20the%20Utilized%20Rank%20of%20Subspaces%20of%20Learning%20in%20Neural%20Networks/","title":"Revealing the Utilized Rank of Subspaces of Learning in Neural Networks","text":"Properties authors Isha Garg, Christian Koguchi, Eshan Verma, Daniel Ulbricht year 2024 url https://arxiv.org/abs/2407.04797

    Abstract

    In this work, we study how well the learned weights of a neural network utilize the space available to them. This notion is related to capacity, but additionally incorporates the interaction of the network architecture with the dataset. Most learned weights appear to be full rank, and are therefore not amenable to low rank decomposition. This deceptively implies that the weights are utilizing the entire space available to them. We propose a simple data-driven transformation that projects the weights onto the subspace where the data and the weight interact. This preserves the functional mapping of the layer and reveals its low rank structure. In our findings, we conclude that most models utilize a fraction of the available space. For instance, for ViTB-16 and ViTL-16 trained on ImageNet, the mean layer utilization is 35% and 20% respectively. Our transformation results in reducing the parameters to 50% and 25% respectively, while resulting in less than 0.2% accuracy drop after fine-tuning. We also show that self-supervised pre-training drives this utilization up to 70%, justifying its suitability for downstream tasks.
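
    My reading of the data-driven transformation, sketched as a PCA of the layer's inputs: keep the directions the data actually spans and fold the projector into the weight (the energy threshold and setup are guesses, not the paper's exact procedure):

    ```python
    import torch

    def utilized_projection(W: torch.Tensor, X: torch.Tensor, energy: float = 0.99):
        """W: (out_dim, in_dim) layer weight, X: (n_samples, in_dim) inputs seen by the layer.

        Returns W_proj = W P P^T, which acts like W on the subspace the data occupies.
        """
        _, S, Vh = torch.linalg.svd(X, full_matrices=False)   # right singular vectors = input directions
        energy_ratio = torch.cumsum(S ** 2, dim=0) / (S ** 2).sum()
        k = int((energy_ratio < energy).sum().item()) + 1      # smallest k capturing `energy` of the data
        P = Vh[:k].T                                           # (in_dim, k) retained directions
        return W @ P @ P.T, k

    W = torch.randn(512, 768)
    X = torch.randn(10_000, 64) @ torch.randn(64, 768)         # inputs confined to a 64-dim subspace
    W_proj, k = utilized_projection(W, X)
    print(k, torch.dist(X @ W.T, X @ W_proj.T))                # k is ~64, outputs match on the data
    ```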

    ","tags":["paper","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Rewrite%20the%20Stars/","title":"Rewrite the Stars","text":"Properties authors Xu Ma, Xiyang Dai, Yue Bai, Yizhou Wang, Yun Fu year 2024 url https://arxiv.org/abs/2403.19967

    Abstract

    Recent studies have drawn attention to the untapped potential of the \"star operation\" (element-wise multiplication) in network design. While intuitive explanations abound, the foundational rationale behind its application remains largely unexplored. Our study attempts to reveal the star operation's ability to map inputs into high-dimensional, non-linear feature spaces -- akin to kernel tricks -- without widening the network. We further introduce StarNet, a simple yet powerful prototype, demonstrating impressive performance and low latency under compact network structure and efficient budget. Like stars in the sky, the star operation appears unremarkable but holds a vast universe of potential. Our work encourages further exploration across tasks, with codes available at\u00a0this https URL.

    ","tags":["dl_theory","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/SAM-CLIP%20-%20Merging%20Vision%20Foundation%20Models%20towards%20Semantic%20and%20Spatial%20Understanding/","title":"SAM CLIP Merging Vision Foundation Models towards Semantic and Spatial Understanding","text":"Properties authors Haoxiang Wang, Fartash Faghri, Raviteja Vemulapalli, Mehrdad Farajtabar, Sachin Mehta, Mohammad Rastegari, Oncel Tuzel, Hadi Pouransari, Pavan Kumar Anasosalu Vasu year 2024 url https://arxiv.org/abs/2310.15308

    Abstract

    The landscape of publicly available vision foundation models (VFMs), such as CLIP and Segment Anything Model (SAM), is expanding rapidly. VFMs are endowed with distinct capabilities stemming from their pre-training objectives. For instance, CLIP excels in semantic understanding, while SAM specializes in spatial understanding for segmentation. In this work, we introduce a simple recipe to efficiently merge VFMs into a unified model that absorbs their expertise. Our method integrates techniques of multi-task learning, continual learning, and distillation. Further, it demands significantly less computational cost compared to traditional multi-task training from scratch, and it only needs a small fraction of the pre-training datasets that were initially used to train individual models. By applying our method to SAM and CLIP, we obtain SAM-CLIP: a unified model that combines the capabilities of SAM and CLIP into a single vision transformer. Compared with deploying SAM and CLIP independently, our merged model, SAM-CLIP, reduces storage and compute costs for inference, making it well-suited for edge device applications. We show that SAM-CLIP not only retains the foundational strengths of SAM and CLIP, but also introduces synergistic functionalities, notably in zero-shot semantic segmentation, where SAM-CLIP establishes new state-of-the-art results on 5 benchmarks. It outperforms previous models that are specifically designed for this task by a large margin, including +6.8% and +5.9% mean IoU improvement on Pascal-VOC and COCO-Stuff datasets, respectively.

    ","tags":["paper","efficient_dl","efficient_vision"]},{"location":"100%20Reference%20notes/101%20Literature/Scaling%20%28Down%29%20CLIP%20-%20A%20Comprehensive%20Analysis%20of%20Data%2C%20Architecture%2C%20and%20Training%20Strategies/","title":"Scaling (Down) CLIP A Comprehensive Analysis of Data, Architecture, and Training Strategies","text":"Properties authors Zichao Li, Cihang Xie, Ekin Dogus Cubuk year 2024 url https://arxiv.org/abs/2404.08197

    Abstract

    This paper investigates the performance of the Contrastive Language-Image Pre-training (CLIP) when scaled down to limited computation budgets. We explore CLIP along three dimensions: data, architecture, and training strategies. With regards to data, we demonstrate the significance of high-quality training data and show that a smaller dataset of high-quality data can outperform a larger dataset with lower quality. We also examine how model performance varies with different dataset sizes, suggesting that smaller ViT models are better suited for smaller datasets, while larger models perform better on larger datasets with fixed compute. Additionally, we provide guidance on when to choose a CNN-based architecture or a ViT-based architecture for CLIP training. We compare four CLIP training strategies - SLIP, FLIP, CLIP, and CLIP+Data Augmentation - and show that the choice of training strategy depends on the available compute resource. Our analysis reveals that CLIP+Data Augmentation can achieve comparable performance to CLIP using only half of the training data. This work provides practical insights into how to effectively train and deploy CLIP models, making them more accessible and affordable for practical use in various applications.

    ","tags":["efficient_dl","vit","cnn"]},{"location":"100%20Reference%20notes/101%20Literature/Segment%20Anything/","title":"Segment Anything","text":"Properties authors Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r, Ross Girshick year 2023 url https://arxiv.org/abs/2304.02643

    Abstract

    We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at\u00a0this https URL\u00a0to foster research into foundation models for computer vision.
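    A minimal promptable-segmentation sketch using the released segment_anything package; the checkpoint filename, dummy image, and point prompt below are placeholders for illustration:

    ```python
    import numpy as np
    from segment_anything import SamPredictor, sam_model_registry

    # Placeholder checkpoint path; the real weights are downloaded separately.
    sam = sam_model_registry['vit_h'](checkpoint='sam_vit_h_4b8939.pth')
    predictor = SamPredictor(sam)

    image = np.zeros((512, 512, 3), dtype=np.uint8)   # HWC uint8 RGB image
    predictor.set_image(image)

    # A single foreground point prompt; SAM returns several candidate masks.
    masks, scores, low_res_logits = predictor.predict(
        point_coords=np.array([[256, 256]]),
        point_labels=np.array([1]),
        multimask_output=True,
    )
    ```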

    ","tags":["paper","segmentation","computer_vision","foundation_models"]},{"location":"100%20Reference%20notes/101%20Literature/Self-Supervised%20Detection%20of%20Perfect%20and%20Partial%20Input-Dependent%20Symmetries/","title":"Self Supervised Detection of Perfect and Partial Input Dependent Symmetries","text":"Properties authors David W. Romero, Alonso Urbano","tags":["dl2","equivariance","partial_equivariance","inductive_bias"]},{"location":"100%20Reference%20notes/101%20Literature/SimPLR%20-%20A%20Simple%20and%20Plain%20Transformer%20for%20Scaling-Efficient%20Object%20Detection%20and%20Segmentation/","title":"SimPLR A Simple and Plain Transformer for Scaling Efficient Object Detection and Segmentation","text":"Properties authors Duy-Kien Nguyen, Martin R. Oswald, Cees G. M. Snoek year 2024 url https://arxiv.org/abs/2310.05920

    Abstract

    The ability to detect objects in images at varying scales has played a pivotal role in the design of modern object detectors. Despite considerable progress in removing hand-crafted components and simplifying the architecture with transformers, multi-scale feature maps and/or pyramid design remain a key factor for their empirical success. In this paper, we show that this reliance on either feature pyramids or an hierarchical backbone is unnecessary and a transformer-based detector with scale-aware attention enables the plain detector 'SimPLR' whose backbone and detection head are both non-hierarchical and operate on single-scale features. We find through our experiments that SimPLR with scale-aware attention is plain and simple, yet competitive with multi-scale vision transformer alternatives. Compared to the multi-scale and single-scale state-of-the-art, our model scales much better with bigger capacity (self-supervised) models and more pre-training data, allowing us to report a consistently better accuracy and faster runtime for object detection, instance segmentation as well as panoptic segmentation. Code will be released.

    ","tags":["paper","object_detection","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/SimPLR%20-%20A%20Simple%20and%20Plain%20Transformer%20for%20Scaling-Efficient%20Object%20Detection%20and%20Segmentation/#notes","title":"Notes","text":"

    \u201cDespite enabling plain-backbone detectors, feature pyramids are still an important factor in ViTDet to detect objects at various scales\u201d (Nguyen et al., 2024, p. 4)

    Not really an issue as far as I understand, but in the spirit of less inductive biases it makes sense. Feature pyramids intuitively hardcode scale information.

    \u201cMost recently, Lin et al. [35] introduce the transformer-based detector, PlainDETR, which also removes the multi-scale input. However, it still relies on multi-scale features to generate the object proposals for its decoder.\u201d (Nguyen et al., 2024, p. 4)

    Don't quite understand this; does this still allow arbitrary ViTs? - [ ] Read PlainDETR \ud83d\udd3d

    ","tags":["paper","object_detection","computer_vision","vit"]},{"location":"100%20Reference%20notes/101%20Literature/Simultaneous%20linear%20connectivity%20of%20neural%20networks%20modulo%20permutation/","title":"Simultaneous linear connectivity of neural networks modulo permutation","text":"Properties authors Ekansh Sharma, Devin Kwok, Tom Denton, Daniel M. Roy, David Rolnick, Gintare Karolina Dziugaite year 2024 url https://arxiv.org/abs/2404.06498

    Abstract

    Neural networks typically exhibit permutation symmetries which contribute to the non-convexity of the networks' loss landscapes, since linearly interpolating between two permuted versions of a trained network tends to encounter a high loss barrier. Recent work has argued that permutation symmetries are the only sources of non-convexity, meaning there are essentially no such barriers between trained networks if they are permuted appropriately. In this work, we refine these arguments into three distinct claims of increasing strength. We show that existing evidence only supports \"weak linear connectivity\"-that for each pair of networks belonging to a set of SGD solutions, there exist (multiple) permutations that linearly connect it with the other networks. In contrast, the claim \"strong linear connectivity\"-that for each network, there exists one permutation that simultaneously connects it with the other networks-is both intuitively and practically more desirable. This stronger claim would imply that the loss landscape is convex after accounting for permutation, and enable linear interpolation between three or more independently trained models without increased loss. In this work, we introduce an intermediate claim-that for certain sequences of networks, there exists one permutation that simultaneously aligns matching pairs of networks from these sequences. Specifically, we discover that a single permutation aligns sequences of iteratively trained as well as iteratively pruned networks, meaning that two networks exhibit low loss barriers at each step of their optimization and sparsification trajectories respectively. Finally, we provide the first evidence that strong linear connectivity may be possible under certain conditions, by showing that barriers decrease with increasing network width when interpolating among three networks.
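    A minimal sketch of the loss-barrier measurement behind these claims: linearly interpolate the parameters of two trained networks (optionally after permuting one of them) and check for a loss increase along the path. Function names are assumptions, not the paper's code, and the sketch assumes all state-dict entries are floating-point (no integer BatchNorm buffers).

    ```python
    import copy
    import torch

    def loss_barrier(model_a, model_b, loss_fn, data_loader, steps=11):
        # Evaluate the loss along the linear path (1 - t) * theta_a + t * theta_b.
        # The barrier is the excess of the path maximum over the endpoint average.
        sd_a, sd_b = model_a.state_dict(), model_b.state_dict()
        probe = copy.deepcopy(model_a)
        losses = []
        for t in torch.linspace(0, 1, steps):
            probe.load_state_dict({k: (1 - t) * sd_a[k] + t * sd_b[k] for k in sd_a})
            probe.eval()
            total, n = 0.0, 0
            with torch.no_grad():
                for x, y in data_loader:
                    total += loss_fn(probe(x), y).item() * len(x)
                    n += len(x)
            losses.append(total / n)
        return max(losses) - 0.5 * (losses[0] + losses[-1])
    ```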

    ","tags":["dl_theory","linear_connectivity","network_permutation_symmetries"]},{"location":"100%20Reference%20notes/101%20Literature/Stand-Alone%20Self-Attention%20in%20Vision%20Models/","title":"Stand Alone Self Attention in Vision Models","text":"Properties authors Prajit Ramachandran, Niki Parmar, Ashish Vaswani, Irwan Bello, Anselm Levskaya, Jonathon Shlens year 2019

    Abstract

    Convolutions are a fundamental building block of modern computer vision systems. Recent approaches have argued for going beyond convolutions in order to capture long-range dependencies. These efforts focus on augmenting convolutional models with content-based interactions, such as self-attention and non-local means, to achieve gains on a number of vision tasks. The natural question that arises is whether attention can be a stand-alone primitive for vision models instead of serving as just an augmentation on top of convolutions. In developing and testing a pure self-attention vision model, we verify that self-attention can indeed be an effective stand-alone layer. A simple procedure of replacing all instances of spatial convolutions with a form of self-attention applied to ResNet model produces a fully self-attentional model that outperforms the baseline on ImageNet classification with 12% fewer FLOPS and 29% fewer parameters. On COCO object detection, a pure self-attention model matches the mAP of a baseline RetinaNet while having 39% fewer FLOPS and 34% fewer parameters. Detailed ablation studies demonstrate that self-attention is especially impactful when used in later layers. These results establish that stand-alone self-attention is an important addition to the vision practitioner's toolbox.
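    A minimal sketch of the core idea, local (stand-alone) self-attention over a k x k neighbourhood as a drop-in replacement for a spatial convolution; it is single-head and omits the paper's relative positional embeddings, so it is illustrative rather than the authors' layer:

    ```python
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class LocalSelfAttention2d(nn.Module):
        # Single-head local self-attention: each pixel attends to its k x k neighbourhood.
        def __init__(self, channels, kernel_size=7):
            super().__init__()
            self.k = kernel_size
            self.to_q = nn.Conv2d(channels, channels, 1)
            self.to_kv = nn.Conv2d(channels, 2 * channels, 1)

        def forward(self, x):
            b, c, h, w = x.shape
            q = self.to_q(x).view(b, c, 1, h * w)                 # one query per pixel
            k, v = self.to_kv(x).chunk(2, dim=1)
            pad = self.k // 2
            # Gather the k*k neighbourhood of every pixel for keys and values.
            k = F.unfold(k, self.k, padding=pad).view(b, c, self.k * self.k, h * w)
            v = F.unfold(v, self.k, padding=pad).view(b, c, self.k * self.k, h * w)
            attn = ((q * k).sum(dim=1, keepdim=True) / c ** 0.5).softmax(dim=2)
            out = (attn * v).sum(dim=2)                           # weighted sum of values
            return out.view(b, c, h, w)
    ```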

    ","tags":["vit","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Surgical%20Fine-Tuning%20Improves%20Adaptation%20to%20Distribution%20Shifts/","title":"Surgical Fine Tuning Improves Adaptation to Distribution Shifts","text":"Properties authors Yoonho Lee, Annie S. Chen, Fahim Tajwar, Huaxiu Yao, Percy Liang, Chelsea Finn, Ananya Kumar year 2022 url https://arxiv.org/abs/2210.11466

    Abstract

    A common approach to transfer learning under distribution shift is to fine-tune the last few layers of a pre-trained model, preserving learned features while also adapting to the new task. This paper shows that in such settings, selectively fine-tuning a subset of layers (which we term surgical fine-tuning) matches or outperforms commonly used fine-tuning approaches. Moreover, the type of distribution shift influences which subset is more effective to tune: for example, for image corruptions, fine-tuning only the first few layers works best. We validate our findings systematically across seven real-world data tasks spanning three types of distribution shifts. Theoretically, we prove that for two-layer neural networks in an idealized setting, first-layer tuning can outperform fine-tuning all layers. Intuitively, fine-tuning more parameters on a small target dataset can cause information learned during pre-training to be forgotten, and the relevant information depends on the type of shift.

    Notes: - The paper points out that which layers (i.e., which subset of parameters) to fine-tune depends on the kind of distribution shift. - They provide an automatic procedure to select those layers that beats full fine-tuning but is suboptimal compared to expert/surgical fine-tuning; they suggest future work in this regard. A minimal layer-freezing sketch follows below.
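    A minimal sketch of surgical fine-tuning, assuming a torchvision ResNet-50 and an image-corruption-style shift, the setting where the paper reports that tuning the first layers works best:

    ```python
    import torch
    import torchvision

    model = torchvision.models.resnet50(weights='IMAGENET1K_V2')

    # Freeze everything, then unfreeze only the first residual stage.
    for p in model.parameters():
        p.requires_grad = False
    for p in model.layer1.parameters():
        p.requires_grad = True

    optimizer = torch.optim.SGD(
        [p for p in model.parameters() if p.requires_grad], lr=1e-3, momentum=0.9
    )
    ```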

    ","tags":["paper","peft","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Surgical-DINO%20-%20Adapter%20Learning%20of%20Foundation%20Models%20for%20Depth%20Estimation%20in%20Endoscopic%20Surgery/","title":"Surgical DINO Adapter Learning of Foundation Models for Depth Estimation in Endoscopic Surgery","text":"Properties authors Beilei Cui, Mobarakol Islam, Long Bai, Hongliang Ren year 2024 url https://arxiv.org/abs/2401.06013

    Abstract

    Purpose: Depth estimation in robotic surgery is vital in 3D reconstruction, surgical navigation and augmented reality visualization. Although the foundation model exhibits outstanding performance in many vision tasks, including depth estimation (e.g., DINOv2), recent works observed its limitations in medical and surgical domain-specific applications. This work presents a low-ranked adaptation (LoRA) of the foundation model for surgical depth estimation. Methods: We design a foundation model-based depth estimation method, referred to as Surgical-DINO, a low-rank adaptation of the DINOv2 for depth estimation in endoscopic surgery. We build LoRA layers and integrate them into DINO to adapt with surgery-specific domain knowledge instead of conventional fine-tuning. During training, we freeze the DINO image encoder, which shows excellent visual representation capacity, and only optimize the LoRA layers and depth decoder to integrate features from the surgical scene. Results: Our model is extensively validated on a MICCAI challenge dataset of SCARED, which is collected from da Vinci Xi endoscope surgery. We empirically show that Surgical-DINO significantly outperforms all the state-of-the-art models in endoscopic depth estimation tasks. The analysis with ablation studies has shown evidence of the remarkable effect of our LoRA layers and adaptation. Conclusion: Surgical-DINO shed some light on the successful adaptation of the foundation models into the surgical domain for depth estimation. There is clear evidence in the results that zero-shot prediction on pre-trained weights in computer vision datasets or naive fine-tuning is not sufficient to use the foundation model in the surgical domain directly. Code is available at\u00a0this https URL.
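    A minimal LoRA sketch of the general recipe (frozen pre-trained linear layer plus a trainable low-rank update); this is not the Surgical-DINO code, and the rank and alpha values are assumptions:

    ```python
    import torch
    import torch.nn as nn

    class LoRALinear(nn.Module):
        # y = W x + (alpha / r) * B A x, with the pre-trained W kept frozen.
        def __init__(self, base: nn.Linear, r: int = 8, alpha: float = 16.0):
            super().__init__()
            self.base = base
            for p in self.base.parameters():
                p.requires_grad = False               # frozen backbone weight
            self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
            self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero init: no change at start
            self.scale = alpha / r

        def forward(self, x):
            return self.base(x) + self.scale * (x @ self.A.T) @ self.B.T
    ```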

    References: - LoRA - Low-Rank Adaptation of Large Language Models

    Keywords: - LoRA Adapter

    ","tags":["paper","efficient_dl","efficient_vision","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/Symmetries%20in%20Overparametrized%20Neural%20Networks%20-%20A%20Mean-Field%20View/","title":"Symmetries in Overparametrized Neural Networks A Mean Field View","text":"Properties authors Javier Maass Martinez, Joaquin Fontbona year 2024 url https://arxiv.org/abs/2405.19995

    Abstract

    We develop a Mean-Field (MF) view of the learning dynamics of overparametrized Artificial Neural Networks (NN) under data symmetric in law wrt the action of a general compact group\u00a0G. We consider for this a class of generalized shallow NNs given by an ensemble of\u00a0N\u00a0multi-layer units, jointly trained using stochastic gradient descent (SGD) and possibly symmetry-leveraging (SL) techniques, such as Data Augmentation (DA), Feature Averaging (FA) or Equivariant Architectures (EA). We introduce the notions of weakly and strongly invariant laws (WI and SI) on the parameter space of each single unit, corresponding, respectively, to\u00a0G-invariant distributions, and to distributions supported on parameters fixed by the group action (which encode EA). This allows us to define symmetric models compatible with taking\u00a0N\u2192\u221e\u00a0and give an interpretation of the asymptotic dynamics of DA, FA and EA in terms of Wasserstein Gradient Flows describing their MF limits. When activations respect the group action, we show that, for symmetric data, DA, FA and freely-trained models obey the exact same MF dynamic, which stays in the space of WI laws and minimizes therein the population risk. We also give a counterexample to the general attainability of an optimum over SI laws. Despite this, quite remarkably, we show that the set of SI laws is also preserved by the MF dynamics even when freely trained. This sharply contrasts the finite-N\u00a0setting, in which EAs are generally not preserved by unconstrained SGD. We illustrate the validity of our findings as\u00a0N\u00a0gets larger in a teacher-student experimental setting, training a student NN to learn from a WI, SI or arbitrary teacher model through various SL schemes. We last deduce a data-driven heuristic to discover the largest subspace of parameters supporting SI distributions for a problem, that could be used for designing EA with minimal generalization error.
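    For intuition, a tiny sketch of one of the symmetry-leveraging techniques discussed, Feature Averaging over the C4 rotation group; the model f and the invariant-output assumption are illustrative:

    ```python
    import torch

    def feature_average(f, x):
        # Average the model's outputs over all C4-rotated copies of the input,
        # making the prediction exactly invariant to 90-degree rotations.
        outs = [f(torch.rot90(x, k, dims=(-2, -1))) for k in range(4)]
        return torch.stack(outs).mean(dim=0)
    ```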

    ","tags":["dl_theory","equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/Talaria%20-%20Interactively%20Optimizing%20Machine%20Learning%20Models%20for%20Efficient%20Inference/","title":"Talaria Interactively Optimizing Machine Learning Models for Efficient Inference","text":"Properties authors Fred Hohman, Chaoqun Wang, Jinmook Lee, Jochen G\u00f6rtler, Dominik Moritz, Jeffrey P Bigham, Zhile Ren, Cecile Foret, Qi Shan, Ziaoyi Zhang year 2024 url https://arxiv.org/abs/2404.03085

    Abstract

    On-device machine learning (ML) moves computation from the cloud to personal devices, protecting user privacy and enabling intelligent user experiences. However, fitting models on devices with limited resources presents a major technical challenge: practitioners need to optimize models and balance hardware metrics such as model size, latency, and power. To help practitioners create efficient ML models, we designed and developed Talaria: a model visualization and optimization system. Talaria enables practitioners to compile models to hardware, interactively visualize model statistics, and simulate optimizations to test the impact on inference metrics. Since its internal deployment two years ago, we have evaluated Talaria using three methodologies: (1) a log analysis highlighting its growth of 800+ practitioners submitting 3,600+ models; (2) a usability survey with 26 users assessing the utility of 20 Talaria features; and (3) a qualitative interview with the 7 most active users about their experience using Talaria.

    ","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/The%20Empirical%20Impact%20of%20Neural%20Parameter%20Symmetries%2C%20or%20Lack%20Thereof/","title":"The Empirical Impact of Neural Parameter Symmetries, or Lack Thereof","text":"Properties authors Derek Lim, Moe Putterman, Robin Walters, Haggai Maron, Stefanie Jegelka year 2024 url https://arxiv.org/abs/2405.20231

    Abstract

    Many algorithms and observed phenomena in deep learning appear to be affected by parameter symmetries -- transformations of neural network parameters that do not change the underlying neural network function. These include linear mode connectivity, model merging, Bayesian neural network inference, metanetworks, and several other characteristics of optimization or loss-landscapes. However, theoretical analysis of the relationship between parameter space symmetries and these phenomena is difficult. In this work, we empirically investigate the impact of neural parameter symmetries by introducing new neural network architectures that have reduced parameter space symmetries. We develop two methods, with some provable guarantees, of modifying standard neural networks to reduce parameter space symmetries. With these new methods, we conduct a comprehensive experimental study consisting of multiple tasks aimed at assessing the effect of removing parameter symmetries. Our experiments reveal several interesting observations on the empirical impact of parameter symmetries; for instance, we observe linear mode connectivity between our networks without alignment of weight spaces, and we find that our networks allow for faster and more effective Bayesian neural network training.
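    A tiny sketch of the kind of parameter symmetry being removed: permuting the hidden units of a two-layer MLP (rows of the first layer, matching columns of the second) leaves the network function unchanged.

    ```python
    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    mlp = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
    x = torch.randn(5, 8)
    y = mlp(x)

    perm = torch.randperm(16)
    with torch.no_grad():
        mlp[0].weight.copy_(mlp[0].weight[perm])   # permute hidden units
        mlp[0].bias.copy_(mlp[0].bias[perm])
        mlp[2].weight.copy_(mlp[2].weight[:, perm])

    print(torch.allclose(y, mlp(x), atol=1e-6))    # True: same function
    ```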

    ","tags":["equivariance","relaxed_equivariance","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/The%20Lie%20derivative%20for%20measuring%20learned%20equivariance/","title":"The Lie derivative for measuring learned equivariance","text":"Properties authors Nate Gruver, Marc Finzi, Micah Goldblum, Andrew Gordon Wilson year 2022 url https://arxiv.org/abs/2210.02984

    Abstract

    The Lie derivative is introduced, a method for measuring equivariance with strong mathematical foundations and minimal hyperparameters that finds that transformers can be more equivariant than convolutional neural networks after training, and that as models get larger and more accurate they tend to display more equivariance, regardless of architecture.
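    A rough numerical analogue for intuition (not the paper's Lie derivative, which differentiates through a continuous transformation): probe how much a model's output moves under a small rotation of the input.

    ```python
    import torch
    import torchvision.transforms.functional as TF

    def rotation_sensitivity(model, x, eps_deg=1.0):
        # Finite-difference proxy for learned rotation invariance:
        # || f(R_eps x) - f(x) || / eps on a batch of images x (B, C, H, W).
        with torch.no_grad():
            base = model(x)
            rotated = model(TF.rotate(x, eps_deg))
        return (rotated - base).norm() / eps_deg
    ```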

    ","tags":["equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/The%20Lie%20derivative%20for%20measuring%20learned%20equivariance/#notes","title":"Notes","text":"","tags":["equivariance"]},{"location":"100%20Reference%20notes/101%20Literature/The%20Unreasonable%20Ineffectiveness%20of%20the%20Deeper%20Layers/","title":"The Unreasonable Ineffectiveness of the Deeper Layers","text":"Properties authors Andrey Gromov, Kushal Tirumala, Hassan Shapourian, Paolo Glorioso, Daniel A. Roberts year 2024 url https://arxiv.org/abs/2403.17887

    Abstract

    We empirically study a simple layer-pruning strategy for popular families of open-weight pretrained LLMs, finding minimal degradation of performance on different question-answering benchmarks until after a large fraction (up to half) of the layers are removed. To prune these models, we identify the optimal block of layers to prune by considering similarity across layers; then, to \"heal\" the damage, we perform a small amount of finetuning. In particular, we use parameter-efficient finetuning (PEFT) methods, specifically quantization and Low Rank Adapters (QLoRA), such that each of our experiments can be performed on a single A100 GPU. From a practical perspective, these results suggest that layer pruning methods can complement other PEFT strategies to further reduce computational resources of finetuning on the one hand, and can improve the memory and latency of inference on the other hand. From a scientific perspective, the robustness of these LLMs to the deletion of layers implies either that current pretraining methods are not properly leveraging the parameters in the deeper layers of the network or that the shallow layers play a critical role in storing knowledge.
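    A sketch of the layer-similarity heuristic described in the abstract: pick the n-layer block whose input and output hidden states are most similar, here via angular distance on last-token representations. The details are assumptions, not the authors' code.

    ```python
    import torch

    def block_to_prune(hidden_states, n):
        # hidden_states: list of (batch, seq, dim) tensors, e.g. collected with
        # output_hidden_states=True; returns the start index of the best block.
        best, best_dist = None, float('inf')
        for l in range(len(hidden_states) - n):
            a = hidden_states[l][:, -1]        # last-token representation at layer l
            b = hidden_states[l + n][:, -1]    # and n layers deeper
            cos = torch.nn.functional.cosine_similarity(a, b, dim=-1).mean()
            dist = torch.arccos(cos.clamp(-1, 1))
            if dist < best_dist:
                best, best_dist = l, dist
        return best
    ```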

    ","tags":["transformers","efficient_dl","pruning","quantization"]},{"location":"100%20Reference%20notes/101%20Literature/TiC-CLIP%20-%20Continual%20Training%20of%20CLIP%20models/","title":"TiC CLIP Continual Training of CLIP models","text":"Properties authors Saurabh Garg, Mehrdad Farajtabar, Hadi Pouransari, Raviteja Vemulapalli, Sachin Mehta, Oncel Tuzel, Vaishaal Shankar, Fartash Faghri year 2024 url https://arxiv.org/abs/2310.16226

    Abstract

    Keeping large foundation models up to date on latest data is inherently expensive. To avoid the prohibitive costs of constantly retraining, it is imperative to continually train these models. This problem is exacerbated by the lack of any large scale continual learning benchmarks or baselines. We introduce the first set of web-scale Time-Continual (TiC) benchmarks for training vision-language models: TiC-DataComp, TiC-YFCC, and TiC-Redcaps. TiC-DataComp, our largest dataset, contains over 12.7B timestamped image-text pairs spanning 9 years (2014-2022). We first use our benchmarks to curate various dynamic evaluations to measure temporal robustness of existing models. We show OpenAI's CLIP (trained on data up to 2020) loses\u00a0\u22488%\u00a0zero-shot accuracy on our curated retrieval task from 2021-2022 compared with more recently trained models in OpenCLIP repository. We then study how to efficiently train models on time-continuous data. We demonstrate that a simple rehearsal-based approach that continues training from the last checkpoint and replays old data reduces compute by\u00a02.5\u00d7\u00a0when compared to the standard practice of retraining from scratch. Code is available at\u00a0this https URL.
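    A minimal sketch of the rehearsal idea, resuming from the last checkpoint and mixing each batch with replayed older data; the 50/50 split and uniform sampling are assumptions rather than the benchmark's recipe:

    ```python
    import random

    def make_continual_batches(new_data, old_data, replay_frac=0.5, batch_size=256):
        # Yield batches that mix freshly timestamped samples with replayed old samples.
        n_old = int(batch_size * replay_frac)
        while True:
            batch = random.sample(new_data, batch_size - n_old) + random.sample(old_data, n_old)
            random.shuffle(batch)
            yield batch
    ```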

    ","tags":["paper","continual_learning","multimodal"]},{"location":"100%20Reference%20notes/101%20Literature/Training%20quantized%20nets%20-%20A%20deeper%20understanding/","title":"Training quantized nets A deeper understanding","text":"Properties authors Hao Li, Soham De, Zheng Xu, Christoph Studer, Hanan Samet, Tom Goldstein year 2017 url https://arxiv.org/abs/1706.02379

    Abstract

    Currently, deep neural networks are deployed on low-power portable devices by first training a full-precision model using powerful hardware, and then deriving a corresponding low-precision model for efficient inference on such systems. However, training models directly with coarsely quantized weights is a key step towards learning on embedded platforms that have limited computing resources, memory capacity, and power consumption. Numerous recent publications have studied methods for training quantized networks, but these studies have mostly been empirical. In this work, we investigate training methods for quantized neural networks from a theoretical viewpoint. We first explore accuracy guarantees for training methods under convexity assumptions. We then look at the behavior of these algorithms for non-convex problems, and show that training algorithms that exploit high-precision representations have an important greedy search phase that purely quantized training methods lack, which explains the difficulty of training using low-precision arithmetic.
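    A minimal sketch of the distinction the paper analyzes: keep a high-precision master copy of the weights, quantize only in the forward pass (straight-through gradient), and apply updates to the full-precision copy; purely quantized training would update the quantized weights directly and lose the greedy search phase. Values and shapes are illustrative.

    ```python
    import torch

    def quantize(w, bits=2):
        # Uniform symmetric quantization to 2**bits levels (illustrative).
        scale = w.abs().max() / (2 ** (bits - 1) - 1 + 1e-12)
        levels = 2 ** (bits - 1)
        return torch.round(w / scale).clamp(-levels, levels - 1) * scale

    w_fp = torch.randn(10, 10, requires_grad=True)     # high-precision master weights
    opt = torch.optim.SGD([w_fp], lr=0.1)
    x, target = torch.randn(4, 10), torch.randn(4, 10)

    for _ in range(5):
        # Straight-through estimator: quantized forward, identity gradient to w_fp.
        w_q = quantize(w_fp).detach() + w_fp - w_fp.detach()
        loss = ((x @ w_q - target) ** 2).mean()
        opt.zero_grad(); loss.backward(); opt.step()
    ```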

    ","tags":["paper","quantization"]},{"location":"100%20Reference%20notes/101%20Literature/Training%20quantized%20nets%20-%20A%20deeper%20understanding/#notes","title":"Notes","text":"
    • Read paper
    ","tags":["paper","quantization"]},{"location":"100%20Reference%20notes/101%20Literature/Understanding%20Deep%20Learning%20-%20Chapter%2010/","title":"Understanding Deep Learning Chapter 10","text":"Properties authors Simon J.D. Prince year 2023 url https://udlbook.github.io/udlbook/","tags":["textbook","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Understanding%20Deep%20Learning%20-%20Chapter%2020/","title":"Understanding Deep Learning Chapter 20","text":"Properties authors Simon J.D. Prince year 2023 url https://udlbook.github.io/udlbook/","tags":["textbook","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Understanding%20Deep%20Learning%20-%20Chapter%2020/#chapter-20-why-does-deep-learning-work","title":"Chapter 20: Why does deep learning work?","text":"

    Contents

    • 20.1 The case against deep learning
    • 20.2 Factors that influence fitting performance
    • 20.3 Properties of loss functions
    • 20.4 Factors that determine generalization
    • 20.5 Do we need so many parameters?
    • 20.6 Do networks have to be deep?
    • 20.7 Summary
    ","tags":["textbook","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Understanding%20symmetries%20in%20deep%20networks/","title":"Understanding symmetries in deep networks","text":"Properties authors Vijay Badrinarayanan, Bamdev Mishra, Roberto Cipolla year 2015 url https://arxiv.org/abs/1511.01029

    Abstract

    Recent works have highlighted scale invariance or symmetry present in the weight space of a typical deep network and the adverse effect it has on the Euclidean gradient based stochastic gradient descent optimization. In this work, we show that a commonly used deep network, which uses convolution, batch normalization, reLU, max-pooling, and sub-sampling pipeline, possess more complex forms of symmetry arising from scaling-based reparameterization of the network weights. We propose to tackle the issue of the weight space symmetry by constraining the filters to lie on the unit-norm manifold. Consequently, training the network boils down to using stochastic gradient descent updates on the unit-norm manifold. Our empirical evidence based on the MNIST dataset shows that the proposed updates improve the test performance beyond what is achieved with batch normalization and without sacrificing the computational efficiency of the weight updates.
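    A crude sketch of the constraint: renormalize each filter after every SGD step so it stays on the unit-norm manifold. The paper uses proper manifold (Riemannian) updates rather than this simple projection.

    ```python
    import torch

    def unit_norm_project_(conv_weight):
        # Rescale each output filter of a conv/linear weight to unit L2 norm in place.
        with torch.no_grad():
            flat = conv_weight.view(conv_weight.size(0), -1)
            flat.div_(flat.norm(dim=1, keepdim=True).clamp_min(1e-12))
    ```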


    ","tags":["dl_theory","dl2"]},{"location":"100%20Reference%20notes/101%20Literature/Using%20Degeneracy%20in%20the%20Loss%20Landscape%20for%20Mechanistic%20Interpretability/","title":"Using Degeneracy in the Loss Landscape for Mechanistic Interpretability","text":"Properties authors Lucius Bushnaq, Jake Mendel, Stefan Heimersheim, Dan Braun, Nicholas Goldowsky-Dill, Kaarel H\u00e4nni, Cindy Wu, Marius Hobbhahn year 2024 url https://arxiv.org/abs/2405.10927

    Abstract

    Mechanistic Interpretability aims to reverse engineer the algorithms implemented by neural networks by studying their weights and activations. An obstacle to reverse engineering neural networks is that many of the parameters inside a network are not involved in the computation being implemented by the network. These degenerate parameters may obfuscate internal structure. Singular learning theory teaches us that neural network parameterizations are biased towards being more degenerate, and parameterizations with more degeneracy are likely to generalize further. We identify 3 ways that network parameters can be degenerate: linear dependence between activations in a layer; linear dependence between gradients passed back to a layer; ReLUs which fire on the same subset of datapoints. We also present a heuristic argument that modular networks are likely to be more degenerate, and we develop a metric for identifying modules in a network that is based on this argument. We propose that if we can represent a neural network in a way that is invariant to reparameterizations that exploit the degeneracies, then this representation is likely to be more interpretable, and we provide some evidence that such a representation is likely to have sparser interactions. We introduce the Interaction Basis, a tractable technique to obtain a representation that is invariant to degeneracies from linear dependence of activations or Jacobians.

    ","tags":["paper","dl_theory","mechinterp","optimization"]},{"location":"100%20Reference%20notes/101%20Literature/ViDT%20-%20An%20Efficient%20and%20Effective%20Fully%20Transformer-based%20Object%20Detector/","title":"ViDT An Efficient and Effective Fully Transformer based Object Detector","text":"Properties authors Hwanjun Song, Deqing Sun, Sanghyuk Chun, Varun Jampani, Dongyoon Han, Byeongho Heo, Wonjae Kim, Ming-Hsuan Yang year 2021 url https://arxiv.org/abs/2110.03921

    Abstract

    Transformers are transforming the landscape of computer vision, especially for recognition tasks. Detection transformers are the first fully end-to-end learning systems for object detection, while vision transformers are the first fully transformer-based architecture for image classification. In this paper, we integrate Vision and Detection Transformers (ViDT) to build an effective and efficient object detector. ViDT introduces a reconfigured attention module to extend the recent Swin Transformer to be a standalone object detector, followed by a computationally efficient transformer decoder that exploits multi-scale features and auxiliary techniques essential to boost the detection performance without much increase in computational load. Extensive evaluation results on the Microsoft COCO benchmark dataset demonstrate that ViDT obtains the best AP and latency trade-off among existing fully transformer-based object detectors, and achieves 49.2AP owing to its high scalability for large models. We will release the code and trained models at\u00a0this https URL

    ","tags":["paper","object_detection","vit","computer_vision"]},{"location":"100%20Reference%20notes/101%20Literature/Vision%20Mamba%20-%20Efficient%20Visual%20Representation%20Learning%20with%20Bidirectional%20State%20Space%20Model/","title":"Vision Mamba Efficient Visual Representation Learning with Bidirectional State Space Model","text":"Properties authors Lianghui Zhu, Bencheng Liao, Qian Zhang, Xinlong Wang, Wenyu Liu, Xinggang Wang year 2024 url https://arxiv.org/abs/2401.09417

    Abstract

    Recently the state space models (SSMs) with efficient hardware-aware designs, i.e., the Mamba deep learning model, have shown great potential for long sequence modeling. Meanwhile building efficient and generic vision backbones purely upon SSMs is an appealing direction. However, representing visual data is challenging for SSMs due to the position-sensitivity of visual data and the requirement of global context for visual understanding. In this paper, we show that the reliance on self-attention for visual representation learning is not necessary and propose a new generic vision backbone with bidirectional Mamba blocks (Vim), which marks the image sequences with position embeddings and compresses the visual representation with bidirectional state space models. On ImageNet classification, COCO object detection, and ADE20k semantic segmentation tasks, Vim achieves higher performance compared to well-established vision transformers like DeiT, while also demonstrating significantly improved computation & memory efficiency. For example, Vim is 2.8\u00d7\u00a0faster than DeiT and saves 86.8% GPU memory when performing batch inference to extract features on images with a resolution of 1248\u00d71248. The results demonstrate that Vim is capable of overcoming the computation & memory constraints on performing Transformer-style understanding for high-resolution images and it has great potential to be the next-generation backbone for vision foundation models. Code is available at\u00a0this https URL.

    ","tags":["transformers","mamba","ssm","efficient_dl"]},{"location":"100%20Reference%20notes/101%20Literature/Vision%20Transformers%20Need%20Registers/","title":"Vision Transformers Need Registers","text":"Properties authors Timoth\u00e9e Darcet, Maxime Oquab, Julien Mairal, Piotr Bojanowski year 2023 url https://arxiv.org/pdf/2309.16588

    Abstract

    Transformers have recently emerged as a powerful tool for learning visual representations. In this paper, we identify and characterize artifacts in feature maps of both supervised and self-supervised ViT networks. The artifacts correspond to high-norm tokens appearing during inference primarily in low-informative background areas of images, that are repurposed for internal computations. We propose a simple yet effective solution based on providing additional tokens to the input sequence of the Vision Transformer to fill that role. We show that this solution fixes that problem entirely for both supervised and self-supervised models, sets a new state of the art for self-supervised visual models on dense visual prediction tasks, enables object discovery methods with larger models, and most importantly leads to smoother feature maps and attention maps for downstream visual processing.
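    A minimal sketch of the proposed fix: append a few learnable register tokens to the patch-token sequence before the transformer blocks and discard them at the output. The wrapped blocks module and token shapes are assumptions.

    ```python
    import torch
    import torch.nn as nn

    class WithRegisters(nn.Module):
        # Assumes `blocks` maps a token sequence (B, N, D) -> (B, N, D).
        def __init__(self, blocks: nn.Module, dim: int, n_registers: int = 4):
            super().__init__()
            self.blocks = blocks
            self.registers = nn.Parameter(torch.zeros(1, n_registers, dim))

        def forward(self, tokens):                        # tokens: (B, N, D)
            b = tokens.size(0)
            reg = self.registers.expand(b, -1, -1)
            out = self.blocks(torch.cat([tokens, reg], dim=1))
            return out[:, : tokens.size(1)]               # drop register outputs
    ```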

    ","tags":["paper","vit","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/Vision%20Transformers%20Need%20Registers/#note","title":"Note","text":"
    • note to myself:
      • Read paper in depth #personal \ud83d\udd3c
    ","tags":["paper","vit","dl_theory"]},{"location":"100%20Reference%20notes/101%20Literature/What%20Do%20Self-Supervised%20Vision%20Transformers%20Learn%3F/","title":"What Do Self Supervised Vision Transformers Learn?","text":"Properties authors Namuk Park, Wonjae Kim, Byeongho Heo, Taekyung Kim, Sangdoo Yun year 2023 url https://arxiv.org/abs/2305.00729

    Abstract

    We present a comparative study on how and why contrastive learning (CL) and masked image modeling (MIM) differ in their representations and in their performance of downstream tasks. In particular, we demonstrate that self-supervised Vision Transformers (ViTs) have the following properties: (1) CL trains self-attentions to capture longer-range global patterns than MIM, such as the shape of an object, especially in the later layers of the ViT architecture. This CL property helps ViTs linearly separate images in their representation spaces. However, it also makes the self-attentions collapse into homogeneity for all query tokens and heads. Such homogeneity of self-attention reduces the diversity of representations, worsening scalability and dense prediction performance. (2) CL utilizes the low-frequency signals of the representations, but MIM utilizes high-frequencies. Since low- and high-frequency information respectively represent shapes and textures, CL is more shape-oriented and MIM more texture-oriented. (3) CL plays a crucial role in the later layers, while MIM mainly focuses on the early layers. Upon these analyses, we find that CL and MIM can complement each other and observe that even the simplest harmonization can help leverage the advantages of both methods. The code is available at\u00a0this https URL.

    ","tags":["paper","dl_theory","vit","transformers"]},{"location":"100%20Reference%20notes/101%20Literature/What%20Do%20Self-Supervised%20Vision%20Transformers%20Learn%3F/#notes","title":"Notes","text":"

    Another certified banger\u2122 by Naver AI Lab. Also check How do vision transformers work? (the link might not be working because of the question mark in the name, will fix later).

    • Add annotations from Zotero \ud83d\udd3d
    ","tags":["paper","dl_theory","vit","transformers"]},{"location":"100%20Reference%20notes/102%20Authors/Albert%20Gu/","title":"Albert Gu","text":"Properties affiliation Carnegie Mellon University"},{"location":"100%20Reference%20notes/102%20Authors/Alex%20Flinth/","title":"Alex Flinth","text":"Properties affiliation Umea University"},{"location":"100%20Reference%20notes/102%20Authors/Alexander%20Kirillov/","title":"Alexander Kirillov","text":"Properties affiliation OpenAI, FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Alexey%20Dosovitskiy/","title":"Alexey Dosovitskiy","text":"Properties affiliation Google"},{"location":"100%20Reference%20notes/102%20Authors/Ananya%20Kumar/","title":"Ananya Kumar","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Andreas%20Loukas/","title":"Andreas Loukas","text":"Properties affiliation EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Andreas%20Savakis/","title":"Andreas Savakis","text":"Properties affiliation Rochester Institute of Technology"},{"location":"100%20Reference%20notes/102%20Authors/Angela%20Fan/","title":"Angela Fan","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Annie%20S.%20Chen/","title":"Annie S. Chen","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Antonio%20Orvieto/","title":"Antonio Orvieto","text":"Properties affiliation Max Planck Institute for Intelligent Systems"},{"location":"100%20Reference%20notes/102%20Authors/Ardavan%20Pedram/","title":"Ardavan Pedram","text":"Properties affiliation Stanford, Samsung"},{"location":"100%20Reference%20notes/102%20Authors/Armand%20Joulin/","title":"Armand Joulin","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Attila%20Lengyel/","title":"Attila Lengyel","text":"Properties affiliation TU Delft"},{"location":"100%20Reference%20notes/102%20Authors/Boshi%20Wang/","title":"Boshi Wang","text":"Properties affiliation The Ohio State University"},{"location":"100%20Reference%20notes/102%20Authors/Byeongho%20Heo/","title":"Byeongho Heo","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Caglar%20Gulcehre/","title":"Caglar Gulcehre","text":"Properties affiliation CLAIRE, EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Carmen%20Amo%20Alonso/","title":"Carmen Amo Alonso","text":"Properties affiliation ETH Zurich"},{"location":"100%20Reference%20notes/102%20Authors/Cees%20G.%20M.%20Snoek/","title":"Cees G. M. Snoek","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Chelsea%20Finn/","title":"Chelsea Finn","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Chong%20Wang/","title":"Chong Wang","text":"Properties affiliation Apple, Princeton University"},{"location":"100%20Reference%20notes/102%20Authors/Christopher%20Olah/","title":"Christopher Olah","text":"Properties affiliation Anthropic"},{"location":"100%20Reference%20notes/102%20Authors/Daniel%20M.%20Roy/","title":"Daniel M. Roy","text":"Properties affiliation Vector Institute"},{"location":"100%20Reference%20notes/102%20Authors/Daniel%20Ulbricht/","title":"Daniel Ulbricht","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/David%20M.%20Knigge/","title":"David M. 
Knigge","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/David%20W.%20Romero/","title":"David W. Romero","text":"Properties affiliation Vrije Universiteit Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Diane%20Larlus/","title":"Diane Larlus","text":"Properties affiliation Naver Labs Europe"},{"location":"100%20Reference%20notes/102%20Authors/Donghyun%20Kim/","title":"Donghyun Kim","text":"Properties affiliation Naver Cloud AI"},{"location":"100%20Reference%20notes/102%20Authors/Dongyoon%20Han/","title":"Dongyoon Han","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Duy-Kien%20Nguyen/","title":"Duy Kien Nguyen","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Edward%20J.%20Hu/","title":"Edward J. Hu","text":"Properties affiliation Microsoft"},{"location":"100%20Reference%20notes/102%20Authors/Edward%20Z.%20Yang/","title":"Edward Z. Yang","text":"Properties affiliation FAIR, Stanford, MIT, PyTorch

    Notes: - Has a pretty cool YouTube channel where he shares (bi-weekly) PyTorch meetings. - For me, it's a nice source for getting more involved with PyTorch compiler-ish libraries/tools like [[ExecuTorch|ExecuTorch]] and [[torch.export|torch.export]]. - It is also interesting to see the interaction between the engineers.

    "},{"location":"100%20Reference%20notes/102%20Authors/Eric%20Mintun/","title":"Eric Mintun","text":"Properties affiliation FAIR, UC Santa Barbara"},{"location":"100%20Reference%20notes/102%20Authors/Erik%20J.%20Bekkers/","title":"Erik J. Bekkers","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Eshan%20Verma/","title":"Eshan Verma","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Fahim%20Tajwar/","title":"Fahim Tajwar","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Fartash%20Faghri/","title":"Fartash Faghri","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Francisco%20Massa/","title":"Francisco Massa","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Fred%20Hohman/","title":"Fred Hohman","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Furu%20Wei/","title":"Furu Wei","text":"Properties affiliation Microsoft"},{"location":"100%20Reference%20notes/102%20Authors/Gabriel%20Synnaeve/","title":"Gabriel Synnaeve","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Gintare%20Karolina%20Dziugaite/","title":"Gintare Karolina Dziugaite","text":"Properties affiliation Google DeepMind"},{"location":"100%20Reference%20notes/102%20Authors/Hadi%20Pouransari/","title":"Hadi Pouransari","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Han%20Cai/","title":"Han Cai","text":"Properties affiliation MIT, Shanghai Jiao Tong University"},{"location":"100%20Reference%20notes/102%20Authors/Hanzi%20Mao/","title":"Hanzi Mao","text":"Properties affiliation FAIR, NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Haoxiang%20Wang/","title":"Haoxiang Wang","text":"Properties affiliation Apple, University of Illinois at Urbana-Champaign"},{"location":"100%20Reference%20notes/102%20Authors/Herv%C3%A9%20Jegou/","title":"Herv\u00e9 Jegou","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Huaxiu%20Yao/","title":"Huaxiu Yao","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Hugo%20Touvron/","title":"Hugo Touvron","text":"Properties affiliation FAIR, Sorbonne University"},{"location":"100%20Reference%20notes/102%20Authors/Huizi%20Mao/","title":"Huizi Mao","text":"Properties affiliation NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Isha%20Garg/","title":"Isha Garg","text":"Properties affiliation Purdue University, Apple"},{"location":"100%20Reference%20notes/102%20Authors/Ishan%20Misra/","title":"Ishan Misra","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Jan%20E.%20Gerken/","title":"Jan E. 
Gerken","text":"Properties affiliation Chalmers University of Technology"},{"location":"100%20Reference%20notes/102%20Authors/Javier%20Maass%20Martinez/","title":"Javier Maass Martinez","text":"Properties affiliation University of Chile"},{"location":"100%20Reference%20notes/102%20Authors/Jean-Baptiste%20Cordonnier/","title":"Jean Baptiste Cordonnier","text":"Properties affiliation EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Jeff%20Pool/","title":"Jeff Pool","text":"Properties affiliation NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Jesse%20Cai/","title":"Jesse Cai","text":"Properties affiliation Meta, UCLA, PyTorch"},{"location":"100%20Reference%20notes/102%20Authors/Jing%20Pu/","title":"Jing Pu","text":"Properties affiliation Google, Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Joaquin%20Fontbona/","title":"Joaquin Fontbona","text":"Properties affiliation University of Chile"},{"location":"100%20Reference%20notes/102%20Authors/John%20Denker/","title":"John Denker","text":"Properties affiliation Nokia Bell Labs"},{"location":"100%20Reference%20notes/102%20Authors/John%20Tran/","title":"John Tran","text":"Properties affiliation NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Julien%20Mairal/","title":"Julien Mairal","text":"Properties affiliation INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Juliette%20Marrie/","title":"Juliette Marrie","text":"Properties affiliation Naver Labs Europe, INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Kaiming%20He/","title":"Kaiming He","text":"Properties affiliation FAIR, MIT"},{"location":"100%20Reference%20notes/102%20Authors/Kamyar%20Azizzadenesheli/","title":"Kamyar Azizzadenesheli","text":"Properties affiliation NVIDIA, Purdue University"},{"location":"100%20Reference%20notes/102%20Authors/Kaushik%20Roy/","title":"Kaushik Roy","text":"Properties affiliation Purdue University"},{"location":"100%20Reference%20notes/102%20Authors/Lawrence%20Chan/","title":"Lawrence Chan","text":"Properties affiliation UC Berkeley"},{"location":"100%20Reference%20notes/102%20Authors/Lucius%20Bushnaq/","title":"Lucius Bushnaq","text":"Properties affiliation Apollo Research"},{"location":"100%20Reference%20notes/102%20Authors/Maciej%20Wo%C5%82czyk/","title":"Maciej Wo\u0142czyk","text":"Properties affiliation IDEAS NCBR"},{"location":"100%20Reference%20notes/102%20Authors/Mahmoud%20Assran/","title":"Mahmoud Assran","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Marc%20Finzi/","title":"Marc Finzi","text":"Properties affiliation New York University"},{"location":"100%20Reference%20notes/102%20Authors/Mark%20A.%20Horowitz/","title":"Mark A. Horowitz","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Martin%20Jaggi/","title":"Martin Jaggi","text":"Properties affiliation EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Martin%20R.%20Oswald/","title":"Martin R. 
Oswald","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Mathilde%20Caron/","title":"Mathilde Caron","text":"Properties affiliation FAIR, INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Maxime%20Oquab/","title":"Maxime Oquab","text":"Properties affiliation FAIR, INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Mehrdad%20Farajtabar/","title":"Mehrdad Farajtabar","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Michael%20Arbel/","title":"Michael Arbel","text":"Properties affiliation INRIA"},{"location":"100%20Reference%20notes/102%20Authors/Mohammad%20Rastegari/","title":"Mohammad Rastegari","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Namuk%20Park/","title":"Namuk Park","text":"Properties affiliation Naver AI Lab, Prescient Design, Genentech"},{"location":"100%20Reference%20notes/102%20Authors/Navin%20Ranjan/","title":"Navin Ranjan","text":"Properties affiliation Rochester Institute of Technology"},{"location":"100%20Reference%20notes/102%20Authors/Neel%20Nanda/","title":"Neel Nanda","text":"Properties affiliation Google DeepMind, Anthropic"},{"location":"100%20Reference%20notes/102%20Authors/Nicolas%20Carion/","title":"Nicolas Carion","text":"Properties affiliation New York University"},{"location":"100%20Reference%20notes/102%20Authors/Nicolas%20Usunier/","title":"Nicolas Usunier","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Oncel%20Tuzel/","title":"Oncel Tuzel","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Patrick%20Forr%C3%A9/","title":"Patrick Forr\u00e9","text":"Properties affiliation University of Amsterdam"},{"location":"100%20Reference%20notes/102%20Authors/Pavan%20Kumar%20Anasosalu%20Vasu/","title":"Pavan Kumar Anasosalu Vasu","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Percy%20Liang/","title":"Percy Liang","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Piotr%20Bojanowski/","title":"Piotr Bojanowski","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Raviteja%20Vemulapalli/","title":"Raviteja Vemulapalli","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Razvan%20Pascanu/","title":"Razvan Pascanu","text":"Properties affiliation Google DeepMind"},{"location":"100%20Reference%20notes/102%20Authors/Robin%20Walters/","title":"Robin Walters","text":"Properties affiliation Northeastern University"},{"location":"100%20Reference%20notes/102%20Authors/Rose%20Yu/","title":"Rose Yu","text":"Properties affiliation UC San Diego"},{"location":"100%20Reference%20notes/102%20Authors/Ross%20Girshick/","title":"Ross Girshick","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Rui%20Wang/","title":"Rui Wang","text":"Properties affiliation MIT, UC San Diego"},{"location":"100%20Reference%20notes/102%20Authors/Ruoming%20Pang/","title":"Ruoming Pang","text":"Properties affiliation Apple, Princeton University"},{"location":"100%20Reference%20notes/102%20Authors/Sachin%20Mehta/","title":"Sachin Mehta","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Sangdoo%20Yun/","title":"Sangdoo Yun","text":"Properties affiliation Naver AI 
Lab"},{"location":"100%20Reference%20notes/102%20Authors/Sanghyuk%20Chun/","title":"Sanghyuk Chun","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Sara%20Solla/","title":"Sara Solla","text":"Properties affiliation Northwestern University"},{"location":"100%20Reference%20notes/102%20Authors/Sergey%20Zagoruyko/","title":"Sergey Zagoruyko","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Shaohan%20Huang/","title":"Shaohan Huang","text":"Properties affiliation Microsoft"},{"location":"100%20Reference%20notes/102%20Authors/Simon%20J.D.%20Prince/","title":"Simon J.D. Prince","text":"Properties affiliation University of Bath"},{"location":"100%20Reference%20notes/102%20Authors/Skander%20Moalla/","title":"Skander Moalla","text":"Properties affiliation CLAIRE, EPFL"},{"location":"100%20Reference%20notes/102%20Authors/Soham%20De/","title":"Soham De","text":"Properties affiliation Google DeepMind, University of Maryland"},{"location":"100%20Reference%20notes/102%20Authors/Song%20Han/","title":"Song Han","text":"Properties affiliation MIT"},{"location":"100%20Reference%20notes/102%20Authors/Songkuk%20Kim/","title":"Songkuk Kim","text":"Properties affiliation Yonsei University"},{"location":"100%20Reference%20notes/102%20Authors/Sourya%20Basu/","title":"Sourya Basu","text":"Properties affiliation University of Illinois at Urbana-Champaign, IBM Research"},{"location":"100%20Reference%20notes/102%20Authors/St%C3%A9phane%20d%27Ascoli/","title":"St\u00e9phane d'Ascoli","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Sukjun%20Hwang/","title":"Sukjun Hwang","text":"Properties affiliation Carnegie Mellon University"},{"location":"100%20Reference%20notes/102%20Authors/Taekyung%20Kim/","title":"Taekyung Kim","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Tete%20Xiao/","title":"Tete Xiao","text":"Properties affiliation FAIR

    Associations: FAIR, UC Berkeley

    "},{"location":"100%20Reference%20notes/102%20Authors/Tom%20Gunter/","title":"Tom Gunter","text":"Properties affiliation Apple, University of Oxford"},{"location":"100%20Reference%20notes/102%20Authors/Tom%20Lieberum/","title":"Tom Lieberum","text":"Properties affiliation University of Amsterdam, Google DeepMind"},{"location":"100%20Reference%20notes/102%20Authors/Vaibhav%20Aggarwal/","title":"Vaibhav Aggarwal","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/William%20J.%20Dally/","title":"William J. Dally","text":"Properties affiliation Stanford, NVIDIA"},{"location":"100%20Reference%20notes/102%20Authors/Wonjae%20Kim/","title":"Wonjae Kim","text":"Properties affiliation Naver AI Lab"},{"location":"100%20Reference%20notes/102%20Authors/Xiang%20Yue/","title":"Xiang Yue","text":"Properties affiliation Carnegie Mellon University"},{"location":"100%20Reference%20notes/102%20Authors/Xingyu%20Liu/","title":"Xingyu Liu","text":"Properties affiliation Carnegie Mellon University"},{"location":"100%20Reference%20notes/102%20Authors/Xinlei%20Chen/","title":"Xinlei Chen","text":"Properties affiliation FAIR, Zhejiang University, Carnegie Mellon University, Zhejiang University"},{"location":"100%20Reference%20notes/102%20Authors/Xiuying%20Wei/","title":"Xiuying Wei","text":"Properties affiliation EPFL, CLAIRE"},{"location":"100%20Reference%20notes/102%20Authors/Xu%20Ma/","title":"Xu Ma","text":"Properties affiliation Northeastern University"},{"location":"100%20Reference%20notes/102%20Authors/Xun%20Wu/","title":"Xun Wu","text":"Properties affiliation Microsoft, Tsinghua University"},{"location":"100%20Reference%20notes/102%20Authors/Yanghao%20Li/","title":"Yanghao Li","text":"Properties affiliation FAIR, Apple"},{"location":"100%20Reference%20notes/102%20Authors/Yann%20LeCun/","title":"Yann LeCun","text":"Properties affiliation FAIR, New York University"},{"location":"100%20Reference%20notes/102%20Authors/Yelong%20Shen/","title":"Yelong Shen","text":"Properties affiliation Microsoft"},{"location":"100%20Reference%20notes/102%20Authors/Yoonho%20Lee/","title":"Yoonho Lee","text":"Properties affiliation Stanford"},{"location":"100%20Reference%20notes/102%20Authors/Zeyuan%20Allen-Zhu/","title":"Zeyuan Allen Zhu","text":"Properties affiliation FAIR"},{"location":"100%20Reference%20notes/102%20Authors/Zhuoyang%20Zhang/","title":"Zhuoyang Zhang","text":"Properties affiliation NVIDIA, Tsinghua University"},{"location":"100%20Reference%20notes/102%20Authors/Ziaoyi%20Zhang/","title":"Ziaoyi Zhang","text":"Properties affiliation Apple"},{"location":"100%20Reference%20notes/102%20Authors/Zirui%20Wang/","title":"Zirui Wang","text":"Properties affiliation Apple, Google, Carnegie Mellon University"},{"location":"100%20Reference%20notes/103%20Affiliations/CLAIRE/","title":"CLAIRE","text":""},{"location":"100%20Reference%20notes/103%20Affiliations/CLAIRE/#three-essential-pilars-of-the-lab","title":"Three essential pilars of the lab","text":"

    Efficient deep learning algorithms

    • Efficient RL
    • Sample efficient learning algorithms
    • Model Recycling
    • Efficient sequence models

    Robust, safe and responsible algorithms

    • RLHF/Alignment
    • Uncertainty aware/Bayesian algorithms
    • Offline RL
    • Active learning/Human in the loop algorithms
    • Better evaluations

    Improving reasoning: Moving from system 1 to system 2 level thinking

    • Improving reasoning
    • Creativity
    • Deliberation
    • Causality
    • Imagination
    • Planning
    "},{"location":"100%20Reference%20notes/103%20Affiliations/CLAIRE/#notes","title":"Notes","text":"
    • omg, this is amazing
    • Note to self: Look at CLAIRE's research \u23eb
    "},{"location":"100%20Reference%20notes/103%20Affiliations/FAIR/","title":"FAIR","text":"

    Related: FAIR

    "},{"location":"100%20Reference%20notes/103%20Affiliations/Naver%20Labs%20Europe/","title":"Naver Labs Europe","text":"

    Related to Naver AI Lab

    "},{"location":"100%20Reference%20notes/104%20Other/EPFL-CS439%20-%20Optimization%20for%20Machine%20Learning/","title":"EPFL CS439 Optimization for Machine Learning","text":"Properties authors Martin Jaggi, Nicolas Flammarion year 2024 url https://github.com/epfml/OptML_course/tree/master

    Abstract

    This course teaches an overview of modern mathematical optimization methods, for applications in machine learning and data science. In particular, scalability of algorithms to large datasets will be discussed in theory and in implementation.

    Topics

    Convexity, Gradient Methods, Proximal algorithms, Subgradient Methods, Stochastic and Online Variants of mentioned methods, Coordinate Descent, Frank-Wolfe, Accelerated Methods, Primal-Dual context and certificates, Lagrange and Fenchel Duality, Second-Order Methods including Quasi-Newton Methods, Derivative-Free Optimization.

    ","tags":["course","optimization"]},{"location":"100%20Reference%20notes/104%20Other/GPU%20mode%20-%20Sparsity/","title":"GPU mode Sparsity","text":"Properties authors Jesse Cai year 2024 url https://github.com/gpu-mode/lectures/blob/main/lecture_011/sparsity.pptx","tags":["lecture","presentation"]},{"location":"100%20Reference%20notes/104%20Other/GPU%20mode%20-%20Sparsity/#notes","title":"Notes","text":"
    • #todo take notes
    ","tags":["lecture","presentation"]},{"location":"100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/","title":"Introducing Apple\u2019s On Device and Server Foundation Models","text":"Properties year 2024 url https://machinelearning.apple.com/research/introducing-apple-foundation-models","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/#pre-training","title":"## Pre-Training","text":"

    Our foundation models are trained on\u00a0Apple's AXLearn framework, an open-source project we released in 2023. It builds on top of JAX and XLA, and allows us to train the models with high efficiency and scalability on various training hardware and cloud platforms, including TPUs and both cloud and on-premise GPUs. We used a combination of data parallelism, tensor parallelism, sequence parallelism, and Fully Sharded Data Parallel (FSDP) to scale training along multiple dimensions such as data, model, and sequence length.

    ","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/#optimization","title":"Optimization","text":"

    In addition to ensuring our generative models are highly capable, we have used a range of innovative techniques to optimize them on-device and on our private cloud for speed and efficiency. We have applied an extensive set of optimizations for both first token and extended token inference performance.

    Both the on-device and server models use grouped-query-attention. We use shared input and output vocab embedding tables to reduce memory requirements and inference cost. These shared embedding tensors are mapped without duplications. The on-device model uses a vocab size of 49K, while the server model uses a vocab size of 100K, which includes additional language and technical tokens.

    For on-device inference, we use low-Bit Palettization, a critical optimization technique that achieves the necessary memory, power, and performance requirements. To maintain model quality, we developed a new framework using LoRA adapters that incorporates a mixed 2-bit and 4-bit configuration strategy \u2014 averaging 3.5 bits-per-weight \u2014 to achieve the same accuracy as the uncompressed models.

    Additionally, we use an interactive model latency and power analysis tool,\u00a0Talaria, to better guide the bit rate selection for each operation. We also utilize activation quantization and embedding quantization, and have developed an approach to enable efficient Key-Value (KV) cache update on our neural engines.

    References: Talaria - Interactively Optimizing Machine Learning Models for Efficient Inference Notes: - Might be useful to look at KV Cache hardware-dependency

    With this set of optimizations, on iPhone 15 Pro we are able to reach time-to-first-token latency of about 0.6 millisecond per prompt token, and a generation rate of 30 tokens per second. Notably, this performance is attained before employing token speculation techniques, from which we see further enhancement on the token generation rate.

    ","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/#model-adaptation","title":"Model Adaptation","text":"

    Our foundation models are fine-tuned for users\u2019 everyday activities, and can dynamically specialize themselves on-the-fly for the task at hand. We utilize adapters, small neural network modules that can be plugged into various layers of the pre-trained model, to fine-tune our models for specific tasks. For our models we adapt the attention matrices, the attention projection matrix, and the fully connected layers in the point-wise feedforward networks for a suitable set of the decoding layers of the transformer architecture.

Notes: - How do you adapt the attention matrices? Is it like a bias? `A[i][j] += lora[i][j]`? - Attention projection matrix, I suppose, refers to the projection matrices \(W_Q, W_K, W_V\)
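The note above asks how the attention matrices are adapted; a minimal LoRA-style sketch (assuming a hypothetical `q_proj` linear layer, with rank 16 to match the adapters described below) would look roughly like this — the adapter is an additive low-rank term on the layer's output, not a bias on the attention scores:

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Wrap a frozen linear layer W with a low-rank update: y = W x + (alpha / r) * B(A(x))."""
    def __init__(self, base: nn.Linear, r: int = 16, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                           # base weights stay frozen
        self.A = nn.Linear(base.in_features, r, bias=False)   # down-projection
        self.B = nn.Linear(r, base.out_features, bias=False)  # up-projection
        nn.init.zeros_(self.B.weight)                         # start as a no-op: y = W x
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * self.B(self.A(x))

# hypothetical usage: wrap the query projection of one attention block
# attn.q_proj = LoRALinear(attn.q_proj, r=16)
```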

    By fine-tuning only the adapter layers, the original parameters of the base pre-trained model remain unchanged, preserving the general knowledge of the model while tailoring the adapter layers to support specific tasks.

    We represent the values of the adapter parameters using 16 bits, and for the ~3 billion parameter on-device model, the parameters for a rank 16 adapter typically require 10s of megabytes. The adapter models can be dynamically loaded, temporarily cached in memory, and swapped \u2014 giving our foundation model the ability to specialize itself on the fly for the task at hand while efficiently managing memory and guaranteeing the operating system's responsiveness.

    To facilitate the training of the adapters, we created an efficient infrastructure that allows us to rapidly retrain, test, and deploy adapters when either the base model or the training data gets updated. The adapter parameters are initialized using\u00a0the accuracy-recovery adapter introduced in the Optimization section.

    ","tags":["efficient_dl"]},{"location":"100%20Reference%20notes/104%20Other/Introduction%20to%20Quantization%20on%20PyTorch/","title":"Introduction to Quantization on PyTorch","text":"Properties authors Raghuraman Krishnamoorthi, James Reed, Min Ni, Chris Gottbrath, Seth Weidman year 2020 url https://pytorch.org/blog/introduction-to-quantization-on-pytorch/","tags":["website","efficient_dl","quantization"]},{"location":"100%20Reference%20notes/104%20Other/Introduction%20to%20Quantization%20on%20PyTorch/#notes","title":"Notes","text":"

    Quantization aware training is typically only used in CNN models when post training static or dynamic quantization doesn\u2019t yield sufficient accuracy. This can occur with models that are highly optimized to achieve small size (such as Mobilenet).

Currently, operator coverage is limited and may restrict the choices; the table below provides a guideline.

| Model Type | Preferred scheme | Why |
| --- | --- | --- |
| LSTM/RNN | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights |
| BERT/Transformer | Dynamic Quantization | Throughput dominated by compute/memory bandwidth for weights |
| CNN | Static Quantization | Throughput limited by memory bandwidth for activations |
| CNN | Quantization Aware Training | In the case where accuracy can't be achieved with static quantization |

Does the Transformer row also apply to vision transformers, given that the number of tokens is quite large?

| Model | Float Latency (ms) | Quantized Latency (ms) | Inference Performance Gain | Device | Notes |
| --- | --- | --- | --- | --- | --- |
| BERT | 581 | 313 | 1.8x | Xeon-D2191 (1.6GHz) | Batch size = 1, Maximum sequence length = 128, Single thread, x86-64, Dynamic quantization |
| Resnet-50 | 214 | 103 | 2x | Xeon-D2191 (1.6GHz) | Single thread, x86-64, Static quantization |
| Mobilenet-v2 | 97 | 17 | 5.7x | Samsung S9 | Static quantization, Floating point numbers are based on Caffe2 run-time and are not optimized |

    So I should expect something around ~2x latency improvement with dynamic quantization
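As a rough illustration of the guideline above, a minimal dynamic-quantization sketch (on a toy transformer-style MLP, not the actual benchmark models) looks like this; only the `nn.Linear` weights are converted to int8 and activations are quantized on the fly:

```python
import torch
import torch.nn as nn

# toy BERT-like feed-forward block; dynamic quantization targets the nn.Linear weights
model = nn.Sequential(nn.Linear(768, 3072), nn.GELU(), nn.Linear(3072, 768)).eval()

quantized = torch.ao.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8  # int8 weights, activations quantized at runtime
)

x = torch.randn(1, 128, 768)
with torch.inference_mode():
    y = quantized(x)
print(quantized)  # the Linear layers are replaced by dynamically quantized versions
```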

    ","tags":["website","efficient_dl","quantization"]},{"location":"100%20Reference%20notes/104%20Other/Let%27s%20talk%20about%20the%20Python%20Dispatcher/","title":"Let's talk about the Python Dispatcher","text":"Properties authors Edward Z. Yang year 2020 url http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/","tags":["blog"]},{"location":"100%20Reference%20notes/104%20Other/MIT-65940%20-%20TinyML%20and%20Efficient%20Deep%20Learning%20Computing/","title":"MIT 65940 TinyML and Efficient Deep Learning Computing","text":"Properties authors Song Han year 2023 url https://hanlab.mit.edu/courses/2023-fall-65940","tags":["course"]},{"location":"100%20Reference%20notes/104%20Other/Optimizing%20Vision%20Transformer%20Model%20for%20Deployment/","title":"Optimizing Vision Transformer Model for Deployment","text":"Properties authors Jeff Tang, Geeta Chauhan year 2021 url https://pytorch.org/tutorials/beginner/vt_tutorial.html","tags":["website"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20Export%20IR%20Specification/","title":"PyTorch ExecuTorch Export IR Specification","text":"Properties authors PyTorch - Functionalization in PyTorch - Everything you need to know year 2024 url https://pytorch.org/executorch/main/ir-exir.html

    The Exported IR is a specification that consists of the following parts:

1. A definition of the computation graph model.
2. A set of operators allowed in the graph.

    A dialect also provides further constraints meant for a specific purpose or stage in some compilation phase. Some dialects are: - aten dialect - edge dialect - backend dialect

ExecuTorch compilation first exports to the ATen dialect, then lowers to the Edge dialect, and finally to a Backend dialect.
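A minimal sketch of that lowering chain, assuming the `executorch.exir` Python APIs (`to_edge`, `to_executorch`); exact names and signatures may differ by version:

```python
import torch
from torch.export import export
from executorch.exir import to_edge  # assumed ExecuTorch API; check the installed version

class M(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x) * 2

aten_program = export(M(), (torch.randn(4),))  # ATen dialect: functionalized ATen graph
edge_program = to_edge(aten_program)           # Edge dialect
et_program = edge_program.to_executorch()      # Backend dialect / ExecuTorch program

with open("model.pte", "wb") as f:
    f.write(et_program.buffer)                 # serialized program for the on-device runtime
```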

    ","tags":["paper"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20Export%20IR%20Specification/#aten-dialect","title":"Aten Dialect","text":"
    • PyTorch Functionalization is performed, removing any tensor aliases and mutations, and allowing for more flexible graph transformations to be made.
    ","tags":["paper"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/","title":"PyTorch ExecuTorch How ExecuTorch works?","text":"Properties authors PyTorch Quantization for TensorRT year 2024 url https://pytorch.org/executorch/main/intro-how-it-works","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#what-are-the-steps-to-run-a-model-with-executorch","title":"What are the steps to run a model with ExecuTorch?","text":"","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#1-export-the-model","title":"1. Export the model","text":"
    • Capture the pytorch program as a graph
    ","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#2-compile-the-exported-model-to-an-executorch-program","title":"2. Compile the exported model to an ExecuTorch program","text":"

    Captured Graph -> ExecuTorch program

Possible optimizations: - Compressing the model (e.g., quantization) - Lowering subgraphs to specialized on-device hardware accelerators to improve latency - Memory planning, i.e., efficiently planning the location of intermediate tensors to reduce the runtime memory footprint.

    ","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#3-run-the-executorch-program-to-a-target-device","title":"3. Run the ExecuTorch program to a target device","text":"
    • Light runtime with memory planning for fast inference :)
    ","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20How%20ExecuTorch%20works%3F/#key-benefits","title":"Key Benefits","text":"
    • Export that is robust and powerful
    • Operator Standardization
    • Standardization for compiler interfaces (aka delegates) and the OSS ecosystem
    • First-party SDK and toolchain
    • Ease of customization
    • Low overhead runtime and execution
    ","tags":["pytorch","compilers","efficient_dl","documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20ExecuTorch%20-%20Quantization%20Overview/","title":"PyTorch ExecuTorch Quantization Overview","text":"Properties authors PyTorch Quantization for TensorRT year 2024 url https://pytorch.org/executorch/main/quantization-overview.html

    { width=\"400\" }

    Quantization is usually tied to execution backends that have quantized operators implemented. Thus each backend is opinionated about how the model should be quantized, expressed in a backend specific\u00a0Quantizer\u00a0class.

    ","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20Functionalization%20in%20PyTorch%20-%20Everything%20you%20need%20to%20know/","title":"PyTorch Functionalization in PyTorch Everything you need to know","text":"Properties authors Brian Hirsh year 2023 url https://dev-discuss.pytorch.org/t/functionalization-in-pytorch-everything-you-wanted-to-know/965

Given a program/function of PyTorch operators, functionalization will return a new function that: 1. Has the same semantics as the old function 2. Has no mutations in it

    Exposed in functorch API.

    Functionalization operates at the level of our ATen API.

    Why? - Compilers don't like mutations: Graph partitioning is harder if nodes have side effects, etc.

    Notes: - PyTorch Functionalization
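A minimal sketch of what functionalization does, using `torch.func.functionalize` together with `make_fx` to trace the result (API locations may vary across PyTorch versions):

```python
import torch
from torch.func import functionalize
from torch.fx.experimental.proxy_tensor import make_fx

def f(x):
    y = x.clone()
    y.add_(1)      # in-place mutation on an intermediate tensor
    return y * 2

# tracing the functionalized version yields a graph with out-of-place ops only
gm = make_fx(functionalize(f))(torch.randn(3))
print(gm.code)     # the add_ mutation is replaced by a functional add
```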

    ","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20PyTorch%202%20Export%20Post%20Training%20Quantization/","title":"PyTorch PyTorch 2 Export Post Training Quantization","text":"Properties authors Jerry Zhang year 2024 url https://pytorch.org/tutorials/prototype/pt2e_quant_ptq.html

    Uses prepare_pt2e and convert_pt2e.

    float_model(Python)                          Example Input\n    \\                                              /\n     \\                                            /\n\u2014-------------------------------------------------------\n|                        export                        |\n\u2014-------------------------------------------------------\n                            |\n                    FX Graph in ATen     Backend Specific Quantizer\n                            |                       /\n\u2014--------------------------------------------------------\n|                     prepare_pt2e                      |\n\u2014--------------------------------------------------------\n                            |\n                     Calibrate/Train\n                            |\n\u2014--------------------------------------------------------\n|                    convert_pt2e                       |\n\u2014--------------------------------------------------------\n                            |\n                    Quantized Model\n                            |\n\u2014--------------------------------------------------------\n|                       Lowering                        |\n\u2014--------------------------------------------------------\n                            |\n        Executorch, Inductor or <Other Backends>\n
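A minimal sketch of that flow, assuming the XNNPACK quantizer and the `export_for_training` capture API from recent PyTorch releases (the PT2E import paths have moved between versions):

```python
import torch
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer, get_symmetric_quantization_config,
)

float_model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 3, 32, 32),)

# 1. export: capture an FX graph in ATen ops
exported = torch.export.export_for_training(float_model, example_inputs).module()

# 2. prepare: insert observers according to the backend-specific quantizer
quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
prepared = prepare_pt2e(exported, quantizer)

# 3. calibrate with representative data
prepared(*example_inputs)

# 4. convert: fold observers into quantize/dequantize ops, ready for lowering
quantized = convert_pt2e(prepared)
```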
    ","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20Quantization/","title":"PyTorch Quantization","text":"Properties authors PyTorch Quantization for TensorRT year 2024 url https://pytorch.org/docs/main/quantization.html#prototype-pytorch-2-export-quantization","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20-%20Quantization/#backendhardware-support","title":"Backend/Hardware Support","text":"Hardware Kernel Library Eager Mode Quantization FX Graph Mode Quantization Quantization Mode Support server CPU fbgemm/onednn Supported All Supported mobile CPU qnnpack/xnnpack server GPU TensorRT (early prototype) Not support this it requires a graph Supported Static Quantization

    Today, PyTorch supports the following backends for running quantized operators efficiently:

    • x86 CPUs with AVX2 support or higher (without AVX2 some operations have inefficient implementations), via\u00a0x86\u00a0optimized by\u00a0fbgemm\u00a0and\u00a0onednn\u00a0(see the details at\u00a0RFC)
    • ARM CPUs (typically found in mobile/embedded devices), via\u00a0qnnpack
    • (early prototype) support for NVidia GPU via\u00a0TensorRT\u00a0through\u00a0fx2trt\u00a0(to be open sourced)

    Note: - This is a bit old, as fx2trt is already available in torch-tensorrt. However, there

    ","tags":["documentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Compilers%20-%20What%20makes%20PyTorch%20beloved%20makes%20it%20hard%20to%20compile/","title":"PyTorch Compilers What makes PyTorch beloved makes it hard to compile","text":"Properties authors Peng Wu year 2022 url https://chips-compilers-mlsys-22.github.io/assets/slides/PyTorch%20Compilers%20(Compiler%20&%20Chips%20Symposium%202022).pdf

Multiple PyTorch compilers:
• TorchScript (torch.jit.script, torch.jit.trace): supports a Python subset, full graph capture = Ahead-of-Time (AOT) Compilation, executed by the TS interpreter (nnc, nvfuser)
• torch.fx
• torch.package, torch.deploy
• torch-mlir
• TorchDynamo, TorchInductor: TorchDynamo captures partial graphs (if strict=False) and falls back to eager.

What makes TorchDynamo graph capture sound and out-of-the-box?
• Partial graph capture: ability to skip unwanted parts of eager
• Guarded graphs: ability to check if a captured graph is valid for execution
  • Note: basically, it inserts assertions/runtime checks to verify at runtime that the partial graph is sound; if not, it JIT-recompiles.
• Just-in-time recapture: recapture a graph if the captured graph is invalid for execution

    Dynamo workflow - Captures FX Graph - Sends FX Graph to compiler hook to compile (which can be another compiler like TRT or torchscript)
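A minimal sketch of that compiler hook: `torch.compile` accepts a custom backend, which receives the captured FX graph plus example inputs and returns a callable (here it just inspects the graph and runs it unmodified):

```python
import torch

def my_backend(gm: torch.fx.GraphModule, example_inputs):
    # the "compiler hook": receives the captured FX graph and returns a callable
    gm.graph.print_tabular()      # inspect the captured ops
    return gm.forward             # no real compilation, just run the graph as-is

@torch.compile(backend=my_backend)
def f(x):
    return torch.sin(x) + x.cos()

f(torch.randn(8))                 # first call triggers capture + the backend hook
```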

    { width=\"800\" }

Note: tbh this seems like an arbitrary separation, because TorchDynamo is also meant for inference (torch.export), but that is probably because this tutorial is 2 years old

    ","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024%20-%20Fast%20Sparse%20Vision%20Transformers%20with%20minimal%20accuracy%20loss/","title":"PyTorch Conference 2024 Fast Sparse Vision Transformers with minimal accuracy loss","text":"Properties authors Jesse Cai year 2024 url https://static.sched.com/hosted_files/pytorch2024/c6/Sparsifying%20ViT%20lightning%20talk%20slides.pdf?_gl=119zah9b_gcl_auMTk3MjgxODE5OC4xNzI3MjU4NDM2FPAU*MTk3MjgxODE5OC4xNzI3MjU4NDM2

    Nice, it is on torchao

Notes: - Don't quite understand what Core or AO means in this context, but at least torch.compile is acknowledged :p

    ","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024%20-%20What%E2%80%99s%20new%20in%20torch.export%3F/","title":"PyTorch Conference 2024 What\u2019s new in torch.export?","text":"Properties authors Avik Chaudhuri year 2024 url https://static.sched.com/hosted_files/pytorch2024/6b/What%E2%80%99s%20new%20in%20torch.export_.pptx.pdf?_gl=11s5cwnu_gcl_au*MTk3MjgxODE5OC4xNzI3MjU4NDM2","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024%20-%20What%E2%80%99s%20new%20in%20torch.export%3F/#recap-what-is-torchexport-and-why","title":"[Recap] What is torch.export and why?","text":"
    • \"Sound\", whole-graph capture of pytorch models
    • Emits \"IR\": backend-agnostic
    • For easier backend-specific lowering (trt, etc)
    • For python-free environments
    ","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024%20-%20What%E2%80%99s%20new%20in%20torch.export%3F/#composable-apis","title":"Composable APIs","text":"
    • Useful: torch.export.export_for_inference
    ","tags":["presentation"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Conference%202024/","title":"PyTorch Conference 2024","text":"Properties year 2024

    Some interesting talks for #efficient_dl : - PyTorch Conference 2024 - What\u2019s new in torch.export? - PyTorch Conference 2024 - Fast Sparse Vision Transformers with minimal accuracy loss

    ","tags":["conference"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20Eager%20Mode%20Quantization%20TensorRT%20Acceleration/","title":"PyTorch Eager Mode Quantization TensorRT Acceleration","text":"Properties authors Lei Mao year 2024 url https://leimao.github.io/blog/PyTorch-Eager-Mode-Quantization-TensorRT-Acceleration/

    Abstract

    The TensorRT acceleration for the quantized PyTorch model from the PyTorch eager mode quantization interface involves three steps:

    1. Perform PyTorch eager mode quantization on the floating-point PyTorch model in PyTorch and export the quantized PyTorch model to ONNX.
    2. Fix the quantized ONNX model graph so that it can be parsed by the TensorRT parser.
3. Build the quantized ONNX model into a TensorRT engine, profile the performance, and verify the accuracy.

    The source code for this post can be found on GitHub .

    ","tags":["website","paper"]},{"location":"100%20Reference%20notes/104%20Other/PyTorch%20internals/","title":"PyTorch internals","text":"Properties authors Edward Z. Yang year 2019 url http://blog.ezyang.com/2019/05/pytorch-internals/

Depending on tensor metadata (whether it's CUDA, sparse, etc.), a call is dispatched to different implementations.

    ","tags":["blog"]},{"location":"100%20Reference%20notes/104%20Other/Quantized%20Transfer%20Learning%20for%20Computer%20Vision%20Tutorial/","title":"Quantized Transfer Learning for Computer Vision Tutorial","text":"Properties authors Zafar Takhirov url https://pytorch.org/tutorials/intermediate/quantized_transfer_learning_tutorial.html","tags":["website"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%2011/","title":"Reinforcement Learning An Introduction Chapter 11","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%2011/#111-semi-gradient-methods","title":"11.1 Semi-gradient Methods","text":"

    Equation 11.1: Per-step importance sampling ratio

    \\[ \\rho_t \\doteq \\rho_{t:T-1} = \\frac{\\pi(A_t \\mid S_t)}{b(A_t \\mid S_t)} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%2011/#todo","title":"todo","text":"","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%2011/#114-linear-value-function-geometry","title":"11.4 Linear Value-function Geometry","text":"

    TODO: - [x] 11.11 mu norm equation \u2705 2024-10-01 - [x] 11.17 and 11.18 bellman error \u2705 2024-10-01 - [ ] 11.19 mean square bellman error

    Equation 11.11: \\(\\mu\\)-norm

    \\[ ||\\mathbf{v}||^2_\\mu \\doteq \\sum_{s \\in \\mathcal{S}} \\mu(s) v(s)^2 \\]

    Equation 11.17 and 11.18: Bellman error

    \\[ \\begin{align} \\bar{\\delta}_{\\mathbf{w}}(s) &\\doteq \\left( \\sum_a \\pi(a \\mid s) \\sum_{s', r} p(s', r \\mid s, a)[r + \\gamma v_{\\mathbf{w}}(s')] \\right) - v_{\\mathbf{w}}(s) \\tag{11.17} \\\\ &= \\mathbb{E}_\\pi[R_{t+1} - \\gamma v_{\\mathbf{w}}(S_{t+1}) - v_{\\mathbf{w}}(S_{t}) \\mid S_t = s, A_t \\sim \\pi] \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%2011/#115-gradient-descent-in-the-bellman-error","title":"11.5 Gradient Descent in the Bellman Error","text":"

    Mean-squared temporal difference error

    \\[ \\begin{align} \\overline{TDE}(\\mathbf{w}) &= \\sum_{s \\in \\mathcal{S}} \\mu(s) \\mathbb{E}\\left[\\delta_t^2 \\mid S_t = s, A_t \\sim \\pi \\right] \\\\ &= \\sum_{s \\in \\mathcal{S}} \\mu(s) \\mathbb{E}\\left[\\rho_t \\delta_t^2 \\mid S_t = s, A_t \\sim b \\right] \\\\ &= \\mathbb{E}_b\\left[\\rho_t \\delta_t^2 \\right] \\end{align} \\]

Equation 11.23: Weight update of the naive residual-gradient algorithm

    \\[ \\begin{align} \\mathbf{w}_{t+1} &= \\mathbf{w}_t - \\frac{1}{2} \\alpha \\nabla(\\rho_t \\delta_t^2) \\\\ &= \\mathbf{w}_t - \\alpha \\rho_t \\delta_t \\nabla(\\delta_t) \\\\ &= \\mathbf{w}_t - \\alpha \\rho_t \\delta_t (\\nabla \\hat{v}(S_t, \\mathbf{w}_t) - \\gamma \\nabla \\hat{v}(S_{t+1}, \\mathbf{w}_t)) \\tag{11.23} \\\\ \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/","title":"Reinforcement Learning An Introduction Chapter 3","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#31-the-agent-environment-interface","title":"3.1 The Agent-Environment Interface","text":"

    Equation 3.1: Trajectory

    \\[ S_0,A_0,R_1,S_1,A_1,R_2,S_2,A_2,R_3, \\dots \\tag{3.1} \\]

    Equation 3.2: MDP dynamics

    \\[ p(s', r \\mid s, a) \\doteq \\Pr \\{ S_t = s', R_t = r \\mid S_{t-1} = s, A_{t-1} = a \\} \\tag{3.2} \\]

You can obtain the state-transition probabilities from the dynamics with the law of total probability, and likewise the expected reward.

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#32-goals-and-rewards","title":"3.2 Goals and Rewards","text":"What is the reward hypothesis?

    The reward hypothesis is the idea that all of what we mean by goals and purposes can be well thought of as the maximization of the expected value of the cumulative sum of a received scalar signal (called reward).

• The reward signal is your way of communicating to the agent what you want it to achieve, not how you want it to achieve it.
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#33-returns-and-episodes","title":"3.3 Returns and Episodes","text":"

    Equation 3.7: Undiscounted return

    \\[ G_t \\doteq R_{t+1} + R_{t+2} + R_{t+3} + \\dots + R_T \\tag{3.7} \\]

    Equation 3.8: Discounted return

    \\[ G_t \\doteq R_{t+1} + \\gamma R_{t+2} + \\gamma^2 R_{t+3} + \\dots = \\sum_{k=0}^{\\infty} \\gamma^k R_{t+k+1} \\tag{3.8} \\]

    Where \\(\\gamma\\) is the discount rate.

    Equation 3.9: Recursive definition of return

You can rearrange Eq. 3.8 into a recursive definition of the return.

    \\[ G_t \\doteq R_{t+1} + \\gamma G_{t+1} \\tag{3.9} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#34-unified-notation-for-episodic-and-continuing-tasks","title":"3.4 Unified Notation for Episodic and Continuing Tasks","text":"","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#35-policies-and-value-functions","title":"3.5 Policies and Value Functions","text":"

    A policy \\(\\pi(a \\mid s)\\) is a probability distribution over actions given states.

    Equation 3.12: State-value function

    $$ v_{\\pi}(s) \\doteq \\mathbb{E}_{\\pi}[G_t \\mid S_t = s] \\;\\; \\forall s \\in \\mathcal{S} \\tag{3.12}

    $$

    Equation 3.13: Action-value function

    \\[ q_{\\pi}(s, a) \\doteq \\mathbb{E}_{\\pi}[G_t \\mid S_t = s, A_t = a] \\;\\; \\forall s \\in \\mathcal{S}, a \\in \\mathcal{A} \\tag{3.13} \\]

    Equation 3.14: Bellman equation for \\(v_{\\pi}\\)

    \\[ \\begin{align} v_\\pi(s) &\\doteq \\mathbb{E}_{\\pi}[G_t \\mid S_t = s] \\\\ &= \\mathbb{E}_{\\pi}[R_{t+1} + \\gamma G_{t+1} \\mid S_t = s] \\tag{by (3.9)} \\\\ &= \\sum_{a} \\pi(a \\mid s) \\sum_{s', r} p(s', r \\mid s, a) \\left[r + \\gamma \\mathbb{E}_{\\pi}\\left[G_{t+1} \\mid S_{t+1} = s'\\right]\\right] \\\\ &= \\sum_{a} \\pi(a \\mid s) \\sum_{s', r} p(s', r \\mid s, a) [r + \\gamma v_\\pi(s')] \\tag{3.14} \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%203/#36-optimal-policies-and-optimal-value-functions","title":"3.6 Optimal Policies and Optimal Value Functions","text":"

    Equation 3.15: Optimal state-value function

    \\[ v_*(s) \\doteq \\max_{\\pi} v_{\\pi}(s) \\tag{3.15} \\]

    Equation 3.16: Optimal action-value function

    \\[ q_*(s, a) \\doteq \\max_{\\pi} q_{\\pi}(s, a) \\tag{3.16} \\]

    Equation 3.17

    \\[ q_*(s, a) = \\mathbb{E}[R_{t+1} + \\gamma v_*(S_{t+1}) \\mid S_t = s, A_t = a] \\tag{3.17} \\]

    Equation 3.18 and 3.19: Bellman optimality equations for \\(v_*\\)

    \\[ \\begin{align} v_*(s) &= \\max_{a \\in \\mathcal{A}(s)} q_{\\pi_*}(s, a) \\\\ &= \\max_{a} \\mathbb{E}_{\\pi_*}[G_t \\mid S_t = s, A_t = a] \\tag{by (3.9)}\\\\ &= \\max_{a} \\mathbb{E}_{\\pi_*}[R_{t+1} + \\gamma G_{t+1} \\mid S_t = s, A_t = a] \\\\ &= \\max_{a} \\mathbb{E}[R_{t+1} + \\gamma v_*(S_{t+1}) \\mid S_t = s, A_t = a] \\tag{3.18} \\\\ &= \\max_{a} \\sum_{s', r} p(s', r \\mid s, a) [r + \\gamma v_*(s')] \\tag{3.19} \\\\ \\end{align} \\]

    Equation 3.20: Bellman optimality equation for \\(q_*\\)

    \\[ \\begin{align} q_*(s, a) &= \\mathbb{E}[R_{t+1} + \\gamma \\max_{a'} q_*(S_{t+1}, a') \\mid S_t = s, A_t = a] \\\\ &= \\sum_{s', r} p(s', r \\mid s, a) [r + \\gamma \\max_{a'} q_*(s', a')] \\tag{3.20} \\end{align} \\]

    Any policy that is greedy with respect to the optimal evaluation function \\(v_*\\) is an optimal policy.

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/","title":"Reinforcement Learning An Introduction Chapter 4","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#41-policy-evaluation","title":"4.1 Policy evaluation","text":"

    Equations 4.3 and 4.4

    \\[ \\begin{align} v_{\\pi}(s) &\\doteq \\mathbb{E}_{\\pi}[G_t \\mid S_t = s] \\\\ &= \\mathbb{E}_{\\pi}[R_{t+1} + \\gamma G_{t+1} \\mid S_t = s] && (\\text{from (3.9)})\\\\ &= \\mathbb{E}_{\\pi}[R_{t+1} + \\gamma v_{\\pi}(S_{t+1}) \\mid S_t = s] && (4.3)\\\\ &= \\sum_a \\pi(a \\mid s) \\sum_{s',r} p(s', r \\mid s, a) \\left[ r + \\gamma v_{\\pi}(s') \\right] && (4.4), \\end{align} \\]

    Equation 4.5

    \\[ \\begin{align} v_{k+1}(s) &\\doteq \\mathbb{E}_{\\pi} [ R_{t+1} + \\gamma v_k(S_{t+1}) \\mid S_t = s ] \\\\ & = \\sum_a \\pi(a \\mid s) \\sum_{s', r} p(s', r \\mid s, a) \\left[ r + \\gamma v_k(s') \\right] && (4.5), \\end{align} \\]

    { width=\"600\" }

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#42-policy-improvement","title":"4.2 Policy Improvement","text":"

    Equation 4.6

    \\[ \\begin{align} q_\\pi(s, a) &\\doteq \\mathbb{E}[R_{t+1} + \\gamma v_\\pi(S_{t+1}) \\mid S_t = s, A_t = a] && (4.6)\\\\ &= \\sum_{s', r}p(s', r \\mid s, a)[r + \\gamma v_\\pi(s')] \\\\ \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#43-policy-iteration","title":"4.3 Policy Iteration","text":"

    { width=\"600\" }

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#44-value-iteration","title":"4.4 Value Iteration","text":"

    \"This algorithm is called value iteration. It can be written as a particularly simple update operation that combines the policy improvement and truncated policy evaluation steps.\"

    Equation 4.10

    \\[ \\begin{align} v_{k+1} &\\doteq \\max_{a} \\mathbb{E} [R_{t+1} + \\gamma v_k(S_{t+1}) \\mid S_t =s, A = a] \\\\ &= \\max_{a} \\sum_{s', r}p(s', r \\mid s, a)[r + \\gamma v_k(s')] && (4.10) \\\\ \\end{align} \\]

    { width=\"600\" }

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%204/#45-asynchronous-dynamic-programming","title":"4.5 Asynchronous Dynamic Programming","text":"

    \"These algorithms update the values of states in any order whatsoever, using whatever values of other states happen to be available. [...] To converge correctly, however, an asynchronous algorithm must continue to update the values of all the states: it can\u2019t ignore any state after some point in the computation. Asynchronous DP algorithms allow great flexibility in selecting states to update.\"

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/","title":"Reinforcement Learning An Introduction Chapter 5","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#51-monte-carlo-prediction","title":"5.1 Monte Carlo prediction","text":"

• first-visit MC: independence assumptions, easier theoretically
• every-visit MC

    • TODO: finish notes
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#54-monte-carlo-control-without-exploring-starts","title":"5.4 Monte Carlo Control without Exploring Starts","text":"
    • \\(\\epsilon-\\)greedy policy

      • All non-greedy actions have minimum probability of \\(\\frac{\\epsilon}{|\\mathcal{A}|}\\)
      • Greedy action has probability \\((1 - \\epsilon) + \\frac{\\epsilon}{|\\mathcal{A}|}\\)
    • TODO: finish notes

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#55-off-policy-prediction-via-importance-sampling","title":"5.5 Off-policy Prediction via Importance Sampling","text":"

    Given a starting state \\(S_t\\), the probability of the subsequent state-action trajectory, \\(A_t, S_{t+1}, A_{t+1}, \\dots, S_T\\), under the policy \\(\\pi\\) is given by:

    \\[ \\begin{align} Pr\\{A_t, S_{t+1}, A_{t+1}, \\dots, S_T \\mid S_t, A_{t:T-1} \\sim \\pi\\} & = \\prod_{k=t}^{T-1} \\pi(A_k \\mid S_k) p(S_{k+1} \\mid S_k, A_k) \\end{align} \\]

Equation 5.3: Importance sampling ratio

    \\[ \\rho_{t:T-1} \\doteq \\frac{\\prod_{k=t}^{T-1} \\pi(A_k \\mid S_k) p(S_{k+1} \\mid S_k, A_k)}{\\prod_{k=t}^{T-1} b(A_k \\mid S_k) p(S_{k+1} \\mid S_k, A_k)} = \\prod_{k=t}^{T-1} \\frac{\\pi(A_k \\mid S_k)}{b(A_k \\mid S_k)} \\tag{5.3} \\]

Equation 5.4: Value function for target policy \(\pi\) under behavior policy \(b\)

    The importance sampling ratio allows us to compute the correct expected value to compute \\(v_\\pi\\):

    \\[ \\begin{align} v_\\pi(s) &\\doteq \\mathbb{E}_b[\\rho_{t:T - 1}G_t \\mid S_t = s] \\tag{5.4} \\\\ \\end{align} \\]

    Equation 5.5: Ordinary importance sampling

    \\[ V(s) \\doteq \\frac{\\sum_{t \\in \\mathcal{T}(s)} \\rho_{t:T-1} G_t}{|\\mathcal{T}(s)|} \\tag{5.5} \\]

    Equation 5.6: Weighted importance sampling

    \\[ V(s) \\doteq \\frac{\\sum_{t \\in \\mathcal{T}(s)} \\rho_{t:T-1} G_t}{\\sum_{t \\in \\mathcal{T}(s)} \\rho_{t:T-1}} \\tag{5.6} \\]

    In practice, weighted importance sampling has much lower error at the beginning.
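A small sketch contrasting the two estimators (Eqs. 5.5 and 5.6) on made-up returns and importance ratios:

```python
import numpy as np

def is_estimates(returns, rhos):
    """Ordinary (Eq. 5.5) vs weighted (Eq. 5.6) importance-sampling estimates of V(s)."""
    returns, rhos = np.asarray(returns), np.asarray(rhos)
    ordinary = np.sum(rhos * returns) / len(returns)
    weighted = np.sum(rhos * returns) / np.sum(rhos) if rhos.sum() > 0 else 0.0
    return ordinary, weighted

# toy numbers: three returns observed under the behavior policy with their ratios
print(is_estimates(returns=[1.0, 0.0, 2.0], rhos=[0.5, 2.0, 1.0]))
```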

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#56-incremental-implementation","title":"5.6 Incremental Implementation","text":"","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%205/#todo","title":"todo","text":"","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/","title":"Reinforcement Learning An Introduction Chapter 6","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#61-td-prediction","title":"6.1 TD Prediction","text":"

    Equation 6.2: TD(0) update

    \\[ \\begin{align} V(S_t) &\\leftarrow V(S_t) + \\alpha \\left[ R_{t+1} + \\gamma V(S_{t+1}) - V(S_t) \\right] \\tag{6.2} \\\\ \\end{align} \\]

    Equations 6.3 and 6.4: Relationship between TD(0), MC and DP

    \\[ \\begin{align} v_\\pi(s) &\\doteq \\mathbb{E}_\\pi[G_t \\mid S_t = s] \\tag{6.3} \\\\ &= \\mathbb{E}_\\pi[R_{t+1} + \\gamma G_{t+1} \\mid S_t = s] \\tag{from (3.9)} \\\\ &= \\mathbb{E}_\\pi[R_{t+1} + \\gamma v_\\pi(S_{t+1}) \\mid S_t = s] \\tag{6.4} \\\\ \\end{align} \\] Why is (6.3) called the Monte Carlo estimate?

    Because the expected value is not known, and sampled returns are used in its place.

    Why is (6.4) called the Dynamic Programming estimate?

Although the expectation is known, the value function is not, as we use the current estimate \(V(S_{t+1})\) in place of \(v_\pi(S_{t+1})\).

    By looking at the previous two answers, what does TD(0) estimate and how does that differ from the previous methods?

TD(0) both maintains an estimate of the value function (like DP) and uses a sampled reward in place of the expectation (like MC).

    Equation 6.5: TD error

    \\[ \\begin{align} \\delta_t &\\doteq R_{t+1} + \\gamma V(S_{t+1}) - V(S_t) \\tag{6.5} \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#64-sarsa-on-policy-td-control","title":"6.4 Sarsa: On-policy TD Control","text":"

    Equation 6.7

    \\[ \\begin{align} Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\alpha \\left[ R_{t+1} + \\gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t) \\right] \\end{align} \\]

    { width=\"900\" }

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#65-q-learning-off-policy-td-control","title":"6.5 Q-learning: Off-policy TD Control","text":"

    Equation 6.8

    \\[ \\begin{align} Q(S_t, A_t) \\leftarrow Q(S_t, A_t) + \\alpha \\left[ R_{t+1} + \\gamma \\max_a Q(S_{t+1}, a) - Q(S_t, A_t) \\right] \\end{align} \\]

    { width=\"700\" }

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#66-expected-sarsa","title":"6.6 Expected SARSA","text":"

    Equation 6.9

    \\[ \\begin{align} Q(S_t, A_t) &\\leftarrow Q(S_t, A_t) + \\alpha \\left[ R_{t+1} + \\gamma \\mathbb{E}_\\pi [Q(S_{t+1}, A_{t+1}) \\mid S_{t+1}] - Q(S_t, A_t) \\right] \\\\ &= Q(S_t, A_t) + \\alpha \\left[ R_{t+1} + \\gamma \\sum_a \\pi(a \\mid S_{t+1}) Q(S_{t+1}, a) - Q(S_t, A_t) \\right] && (6.9) \\end{align} \\]

It's more computationally demanding, but it's more stable and fares better than Q-learning and Sarsa.

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%206/#67-maximization-bias-and-double-learning","title":"6.7 Maximization Bias and Double Learning","text":"

    \"All the control algorithms that we have discussed so far involve maximization in the construction of their target policies\"

this causes maximization bias:
• think of estimating the mean of \(N(-0.1, 1)\)
• this estimate might at some point be 0.1, while the other option might correctly be 0
• the optimal choice is to pick 0, but because we take the max of an estimate, we positively bias ourselves

The general way to solve it is to estimate two different value functions: one for obtaining the best action (\(Q_1\)) and the other for evaluating it (\(Q_2\)).

    \\[ \\begin{align} A^* &= \\text{argmax}_a Q_1(a) \\\\ Q_2(A^*) &= Q_2(\\text{argmax}_a Q_1(a)) \\end{align} \\]

    This effectively debiases the estimate \\(\\mathbb{E}[Q_2(A^*)] = q(A^*)\\)
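A minimal sketch of the double-learning update, where one estimate selects the action and the other evaluates it (roles swapped at random, as in Double Q-learning):

```python
import numpy as np

def double_q_step(Q1, Q2, s, a, r, s_next, alpha=0.1, gamma=0.99):
    """One double Q-learning update: one table picks argmax, the other evaluates it."""
    if np.random.rand() < 0.5:
        Q1, Q2 = Q2, Q1                          # swap roles; updates happen in place
    a_star = np.argmax(Q1[s_next])               # action selected by one estimate
    td_target = r + gamma * Q2[s_next, a_star]   # evaluated by the other estimate
    Q1[s, a] += alpha * (td_target - Q1[s, a])

# toy usage on a 5-state, 2-action problem
Q1, Q2 = np.zeros((5, 2)), np.zeros((5, 2))
double_q_step(Q1, Q2, s=0, a=1, r=1.0, s_next=2)
```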

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%207/","title":"Reinforcement Learning An Introduction Chapter 7","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%207/#71-n-step-td-prediction","title":"7.1 \\(n\\)-step TD prediction","text":"

    One-step return:

    \\[ G_{t:t+1} \\doteq R_{t+1} + \\gamma V_t(S_{t+1}) \\]

    Equation 7.1: \\(n\\)-step return

    \\[ G_{t:t+n} \\doteq R_{t+1} + \\gamma R_{t+2} + \\dots + \\gamma^{n-1} R_{t+n} + \\gamma^n V_{t + n - 1}(S_{t+n}) \\tag{7.1} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/","title":"Reinforcement Learning An Introduction Chapter 9","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#9-on-policy-prediction-with-approximation","title":"9. On-policy prediction with approximation","text":"

Problem setting: In most real scenarios, the number of states is too large for tabular learning algorithms, so we approximate the value function by a learned, parametrized function: \(\hat{v}(s, \mathbf{w}) \approx v_\pi(s)\)
• Examples of possible modelling choices for this function: linear functions, non-linear functions, neural networks, etc.
• \(\mathbf{w} \in \mathbb{R}^d\), \(d \ll |\mathcal{S}|\), which means that updating one state affects multiple states: generalization
• This formulation allows for partially observable states.
• Side note: not all convergence proofs apply to all function classes (for more info see UCL x DeepMind 7/13)

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#91-value-function-approximation","title":"9.1 Value-function approximation","text":"

    New notation! (\\(s\\to u\\) is an update rule for \\(v(s)\\) using new expression \\(u\\))

    How does the learning setting differ between neural networks (supervised) and reinforcement learning?

    RL requires modeling to allow:

    • online learning (while interacting with environment), incrementally acquire data
      • Remember that supervised learning suffers from catastrophic forgetting
    • Non-stationary target functions

    Supervised Learning assumes iid sampling from a fixed but unknown data distribution

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#92-the-prediction-objective-overlineve","title":"9.2 The Prediction Objective (\\(\\overline{VE}\\))","text":"Why do we need a prediction objective now? What has changed?

    In the tabular setting we had two nice properties:

    • the learned value function could actually converge exactly to the true value function
    • the value of a state was decoupled from other states

    Without these two, we must say which states are most important to us.

    Equation 9.1

    \\[ \\begin{align} \\overline{VE}(\\mathbf{w}) &\\doteq \\sum_{s \\in \\mathcal{S}} \\mu(s) \\left[v_{\\pi}(s) - \\hat{v}(s, \\mathbf{w})\\right]^2 && \\tag{9.1} \\end{align} \\]

    Where: - \\(\\mu(s)\\) is the state distribution (reminder: non-negative, sums to one)

    For on-policy episodic tasks, \\(\\mu(s)\\) is called the on-policy distribution, which can be defined as follows:

    Equations 9.2 and 9.3

    \\[ \\begin{align} \\eta(s) = h(s) + \\sum_{\\bar{s}} \\eta(\\bar{s}) \\sum_a \\pi(a \\mid \\bar{s})p(s \\mid \\bar{s}, a), && \\text{for all } s \\in S && \\tag{9.2} \\end{align} \\] \\[ \\begin{align} \\mu(s) = \\frac{\\eta(s)}{\\sum_{s'}\\eta(s')} && \\tag{9.3} \\end{align} \\]

    Where: - \\(h(s)\\) is the probability that an episode begins in a state \\(s\\). - \\(\\eta(s)\\) is the number of time steps spent on average in a state \\(s\\) for a single episode. - Interpretation of 2 terms: Time is spent in a \\(s\\) if an episode starts in \\(s\\) or if another state transitions into \\(s\\).

    • \\(\\overline{VE}\\) only guarantees local optimality.
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#93-stochastic-gradient-and-semi-gradient-methods","title":"9.3 Stochastic-gradient and Semi-gradient Methods","text":"

    Equations 9.4 and 9.5

    \\[ \\begin{align} \\mathbf{w}_{t+1} &= \\mathbf{w}_t - \\frac{1}{2} \\alpha \\nabla \\left[v_{\\pi}(S_t) - \\hat{v}(S_t, \\mathbf{w}_t) \\right] && \\tag{9.4} \\\\ &= \\mathbf{w}_t + \\alpha \\left[v_{\\pi}(S_t) - \\hat{v}(S_t, \\mathbf{w}_t) \\right] \\nabla \\hat{v}(S_t, \\mathbf{w}_t) && \\tag{9.5} \\end{align} \\]

    However, since we don't know the true \\(v_\\pi(s)\\), we can replace it with the target output \\(U_t\\):

    Equation 9.7

    \\[ \\begin{align} \\mathbf{w}_{t+1} &= \\mathbf{w}_t + \\alpha \\left[U_t - \\hat{v}(S_t, \\mathbf{w}_t) \\right] \\nabla \\hat{v}(S_t, \\mathbf{w}_t) && \\tag{9.7} \\end{align} \\]

    Where: - \\(U_t\\) should be an unbiased estimate of \\(v_\\pi(s)\\), that is: - \\(\\mathbb{E}[U_t \\mid S_t=s] = v_\\pi(s)\\) - With local optimum convergence guarantees.

    Examples of \\(U_t\\): - Monte Carlo target: \\(U_t = G_t\\) (that is, the reward achieved until the end of the episode), unbiased. - Bootstrapping targets are biased because they depend on \\(\\mathbf{w}\\) through \\(\\hat{v}(S_t, \\mathbf{w})\\) . - To make them unbiased, you can treat the dependent expressions as constants (stop the gradient flow). This yields semi-gradient methods.

    Semi-gradient methods: - Do not converge as robustly as gradient methods, aside from the linear case. - Faster, enable online/continual learning.
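A minimal sketch of semi-gradient TD(0) with a linear value function (Eq. 9.7 with a bootstrapped target treated as a constant), using toy one-hot features:

```python
import numpy as np

def semi_gradient_td0(x, alpha=0.05, gamma=0.99):
    """Semi-gradient TD(0) with linear v(s, w) = w . x(s); the target's dependence on w is ignored."""
    w = np.zeros(x(0).shape)
    def update(s, r, s_next):
        nonlocal w
        v, v_next = w @ x(s), w @ x(s_next)
        U = r + gamma * v_next          # bootstrapped target, treated as a constant
        w += alpha * (U - v) * x(s)     # gradient of v-hat at S_t only (Eq. 9.7)
        return w
    return update

# toy usage: 4 one-hot features for 4 states
x = lambda s: np.eye(4)[s]
step = semi_gradient_td0(x)
w = step(s=0, r=1.0, s_next=1)
```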

    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#94-linear-methods","title":"9.4 Linear Methods","text":"

    Equation 9.8

    \\[ \\begin{align} \\hat{v}(s, \\mathbf{w}) \\doteq \\mathbf{w}^\\intercal \\mathbf{x}(s) = \\sum_{i=1}^d w_i x_i(s) && \\tag{9.8} \\end{align} \\]

    Where:

    • \\(\\mathbf{x}(s) = \\left(x_1(s), \\dots, x_d(s)\\right)^\\intercal\\)
    • The gradient Monte Carlo algorithm converges to the global optimum of the VE under linear function approximation if \\(\\alpha\\) is reduced over time according to the usual conditions.
    • Chapter also explores the convergence of TD(0) with SGD and linear approximation and finds it converges to the TD fixed point (Eqs. 9.11, 9.12), \\(\\mathbf{w}_{TD}\\). This is not the global optimum, but a point near the local optimum.

    Equation 9.14

    Interpretation: The asymptotic error of the TD method is no more than \\(\\frac{1}{1-\\gamma}\\) times the smallest possible error.

    \\[ \\begin{align} \\overline{VE}(\\mathbf{w}_{TD}) & \\leq \\frac{1}{1-\\gamma} \\min_{\\mathbf{w}} \\overline{VE}(\\mathbf{w}) \\tag{9.14} \\end{align} \\]

    Equation 9.15

    \\[ \\mathbf{w}_{t+n} \\doteq \\mathbf{w}_{t+n-1} + \\alpha \\left[ G_{t:t+n} - \\hat{v}(S_t, \\mathbf{w}_{t+n-1}) \\right] \\nabla \\hat{v}(S_t, \\mathbf{w}_{t+n-1}), \\quad 0 \\leq t < T, \\tag{9.15} \\]

    Equation 9.16

    \\[ G_{t:t+n} \\doteq R_{t+1} + \\gamma R_{t+2} + \\cdots + \\gamma^{n-1} R_{t+n} + \\gamma^n \\hat{v}(S_{t+n}, \\mathbf{w}_{t+n-1}), \\quad 0 \\leq t \\leq T - n. \\tag{9.16} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#95-feature-construction-for-linear-methods","title":"9.5 Feature Construction for Linear Methods","text":"
    • 9.5.1 Polynomials
    • 9.5.2 Fourier Basis
    • 9.5.3 Coarse coding
    • 9.5.4 Tile Coding
    • 9.5.5 Radial Basis Functions
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction%20-%20Chapter%209/#96-selecting-step-size-parameters-manually","title":"9.6 Selecting Step-Size Parameters Manually","text":"

    Equation 9.19

    Suppose you wanted to learn in about \\(\\tau\\) experiences with substantially the same feature vector. A good rule of thumb for setting the step-size parameter of linear SGD methods is:

    \\[ \\begin{align} \\alpha \\doteq \\left(\\tau \\mathbb{E}\\left[\\mathbf{x}^\\intercal\\mathbf{x}\\right]\\right)^{-1} \\tag{9.19} \\end{align} \\]","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/Reinforcement%20Learning%20-%20An%20Introduction/","title":"Reinforcement Learning An Introduction","text":"Properties authors Richard S. Sutton, Andrew G. Barton year 2018
    • Reinforcement Learning - An Introduction - Chapter 3
    • Reinforcement Learning - An Introduction - Chapter 4
    • Reinforcement Learning - An Introduction - Chapter 6
    • Reinforcement Learning - An Introduction - Chapter 9
    ","tags":["textbook"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing%20-%20Lecture%2012/","title":"TinyML and Efficient Deep Learning Computing Lecture 12","text":"Properties authors Song Han year 2023 url https://www.dropbox.com/scl/fi/spgvr9owflz6s1lt5po17/lec12.pdf?rlkey=cwqpteopgvsdgnxd8xtcniadr&e=2&dl=0","tags":["lecture"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing%20-%20Lecture%203/","title":"TinyML and Efficient Deep Learning Computing Lecture 3","text":"Properties authors Song Han year 2023 url https://www.dropbox.com/scl/fi/2oxmtvoeccyuw47yfambb/lec03.pdf?rlkey=3ykm0g21ibsoqn7xnw43v7aaw&e=1&dl=0","tags":["lecture"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing%20-%20Lecture%205/","title":"TinyML and Efficient Deep Learning Computing Lecture 5","text":"Properties authors Song Han year 2023 url https://www.dropbox.com/scl/fi/eos92o2fgys6gk0gizogl/lec05.pdf?rlkey=2hohvi8jcvjw3f8m8vugfa2mz&e=1&dl=0

Content: 1. Reviews numeric datatypes (floating point, etc.) 2. Covers the basic concepts of quantization 3. Introduces three types of common neural network quantization: - K-Means-based Quantization - Linear Quantization - Binary and Ternary Quantization (will be covered in Lecture 6)
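A minimal sketch of the linear (affine) quantization scheme from the lecture, assuming an asymmetric uint8 mapping w ≈ scale · (q − zero_point):

```python
import numpy as np

def linear_quantize(w, n_bits=8):
    """Asymmetric linear quantization: w is approximated by scale * (q - zero_point)."""
    qmin, qmax = 0, 2 ** n_bits - 1
    scale = (w.max() - w.min()) / (qmax - qmin)
    zero_point = int(round(qmin - w.min() / scale))
    q = np.clip(np.round(w / scale) + zero_point, qmin, qmax).astype(np.uint8)
    return q, scale, zero_point

w = np.random.randn(64, 64).astype(np.float32)
q, s, z = linear_quantize(w)
w_hat = s * (q.astype(np.float32) - z)    # dequantize to check reconstruction error
print(np.abs(w - w_hat).max())
```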

    ","tags":["lecture"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing%20-%20Lecture%206/","title":"TinyML and Efficient Deep Learning Computing Lecture 6","text":"Properties authors Song Han year 2023 url https://www.dropbox.com/scl/fi/1mo0umu0qtq7uxap2l5m3/lec06.pdf?rlkey=bdl2mgusgajddjuvjxb0fot36&e=2&dl=0

Content:
1. Quantization Granularity
   1. Per-tensor quantization: same quantization parameters for the entire tensor
   2. Per-channel quantization: channels sometimes have considerably different weight distributions, so use different quantization parameters per channel/row (see the sketch below)
   3. Group quantization: similar idea
2. Dynamic Range Clipping
   - To quantize activations, we must keep track of activation statistics
   - Use KL divergence to measure information loss
   - Allocating dynamic range to outliers hurts representation ability
3. Rounding
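A small sketch of the granularity point above: per-tensor uses one scale for the whole weight matrix, while per-channel computes one scale per output channel (symmetric int8 scales, purely illustrative):

```python
import numpy as np

def per_tensor_scale(w, n_bits=8):
    return np.abs(w).max() / (2 ** (n_bits - 1) - 1)          # one scale for the whole tensor

def per_channel_scales(w, n_bits=8):
    # one scale per output channel (row); helps when channel ranges differ a lot
    return np.abs(w).max(axis=1) / (2 ** (n_bits - 1) - 1)

# channels with very different ranges, as described in the lecture notes
w = np.random.randn(8, 64) * np.linspace(0.1, 3.0, 8)[:, None]
print(per_tensor_scale(w), per_channel_scales(w))
```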

Quantization-Aware Training - To minimize the loss of accuracy, especially for aggressive quantization with 4-bit and lower bit widths, the neural network is trained/fine-tuned with quantized weights and activations. - Usually, fine-tuning a pre-trained floating-point model provides better accuracy than training from scratch.

    ","tags":["lecture"]},{"location":"100%20Reference%20notes/104%20Other/TinyML%20and%20Efficient%20Deep%20Learning%20Computing/","title":"TinyML and Efficient Deep Learning Computing","text":"Properties authors Song Han year 2023 url https://hanlab.mit.edu/courses/2023-fall-65940","tags":["course"]},{"location":"100%20Reference%20notes/104%20Other/Tweet%20-%20Stable%20Diffusion%20XL%20on%20iPhone%20with%20Core%20ML%21/","title":"Tweet Stable Diffusion XL on iPhone with Core ML!","text":"Properties authors Atila Orhon year 2023 url https://x.com/atiorh/status/1707402410870862002

    We compressed the diffusion model using our Mixed-Bit Palettization technique (described in https://huggingface.co/blog/stable-diffusion-xl-coreml\u2026) which yields an average of 4.04-bits (5.2GB -> 1.3GB) while maintaining higher accuracy than linear 8-bit quantization. Compressed model runs faster too

    Notes - 4 times smaller memory footprint - Better than linear 8-bit quantization - Faster inference time

    ","tags":["efficient_dl","tweet"]}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index 9dbef985..f57a675e 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -1565,6 +1565,11 @@ 2024-10-01 daily + + https://dgcnz.github.io/second-brain/100%20Reference%20notes/104%20Other/GPU%20mode%20-%20Sparsity/ + 2024-10-01 + daily + https://dgcnz.github.io/second-brain/100%20Reference%20notes/104%20Other/Introducing%20Apple%E2%80%99s%20On-Device%20and%20Server%20Foundation%20Models/ 2024-10-01 diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 5636bb04..88fad9e7 100644 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ