From 089cfa0ecbae5fce340e2b2d180e89ddbaf0412b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 16 Jan 2025 17:33:45 +0000
Subject: [PATCH 1/4] Add: Serial & AVX-512 scatter/gather

---
 less_slow.cpp | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/less_slow.cpp b/less_slow.cpp
index 480001d..36b5f63 100644
--- a/less_slow.cpp
+++ b/less_slow.cpp
@@ -1540,6 +1540,103 @@ BENCHMARK(memory_access_aligned)->MinTime(10);
 
 #pragma endregion // Alignment of Memory Accesses
 
+#pragma region Gather & Scatter Operations for Spread Data
+
+/**
+ * Sometimes, the variables of interest are scattered across memory, and we
+ * need to gather them into a contiguous buffer for processing. This is
+ * common in sparse matrix operations, where only a few elements are
+ * non-zero, but it can apply to any irregular data structure...
+ *
+ * The only question is: is there some smart way to gather these elements?
+ *
+ * Our benchmark is the following: generate 32-bit unsigned integers from 0
+ * to N, randomly shuffle them, and use them as gathering indices. For
+ * scatter operations, we will use the same indices to overwrite information
+ * in a separate buffer.
+ *
+ * We will be looking at the ideal, simplest case, when the offset type and
+ * the data have identical size.
+ */
+using spread_index_t = std::uint32_t;
+using spread_data_t = float;
+
+/**
+ * @brief Perform a scalar gather operation.
+ * @param data The data buffer to gather from.
+ * @param indices The indices used to gather data.
+ * @param result The buffer where gathered data will be stored.
+ * @param size The number of elements to process.
+ */
+void spread_gather_scalar( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) noexcept {
+    for (std::size_t i = 0; i < size; ++i) result[i] = data[indices[i]];
+}
+
+/**
+ * @brief Perform a scalar scatter operation.
+ * @param data The buffer to scatter data into.
+ * @param indices The indices used to scatter data.
+ * @param source The buffer containing data to scatter.
+ * @param size The number of elements to process.
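+ * @note The benchmark feeds a random permutation of [0, size) as `indices`,
+ *       so every slot of `data` is overwritten exactly once.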
+ */
+void spread_scatter_scalar( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) noexcept {
+    for (std::size_t i = 0; i < size; ++i) data[indices[i]] = source[i];
+}
+
+template <typename kernel_type_>
+static void spread_memory(bm::State &state, kernel_type_ kernel) {
+
+    std::size_t size = static_cast<std::size_t>(state.range(0));
+    std::vector<spread_index_t> indices(size);
+    std::vector<spread_data_t> first(size), second(size);
+    std::iota(indices.begin(), indices.end(), 0u);
+    std::random_device random_device;
+    std::mt19937 generator(random_device());
+    std::shuffle(indices.begin(), indices.end(), generator);
+
+    for (auto _ : state) { kernel(first.data(), indices.data(), second.data(), size); }
+}
+
+BENCHMARK_CAPTURE(spread_memory, gather_scalar, spread_gather_scalar)->Range(1 << 10, 1 << 20);
+BENCHMARK_CAPTURE(spread_memory, scatter_scalar, spread_scatter_scalar)->Range(1 << 10, 1 << 20);
+
+#if defined(__AVX512F__)
+void spread_gather_avx512( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
+    constexpr std::size_t simd_width_k = sizeof(__m512i) / sizeof(spread_data_t);
+    static_assert( //
+        sizeof(spread_data_t) == sizeof(spread_index_t), "Data and index types must have the same size");
+    std::size_t i = 0;
+    for (; i + simd_width_k <= size; i += simd_width_k)
+        _mm512_storeu_si512(&result[i], _mm512_i32gather_epi32(_mm512_loadu_si512(&indices[i]), data, 4));
+    for (; i < size; ++i) result[i] = data[indices[i]];
+}
+
+void spread_scatter_avx512( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) {
+    constexpr std::size_t simd_width_k = sizeof(__m512i) / sizeof(spread_data_t);
+    static_assert( //
+        sizeof(spread_data_t) == sizeof(spread_index_t), "Data and index types must have the same size");
+    std::size_t i = 0;
+    for (; i + simd_width_k <= size; i += simd_width_k)
+        _mm512_i32scatter_epi32(data, _mm512_loadu_si512(&indices[i]), _mm512_loadu_si512(&source[i]), 4);
+    for (; i < size; ++i) data[indices[i]] = source[i];
+}
+
+BENCHMARK_CAPTURE(spread_memory, gather_avx512, spread_gather_avx512)->Range(1 << 10, 1 << 20);
+BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512)->Range(1 << 10, 1 << 20);
+
+/**
+ * AVX-512 shows a @b 5%-30% improvement over scalar implementations, with
+ * smaller datasets benefiting the most. For 1024 elements, AVX-512 reduces
+ * latency from ~271 ns to ~212 ns. For 1,048,576 elements, improvements are
+ * less pronounced: ~311 µs (scalar) vs. ~289 µs (AVX-512).
+ */
+#endif
+
+#pragma endregion // Gather & Scatter Operations for Spread Data
+
 #pragma region Non Uniform Memory Access
 
 /**

From 107b359bf2944896aebf6100c5824200e5ba1779 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 16 Jan 2025 17:47:54 +0000
Subject: [PATCH 2/4] Add: SVE gather/scatter

30% performance improvement across small and large inputs!
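
As a sanity check (not part of the patch), any vectorized kernel can be
compared against the scalar reference on a random permutation. The
`gather_matches_scalar` helper below is a hypothetical sketch, assuming the
types and kernels introduced in PATCH 1/4:

    #include <algorithm> // `std::shuffle`, `std::equal`
    #include <numeric>   // `std::iota`
    #include <random>    // `std::mt19937`
    #include <vector>    // `std::vector`

    template <typename kernel_type_>
    bool gather_matches_scalar(kernel_type_ kernel, std::size_t size) {
        std::vector<spread_data_t> data(size), got(size), want(size);
        std::vector<spread_index_t> indices(size);
        std::iota(data.begin(), data.end(), 0.0f);     // Distinct, recognizable values
        std::iota(indices.begin(), indices.end(), 0u); // Permutation of [0, size)
        std::shuffle(indices.begin(), indices.end(), std::mt19937 {42});
        spread_gather_scalar(data.data(), indices.data(), want.data(), size);
        kernel(data.data(), indices.data(), got.data(), size);
        return std::equal(want.begin(), want.end(), got.begin());
    }

Calling it as `gather_matches_scalar(spread_gather_sve, (1 << 10) + 3)` also
exercises the predicated tail, where `svwhilelt_b32` masks out the lanes
past `size`.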
---
 less_slow.cpp | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/less_slow.cpp b/less_slow.cpp
index 36b5f63..5658249 100644
--- a/less_slow.cpp
+++ b/less_slow.cpp
@@ -1635,6 +1635,40 @@ BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512)->Range(1 << 10, 1 << 20);
  */
 #endif
 
+#if defined(__ARM_FEATURE_SVE) // Arm NEON has no gather/scatter instructions, but SVE does 🥳
+#include <arm_sve.h>
+
+void spread_gather_sve( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
+    for (std::size_t i = 0; i < size; i += svcntw()) {
+        svbool_t pg = svwhilelt_b32(i, size);
+        svuint32_t sv_indices = svld1(pg, &indices[i]);
+        svfloat32_t sv_data = svld1_gather_index(pg, data, sv_indices); // Indices count elements, not bytes
+        svst1(pg, &result[i], sv_data);
+    }
+}
+
+void spread_scatter_sve( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) {
+    for (std::size_t i = 0; i < size; i += svcntw()) {
+        svbool_t pg = svwhilelt_b32(i, size);
+        svuint32_t sv_indices = svld1(pg, &indices[i]);
+        svfloat32_t sv_data = svld1(pg, &source[i]);
+        svst1_scatter_index(pg, data, sv_indices, sv_data); // Same element-wise indexing as the scalar kernel
+    }
+}
+
+BENCHMARK_CAPTURE(spread_memory, gather_sve, spread_gather_sve)->Range(1 << 10, 1 << 20);
+BENCHMARK_CAPTURE(spread_memory, scatter_sve, spread_scatter_sve)->Range(1 << 10, 1 << 20);
+
+/**
+ * @b Finally! This may just be the first place where SVE supersedes NEON
+ * in functionality and has a bigger improvement over scalar code than AVX-512
+ * on a similar-level x86 platform! Both gathers and scatters are consistently
+ * @b 30% faster across small and large inputs!
+ */
+#endif
+
 #pragma endregion // Gather & Scatter Operations for Spread Data
 
 #pragma region Non Uniform Memory Access

From 3fca99130daae738e61763dbbe1076b0baddbd53 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 16 Jan 2025 21:55:11 +0000
Subject: [PATCH 3/4] Improve: Stabilize gather timings

Reuse the `std::unique_ptr`-of-an-array logic, abstracting it into
`make_aligned_array`.
---
 less_slow.cpp | 48 +++++++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/less_slow.cpp b/less_slow.cpp
index 36b5f63..b3c79f8 100644
--- a/less_slow.cpp
+++ b/less_slow.cpp
@@ -1363,7 +1363,7 @@ BENCHMARK(f32x4x4_matmul_avx512);
 #include <cassert>  // `assert`
 #include <fstream>  // `std::ifstream`
 #include <iterator> // `std::random_access_iterator_tag`
-#include <memory>   // `std::assume_aligned`
+#include <memory>   // `std::assume_aligned`, `std::unique_ptr`
 #include <string>   // `std::string`, `std::stoull`
 
 /**
@@ -1471,6 +1471,13 @@ class strided_ptr {
     // clang-format on
 };
 
+template <typename type_>
+std::unique_ptr<type_[], decltype(&std::free)> make_aligned_array(std::size_t size, std::size_t alignment) {
+    type_ *raw_ptr = static_cast<type_ *>(std::aligned_alloc(alignment, sizeof(type_) * size));
+    if (!raw_ptr) throw std::bad_alloc();
+    return std::unique_ptr<type_[], decltype(&std::free)>(raw_ptr, &std::free);
+}
+
 #if defined(__aarch64__)
 /**
  * @brief Helper derived from `__aarch64_sync_cache_range` in `libgcc`, used to
@@ -1495,9 +1502,7 @@ static void memory_access(bm::State &state) {
     // memory accesses may suffer from the same issues. For split-loads, pad our
     // buffer with an extra `cache_line_width` bytes of space.
     std::size_t const buffer_size = typical_l2_size + cache_line_width;
-    std::unique_ptr<std::byte, decltype(&std::free)> const buffer(                        //
-        reinterpret_cast<std::byte *>(std::aligned_alloc(cache_line_width, buffer_size)), //
-        &std::free);
+    auto const buffer = make_aligned_array<std::byte>(buffer_size, cache_line_width);
     std::byte *const buffer_ptr = buffer.get();
 
     // Let's initialize a strided range using our `strided_ptr` template, but
@@ -1585,21 +1590,23 @@ void spread_scatter_scalar( //
 }
 
 template <typename kernel_type_>
-static void spread_memory(bm::State &state, kernel_type_ kernel) {
+static void spread_memory(bm::State &state, kernel_type_ kernel, std::size_t align = sizeof(spread_data_t)) {
 
-    std::size_t size = static_cast<std::size_t>(state.range(0));
-    std::vector<spread_index_t> indices(size);
-    std::vector<spread_data_t> first(size), second(size);
-    std::iota(indices.begin(), indices.end(), 0u);
+    std::size_t const size = static_cast<std::size_t>(state.range(0));
+    auto indices = make_aligned_array<spread_index_t>(size, align);
+    auto first = make_aligned_array<spread_data_t>(size, align);
+    auto second = make_aligned_array<spread_data_t>(size, align);
+
+    std::iota(indices.get(), indices.get() + size, 0u);
     std::random_device random_device;
     std::mt19937 generator(random_device());
-    std::shuffle(indices.begin(), indices.end(), generator);
+    std::shuffle(indices.get(), indices.get() + size, generator);
 
-    for (auto _ : state) { kernel(first.data(), indices.data(), second.data(), size); }
+    for (auto _ : state) kernel(first.get(), indices.get(), second.get(), size);
 }
 
-BENCHMARK_CAPTURE(spread_memory, gather_scalar, spread_gather_scalar)->Range(1 << 10, 1 << 20);
-BENCHMARK_CAPTURE(spread_memory, scatter_scalar, spread_scatter_scalar)->Range(1 << 10, 1 << 20);
+BENCHMARK_CAPTURE(spread_memory, gather_scalar, spread_gather_scalar)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_scalar, spread_scatter_scalar)->Range(1 << 10, 1 << 20)->MinTime(5);
 
 #if defined(__AVX512F__)
 void spread_gather_avx512( //
@@ -1624,14 +1631,17 @@ void spread_scatter_avx512( //
     for (; i < size; ++i) data[indices[i]] = source[i];
 }
 
-BENCHMARK_CAPTURE(spread_memory, gather_avx512, spread_gather_avx512)->Range(1 << 10, 1 << 20);
-BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512)->Range(1 << 10, 1 << 20);
+BENCHMARK_CAPTURE(spread_memory, gather_avx512, spread_gather_avx512, 64)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512, 64)->Range(1 << 10, 1 << 20)->MinTime(5);
 
 /**
- * AVX-512 shows a @b 5%-30% improvement over scalar implementations, with
- * smaller datasets benefiting the most. For 1024 elements, AVX-512 reduces
- * latency from ~271 ns to ~212 ns. For 1,048,576 elements, improvements are
- * less pronounced: ~311 µs (scalar) vs. ~289 µs (AVX-512).
+ * For consistent timing, the AVX-512 variants align allocations to the ZMM
+ * register size, which also coincides with the cache-line width on x86 CPUs:
+ * @b 64 bytes!
+ *
+ * For short arrays under 4K elements, gathers can get up to 50% faster,
+ * dropping from @b 270ns to @b 136ns. On larger sizes, gathers can @b lose
+ * to serial code: on arrays of 65K entries they can be 50% slower!
+ * Scatters are even more questionable!
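+ *
+ * A likely culprit: current x86 cores still split gathers and scatters into
+ * one cache-line access per lane, so once the working set spills out of L2,
+ * the SIMD versions pay roughly the same memory-latency bill as scalar code,
+ * plus the extra instruction overhead.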
  */
 #endif

From daa55f518e069b0de6c914d1f113ab953a3d1263 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 16 Jan 2025 22:13:18 +0000
Subject: [PATCH 4/4] Improve: Timing SVE

---
 .vscode/settings.json |  2 ++
 less_slow.cpp         | 38 ++++++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 42509c5..d37349a 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -20,6 +20,7 @@
         "excerise",
         "fconcepts",
         "Fedor",
+        "Fugaku",
         "Goodput",
         "grandkids",
         "Hana",
@@ -36,6 +37,7 @@
         "Meneide",
         "MSVC",
         "Müller",
+        "Neoverse",
         "Niebler",
         "Niels",
         "nlohmann",

diff --git a/less_slow.cpp b/less_slow.cpp
index 1409bfc..dbf8194 100644
--- a/less_slow.cpp
+++ b/less_slow.cpp
@@ -1646,8 +1646,26 @@ BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512, 64)->Range(1 << 10, 1 << 20)->MinTime(5);
 #endif
 
 #if defined(__ARM_FEATURE_SVE) // Arm NEON has no gather/scatter instructions, but SVE does 🥳
+
+/**
+ * Arm Scalable Vector Extension @b (SVE) is one of the weirdest current SIMD
+ * extensions. Unlike AVX2, AVX-512, or even RVV on RISC-V, it doesn't fix
+ * the register width at the ISA level! It's up to the physical implementation
+ * to choose any power of two between 128 and @b 2048 bits.
+ *
+ * In practice, the Fugaku supercomputer's A64FX cores likely have the widest
+ * SVE implementation, at 512 bits. The Arm Neoverse V1 core in AWS Graviton3
+ * has 256-bit SVE, while the newer Neoverse N2 drops to 128-bit SVE2. SVE
+ * also handles masking differently from AVX-512! Definitely worth reading
+ * about!
+ *
+ * @see "ARM's Scalable Vector Extensions: A Critical Look at SVE2 For Integer
+ *      Workloads" by @zingaburga:
+ *      https://gist.github.com/zingaburga/805669eb891c820bd220418ee3f0d6bd
+ */
 #include <arm_sve.h>
 
+constexpr std::size_t max_sve_size_k = 2048 / CHAR_BIT; // Widest possible SVE register: 256 bytes
+
 void spread_gather_sve( //
     spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
     for (std::size_t i = 0; i < size; i += svcntw()) {
         svbool_t pg = svwhilelt_b32(i, size);
         svuint32_t sv_indices = svld1(pg, &indices[i]);
@@ -1668,14 +1686,22 @@ void spread_scatter_sve( //
     }
 }
 
-BENCHMARK_CAPTURE(spread_memory, gather_sve, spread_gather_sve)->Range(1 << 10, 1 << 20);
-BENCHMARK_CAPTURE(spread_memory, scatter_sve, spread_scatter_sve)->Range(1 << 10, 1 << 20);
+BENCHMARK_CAPTURE(spread_memory, gather_sve, spread_gather_sve, max_sve_size_k)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_sve, spread_scatter_sve, max_sve_size_k)->Range(1 << 10, 1 << 20)->MinTime(5);
 
 /**
  * @b Finally! This may just be the first place where SVE supersedes NEON
- * in functionality and has a bigger improvement over scalar code than AVX-512
- * on a similar-level x86 platform! Both gathers and scatters are consistently
- * @b 30% faster across small and large inputs!
+ * in functionality and may have a bigger improvement over scalar code than
+ * AVX-512 on a similar-level x86 platform!
+ *
+ * If you are very lucky with your input sizes, on small arrays under 65K
+ * elements on AWS Graviton, gathers can be up to 4x faster than serial code!
+ * On larger sizes, they again start losing to serial code. This makes
+ * their applicability very limited 😡
+ *
+ * Vectorized scatters are universally slower than serial code on Graviton
+ * for small inputs, but on larger ones, over 1 MB, they start winning by up
+ * to 50%!
+ * Great way to get everyone confused 🤬
  */
 #endif
 
 #pragma endregion // Gather & Scatter Operations for Spread Data
 
 #pragma region Non Uniform Memory Access
@@ -2917,7 +2943,7 @@ inline std::byte *reallocate_from_arena( //
         }
     }
 
-    // If we can’t grow in place, do: allocate new + copy + free old
+    // If we can't grow in place, do: allocate new + copy + free old
     std::byte *new_ptr = allocate_from_arena(arena, new_size);
     if (!new_ptr) return nullptr; // Out of memory
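
Since SVE leaves the register width to the silicon, the lane count that
`svcntw()` returns in the loops above is only known at run time. A minimal,
hypothetical probe, assuming an SVE-enabled toolchain (e.g.
`-march=armv8-a+sve` on GCC or Clang):

    #include <arm_sve.h>
    #include <cstdio>

    int main() {
        // `svcntb()` reports the SVE register width in bytes,
        // `svcntw()` the number of 32-bit lanes per register.
        std::printf("SVE width: %u bits, %u x 32-bit lanes\n",
                    static_cast<unsigned>(svcntb()) * 8u,
                    static_cast<unsigned>(svcntw()));
        return 0;
    }

On a 256-bit implementation like Graviton3's Neoverse V1, this prints 8
lanes, so each iteration of `spread_gather_sve` moves 8 floats.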