Skip to content

Commit c101a0c

Browse files
authored
Merge pull request #27 from ashvardanian/intel-fma
Shocking Intel Performance in Matrix Multiplications
2 parents 1213f91 + c3f3ce9 commit c101a0c

File tree

3 files changed

+276
-86
lines changed

3 files changed

+276
-86
lines changed

.vscode/settings.json

+3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"cppcoro",
2121
"CTRE",
2222
"CUDA",
23+
"denormal",
2324
"DOTPROD",
2425
"Dusíková",
2526
"Eigen",
@@ -74,11 +75,13 @@
7475
"Tera",
7576
"TMUL",
7677
"Trettner",
78+
"Unbundling",
7779
"Unif",
7880
"unifex",
7981
"unsalvageable",
8082
"unscalable",
8183
"Vardanian",
84+
"vfmadd",
8285
"VNNI",
8386
"Weis",
8487
"XCOMP",

less_slow.cpp

+167-67
Original file line numberDiff line numberDiff line change
@@ -1172,7 +1172,7 @@ static void f32x4x4_matmul(bm::State &state) {
11721172
for (auto _ : state) bm::DoNotOptimize(c = f32x4x4_matmul_kernel(a, b));
11731173

11741174
std::size_t tops_per_cycle = 4 * 4 * (4 /* multiplications */ + 3 /* additions */);
1175-
state.counters["TOPS"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
1175+
state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
11761176
}
11771177

11781178
BENCHMARK(f32x4x4_matmul);
@@ -1232,7 +1232,7 @@ static void f32x4x4_matmul_unrolled(bm::State &state) {
12321232
for (auto _ : state) bm::DoNotOptimize(c = f32x4x4_matmul_unrolled_kernel(a, b));
12331233

12341234
std::size_t tops_per_cycle = 4 * 4 * (4 /* multiplications */ + 3 /* additions */);
1235-
state.counters["TOPS"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
1235+
state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
12361236
}
12371237

12381238
BENCHMARK(f32x4x4_matmul_unrolled);
@@ -1330,7 +1330,7 @@ static void f32x4x4_matmul_sse41(bm::State &state) {
13301330
for (auto _ : state) bm::DoNotOptimize(c = f32x4x4_matmul_sse41_kernel(a, b));
13311331

13321332
std::size_t tops_per_cycle = 4 * 4 * (4 /* multiplications */ + 3 /* additions */);
1333-
state.counters["TOPS"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
1333+
state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
13341334
}
13351335

13361336
BENCHMARK(f32x4x4_matmul_sse41);
@@ -1419,7 +1419,7 @@ static void f32x4x4_matmul_avx512(bm::State &state) {
14191419
for (auto _ : state) bm::DoNotOptimize(c = f32x4x4_matmul_avx512_kernel(a, b));
14201420

14211421
std::size_t tops_per_cycle = 4 * 4 * (4 /* multiplications */ + 3 /* additions */);
1422-
state.counters["TOPS"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
1422+
state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
14231423
}
14241424
BENCHMARK(f32x4x4_matmul_avx512);
14251425

@@ -1471,9 +1471,16 @@ static void theoretic_tops( //
14711471
// Each kernel returns the number of TOPS.
14721472
std::size_t tops = 0;
14731473
for (auto _ : state) bm::DoNotOptimize(tops = theoretic_tops_kernel());
1474-
state.counters["TOPS"] = bm::Counter(tops * state.iterations() * state.threads() * 1.0, bm::Counter::kIsRate);
1474+
state.counters["TOP"] = bm::Counter(tops * state.iterations() * state.threads() * 1.0, bm::Counter::kIsRate);
14751475
}
14761476

1477+
#if defined(__AVX512F__) || defined(__AVX2__)
1478+
void configure_x86_denormals(void) {
1479+
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // Flush results to zero
1480+
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // Treat denormal inputs as zero
1481+
}
1482+
#endif
1483+
14771484
/**
14781485
* Assuming we are not aiming for dynamic dispatch, we can simply check for
14791486
* the available features at compile time with more preprocessing directives:
@@ -1486,45 +1493,90 @@ static void theoretic_tops( //
14861493
* @see Arm Feature Detection: https://developer.arm.com/documentation/101028/0010/Feature-test-macros
14871494
*/
14881495
#if defined(__AVX512F__)
1489-
extern "C" std::uint32_t tops_f64_avx512_asm_kernel(void);
1490-
BENCHMARK_CAPTURE(theoretic_tops, f64_avx512, tops_f64_avx512_asm_kernel)->MinTime(10);
1491-
BENCHMARK_CAPTURE(theoretic_tops, f64_avx512, tops_f64_avx512_asm_kernel)->MinTime(10)->Threads(physical_cores());
1492-
extern "C" std::uint32_t tops_f32_avx512_asm_kernel(void);
1493-
BENCHMARK_CAPTURE(theoretic_tops, f32_avx512, tops_f32_avx512_asm_kernel)->MinTime(10);
1494-
BENCHMARK_CAPTURE(theoretic_tops, f32_avx512, tops_f32_avx512_asm_kernel)->MinTime(10)->Threads(physical_cores());
1496+
extern "C" std::uint32_t tops_f64_avx512ma_asm_kernel(void);
1497+
BENCHMARK_CAPTURE(theoretic_tops, f64_avx512ma, tops_f64_avx512ma_asm_kernel, configure_x86_denormals)->MinTime(10);
1498+
BENCHMARK_CAPTURE(theoretic_tops, f64_avx512ma, tops_f64_avx512ma_asm_kernel, configure_x86_denormals)
1499+
->MinTime(10)
1500+
->Threads(physical_cores());
1501+
extern "C" std::uint32_t tops_f64_avx512fma_asm_kernel(void);
1502+
BENCHMARK_CAPTURE(theoretic_tops, f64_avx512fma, tops_f64_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10);
1503+
BENCHMARK_CAPTURE(theoretic_tops, f64_avx512fma, tops_f64_avx512fma_asm_kernel, configure_x86_denormals)
1504+
->MinTime(10)
1505+
->Threads(physical_cores());
1506+
1507+
extern "C" std::uint32_t tops_f32_avx512ma_asm_kernel(void);
1508+
BENCHMARK_CAPTURE(theoretic_tops, f32_avx512ma, tops_f32_avx512ma_asm_kernel, configure_x86_denormals)->MinTime(10);
1509+
BENCHMARK_CAPTURE(theoretic_tops, f32_avx512ma, tops_f32_avx512ma_asm_kernel, configure_x86_denormals)
1510+
->MinTime(10)
1511+
->Threads(physical_cores());
1512+
extern "C" std::uint32_t tops_f32_avx512fma_asm_kernel(void);
1513+
BENCHMARK_CAPTURE(theoretic_tops, f32_avx512fma, tops_f32_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10);
1514+
BENCHMARK_CAPTURE(theoretic_tops, f32_avx512fma, tops_f32_avx512fma_asm_kernel, configure_x86_denormals)
1515+
->MinTime(10)
1516+
->Threads(physical_cores());
14951517
#endif // defined(__AVX512F__)
14961518

14971519
#if defined(__AVX512FP16__)
1498-
extern "C" std::uint32_t tops_f16_avx512_asm_kernel(void);
1499-
BENCHMARK_CAPTURE(theoretic_tops, f16_avx512, tops_f16_avx512_asm_kernel)->MinTime(10);
1500-
BENCHMARK_CAPTURE(theoretic_tops, f16_avx512, tops_f16_avx512_asm_kernel)->MinTime(10)->Threads(physical_cores());
1520+
extern "C" std::uint32_t tops_f16_avx512ma_asm_kernel(void);
1521+
BENCHMARK_CAPTURE(theoretic_tops, f16_avx512ma, tops_f16_avx512ma_asm_kernel, configure_x86_denormals)->MinTime(10);
1522+
BENCHMARK_CAPTURE(theoretic_tops, f16_avx512ma, tops_f16_avx512ma_asm_kernel, configure_x86_denormals)
1523+
->MinTime(10)
1524+
->Threads(physical_cores());
1525+
extern "C" std::uint32_t tops_f16_avx512fma_asm_kernel(void);
1526+
BENCHMARK_CAPTURE(theoretic_tops, f16_avx512fma, tops_f16_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10);
1527+
BENCHMARK_CAPTURE(theoretic_tops, f16_avx512fma, tops_f16_avx512fma_asm_kernel, configure_x86_denormals)
1528+
->MinTime(10)
1529+
->Threads(physical_cores());
15011530
#endif // defined(__AVX512FP16__)
15021531

15031532
#if defined(__AVX512BF16__)
1504-
extern "C" std::uint32_t tops_bf16_avx512_asm_kernel(void);
1505-
BENCHMARK_CAPTURE(theoretic_tops, bf16_avx512, tops_bf16_avx512_asm_kernel)->MinTime(10);
1506-
BENCHMARK_CAPTURE(theoretic_tops, bf16_avx512, tops_bf16_avx512_asm_kernel)->MinTime(10)->Threads(physical_cores());
1533+
extern "C" std::uint32_t tops_bf16_avx512fma_asm_kernel(void);
1534+
BENCHMARK_CAPTURE(theoretic_tops, bf16_avx512fma, tops_bf16_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10);
1535+
BENCHMARK_CAPTURE(theoretic_tops, bf16_avx512fma, tops_bf16_avx512fma_asm_kernel, configure_x86_denormals)
1536+
->MinTime(10)
1537+
->Threads(physical_cores());
15071538
#endif // defined(__AVX512BF16__)
15081539

15091540
#if defined(__AVX512VNNI__)
1510-
extern "C" std::uint32_t tops_i16_avx512_asm_kernel(void);
1511-
BENCHMARK_CAPTURE(theoretic_tops, i16_avx512, tops_i16_avx512_asm_kernel)->MinTime(10);
1512-
BENCHMARK_CAPTURE(theoretic_tops, i16_avx512, tops_i16_avx512_asm_kernel)->MinTime(10)->Threads(physical_cores());
1513-
extern "C" std::uint32_t tops_u8i8_avx512_asm_kernel(void);
1514-
BENCHMARK_CAPTURE(theoretic_tops, u8i8_avx512, tops_u8i8_avx512_asm_kernel)->MinTime(10);
1515-
BENCHMARK_CAPTURE(theoretic_tops, u8i8_avx512, tops_u8i8_avx512_asm_kernel)->MinTime(10)->Threads(physical_cores());
1541+
extern "C" std::uint32_t tops_i16_avx512fma_asm_kernel(void);
1542+
BENCHMARK_CAPTURE(theoretic_tops, i16_avx512fma, tops_i16_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10);
1543+
BENCHMARK_CAPTURE(theoretic_tops, i16_avx512fma, tops_i16_avx512fma_asm_kernel, configure_x86_denormals)
1544+
->MinTime(10)
1545+
->Threads(physical_cores());
1546+
extern "C" std::uint32_t tops_i7_avx512fma_asm_kernel(void);
1547+
BENCHMARK_CAPTURE(theoretic_tops, i7_avx512fma, tops_i7_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10);
1548+
BENCHMARK_CAPTURE(theoretic_tops, i7_avx512fma, tops_i7_avx512fma_asm_kernel, configure_x86_denormals)
1549+
->MinTime(10)
1550+
->Threads(physical_cores());
15161551
#endif // defined(__AVX512VNNI__)
15171552

15181553
#if defined(__AVX2__)
1519-
extern "C" std::uint32_t tops_f64_avx2_asm_kernel(void);
1520-
BENCHMARK_CAPTURE(theoretic_tops, f64_avx2, tops_f64_avx2_asm_kernel)->MinTime(10);
1521-
BENCHMARK_CAPTURE(theoretic_tops, f64_avx2, tops_f64_avx2_asm_kernel)->MinTime(10)->Threads(physical_cores());
1522-
extern "C" std::uint32_t tops_f32_avx2_asm_kernel(void);
1523-
BENCHMARK_CAPTURE(theoretic_tops, f32_avx2, tops_f32_avx2_asm_kernel)->MinTime(10);
1524-
BENCHMARK_CAPTURE(theoretic_tops, f32_avx2, tops_f32_avx2_asm_kernel)->MinTime(10)->Threads(physical_cores());
1554+
extern "C" std::uint32_t tops_f64_avx2ma_asm_kernel(void);
1555+
BENCHMARK_CAPTURE(theoretic_tops, f64_avx2ma, tops_f64_avx2ma_asm_kernel, configure_x86_denormals)->MinTime(10);
1556+
BENCHMARK_CAPTURE(theoretic_tops, f64_avx2ma, tops_f64_avx2ma_asm_kernel, configure_x86_denormals)
1557+
->MinTime(10)
1558+
->Threads(physical_cores());
1559+
extern "C" std::uint32_t tops_f64_avx2fma_asm_kernel(void);
1560+
BENCHMARK_CAPTURE(theoretic_tops, f64_avx2fma, tops_f64_avx2fma_asm_kernel, configure_x86_denormals)->MinTime(10);
1561+
BENCHMARK_CAPTURE(theoretic_tops, f64_avx2fma, tops_f64_avx2fma_asm_kernel, configure_x86_denormals)
1562+
->MinTime(10)
1563+
->Threads(physical_cores());
1564+
extern "C" std::uint32_t tops_f32_avx2ma_asm_kernel(void);
1565+
BENCHMARK_CAPTURE(theoretic_tops, f32_avx2ma, tops_f32_avx2ma_asm_kernel, configure_x86_denormals)->MinTime(10);
1566+
BENCHMARK_CAPTURE(theoretic_tops, f32_avx2ma, tops_f32_avx2ma_asm_kernel, configure_x86_denormals)
1567+
->MinTime(10)
1568+
->Threads(physical_cores());
1569+
extern "C" std::uint32_t tops_f32_avx2fma_asm_kernel(void);
1570+
BENCHMARK_CAPTURE(theoretic_tops, f32_avx2fma, tops_f32_avx2fma_asm_kernel, configure_x86_denormals)->MinTime(10);
1571+
BENCHMARK_CAPTURE(theoretic_tops, f32_avx2fma, tops_f32_avx2fma_asm_kernel, configure_x86_denormals)
1572+
->MinTime(10)
1573+
->Threads(physical_cores());
15251574
#endif // defined(__AVX2__)
15261575

15271576
#if defined(__ARM_NEON)
1577+
extern "C" std::uint32_t tops_f64_neon_asm_kernel(void);
1578+
BENCHMARK_CAPTURE(theoretic_tops, f64_neon, tops_f64_neon_asm_kernel)->MinTime(10);
1579+
BENCHMARK_CAPTURE(theoretic_tops, f64_neon, tops_f64_neon_asm_kernel)->MinTime(10)->Threads(physical_cores());
15281580
extern "C" std::uint32_t tops_f32_neon_asm_kernel(void);
15291581
BENCHMARK_CAPTURE(theoretic_tops, f32_neon, tops_f32_neon_asm_kernel)->MinTime(10);
15301582
BENCHMARK_CAPTURE(theoretic_tops, f32_neon, tops_f32_neon_asm_kernel)->MinTime(10)->Threads(physical_cores());
@@ -1646,44 +1698,57 @@ BENCHMARK_CAPTURE(theoretic_tops, i8_amx, tops_i8_amx_asm_kernel, configure_amx)
16461698
*
16471699
* Scalar Operations Tensor Operations
16481700
*
1649-
* - `f64`: @b 34 Tera-OPS @b 67 Tera-OPS
1650-
* - `f32`: @b 67 Tera-OPS @b 989 Tera-OPS
1651-
* - `bf16`: @b 1.9 Peta-OPS
1652-
* - `f16`: @b 2.9 Peta-OPS
1653-
* - `i8`: @b 3.9 Peta-OPS
1701+
* - `f64`: @b 34 T @b 67 T
1702+
* - `f32`: @b 67 T @b 989 T
1703+
* - `bf16`: @b 1.9 P
1704+
* - `f16`: @b 2.9 P
1705+
* - `i8`: @b 3.9 P
16541706
*
16551707
* This requires up to 700 W of power. A typical high-end server CPU uses
16561708
* under 500 W of power, and has similar number of cores to the GPUs number
16571709
* of Streaming Multiprocessors @b (SMs). The CPU can also run at a higher
16581710
* frequency, and has a larger cache, which is crucial for many workloads.
16591711
* On a single CPU core, we can achieve the following FMA throughput:
16601712
*
1661-
* Intel Granite Rapids AMD Zen4
1662-
*
1663-
* - AVX-512 `f64`: @b 1.5 Giga-OPS @b 58 Giga-OPS
1664-
* - AVX-512 `f32`: @b 4.8 Giga-OPS @b 117 Giga-OPS
1665-
*
1666-
* - AVX-512 `bf16`: @b 123 Giga-OPS @b 235 Giga-OPS
1667-
* - AVX-512 `f16`: @b 357 Giga-OPS 🤯🤯
1668-
* - AVX-512 `i8 • u8`: @b 708 Giga-OPS @b 470 Giga-Ops 🤯🤯
1669-
*
1670-
* - AMX `bf16`: @b 3.7 Tera-OPS
1671-
* - AMX `i8` and `u8`: @b 7.5 Tera-OPS 🤯🤯🤯
1672-
*
1673-
* On a typical dual-socket system:
1674-
*
1675-
* Intel Granite Rapids AMD Zen4
1676-
*
1677-
* - AVX-512 `f64`: @b ___ Tera-OPS @b 9.3 Tera-OPS
1678-
* - AVX-512 `f32`: @b ___ Tera-OPS @b 20.1 Tera-OPS
1679-
*
1680-
* - AVX-512 `bf16`: @b ___ Tera-OPS @b 41.8 Tera-OPS
1681-
* - AVX-512 `f16`: @b ___ Tera-OPS @b 39.6 Tera-Ops
1682-
* - AVX-512 `i8 • u8`: @b ___ Tera-OPS @b 81.3 Tera-Ops
1683-
*
1684-
* - AMX `bf16`: @b __ Tera-OPS
1685-
* - AMX `i8` and `u8`: @b __ Tera-OPS
1713+
* Intel Xeon 4 AMD Zen 4 Graviton 4
1714+
* @b FMA in AVX-512 & NEON:
1715+
* - `f64`: @b 1.2-76 G ¹ @b 58 G @b 31 G
1716+
* - `f32`: @b 3.1-135 G ¹ @b 117 G @b 63 G
1717+
* - `bf16`: @b 121 G @b 235 G @b 101 G
1718+
* - `f16`: @b 286 G 🤯🤯 - @b 116 G
1719+
* - `i16`: @b 342 G 🤯🤯 - -
1720+
* - `i7`: ² @b 678 G @b 470 G 🤯🤯 -
1721+
* - `i8`, `u8`: - - @b 1.1 T
1722+
* @b Mat-Mul in AMX & SME:
1723+
* - `bf16`: @b 3.6 T - -
1724+
* - `i8`, `u8`: @b 7.2 T 🤯🤯🤯 - -
1725+
*
1726+
* On a high-end dual-socket system, comparing `c7i.metal-48xl` to `c7a.metal-48xl`
1727+
* and `c8g.metal-48xl` 192-core instances on AWS, this scales to:
1728+
*
1729+
* Intel Xeon 4 AMD Zen 4 Graviton 4
1730+
* @b FMA in AVX-512 & NEON:
1731+
* - `f64`: @b 0.2-8.2 T ¹ @b 9.3 T @b 4.2 T
1732+
* - `f32`: @b 0.6-15.1 T ¹ @b 20.1 T @b 8.4 T
1733+
* - `bf16`: @b 9.8 T @b 41.8 T @b 20.1 T
1734+
* - `f16`: @b 35.4 T - @b 16.8 T
1735+
* - `i16`: @b 34.3 T - -
1736+
* - `i7`: @b 76 T @b 81.3 T -
1737+
* - `i8`, `u8`: - - @b 38.2 T
1738+
* @b Mat-Mul in AMX & SME:
1739+
* - `bf16`: @b 301 T - -
1740+
* - `i8`, `u8`: @b 683 T 🤯🤯🤯 - -
1741+
*
1742+
* > ¹ The FMA throughput on Intel can be insanely low for denormal numbers!
1743+
* > ² AVX-512 has weird `i8` by `u8` multiplication instructions, which don't
1744+
* seem useful for any 8-bit problems I've encountered, but are handy for
1745+
* 7-bit representations.
1746+
*
1747+
* The Fused-Multiply-Add performance should be higher than separate Multiply
1748+
* and Add operations. Moreover, there is no direct support for `bf16` math
1749+
* in x86, so for some numeric types FMA is the only option.
16861750
*/
1751+
16871752
#pragma endregion // Compute Bound Linear Algebra
16881753

16891754
#pragma region // Port Interleaving and Latency Hiding
@@ -1697,9 +1762,9 @@ BENCHMARK_CAPTURE(theoretic_tops, i8_amx, tops_i8_amx_asm_kernel, configure_amx)
16971762

16981763
#if defined(__AVX512VNNI__) && defined(__AMX_INT8__)
16991764

1700-
extern "C" std::uint32_t tops_i8u8_amx_avx512_asm_kernel(void);
1701-
BENCHMARK_CAPTURE(theoretic_tops, i8u8_amx_avx512, tops_i8u8_amx_avx512_asm_kernel, configure_amx)->MinTime(10);
1702-
BENCHMARK_CAPTURE(theoretic_tops, i8u8_amx_avx512, tops_i8u8_amx_avx512_asm_kernel, configure_amx)
1765+
extern "C" std::uint32_t tops_i7_amx_avx512fma_asm_kernel(void);
1766+
BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kernel, configure_amx)->MinTime(10);
1767+
BENCHMARK_CAPTURE(theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kernel, configure_amx)
17031768
->MinTime(10)
17041769
->Threads(physical_cores());
17051770

@@ -2142,7 +2207,7 @@ static void cblas_tops(bm::State &state) {
21422207
/* beta: */ 0, c.data(), ldc);
21432208

21442209
std::size_t tops_per_cycle = n * n * (n /* multiplications */ + (n - 1) /* additions */);
2145-
state.counters["TOPS"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
2210+
state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
21462211
state.SetComplexityN(n);
21472212
}
21482213

@@ -2179,12 +2244,14 @@ static void eigen_tops(bm::State &state) {
21792244
}
21802245

21812246
std::size_t tops_per_cycle = n * n * (n /* multiplications */ + (n - 1) /* additions */);
2182-
state.counters["TOPS"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
2247+
state.counters["TOP"] = bm::Counter(state.iterations() * tops_per_cycle, bm::Counter::kIsRate);
21832248
state.SetComplexityN(n);
21842249
}
21852250

2186-
BENCHMARK(eigen_tops<float>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(benchmark::oNCubed);
21872251
BENCHMARK(eigen_tops<double>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(benchmark::oNCubed);
2252+
BENCHMARK(eigen_tops<float>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(benchmark::oNCubed);
2253+
BENCHMARK(eigen_tops<std::int16_t>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(benchmark::oNCubed);
2254+
BENCHMARK(eigen_tops<std::int8_t>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(benchmark::oNCubed);
21882255

21892256
/**
21902257
* Arm provides C language extensions for half-precision numbers, like
@@ -2199,14 +2266,47 @@ BENCHMARK(eigen_tops<double>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(b
21992266
BENCHMARK(eigen_tops<__fp16>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(benchmark::oNCubed);
22002267
#endif
22012268

2202-
#if defined(__ARM_FEATURE_BF16)
2269+
#if defined(__ARM_FEATURE_BF16) //! May not be defined even if `__ARM_FEATURE_BF16_VECTOR_ARITHMETIC` is!
22032270
#include <arm_bf16.h>
22042271
BENCHMARK(eigen_tops<__bf16>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(benchmark::oNCubed);
22052272
#endif
22062273

2274+
#if defined(__AVX512FP16__)
2275+
#include <immintrin.h>
2276+
BENCHMARK(eigen_tops<_Float16>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(benchmark::oNCubed);
2277+
#endif
2278+
22072279
/**
22082280
* Now we can compare the theoretical limits to the actual performance
2209-
* of Eigen and BLAS libraries.
2281+
* of Eigen and BLAS libraries. On a dual-socket system, 192-core Intel
2282+
* Xeon 4 instances on AWS, we can achieve the following FMA throughput:
2283+
*
2284+
* Theoretical OpenBLAS Eigen
2285+
*
2286+
* - `f64` @b 4.1 T (AVX-512) @b 3.1 T @b 2.9 T
2287+
* - `f32` @b 8.9 T (AVX-512) @b 6.4 T @b 7.5 T
2288+
* - `bf16` @b 301 T (AMX) - -
2289+
* - `f16` @b 35.4 T (AVX-512) - @b 396 G
2290+
* - `i16`: @b 34.3 T (AVX-512) - @b 255 G
2291+
* - `i8` & `u8` @b 683 T (AMX) - @b 182 G
2292+
*
2293+
* Important to note, for different libraries and data types, the highest
2294+
* throughput was achieved with different shapes and the best number is shown.
2295+
*
2296+
* Similarly on the dual-socket Graviton 4 instances on AWS, we can achieve:
2297+
*
2298+
* Theoretical OpenBLAS Eigen
2299+
*
2300+
* - `f64` @b 4.2 T @b 1.2 T @b 1.2 T
2301+
* - `f32` @b 8.4 T @b 2.3 T @b 1.3 T
2302+
* - `bf16` @b 20.1 T - -
2303+
* - `f16` @b 16.8 T - @b 660 G
2304+
* - `i16`: - - @b 6.5 T
2305+
* - `i8` & `u8` @b 38.2 T - @b 13.4 T
2306+
*
2307+
* As expected, modern libraries are generally far less optimized for Arm,
2308+
* but for some applications dealing with 8-bit integers, Eigen can be good
2309+
* enough.
22102310
*/
22112311
#pragma endregion // Memory Bound Linear Algebra
22122312

0 commit comments

Comments
 (0)