@@ -1172,7 +1172,7 @@ static void f32x4x4_matmul(bm::State &state) {
1172
1172
for (auto _ : state) bm::DoNotOptimize (c = f32x4x4_matmul_kernel (a, b));
1173
1173
1174
1174
std::size_t tops_per_cycle = 4 * 4 * (4 /* multiplications */ + 3 /* additions */ );
1175
- state.counters [" TOPS " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
1175
+ state.counters [" TOP " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
1176
1176
}
1177
1177
1178
1178
BENCHMARK (f32x4x4_matmul);
@@ -1232,7 +1232,7 @@ static void f32x4x4_matmul_unrolled(bm::State &state) {
1232
1232
for (auto _ : state) bm::DoNotOptimize (c = f32x4x4_matmul_unrolled_kernel (a, b));
1233
1233
1234
1234
std::size_t tops_per_cycle = 4 * 4 * (4 /* multiplications */ + 3 /* additions */ );
1235
- state.counters [" TOPS " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
1235
+ state.counters [" TOP " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
1236
1236
}
1237
1237
1238
1238
BENCHMARK (f32x4x4_matmul_unrolled);
@@ -1330,7 +1330,7 @@ static void f32x4x4_matmul_sse41(bm::State &state) {
1330
1330
for (auto _ : state) bm::DoNotOptimize (c = f32x4x4_matmul_sse41_kernel (a, b));
1331
1331
1332
1332
std::size_t tops_per_cycle = 4 * 4 * (4 /* multiplications */ + 3 /* additions */ );
1333
- state.counters [" TOPS " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
1333
+ state.counters [" TOP " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
1334
1334
}
1335
1335
1336
1336
BENCHMARK (f32x4x4_matmul_sse41);
@@ -1419,7 +1419,7 @@ static void f32x4x4_matmul_avx512(bm::State &state) {
1419
1419
for (auto _ : state) bm::DoNotOptimize (c = f32x4x4_matmul_avx512_kernel (a, b));
1420
1420
1421
1421
std::size_t tops_per_cycle = 4 * 4 * (4 /* multiplications */ + 3 /* additions */ );
1422
- state.counters [" TOPS " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
1422
+ state.counters [" TOP " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
1423
1423
}
1424
1424
BENCHMARK (f32x4x4_matmul_avx512);
1425
1425
@@ -1471,9 +1471,16 @@ static void theoretic_tops( //
1471
1471
// Each kernel returns the number of TOPS.
1472
1472
std::size_t tops = 0 ;
1473
1473
for (auto _ : state) bm::DoNotOptimize (tops = theoretic_tops_kernel ());
1474
- state.counters [" TOPS " ] = bm::Counter (tops * state.iterations () * state.threads () * 1.0 , bm::Counter::kIsRate );
1474
+ state.counters [" TOP " ] = bm::Counter (tops * state.iterations () * state.threads () * 1.0 , bm::Counter::kIsRate );
1475
1475
}
1476
1476
1477
+ #if defined(__AVX512F__) || defined(__AVX2__)
1478
+ void configure_x86_denormals (void ) {
1479
+ _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON); // Flush results to zero
1480
+ _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_ON); // Treat denormal inputs as zero
1481
+ }
1482
+ #endif
1483
+
1477
1484
/* *
1478
1485
* Assuming we are not aiming for dynamic dispatch, we can simply check for
1479
1486
* the available features at compile time with more preprocessing directives:
@@ -1486,45 +1493,90 @@ static void theoretic_tops( //
1486
1493
* @see Arm Feature Detection: https://developer.arm.com/documentation/101028/0010/Feature-test-macros
1487
1494
*/
1488
1495
#if defined(__AVX512F__)
1489
- extern " C" std::uint32_t tops_f64_avx512_asm_kernel (void );
1490
- BENCHMARK_CAPTURE (theoretic_tops, f64_avx512, tops_f64_avx512_asm_kernel)->MinTime(10 );
1491
- BENCHMARK_CAPTURE (theoretic_tops, f64_avx512, tops_f64_avx512_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1492
- extern " C" std::uint32_t tops_f32_avx512_asm_kernel (void );
1493
- BENCHMARK_CAPTURE (theoretic_tops, f32_avx512, tops_f32_avx512_asm_kernel)->MinTime(10 );
1494
- BENCHMARK_CAPTURE (theoretic_tops, f32_avx512, tops_f32_avx512_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1496
+ extern " C" std::uint32_t tops_f64_avx512ma_asm_kernel (void );
1497
+ BENCHMARK_CAPTURE (theoretic_tops, f64_avx512ma, tops_f64_avx512ma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1498
+ BENCHMARK_CAPTURE (theoretic_tops, f64_avx512ma, tops_f64_avx512ma_asm_kernel, configure_x86_denormals)
1499
+ ->MinTime(10 )
1500
+ ->Threads(physical_cores());
1501
+ extern " C" std::uint32_t tops_f64_avx512fma_asm_kernel (void );
1502
+ BENCHMARK_CAPTURE (theoretic_tops, f64_avx512fma, tops_f64_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1503
+ BENCHMARK_CAPTURE (theoretic_tops, f64_avx512fma, tops_f64_avx512fma_asm_kernel, configure_x86_denormals)
1504
+ ->MinTime(10 )
1505
+ ->Threads(physical_cores());
1506
+
1507
+ extern " C" std::uint32_t tops_f32_avx512ma_asm_kernel (void );
1508
+ BENCHMARK_CAPTURE (theoretic_tops, f32_avx512ma, tops_f32_avx512ma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1509
+ BENCHMARK_CAPTURE (theoretic_tops, f32_avx512ma, tops_f32_avx512ma_asm_kernel, configure_x86_denormals)
1510
+ ->MinTime(10 )
1511
+ ->Threads(physical_cores());
1512
+ extern " C" std::uint32_t tops_f32_avx512fma_asm_kernel (void );
1513
+ BENCHMARK_CAPTURE (theoretic_tops, f32_avx512fma, tops_f32_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1514
+ BENCHMARK_CAPTURE (theoretic_tops, f32_avx512fma, tops_f32_avx512fma_asm_kernel, configure_x86_denormals)
1515
+ ->MinTime(10 )
1516
+ ->Threads(physical_cores());
1495
1517
#endif // defined(__AVX512F__)
1496
1518
1497
1519
#if defined(__AVX512FP16__)
1498
- extern " C" std::uint32_t tops_f16_avx512_asm_kernel (void );
1499
- BENCHMARK_CAPTURE (theoretic_tops, f16_avx512, tops_f16_avx512_asm_kernel)->MinTime(10 );
1500
- BENCHMARK_CAPTURE (theoretic_tops, f16_avx512, tops_f16_avx512_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1520
+ extern " C" std::uint32_t tops_f16_avx512ma_asm_kernel (void );
1521
+ BENCHMARK_CAPTURE (theoretic_tops, f16_avx512ma, tops_f16_avx512ma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1522
+ BENCHMARK_CAPTURE (theoretic_tops, f16_avx512ma, tops_f16_avx512ma_asm_kernel, configure_x86_denormals)
1523
+ ->MinTime(10 )
1524
+ ->Threads(physical_cores());
1525
+ extern " C" std::uint32_t tops_f16_avx512fma_asm_kernel (void );
1526
+ BENCHMARK_CAPTURE (theoretic_tops, f16_avx512fma, tops_f16_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1527
+ BENCHMARK_CAPTURE (theoretic_tops, f16_avx512fma, tops_f16_avx512fma_asm_kernel, configure_x86_denormals)
1528
+ ->MinTime(10 )
1529
+ ->Threads(physical_cores());
1501
1530
#endif // defined(__AVX512FP16__)
1502
1531
1503
1532
#if defined(__AVX512BF16__)
1504
- extern " C" std::uint32_t tops_bf16_avx512_asm_kernel (void );
1505
- BENCHMARK_CAPTURE (theoretic_tops, bf16_avx512, tops_bf16_avx512_asm_kernel)->MinTime(10 );
1506
- BENCHMARK_CAPTURE (theoretic_tops, bf16_avx512, tops_bf16_avx512_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1533
+ extern " C" std::uint32_t tops_bf16_avx512fma_asm_kernel (void );
1534
+ BENCHMARK_CAPTURE (theoretic_tops, bf16_avx512fma, tops_bf16_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1535
+ BENCHMARK_CAPTURE (theoretic_tops, bf16_avx512fma, tops_bf16_avx512fma_asm_kernel, configure_x86_denormals)
1536
+ ->MinTime(10 )
1537
+ ->Threads(physical_cores());
1507
1538
#endif // defined(__AVX512BF16__)
1508
1539
1509
1540
#if defined(__AVX512VNNI__)
1510
- extern " C" std::uint32_t tops_i16_avx512_asm_kernel (void );
1511
- BENCHMARK_CAPTURE (theoretic_tops, i16_avx512, tops_i16_avx512_asm_kernel)->MinTime(10 );
1512
- BENCHMARK_CAPTURE (theoretic_tops, i16_avx512, tops_i16_avx512_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1513
- extern " C" std::uint32_t tops_u8i8_avx512_asm_kernel (void );
1514
- BENCHMARK_CAPTURE (theoretic_tops, u8i8_avx512, tops_u8i8_avx512_asm_kernel)->MinTime(10 );
1515
- BENCHMARK_CAPTURE (theoretic_tops, u8i8_avx512, tops_u8i8_avx512_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1541
+ extern " C" std::uint32_t tops_i16_avx512fma_asm_kernel (void );
1542
+ BENCHMARK_CAPTURE (theoretic_tops, i16_avx512fma, tops_i16_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1543
+ BENCHMARK_CAPTURE (theoretic_tops, i16_avx512fma, tops_i16_avx512fma_asm_kernel, configure_x86_denormals)
1544
+ ->MinTime(10 )
1545
+ ->Threads(physical_cores());
1546
+ extern " C" std::uint32_t tops_i7_avx512fma_asm_kernel (void );
1547
+ BENCHMARK_CAPTURE (theoretic_tops, i7_avx512fma, tops_i7_avx512fma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1548
+ BENCHMARK_CAPTURE (theoretic_tops, i7_avx512fma, tops_i7_avx512fma_asm_kernel, configure_x86_denormals)
1549
+ ->MinTime(10 )
1550
+ ->Threads(physical_cores());
1516
1551
#endif // defined(__AVX512VNNI__)
1517
1552
1518
1553
#if defined(__AVX2__)
1519
- extern " C" std::uint32_t tops_f64_avx2_asm_kernel (void );
1520
- BENCHMARK_CAPTURE (theoretic_tops, f64_avx2, tops_f64_avx2_asm_kernel)->MinTime(10 );
1521
- BENCHMARK_CAPTURE (theoretic_tops, f64_avx2, tops_f64_avx2_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1522
- extern " C" std::uint32_t tops_f32_avx2_asm_kernel (void );
1523
- BENCHMARK_CAPTURE (theoretic_tops, f32_avx2, tops_f32_avx2_asm_kernel)->MinTime(10 );
1524
- BENCHMARK_CAPTURE (theoretic_tops, f32_avx2, tops_f32_avx2_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1554
+ extern " C" std::uint32_t tops_f64_avx2ma_asm_kernel (void );
1555
+ BENCHMARK_CAPTURE (theoretic_tops, f64_avx2ma, tops_f64_avx2ma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1556
+ BENCHMARK_CAPTURE (theoretic_tops, f64_avx2ma, tops_f64_avx2ma_asm_kernel, configure_x86_denormals)
1557
+ ->MinTime(10 )
1558
+ ->Threads(physical_cores());
1559
+ extern " C" std::uint32_t tops_f64_avx2fma_asm_kernel (void );
1560
+ BENCHMARK_CAPTURE (theoretic_tops, f64_avx2fma, tops_f64_avx2fma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1561
+ BENCHMARK_CAPTURE (theoretic_tops, f64_avx2fma, tops_f64_avx2fma_asm_kernel, configure_x86_denormals)
1562
+ ->MinTime(10 )
1563
+ ->Threads(physical_cores());
1564
+ extern " C" std::uint32_t tops_f32_avx2ma_asm_kernel (void );
1565
+ BENCHMARK_CAPTURE (theoretic_tops, f32_avx2ma, tops_f32_avx2ma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1566
+ BENCHMARK_CAPTURE (theoretic_tops, f32_avx2ma, tops_f32_avx2ma_asm_kernel, configure_x86_denormals)
1567
+ ->MinTime(10 )
1568
+ ->Threads(physical_cores());
1569
+ extern " C" std::uint32_t tops_f32_avx2fma_asm_kernel (void );
1570
+ BENCHMARK_CAPTURE (theoretic_tops, f32_avx2fma, tops_f32_avx2fma_asm_kernel, configure_x86_denormals)->MinTime(10 );
1571
+ BENCHMARK_CAPTURE (theoretic_tops, f32_avx2fma, tops_f32_avx2fma_asm_kernel, configure_x86_denormals)
1572
+ ->MinTime(10 )
1573
+ ->Threads(physical_cores());
1525
1574
#endif // defined(__AVX2__)
1526
1575
1527
1576
#if defined(__ARM_NEON)
1577
+ extern " C" std::uint32_t tops_f64_neon_asm_kernel (void );
1578
+ BENCHMARK_CAPTURE (theoretic_tops, f64_neon, tops_f64_neon_asm_kernel)->MinTime(10 );
1579
+ BENCHMARK_CAPTURE (theoretic_tops, f64_neon, tops_f64_neon_asm_kernel)->MinTime(10 )->Threads(physical_cores());
1528
1580
extern " C" std::uint32_t tops_f32_neon_asm_kernel (void );
1529
1581
BENCHMARK_CAPTURE (theoretic_tops, f32_neon, tops_f32_neon_asm_kernel)->MinTime(10 );
1530
1582
BENCHMARK_CAPTURE (theoretic_tops, f32_neon, tops_f32_neon_asm_kernel)->MinTime(10 )->Threads(physical_cores());
@@ -1646,44 +1698,57 @@ BENCHMARK_CAPTURE(theoretic_tops, i8_amx, tops_i8_amx_asm_kernel, configure_amx)
1646
1698
*
1647
1699
* Scalar Operations Tensor Operations
1648
1700
*
1649
- * - `f64`: @b 34 Tera-OPS @b 67 Tera-OPS
1650
- * - `f32`: @b 67 Tera-OPS @b 989 Tera-OPS
1651
- * - `bf16`: @b 1.9 Peta-OPS
1652
- * - `f16`: @b 2.9 Peta-OPS
1653
- * - `i8`: @b 3.9 Peta-OPS
1701
+ * - `f64`: @b 34 T @b 67 T
1702
+ * - `f32`: @b 67 T @b 989 T
1703
+ * - `bf16`: @b 1.9 P
1704
+ * - `f16`: @b 2.9 P
1705
+ * - `i8`: @b 3.9 P
1654
1706
*
1655
1707
* This requires up to 700 W of power. A typical high-end server CPU uses
1656
1708
* under 500 W of power, and has similar number of cores to the GPUs number
1657
1709
* of Streaming Multiprocessors @b (SMs). The CPU can also run at a higher
1658
1710
* frequency, and has a larger cache, which is crucial for many workloads.
1659
1711
* On a single CPU core, we can achieve the following FMA throughput:
1660
1712
*
1661
- * Intel Granite Rapids AMD Zen4
1662
- *
1663
- * - AVX-512 `f64`: @b 1.5 Giga-OPS @b 58 Giga-OPS
1664
- * - AVX-512 `f32`: @b 4.8 Giga-OPS @b 117 Giga-OPS
1665
- *
1666
- * - AVX-512 `bf16`: @b 123 Giga-OPS @b 235 Giga-OPS
1667
- * - AVX-512 `f16`: @b 357 Giga-OPS 🤯🤯
1668
- * - AVX-512 `i8 • u8`: @b 708 Giga-OPS @b 470 Giga-Ops 🤯🤯
1669
- *
1670
- * - AMX `bf16`: @b 3.7 Tera-OPS
1671
- * - AMX `i8` and `u8`: @b 7.5 Tera-OPS 🤯🤯🤯
1672
- *
1673
- * On a typical dual-socket system:
1674
- *
1675
- * Intel Granite Rapids AMD Zen4
1676
- *
1677
- * - AVX-512 `f64`: @b ___ Tera-OPS @b 9.3 Tera-OPS
1678
- * - AVX-512 `f32`: @b ___ Tera-OPS @b 20.1 Tera-OPS
1679
- *
1680
- * - AVX-512 `bf16`: @b ___ Tera-OPS @b 41.8 Tera-OPS
1681
- * - AVX-512 `f16`: @b ___ Tera-OPS @b 39.6 Tera-Ops
1682
- * - AVX-512 `i8 • u8`: @b ___ Tera-OPS @b 81.3 Tera-Ops
1683
- *
1684
- * - AMX `bf16`: @b __ Tera-OPS
1685
- * - AMX `i8` and `u8`: @b __ Tera-OPS
1713
+ * Intel Xeon 4 AMD Zen 4 Graviton 4
1714
+ * @b FMA in AVX-512 & NEON:
1715
+ * - `f64`: @b 1.2-76 G ¹ @b 58 G @b 31 G
1716
+ * - `f32`: @b 3.1-135 G ¹ @b 117 G @b 63 G
1717
+ * - `bf16`: @b 121 G @b 235 G @b 101 G
1718
+ * - `f16`: @b 286 G 🤯🤯 - @b 116 G
1719
+ * - `i16`: @b 342 G 🤯🤯 - -
1720
+ * - `i7`: ² @b 678 G @b 470 G 🤯🤯 -
1721
+ * - `i8`, `u8`: - - @b 1.1 T
1722
+ * @b Mat-Mul in AMX & SME:
1723
+ * - `bf16`: @b 3.6 T - -
1724
+ * - `i8`, `u8`: @b 7.2 T 🤯🤯🤯 - -
1725
+ *
1726
+ * On a high-end dual-socket system, comparing `c7i.metal-48xl` to `c7a.metal-48xl`
1727
+ * and `c8g.metal-48xl` 192-core instances on AWS, this scales to:
1728
+ *
1729
+ * Intel Xeon 4 AMD Zen 4 Graviton 4
1730
+ * @b FMA in AVX-512 & NEON:
1731
+ * - `f64`: @b 0.2-8.2 T ¹ @b 9.3 T @b 4.2 T
1732
+ * - `f32`: @b 0.6-15.1 T ¹ @b 20.1 T @b 8.4 T
1733
+ * - `bf16`: @b 9.8 T @b 41.8 T @b 20.1 T
1734
+ * - `f16`: @b 35.4 T - @b 16.8 T
1735
+ * - `i16`: @b 34.3 T - -
1736
+ * - `i7`: @b 76 T @b 81.3 T -
1737
+ * - `i8`, `u8`: - - @b 38.2 T
1738
+ * @b Mat-Mul in AMX & SME:
1739
+ * - `bf16`: @b 301 T - -
1740
+ * - `i8`, `u8`: @b 683 T 🤯🤯🤯 - -
1741
+ *
1742
+ * > ¹ The FMA throughput on Intel can be insanely low for denormal numbers!
1743
+ * > ² AVX-512 has weird `i8` by `u8` multiplication instructions, which don't
1744
+ * seem useful for any 8-bit problems I've encountered, but are handy for
1745
+ * 7-bit representations.
1746
+ *
1747
+ * The Fused-Multiply-Add performance should be higher than separate Multiply
1748
+ * and Add operations. Moreover, there is no direct support for `bf16` math
1749
+ * in x86, so for some numeric types FMA is the only option.
1686
1750
*/
1751
+
1687
1752
#pragma endregion // Compute Bound Linear Algebra
1688
1753
1689
1754
#pragma region // Port Interleaving and Latency Hiding
@@ -1697,9 +1762,9 @@ BENCHMARK_CAPTURE(theoretic_tops, i8_amx, tops_i8_amx_asm_kernel, configure_amx)
1697
1762
1698
1763
#if defined(__AVX512VNNI__) && defined(__AMX_INT8__)
1699
1764
1700
- extern " C" std::uint32_t tops_i8u8_amx_avx512_asm_kernel (void );
1701
- BENCHMARK_CAPTURE (theoretic_tops, i8u8_amx_avx512, tops_i8u8_amx_avx512_asm_kernel , configure_amx)->MinTime(10 );
1702
- BENCHMARK_CAPTURE (theoretic_tops, i8u8_amx_avx512, tops_i8u8_amx_avx512_asm_kernel , configure_amx)
1765
+ extern " C" std::uint32_t tops_i7_amx_avx512fma_asm_kernel (void );
1766
+ BENCHMARK_CAPTURE (theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kernel , configure_amx)->MinTime(10 );
1767
+ BENCHMARK_CAPTURE (theoretic_tops, i7_amx_avx512, tops_i7_amx_avx512fma_asm_kernel , configure_amx)
1703
1768
->MinTime(10 )
1704
1769
->Threads(physical_cores());
1705
1770
@@ -2142,7 +2207,7 @@ static void cblas_tops(bm::State &state) {
2142
2207
/* beta: */ 0 , c.data (), ldc);
2143
2208
2144
2209
std::size_t tops_per_cycle = n * n * (n /* multiplications */ + (n - 1 ) /* additions */ );
2145
- state.counters [" TOPS " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
2210
+ state.counters [" TOP " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
2146
2211
state.SetComplexityN (n);
2147
2212
}
2148
2213
@@ -2179,12 +2244,14 @@ static void eigen_tops(bm::State &state) {
2179
2244
}
2180
2245
2181
2246
std::size_t tops_per_cycle = n * n * (n /* multiplications */ + (n - 1 ) /* additions */ );
2182
- state.counters [" TOPS " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
2247
+ state.counters [" TOP " ] = bm::Counter (state.iterations () * tops_per_cycle, bm::Counter::kIsRate );
2183
2248
state.SetComplexityN (n);
2184
2249
}
2185
2250
2186
- BENCHMARK (eigen_tops<float >)->RangeMultiplier(2 )->Range(8 , 65536 )->Complexity(benchmark::oNCubed);
2187
2251
BENCHMARK (eigen_tops<double >)->RangeMultiplier(2 )->Range(8 , 65536 )->Complexity(benchmark::oNCubed);
2252
+ BENCHMARK (eigen_tops<float >)->RangeMultiplier(2 )->Range(8 , 65536 )->Complexity(benchmark::oNCubed);
2253
+ BENCHMARK (eigen_tops<std::int16_t >)->RangeMultiplier(2 )->Range(8 , 65536 )->Complexity(benchmark::oNCubed);
2254
+ BENCHMARK (eigen_tops<std::int8_t >)->RangeMultiplier(2 )->Range(8 , 65536 )->Complexity(benchmark::oNCubed);
2188
2255
2189
2256
/* *
2190
2257
* Arm provides C language extensions for half-precision numbers, like
@@ -2199,14 +2266,47 @@ BENCHMARK(eigen_tops<double>)->RangeMultiplier(2)->Range(8, 65536)->Complexity(b
2199
2266
BENCHMARK (eigen_tops<__fp16>)->RangeMultiplier(2 )->Range(8 , 65536 )->Complexity(benchmark::oNCubed);
2200
2267
#endif
2201
2268
2202
- #if defined(__ARM_FEATURE_BF16)
2269
+ #if defined(__ARM_FEATURE_BF16) // ! May not be defined even if `__ARM_FEATURE_BF16_VECTOR_ARITHMETIC` is!
2203
2270
#include < arm_bf16.h>
2204
2271
BENCHMARK (eigen_tops<__bf16>)->RangeMultiplier(2 )->Range(8 , 65536 )->Complexity(benchmark::oNCubed);
2205
2272
#endif
2206
2273
2274
+ #if defined(__AVX512FP16__)
2275
+ #include < immintrin.h>
2276
+ BENCHMARK (eigen_tops<_Float16>)->RangeMultiplier(2 )->Range(8 , 65536 )->Complexity(benchmark::oNCubed);
2277
+ #endif
2278
+
2207
2279
/* *
2208
2280
* Now we can compare the theoretical limits to the actual performance
2209
- * of Eigen and BLAS libraries.
2281
+ * of Eigen and BLAS libraries. On a dual-socket system, 192-core Intel
2282
+ * Xeon 4 instances on AWS, we can achieve the following FMA throughput:
2283
+ *
2284
+ * Theoretical OpenBLAS Eigen
2285
+ *
2286
+ * - `f64` @b 4.1 T (AVX-512) @b 3.1 T @b 2.9 T
2287
+ * - `f32` @b 8.9 T (AVX-512) @b 6.4 T @b 7.5 T
2288
+ * - `bf16` @b 301 T (AMX) - -
2289
+ * - `f16` @b 35.4 T (AVX-512) - @b 396 G
2290
+ * - `i16`: @b 34.3 T (AVX-512) - @b 255 G
2291
+ * - `i8` & `u8` @b 683 T (AMX) - @b 182 G
2292
+ *
2293
+ * Important to note, for different libraries and data types, the highest
2294
+ * throughput was achieved with different shapes and the best number is shown.
2295
+ *
2296
+ * Similarly on the dual-socket Graviton 4 instances on AWS, we can achieve:
2297
+ *
2298
+ * Theoretical OpenBLAS Eigen
2299
+ *
2300
+ * - `f64` @b 4.2 T @b 1.2 T @b 1.2 T
2301
+ * - `f32` @b 8.4 T @b 2.3 T @b 1.3 T
2302
+ * - `bf16` @b 20.1 T - -
2303
+ * - `f16` @b 16.8 T - @b 660 G
2304
+ * - `i16`: - - @b 6.5 T
2305
+ * - `i8` & `u8` @b 38.2 T - @b 13.4 T
2306
+ *
2307
+ * As expected, modern libraries are generally far less optimized for Arm,
2308
+ * but for some applications dealing with 8-bit integers, Eigen can be good
2309
+ * enough.
2210
2310
*/
2211
2311
#pragma endregion // Memory Bound Linear Algebra
2212
2312
0 commit comments