AztecProtocol · crStiv · Jan 30, 2025 · Jan 30, 2025 · Jan 30, 2025 · Jan 30, 2025
diff --git a/barretenberg/cpp/src/barretenberg/numeric/uint128/uint128.test.cpp b/barretenberg/cpp/src/barretenberg/numeric/uint128/uint128.test.cpp
@@ -1,6 +1,7 @@
 #include "uint128.hpp"
 #include "../random/engine.hpp"
 #include <gtest/gtest.h>
+#include <chrono>
 #ifdef __i386__
 
 using namespace bb;
@@ -317,4 +318,124 @@ TEST(uint128, ToFromBuffer)
     auto b = from_buffer<uint128_t>(buf);
     EXPECT_EQ(a, b);
 }
-#endif
+
+TEST(uint128, karatsuba_multiplication_correctness)
+{
+    // Basic case: expected value is a literal, not a*b again, so the check is not a tautology.
+    uint128_t a(123456789);
+    uint128_t b(987654321);
+    EXPECT_EQ(a * b, uint128_t(121932631112635269ULL));
+
+    // Largest 32-bit operands: the cross terms exceed 32 bits and must carry into the high word.
+    uint128_t max_32bit(0xFFFFFFFF);
+    EXPECT_EQ(max_32bit * max_32bit, uint128_t(0xFFFFFFFE00000001));
+
+    // 2^31 * 2^31 = 2^62: the whole product lies above bit 31 of the result.
+    uint128_t pow2_31(0x80000000);
+    EXPECT_EQ(pow2_31 * pow2_31, uint128_t(0x4000000000000000ULL));
+
+    // Powers of two whose product still fits in 32 bits.
+    uint128_t pow2_16(1ULL << 16);
+    uint128_t pow2_15(1ULL << 15);
+    EXPECT_EQ(pow2_16 * pow2_15, uint128_t(1ULL << 31));
+}
+
+TEST(uint128, karatsuba_multiplication_performance)
+{
+    // Informational micro-benchmark: prints timings, asserts only that the work was performed.
+    const int NUM_ITERATIONS = 1000000;
+    uint128_t a(0xDEADBEEF);
+    uint128_t b(0xCAFEBABE);
+    uint128_t result(0);
+
+    // steady_clock is monotonic, so the measured interval cannot be skewed by wall-clock adjustments.
+    const auto start = std::chrono::steady_clock::now();
+    for (int i = 0; i < NUM_ITERATIONS; ++i) {
+        // Accumulate every product so that no iteration is dead code the optimizer may drop.
+        result += a * b;
+        a += uint128_t(1);
+        b += uint128_t(2);
+    }
+    const auto end = std::chrono::steady_clock::now();
+    const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+    // 10^6 products, each below 2^64, sum to far less than 2^128, so the total cannot wrap to zero.
+    EXPECT_NE(result, uint128_t(0));
+
+    // Print performance metrics
+    std::cout << "Karatsuba multiplication performance test:\n";
+    std::cout << "Time for " << NUM_ITERATIONS << " multiplications: "
+              << duration.count() << " microseconds\n";
+    std::cout << "Average time per multiplication: "
+              << static_cast<double>(duration.count()) / NUM_ITERATIONS
+              << " microseconds\n";
+}
+
+TEST(uint128, karatsuba_vs_standard_multiplication)
+{
+    // Cross-checks karatsuba_mul against the schoolbook 16-bit split; the timings are informational.
+    const size_t NUM_ITERATIONS = 1000;
+    // Boundary operands first (they maximise cross terms and carries), then random ones.
+    std::vector<std::pair<uint32_t, uint32_t>> test_cases = {
+        { 0U, 0U }, { 0xFFFFFFFFU, 0xFFFFFFFFU }, { 0xFFFFFFFFU, 1U }, { 0x0000FFFFU, 0xFFFF0000U }
+    };
+    test_cases.reserve(NUM_ITERATIONS);
+    while (test_cases.size() < NUM_ITERATIONS) {
+        const auto lhs = static_cast<uint32_t>(engine.get_random_uint64());
+        const auto rhs = static_cast<uint32_t>(engine.get_random_uint64());
+        test_cases.push_back({ lhs, rhs });
+    }
+
+    // Reserve up front so the timed loops measure multiplication rather than vector reallocation.
+    std::vector<std::pair<uint32_t, uint32_t>> karatsuba_results;
+    std::vector<std::pair<uint32_t, uint32_t>> standard_results;
+    karatsuba_results.reserve(test_cases.size());
+    standard_results.reserve(test_cases.size());
+
+    // Time Karatsuba implementation. steady_clock is monotonic, so intervals are reliable.
+    auto start = std::chrono::steady_clock::now();
+    for (const auto& test : test_cases) {
+        karatsuba_results.push_back(bb::numeric::karatsuba_mul(test.first, test.second));
+    }
+    const auto karatsuba_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
+
+    // Time standard implementation
+    start = std::chrono::steady_clock::now();
+    for (const auto& test : test_cases) {
+        const uint32_t a_lo = test.first & 0xffffULL;
+        const uint32_t a_hi = test.first >> 16ULL;
+        const uint32_t b_lo = test.second & 0xffffULL;
+        const uint32_t b_hi = test.second >> 16ULL;
+        const uint32_t lo_lo = a_lo * b_lo;
+        const uint32_t hi_lo = a_hi * b_lo;
+        const uint32_t lo_hi = a_lo * b_hi;
+        const uint32_t hi_hi = a_hi * b_hi;
+        const uint32_t cross = (lo_lo >> 16) + (hi_lo & 0xffffULL) + lo_hi;
+        standard_results.push_back(
+            { (cross << 16ULL) | (lo_lo & 0xffffULL), (hi_lo >> 16ULL) + (cross >> 16ULL) + hi_hi });
+    }
+    const auto standard_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
+
+    // Verify results match
+    for (size_t i = 0; i < test_cases.size(); ++i) {
+        EXPECT_EQ(karatsuba_results[i].first, standard_results[i].first)
+            << "Mismatch in low bits for test case " << i;
+        EXPECT_EQ(karatsuba_results[i].second, standard_results[i].second)
+            << "Mismatch in high bits for test case " << i;
+    }
+
+    // Print performance comparison
+    std::cout << "\nPerformance comparison over " << NUM_ITERATIONS << " multiplications:\n"
+              << "Karatsuba implementation: " << karatsuba_time << " microseconds\n"
+              << "Standard implementation: " << standard_time << " microseconds\n"
+              << "Performance improvement: ";
+    // Microsecond resolution can report 0 for such a short loop; do not divide by zero in that case.
+    if (standard_time > 0) {
+        std::cout << (standard_time - karatsuba_time) * 100.0 / standard_time << "%\n";
+    } else {
+        std::cout << "n/a (standard run below timer resolution)\n";
+    }
+}
+#endif
diff --git a/barretenberg/cpp/src/barretenberg/numeric/uint128/uint128_impl.hpp b/barretenberg/cpp/src/barretenberg/numeric/uint128/uint128_impl.hpp
@@ -5,21 +5,50 @@
 #include "barretenberg/common/assert.hpp"
 namespace bb::numeric {
 
-constexpr std::pair<uint32_t, uint32_t> uint128_t::mul_wide(const uint32_t a, const uint32_t b)
-{
-    const uint32_t a_lo = a & 0xffffULL;
-    const uint32_t a_hi = a >> 16ULL;
-    const uint32_t b_lo = b & 0xffffULL;
-    const uint32_t b_hi = b >> 16ULL;
-
-    const uint32_t lo_lo = a_lo * b_lo;
-    const uint32_t hi_lo = a_hi * b_lo;
-    const uint32_t lo_hi = a_lo * b_hi;
-    const uint32_t hi_hi = a_hi * b_hi;
-
-    const uint32_t cross = (lo_lo >> 16) + (hi_lo & 0xffffULL) + lo_hi;
-
-    return { (cross << 16ULL) | (lo_lo & 0xffffULL), (hi_lo >> 16ULL) + (cross >> 16ULL) + hi_hi };
+/**
+ * @brief Full 32x32 -> 64-bit multiplication using a Karatsuba-style split into 16-bit halves.
+ *
+ * Each operand is split as a = a_hi * 2^16 + a_lo and b = b_hi * 2^16 + b_lo, so that
+ *   a * b = z2 * 2^32 + z1 * 2^16 + z0
+ * with
+ *   z0 = a_lo * b_lo
+ *   z2 = a_hi * b_hi
+ *   z1 = (a_lo + a_hi) * (b_lo + b_hi) - z0 - z2   (= a_hi * b_lo + a_lo * b_hi)
+ *
+ * The sums a_lo + a_hi and b_lo + b_hi are 17-bit values, so their product needs up to 34 bits and
+ * z1 needs up to 33 bits. All intermediates are therefore held in uint64_t: doing this arithmetic in
+ * uint32_t silently truncates z1 and drops the carry from the low word into the high word.
+ *
+ * @param a First 32-bit operand
+ * @param b Second 32-bit operand
+ * @return std::pair<uint32_t, uint32_t> {low 32 bits, high 32 bits} of the 64-bit product
+ */
+constexpr std::pair<uint32_t, uint32_t> karatsuba_mul(const uint32_t a, const uint32_t b)
+{
+    constexpr uint32_t SPLIT_POINT = 16;
+    constexpr uint64_t LOW_MASK = (1ULL << SPLIT_POINT) - 1;
+
+    // Split into 16-bit halves, widened to 64 bits so no later step can wrap.
+    const uint64_t a_lo = a & LOW_MASK;
+    const uint64_t a_hi = a >> SPLIT_POINT;
+    const uint64_t b_lo = b & LOW_MASK;
+    const uint64_t b_hi = b >> SPLIT_POINT;
+
+    const uint64_t z0 = a_lo * b_lo;
+    const uint64_t z2 = a_hi * b_hi;
+    // Cannot underflow: the product of the sums equals z0 + z2 plus the two non-negative cross products.
+    const uint64_t z1 = (a_lo + a_hi) * (b_lo + b_hi) - z0 - z2;
+
+    // Recombine. The total equals a * b < 2^64, so the additions cannot overflow.
+    const uint64_t product = (z2 << (2 * SPLIT_POINT)) + (z1 << SPLIT_POINT) + z0;
+
+    return { static_cast<uint32_t>(product), static_cast<uint32_t>(product >> (2 * SPLIT_POINT)) };
+}
+
+constexpr std::pair<uint32_t, uint32_t> uint128_t::mul_wide(const uint32_t a, const uint32_t b)
+{
+    // Delegate to the free function so the same code path can be called without a uint128_t.
+    return karatsuba_mul(a, b);
 }
 
 // compute a + b + carry, returning the carry