New lab: Huge Pages

grahamking · Oct 18, 2022 · e944ed2 · e944ed2
1 parent 4ec74aa
commit e944ed2
Show file tree

Hide file tree

Showing 12 changed files with 362 additions and 3 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -22,6 +22,7 @@ members = [
   "labs/memory_bound/loop_tiling_1",
   "labs/memory_bound/swmem_prefetch_1",
   "labs/memory_bound/false_sharing_1",
+  "labs/memory_bound/huge_pages_1",
   "labs/bad_speculation/conditional_store_1",
   "labs/bad_speculation/lookup_tables_1",
   "labs/bad_speculation/virtual_call_mispredict",

diff --git a/README.md b/README.md
@@ -22,6 +22,7 @@ I recommend reading Denis' free ebook [Performance Analysis and Tuning on Modern
   * [Loop Tiling](labs/memory_bound/loop_tiling_1)
   * [SW memory prefetching](labs/memory_bound/swmem_prefetch_1)
   * [False Sharing](labs/memory_bound/false_sharing_1)
+  * [Huge Pages](labs/memory_bound/huge_pages_1)
 * Bad Speculation:
   * [Conditional Store](labs/bad_speculation/conditional_store_1)
   * [Replacing Branches With Lookup Tables](labs/bad_speculation/lookup_tables_1)

diff --git a/labs/memory_bound/huge_pages_1/Cargo.toml b/labs/memory_bound/huge_pages_1/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "huge_pages_1"
+version = "0.1.0"
+edition = "2021"
+
+[[bench]]
+name = "bench_huge_pages_1"
+harness = false
+
+[dependencies]
+rand = "0.8.5"
+
+[dev-dependencies]
+criterion = { version = "0.3.6", features = ["html_reports", "real_blackbox", "cargo_bench_support"] }
diff --git a/labs/memory_bound/huge_pages_1/README.md b/labs/memory_bound/huge_pages_1/README.md
@@ -0,0 +1,16 @@
+[Original C++ lab with docs and maybe video](https://github.com/dendibakh/perf-ninja/tree/main/labs/memory_bound/huge_pages_1)
+
+The Rust version is Linux-only so far. The C++ original also supports macOS and Windows. Contributions welcome!
+
+Observe the memory bottleneck:
+
+- Build the benchmark binary: `cargo bench --no-run`. It should print the path to the binary.
+- Confirm we're loading a lot from main memory: `perf stat -e cache-references,LLC-loads,LLC-load-misses <binary>`. I get over 50% Last Level Cache (L3) misses, meaning those loads had to go to main memory.
+- Check TLB: `perf stat -e dTLB-loads,dTLB-load-misses <binary>`. I have about 12% TLB misses (before optimization).
+
+Optimize:
+
+Enable huge pages on Linux (128 pages is a guess, try other numbers): `sudo bash -c 'echo 128 > /proc/sys/vm/nr_hugepages'`. If you use anonymous mmapped pages, I don't think you need to mount a `hugetlbfs` filesystem [like the docs recommend](https://github.com/dendibakh/perf-ninja/blob/main/labs/memory_bound/huge_pages_1/HugePagesSetupTips.md).
+
+I got a ~30% speedup.
+
diff --git a/labs/memory_bound/huge_pages_1/benches/bench_huge_pages_1.rs b/labs/memory_bound/huge_pages_1/benches/bench_huge_pages_1.rs
@@ -0,0 +1,55 @@
+#![feature(allocator_api)]
+
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+
+use huge_pages_1::{allocator, generate_mesh, solution};
+use rand::distributions::{Distribution, Uniform};
+use rand::rngs::StdRng;
+use rand::SeedableRng;
+
+/// Benchmarks `solution` on a large truss mesh.
+///
+/// The coordinate, left-hand-side and right-hand-side vectors are all
+/// allocated through `allocator()`, so swapping that allocator is what the
+/// measurement reflects.
+fn bench1(c: &mut Criterion) {
+    // Mesh
+    const N_NODES_X: u32 = 800;
+    const N_NODES_Y: u32 = 20000;
+    const N_NODES: u32 = N_NODES_X * N_NODES_Y;
+    // Two degrees of freedom per node.
+    const N_DOFS: usize = 2 * N_NODES as usize;
+
+    const SEED: u64 = 0xaf173e8a;
+    let alloc = allocator();
+    let mut x = Vec::with_capacity_in(N_NODES as usize, alloc);
+    let mut y = Vec::with_capacity_in(N_NODES as usize, alloc);
+    x.resize(N_NODES as usize, 0.0);
+    y.resize(N_NODES as usize, 0.0);
+    let topology = generate_mesh(N_NODES_X, N_NODES_Y, &mut x, &mut y, SEED);
+
+    // Generate random left-hand side.
+    // Push exactly N_DOFS samples into the pre-sized buffer. Resizing to full
+    // length first and pushing afterwards would double the vector, force a
+    // reallocation, and leave zeros in every slot that `solution` reads.
+    let mut lhs: Vec<f64, _> = Vec::with_capacity_in(N_DOFS, alloc);
+    let mut prng = StdRng::seed_from_u64(SEED);
+    let dist = Uniform::from(0.0..42.0);
+    for _ in 0..N_DOFS {
+        lhs.push(dist.sample(&mut prng));
+    }
+
+    // Right-hand side; `solution` resizes it to N_DOFS on first use.
+    let mut rhs = Vec::with_capacity_in(N_DOFS, alloc);
+
+    // Run the benchmark
+
+    let mut group = c.benchmark_group("huge_pages_1");
+    group.sample_size(10);
+    //state.SetBytesProcessed(state.iterations() * topology.size() * 4 * sizeof(double));
+    group.throughput(Throughput::Bytes(
+        (topology.len() * 4 * std::mem::size_of::<f64>()) as u64,
+    ));
+
+    group.bench_function("Apply matrix-free operator", |b| {
+        b.iter(|| {
+            solution(&topology, N_NODES, &x, &y, &lhs, &mut rhs);
+            std::hint::black_box(&rhs);
+            //benchmark::ClobberMemory();
+        });
+    });
+    group.finish();
+}
+
+criterion_group!(benches, bench1);
+criterion_main!(benches);
diff --git a/labs/memory_bound/huge_pages_1/src/allocator.rs b/labs/memory_bound/huge_pages_1/src/allocator.rs
@@ -0,0 +1,6 @@
+use std::alloc::{Allocator, System};
+
+/// Returns the allocator that the benchmark and the tests use for their
+/// vectors.
+///
+/// As shipped this is the standard `System` allocator. Replace the body of
+/// this function with a huge pages allocator.
+pub fn allocator() -> &'static impl Allocator {
+    &System
+}
diff --git a/labs/memory_bound/huge_pages_1/src/generate_mesh.rs b/labs/memory_bound/huge_pages_1/src/generate_mesh.rs
@@ -0,0 +1,79 @@
+// Generate an example mesh describing a 2D truss.
+// In this example, the nodes are distributed on a cartesian grid, the topology
+// of the mesh is as follows:
+/*
+ * o-o-o-o-o-o  ^
+ * |\|\|\|\|\|  |
+ * o-o-o-o-o-o  |  n_nodes_y layers
+ * |\|\|\|\|\|  |
+ * o-o-o-o-o-o  |
+ *
+ * ----------->
+ * n_nodes_x layers
+ *
+ */
+
+use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};
+
+/// Generates the truss mesh sketched above with randomly renumbered nodes.
+///
+/// * `n_nodes_x`, `n_nodes_y` - grid dimensions, see sketch above
+/// * `x`, `y` - slices where node coordinates will be written; each needs
+///   space for at least `n_nodes_x * n_nodes_y` values
+/// * `seed` - seed for the RNG
+///
+/// Returns the topology (see lib.rs): one `[n1, n2]` pair of node IDs per
+/// element, in shuffled order. `x[id]` / `y[id]` hold the coordinates of the
+/// node with that ID.
+///
+/// # Panics
+///
+/// Panics if either dimension is zero, or if `x` or `y` is shorter than
+/// `n_nodes_x * n_nodes_y`.
+pub fn generate_mesh(
+    n_nodes_x: u32,
+    n_nodes_y: u32,
+    x: &mut [f64],
+    y: &mut [f64],
+    seed: u64,
+) -> Vec<[u32; 2]> {
+    // The `n - 1` terms below would underflow for an empty grid.
+    assert!(
+        n_nodes_x > 0 && n_nodes_y > 0,
+        "mesh needs at least one node in each direction"
+    );
+    let n_nodes = n_nodes_x * n_nodes_y;
+    // Slice up front so a too-short buffer fails before anything is written.
+    let x = &mut x[..n_nodes as usize];
+    let y = &mut y[..n_nodes as usize];
+
+    // Topology: horizontal, vertical and diagonal element for every grid
+    // cell, plus the last column (inside the loop) and the last row (after).
+    let mut topology: Vec<[u32; 2]> = Vec::with_capacity(
+        ((n_nodes_x - 1) * (n_nodes_y - 1) * 3 + (n_nodes_x - 1) + (n_nodes_y - 1)) as usize,
+    );
+    for j in 0..(n_nodes_y - 1) {
+        for i in 0..(n_nodes_x - 1) {
+            let base = n_nodes_x * j + i;
+            topology.push([base, base + 1]);
+            topology.push([base, base + n_nodes_x]);
+            topology.push([base, base + n_nodes_x + 1]);
+        }
+        topology.push([n_nodes_x * (j + 1) - 1, n_nodes_x * (j + 2) - 1]);
+    }
+    for i in 0..(n_nodes_x - 1) {
+        topology.push([
+            n_nodes_x * (n_nodes_y - 1) + i,
+            n_nodes_x * (n_nodes_y - 1) + i + 1,
+        ]);
+    }
+
+    // Shuffle: grid node i gets the new ID permutation[i].
+    let mut prng = StdRng::seed_from_u64(seed);
+    let mut permutation: Vec<u32> = (0..n_nodes).collect();
+    permutation.shuffle(&mut prng);
+
+    // Node coords. Grid node i sits in column i % n_nodes_x and row
+    // i / n_nodes_x. Its coordinates are stored at the slot of its new ID so
+    // that they stay consistent with the renumbered topology below.
+    for (i, &new_id) in (0..n_nodes).zip(permutation.iter()) {
+        x[new_id as usize] = (i % n_nodes_x) as f64;
+        y[new_id as usize] = (i / n_nodes_x) as f64;
+    }
+
+    for [n1, n2] in topology.iter_mut() {
+        *n1 = permutation[*n1 as usize];
+        *n2 = permutation[*n2 as usize];
+    }
+    topology.shuffle(&mut prng);
+
+    topology
+}
diff --git a/labs/memory_bound/huge_pages_1/src/lib.rs b/labs/memory_bound/huge_pages_1/src/lib.rs
@@ -0,0 +1,132 @@
+//////////////////////////////////////////////////////////////
+//                       ATTENTION                          //
+// You are not meant to modify this file. Please focus on   //
+// allocator.rs                                             //
+//////////////////////////////////////////////////////////////
+
+#![feature(maybe_uninit_uninit_array)]
+#![feature(maybe_uninit_array_assume_init)]
+#![feature(allocator_api)]
+
+use std::alloc::Allocator;
+use std::mem::MaybeUninit;
+
+mod allocator;
+pub use allocator::allocator;
+mod generate_mesh;
+pub use generate_mesh::generate_mesh;
+
+#[cfg(test)]
+mod tests;
+
+// Multiplies the 4x4 local matrix of one truss element by `lhs_local`.
+//
+// coords - [x1, y1, x2, y2] of the element's two end nodes
+// lhs_local - the four left-hand-side values belonging to those nodes
+//
+// returns the four local products, scaled by E * A / l^3 where l is the
+// distance between the two nodes
+fn compute_local_product(coords: &[f64; 4], &lhs_local: &[f64; 4]) -> [f64; 4] {
+    const E: f64 = 210e9;
+    #[allow(clippy::approx_constant)]
+    const A: f64 = 3.14 * 1e-2 * 1e-2;
+
+    let dx = coords[2] - coords[0];
+    let dy = coords[3] - coords[1];
+    let (dx2, dy2, dxdy) = (dx * dx, dy * dy, dx * dy);
+
+    // The lower two rows of the matrix are the negated upper two rows.
+    let first = [dx2, dxdy, -dx2, -dxdy];
+    let second = [dxdy, dy2, -dxdy, -dy2];
+    let k = [first, second, first.map(|v| -v), second.map(|v| -v)];
+
+    // Accumulate row by row so the summation order stays c = 0, 1, 2, 3.
+    let mut product = [0f64; 4];
+    for (row, &weight) in k.iter().zip(lhs_local.iter()) {
+        for (acc, &entry) in product.iter_mut().zip(row.iter()) {
+            *acc += entry * weight;
+        }
+    }
+
+    let length = f64::sqrt(dx2 + dy2);
+    let scale = E * A / (length * length * length);
+    product.map(|v| v * scale)
+}
+
+// Indices of the four degrees of freedom of an element: two consecutive
+// entries for node n1 followed by two consecutive entries for node n2.
+const fn compute_dofs(n1: u32, n2: u32) -> [u32; 4] {
+    [n1 * 2, n1 * 2 + 1, n2 * 2, n2 * 2 + 1]
+}
+
+// Reads the four entries of `rhs_global` addressed by the degrees of freedom
+// of nodes n1 and n2, in the order given by compute_dofs.
+fn gather_global(n1: u32, n2: u32, rhs_global: &[f64]) -> [f64; 4] {
+    let dofs = compute_dofs(n1, n2);
+    let mut gathered: [MaybeUninit<f64>; 4] = MaybeUninit::uninit_array();
+    for (slot, dof) in gathered.iter_mut().zip(dofs) {
+        slot.write(rhs_global[dof as usize]);
+    }
+    // SAFETY: `gathered` and `dofs` both have length 4, so the loop above
+    // wrote every slot.
+    unsafe { MaybeUninit::array_assume_init(gathered) }
+}
+
+// Adds the four local values to the entries of `rhs_global` addressed by the
+// degrees of freedom of nodes n1 and n2.
+fn scatter_local(n1: u32, n2: u32, vals: &[f64; 4], rhs_global: &mut [f64]) {
+    for (dof, contribution) in compute_dofs(n1, n2).into_iter().zip(vals) {
+        rhs_global[dof as usize] += *contribution;
+    }
+}
+
+// Local contribution of the element described by the nodes (n1, n2): gathers
+// the element's lhs values, multiplies them by the local matrix and adds the
+// result into rhs. Remaining arguments are the same as the arguments of
+// solution(...)
+fn processs_element(n1: u32, n2: u32, x: &[f64], y: &[f64], lhs: &[f64], rhs: &mut [f64]) {
+    let lhs_vals = gather_global(n1, n2, lhs);
+    let (first, second) = (n1 as usize, n2 as usize);
+    let coords = [x[first], y[first], x[second], y[second]];
+    let contribution = compute_local_product(&coords, &lhs_vals);
+    scatter_local(n1, n2, &contribution, rhs);
+}
+
+// Evaluate matrix-free operator for a 2D truss
+//
+// topo - topology of the mesh. Each entry represents a single element,
+//        described by the 2 IDs of the nodes of the element.
+// n_nodes - total number of nodes in the mesh
+// x, y - coordinates of the nodes; the i-th entry belongs to the i-th node
+// lhs - left-hand side vector, i.e. the vector to be multiplied by the
+//       stiffness matrix. It has a length of 2 * n_nodes (2 DOFs per node)
+// rhs - right-hand side, receives the result. It is resized to 2 * n_nodes;
+//       newly added entries start at 0.0, entries already present are kept
+//       and the element contributions are added on top of them.
+pub fn solution<A: Allocator>(
+    topo: &[[u32; 2]],
+    n_nodes: u32,
+    x: &[f64],
+    y: &[f64],
+    lhs: &[f64],
+    rhs: &mut Vec<f64, A>,
+) {
+    let n_dofs = n_nodes as usize * 2;
+    rhs.resize(n_dofs, 0.0);
+    let out: &mut [f64] = rhs.as_mut_slice();
+    topo.iter()
+        .for_each(|&[n1, n2]| processs_element(n1, n2, x, y, lhs, out));
+}
diff --git a/labs/memory_bound/huge_pages_1/src/tests.rs b/labs/memory_bound/huge_pages_1/src/tests.rs
@@ -0,0 +1,49 @@
+use crate::{allocator, generate_mesh, solution};
+use rand::distributions::{Distribution, Uniform};
+use std::alloc::{Allocator, System};
+
+const N_NODES_X: u32 = 100;
+const N_NODES_Y: u32 = 200;
+const N_NODES: u32 = N_NODES_X * N_NODES_Y;
+
+/// Runs the solver on the fixed test mesh with every vector placed in `alloc`.
+///
+/// The mesh seed and the left-hand-side seed are the same on every call, so
+/// two calls differ only in the allocator they use.
+fn solve<A: Allocator>(alloc: &'static A) -> Vec<f64, &'static A> {
+    // Two degrees of freedom per node.
+    const N_DOFS: usize = 2 * N_NODES as usize;
+    // Fixed seed: `validate` compares two separate runs, so both must see the
+    // same left-hand side.
+    const LHS_SEED: u64 = 0;
+
+    let mut x = Vec::with_capacity_in(N_NODES as usize, alloc);
+    let mut y = Vec::with_capacity_in(N_NODES as usize, alloc);
+    x.resize(N_NODES as usize, 0.0);
+    y.resize(N_NODES as usize, 0.0);
+    let topology = generate_mesh(N_NODES_X, N_NODES_Y, &mut x, &mut y, 0);
+
+    // Generate random left-hand side.
+    // Push exactly N_DOFS samples. Resizing first and pushing afterwards would
+    // leave zeros in every slot the solver reads, making the comparison in
+    // `validate` trivially true.
+    let mut lhs: Vec<f64, _> = Vec::with_capacity_in(N_DOFS, alloc);
+    let mut prng = <rand::rngs::StdRng as rand::SeedableRng>::seed_from_u64(LHS_SEED);
+    let dist = Uniform::from(0.0..42.0);
+    for _ in 0..N_DOFS {
+        lhs.push(dist.sample(&mut prng));
+    }
+
+    // Right-hand side
+    let mut rhs = Vec::with_capacity_in(N_DOFS, alloc);
+
+    // Eval operator
+    solution(&topology, N_NODES, &x, &y, &lhs, &mut rhs);
+
+    rhs
+}
+
+// Checks that the solution computed with the user's allocator matches the one
+// computed with the System allocator, up to an L2 error of 1e-9.
+#[test]
+fn validate() {
+    let sol_user = solve(allocator());
+    let sol_valid = solve(&System);
+
+    // Sum of squared element-wise differences between the two runs.
+    let squared_error = sol_valid
+        .iter()
+        .enumerate()
+        .fold(0.0, |acc, (i, &reference)| {
+            let diff = reference - sol_user[i];
+            acc + diff * diff
+        });
+    let l2_error = f64::sqrt(squared_error);
+
+    assert!(l2_error <= 1e-9);
+}
diff --git a/labs/memory_bound/loop_interchange_1/benches/bench_loop_interchange_1.rs b/labs/memory_bound/loop_interchange_1/benches/bench_loop_interchange_1.rs
@@ -1,5 +1,3 @@
-#![feature(bench_black_box)]
-
 use criterion::{criterion_group, criterion_main, Criterion};
 
 use loop_interchange_1::{init, power, zero, N};