Skip to content

Commit

Permalink
New lab: Huge Pages
Browse files Browse the repository at this point in the history
  • Loading branch information
grahamking committed Oct 18, 2022
1 parent 4ec74aa commit e944ed2
Show file tree
Hide file tree
Showing 12 changed files with 362 additions and 3 deletions.
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ members = [
"labs/memory_bound/loop_tiling_1",
"labs/memory_bound/swmem_prefetch_1",
"labs/memory_bound/false_sharing_1",
"labs/memory_bound/huge_pages_1",
"labs/bad_speculation/conditional_store_1",
"labs/bad_speculation/lookup_tables_1",
"labs/bad_speculation/virtual_call_mispredict",
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ I recommend reading Denis' free ebook [Performance Analysis and Tuning on Modern
* [Loop Tiling](labs/memory_bound/loop_tiling_1)
* [SW memory prefetching](labs/memory_bound/swmem_prefetch_1)
* [False Sharing](labs/memory_bound/false_sharing_1)
* [Huge Pages](labs/memory_bound/huge_pages_1)
* Bad Speculation:
* [Conditional Store](labs/bad_speculation/conditional_store_1)
* [Replacing Branches With Lookup Tables](labs/bad_speculation/lookup_tables_1)
Expand Down
14 changes: 14 additions & 0 deletions labs/memory_bound/huge_pages_1/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Manifest for the huge_pages_1 lab crate.
[package]
name = "huge_pages_1"
version = "0.1.0"
edition = "2021"

# The benchmark provides its own `main` through Criterion's `criterion_main!`,
# so the default libtest bench harness is switched off.
[[bench]]
name = "bench_huge_pages_1"
harness = false

# Random number generation, used by the library (mesh generation), the tests
# and the benchmark.
[dependencies]
rand = "0.8.5"

# Only needed to build and run the benchmark.
[dev-dependencies]
criterion = { version = "0.3.6", features = ["html_reports", "real_blackbox", "cargo_bench_support"] }
16 changes: 16 additions & 0 deletions labs/memory_bound/huge_pages_1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[Original C++ lab with docs and maybe video](https://github.com/dendibakh/perf-ninja/tree/main/labs/memory_bound/huge_pages_1)

Rust version is Linux only so far. The C++ original also supports macOS and Windows. Contributions welcome!

Observe the memory bottleneck:

- Build the benchmark binary: `cargo bench --no-run`. It should print the path to the binary.
- Confirm we're loading a lot from main memory: `perf stat -e cache-references,LLC-loads,LLC-load-misses <binary>`. I get over 50% Last Level Cache (L3) misses, meaning those loads had to go to main memory.
- Check TLB: `perf stat -e dTLB-loads,dTLB-load-misses <binary>`. I have about 12% TLB misses (before optimization).

Optimize:

Enable huge pages on Linux (128 pages is a guess, try other numbers): `sudo bash -c 'echo 128 > /proc/sys/vm/nr_hugepages'`. If you use anonymous mmapped pages, I don't think you need to mount a `hugetlbfs` filesystem [like the docs recommend](https://github.com/dendibakh/perf-ninja/blob/main/labs/memory_bound/huge_pages_1/HugePagesSetupTips.md).

I got a ~30% speedup.

55 changes: 55 additions & 0 deletions labs/memory_bound/huge_pages_1/benches/bench_huge_pages_1.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#![feature(allocator_api)]

use criterion::{criterion_group, criterion_main, Criterion, Throughput};

use huge_pages_1::{allocator, generate_mesh, solution};
use rand::distributions::{Distribution, Uniform};
use rand::rngs::StdRng;
use rand::SeedableRng;

// Benchmarks `solution` on an 800 x 20000 node mesh.
//
// Every large buffer (`x`, `y`, `lhs`, `rhs`) is allocated through
// `allocator()`, so replacing that allocator changes where the benchmark's
// working set lives.
fn bench1(c: &mut Criterion) {
    // Mesh
    const N_NODES_X: u32 = 800;
    const N_NODES_Y: u32 = 20000;
    const N_NODES: u32 = N_NODES_X * N_NODES_Y;

    const SEED: u64 = 0xaf173e8a;
    let alloc = allocator();
    let mut x = Vec::with_capacity_in(N_NODES as usize, alloc);
    let mut y = Vec::with_capacity_in(N_NODES as usize, alloc);
    x.resize(N_NODES as usize, 0.0);
    y.resize(N_NODES as usize, 0.0);
    let topology = generate_mesh(N_NODES_X, N_NODES_Y, &mut x, &mut y, SEED);

    // Generate random left-hand side.
    // The vector is sized first and then overwritten in place. Pushing onto
    // the already-resized vector would append the random values *after* the
    // 2 * N_NODES entries that `solution` reads (leaving those all zero) and
    // would force the buffer to be reallocated.
    let mut lhs = Vec::with_capacity_in(2 * N_NODES as usize, alloc);
    lhs.resize(lhs.capacity(), 0.0);
    let mut prng = StdRng::seed_from_u64(SEED);
    let dist = Uniform::from(0.0..42.0);
    for value in lhs.iter_mut() {
        *value = dist.sample(&mut prng);
    }

    // Right-hand side (sized by `solution`)
    let mut rhs = Vec::with_capacity_in(2 * N_NODES as usize, alloc);

    // Run the benchmark

    let mut group = c.benchmark_group("huge_pages_1");
    group.sample_size(10);
    // Mirrors the C++ original:
    // state.SetBytesProcessed(state.iterations() * topology.size() * 4 * sizeof(double));
    group.throughput(Throughput::Bytes(
        (topology.len() * 4 * std::mem::size_of::<f64>()) as u64,
    ));

    group.bench_function("Apply matrix-free operator", |b| {
        b.iter(|| {
            solution(&topology, N_NODES, &x, &y, &lhs, &mut rhs);
            // Keep the optimizer from discarding the computed result
            // (stands in for benchmark::ClobberMemory() in the C++ original).
            std::hint::black_box(&rhs);
        });
    });
}

// Register `bench1` in a Criterion group called `benches` and let Criterion
// generate the entry point that runs that group.
criterion_group!(benches, bench1);
criterion_main!(benches);
6 changes: 6 additions & 0 deletions labs/memory_bound/huge_pages_1/src/allocator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
use std::alloc::{Allocator, System};

// Allocator used for every large buffer in the benchmark and the tests.
//
// Lab exercise: replace the body of this function with a huge pages allocator.
// As shipped it simply hands out the default system allocator.
pub fn allocator() -> &'static impl Allocator {
    static DEFAULT_ALLOCATOR: System = System;
    &DEFAULT_ALLOCATOR
}
79 changes: 79 additions & 0 deletions labs/memory_bound/huge_pages_1/src/generate_mesh.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Generate an example mesh describing a 2D truss.
// In this example, the nodes are distributed on a cartesian grid, the topology
// of the mesh is as follows:
/*
* o-o-o-o-o-o ^
* |\|\|\|\|\| |
* o-o-o-o-o-o | n_nodes_y layers
* |\|\|\|\|\| |
* o-o-o-o-o-o |
*
* ----------->
* n_nodes_x layers
*
*/

use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng};

// n_nodes_x, n_nodes_y - see sketch above
// x, y - arrays where node coordinates will be written, need to have space
// allocated for at least (n_nodes_x*n_nodes_y) doubles
// seed - seed for RNG
//
// returns topology (see lib.rs)
pub fn generate_mesh(
n_nodes_x: u32,
n_nodes_y: u32,
x: &mut [f64],
y: &mut [f64],
seed: u64,
) -> Vec<[u32; 2]> {
let n_nodes = n_nodes_x * n_nodes_y;

// Topology
let mut topology: Vec<[u32; 2]> = Vec::with_capacity(
((n_nodes_x - 1) * (n_nodes_y - 1) * 3 + (n_nodes_x - 1) + (n_nodes_y - 1)) as usize,
);
for j in 0..(n_nodes_y - 1) {
for i in 0..(n_nodes_x - 1) {
let base = n_nodes_x * j + i;
topology.push([base, base + 1]);
topology.push([base, base + n_nodes_x]);
topology.push([base, base + n_nodes_x + 1]);
}
topology.push([n_nodes_x * (j + 1) - 1, n_nodes_x * (j + 2) - 1]);
}
for i in 0..(n_nodes_x - 1) {
topology.push([
n_nodes_x * (n_nodes_y - 1) + i,
n_nodes_x * (n_nodes_y - 1) + i + 1,
]);
}

// Node coords
let mut x_unshuffled = vec![0.0; n_nodes as usize];
let mut y_unshuffled = vec![0.0; n_nodes as usize];
let mut coord_ind: usize = 0;
for j in 0..n_nodes_y {
for i in 0..n_nodes_x {
x_unshuffled[coord_ind] = i as f64;
y_unshuffled[coord_ind] = j as f64;
coord_ind += 1;
}
}

// Shuffle
let mut prng = StdRng::seed_from_u64(seed);
let mut permutation: Vec<f64> = (0..n_nodes).map(|x| x as f64).collect();
permutation.shuffle(&mut prng);
x[..n_nodes as usize].copy_from_slice(&x_unshuffled[..n_nodes as usize]);
y[..n_nodes as usize].copy_from_slice(&y_unshuffled[..n_nodes as usize]);

for [n1, n2] in topology.iter_mut() {
*n1 = permutation[*n1 as usize] as u32;
*n2 = permutation[*n2 as usize] as u32;
}
topology.shuffle(&mut prng);

topology
}
132 changes: 132 additions & 0 deletions labs/memory_bound/huge_pages_1/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
//////////////////////////////////////////////////////////////
// ATTENTION //
// You are not meant to modify this file. Please focus on //
// allocator.rs //
//////////////////////////////////////////////////////////////

#![feature(maybe_uninit_uninit_array)]
#![feature(maybe_uninit_array_assume_init)]
#![feature(allocator_api)]

use std::alloc::Allocator;
use std::mem::MaybeUninit;

mod allocator;
pub use allocator::allocator;
mod generate_mesh;
pub use generate_mesh::generate_mesh;

#[cfg(test)]
mod tests;

// Product of one element's 4x4 local matrix with its 4 local lhs values.
//
// coords - [x1, y1, x2, y2] of the element's two nodes
// lhs_local - the 4 left-hand side values belonging to the element's DOFs
//
// returns the 4 local result values, already scaled by E * A / length^3
fn compute_local_product(coords: &[f64; 4], &lhs_local: &[f64; 4]) -> [f64; 4] {
    // NOTE(review): presumably Young's modulus and cross-section area - confirm.
    const E: f64 = 210e9;
    #[allow(clippy::approx_constant)]
    const A: f64 = 3.14 * 1e-2 * 1e-2;

    let [x1, y1, x2, y2] = *coords;
    let delta_x: f64 = x2 - x1;
    let delta_y: f64 = y2 - y1;
    let xx: f64 = delta_x * delta_x;
    let yy: f64 = delta_y * delta_y;
    let xy: f64 = delta_x * delta_y;

    // The last two rows of the local matrix are the negated first two.
    let upper = [[xx, xy, -xx, -xy], [xy, yy, -xy, -yy]];
    let matrix = [
        upper[0],
        upper[1],
        upper[0].map(|entry| -entry),
        upper[1].map(|entry| -entry),
    ];

    // Accumulate row by row so the summation order per output entry is fixed.
    let mut product = [0f64; 4];
    for (row, &weight) in matrix.iter().zip(lhs_local.iter()) {
        for (sum, &entry) in product.iter_mut().zip(row.iter()) {
            *sum += entry * weight;
        }
    }

    let length = f64::sqrt(xx + yy);
    let scale = E * A / (length * length * length);
    product.map(|value| value * scale)
}

// Indices into the global vectors for the two nodes of an element: node `n`
// owns the consecutive entries `2 * n` and `2 * n + 1`.
//
// returns [2*n1, 2*n1 + 1, 2*n2, 2*n2 + 1]
const fn compute_dofs(n1: u32, n2: u32) -> [u32; 4] {
    let first = n1 * 2;
    let second = n2 * 2;
    [first, first + 1, second, second + 1]
}

fn gather_global(n1: u32, n2: u32, rhs_global: &[f64]) -> [f64; 4] {
let dofs = compute_dofs(n1, n2);
let mut vals: [MaybeUninit<f64>; 4] = MaybeUninit::uninit_array();
let mut i = 0;
while i < dofs.len() {
vals[i].write(rhs_global[dofs[i] as usize]);
i += 1;
}
unsafe { MaybeUninit::array_assume_init(vals) }
}

// Adds the 4 local values of the element (n1, n2) onto the matching entries
// of a global vector.
//
// n1, n2 - node IDs of the element
// vals - local values, ordered like compute_dofs(n1, n2)
// rhs_global - global vector that is accumulated into (2 entries per node)
fn scatter_local(n1: u32, n2: u32, vals: &[f64; 4], rhs_global: &mut [f64]) {
    let targets = compute_dofs(n1, n2);
    for (&target, &value) in targets.iter().zip(vals.iter()) {
        rhs_global[target as usize] += value;
    }
}

// Local contribution of the element described by the nodes (n1, n2): gathers
// the element's lhs values and node coordinates, computes the local product
// and adds it into rhs. Remaining arguments are the same as the arguments of
// solution(...)
fn processs_element(n1: u32, n2: u32, x: &[f64], y: &[f64], lhs: &[f64], rhs: &mut [f64]) {
    let (first, second) = (n1 as usize, n2 as usize);
    let local_lhs: [f64; 4] = gather_global(n1, n2, lhs);
    // Layout expected by compute_local_product: [x1, y1, x2, y2]
    let endpoints = [x[first], y[first], x[second], y[second]];
    let contribution = compute_local_product(&endpoints, &local_lhs);
    scatter_local(n1, n2, &contribution, rhs);
}

// Evaluate matrix-free operator for a 2D truss
//
// topo - topology of the mesh. Each entry in the vector represents a single
// element, described by the 2 IDs of the nodes of the element.
// n_nodes - total number of nodes in the mesh
// x, y - arrays containing the coordinates of the nodes - i-th entry contains
// the coordinates of the i-th node
// lhs - left-hand side vector - this is the vector which is to be multiplied by
// the stiffness matrix. It has a length of 2 * n_nodes (2 DOFs per node)
// rhs - right-hand side - this is the vector where we want to write the result
// of the multiplication (same length as lhs). Any previous contents are
// discarded; on return it holds exactly 2 * n_nodes entries.
//
// Panics if a node ID in `topo` is out of range for `x`, `y`, `lhs` or `rhs`.
pub fn solution<A: Allocator>(
    topo: &[[u32; 2]],
    n_nodes: u32,
    x: &[f64],
    y: &[f64],
    lhs: &[f64],
    rhs: &mut Vec<f64, A>,
) {
    // `resize` alone leaves existing entries untouched, so a reused `rhs`
    // would keep accumulating results from earlier calls. Clear it first so
    // every entry starts from zero; the allocation itself is retained.
    rhs.clear();
    rhs.resize(n_nodes as usize * 2, 0.0);
    for &[n1, n2] in topo {
        processs_element(n1, n2, x, y, lhs, rhs);
    }
}
49 changes: 49 additions & 0 deletions labs/memory_bound/huge_pages_1/src/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
use crate::{allocator, generate_mesh, solution};
use rand::distributions::{Distribution, Uniform};
use std::alloc::{Allocator, System};

const N_NODES_X: u32 = 100;
const N_NODES_Y: u32 = 200;
const N_NODES: u32 = N_NODES_X * N_NODES_Y;

// Runs `solution` once on the test mesh, with every buffer allocated from
// `alloc`, and returns the resulting right-hand side.
//
// All inputs are derived from fixed seeds, so two calls produce the same
// problem and their results can be compared entry by entry.
fn solve<A: Allocator>(alloc: &'static A) -> Vec<f64, &'static A> {
    // Seed for the left-hand side. It must be fixed: with an unseeded RNG the
    // two runs compared by `validate` would be given different inputs.
    const LHS_SEED: u64 = 0;

    let mut x = Vec::with_capacity_in(N_NODES as usize, alloc);
    let mut y = Vec::with_capacity_in(N_NODES as usize, alloc);
    x.resize(N_NODES as usize, 0.0);
    y.resize(N_NODES as usize, 0.0);
    let topology = generate_mesh(N_NODES_X, N_NODES_Y, &mut x, &mut y, 0);

    // Generate random left-hand side.
    // Overwrite the sized vector in place. Pushing onto it would append the
    // random values after the 2 * N_NODES entries that `solution` reads,
    // leaving those all zero.
    let mut lhs = Vec::with_capacity_in(2 * N_NODES as usize, alloc);
    lhs.resize(2 * N_NODES as usize, 0.0);

    let mut prng = <rand::rngs::StdRng as rand::SeedableRng>::seed_from_u64(LHS_SEED);
    let dist = Uniform::from(0.0..42.0);
    for value in lhs.iter_mut() {
        *value = dist.sample(&mut prng);
    }

    // Right-hand side (sized by `solution`)
    let mut rhs = Vec::with_capacity_in(2 * N_NODES as usize, alloc);

    // Eval operator
    solution(&topology, N_NODES, &x, &y, &lhs, &mut rhs);

    rhs
}

// The result computed with the user's allocator must match the result
// computed with the system allocator (L2 norm of the difference <= 1e-9).
#[test]
fn validate() {
    let sol_user = solve(allocator());
    let sol_valid = solve(&System);

    // Sum of squared differences, taken over every entry of the reference.
    let squared_error = sol_valid
        .iter()
        .enumerate()
        .fold(0.0, |sum, (index, &expected)| {
            let actual = sol_user[index];
            sum + (expected - actual) * (expected - actual)
        });

    assert!(f64::sqrt(squared_error) <= 1e-9);
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#![feature(bench_black_box)]

use criterion::{criterion_group, criterion_main, Criterion};

use loop_interchange_1::{init, power, zero, N};
Expand Down
Loading

0 comments on commit e944ed2

Please sign in to comment.