-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4ec74aa
commit e944ed2
Showing
12 changed files
with
362 additions
and
3 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
[package] | ||
name = "huge_pages_1" | ||
version = "0.1.0" | ||
edition = "2021" | ||
|
||
[[bench]] | ||
name = "bench_huge_pages_1" | ||
harness = false | ||
|
||
[dependencies] | ||
rand = "0.8.5" | ||
|
||
[dev-dependencies] | ||
criterion = { version = "0.3.6", features = ["html_reports", "real_blackbox", "cargo_bench_support"] } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
[Original C++ lab with docs and maybe video](https://github.com/dendibakh/perf-ninja/tree/main/labs/memory_bound/huge_pages_1) | ||
|
||
Rust version is Linux only so far. The C++ original also supports macOS and Windows. Contributions welcome! | ||
|
||
Observe the memory bottleneck: | ||
|
||
- Build benchmark binary: `cargo bench --no-run`. It should print the path to the binary. | ||
- Confirm we're loading a lot from main memory: `perf stat -e cache-references,LLC-loads,LLC-load-misses <binary>`. I get over 50% Last Level Cache (L3) misses, meaning those loads had to go to main memory. | ||
- Check TLB: `perf stat -e dTLB-loads,dTLB-load-misses <binary>`. I have about 12% TLB misses (before optimization). | ||
|
||
Optimize: | ||
|
||
Enable huge pages on Linux (128 pages is a guess, try other numbers): `sudo bash -c 'echo 128 > /proc/sys/vm/nr_hugepages'`. If you use anonymous mmapped pages, I don't think you need to mount a `hugetlbfs` filesystem [like the docs recommend](https://github.com/dendibakh/perf-ninja/blob/main/labs/memory_bound/huge_pages_1/HugePagesSetupTips.md). | ||
|
||
I got a ~30% speedup. | ||
|
55 changes: 55 additions & 0 deletions
55
labs/memory_bound/huge_pages_1/benches/bench_huge_pages_1.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#![feature(allocator_api)] | ||
|
||
use criterion::{criterion_group, criterion_main, Criterion, Throughput}; | ||
|
||
use huge_pages_1::{allocator, generate_mesh, solution}; | ||
use rand::distributions::{Distribution, Uniform}; | ||
use rand::rngs::StdRng; | ||
use rand::SeedableRng; | ||
|
||
fn bench1(c: &mut Criterion) { | ||
// Mesh | ||
const N_NODES_X: u32 = 800; | ||
const N_NODES_Y: u32 = 20000; | ||
const N_NODES: u32 = N_NODES_X * N_NODES_Y; | ||
|
||
const SEED: u64 = 0xaf173e8a; | ||
let alloc = allocator(); | ||
let mut x = Vec::with_capacity_in(N_NODES as usize, alloc); | ||
let mut y = Vec::with_capacity_in(N_NODES as usize, alloc); | ||
x.resize(N_NODES as usize, 0.0); | ||
y.resize(N_NODES as usize, 0.0); | ||
let topology = generate_mesh(N_NODES_X, N_NODES_Y, &mut x, &mut y, SEED); | ||
|
||
// Generate random left-hand side | ||
let mut lhs = Vec::with_capacity_in(2 * N_NODES as usize, alloc); | ||
lhs.resize(lhs.capacity(), 0.0); | ||
let mut prng = StdRng::seed_from_u64(SEED); | ||
let dist = Uniform::from(0.0..42.0); | ||
for _ in 0..lhs.len() { | ||
lhs.push(dist.sample(&mut prng)); | ||
} | ||
|
||
// Right-hand side | ||
let mut rhs = Vec::with_capacity_in(2 * N_NODES as usize, alloc); | ||
|
||
// Run the benchmark | ||
|
||
let mut group = c.benchmark_group("huge_pages_1"); | ||
group.sample_size(10); | ||
//state.SetBytesProcessed(state.iterations() * topology.size() * 4 * sizeof(double)); | ||
group.throughput(Throughput::Bytes( | ||
(topology.len() * 4 * std::mem::size_of::<f64>()) as u64, | ||
)); | ||
|
||
group.bench_function("Apply matrix-free operator", |b| { | ||
b.iter(|| { | ||
solution(&topology, N_NODES, &x, &y, &lhs, &mut rhs); | ||
std::hint::black_box(&rhs); | ||
//benchmark::ClobberMemory(); | ||
}); | ||
}); | ||
} | ||
|
||
criterion_group!(benches, bench1); | ||
criterion_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
use std::alloc::{Allocator, System}; | ||
|
||
// Replace the body of this function with a huge pages allocator | ||
pub fn allocator() -> &'static impl Allocator { | ||
&System | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
// Generate an example mesh describing a 2D truss. | ||
// In this example, the nodes are distributed on a cartesian grid, the topology | ||
// of the mesh is as follows: | ||
/* | ||
* o-o-o-o-o-o ^ | ||
* |\|\|\|\|\| | | ||
* o-o-o-o-o-o | n_nodes_y layers | ||
* |\|\|\|\|\| | | ||
* o-o-o-o-o-o | | ||
* | ||
* -----------> | ||
* n_nodes_x layers | ||
* | ||
*/ | ||
|
||
use rand::{rngs::StdRng, seq::SliceRandom, SeedableRng}; | ||
|
||
// n_nodes_x, n_nodes_y - see sketch above | ||
// x, y - arrays where node coordinates will be written, need to have space | ||
// allocated for at least (n_nodes_x*n_nodes_y) doubles | ||
// seed - seed for RNG | ||
// | ||
// returns topology (see lib.rs) | ||
pub fn generate_mesh( | ||
n_nodes_x: u32, | ||
n_nodes_y: u32, | ||
x: &mut [f64], | ||
y: &mut [f64], | ||
seed: u64, | ||
) -> Vec<[u32; 2]> { | ||
let n_nodes = n_nodes_x * n_nodes_y; | ||
|
||
// Topology | ||
let mut topology: Vec<[u32; 2]> = Vec::with_capacity( | ||
((n_nodes_x - 1) * (n_nodes_y - 1) * 3 + (n_nodes_x - 1) + (n_nodes_y - 1)) as usize, | ||
); | ||
for j in 0..(n_nodes_y - 1) { | ||
for i in 0..(n_nodes_x - 1) { | ||
let base = n_nodes_x * j + i; | ||
topology.push([base, base + 1]); | ||
topology.push([base, base + n_nodes_x]); | ||
topology.push([base, base + n_nodes_x + 1]); | ||
} | ||
topology.push([n_nodes_x * (j + 1) - 1, n_nodes_x * (j + 2) - 1]); | ||
} | ||
for i in 0..(n_nodes_x - 1) { | ||
topology.push([ | ||
n_nodes_x * (n_nodes_y - 1) + i, | ||
n_nodes_x * (n_nodes_y - 1) + i + 1, | ||
]); | ||
} | ||
|
||
// Node coords | ||
let mut x_unshuffled = vec![0.0; n_nodes as usize]; | ||
let mut y_unshuffled = vec![0.0; n_nodes as usize]; | ||
let mut coord_ind: usize = 0; | ||
for j in 0..n_nodes_y { | ||
for i in 0..n_nodes_x { | ||
x_unshuffled[coord_ind] = i as f64; | ||
y_unshuffled[coord_ind] = j as f64; | ||
coord_ind += 1; | ||
} | ||
} | ||
|
||
// Shuffle | ||
let mut prng = StdRng::seed_from_u64(seed); | ||
let mut permutation: Vec<f64> = (0..n_nodes).map(|x| x as f64).collect(); | ||
permutation.shuffle(&mut prng); | ||
x[..n_nodes as usize].copy_from_slice(&x_unshuffled[..n_nodes as usize]); | ||
y[..n_nodes as usize].copy_from_slice(&y_unshuffled[..n_nodes as usize]); | ||
|
||
for [n1, n2] in topology.iter_mut() { | ||
*n1 = permutation[*n1 as usize] as u32; | ||
*n2 = permutation[*n2 as usize] as u32; | ||
} | ||
topology.shuffle(&mut prng); | ||
|
||
topology | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
////////////////////////////////////////////////////////////// | ||
// ATTENTION // | ||
// You are not meant to modify this file. Please focus on // | ||
// allocator.rs // | ||
////////////////////////////////////////////////////////////// | ||
|
||
#![feature(maybe_uninit_uninit_array)] | ||
#![feature(maybe_uninit_array_assume_init)] | ||
#![feature(allocator_api)] | ||
|
||
use std::alloc::Allocator; | ||
use std::mem::MaybeUninit; | ||
|
||
mod allocator; | ||
pub use allocator::allocator; | ||
mod generate_mesh; | ||
pub use generate_mesh::generate_mesh; | ||
|
||
#[cfg(test)] | ||
mod tests; | ||
|
||
// Multiplies the 4x4 local matrix of one element by the element's four local
// left-hand-side values and returns the four resulting values.
//
// coords    - the element's end points, laid out as [x1, y1, x2, y2]
// lhs_local - the four left-hand-side values belonging to the element
fn compute_local_product(coords: &[f64; 4], &lhs_local: &[f64; 4]) -> [f64; 4] {
    let [x1, y1, x2, y2] = *coords;
    let dx = x2 - x1;
    let dy = y2 - y1;
    let (dx2, dy2, dxdy) = (dx * dx, dy * dy, dx * dy);

    // The last two rows are the negations of the first two.
    let k = [
        [dx2, dxdy, -dx2, -dxdy],
        [dxdy, dy2, -dxdy, -dy2],
        [-dx2, -dxdy, dx2, dxdy],
        [-dxdy, -dy2, dxdy, dy2],
    ];

    // product[r] = sum over c of k[c][r] * lhs_local[c], added up in order of
    // increasing c.
    let mut product = [0f64; 4];
    for (row, weight) in k.iter().zip(lhs_local.iter()) {
        for (acc, entry) in product.iter_mut().zip(row.iter()) {
            *acc += entry * weight;
        }
    }

    const E: f64 = 210e9;
    // 3.14 is kept as written (it trips clippy's PI lint) so results do not change.
    #[allow(clippy::approx_constant)]
    const A: f64 = 3.14 * 1e-2 * 1e-2;

    // Scale everything by E * A / length^3.
    let length = f64::sqrt(dx2 + dy2);
    let scale = E * A / (length * length * length);
    product.map(|value| value * scale)
}
|
||
// Indices into the global vectors for the element (n1, n2): node `n` owns the
// two consecutive entries `2 * n` and `2 * n + 1`.
const fn compute_dofs(n1: u32, n2: u32) -> [u32; 4] {
    let first = n1 * 2;
    let second = n2 * 2;
    [first, first + 1, second, second + 1]
}
|
||
fn gather_global(n1: u32, n2: u32, rhs_global: &[f64]) -> [f64; 4] { | ||
let dofs = compute_dofs(n1, n2); | ||
let mut vals: [MaybeUninit<f64>; 4] = MaybeUninit::uninit_array(); | ||
let mut i = 0; | ||
while i < dofs.len() { | ||
vals[i].write(rhs_global[dofs[i] as usize]); | ||
i += 1; | ||
} | ||
unsafe { MaybeUninit::array_assume_init(vals) } | ||
} | ||
|
||
fn scatter_local(n1: u32, n2: u32, vals: &[f64; 4], rhs_global: &mut [f64]) { | ||
let dofs = compute_dofs(n1, n2); | ||
let mut i = 0; | ||
while i < dofs.len() { | ||
rhs_global[dofs[i] as usize] += vals[i]; | ||
i += 1; | ||
} | ||
} | ||
|
||
// Local contrbution of the element described by the nodes (n1, n2). Remaining | ||
// arguments are the same as the arguments of solution(...) | ||
fn processs_element(n1: u32, n2: u32, x: &[f64], y: &[f64], lhs: &[f64], rhs: &mut [f64]) { | ||
let lhs_vals: [f64; 4] = gather_global(n1, n2, lhs); | ||
let mut coords = [0f64; 4]; | ||
coords[0] = x[n1 as usize]; | ||
coords[1] = y[n1 as usize]; | ||
coords[2] = x[n2 as usize]; | ||
coords[3] = y[n2 as usize]; | ||
let local_prod = compute_local_product(&coords, &lhs_vals); | ||
scatter_local(n1, n2, &local_prod, rhs); | ||
} | ||
|
||
// Evaluate matrix-free operator for a 2D truss | ||
// | ||
// topo - topology of the mesh. Each entry in the vector represents a single | ||
// element, described by the 2 IDs of the nodes of the element. | ||
// n_nodes - total number of nodes in the mesh | ||
// x, y - arrays containing the coordinates of the nodes - i-th entry contains | ||
// the coordinates of the i-th node | ||
// lhs - left-hand side vector - this is the vector which is to be multiplied by | ||
// the stiffness matrix. It has a length of 2 * n_nodes (2 DOFs per node) | ||
// rhs - right-hand side - this is the vector where we want to write the result | ||
// of the multiplication (same length as lhs) | ||
pub fn solution<A: Allocator>( | ||
topo: &[[u32; 2]], | ||
n_nodes: u32, | ||
x: &[f64], | ||
y: &[f64], | ||
lhs: &[f64], | ||
rhs: &mut Vec<f64, A>, | ||
) { | ||
rhs.resize(n_nodes as usize * 2, 0.0); | ||
for [n1, n2] in topo { | ||
processs_element(*n1, *n2, x, y, lhs, rhs); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
use crate::{allocator, generate_mesh, solution}; | ||
use rand::distributions::{Distribution, Uniform}; | ||
use std::alloc::{Allocator, System}; | ||
|
||
const N_NODES_X: u32 = 100; | ||
const N_NODES_Y: u32 = 200; | ||
const N_NODES: u32 = N_NODES_X * N_NODES_Y; | ||
|
||
fn solve<A: Allocator>(alloc: &'static A) -> Vec<f64, &'static A> { | ||
let mut x = Vec::with_capacity_in(N_NODES as usize, alloc); | ||
let mut y = Vec::with_capacity_in(N_NODES as usize, alloc); | ||
x.resize(N_NODES as usize, 0.0); | ||
y.resize(N_NODES as usize, 0.0); | ||
let topology = generate_mesh(N_NODES_X, N_NODES_Y, &mut x, &mut y, 0); | ||
|
||
// Generate random left-hand side | ||
let mut lhs = Vec::with_capacity_in(2 * N_NODES as usize, alloc); | ||
lhs.resize(2 * N_NODES as usize, 0.0); | ||
|
||
let mut prng = rand::thread_rng(); | ||
let dist = Uniform::from(0.0..42.0); | ||
for _ in 0..lhs.len() { | ||
lhs.push(dist.sample(&mut prng)); | ||
} | ||
|
||
// Right-hand side | ||
let mut rhs = Vec::with_capacity_in(2 * N_NODES as usize, alloc); | ||
|
||
// Eval operator | ||
solution(&topology, N_NODES, &x, &y, &lhs, &mut rhs); | ||
|
||
rhs | ||
} | ||
|
||
// Checks that the operator gives the same result with the user's allocator as
// with the system allocator, measured as the L2 norm of the difference.
#[test]
fn validate() {
    let sol_user = solve(allocator());
    let sol_valid = solve(&System);

    // Compare lengths first so that extra or missing entries cannot go unnoticed.
    assert_eq!(
        sol_user.len(),
        sol_valid.len(),
        "result computed with the user allocator has the wrong length"
    );

    let sum_of_squares: f64 = sol_valid
        .iter()
        .zip(sol_user.iter())
        .map(|(a, b)| (a - b) * (a - b))
        .fold(0.0, |acc, squared| acc + squared);
    let l2_error = sum_of_squares.sqrt();

    assert!(l2_error <= 1e-9, "L2 error too large: {l2_error}");
}
2 changes: 0 additions & 2 deletions
2
labs/memory_bound/loop_interchange_1/benches/bench_loop_interchange_1.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.