Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Timestamp normalization #7343

Merged
merged 2 commits into from
Apr 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions tests/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ itertools.workspace = true
image.workspace = true
libtest-mimic.workspace = true
log.workspace = true
nanorand.workspace = true
parking_lot.workspace = true
png.workspace = true
pollster.workspace = true
Expand Down
2 changes: 2 additions & 0 deletions tests/tests/wgpu-gpu/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ mod texture_binding;
mod texture_blit;
mod texture_bounds;
mod texture_view_creation;
mod timestamp_normalization;
mod timestamp_query;
mod transfer;
mod transition_resources;
mod vertex_formats;
Expand Down
8 changes: 7 additions & 1 deletion tests/tests/wgpu-gpu/occlusion_query/mod.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
use wgpu::InstanceFlags;
use wgpu_test::{gpu_test, FailureCase, GpuTestConfiguration, TestParameters};

#[gpu_test]
static OCCLUSION_QUERY: GpuTestConfiguration = GpuTestConfiguration::new()
.parameters(TestParameters::default().expect_fail(FailureCase::webgl2()))
.parameters(
TestParameters::default()
.expect_fail(FailureCase::webgl2())
// Ensure timestamp normalization does not interfere with occlusion query results
.instance_flags(InstanceFlags::AUTOMATIC_TIMESTAMP_NORMALIZATION),
)
.run_async(|ctx| async move {
// Create depth texture
let depth_texture = ctx.device.create_texture(&wgpu::TextureDescriptor {
Expand Down
1 change: 1 addition & 0 deletions tests/tests/wgpu-gpu/timestamp_normalization/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mod utils;
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Must have "wgpu-core/src/timestamp_normalization/common.wgsl"
// preprocessed before this file's contents.

struct ShiftRight96 {
value: Uint96,
shift: u32,
}

@group(0) @binding(0)
var<storage> input: array<ShiftRight96>;

@group(0) @binding(1)
var<storage, read_write> output: array<Uint96>;

@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) id: vec3u) {
let index = id.x;

let input = input[index];

output[index] = shift_right_96(input.value, input.shift);
}
23 changes: 23 additions & 0 deletions tests/tests/wgpu-gpu/timestamp_normalization/u64_mul_u32.wgsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Must have "wgpu-core/src/timestamp_normalization/common.wgsl"
// preprocessed before this file's contents.

struct U64MulU32Input {
left: Uint64,
right: u32,
_padding: u32,
}

@group(0) @binding(0)
var<storage> input: array<U64MulU32Input>;

@group(0) @binding(1)
var<storage, read_write> output: array<Uint96>;

@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) id: vec3u) {
let index = id.x;

let input = input[index];

output[index] = u64_mul_u32(input.left, input.right);
}
285 changes: 285 additions & 0 deletions tests/tests/wgpu-gpu/timestamp_normalization/utils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
//! Tests for the timestamp normalization algorithm's utility functions.
//!
//! Because they involve multiple kinds of hand-rolled math operations,
//! we do testing to ensure the overall operation (which is very simple)
//! works correctly.

use nanorand::Rng;
use wgpu::{util::DeviceExt, Limits};
use wgpu_test::{gpu_test, GpuTestConfiguration, TestParameters, TestingContext};

#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct Uint96(u32, u32, u32);

impl Uint96 {
fn from_u128(value: u128) -> Self {
let a = (value & 0xFFFF_FFFF) as u32;
let b = ((value >> 32) & 0xFFFF_FFFF) as u32;
let c = ((value >> 64) & 0xFFFF_FFFF) as u32;

Self(a, b, c)
}

fn as_u128(&self) -> u128 {
((self.2 as u128) << 64) | ((self.1 as u128) << 32) | (self.0 as u128)
}
}

#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct U64MulU32Input {
left: u64,
right: u32,
_pad: u32,
}

impl U64MulU32Input {
fn new(left: u64, right: u32) -> Self {
Self {
left,
right,
_pad: 0,
}
}
}

fn assert_u64_mul_u32(left: u64, right: u32, computed: Uint96) {
let real = left as u128 * right as u128;

let computed = computed.as_u128();

assert_eq!(
computed, real,
"{left} * {right} should be {real} but is {}",
computed
);
}

#[gpu_test]
static U64_MUL_U32: GpuTestConfiguration = GpuTestConfiguration::new()
.parameters(
TestParameters::default()
.test_features_limits()
.limits(Limits {
max_storage_buffer_binding_size: 256 * 1024 * 1024,
..Limits::downlevel_defaults()
}),
)
.run_sync(test_u64_mul_u32);

fn test_u64_mul_u32(ctx: TestingContext) {
const TOTAL_RANDOM_INPUTS: usize = 1_000_000;
const MANUAL_INPUTS: usize = 2;

const TOTAL_INPUTS: usize = TOTAL_RANDOM_INPUTS + MANUAL_INPUTS;

let mut inputs = Vec::with_capacity(TOTAL_INPUTS);

inputs.push(U64MulU32Input::new(2, 2));
inputs.push(U64MulU32Input::new(u64::MAX, u32::MAX));

// Smoke test the algorithm by generating 1M random inputs, and checking the results.
let mut generator = nanorand::WyRand::new_seed(0xDEAD_BEEF);

for _ in 0..TOTAL_RANDOM_INPUTS {
let left = generator.generate::<u64>();
let right = generator.generate::<u32>();

inputs.push(U64MulU32Input::new(left, right));
}

assert_eq!(TOTAL_INPUTS, inputs.len());

let output_bytes = process_shader(
ctx,
bytemuck::cast_slice(&inputs),
include_str!("u64_mul_u32.wgsl"),
);
let output_values = bytemuck::pod_collect_to_vec(&output_bytes);

for (&input, &output) in inputs.iter().zip(output_values.iter()) {
assert_u64_mul_u32(input.left, input.right, output);
}
}

#[repr(C)]
#[derive(Debug, Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct ShiftRightU96Input {
value: Uint96,
shift: u32,
}

impl ShiftRightU96Input {
fn new(value: u128, shift: u32) -> Self {
assert!(shift <= 32);
assert!(value >> 96 == 0);

Self {
value: Uint96::from_u128(value),
shift,
}
}
}

fn assert_shift_right_u96(value: Uint96, shift: u32, computed: Uint96) {
let value = value.as_u128();

let real = value >> shift;

let computed = computed.as_u128();

assert_eq!(
computed, real,
"{value:X} >> {shift} should be {real:X} but is {computed:X}",
);
}

#[gpu_test]
static SHIFT_RIGHT_U96: GpuTestConfiguration = GpuTestConfiguration::new()
.parameters(
TestParameters::default()
.test_features_limits()
.limits(Limits {
max_storage_buffer_binding_size: 256 * 1024 * 1024,
..Limits::downlevel_defaults()
}),
)
.run_sync(test_shift_right_u96);

fn test_shift_right_u96(ctx: TestingContext) {
const TOTAL_RANDOM_INPUTS: usize = 1_000_000;
const TOTAL_SHIFT_INPUTS: usize = 33;
const MANUAL_INPUTS: usize = 1;

const TOTAL_INPUTS: usize = TOTAL_RANDOM_INPUTS + TOTAL_SHIFT_INPUTS + MANUAL_INPUTS;

let mut inputs = Vec::with_capacity(TOTAL_INPUTS);

inputs.push(ShiftRightU96Input::new(1, 1));

for shift in 0..TOTAL_SHIFT_INPUTS {
// 96 bit number with a visually recognizable pattern.
const INTERESTING_NUMBER: u128 = 0x1234_5678_9ABC_DEF0_1234_5678;

inputs.push(ShiftRightU96Input::new(INTERESTING_NUMBER, shift as u32));
}

// Smoke test the algorithm by generating 1M random inputs, and checking the results.
let mut generator = nanorand::WyRand::new_seed(0xDEAD_BEEF);

for _ in 0..TOTAL_RANDOM_INPUTS {
// nanorand doesn't have generate_range for u128, so just chop the top bits off.
let value = generator.generate::<u128>() >> 32;
let shift = generator.generate_range(0..=32);

inputs.push(ShiftRightU96Input::new(value, shift));
}

assert_eq!(TOTAL_INPUTS, inputs.len());

let output_bytes = process_shader(
ctx,
bytemuck::cast_slice(&inputs),
include_str!("shift_right_u96.wgsl"),
);

let output_values = bytemuck::pod_collect_to_vec(&output_bytes);

for (&input, &output) in inputs.iter().zip(output_values.iter()) {
assert_shift_right_u96(input.value, input.shift, output);
}
}

fn process_shader(ctx: TestingContext, inputs: &[u8], entry_point_src: &str) -> Vec<u8> {
let common_src = include_str!("../../../../wgpu-core/src/timestamp_normalization/common.wgsl");

let full_source = format!("{common_src}\n{entry_point_src}");

let shader_module = ctx
.device
.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some("u64_mul_u32"),
source: wgpu::ShaderSource::Wgsl(full_source.into()),
});

let pipeline = ctx
.device
.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("u64_mul_u32"),
layout: None,
module: &shader_module,
entry_point: None,
compilation_options: wgpu::PipelineCompilationOptions::default(),
cache: None,
});

let input_buffer = ctx
.device
.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("Input Buffer"),
contents: inputs,
usage: wgpu::BufferUsages::STORAGE,
});

let output_size = (size_of::<Uint96>() * inputs.len()) as u64;

let output_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
label: Some("Output Buffer"),
size: output_size,
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
});

let pulldown_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
label: Some("Pulldown Buffer"),
size: output_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});

let bgl = pipeline.get_bind_group_layout(0);

let bg = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("Bind Group"),
layout: &bgl,
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: input_buffer.as_entire_binding(),
},
wgpu::BindGroupEntry {
binding: 1,
resource: output_buffer.as_entire_binding(),
},
],
});

let mut encoder = ctx
.device
.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("Compute Encoder"),
});

let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: Some("Compute Pass"),
timestamp_writes: None,
});

cpass.set_pipeline(&pipeline);
cpass.set_bind_group(0, &bg, &[]);
cpass.dispatch_workgroups(inputs.len().div_ceil(256) as u32, 1, 1);

drop(cpass);

encoder.copy_buffer_to_buffer(&output_buffer, 0, &pulldown_buffer, 0, output_size);

ctx.queue.submit([encoder.finish()]);
pulldown_buffer.map_async(wgpu::MapMode::Read, .., |_| {});

ctx.device.poll(wgpu::PollType::Wait).unwrap();

let value = pulldown_buffer.get_mapped_range(..).to_vec();

value
}
Loading
Loading