Profiler #1

Open · wants to merge 2 commits into main
25 changes: 19 additions & 6 deletions bench.py
@@ -13,10 +13,11 @@
 )
 
 
-def bench(function, input, input_keys, dim, n_iters=10):
+def bench(function, input, input_keys, dim, n_iters=2):
     start = time.time()
     for _ in range(n_iters):
         function(input, input_keys, dim)
+        torch.cuda.synchronize()
 
     elapsed = time.time() - start
     direct = elapsed / n_iters
@@ -28,6 +29,7 @@ def bench(function, input, input_keys, dim, n_iters=10):
 
     start = time.time()
     summed.backward()
+    torch.cuda.synchronize()
     elapsed += time.time() - start
 
     backward = elapsed / n_iters
@@ -40,19 +42,30 @@ def bench(function, input, input_keys, dim, n_iters=10):
 n_features = 1000
 
 torch.manual_seed(0xDEADBEEF)
-X = torch.rand((n_samples, 7, n_features), requires_grad=True, dtype=torch.float64)
+X = torch.rand((n_samples, 7, 10, n_features), requires_grad=True, dtype=torch.float64)
 X_keys = torch.randint(4, (n_samples, 3), dtype=torch.int32)
+X, X_keys = X.cuda(), X_keys.cuda()
 
 print("implementation | forward pass | backward pass")
 
+forward, backward = bench(reduce_python.reduce2, X, X_keys, 0)
+print(f"python reduce2 = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+
+# traced = torch.jit.trace(reduce_python.reduce, (X, X_keys, torch.tensor(0)))
+# forward, backward = bench(traced, X, X_keys, torch.tensor(0))
+# print(f"python traced = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+
 forward, backward = bench(reduce_python.reduce, X, X_keys, 0)
 print(f"python function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
 
 forward, backward = bench(reduce_python.reduce_custom_autograd, X, X_keys, 0)
 print(f"python autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
 
-forward, backward = bench(torch.ops.reduce_cpp.reduce, X, X_keys, 0)
-print(f"C++ function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+# forward, backward = bench(torch.ops.reduce_cpp.reduce, X, X_keys, 0)
+# print(f"C++ function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
 
-forward, backward = bench(torch.ops.reduce_cpp.reduce_custom_autograd, X, X_keys, 0)
-print(f"C++ autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+# forward, backward = bench(torch.ops.reduce_cpp.reduce_custom_autograd, X, X_keys, 0)
+# print(f"C++ autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+
+# forward, backward = bench(reduce_python.reduce, X.cuda(), X_keys.cuda(), 0)
+# print(f"python cuda = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
33 changes: 33 additions & 0 deletions profile.py
@@ -0,0 +1,33 @@
+import time
+
+import torch
+import torch.utils.cpp_extension
+
+from torch.profiler import profile, record_function, ProfilerActivity
+import torch.utils.benchmark as benchmark
+
+import reduce_python
+
+torch.utils.cpp_extension.load(
+    name="reduce_cpp",
+    sources=["reduce.cpp", "bindings.cpp"],
+    extra_cflags=["-O3"],
+    is_python_module=False,
+)
+
+
+if __name__ == "__main__":
+    n_samples = 10000
+    n_features = 1000
+
+    torch.manual_seed(0xDEADBEEF)
+    X = torch.rand((n_samples, 5, 6, n_features), requires_grad=True, dtype=torch.float32, device='cuda')
+    X_keys = torch.randint(13, (n_samples, 3), dtype=torch.int32, device='cuda')
+    print(X.shape)
+
+    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof:
+        with record_function("reduce_python"):
+            reduce_python.reduce(X, X_keys, 0).mean().backward()
+
+    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=100))
+    prof.export_chrome_trace("reduce_python.trace.json")
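The exported reduce_python.trace.json can be opened in chrome://tracing or Perfetto to inspect the kernel timeline. Note that torch.utils.benchmark is imported above but not used yet; if the intent is to cross-check the profiler numbers, a sketch along these lines would fit, assuming the X, X_keys, and reduce_python defined in the script:

timer = benchmark.Timer(
    stmt="reduce_python.reduce(X, X_keys, 0)",
    globals={"reduce_python": reduce_python, "X": X, "X_keys": X_keys},
)
print(timer.blocked_autorange())  # Timer handles CUDA synchronization itself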
17 changes: 14 additions & 3 deletions reduce_python.py
@@ -6,11 +6,10 @@ def reduce(input, keys, dim):
     unique_entries = torch.unique(keys[:, dim])
 
     mapping = torch.empty(input.shape[0], dtype=torch.int32, device=input.device)
+    index = torch.arange(len(unique_entries), dtype=torch.int32, device=input.device)
     for i, unique_entry in enumerate(unique_entries):
         idx = torch.where(keys[:, dim] == unique_entry)[0]
-        mapping.index_put_(
-            (idx,), torch.tensor(i, dtype=torch.int32, device=input.device)
-        )
+        mapping.index_put_((idx,), index[i])
 
     new_shape = (len(unique_entries),) + input.shape[1:]
     reduced_input = torch.zeros(new_shape, dtype=input.dtype, device=input.device)
@@ -19,6 +18,18 @@ def reduce(input, keys, dim):
     return reduced_input
 
 
+def reduce2(input, keys, dim):
+    assert keys.dim() == 2, "keys should have only two dimensions"
+    unique_entries = torch.unique(keys[:, dim])
+    shape = input.shape
+
+    idx = torch.zeros(unique_entries.amax()+1, len(input), dtype=input.dtype, device=input.device)
+    idx[keys[:, dim].long(), torch.arange(len(input))] = 1
+    reduced_input = (idx @ input.view(len(input), -1)).view(len(unique_entries), *shape[1:])
+
+    return reduced_input
+
+
 class ReduceAutograd(torch.autograd.Function):
     @staticmethod
     def forward(ctx, input, keys, dim):
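reduce2 turns the per-key sum into a single matrix product: it builds a one-hot selection matrix with one row per key value, so idx @ input sums all rows of input that share a key in one GEMM instead of a Python loop. A standalone sketch of the same trick (segment_sum is an illustrative name; note that the arange index is placed on the input's device here, whereas the PR version leaves it on the CPU):

import torch

def segment_sum(input, keys):
    # One row per key; entry (k, i) is 1 iff sample i carries key k.
    n_keys = int(keys.max()) + 1
    onehot = torch.zeros(n_keys, len(input), dtype=input.dtype, device=input.device)
    onehot[keys.long(), torch.arange(len(input), device=input.device)] = 1
    # A single GEMM then performs all per-key sums at once.
    return (onehot @ input.view(len(input), -1)).view(n_keys, *input.shape[1:])

The same reduction can also be written with index_add_, which avoids materializing the dense one-hot matrix:

def segment_sum_index_add(input, keys):
    n_keys = int(keys.max()) + 1
    out = torch.zeros((n_keys,) + input.shape[1:], dtype=input.dtype, device=input.device)
    return out.index_add_(0, keys.long(), input)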
14 changes: 14 additions & 0 deletions tests.py
@@ -57,6 +57,13 @@ def test_same_result(
     (X, X_keys, dim),
     verbose=True,
 )
+test_same_result(
+    "python / py reduce2",
+    reduce_python.reduce,
+    reduce_python.reduce2,
+    (X, X_keys, dim),
+    verbose=True,
+)
 test_same_result(
     "python / py autograd",
     reduce_python.reduce,
@@ -84,6 +91,13 @@ def test_same_result(
     (X, X_keys, dim),
     verbose=False,
 )
+test_same_result(
+    "python / py reduce2",
+    reduce_python.reduce,
+    reduce_python.reduce2,
+    (X, X_keys, dim),
+    verbose=False,
+)
 test_same_result(
     "python / py autograd",
     reduce_python.reduce,
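The body of test_same_result is not shown in this diff; judging from the call sites, a minimal version would compare the two implementations' outputs on the same inputs, along these lines (illustrative only, not the repo's actual helper):

import torch

def test_same_result(name, reference, candidate, args, verbose=False):
    expected = reference(*args)
    actual = candidate(*args)
    assert torch.allclose(expected, actual), f"{name}: outputs differ"
    if verbose:
        print(f"{name}: OK")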