From 2056542a0f49c95fc8fb1316972b0ec60503a5b0 Mon Sep 17 00:00:00 2001
From: Henrique Mendonca
Date: Mon, 19 Sep 2022 16:02:30 +0200
Subject: [PATCH 1/2] simple pt profiler

---
 profile.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 profile.py

diff --git a/profile.py b/profile.py
new file mode 100644
index 0000000..87ab761
--- /dev/null
+++ b/profile.py
@@ -0,0 +1,33 @@
+import time
+
+import torch
+import torch.utils.cpp_extension
+
+from torch.profiler import profile, record_function, ProfilerActivity
+import torch.utils.benchmark as benchmark
+
+import reduce_python
+
+torch.utils.cpp_extension.load(
+    name="reduce_cpp",
+    sources=["reduce.cpp", "bindings.cpp"],
+    extra_cflags=["-O3"],
+    is_python_module=False,
+)
+
+
+if __name__ == "__main__":
+    n_samples = 10000
+    n_features = 1000
+
+    torch.manual_seed(0xDEADBEEF)
+    X = torch.rand((n_samples, 5, 6, n_features), requires_grad=True, dtype=torch.float32, device='cuda')
+    X_keys = torch.randint(13, (n_samples, 3), dtype=torch.int32, device='cuda')
+    print(X.shape)
+
+    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof:
+        with record_function("reduce_python"):
+            reduce_python.reduce(X, X_keys, 0).mean().backward()
+
+    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=100))
+    prof.export_chrome_trace("reduce_python.trace.json")

From 4a3e280a9001eea16e5401359f5e2604ce7959f0 Mon Sep 17 00:00:00 2001
From: Henrique Mendonca
Date: Mon, 19 Sep 2022 16:05:01 +0200
Subject: [PATCH 2/2] reduce2

---
 bench.py         | 25 +++++++++++++++++++------
 reduce_python.py | 17 ++++++++++++++---
 tests.py         | 14 ++++++++++++++
 3 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/bench.py b/bench.py
index 330b607..82c2bbe 100644
--- a/bench.py
+++ b/bench.py
@@ -13,10 +13,11 @@
 )
 
 
-def bench(function, input, input_keys, dim, n_iters=10):
+def bench(function, input, input_keys, dim, n_iters=2):
     start = time.time()
     for _ in range(n_iters):
         function(input, input_keys, dim)
+    torch.cuda.synchronize()
     elapsed = time.time() - start
 
     direct = elapsed / n_iters
@@ -28,6 +29,7 @@
 
     start = time.time()
     summed.backward()
+    torch.cuda.synchronize()
     elapsed += time.time() - start
 
     backward = elapsed / n_iters
@@ -40,19 +42,30 @@
     n_features = 1000
 
     torch.manual_seed(0xDEADBEEF)
-    X = torch.rand((n_samples, 7, n_features), requires_grad=True, dtype=torch.float64)
+    X = torch.rand((n_samples, 7, 10, n_features), requires_grad=True, dtype=torch.float64)
     X_keys = torch.randint(4, (n_samples, 3), dtype=torch.int32)
+    X, X_keys = X.cuda(), X_keys.cuda()
 
     print("implementation | forward pass | backward pass")
 
+    forward, backward = bench(reduce_python.reduce2, X, X_keys, 0)
+    print(f"python reduce2 = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+
+    # traced = torch.jit.trace(reduce_python.reduce, (X, X_keys, torch.tensor(0)))
+    # forward, backward = bench(traced, X, X_keys, torch.tensor(0))
+    # print(f"python traced = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+
     forward, backward = bench(reduce_python.reduce, X, X_keys, 0)
     print(f"python function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
 
     forward, backward = bench(reduce_python.reduce_custom_autograd, X, X_keys, 0)
     print(f"python autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
 
-    forward, backward = bench(torch.ops.reduce_cpp.reduce, X, X_keys, 0)
-    print(f"C++ function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
ms - {1e3 * backward:.5} ms") + # forward, backward = bench(torch.ops.reduce_cpp.reduce, X, X_keys, 0) + # print(f"C++ function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms") + + # forward, backward = bench(torch.ops.reduce_cpp.reduce_custom_autograd, X, X_keys, 0) + # print(f"C++ autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms") - forward, backward = bench(torch.ops.reduce_cpp.reduce_custom_autograd, X, X_keys, 0) - print(f"C++ autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms") + # forward, backward = bench(reduce_python.reduce, X.cuda(), X_keys.cuda(), 0) + # print(f"python cuda = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms") diff --git a/reduce_python.py b/reduce_python.py index d555dda..b1d6561 100644 --- a/reduce_python.py +++ b/reduce_python.py @@ -6,11 +6,10 @@ def reduce(input, keys, dim): unique_entries = torch.unique(keys[:, dim]) mapping = torch.empty(input.shape[0], dtype=torch.int32, device=input.device) + index = torch.arange(len(unique_entries), dtype=torch.int32, device=input.device) for i, unique_entry in enumerate(unique_entries): idx = torch.where(keys[:, dim] == unique_entry)[0] - mapping.index_put_( - (idx,), torch.tensor(i, dtype=torch.int32, device=input.device) - ) + mapping.index_put_((idx,), index[i]) new_shape = (len(unique_entries),) + input.shape[1:] reduced_input = torch.zeros(new_shape, dtype=input.dtype, device=input.device) @@ -19,6 +18,18 @@ def reduce(input, keys, dim): return reduced_input +def reduce2(input, keys, dim): + assert keys.dim() == 2, "keys should have only two dimensions" + unique_entries = torch.unique(keys[:, dim]) + shape = input.shape + + idx = torch.zeros(unique_entries.amax()+1, len(input), dtype=input.dtype, device=input.device) + idx[keys[:, dim].long(), torch.arange(len(input))] = 1 + reduced_input = (idx @ input.view(len(input), -1)).view(len(unique_entries), *shape[1:]) + + return reduced_input + + class ReduceAutograd(torch.autograd.Function): @staticmethod def forward(ctx, input, keys, dim): diff --git a/tests.py b/tests.py index 8601609..a765dad 100644 --- a/tests.py +++ b/tests.py @@ -57,6 +57,13 @@ def test_same_result( (X, X_keys, dim), verbose=True, ) + test_same_result( + "python / C++", + reduce_python.reduce, + reduce_python.reduce2, + (X, X_keys, dim), + verbose=True, + ) test_same_result( "python / py autograd", reduce_python.reduce, @@ -84,6 +91,13 @@ def test_same_result( (X, X_keys, dim), verbose=False, ) + test_same_result( + "python / C++", + reduce_python.reduce, + reduce_python.reduce2, + (X, X_keys, dim), + verbose=False, + ) test_same_result( "python / py autograd", reduce_python.reduce,