From 2056542a0f49c95fc8fb1316972b0ec60503a5b0 Mon Sep 17 00:00:00 2001
From: Henrique Mendonca
Date: Mon, 19 Sep 2022 16:02:30 +0200
Subject: [PATCH 1/2] simple pt profiler

---
 profile.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 profile.py

diff --git a/profile.py b/profile.py
new file mode 100644
index 0000000..87ab761
--- /dev/null
+++ b/profile.py
@@ -0,0 +1,33 @@
+import time
+
+import torch
+import torch.utils.cpp_extension
+
+from torch.profiler import profile, record_function, ProfilerActivity
+import torch.utils.benchmark as benchmark
+
+import reduce_python
+
+torch.utils.cpp_extension.load(
+    name="reduce_cpp",
+    sources=["reduce.cpp", "bindings.cpp"],
+    extra_cflags=["-O3"],
+    is_python_module=False,
+)
+
+
+if __name__ == "__main__":
+    n_samples = 10000
+    n_features = 1000
+
+    torch.manual_seed(0xDEADBEEF)
+    X = torch.rand((n_samples, 5, 6, n_features), requires_grad=True, dtype=torch.float32, device='cuda')
+    X_keys = torch.randint(13, (n_samples, 3), dtype=torch.int32, device='cuda')
+    print(X.shape)
+
+    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof:
+        with record_function("reduce_python"):
+            reduce_python.reduce(X, X_keys, 0).mean().backward()
+
+    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=100))
+    prof.export_chrome_trace("reduce_python.trace.json")

From 4a3e280a9001eea16e5401359f5e2604ce7959f0 Mon Sep 17 00:00:00 2001
From: Henrique Mendonca
Date: Mon, 19 Sep 2022 16:05:01 +0200
Subject: [PATCH 2/2] reduce2

---
 bench.py         | 25 +++++++++++++++++++------
 reduce_python.py | 17 ++++++++++++++---
 tests.py         | 14 ++++++++++++++
 3 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/bench.py b/bench.py
index 330b607..82c2bbe 100644
--- a/bench.py
+++ b/bench.py
@@ -13,10 +13,11 @@
 )
 
 
-def bench(function, input, input_keys, dim, n_iters=10):
+def bench(function, input, input_keys, dim, n_iters=2):
     start = time.time()
     for _ in range(n_iters):
         function(input, input_keys, dim)
+    torch.cuda.synchronize()
     elapsed = time.time() - start
 
     direct = elapsed / n_iters
@@ -28,6 +29,7 @@
 
     start = time.time()
     summed.backward()
+    torch.cuda.synchronize()
     elapsed += time.time() - start
 
     backward = elapsed / n_iters
@@ -40,19 +42,30 @@
     n_features = 1000
 
     torch.manual_seed(0xDEADBEEF)
-    X = torch.rand((n_samples, 7, n_features), requires_grad=True, dtype=torch.float64)
+    X = torch.rand((n_samples, 7, 10, n_features), requires_grad=True, dtype=torch.float64)
     X_keys = torch.randint(4, (n_samples, 3), dtype=torch.int32)
+    X, X_keys = X.cuda(), X_keys.cuda()
 
     print("implementation | forward pass | backward pass")
 
+    forward, backward = bench(reduce_python.reduce2, X, X_keys, 0)
+    print(f"python reduce2 = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+
+    # traced = torch.jit.trace(reduce_python.reduce, (X, X_keys, torch.tensor(0)))
+    # forward, backward = bench(traced, X, X_keys, torch.tensor(0))
+    # print(f"python traced = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
+
     forward, backward = bench(reduce_python.reduce, X, X_keys, 0)
     print(f"python function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
 
     forward, backward = bench(reduce_python.reduce_custom_autograd, X, X_keys, 0)
     print(f"python autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
 
-    forward, backward = bench(torch.ops.reduce_cpp.reduce, X, X_keys, 0)
-    print(f"C++ function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms")
ms - {1e3 * backward:.5} ms") + # forward, backward = bench(torch.ops.reduce_cpp.reduce, X, X_keys, 0) + # print(f"C++ function = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms") + + # forward, backward = bench(torch.ops.reduce_cpp.reduce_custom_autograd, X, X_keys, 0) + # print(f"C++ autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms") - forward, backward = bench(torch.ops.reduce_cpp.reduce_custom_autograd, X, X_keys, 0) - print(f"C++ autograd = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms") + # forward, backward = bench(reduce_python.reduce, X.cuda(), X_keys.cuda(), 0) + # print(f"python cuda = {1e3 * forward:.3} ms - {1e3 * backward:.5} ms") diff --git a/reduce_python.py b/reduce_python.py index d555dda..b1d6561 100644 --- a/reduce_python.py +++ b/reduce_python.py @@ -6,11 +6,10 @@ def reduce(input, keys, dim): unique_entries = torch.unique(keys[:, dim]) mapping = torch.empty(input.shape[0], dtype=torch.int32, device=input.device) + index = torch.arange(len(unique_entries), dtype=torch.int32, device=input.device) for i, unique_entry in enumerate(unique_entries): idx = torch.where(keys[:, dim] == unique_entry)[0] - mapping.index_put_( - (idx,), torch.tensor(i, dtype=torch.int32, device=input.device) - ) + mapping.index_put_((idx,), index[i]) new_shape = (len(unique_entries),) + input.shape[1:] reduced_input = torch.zeros(new_shape, dtype=input.dtype, device=input.device) @@ -19,6 +18,18 @@ def reduce(input, keys, dim): return reduced_input +def reduce2(input, keys, dim): + assert keys.dim() == 2, "keys should have only two dimensions" + unique_entries = torch.unique(keys[:, dim]) + shape = input.shape + + idx = torch.zeros(unique_entries.amax()+1, len(input), dtype=input.dtype, device=input.device) + idx[keys[:, dim].long(), torch.arange(len(input))] = 1 + reduced_input = (idx @ input.view(len(input), -1)).view(len(unique_entries), *shape[1:]) + + return reduced_input + + class ReduceAutograd(torch.autograd.Function): @staticmethod def forward(ctx, input, keys, dim): diff --git a/tests.py b/tests.py index 8601609..a765dad 100644 --- a/tests.py +++ b/tests.py @@ -57,6 +57,13 @@ def test_same_result( (X, X_keys, dim), verbose=True, ) + test_same_result( + "python / C++", + reduce_python.reduce, + reduce_python.reduce2, + (X, X_keys, dim), + verbose=True, + ) test_same_result( "python / py autograd", reduce_python.reduce, @@ -84,6 +91,13 @@ def test_same_result( (X, X_keys, dim), verbose=False, ) + test_same_result( + "python / C++", + reduce_python.reduce, + reduce_python.reduce2, + (X, X_keys, dim), + verbose=False, + ) test_same_result( "python / py autograd", reduce_python.reduce,