
Commit 320e5a8

malfet authored and pytorchmergebot committed
Revert D34808051: [tensorexpr] Enabled aten::stack in the fuser pass with static shapes

Test Plan: revert-hammer

Differential Revision: D34808051

Original commit changeset: 213e2ffdf87f
Original Phabricator Diff: D34808051

fbshipit-source-id: b618daeb346f784e8ab9525040edcb4a30a39613
(cherry picked from commit e47b973)
1 parent ec6f767 commit 320e5a8

4 files changed: +6 −69 lines

test/cpp/tensorexpr/test_te_fuser_pass.cpp (−19 lines)
@@ -317,25 +317,6 @@ TEST(TEFuserPass, FuserPass_IgnoreUnknownShapeAtStart) {
   testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g);
 }
 
-TEST(TEFuserPass, FuserPass_Stack) {
-  WithCPUFuser cf;
-  const auto graph_string =
-      R"IR(graph(%y.1 : Float(5, 3, 3, 6, strides=[54, 18, 6, 1], requires_grad=0, device=cpu),
-      %x.1 : Float(5, 3, 3, 6, strides=[54, 18, 6, 1], requires_grad=0, device=cpu)):
-  %1 : int = prim::Constant[value=2]()
-  %9 : Float(5, 3, 3, 6, strides=[54, 18, 6, 1], requires_grad=0, device=cpu) = aten::tanh(%x.1)
-  %7 : Float(5, 3, 3, 6, strides=[54, 18, 6, 1], requires_grad=0, device=cpu) = aten::tanh(%y.1)
-  %5 : Tensor[] = prim::ListConstruct(%9, %7)
-  %z.2 : Float(5, 3, 2, 3, 6, strides=[108, 36, 18, 6, 1], requires_grad=0, device=cpu) = aten::stack(%5, %1)
-  return (%z.2)
-)IR";
-  auto g = std::make_shared<Graph>();
-  torch::jit::parseIR(graph_string, g.get());
-  g->lint();
-  FuseTensorExprs(g, /* min_group_size= */ 2);
-  testing::FileCheck().check("prim::TensorExprGroup")->run(*g);
-}
-
 TEST(TEFuserPass, FuserPass_Where) {
   WithCPUFuser cf;
   const auto graph_string = R"IR(
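
For reference, the patch being reverted handled aten::stack in the same branches as aten::cat throughout the pass (see tensorexpr_fuser.cpp below), relying on the fact that torch.stack is expressible as unsqueeze followed by cat. A minimal Python sketch of that equivalence, using the same shapes as the deleted test (variable names are illustrative):

import torch

# Two (5, 3, 3, 6) inputs stacked along dim=2 give a (5, 3, 2, 3, 6) result,
# matching the output type in the IR above.
x = torch.randn(5, 3, 3, 6)
y = torch.randn(5, 3, 3, 6)

stacked = torch.stack((torch.tanh(x), torch.tanh(y)), dim=2)

# aten::stack(tensors, d) is equivalent to unsqueezing each tensor at d
# and concatenating along d.
via_cat = torch.cat((torch.tanh(x).unsqueeze(2), torch.tanh(y).unsqueeze(2)), dim=2)

assert stacked.shape == (5, 3, 2, 3, 6)
assert torch.equal(stacked, via_cat)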

test/test_jit_fuser_te.py (−17 lines)
@@ -739,22 +739,6 @@ def foo(hx, cx):
         # XXX: TE fuser can handle concats in a fusion group.
         # FileCheck().check("FusedConcat").check_next("return").run(str(graph))
 
-    def test_stack(self):
-        # "aten::stack fusion is not enabled yet with dynamic shapes"
-        if self.dynamic_shapes:
-            return True
-        with set_fusion_group_inlining(True):
-            for device in self.devices:
-                hx = torch.randn(3, 20, dtype=torch.float, device=device)
-                cx = torch.randn(3, 20, dtype=torch.float, device=device)
-
-                def foo(hx, cx):
-                    return torch.stack((hx + cx, hx - cx))
-
-                ge = self.checkTrace(foo, (hx, cx))
-                graph = ge.graph_for(hx, cx)
-                self.assertAllFused(graph)
-
     def test_remove_output_used_only_in_size(self):
         for device in self.devices:
             def test_fuse(a, b):
@@ -1797,7 +1781,6 @@ def apply(fn):
         devices = self.devices
         list_ops = [
            torch.cat,
-           torch.stack
        ]
        for dtype, op, device in product(self.dtypes, list_ops, devices):
            if dtype in [torch.float16, torch.bfloat16] and device == "cpu":
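
After the revert, a function like the deleted test_stack no longer has aten::stack in the fuser's supported set, so the stack itself should stay outside any fusion group. A rough sketch of checking this by hand, assuming the TE-fuser toggles used elsewhere in this test file (torch._C._jit_set_texpr_fuser_enabled, torch._C._jit_override_can_fuse_on_cpu) and a few warm-up runs for the profiling executor:

import torch

torch._C._jit_set_texpr_fuser_enabled(True)
torch._C._jit_override_can_fuse_on_cpu(True)

@torch.jit.script
def foo(hx, cx):
    return torch.stack((hx + cx, hx - cx))

hx = torch.randn(3, 20)
cx = torch.randn(3, 20)
for _ in range(3):  # warm up so the fuser sees profiled shapes
    foo(hx, cx)

graph = foo.graph_for(hx, cx)
# The elementwise add/sub may still fuse, but aten::stack should now appear
# outside any prim::TensorExprGroup in the optimized graph.
print("aten::stack" in str(graph), "prim::TensorExprGroup" in str(graph))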

test/test_tensorexpr_pybind.py (−25 lines)
@@ -390,31 +390,6 @@ def f(a):
         np.testing.assert_allclose(res1.numpy(), correct.numpy(), atol=2e-3)
         np.testing.assert_allclose(res2.numpy(), correct.numpy(), atol=2e-3)
 
-    @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled")
-    def test_kernel_with_stack(self):
-        def f(a, b):
-            return torch.stack((a, b), dim=1)
-
-        device = "cpu"
-        x = torch.rand((3, 5), device=device)
-        y = torch.rand((3, 5), device=device)
-        graph_str = """
-graph(%x.1 : Float(3, 5, strides=[5, 1], requires_grad=0, device=cpu),
-      %y.1 : Float(3, 5, strides=[5, 1], requires_grad=0, device=cpu)):
-  %1 : int = prim::Constant[value=1]()
-  %5 : Tensor[] = prim::ListConstruct(%x.1, %y.1)
-  %z.2 : Float(3, 2, 5, strides=[10, 5, 1], requires_grad=0, device=cpu) = aten::stack(%5, %1) # local/stack.py:39:12
-  return (%z.2)
-"""
-        graph = torch._C.parse_ir(graph_str)
-
-        kernel = te.TensorExprKernel(graph)
-        res1 = kernel.run((x, y))
-        res2 = kernel.fallback((x, y))
-        correct = f(x, y)
-        np.testing.assert_allclose(res1.numpy(), correct.numpy(), atol=2e-3)
-        np.testing.assert_allclose(res2.numpy(), correct.numpy(), atol=2e-3)
-
     @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled")
     def test_alloc_in_loop(self):
         a, tmp, b = [
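
Since aten::cat stays in the supported set, the same pybind round trip still works if the deleted test's IR uses cat instead of stack; a minimal sketch under that assumption (LLVM backend required, as in the surrounding tests; the output type becomes Float(3, 10, ...) since cat along dim=1 widens rather than adds a dimension):

import numpy as np
import torch

graph_str = """
graph(%x.1 : Float(3, 5, strides=[5, 1], requires_grad=0, device=cpu),
      %y.1 : Float(3, 5, strides=[5, 1], requires_grad=0, device=cpu)):
  %1 : int = prim::Constant[value=1]()
  %5 : Tensor[] = prim::ListConstruct(%x.1, %y.1)
  %z.2 : Float(3, 10, strides=[10, 1], requires_grad=0, device=cpu) = aten::cat(%5, %1)
  return (%z.2)
"""
graph = torch._C.parse_ir(graph_str)

kernel = torch._C._te.TensorExprKernel(graph)
x = torch.rand(3, 5)
y = torch.rand(3, 5)
res = kernel.run((x, y))
# Compare against the eager-mode result, as the deleted test did.
np.testing.assert_allclose(res.numpy(), torch.cat((x, y), dim=1).numpy(), atol=2e-3)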

torch/csrc/jit/passes/tensorexpr_fuser.cpp (+6 −8 lines)
@@ -94,7 +94,6 @@ bool isSupported(Node* node) {
   };
   static const OperatorSet supported_misc_set{
       "aten::cat(Tensor[] tensors, int dim=0) -> Tensor",
-      "aten::stack(Tensor[] tensors, int dim=0) -> Tensor",
       "aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)",
   };
   // clang-format on
@@ -772,7 +771,7 @@ class TensorExprFuser {
 
     std::vector<Node*> nodes_to_merge = {to_merge};
 
-    if (to_merge->kind() == aten::cat || to_merge->kind() == aten::stack) {
+    if (to_merge->kind() == aten::cat) {
       Node* listconstruct = to_merge->input(0)->node();
       nodes_to_merge.push_back(listconstruct);
     }
@@ -1054,6 +1053,7 @@ class TensorExprFuser {
     REQ(isFusableOnDevice(node));
     REQ(operators_not_to_fuse.find(node->kind()) ==
         operators_not_to_fuse.end());
+
     for (Value* input : node->inputs()) {
       if (auto const& tt = input->type()->cast<TensorType>()) {
         auto st = tt->scalarType();
@@ -1066,7 +1066,7 @@ class TensorExprFuser {
         }
       }
     }
-    if (node->kind() == aten::cat || node->kind() == aten::stack) {
+    if (node->kind() == aten::cat) {
       REQ(node->input(0)->node()->kind() == prim::ListConstruct);
       REQ(node->input(0)->uses().size() == 1);
       REQ(node->input(1)->node()->kind() == prim::Constant);
@@ -1120,8 +1120,7 @@ class TensorExprFuser {
     REQ(nInputs <= subgraphArgLimit);
 
     // Device checks
-    if (consumer->kind() != aten::cat && producer->kind() != aten::cat &&
-        consumer->kind() != aten::stack && producer->kind() != aten::stack) {
+    if (consumer->kind() != aten::cat && producer->kind() != aten::cat) {
       // aten::cat needs a special handling because it takes a Tensor[] as its
       // input We deal with that in the code below.
       auto consumer_device = tensorexpr::pickDeviceType(consumer->inputs());
@@ -1155,7 +1154,7 @@ class TensorExprFuser {
       REQ(producer->kind() != prim::Constant);
     }
 
-    if (producer->kind() == aten::cat || producer->kind() == aten::stack) {
+    if (producer->kind() == aten::cat) {
       REQ(producer->input(0)->node()->kind() == prim::ListConstruct);
       REQ(producer->input(0)->uses().size() == 1);
       REQ(producer->input(1)->node()->kind() == prim::Constant);
@@ -1173,8 +1172,7 @@ class TensorExprFuser {
         REQ(isFusableOnDevice(input->node()));
       }
       REQ((nInputs + listConstruct->inputs().size()) <= subgraphArgLimit);
-    } else if (
-        consumer->kind() == aten::cat || consumer->kind() == aten::stack) {
+    } else if (consumer->kind() == aten::cat) {
       REQ(consumer->input(0)->node()->kind() == prim::ListConstruct);
       REQ(consumer->input(0)->uses().size() == 1);
       REQ(consumer->input(1)->node()->kind() == prim::Constant);
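
The REQ guards above only admit an aten::cat whose Tensor[] input comes directly from a prim::ListConstruct with a single use, and whose dim argument is a prim::Constant. A small illustrative sketch of checking those same structural properties from Python on a scripted graph:

import torch

@torch.jit.script
def f(a, b):
    return torch.cat([a, b], dim=0)

# Find the aten::cat node and unpack its two inputs: the Tensor[] and the dim.
cat = next(n for n in f.graph.nodes() if n.kind() == "aten::cat")
tensor_list, dim = list(cat.inputs())

# Mirrors the fuser's checks: Tensor[] produced by prim::ListConstruct with a
# single use, and dim produced by prim::Constant.
assert tensor_list.node().kind() == "prim::ListConstruct"
assert len(tensor_list.uses()) == 1
assert dim.node().kind() == "prim::Constant"
print(f.graph)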
