Skip to content

Commit f23f21d

Browse files
Mikhail Zolotukhin and facebook-github-bot
Mikhail Zolotukhin
authored and committed
[TensorExpr] Remove 'Placeholder' class. (pytorch#64887)
Summary: Pull Request resolved: pytorch#64887 BufHandle has exactly the same functionality and should be used instead. Differential Revision: D30889483 Test Plan: Imported from OSS Reviewed By: navahgar Pulled By: ZolotukhinM fbshipit-source-id: 365fe8e396731b88920535a3de96bd3301aaa3f3
1 parent 199031c commit f23f21d

35 files changed

+640
-807
lines changed

benchmarks/cpp/tensorexpr/bench_approx.cpp

+8-8
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ void optimizePointwise(tensorexpr::LoopNest* ln, tensorexpr::Tensor target) {
3030

3131
static void relu_nnc(benchmark::State& state) {
3232
auto N = VarHandle("N", kInt);
33-
Placeholder A("A", kFloat, {N});
33+
BufHandle A("A", {N}, kFloat);
3434
auto clamp = 0;
3535
torch::jit::tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i){
3636
auto A_elem = [&]() {
@@ -64,7 +64,7 @@ static void relu_nnc(benchmark::State& state) {
6464

6565
static void log_nnc_sleef(benchmark::State& state) {
6666
auto N = VarHandle("N", kInt);
67-
Placeholder A("A", kFloat, {N});
67+
BufHandle A("A", {N}, kFloat);
6868
torch::jit::tensorexpr::Tensor B =
6969
Compute("B", {N}, [&](const VarHandle& i) {
7070
return log(A.load(i));
@@ -93,7 +93,7 @@ static void log_nnc_sleef(benchmark::State& state) {
9393

9494
static void log_nnc_fast(benchmark::State& state) {
9595
auto N = VarHandle("N", kInt);
96-
Placeholder A("A", kFloat, {N});
96+
BufHandle A("A", {N}, kFloat);
9797
torch::jit::tensorexpr::Tensor B =
9898
Compute("B", {N}, [&](const VarHandle& i) {
9999
return fast_log(A.load(i));
@@ -122,7 +122,7 @@ static void log_nnc_fast(benchmark::State& state) {
122122

123123
static void log_nnc_vml(benchmark::State& state) {
124124
auto N = VarHandle("N", kInt);
125-
Placeholder A("A", kFloat, {N});
125+
BufHandle A("A", {N}, kFloat);
126126
torch::jit::tensorexpr::Tensor B =
127127
Compute("B", {N}, [&](const VarHandle& i) {
128128
return log_vml(A.load(i));
@@ -161,7 +161,7 @@ static void log_aten(benchmark::State& state) {
161161

162162
static void logit_nnc_sleef(benchmark::State& state) {
163163
auto N = VarHandle("N", kInt);
164-
Placeholder A("A", kFloat, {N});
164+
BufHandle A("A", {N}, kFloat);
165165
auto clamp = 1e-6f;
166166
tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
167167
auto A_elem = [&]() {
@@ -197,7 +197,7 @@ static void logit_nnc_sleef(benchmark::State& state) {
197197

198198
static void logit_nnc_fast(benchmark::State& state) {
199199
auto N = VarHandle("N", kInt);
200-
Placeholder A("A", kFloat, {N});
200+
BufHandle A("A", {N}, kFloat);
201201
auto clamp = 1e-6f;
202202
tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
203203
auto A_elem = [&]() {
@@ -233,7 +233,7 @@ static void logit_nnc_fast(benchmark::State& state) {
233233

234234
static void logit_nnc_vml(benchmark::State& state) {
235235
auto N = VarHandle("N", kInt);
236-
Placeholder A("A", kFloat, {N});
236+
BufHandle A("A", {N}, kFloat);
237237
auto clamp = 1e-6f;
238238
tensorexpr::Tensor B = Compute("B", {N}, [&](const VarHandle& i) {
239239
auto A_elem = [&]() {
@@ -310,7 +310,7 @@ static void logit_caffe2(benchmark::State& state) {
310310

311311
static void tanh_nnc_fast(benchmark::State& state) {
312312
auto N = VarHandle("N", kInt);
313-
Placeholder A("A", kFloat, {N});
313+
BufHandle A("A", {N}, kFloat);
314314
torch::jit::tensorexpr::Tensor B =
315315
Compute("B", {N}, [&](const VarHandle& i) {
316316
return fast_tanh(A.load(i));

benchmarks/cpp/tensorexpr/bench_batchnorm.cpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,11 @@ BENCHMARK_DEFINE_F(BatchNorm, ATen)(benchmark::State& state) {
7575

7676
BENCHMARK_DEFINE_F(BatchNorm, NNC)(benchmark::State& state) {
7777

78-
Placeholder input("input", kFloat, {N_, C_, H_, W_});
79-
Placeholder weight("weight", kFloat, {C_});
80-
Placeholder bias("bias", kFloat, {C_});
81-
Placeholder mean("mean", kFloat, {C_});
82-
Placeholder var("var", kFloat, {C_});
78+
BufHandle input("input", {N_, C_, H_, W_}, kFloat);
79+
BufHandle weight("weight", {C_}, kFloat);
80+
BufHandle bias("bias", {C_}, kFloat);
81+
BufHandle mean("mean", {C_}, kFloat);
82+
BufHandle var("var", {C_}, kFloat);
8383
VarHandle eps("eps", kFloat);
8484

8585
using axis = const VarHandle&;
@@ -137,11 +137,11 @@ BENCHMARK_DEFINE_F(BatchNorm, ATenRelu)(benchmark::State& state) {
137137

138138
BENCHMARK_DEFINE_F(BatchNorm, NNCRelu)(benchmark::State& state) {
139139

140-
Placeholder input("input", kFloat, {N_, C_, H_, W_});
141-
Placeholder weight("weight", kFloat, {C_});
142-
Placeholder bias("bias", kFloat, {C_});
143-
Placeholder mean("mean", kFloat, {C_});
144-
Placeholder var("var", kFloat, {C_});
140+
BufHandle input("input", {N_, C_, H_, W_}, kFloat);
141+
BufHandle weight("weight", {C_}, kFloat);
142+
BufHandle bias("bias", {C_}, kFloat);
143+
BufHandle mean("mean", {C_}, kFloat);
144+
BufHandle var("var", {C_}, kFloat);
145145
VarHandle eps("eps", kFloat);
146146

147147
using axis = const VarHandle&;

benchmarks/cpp/tensorexpr/bench_compile.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ static void BM_CompileSwish(benchmark::State& state) {
1111
for (auto _ : state) {
1212
constexpr int N = 512;
1313
te::VarHandle n("n", te::kInt);
14-
te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
14+
te::BufHandle A("A", {N}, te::kFloat);
1515
te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
1616
return te::Max::make(A.load(i), 0.f, false);
1717
});
@@ -40,7 +40,7 @@ static void BM_CompileSwish(benchmark::State& state) {
4040
static void BM_CompileSwishLLVMOnly(benchmark::State& state) {
4141
constexpr int N = 512;
4242
te::VarHandle n("n", te::kInt);
43-
te::Placeholder A(te::BufHandle("A", {N}, te::kFloat));
43+
te::BufHandle A("A", {N}, te::kFloat);
4444
te::Tensor relu = te::Compute("relu", {{n, "n"}}, [&](const te::VarHandle& i) {
4545
return te::Max::make(A.load(i), 0.f, false);
4646
});

benchmarks/cpp/tensorexpr/bench_concat.cpp

+9-9
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,12 @@ class ConcatBench : public benchmark::Fixture {
5151
size_t num_inputs = inputs_.size();
5252
size_t num_dims = 2;
5353

54-
std::vector<Placeholder> inputs;
54+
std::vector<BufHandle> inputs;
5555
for (size_t i = 0; i < num_inputs; ++i) {
56-
inputs.emplace_back(Placeholder(
56+
inputs.emplace_back(BufHandle(
5757
"input" + std::to_string(i),
58-
kFloat,
59-
{input_sizes_[i][0], input_sizes_[i][1]}));
58+
{input_sizes_[i][0], input_sizes_[i][1]},
59+
kFloat));
6060
}
6161

6262
Tensor output = Compute(
@@ -112,14 +112,14 @@ class ConcatBench : public benchmark::Fixture {
112112
{alloc<IntImm>(output_size_[0]), alloc<IntImm>(output_size_[1])}),
113113
kFloat);
114114

115-
std::vector<Placeholder> inputs;
115+
std::vector<BufHandle> inputs;
116116
std::vector<StmtPtr> for_stmts(num_inputs);
117117
int cumulative_input_sizes = 0;
118118
for (size_t i = 0; i < num_inputs; ++i) {
119-
inputs.emplace_back(Placeholder(
119+
inputs.emplace_back(BufHandle(
120120
"input" + std::to_string(i),
121-
kFloat,
122-
{input_sizes_[i][0], input_sizes_[i][1]}));
121+
{input_sizes_[i][0], input_sizes_[i][1]},
122+
kFloat));
123123
std::vector<VarPtr> for_vars(num_inputs);
124124
for (size_t d = 0; d < num_dims; ++d) {
125125
for_vars[d] =
@@ -131,7 +131,7 @@ class ConcatBench : public benchmark::Fixture {
131131
{for_vars[0],
132132
alloc<Add>(for_vars[1], alloc<IntImm>(cumulative_input_sizes))}),
133133
alloc<Load>(
134-
inputs[i].data(),
134+
inputs[i].node(),
135135
std::vector<ExprPtr>({for_vars[0], for_vars[1]})));
136136
auto for_st = alloc<For>(
137137
for_vars[0],

benchmarks/cpp/tensorexpr/bench_gemm.cpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ BENCHMARK_DEFINE_F(Gemm, Torch)(benchmark::State& state) {
4141

4242
BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
4343

44-
te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
45-
te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
44+
te::BufHandle AP("A", {M, K}, te::kFloat);
45+
te::BufHandle BP("B", {K, N}, te::kFloat);
4646
te::Tensor CT = te::Reduce(
4747
"gemm",
4848
{{M, "M"}, {N, "N"}},
@@ -64,8 +64,8 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprNoopt)(benchmark::State& state) {
6464

6565
BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
6666

67-
te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
68-
te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
67+
te::BufHandle AP("A", {M, K}, te::kFloat);
68+
te::BufHandle BP("B", {K, N}, te::kFloat);
6969
te::Tensor CT = te::Reduce(
7070
"gemm",
7171
{{M, "M"}, {N, "N"}},
@@ -123,8 +123,8 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile32x32)(benchmark::State& state) {
123123

124124
BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
125125

126-
te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
127-
te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
126+
te::BufHandle AP("A", {M, K}, te::kFloat);
127+
te::BufHandle BP("B", {K, N}, te::kFloat);
128128
te::Tensor CT = te::Reduce(
129129
"gemm",
130130
{{M, "M"}, {N, "N"}},
@@ -182,8 +182,8 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16)(benchmark::State& state) {
182182

183183
BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
184184

185-
te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
186-
te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
185+
te::BufHandle AP("A", {M, K}, te::kFloat);
186+
te::BufHandle BP("B", {K, N}, te::kFloat);
187187
te::Tensor CT = te::Reduce(
188188
"gemm",
189189
{{M, "M"}, {N, "N"}},
@@ -249,8 +249,8 @@ BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16VecUnroll)(benchmark::State& state) {
249249

250250
BENCHMARK_DEFINE_F(Gemm, TensorExprTile4x16Cache)(benchmark::State& state) {
251251

252-
te::Placeholder AP(te::BufHandle("A", {M, K}, te::kFloat));
253-
te::Placeholder BP(te::BufHandle("B", {K, N}, te::kFloat));
252+
te::BufHandle AP("A", {M, K}, te::kFloat);
253+
te::BufHandle BP("B", {K, N}, te::kFloat);
254254
te::Tensor CT = te::Reduce(
255255
"gemm",
256256
{{M, "M"}, {N, "N"}},

benchmarks/cpp/tensorexpr/bench_parallel.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ class ParallelAdd : public benchmark::Fixture {
3535
};
3636

3737
BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
38-
Placeholder a_buf("a", kFloat, {M});
39-
Placeholder b_buf("b", kFloat, {M});
38+
BufHandle a_buf("a", {M}, kFloat);
39+
BufHandle b_buf("b", {M}, kFloat);
4040
Tensor c_tensor = Compute(
4141
"c", {{M, "m"}}, [&](const VarHandle& m) {
4242
return a_buf.load(m) + b_buf.load(m);

benchmarks/cpp/tensorexpr/bench_reduce.cpp

+10-10
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
220220

221221
int M = A.numel();
222222

223-
te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
223+
te::BufHandle AP("A", {M}, te::kFloat);
224224
te::Tensor BT = te::Reduce(
225225
"reduce_full",
226226
{{1, "N"}},
@@ -252,7 +252,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
252252

253253
int M = A.numel();
254254

255-
te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
255+
te::BufHandle AP("A", {M}, te::kFloat);
256256
te::Tensor BT = te::Reduce(
257257
"reduce_full",
258258
{{1, "N"}},
@@ -292,7 +292,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
292292

293293
int M = A.numel();
294294

295-
te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
295+
te::BufHandle AP("A", {M}, te::kFloat);
296296
te::Tensor BT = te::Reduce(
297297
"reduce_full",
298298
{{1, "N"}},
@@ -334,7 +334,7 @@ BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
334334
const int kChunkSize = 8;
335335
TORCH_CHECK(M % kChunkSize == 0);
336336

337-
te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
337+
te::BufHandle AP("A", {M}, te::kFloat);
338338
te::Tensor BT = te::Reduce(
339339
"reduce_full",
340340
{},
@@ -384,8 +384,8 @@ BENCHMARK_DEFINE_F(Reduce1D, Op)(benchmark::State& state) {
384384
const int M = A.numel();
385385
const int kChunkSize = 8;
386386

387-
te::Placeholder a("A", te::kFloat, {M});
388-
te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
387+
te::BufHandle a("A", {M}, te::kFloat);
388+
te::Tensor b = te::computeSum({a, te::IntList({0}), false}, at::kFloat);
389389
te::LoopNest nest({b});
390390

391391
auto loops = nest.getLoopStmtsFor(b);
@@ -446,8 +446,8 @@ BENCHMARK_REGISTER_F(Reduce2DCol, Torch)
446446

447447
BENCHMARK_DEFINE_F(Reduce2DCol, OpSchedule)(benchmark::State& state) {
448448
constexpr int kCacheSize = 1 << 12;
449-
te::Placeholder a("A", te::kFloat, {M, N});
450-
te::Tensor b = te::computeSum({a.handle(), te::IntList({0}), false}, at::kFloat);
449+
te::BufHandle a("A", {M, N}, te::kFloat);
450+
te::Tensor b = te::computeSum({a, te::IntList({0}), false}, at::kFloat);
451451
te::LoopNest nest({b});
452452

453453
auto sch = state.range(2);
@@ -552,8 +552,8 @@ BENCHMARK_REGISTER_F(Reduce2DRow, Hand)
552552

553553
BENCHMARK_DEFINE_F(Reduce2DRow, OpSchedule)(benchmark::State& state) {
554554
constexpr int kChunkSize = 8;
555-
te::Placeholder a("A", te::kFloat, {M, N});
556-
te::Tensor b = te::computeSum({a.handle(), te::IntList({1}), false}, at::kFloat);
555+
te::BufHandle a("A", {M, N}, te::kFloat);
556+
te::Tensor b = te::computeSum({a, te::IntList({1}), false}, at::kFloat);
557557
te::LoopNest nest({b});
558558

559559
auto sch = state.range(2);

benchmarks/cpp/tensorexpr/bench_signed_log1p.cpp

+8-8
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ class SignedLog1pBench : public benchmark::Fixture {
4242
}
4343

4444
void runNNC(benchmark::State& state) {
45-
Placeholder input_ph(
46-
"input", kFloat, {input_size_int_[0], input_size_int_[1]});
45+
BufHandle input_ph(
46+
"input", {input_size_int_[0], input_size_int_[1]}, kFloat);
4747
Tensor abs_result = Compute(
4848
"aten_abs",
4949
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
@@ -56,8 +56,8 @@ class SignedLog1pBench : public benchmark::Fixture {
5656
[&](const VarHandle& m, const VarHandle& n) {
5757
return log1p(abs_result.load(m, n));
5858
});
59-
Tensor sign_result = computeSign(
60-
{input_ph.handle()}, {input_size_int_[0], input_size_int_[1]});
59+
Tensor sign_result =
60+
computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
6161
Tensor output = Compute(
6262
"aten_mul",
6363
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
@@ -90,8 +90,8 @@ class SignedLog1pBench : public benchmark::Fixture {
9090
}
9191

9292
void runNNCLogVml(benchmark::State& state) {
93-
Placeholder input_ph(
94-
"input", kFloat, {input_size_int_[0], input_size_int_[1]});
93+
BufHandle input_ph(
94+
"input", {input_size_int_[0], input_size_int_[1]}, kFloat);
9595
Tensor abs_result = Compute(
9696
"aten_abs",
9797
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},
@@ -104,8 +104,8 @@ class SignedLog1pBench : public benchmark::Fixture {
104104
[&](const VarHandle& m, const VarHandle& n) {
105105
return log_vml(abs_result.load(m, n) + ExprHandle(1));
106106
});
107-
Tensor sign_result = computeSign(
108-
{input_ph.handle()}, {input_size_int_[0], input_size_int_[1]});
107+
Tensor sign_result =
108+
computeSign({input_ph}, {input_size_int_[0], input_size_int_[1]});
109109
Tensor output = Compute(
110110
"aten_mul",
111111
{{input_size_int_[0], "M"}, {input_size_int_[1], "N"}},

test/cpp/tensorexpr/test_approx.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ std::string diffs(const at::Tensor& a, const at::Tensor& b) {
3131

3232
TEST(Approx, log_vml) {
3333
te::VarHandle N("N", te::kInt);
34-
te::Placeholder A("A", te::kFloat, {N});
34+
te::BufHandle A("A", {N}, te::kFloat);
3535
te::Tensor B = te::Compute(
3636
"B", {N}, [&](const te::VarHandle& i) { return log_vml(A.load(i)); });
3737

0 commit comments

Comments
 (0)