initial step for stateful_poc to enable epctx for genai. #633

Draft: wants to merge 7 commits into base branch msb_release

6 changes: 4 additions & 2 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -104,7 +104,8 @@ BackendManager::BackendManager(SessionContext& session_context,
subgraph_context_.has_dynamic_input_shape = true;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
if ((session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos) &&
session_context_.device_type.find("GPU") != std::string::npos ||
session_context_.device_type.find("NPU") != std::string::npos) &&
!session_context_.disable_dynamic_shapes) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
<< "Creating backend Dynamic Shapes";
@@ -473,7 +474,8 @@ void BackendManager::Compute(OrtKernelContext* context) {
if (subgraph_context_.has_dynamic_input_shape &&
!session_context_.disable_dynamic_shapes &&
(session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
session_context_.device_type.find("GPU") != std::string::npos ||
session_context_.device_type.find("NPU") != std::string::npos)) {
concrete_backend_->Infer(context);
} else if (subgraph_context_.has_dynamic_input_shape) {
std::vector<std::vector<int64_t>> tensor_shapes = GetInputTensorShapes(ctx);
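With NPU added, the same CPU/GPU/NPU substring check is now repeated in the constructor, in Compute(), and again in basic_backend.cc below. A small helper along the lines of the sketch below could factor it out; IsDynamicShapeDevice is a hypothetical name and is not part of this branch.

#include <string>

// Hypothetical helper (sketch only): factors out the repeated device-type checks.
// Substring matching keeps composite targets such as "HETERO:NPU,CPU" working too.
static bool IsDynamicShapeDevice(const std::string& device_type) {
  return device_type.find("CPU") != std::string::npos ||
         device_type.find("GPU") != std::string::npos ||
         device_type.find("NPU") != std::string::npos;
}

// Usage, assuming the member names shown above:
//   if (subgraph_context_.has_dynamic_input_shape &&
//       !session_context_.disable_dynamic_shapes &&
//       IsDynamicShapeDevice(session_context_.device_type)) { ... }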
22 changes: 17 additions & 5 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -91,7 +91,8 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
exe_network_ = OVCore::Get()->ImportModel(*model_stream,
hw_target,
device_config,
subgraph_context_.subgraph_name);
session_context.onnx_model_path_name.string());

model_stream.reset(); // Delete stream after it is no longer needed
} else if (!session_context_.has_external_weights &&
!subgraph_context_.has_dynamic_input_shape &&
@@ -167,7 +168,6 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
if (session_context_.precision.find("ACCURACY") != std::string::npos &&
session_context_.device_type.find("GPU") != std::string::npos) {
if (session_context_.OpenVINO_Version.at(0) >= 2024) {
device_config.emplace(ov::hint::inference_precision(ov::element::undefined));
device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY));
} else {
if (!subgraph_context_.model_precision.empty())
@@ -365,14 +365,22 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
index++;
}

// For the stateful PoC, the ONNX model still declares KV-cache (past/present) tensors,
// but the internal conversion to a stateful model removed them, so we just continue here
// to avoid a runtime exception.
//if (input_name.empty()) continue;
if (input_name.empty() || input_name == "beam_idx") continue;

ORT_ENFORCE(!input_name.empty(), log_tag,
"Input names mismatch between OpenVINO and ONNX. ", onnx_input_name,
" doesn't exist in the list of OpenVINO input tensor names");
size_t batch_slice_idx = 0;
if (subgraph_context_.has_dynamic_input_shape &&
!session_context_.disable_dynamic_shapes &&
(session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
session_context_.device_type.find("GPU") != std::string::npos ||
session_context_.device_type.find("NPU") != std::string::npos)) {
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
auto tensor_info = tensor.GetTensorTypeAndShapeInfo();
auto tensor_shape = tensor_info.GetShape();
@@ -434,7 +442,10 @@
}
} // Loop subgraph original input names

if (session_context_.device_type.find("NPU") != std::string::npos) {
// For the stateful PoC, '&& false' is added here to disable this path, since NPU is now
// forced through the same dynamic-shape path above that CPU & GPU use.
if (session_context_.device_type.find("NPU") != std::string::npos &&
!subgraph_context_.has_dynamic_input_shape && false) {
// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
auto output_idx = 0;
@@ -629,7 +640,8 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
"list of OpenVINO output tensor names");
}
if ((session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
session_context_.device_type.find("GPU") != std::string::npos ||
session_context_.device_type.find("NPU") != std::string::npos)) {
try {
graph_output_blob = infer_request->GetTensor(output_name);
} catch (const char* msg) {
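The skip at the top of the input-binding loop above relies on fixed names (an empty mapping or "beam_idx"). If the stateful conversion ever removes or renames additional inputs, the loop could instead ask the compiled model whether an input still exists. Below is a sketch using the public OpenVINO C++ API; CompiledModelHasInput is a hypothetical helper, not part of this branch.

#include <openvino/runtime/compiled_model.hpp>

// Hypothetical alternative to the name-based skip: query the compiled (stateful) model
// for the input port instead of special-casing empty names and "beam_idx".
static bool CompiledModelHasInput(const ov::CompiledModel& compiled, const std::string& name) {
  for (const auto& port : compiled.inputs()) {
    if (port.get_names().count(name) != 0) {  // tensor names attached to this input port
      return true;
    }
  }
  return false;
}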
@@ -14,6 +14,7 @@

namespace onnxruntime {
namespace openvino_ep {

void ParseConfigOptions(ProviderInfo& pi, const ConfigOptions& config_options) {
pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
@@ -330,7 +331,9 @@ struct OpenVINO_Provider : Provider {

// Always true for the NPU plugin, or when the option is passed explicitly.
if (pi.device_type.find("NPU") != std::string::npos) {
pi.disable_dynamic_shapes = true;
// For the stateful PoC, we want control to pass through the dynamic-shape paths,
// so just force this to false right now.
pi.disable_dynamic_shapes = false;
}

// Append values to config to support weight-as-inputs conversion for shared contexts
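ParseConfigOptions above is what reads kOrtSessionOptionEpContextEnable from the session configuration, which is the "epctx" part of this PR's title. For reference, here is a minimal sketch of how a session could request EP-context export while targeting the NPU through this EP; the string keys are the values behind the kOrtSessionOption* constants, and AppendExecutionProvider_OpenVINO_V2 is assumed to be available in the ORT C++ API on this branch.

#include <onnxruntime_cxx_api.h>

// Minimal sketch under the assumptions noted above: export an EP-context ("epctx")
// model while running on the NPU via the OpenVINO EP.
int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  so.AddConfigEntry("ep.context_enable", "1");                  // kOrtSessionOptionEpContextEnable
  so.AddConfigEntry("ep.context_file_path", "model_ctx.onnx");  // kOrtSessionOptionEpContextFilePath
  so.AppendExecutionProvider_OpenVINO_V2({{"device_type", "NPU"}});
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);
  return 0;
}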
141 changes: 139 additions & 2 deletions onnxruntime/core/providers/openvino/ov_interface.cc
@@ -8,6 +8,9 @@
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/openvino/backend_utils.h"

// For the make-stateful utility function(s)
#include "core/providers/openvino/ov_stateful_patch_utils.h"

using Exception = ov::Exception;

namespace onnxruntime {
@@ -77,7 +80,52 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_netwo
const std::string& name) {
ov::CompiledModel obj;
try {
obj = core.compile_model(ie_cnn_network, hw_target, device_config);
if (true) {
ov::AnyMap config;

// Create a clone of ie_cnn_network, since it's a const ov::Model and we need to patch it.
// Note! With this default path, the model runs but produces garbage (for NPUW). For CPU it's fine.
auto mutable_model = ie_cnn_network->clone();

// Uncomment to override the ov::Model with one produced by OV's ONNX front-end.
// For some reason this makes it work, even though model.onnx is the same model read by ORT GenAI.
// auto mutable_model = core.read_model("C:\\Users\\LNL\\Workspace\\ORT\\deepseek_r1_distill_qwen_1.5B_int4_ort_qdq\\model.onnx");

std::cout << "stateless model" << std::endl;
logBasicModelInfo(mutable_model);

std::cout << "making stateful..." << std::endl;
patch_stateful_decoder(mutable_model);

std::cout << "after stateful transition:" << std::endl;
logBasicModelInfo(mutable_model);

// This patches the model so that it only produces the logits required for sampling.
// For NPU this happens within NPUW::LLMCompiledModel creation either way, but it is
// done here mostly to align the behavior for the other devices (CPU, GPU).
apply_slice_before_matmul_transformation(mutable_model);

auto kv_pos = get_kv_axes_pos(mutable_model);
std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;

if (hw_target.find("NPU") != std::string::npos) {
KVDesc kv_desc;
kv_desc.max_prompt_len = pop_int_and_cast(device_config, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(device_config, "MIN_RESPONSE_LEN").value_or(128u);

std::cout << "kv_desc.max_prompt_len = " << kv_desc.max_prompt_len << std::endl;
std::cout << "kv_desc.min_response_len = " << kv_desc.min_response_len << std::endl;

update_npu_config(config, mutable_model, kv_pos, kv_desc);
}

std::cout << "calling compile on stateful model..." << std::endl;
obj = core.compile_model(mutable_model, hw_target, config);
std::cout << "done calling compile on stateful model..." << std::endl;
} else {
obj = core.compile_model(ie_cnn_network, hw_target, device_config);
}
#ifndef NDEBUG
printDebugInfo(obj);
#endif
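CompileModel above pops MAX_PROMPT_LEN and MIN_RESPONSE_LEN out of the incoming device_config (defaulting to 1024 and 128) before building the NPU config. pop_int_and_cast comes from ov_stateful_patch_utils.h, which is not part of this diff; the sketch below is only an assumption of what such a helper does, and the real implementation may differ.

#include <cstdint>
#include <optional>
#include <string>
#include <openvino/core/any.hpp>

// Assumed behavior (sketch): remove the key from the AnyMap so it is not forwarded to
// the plugin, and return its value as uint32_t if it was present.
static std::optional<uint32_t> pop_int_and_cast_sketch(ov::AnyMap& config, const std::string& key) {
  auto it = config.find(key);
  if (it == config.end()) return std::nullopt;
  // Provider options are assumed to arrive as strings here.
  const auto value = static_cast<uint32_t>(std::stoul(it->second.as<std::string>()));
  config.erase(it);
  return value;
}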
@@ -115,7 +163,83 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
std::string name) {
try {
ov::CompiledModel obj;
obj = core.import_model(model_stream, hw_target, device_config);

// Check if it's an XML model
std::streampos originalPos = model_stream.tellg();
std::string header(5, '\0'); // Allocate space for "<?xml"
model_stream.read(&header[0], 5);

// Restore the stream position (important for reusing the stream)
model_stream.clear(); // Clear any read errors
model_stream.seekg(originalPos);

if (header != "<?xml") {
obj = core.import_model(model_stream, hw_target, device_config);

} else {

// Get the path to the companion .bin file
std::string bin_file;
if (name.size() >= 5 && name.substr(name.size() - 5) == ".onnx") {
bin_file = name;
bin_file.replace(name.size() - 5, 5, ".bin");
} else {
throw std::runtime_error("Invalid model name. Make sure *.onnx, *.xml, and *.bin carry the same name." );
}

// Read the model XML into a string
std::stringstream xml_stream;
xml_stream << model_stream.rdbuf();
std::string xml_content = xml_stream.str();

// Read model.bin into a vector
std::ifstream bin_stream;
bin_stream.open(bin_file, std::ios::binary);
if (!bin_stream.is_open()) {
throw std::runtime_error("Failed to open " + bin_file);
}

bin_stream.seekg(0, std::ios::end);
std::streamsize size = bin_stream.tellg();
bin_stream.seekg(0, std::ios::beg);
std::vector<uint8_t> bin_data(size);
if (!bin_stream.read(reinterpret_cast<char*>(bin_data.data()), size)) {
throw std::runtime_error("Failed to read binary data from " + bin_file);
}

// Create an ov::Tensor for weights
ov::Tensor weights_tensor(ov::element::u8, {bin_data.size()}, bin_data.data());

// Load the model explicitly with XML content and weights
std::shared_ptr<ov::Model> model = core.read_model(xml_content, weights_tensor);


ov::AnyMap config = device_config;

std::cout << "already a stateful model since it came from EPCtx:" << std::endl;
logBasicModelInfo(model);

auto kv_pos = get_kv_axes_pos(model);
std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;

if (hw_target.find("NPU") != std::string::npos) {
KVDesc kv_desc;
kv_desc.max_prompt_len = pop_int_and_cast(config, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(config, "MIN_RESPONSE_LEN").value_or(128u);

std::cout << "kv_desc.max_prompt_len = " << kv_desc.max_prompt_len << std::endl;
std::cout << "kv_desc.min_response_len = " << kv_desc.min_response_len << std::endl;

update_npu_config(config, model, kv_pos, kv_desc);
} else {
apply_slice_before_matmul_transformation(model);
}

std::cout << "calling compile on stateful model for" << hw_target << " ... " << std::endl;
obj = core.compile_model(model, hw_target, config);
std::cout << "done calling compile on stateful model..." << std::endl;
}
#ifndef NDEBUG
printDebugInfo(obj);
#endif
@@ -128,6 +252,9 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
}
}




void OVCore::SetCache(const std::string& cache_dir_path) {
core.set_property(ov::cache_dir(cache_dir_path));
}
@@ -211,6 +338,16 @@ std::string OVInferRequest::GetInputTensorName(uint32_t index) {
void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) {
try {
ovInfReq.set_tensor(name, *(blob.get()));

if (name == "input_ids") {
// Since we can't seem to set it at the ORT GenAI layer right now, we just set it here
// as a workaround.
// TODO: Fix this.
ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {1});
std::fill_n(beam_idx.data<int32_t>(), 1, 0);
ovInfReq.set_tensor("beam_idx", beam_idx);
}

} catch (const Exception& e) {
ORT_THROW(log_tag + " Cannot set Remote Blob for output: " + name + e.what());
} catch (...) {
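The workaround above pins beam_idx to a single zero whenever input_ids is set, which is only correct for batch-1 greedy decoding. Once the GenAI layer can provide the tensor itself, a more general assignment could look like the sketch below; SetBeamIdx is a hypothetical function, and for beam search the indices would come from the beam-reordering step rather than from std::iota.

#include <numeric>
#include <openvino/runtime/infer_request.hpp>

// Hypothetical generalization of the beam_idx workaround: one KV-cache slot per sequence
// for greedy/batched decoding; beam search would pass reordered indices instead.
static void SetBeamIdx(ov::InferRequest& req, size_t batch_size) {
  ov::Tensor beam_idx(ov::element::i32, ov::Shape{batch_size});
  std::iota(beam_idx.data<int32_t>(), beam_idx.data<int32_t>() + batch_size, 0);
  req.set_tensor("beam_idx", beam_idx);
}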