initial step for stateful_poc to enable epctx for genai. #633

Draft: wants to merge 7 commits into base branch msb_release

6 changes: 4 additions & 2 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -104,7 +104,8 @@ BackendManager::BackendManager(SessionContext& session_context,
subgraph_context_.has_dynamic_input_shape = true;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
if ((session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos) &&
session_context_.device_type.find("GPU") != std::string::npos ||
session_context_.device_type.find("NPU") != std::string::npos) &&
!session_context_.disable_dynamic_shapes) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
<< "Creating backend Dynamic Shapes";
@@ -473,7 +474,8 @@ void BackendManager::Compute(OrtKernelContext* context) {
if (subgraph_context_.has_dynamic_input_shape &&
!session_context_.disable_dynamic_shapes &&
(session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
session_context_.device_type.find("GPU") != std::string::npos ||
session_context_.device_type.find("NPU") != std::string::npos)) {
concrete_backend_->Infer(context);
} else if (subgraph_context_.has_dynamic_input_shape) {
std::vector<std::vector<int64_t>> tensor_shapes = GetInputTensorShapes(ctx);
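With NPU added, the same CPU/GPU/NPU substring check is now repeated in the constructor, in Compute(), and again in basic_backend.cc below. A small helper along the lines of the sketch below could factor it out; IsDynamicShapeDevice is a hypothetical name and is not part of this branch.

#include <string>

// Hypothetical helper (sketch only): factors out the repeated device-type checks.
// Substring matching keeps composite targets such as "HETERO:NPU,CPU" working too.
static bool IsDynamicShapeDevice(const std::string& device_type) {
  return device_type.find("CPU") != std::string::npos ||
         device_type.find("GPU") != std::string::npos ||
         device_type.find("NPU") != std::string::npos;
}

// Usage, assuming the member names shown above:
//   if (subgraph_context_.has_dynamic_input_shape &&
//       !session_context_.disable_dynamic_shapes &&
//       IsDynamicShapeDevice(session_context_.device_type)) { ... }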
22 changes: 17 additions & 5 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -91,7 +91,8 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
exe_network_ = OVCore::Get()->ImportModel(*model_stream,
hw_target,
device_config,
subgraph_context_.subgraph_name);
session_context.onnx_model_path_name.string());

model_stream.reset(); // Delete stream after it is no longer needed
} else if (!session_context_.has_external_weights &&
!subgraph_context_.has_dynamic_input_shape &&
@@ -167,7 +168,6 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
if (session_context_.precision.find("ACCURACY") != std::string::npos &&
session_context_.device_type.find("GPU") != std::string::npos) {
if (session_context_.OpenVINO_Version.at(0) >= 2024) {
device_config.emplace(ov::hint::inference_precision(ov::element::undefined));
device_config.emplace(ov::hint::execution_mode(ov::hint::ExecutionMode::ACCURACY));
} else {
if (!subgraph_context_.model_precision.empty())
@@ -365,14 +365,22 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
index++;
}

// For the stateful PoC, the ONNX model still declares KV-cache (past/present) tensors,
// but the internal conversion to a stateful model removed them, so we just continue here
// to avoid a runtime exception.
//if (input_name.empty()) continue;
if (input_name.empty() || input_name == "beam_idx") continue;

ORT_ENFORCE(!input_name.empty(), log_tag,
"Input names mismatch between OpenVINO and ONNX. ", onnx_input_name,
" doesn't exist in the list of OpenVINO input tensor names");
size_t batch_slice_idx = 0;
if (subgraph_context_.has_dynamic_input_shape &&
!session_context_.disable_dynamic_shapes &&
(session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
session_context_.device_type.find("GPU") != std::string::npos ||
session_context_.device_type.find("NPU") != std::string::npos)) {
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
auto tensor_info = tensor.GetTensorTypeAndShapeInfo();
auto tensor_shape = tensor_info.GetShape();
@@ -434,7 +442,10 @@
}
} // Loop subgraph original input names

if (session_context_.device_type.find("NPU") != std::string::npos) {
// For the stateful PoC, '&& false' is added here to disable this path, since NPU is now
// forced through the same dynamic-shape path above that CPU & GPU use.
if (session_context_.device_type.find("NPU") != std::string::npos &&
!subgraph_context_.has_dynamic_input_shape && false) {
// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
auto output_idx = 0;
@@ -629,7 +640,8 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe
"list of OpenVINO output tensor names");
}
if ((session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos)) {
session_context_.device_type.find("GPU") != std::string::npos ||
session_context_.device_type.find("NPU") != std::string::npos)) {
try {
graph_output_blob = infer_request->GetTensor(output_name);
} catch (const char* msg) {
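The skip at the top of the input-binding loop above relies on fixed names (an empty mapping or "beam_idx"). If the stateful conversion ever removes or renames additional inputs, the loop could instead ask the compiled model whether an input still exists. Below is a sketch using the public OpenVINO C++ API; CompiledModelHasInput is a hypothetical helper, not part of this branch.

#include <openvino/runtime/compiled_model.hpp>

// Hypothetical alternative to the name-based skip: query the compiled (stateful) model
// for the input port instead of special-casing empty names and "beam_idx".
static bool CompiledModelHasInput(const ov::CompiledModel& compiled, const std::string& name) {
  for (const auto& port : compiled.inputs()) {
    if (port.get_names().count(name) != 0) {  // tensor names attached to this input port
      return true;
    }
  }
  return false;
}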
@@ -14,6 +14,7 @@

namespace onnxruntime {
namespace openvino_ep {

void ParseConfigOptions(ProviderInfo& pi, const ConfigOptions& config_options) {
pi.so_disable_cpu_ep_fallback = config_options.GetConfigOrDefault(kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
pi.so_context_enable = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
@@ -330,7 +331,9 @@ struct OpenVINO_Provider : Provider {

// Always true for the NPU plugin, or when the option is passed explicitly.
if (pi.device_type.find("NPU") != std::string::npos) {
pi.disable_dynamic_shapes = true;
// For the stateful PoC, we want control to pass through the dynamic-shape paths,
// so just force this to false right now.
pi.disable_dynamic_shapes = false;
}

// Append values to config to support weight-as-inputs conversion for shared contexts
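ParseConfigOptions above is what reads kOrtSessionOptionEpContextEnable from the session configuration, which is the "epctx" part of this PR's title. For reference, here is a minimal sketch of how a session could request EP-context export while targeting the NPU through this EP; the string keys are the values behind the kOrtSessionOption* constants, and AppendExecutionProvider_OpenVINO_V2 is assumed to be available in the ORT C++ API on this branch.

#include <onnxruntime_cxx_api.h>

// Minimal sketch under the assumptions noted above: export an EP-context ("epctx")
// model while running on the NPU via the OpenVINO EP.
int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  so.AddConfigEntry("ep.context_enable", "1");                  // kOrtSessionOptionEpContextEnable
  so.AddConfigEntry("ep.context_file_path", "model_ctx.onnx");  // kOrtSessionOptionEpContextFilePath
  so.AppendExecutionProvider_OpenVINO_V2({{"device_type", "NPU"}});
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);
  return 0;
}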
141 changes: 139 additions & 2 deletions onnxruntime/core/providers/openvino/ov_interface.cc
@@ -8,6 +8,9 @@
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/openvino/backend_utils.h"

// For the make-stateful utility function(s)
#include "core/providers/openvino/ov_stateful_patch_utils.h"

using Exception = ov::Exception;

namespace onnxruntime {
@@ -77,7 +80,52 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_netwo
const std::string& name) {
ov::CompiledModel obj;
try {
obj = core.compile_model(ie_cnn_network, hw_target, device_config);
if (true) {
ov::AnyMap config;

// Create a clone of ie_cnn_network, since it's a const ov::Model and we need to patch it.
// Note! With this default path, the model runs but produces garbage (for NPUW). For CPU it's fine.
auto mutable_model = ie_cnn_network->clone();

// Uncomment to override the ov::Model with one produced by OV's ONNX front-end.
// For some reason this makes it work, even though model.onnx is the same model read by ORT GenAI.
// auto mutable_model = core.read_model("C:\\Users\\LNL\\Workspace\\ORT\\deepseek_r1_distill_qwen_1.5B_int4_ort_qdq\\model.onnx");

std::cout << "stateless model" << std::endl;
logBasicModelInfo(mutable_model);

std::cout << "making stateful..." << std::endl;
patch_stateful_decoder(mutable_model);

std::cout << "after stateful transition:" << std::endl;
logBasicModelInfo(mutable_model);

// This patches the model so that it only produces the logits required for sampling.
// For NPU this happens within NPUW::LLMCompiledModel creation either way, but it is
// done here mostly to align the behavior for the other devices (CPU, GPU).
apply_slice_before_matmul_transformation(mutable_model);

auto kv_pos = get_kv_axes_pos(mutable_model);
std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;

if (hw_target.find("NPU") != std::string::npos) {
KVDesc kv_desc;
kv_desc.max_prompt_len = pop_int_and_cast(device_config, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(device_config, "MIN_RESPONSE_LEN").value_or(128u);

std::cout << "kv_desc.max_prompt_len = " << kv_desc.max_prompt_len << std::endl;
std::cout << "kv_desc.min_response_len = " << kv_desc.min_response_len << std::endl;

update_npu_config(config, mutable_model, kv_pos, kv_desc);
}

std::cout << "calling compile on stateful model..." << std::endl;
obj = core.compile_model(mutable_model, hw_target, config);
std::cout << "done calling compile on stateful model..." << std::endl;
} else {
obj = core.compile_model(ie_cnn_network, hw_target, device_config);
}
#ifndef NDEBUG
printDebugInfo(obj);
#endif
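CompileModel above pops MAX_PROMPT_LEN and MIN_RESPONSE_LEN out of the incoming device_config (defaulting to 1024 and 128) before building the NPU config. pop_int_and_cast comes from ov_stateful_patch_utils.h, which is not part of this diff; the sketch below is only an assumption of what such a helper does, and the real implementation may differ.

#include <cstdint>
#include <optional>
#include <string>
#include <openvino/core/any.hpp>

// Assumed behavior (sketch): remove the key from the AnyMap so it is not forwarded to
// the plugin, and return its value as uint32_t if it was present.
static std::optional<uint32_t> pop_int_and_cast_sketch(ov::AnyMap& config, const std::string& key) {
  auto it = config.find(key);
  if (it == config.end()) return std::nullopt;
  // Provider options are assumed to arrive as strings here.
  const auto value = static_cast<uint32_t>(std::stoul(it->second.as<std::string>()));
  config.erase(it);
  return value;
}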
@@ -115,7 +163,83 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
std::string name) {
try {
ov::CompiledModel obj;
obj = core.import_model(model_stream, hw_target, device_config);

// Check if it's an XML model
std::streampos originalPos = model_stream.tellg();
std::string header(5, '\0'); // Allocate space for "<?xml"
model_stream.read(&header[0], 5);

// Restore the stream position (important for reusing the stream)
model_stream.clear(); // Clear any read errors
model_stream.seekg(originalPos);

if (header != "<?xml") {
obj = core.import_model(model_stream, hw_target, device_config);

} else {

// Get the path to the companion .bin file
std::string bin_file;
if (name.size() >= 5 && name.substr(name.size() - 5) == ".onnx") {
bin_file = name;
bin_file.replace(name.size() - 5, 5, ".bin");
} else {
throw std::runtime_error("Invalid model name. Make sure *.onnx, *.xml, and *.bin carry the same name." );
}

// Read the model XML into a string
std::stringstream xml_stream;
xml_stream << model_stream.rdbuf();
std::string xml_content = xml_stream.str();

// Read model.bin into a vector
std::ifstream bin_stream;
bin_stream.open(bin_file, std::ios::binary);
if (!bin_stream.is_open()) {
throw std::runtime_error("Failed to open " + bin_file);
}

bin_stream.seekg(0, std::ios::end);
std::streamsize size = bin_stream.tellg();
bin_stream.seekg(0, std::ios::beg);
std::vector<uint8_t> bin_data(size);
if (!bin_stream.read(reinterpret_cast<char*>(bin_data.data()), size)) {
throw std::runtime_error("Failed to read binary data from " + bin_file);
}

// Create an ov::Tensor for weights
ov::Tensor weights_tensor(ov::element::u8, {bin_data.size()}, bin_data.data());

// Load the model explicitly with XML content and weights
std::shared_ptr<ov::Model> model = core.read_model(xml_content, weights_tensor);


ov::AnyMap config = device_config;

std::cout << "already a stateful model since it came from EPCtx:" << std::endl;
logBasicModelInfo(model);

auto kv_pos = get_kv_axes_pos(model);
std::cout << "kv_pos.batch = " << kv_pos.batch << std::endl;
std::cout << "kv_pos.seq_len = " << kv_pos.seq_len << std::endl;

if (hw_target.find("NPU") != std::string::npos) {
KVDesc kv_desc;
kv_desc.max_prompt_len = pop_int_and_cast(config, "MAX_PROMPT_LEN").value_or(1024u);
kv_desc.min_response_len = pop_int_and_cast(config, "MIN_RESPONSE_LEN").value_or(128u);

std::cout << "kv_desc.max_prompt_len = " << kv_desc.max_prompt_len << std::endl;
std::cout << "kv_desc.min_response_len = " << kv_desc.min_response_len << std::endl;

update_npu_config(config, model, kv_pos, kv_desc);
} else {
apply_slice_before_matmul_transformation(model);
}

std::cout << "calling compile on stateful model for" << hw_target << " ... " << std::endl;
obj = core.compile_model(model, hw_target, config);
std::cout << "done calling compile on stateful model..." << std::endl;
}
#ifndef NDEBUG
printDebugInfo(obj);
#endif
@@ -128,6 +252,9 @@ OVExeNetwork OVCore::ImportModel(std::istream& model_stream,
}
}




void OVCore::SetCache(const std::string& cache_dir_path) {
core.set_property(ov::cache_dir(cache_dir_path));
}
@@ -211,6 +338,16 @@ std::string OVInferRequest::GetInputTensorName(uint32_t index) {
void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) {
try {
ovInfReq.set_tensor(name, *(blob.get()));

if (name == "input_ids") {
// Since we can't seem to set it at the ORT GenAI layer right now, we just set it here
// as a workaround.
// TODO: Fix this.
ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {1});
std::fill_n(beam_idx.data<int32_t>(), 1, 0);
ovInfReq.set_tensor("beam_idx", beam_idx);
}

} catch (const Exception& e) {
ORT_THROW(log_tag + " Cannot set Remote Blob for output: " + name + e.what());
} catch (...) {
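The workaround above pins beam_idx to a single zero whenever input_ids is set, which is only correct for batch-1 greedy decoding. Once the GenAI layer can provide the tensor itself, a more general assignment could look like the sketch below; SetBeamIdx is a hypothetical function, and for beam search the indices would come from the beam-reordering step rather than from std::iota.

#include <numeric>
#include <openvino/runtime/infer_request.hpp>

// Hypothetical generalization of the beam_idx workaround: one KV-cache slot per sequence
// for greedy/batched decoding; beam search would pass reordered indices instead.
static void SetBeamIdx(ov::InferRequest& req, size_t batch_size) {
  ov::Tensor beam_idx(ov::element::i32, ov::Shape{batch_size});
  std::iota(beam_idx.data<int32_t>(), beam_idx.data<int32_t>() + batch_size, 0);
  req.set_tensor("beam_idx", beam_idx);
}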