@@ -88,38 +88,47 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr<const OVNetwork>& ie_cnn_netwo
88
88
// Note: with this default path the model runs but produces garbage output on NPUW; on CPU it works fine.
89
89
auto mutable_model = ie_cnn_network->clone ();
90
90
91
- std::cout << " stateless model" << std::endl;
92
- logBasicModelInfo (mutable_model);
91
+ if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled ()) {
92
+ std::cout << " Stateless OV Model Statistic" << std::endl;
93
+ LogBasicModelInfo (mutable_model);
94
+ }
95
+ LogBasicModelInfo (mutable_model);
93
96
94
- std::cout << " making stateful... " << std::endl;
95
- patch_stateful_decoder (mutable_model);
97
+ LOGS_DEFAULT (INFO) << log_tag << " Converting from Stateless OV Model to Stateful OV Model " << std::endl;
98
+ PatchStatefulDecoder (mutable_model);
96
99
97
- std::cout << " after stateful transition:" << std::endl;
98
- logBasicModelInfo (mutable_model);
100
+ if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled ()) {
101
+ std::cout << " Stateful OV Model Statistic" << std::endl;
102
+ LogBasicModelInfo (mutable_model);
103
+ }
99
104
100
105
// This patches the model so that it only produces the logits required for sampling.
101
106
// NPUW::LLMCompiledModel creation does this on its own either way, so the call below is
102
107
// here mostly to align this behavior for other devices (CPU, GPU).
103
- apply_slice_before_matmul_transformation (mutable_model);
108
+ ApplySliceBeforeMatmulTransformation (mutable_model);
104
109
105
- auto kv_pos = get_kv_axes_pos (mutable_model);
106
- std::cout << " kv_pos.batch = " << kv_pos.batch << std::endl;
107
- std::cout << " kv_pos.seq_len = " << kv_pos.seq_len << std::endl;
110
+ auto kv_pos = GetKVAxesPos (mutable_model);
111
+ if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled ()) {
112
+ std::cout << " kv_pos.batch = " << kv_pos.batch << std::endl;
113
+ std::cout << " kv_pos.seq_len = " << kv_pos.seq_len << std::endl;
114
+ }
108
115
109
116
if (hw_target.find (" NPU" ) != std::string::npos) {
110
117
KVDesc kv_desc;
111
- kv_desc.max_prompt_len = pop_int_and_cast (device_config, " MAX_PROMPT_LEN" ).value_or (1024u );
112
- kv_desc.min_response_len = pop_int_and_cast (device_config, " MIN_RESPONSE_LEN" ).value_or (128u );
118
+ kv_desc.max_prompt_len = PopIntAndCast (device_config, " MAX_PROMPT_LEN" ).value_or (1024u );
119
+ kv_desc.min_response_len = PopIntAndCast (device_config, " MIN_RESPONSE_LEN" ).value_or (128u );
113
120
114
- std::cout << " kv_desc.max_prompt_len = " << kv_desc.max_prompt_len << std::endl;
115
- std::cout << " kv_desc.min_response_len = " << kv_desc.min_response_len << std::endl;
121
+ if (onnxruntime::openvino_ep::backend_utils::IsDebugEnabled ()) {
122
+ std::cout << " kv_desc.max_prompt_len = " << kv_desc.max_prompt_len << std::endl;
123
+ std::cout << " kv_desc.min_response_len = " << kv_desc.min_response_len << std::endl;
124
+ }
116
125
117
- update_npu_config (config, mutable_model , kv_pos, kv_desc);
126
+ UpdateNPUConfig (config, kv_pos, kv_desc);
118
127
}
119
128
120
- std::cout << " calling compile on stateful model ..." << std::endl;
129
+ std::cout << " Compiling Stateful OV Model ..." << std::endl;
121
130
obj = core.compile_model (mutable_model, hw_target, config);
122
- std::cout << " done calling compile on stateful model... " << std::endl;
131
+ std::cout << " Stateful OV Model Compilation Complete " << std::endl;
123
132
} else {
124
133
obj = core.compile_model (ie_cnn_network, hw_target, device_config);
125
134
}
0 commit comments