
Enable C++ only text generation #3260

Draft · wants to merge 3 commits into base: main
19 changes: 8 additions & 11 deletions src/BUILD
@@ -602,18 +602,15 @@ cc_library(
deps = select({
"//:not_disable_python": [
"//src/python:libovmspythonmodule",
# Jinja template processing is done in Python
"//src/llm:llmcalculator",
"//src/llm:genai_servables",
"//src/llm:text_processor",
],
"//:disable_python": []
}) + select({
"//conditions:default": [],
"//:not_disable_mediapipe" : [
"//src/llm:openai_completions_api_handler",
"//src/embeddings:embeddingscalculator",
"//src/rerank:rerankcalculator",],
"//src/rerank:rerankcalculator",
"//src/llm:llmcalculator",],
}) + select({
"//:enable_drogon": ["libdrogon_http_server"],
"//conditions:default" : ["libnet_http_server"],
@@ -2756,6 +2753,11 @@ cc_test(
"test/get_mediapipe_graph_metadata_response_test.cpp",
"test/mediapipe_framework_test.cpp",
"test/http_openai_handler_test.cpp",
"test/llm/llmnode_test.cpp",
"test/llm/max_model_length_test.cpp",
"test/llm/text_streamer_test.cpp",
"test/llm/visual_language_model/complete_flow_test.cpp",
"test/llm/visual_language_model/initialization_test.cpp",
],
"//:disable_mediapipe" : [
"test/disabled_mediapipe_test.cpp",
@@ -2765,13 +2767,8 @@ cc_test(
# OvmsPyTensor is currently not used in OVMS core and is just a base for the binding.
# "test/python/ovms_py_tensor_test.cpp",
"test/pythonnode_test.cpp",
# LLM logic uses Python for processing Jinja templates
"test/llm/llmnode_test.cpp",
"test/llm/max_model_length_test.cpp",
# LLM logic uses Python for processing Jinja templates when built with Python enabled
"test/llm/llmtemplate_test.cpp",
"test/llm/text_streamer_test.cpp",
"test/llm/visual_language_model/complete_flow_test.cpp",
"test/llm/visual_language_model/initialization_test.cpp",
],
"//:disable_python" : [],
}),
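With this regrouping, the LLM unit tests build whenever MediaPipe is enabled, regardless of the Python setting; only the Jinja template test stays behind `//:not_disable_python`. A minimal illustrative sketch (not taken from the PR; the gtest usage is an assumption) of how a shared test file can still branch on the build flavor via the `PYTHON_DISABLE` define used throughout this change:

```cpp
#include <gtest/gtest.h>

// Sketch only: PYTHON_DISABLE is assumed to be injected by the build
// (0 when Python support is compiled in, 1 for the C++-only build).
TEST(BuildConfig, ChatTemplateBackend) {
#if (PYTHON_DISABLE == 0)
    // Python build: Jinja chat templates are rendered by the embedded interpreter.
    const char* backend = "py_jinja_template_processor";
#else
    // C++-only build: chat templates come from ov::genai's tokenizer.
    const char* backend = "genai_tokenizer";
#endif
    EXPECT_NE(backend, nullptr);
}
```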
48 changes: 33 additions & 15 deletions src/llm/BUILD
@@ -16,9 +16,9 @@

load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library")
load("//:common_settings.bzl",
"COMMON_STATIC_LIBS_COPTS", "COMMON_STATIC_LIBS_LINKOPTS", "COMMON_FUZZER_COPTS", "COMMON_FUZZER_LINKOPTS", "COMMON_LOCAL_DEFINES", "PYBIND_DEPS")
"COMMON_STATIC_LIBS_COPTS", "COMMON_STATIC_LIBS_LINKOPTS", "COMMON_FUZZER_COPTS", "COMMON_FUZZER_LINKOPTS", "COMMON_LOCAL_DEFINES", "PYBIND_DEPS", "COPTS_PYTHON")

COPTS_ADJUSTED = COMMON_STATIC_LIBS_COPTS + select({
COPTS_ADJUSTED = COMMON_STATIC_LIBS_COPTS + COPTS_PYTHON + select({
"//conditions:default": [],
"//:fuzzer_build" : COMMON_FUZZER_COPTS,
})
@@ -92,13 +92,30 @@ cc_library(

cc_library(
name = "genai_servables",
hdrs = ["servable.hpp", "servable_initializer.hpp",
"language_model/continuous_batching/servable.hpp", "language_model/continuous_batching/llm_executor.hpp", "language_model/continuous_batching/servable_initializer.hpp",
"visual_language_model/continuous_batching/servable.hpp", "language_model/legacy/servable.hpp", "language_model/legacy/servable_initializer.hpp", "language_model/legacy/legacy_executor.hpp",
"visual_language_model/legacy/servable.hpp", "visual_language_model/legacy/servable_initializer.hpp", "visual_language_model/legacy/legacy_executor.hpp"],
srcs = ["servable.cpp", "servable_initializer.cpp", "language_model/continuous_batching/servable.cpp", "language_model/continuous_batching/servable_initializer.cpp",
"visual_language_model/continuous_batching/servable.cpp", "language_model/legacy/servable.cpp", "language_model/legacy/servable_initializer.cpp", "language_model/legacy/legacy_executor.cpp",
"visual_language_model/legacy/servable.cpp", "visual_language_model/legacy/servable_initializer.cpp", "visual_language_model/legacy/legacy_executor.cpp"],
hdrs = ["servable.hpp",
"servable_initializer.hpp",
"language_model/continuous_batching/servable.hpp",
"language_model/continuous_batching/llm_executor.hpp",
"language_model/continuous_batching/servable_initializer.hpp",
"visual_language_model/continuous_batching/servable.hpp",
"language_model/legacy/servable.hpp",
"language_model/legacy/servable_initializer.hpp",
"language_model/legacy/legacy_executor.hpp",
"visual_language_model/legacy/servable.hpp",
"visual_language_model/legacy/servable_initializer.hpp",
"visual_language_model/legacy/legacy_executor.hpp",
"text_utils.hpp"],
srcs = ["servable.cpp",
"servable_initializer.cpp",
"language_model/continuous_batching/servable.cpp",
"language_model/continuous_batching/servable_initializer.cpp",
"visual_language_model/continuous_batching/servable.cpp",
"language_model/legacy/servable.cpp",
"language_model/legacy/servable_initializer.cpp",
"language_model/legacy/legacy_executor.cpp",
"visual_language_model/legacy/servable.cpp",
"visual_language_model/legacy/servable_initializer.cpp",
"visual_language_model/legacy/legacy_executor.cpp"],
deps = [
"//third_party:openvino",
"@mediapipe//mediapipe/framework:calculator_framework",
@@ -110,14 +127,15 @@ cc_library(
"//src:libovmsprofiler",
"//src:libovmsfilesystem",
"llmcalculator_cc_proto",
"//src/python:utils",
":text_processor",
":openai_completions_api_handler",
"//src:httppayload",
"//src:libhttpclientconnection",
] + PYBIND_DEPS + select({
] + select({
"//conditions:default": ["//third_party:genai", ":llm_engine"],
"//:not_genai_bin" : [":llm_engine"],
}) + select({
"//:disable_python": [],
"//:not_disable_python" : [":py_jinja_template_processor"],
}),
visibility = ["//visibility:public"],
local_defines = COMMON_LOCAL_DEFINES,
@@ -127,9 +145,9 @@ cc_library(
)

cc_library(
name = "text_processor",
hdrs = ["text_processor.hpp"],
srcs = ["text_processor.cpp"],
name = "py_jinja_template_processor",
hdrs = ["py_jinja_template_processor.hpp"],
srcs = ["py_jinja_template_processor.cpp"],
deps = ["@mediapipe//mediapipe/framework:calculator_framework",
"//third_party:openvino",
"//src:libovmslogging",
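The net effect of these BUILD changes for C++ code is a single compile-time switch: `genai_servables` no longer links `PYBIND_DEPS` unconditionally, and the renamed `py_jinja_template_processor` target is attached only under `//:not_disable_python`. A consumer-side sketch (assumption: `COPTS_PYTHON` and `COMMON_LOCAL_DEFINES` in common_settings.bzl, not shown in this diff, resolve to `PYTHON_DISABLE=0` or `PYTHON_DISABLE=1` depending on the `//:disable_python` setting):

```cpp
#include <string>

// Illustrative only: PYTHON_DISABLE is assumed to be supplied by the Bazel config.
std::string chatTemplateBackend() {
#if (PYTHON_DISABLE == 0)
    return "embedded Python (Jinja2 via py_jinja_template_processor)";
#else
    return "native ov::genai tokenizer (apply_chat_template)";
#endif
}
```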
1 change: 1 addition & 0 deletions src/llm/http_llm_calculator.cc
@@ -26,6 +26,7 @@
#pragma warning(pop)

#include "../http_payload.hpp"
#include "../logging.hpp"
#include "../profiler.hpp"
#include "apis/openai_completions.hpp"
#include "servable.hpp"
5 changes: 4 additions & 1 deletion src/llm/language_model/continuous_batching/servable.cpp
@@ -33,7 +33,10 @@
#include "../../../http_payload.hpp"
#include "../../../mediapipe_internal/mediapipe_utils.hpp"
#include "../../apis/openai_completions.hpp"
#include "../../text_processor.hpp"
#include "../../text_utils.hpp"
#if (PYTHON_DISABLE == 0)
#include "../../py_jinja_template_processor.hpp"
#endif
#include "llm_executor.hpp"
#include "servable.hpp"

src/llm/language_model/continuous_batching/servable_initializer.cpp
@@ -115,8 +115,9 @@ Status ContinuousBatchingServableInitializer::initializeExperimental(std::shared
SPDLOG_ERROR("Error during llm node initialization for models_path: {}", parsedModelsPath);
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

loadTextProcessor(properties, parsedModelsPath);
#if (PYTHON_DISABLE == 0)
loadTemplateProcessor(properties, parsedModelsPath);
#endif
if (nodeOptions.has_max_tokens_limit()) {
properties->maxTokensLimit = nodeOptions.max_tokens_limit();
}
@@ -133,15 +134,12 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return status;
}
auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());

properties->modelsPath = parsedModelsPath;

properties->schedulerConfig.max_num_batched_tokens = nodeOptions.max_num_batched_tokens();
properties->schedulerConfig.cache_size = nodeOptions.cache_size();
properties->schedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse();
properties->schedulerConfig.max_num_seqs = nodeOptions.max_num_seqs();
properties->schedulerConfig.enable_prefix_caching = nodeOptions.enable_prefix_caching();

properties->device = nodeOptions.device();
properties->isSpeculativePipeline = false;

@@ -163,7 +161,6 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
properties->pluginConfig.insert(draftPipeline);
properties->isSpeculativePipeline = true;
} else if (nodeOptions.has_draft_max_num_batched_tokens() || nodeOptions.has_draft_cache_size() || nodeOptions.has_draft_dynamic_split_fuse() || nodeOptions.has_draft_max_num_seqs() || nodeOptions.has_draft_block_size() || nodeOptions.has_draft_device()) {
// Consider moving draft parameters to separate structure in node options, so it's validated on the proto level
SPDLOG_ERROR("Draft model path is not provided, but draft scheduler options are set.");
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}
@@ -188,14 +185,16 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

loadTextProcessor(properties, parsedModelsPath);
#if (PYTHON_DISABLE == 0)
loadTemplateProcessor(properties, parsedModelsPath);
#endif
if (nodeOptions.has_max_tokens_limit()) {
properties->maxTokensLimit = nodeOptions.max_tokens_limit();
}
properties->bestOfLimit = nodeOptions.best_of_limit();
properties->maxModelLength = parseMaxModelLength(parsedModelsPath);

properties->llmExecutorWrapper = std::make_shared<LLMExecutorWrapper>(properties->pipeline);

return StatusCode::OK;
}

5 changes: 4 additions & 1 deletion src/llm/language_model/legacy/servable.cpp
@@ -33,7 +33,10 @@
#include "../../../http_payload.hpp"
#include "../../../mediapipe_internal/mediapipe_utils.hpp"
#include "../../apis/openai_completions.hpp"
#include "../../text_processor.hpp"
#include "../../text_utils.hpp"
#if (PYTHON_DISABLE == 0)
#include "../../py_jinja_template_processor.hpp"
#endif
#include "servable.hpp"

namespace ovms {
5 changes: 3 additions & 2 deletions src/llm/language_model/legacy/servable_initializer.cpp
@@ -96,8 +96,9 @@ Status LegacyServableInitializer::initialize(std::shared_ptr<GenAiServable>& ser
SPDLOG_ERROR("Error during llm node initialization for models_path: {}", parsedModelsPath);
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

loadTextProcessor(properties, parsedModelsPath);
#if (PYTHON_DISABLE == 0)
loadTemplateProcessor(properties, parsedModelsPath);
#endif
properties->legacyExecutor = std::make_shared<LegacyExecutorWrapper>(properties->pipeline);
if (nodeOptions.has_max_tokens_limit()) {
properties->maxTokensLimit = nodeOptions.max_tokens_limit();
src/llm/py_jinja_template_processor.cpp (renamed from src/llm/text_processor.cpp)
@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "text_processor.hpp"
#include "py_jinja_template_processor.hpp"

#include <string>
#include <utility>
@@ -35,16 +35,16 @@

namespace ovms {

bool TextProcessor::applyChatTemplate(TextProcessor& textProcessor, std::string modelsPath, const std::string& requestBody, std::string& output) {
if (textProcessor.chatTemplate == nullptr) {
bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, std::string modelsPath, const std::string& requestBody, std::string& output) {
if (templateProcessor.chatTemplate == nullptr) {
output = "Error: Chat template not loaded correctly, so it cannot be applied";
return false;
}

py::gil_scoped_acquire acquire;
try {
auto locals = py::dict("request_body"_a = requestBody, "chat_template"_a = textProcessor.chatTemplate->getObject(),
"bos_token"_a = textProcessor.bosToken, "eos_token"_a = textProcessor.eosToken);
auto locals = py::dict("request_body"_a = requestBody, "chat_template"_a = templateProcessor.chatTemplate->getObject(),
"bos_token"_a = templateProcessor.bosToken, "eos_token"_a = templateProcessor.eosToken);
py::exec(R"(
output = ""
error = ""
41 changes: 41 additions & 0 deletions src/llm/py_jinja_template_processor.hpp
@@ -0,0 +1,41 @@
//*****************************************************************************
// Copyright 2024 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <sstream>
#include <string>

#include <openvino/openvino.hpp>
#pragma warning(push)
#pragma warning(disable : 6326 28182 6011 28020)
// Python execution for template processing
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/stl.h>
#pragma warning(pop)

#include "src/python/utils.hpp"

namespace ovms {

class PyJinjaTemplateProcessor {
public:
std::string bosToken = "";
std::string eosToken = "";
std::unique_ptr<PyObjectWrapper<py::object>> chatTemplate = nullptr;

static bool applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, std::string modelsPath, const std::string& requestBody, std::string& output);
};
} // namespace ovms
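For reference, a hypothetical call site for the renamed class (a sketch; in the actual flow the servable initializer's `loadTemplateProcessor` populates `chatTemplate`, and the request body is the OpenAI-style JSON taken from the API handler):

```cpp
#include <string>

#include "src/llm/py_jinja_template_processor.hpp"

// Returns the rendered prompt, or an empty string on failure.
// On failure, applyChatTemplate reports the error text through `rendered`.
std::string renderChatPrompt(ovms::PyJinjaTemplateProcessor& templateProcessor,
                             const std::string& modelsPath,
                             const std::string& requestBody) {
    std::string rendered;
    bool ok = ovms::PyJinjaTemplateProcessor::applyChatTemplate(
        templateProcessor, modelsPath, requestBody, rendered);
    return ok ? rendered : std::string{};
}
```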
12 changes: 9 additions & 3 deletions src/llm/servable.cpp
@@ -32,7 +32,7 @@
#include "../profiler.hpp"
#include "apis/openai_completions.hpp"
#include "servable.hpp"
#include "text_processor.hpp"
#include "text_utils.hpp"

namespace ovms {
absl::Status GenAiServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const ovms::HttpPayload& payload) {
@@ -87,15 +87,21 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
std::string inputText;
switch (executionContext->endpoint) {
case Endpoint::CHAT_COMPLETIONS: {
#if (PYTHON_DISABLE == 0)
bool success;
if (executionContext->apiHandler->getProcessedJson().size() > 0) {
success = TextProcessor::applyChatTemplate(getProperties()->textProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText);
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText);
} else {
success = TextProcessor::applyChatTemplate(getProperties()->textProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText);
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText);
}
if (!success) {
return absl::Status(absl::StatusCode::kInvalidArgument, inputText);
}
#else
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
#endif
if (inputText.size() == 0) {
return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
}
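This hunk is the heart of the C++-only path: when Python is compiled out, the chat history collected by the OpenAI API handler is rendered with ov::genai's own `Tokenizer::apply_chat_template` instead of the embedded Jinja interpreter. A standalone sketch of that API (assumptions: the model directory was exported with openvino_tokenizers so the tokenizer carries a chat template, and the header and namespace follow the public OpenVINO GenAI layout):

```cpp
#include <iostream>
#include <string>

#include "openvino/genai/tokenizer.hpp"

int main() {
    // Hypothetical model directory containing tokenizer files and a chat template.
    ov::genai::Tokenizer tokenizer("./model_dir");

    // ChatHistory is a vector of {"role": ..., "content": ...} maps.
    ov::genai::ChatHistory history{
        {{"role", "system"}, {"content", "You are a helpful assistant."}},
        {{"role", "user"}, {"content", "Say hello."}}
    };

    constexpr bool add_generation_prompt = true;  // mirrors the value hardcoded above
    std::string prompt = tokenizer.apply_chat_template(history, add_generation_prompt);
    std::cout << prompt << std::endl;
    return 0;
}
```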
10 changes: 7 additions & 3 deletions src/llm/servable.hpp
@@ -32,7 +32,9 @@

#include "../http_payload.hpp"
#include "apis/openai_completions.hpp"
#include "text_processor.hpp"
#if (PYTHON_DISABLE == 0)
#include "py_jinja_template_processor.hpp"
#endif

namespace ovms {
// Some pipelines internals rely on request_id, so for now we provide increasing ID
@@ -81,12 +83,14 @@ struct GenAiServableProperties {
ov::AnyMap tokenizerPluginConfig;
// Sampling limits
std::optional<uint32_t> maxTokensLimit;
std::optional<uint32_t> maxModelLength;
uint32_t bestOfLimit;
bool isSpeculativePipeline; // sampling is generally common, but maybe we could avoid having this field at all
// Text processing utilities
ov::genai::Tokenizer tokenizer;
TextProcessor textProcessor;
std::optional<uint32_t> maxModelLength;
#if (PYTHON_DISABLE == 0)
PyJinjaTemplateProcessor templateProcessor;
#endif
};

class GenAiServable {
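With this struct layout, the `ov::genai::Tokenizer` is always available while the Python template processor exists only in Python-enabled builds. A sketch of the consequence for callers (the helper below is illustrative, not part of the PR): anything touching `templateProcessor` must sit behind the same guard, otherwise the `//:disable_python` build fails to compile.

```cpp
#include <string>

#include "src/llm/servable.hpp"

#if (PYTHON_DISABLE == 0)
// Python build only: configure the special tokens used when rendering Jinja templates.
void setSpecialTokens(ovms::GenAiServableProperties& properties,
                      const std::string& bosToken, const std::string& eosToken) {
    properties.templateProcessor.bosToken = bosToken;
    properties.templateProcessor.eosToken = eosToken;
}
#endif
```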