
Enable C++ only text generation #3260

Draft · wants to merge 3 commits into base: main
19 changes: 8 additions & 11 deletions src/BUILD
@@ -602,18 +602,15 @@ cc_library(
deps = select({
"//:not_disable_python": [
"//src/python:libovmspythonmodule",
# Jinja template processing is done in Python
"//src/llm:llmcalculator",
"//src/llm:genai_servables",
"//src/llm:text_processor",
],
"//:disable_python": []
}) + select({
"//conditions:default": [],
"//:not_disable_mediapipe" : [
"//src/llm:openai_completions_api_handler",
"//src/embeddings:embeddingscalculator",
"//src/rerank:rerankcalculator",],
"//src/rerank:rerankcalculator",
"//src/llm:llmcalculator",],
}) + select({
"//:enable_drogon": ["libdrogon_http_server"],
"//conditions:default" : ["libnet_http_server"],
@@ -2756,6 +2753,11 @@ cc_test(
"test/get_mediapipe_graph_metadata_response_test.cpp",
"test/mediapipe_framework_test.cpp",
"test/http_openai_handler_test.cpp",
"test/llm/llmnode_test.cpp",
"test/llm/max_model_length_test.cpp",
"test/llm/text_streamer_test.cpp",
"test/llm/visual_language_model/complete_flow_test.cpp",
"test/llm/visual_language_model/initialization_test.cpp",
],
"//:disable_mediapipe" : [
"test/disabled_mediapipe_test.cpp",
@@ -2765,13 +2767,8 @@ cc_test(
# OvmsPyTensor is currently not used in OVMS core and is just a base for the binding.
# "test/python/ovms_py_tensor_test.cpp",
"test/pythonnode_test.cpp",
# LLM logic uses Python for processing Jinja templates
"test/llm/llmnode_test.cpp",
"test/llm/max_model_length_test.cpp",
# LLM logic uses Python for processing Jinja templates when built with Python enabled
"test/llm/llmtemplate_test.cpp",
"test/llm/text_streamer_test.cpp",
"test/llm/visual_language_model/complete_flow_test.cpp",
"test/llm/visual_language_model/initialization_test.cpp",
],
"//:disable_python" : [],
}),
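With this regrouping, the LLM unit tests build whenever MediaPipe is enabled, regardless of the Python setting; only the Jinja template test stays behind `//:not_disable_python`. A minimal illustrative sketch (not taken from the PR; the gtest usage is an assumption) of how a shared test file can still branch on the build flavor via the `PYTHON_DISABLE` define used throughout this change:

```cpp
#include <gtest/gtest.h>

// Sketch only: PYTHON_DISABLE is assumed to be injected by the build
// (0 when Python support is compiled in, 1 for the C++-only build).
TEST(BuildConfig, ChatTemplateBackend) {
#if (PYTHON_DISABLE == 0)
    // Python build: Jinja chat templates are rendered by the embedded interpreter.
    const char* backend = "py_jinja_template_processor";
#else
    // C++-only build: chat templates come from ov::genai's tokenizer.
    const char* backend = "genai_tokenizer";
#endif
    EXPECT_NE(backend, nullptr);
}
```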
48 changes: 33 additions & 15 deletions src/llm/BUILD
@@ -16,9 +16,9 @@

load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library")
load("//:common_settings.bzl",
"COMMON_STATIC_LIBS_COPTS", "COMMON_STATIC_LIBS_LINKOPTS", "COMMON_FUZZER_COPTS", "COMMON_FUZZER_LINKOPTS", "COMMON_LOCAL_DEFINES", "PYBIND_DEPS")
"COMMON_STATIC_LIBS_COPTS", "COMMON_STATIC_LIBS_LINKOPTS", "COMMON_FUZZER_COPTS", "COMMON_FUZZER_LINKOPTS", "COMMON_LOCAL_DEFINES", "PYBIND_DEPS", "COPTS_PYTHON")

COPTS_ADJUSTED = COMMON_STATIC_LIBS_COPTS + select({
COPTS_ADJUSTED = COMMON_STATIC_LIBS_COPTS + COPTS_PYTHON + select({
"//conditions:default": [],
"//:fuzzer_build" : COMMON_FUZZER_COPTS,
})
@@ -92,13 +92,30 @@ cc_library(

cc_library(
name = "genai_servables",
hdrs = ["servable.hpp", "servable_initializer.hpp",
"language_model/continuous_batching/servable.hpp", "language_model/continuous_batching/llm_executor.hpp", "language_model/continuous_batching/servable_initializer.hpp",
"visual_language_model/continuous_batching/servable.hpp", "language_model/legacy/servable.hpp", "language_model/legacy/servable_initializer.hpp", "language_model/legacy/legacy_executor.hpp",
"visual_language_model/legacy/servable.hpp", "visual_language_model/legacy/servable_initializer.hpp", "visual_language_model/legacy/legacy_executor.hpp"],
srcs = ["servable.cpp", "servable_initializer.cpp", "language_model/continuous_batching/servable.cpp", "language_model/continuous_batching/servable_initializer.cpp",
"visual_language_model/continuous_batching/servable.cpp", "language_model/legacy/servable.cpp", "language_model/legacy/servable_initializer.cpp", "language_model/legacy/legacy_executor.cpp",
"visual_language_model/legacy/servable.cpp", "visual_language_model/legacy/servable_initializer.cpp", "visual_language_model/legacy/legacy_executor.cpp"],
hdrs = ["servable.hpp",
"servable_initializer.hpp",
"language_model/continuous_batching/servable.hpp",
"language_model/continuous_batching/llm_executor.hpp",
"language_model/continuous_batching/servable_initializer.hpp",
"visual_language_model/continuous_batching/servable.hpp",
"language_model/legacy/servable.hpp",
"language_model/legacy/servable_initializer.hpp",
"language_model/legacy/legacy_executor.hpp",
"visual_language_model/legacy/servable.hpp",
"visual_language_model/legacy/servable_initializer.hpp",
"visual_language_model/legacy/legacy_executor.hpp",
"text_utils.hpp"],
srcs = ["servable.cpp",
"servable_initializer.cpp",
"language_model/continuous_batching/servable.cpp",
"language_model/continuous_batching/servable_initializer.cpp",
"visual_language_model/continuous_batching/servable.cpp",
"language_model/legacy/servable.cpp",
"language_model/legacy/servable_initializer.cpp",
"language_model/legacy/legacy_executor.cpp",
"visual_language_model/legacy/servable.cpp",
"visual_language_model/legacy/servable_initializer.cpp",
"visual_language_model/legacy/legacy_executor.cpp"],
deps = [
"//third_party:openvino",
"@mediapipe//mediapipe/framework:calculator_framework",
@@ -110,14 +127,15 @@ cc_library(
"//src:libovmsprofiler",
"//src:libovmsfilesystem",
"llmcalculator_cc_proto",
"//src/python:utils",
":text_processor",
":openai_completions_api_handler",
"//src:httppayload",
"//src:libhttpclientconnection",
] + PYBIND_DEPS + select({
] + select({
"//conditions:default": ["//third_party:genai", ":llm_engine"],
"//:not_genai_bin" : [":llm_engine"],
}) + select({
"//:disable_python": [],
"//:not_disable_python" : [":py_jinja_template_processor"],
}),
visibility = ["//visibility:public"],
local_defines = COMMON_LOCAL_DEFINES,
@@ -127,9 +145,9 @@ cc_library(
)

cc_library(
name = "text_processor",
hdrs = ["text_processor.hpp"],
srcs = ["text_processor.cpp"],
name = "py_jinja_template_processor",
hdrs = ["py_jinja_template_processor.hpp"],
srcs = ["py_jinja_template_processor.cpp"],
deps = ["@mediapipe//mediapipe/framework:calculator_framework",
"//third_party:openvino",
"//src:libovmslogging",
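The net effect of these BUILD changes for C++ code is a single compile-time switch: `genai_servables` no longer links `PYBIND_DEPS` unconditionally, and the renamed `py_jinja_template_processor` target is attached only under `//:not_disable_python`. A consumer-side sketch (assumption: `COPTS_PYTHON` and `COMMON_LOCAL_DEFINES` in common_settings.bzl, not shown in this diff, resolve to `PYTHON_DISABLE=0` or `PYTHON_DISABLE=1` depending on the `//:disable_python` setting):

```cpp
#include <string>

// Illustrative only: PYTHON_DISABLE is assumed to be supplied by the Bazel config.
std::string chatTemplateBackend() {
#if (PYTHON_DISABLE == 0)
    return "embedded Python (Jinja2 via py_jinja_template_processor)";
#else
    return "native ov::genai tokenizer (apply_chat_template)";
#endif
}
```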
1 change: 1 addition & 0 deletions src/llm/http_llm_calculator.cc
@@ -26,6 +26,7 @@
#pragma warning(pop)

#include "../http_payload.hpp"
#include "../logging.hpp"
#include "../profiler.hpp"
#include "apis/openai_completions.hpp"
#include "servable.hpp"
5 changes: 4 additions & 1 deletion src/llm/language_model/continuous_batching/servable.cpp
@@ -33,7 +33,10 @@
#include "../../../http_payload.hpp"
#include "../../../mediapipe_internal/mediapipe_utils.hpp"
#include "../../apis/openai_completions.hpp"
#include "../../text_processor.hpp"
#include "../../text_utils.hpp"
#if (PYTHON_DISABLE == 0)
#include "../../py_jinja_template_processor.hpp"
#endif
#include "llm_executor.hpp"
#include "servable.hpp"

src/llm/language_model/continuous_batching/servable_initializer.cpp
@@ -115,8 +115,9 @@ Status ContinuousBatchingServableInitializer::initializeExperimental(std::shared
SPDLOG_ERROR("Error during llm node initialization for models_path: {}", parsedModelsPath);
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

loadTextProcessor(properties, parsedModelsPath);
#if (PYTHON_DISABLE == 0)
loadTemplateProcessor(properties, parsedModelsPath);
#endif
if (nodeOptions.has_max_tokens_limit()) {
properties->maxTokensLimit = nodeOptions.max_tokens_limit();
}
@@ -133,15 +134,12 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return status;
}
auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());

properties->modelsPath = parsedModelsPath;

properties->schedulerConfig.max_num_batched_tokens = nodeOptions.max_num_batched_tokens();
properties->schedulerConfig.cache_size = nodeOptions.cache_size();
properties->schedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse();
properties->schedulerConfig.max_num_seqs = nodeOptions.max_num_seqs();
properties->schedulerConfig.enable_prefix_caching = nodeOptions.enable_prefix_caching();

properties->device = nodeOptions.device();
properties->isSpeculativePipeline = false;

@@ -163,7 +161,6 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
properties->pluginConfig.insert(draftPipeline);
properties->isSpeculativePipeline = true;
} else if (nodeOptions.has_draft_max_num_batched_tokens() || nodeOptions.has_draft_cache_size() || nodeOptions.has_draft_dynamic_split_fuse() || nodeOptions.has_draft_max_num_seqs() || nodeOptions.has_draft_block_size() || nodeOptions.has_draft_device()) {
// Consider moving draft parameters to separate structure in node options, so it's validated on the proto level
SPDLOG_ERROR("Draft model path is not provided, but draft scheduler options are set.");
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}
@@ -188,14 +185,16 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

loadTextProcessor(properties, parsedModelsPath);
#if (PYTHON_DISABLE == 0)
loadTemplateProcessor(properties, parsedModelsPath);
#endif
if (nodeOptions.has_max_tokens_limit()) {
properties->maxTokensLimit = nodeOptions.max_tokens_limit();
}
properties->bestOfLimit = nodeOptions.best_of_limit();
properties->maxModelLength = parseMaxModelLength(parsedModelsPath);

properties->llmExecutorWrapper = std::make_shared<LLMExecutorWrapper>(properties->pipeline);

return StatusCode::OK;
}

5 changes: 4 additions & 1 deletion src/llm/language_model/legacy/servable.cpp
@@ -33,7 +33,10 @@
#include "../../../http_payload.hpp"
#include "../../../mediapipe_internal/mediapipe_utils.hpp"
#include "../../apis/openai_completions.hpp"
#include "../../text_processor.hpp"
#include "../../text_utils.hpp"
#if (PYTHON_DISABLE == 0)
#include "../../py_jinja_template_processor.hpp"
#endif
#include "servable.hpp"

namespace ovms {
5 changes: 3 additions & 2 deletions src/llm/language_model/legacy/servable_initializer.cpp
@@ -96,8 +96,9 @@ Status LegacyServableInitializer::initialize(std::shared_ptr<GenAiServable>& ser
SPDLOG_ERROR("Error during llm node initialization for models_path: {}", parsedModelsPath);
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

loadTextProcessor(properties, parsedModelsPath);
#if (PYTHON_DISABLE == 0)
loadTemplateProcessor(properties, parsedModelsPath);
#endif
properties->legacyExecutor = std::make_shared<LegacyExecutorWrapper>(properties->pipeline);
if (nodeOptions.has_max_tokens_limit()) {
properties->maxTokensLimit = nodeOptions.max_tokens_limit();
src/llm/py_jinja_template_processor.cpp (renamed from src/llm/text_processor.cpp)
@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "text_processor.hpp"
#include "py_jinja_template_processor.hpp"

#include <string>
#include <utility>
@@ -35,16 +35,16 @@

namespace ovms {

bool TextProcessor::applyChatTemplate(TextProcessor& textProcessor, std::string modelsPath, const std::string& requestBody, std::string& output) {
if (textProcessor.chatTemplate == nullptr) {
bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, std::string modelsPath, const std::string& requestBody, std::string& output) {
if (templateProcessor.chatTemplate == nullptr) {
output = "Error: Chat template not loaded correctly, so it cannot be applied";
return false;
}

py::gil_scoped_acquire acquire;
try {
auto locals = py::dict("request_body"_a = requestBody, "chat_template"_a = textProcessor.chatTemplate->getObject(),
"bos_token"_a = textProcessor.bosToken, "eos_token"_a = textProcessor.eosToken);
auto locals = py::dict("request_body"_a = requestBody, "chat_template"_a = templateProcessor.chatTemplate->getObject(),
"bos_token"_a = templateProcessor.bosToken, "eos_token"_a = templateProcessor.eosToken);
py::exec(R"(
output = ""
error = ""
41 changes: 41 additions & 0 deletions src/llm/py_jinja_template_processor.hpp
@@ -0,0 +1,41 @@
//*****************************************************************************
// Copyright 2024 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <sstream>
#include <string>

#include <openvino/openvino.hpp>
#pragma warning(push)
#pragma warning(disable : 6326 28182 6011 28020)
// Python execution for template processing
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/stl.h>
#pragma warning(pop)

#include "src/python/utils.hpp"

namespace ovms {

class PyJinjaTemplateProcessor {
public:
std::string bosToken = "";
std::string eosToken = "";
std::unique_ptr<PyObjectWrapper<py::object>> chatTemplate = nullptr;

static bool applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, std::string modelsPath, const std::string& requestBody, std::string& output);
};
} // namespace ovms
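For reference, a hypothetical call site for the renamed class (a sketch; in the actual flow the servable initializer's `loadTemplateProcessor` populates `chatTemplate`, and the request body is the OpenAI-style JSON taken from the API handler):

```cpp
#include <string>

#include "src/llm/py_jinja_template_processor.hpp"

// Returns the rendered prompt, or an empty string on failure.
// On failure, applyChatTemplate reports the error text through `rendered`.
std::string renderChatPrompt(ovms::PyJinjaTemplateProcessor& templateProcessor,
                             const std::string& modelsPath,
                             const std::string& requestBody) {
    std::string rendered;
    bool ok = ovms::PyJinjaTemplateProcessor::applyChatTemplate(
        templateProcessor, modelsPath, requestBody, rendered);
    return ok ? rendered : std::string{};
}
```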
12 changes: 9 additions & 3 deletions src/llm/servable.cpp
@@ -32,7 +32,7 @@
#include "../profiler.hpp"
#include "apis/openai_completions.hpp"
#include "servable.hpp"
#include "text_processor.hpp"
#include "text_utils.hpp"

namespace ovms {
absl::Status GenAiServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const ovms::HttpPayload& payload) {
@@ -87,15 +87,21 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
std::string inputText;
switch (executionContext->endpoint) {
case Endpoint::CHAT_COMPLETIONS: {
#if (PYTHON_DISABLE == 0)
bool success;
if (executionContext->apiHandler->getProcessedJson().size() > 0) {
success = TextProcessor::applyChatTemplate(getProperties()->textProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText);
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText);
} else {
success = TextProcessor::applyChatTemplate(getProperties()->textProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText);
success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText);
}
if (!success) {
return absl::Status(absl::StatusCode::kInvalidArgument, inputText);
}
#else
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
#endif
if (inputText.size() == 0) {
return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
}
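This hunk is the heart of the C++-only path: when Python is compiled out, the chat history collected by the OpenAI API handler is rendered with ov::genai's own `Tokenizer::apply_chat_template` instead of the embedded Jinja interpreter. A standalone sketch of that API (assumptions: the model directory was exported with openvino_tokenizers so the tokenizer carries a chat template, and the header and namespace follow the public OpenVINO GenAI layout):

```cpp
#include <iostream>
#include <string>

#include "openvino/genai/tokenizer.hpp"

int main() {
    // Hypothetical model directory containing tokenizer files and a chat template.
    ov::genai::Tokenizer tokenizer("./model_dir");

    // ChatHistory is a vector of {"role": ..., "content": ...} maps.
    ov::genai::ChatHistory history{
        {{"role", "system"}, {"content", "You are a helpful assistant."}},
        {{"role", "user"}, {"content", "Say hello."}}
    };

    constexpr bool add_generation_prompt = true;  // mirrors the value hardcoded above
    std::string prompt = tokenizer.apply_chat_template(history, add_generation_prompt);
    std::cout << prompt << std::endl;
    return 0;
}
```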
10 changes: 7 additions & 3 deletions src/llm/servable.hpp
@@ -32,7 +32,9 @@

#include "../http_payload.hpp"
#include "apis/openai_completions.hpp"
#include "text_processor.hpp"
#if (PYTHON_DISABLE == 0)
#include "py_jinja_template_processor.hpp"
#endif

namespace ovms {
// Some pipelines internals rely on request_id, so for now we provide increasing ID
@@ -81,12 +83,14 @@ struct GenAiServableProperties {
ov::AnyMap tokenizerPluginConfig;
// Sampling limits
std::optional<uint32_t> maxTokensLimit;
std::optional<uint32_t> maxModelLength;
uint32_t bestOfLimit;
bool isSpeculativePipeline; // sampling is generally common, but maybe we could avoid having this field at all
// Text processing utilities
ov::genai::Tokenizer tokenizer;
TextProcessor textProcessor;
std::optional<uint32_t> maxModelLength;
#if (PYTHON_DISABLE == 0)
PyJinjaTemplateProcessor templateProcessor;
#endif
};

class GenAiServable {
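With this struct layout, the `ov::genai::Tokenizer` is always available while the Python template processor exists only in Python-enabled builds. A sketch of the consequence for callers (the helper below is illustrative, not part of the PR): anything touching `templateProcessor` must sit behind the same guard, otherwise the `//:disable_python` build fails to compile.

```cpp
#include <string>

#include "src/llm/servable.hpp"

#if (PYTHON_DISABLE == 0)
// Python build only: configure the special tokens used when rendering Jinja templates.
void setSpecialTokens(ovms::GenAiServableProperties& properties,
                      const std::string& bosToken, const std::string& eosToken) {
    properties.templateProcessor.bosToken = bosToken;
    properties.templateProcessor.eosToken = eosToken;
}
#endif
```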