namespace ovms {
- ov::genai::SchedulerConfig ContinuousBatchingServableInitializer::prepareDraftPipelineSchedulerConfigExperimental(const mediapipe::LLMCalculatorOptions_PipelineConfig& draftPipelineConfig) {
-     ov::genai::SchedulerConfig config;
-     config.max_num_batched_tokens = draftPipelineConfig.max_num_batched_tokens();
-     config.cache_size = draftPipelineConfig.cache_size();
-     config.dynamic_split_fuse = draftPipelineConfig.dynamic_split_fuse();
-     config.max_num_seqs = draftPipelineConfig.max_num_seqs();
-     config.enable_prefix_caching = draftPipelineConfig.enable_prefix_caching();
-     return config;
- }
-
ov::genai::SchedulerConfig ContinuousBatchingServableInitializer::prepareDraftPipelineSchedulerConfig(const mediapipe::LLMCalculatorOptions& nodeOptions) {
    ov::genai::SchedulerConfig config;
    config.max_num_batched_tokens = nodeOptions.has_draft_max_num_batched_tokens() ? nodeOptions.draft_max_num_batched_tokens() : nodeOptions.max_num_batched_tokens();
@@ -60,72 +50,6 @@ ov::genai::SchedulerConfig ContinuousBatchingServableInitializer::prepareDraftPi
    return config;
}

- Status ContinuousBatchingServableInitializer::initializeExperimental(std::shared_ptr<GenAiServable>& servable, const mediapipe::LLMCalculatorOptions& nodeOptions, std::string graphPath) {
-     auto continousBatchingPipelineConfig = nodeOptions.continuous_batching_pipeline_config();
-     auto mainPipelineConfig = continousBatchingPipelineConfig.main_pipeline_config();
-     std::string parsedModelsPath;
-     auto status = parseModelsPath(parsedModelsPath, mainPipelineConfig.models_path(), graphPath);
-     if (!status.ok()) {
-         return status;
-     }
-     auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
-     properties->modelsPath = parsedModelsPath;
-
-     properties->schedulerConfig.max_num_batched_tokens = mainPipelineConfig.max_num_batched_tokens();
-     properties->schedulerConfig.cache_size = mainPipelineConfig.cache_size();
-     properties->schedulerConfig.dynamic_split_fuse = mainPipelineConfig.dynamic_split_fuse();
-     properties->schedulerConfig.max_num_seqs = mainPipelineConfig.max_num_seqs();
-     properties->schedulerConfig.enable_prefix_caching = mainPipelineConfig.enable_prefix_caching();
-
-     properties->device = mainPipelineConfig.device();
-
-     // Speculative decoding enabled
-     properties->isSpeculativePipeline = false;
-     if (continousBatchingPipelineConfig.has_draft_pipeline_config()) {
-         auto draftPipelineConfig = continousBatchingPipelineConfig.draft_pipeline_config();
-         auto fsDraftModelsPath = std::filesystem::path(draftPipelineConfig.models_path());
-         std::string draftPipelinePath;
-         if (fsDraftModelsPath.is_relative()) {
-             draftPipelinePath = (std::filesystem::path(graphPath) / fsDraftModelsPath).string();
-         } else {
-             draftPipelinePath = fsDraftModelsPath.string();
-         }
-         auto draftSchedulerConfig = prepareDraftPipelineSchedulerConfigExperimental(draftPipelineConfig);
-         auto draftPipeline = ov::genai::draft_model(draftPipelinePath, draftPipelineConfig.device(), ov::genai::scheduler_config(draftSchedulerConfig));
-         properties->pluginConfig.insert(draftPipeline);
-         properties->isSpeculativePipeline = true;
-     }
-
-     status = JsonParser::parsePluginConfig(mainPipelineConfig.plugin_config(), properties->pluginConfig);
-     if (!status.ok()) {
-         SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", mainPipelineConfig.plugin_config());
-         return status;
-     }
-
-     properties->tokenizerPluginConfig = {{"PERFORMANCE_HINT", "THROUGHPUT"}};
-     try {
-         properties->pipeline = std::make_shared<ov::genai::ContinuousBatchingPipeline>(parsedModelsPath,
-             properties->schedulerConfig, properties->device,
-             properties->pluginConfig, properties->tokenizerPluginConfig);
-         properties->tokenizer = properties->pipeline->get_tokenizer();
-     } catch (const std::exception& e) {
-         SPDLOG_ERROR("Error during llm node initialization for models_path: {} exception: {}", parsedModelsPath, e.what());
-         return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
-     } catch (...) {
-         SPDLOG_ERROR("Error during llm node initialization for models_path: {}", parsedModelsPath);
-         return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
-     }
-
-     loadTextProcessor(properties, parsedModelsPath);
-     if (nodeOptions.has_max_tokens_limit()) {
-         properties->maxTokensLimit = nodeOptions.max_tokens_limit();
-     }
-     properties->bestOfLimit = mainPipelineConfig.best_of_limit();
-
-     properties->llmExecutorWrapper = std::make_shared<LLMExecutorWrapper>(properties->pipeline);
-     return StatusCode::OK;
- }
-
Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiServable>& servable, const mediapipe::LLMCalculatorOptions& nodeOptions, std::string graphPath) {
    std::string parsedModelsPath;
    auto status = parseModelsPath(parsedModelsPath, nodeOptions.models_path(), graphPath);
@@ -174,6 +98,20 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
        return status;
    }
+     std::cout << "Checking if prompt lookup is enabled" << std::endl;
+     // Check if prompt lookup is enabled
+     auto promptLookupPropertyIt = properties->pluginConfig.find("prompt_lookup");
+     if (promptLookupPropertyIt != properties->pluginConfig.end()) {
+         auto promptLookupProperty = promptLookupPropertyIt->second.as<bool>();
+         if (promptLookupProperty == true) {
+             properties->isPromptLookupPipeline = true;
+         } else {
+             properties->isPromptLookupPipeline = false;
+         }
+     }
+
+     std::cout << "properties->isPromptLookupPipeline: " << properties->isPromptLookupPipeline << std::endl;
+
    properties->tokenizerPluginConfig = {{"PERFORMANCE_HINT", "THROUGHPUT"}};
    try {
        properties->pipeline = std::make_shared<ov::genai::ContinuousBatchingPipeline>(parsedModelsPath,
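
Note on the added prompt-lookup check: the sketch below is a minimal, standalone illustration (not part of this change) of reading a boolean "prompt_lookup" entry from an OpenVINO ov::AnyMap, which is what the servable's pluginConfig is assumed to be here. The map contents and the main() wrapper are hypothetical, for illustration only.

// Minimal sketch: detect a boolean "prompt_lookup" flag in an ov::AnyMap plugin config.
// Assumes OpenVINO's ov::Any / ov::AnyMap; values and names are illustrative.
#include <openvino/core/any.hpp>
#include <iostream>

int main() {
    ov::AnyMap pluginConfig{{"prompt_lookup", true}};  // hypothetical config entry

    bool isPromptLookupPipeline = false;
    auto it = pluginConfig.find("prompt_lookup");
    if (it != pluginConfig.end()) {
        // ov::Any::as<bool>() converts the stored value; it throws if the stored type is incompatible.
        isPromptLookupPipeline = it->second.as<bool>();
    }

    std::cout << "prompt lookup enabled: " << std::boolalpha << isPromptLookupPipeline << std::endl;
    return 0;
}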