diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml
index 5a56c01cb2..fce770d101 100644
--- a/.github/workflows/bandit.yml
+++ b/.github/workflows/bandit.yml
@@ -13,5 +13,5 @@ jobs:
     - uses: actions/setup-python@v4
       with:
         python-version: 3.11
-    - run: python -m pip install bandit==1.8.0
+    - run: python -m pip install bandit
     - run: python -m bandit --recursive --configfile bandit.yml .
diff --git a/bandit.yml b/bandit.yml
index b4d31fb76a..bdd324b1da 100644
--- a/bandit.yml
+++ b/bandit.yml
@@ -79,7 +79,7 @@
 # IPAS Required Checkers. Do not disable these
 # Additional checkers may be added if desired
 tests:
- [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413']
+ [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B411', 'B412', 'B413']
 
 # (optional) list skipped test IDs here, eg '[B101, B406]':
 # The following checkers are not required but be added to tests list if desired
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index b29bec3b4a..cffeedfc75 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -690,19 +690,38 @@ namespace static_llm {
 StatefulLLMPipeline::StatefulLLMPipeline(
     const std::filesystem::path& models_path,
     const ov::genai::Tokenizer& tokenizer,
-    const std::string&,
+    const std::string& device,
     const ov::AnyMap& config
 ) : LLMPipelineImplBase(tokenizer,
                         utils::from_config_json_if_exists(models_path)),
     m_sampler(m_tokenizer) {
-
-    auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
-    ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
     ov::AnyMap properties = config;
-
-    auto compiled = setupAndCompileModel(model, model_desc, properties);
-    m_request = compiled->create_infer_request();
-    m_sampler.set_seed(m_generation_config.rng_seed);
+    const auto use_blob = pop_or_default(properties, "USE_BLOB", false);
+    if (use_blob) {
+        auto blob_path = pop_or_default(properties, "BLOB_PATH", std::string{});
+        if (blob_path.empty()) {
+            blob_path = (models_path / "openvino_model.blob").string();
+        }
+        if (!std::filesystem::exists(blob_path)) {
+            OPENVINO_THROW("Blob file is not found at: " + blob_path);
+        }
+        std::ifstream fin(blob_path, std::ios::in | std::ios::binary);
+        if (!fin.is_open()) {
+            OPENVINO_THROW("Blob file can't be opened: " + blob_path);
+        }
+        auto compiled = genai::utils::singleton_core().import_model(fin, device, {});
+        m_max_prompt_len = compiled.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
+        auto min_resp_len = compiled.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
+        m_kvcache_total = m_max_prompt_len + min_resp_len;
+        m_request = compiled.create_infer_request();
+    } else {
+        auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
+        ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
+        ov::AnyMap properties = config;
+        auto compiled = setupAndCompileModel(model, model_desc, properties);
+        m_request = compiled->create_infer_request();
+        m_sampler.set_seed(m_generation_config.rng_seed);
+    }
 }
@@ -721,11 +740,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
     m_sampler.set_seed(m_generation_config.rng_seed);
 }
 
-std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
-    const std::shared_ptr<ov::Model>& model,
+void StatefulLLMPipeline::updateStatefulConfig(
     const ModelConfigDesc& model_desc,
     ov::AnyMap& pipeline_config) {
-
     const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
     const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
     m_max_prompt_len = kMaxPromptLen;
@@ -755,6 +772,13 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
     // Replace CACHE_DIR option if NPUW is enabled
     set_npuw_cache_dir(pipeline_config);
+}
+
+std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
+    const std::shared_ptr<ov::Model>& model,
+    const ModelConfigDesc& model_desc,
+    ov::AnyMap& pipeline_config) {
+    updateStatefulConfig(model_desc, pipeline_config);
 
     return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
 }
diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp
index 0138797a24..0916c09b73 100644
--- a/src/cpp/src/llm_pipeline_static.hpp
+++ b/src/cpp/src/llm_pipeline_static.hpp
@@ -59,6 +59,10 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
         const ModelConfigDesc& model_desc,
         ov::AnyMap& pipeline_config);
 
+    void updateStatefulConfig(
+        const ModelConfigDesc& model_desc,
+        ov::AnyMap& pipeline_config);
+
     DecodedResults generate(
         StringInputs inputs,
         OptionalGenerationConfig generation_config,
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 9f8718f14c..9f228cafa3 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -172,6 +172,8 @@ class InputsEmbedder::IInputsEmbedder {
         auto start_tokenizer_time = std::chrono::steady_clock::now();
         ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids;
         TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));
+        auto end_tokenizer_time = std::chrono::steady_clock::now();
+        metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
 
         // some symbols combinations can be encoded by the tokenizer in different ways
         // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
@@ -211,8 +213,6 @@ class InputsEmbedder::IInputsEmbedder {
             if (m_last_disappeared_token.has_value())
                 encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token);
         }
-        auto end_tokenizer_time = std::chrono::steady_clock::now();
-        metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
         m_templated_chat_history = std::move(new_templated_chat_history);
         m_tokenized_history.clear();
         std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));
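For reference, a minimal usage sketch (not part of the patch) of the blob-import path introduced above. It assumes an NPU setup where the static pipeline is selected and a previously exported `openvino_model.blob` is available; `USE_BLOB` and `BLOB_PATH` are the properties popped by the new constructor, while the model directory, blob path, and prompt below are placeholders.

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // USE_BLOB makes StatefulLLMPipeline import a precompiled blob via
    // ov::Core::import_model() instead of reading and compiling openvino_model.xml.
    // BLOB_PATH is optional; if omitted, the constructor falls back to
    // <models_path>/openvino_model.blob.
    ov::AnyMap properties{
        {"USE_BLOB", true},
        {"BLOB_PATH", "/path/to/openvino_model.blob"}  // placeholder path
    };

    // Placeholder model directory; "NPU" routes to the static pipeline implementation.
    ov::genai::LLMPipeline pipe("/path/to/model_dir", "NPU", properties);

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 64;
    std::cout << pipe.generate("What is OpenVINO?", config) << std::endl;
    return 0;
}
```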