LLM: use set_output_seq_len instead of WA (#1611)
The `set_output_seq_len` method of `SequenceGroup` was introduced in #1261.
ilya-lavrenov authored Jan 21, 2025
1 parent bb62b71 commit 2da00a0
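
In short, the old workaround faked the sequence group's bookkeeping, while the new API states the output length directly. A minimal before/after fragment, using exactly the identifiers and calls that appear in the diffs below (it assumes `sequence_group` and `output_sequence_len` are already set up as at those call sites):

    // Before (workaround): pretend that all prompt tokens except the last
    // `output_sequence_len` ones were already processed, then schedule only those.
    sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
    sequence_group->schedule_tokens(output_sequence_len);

    // After: schedule the whole prompt and record explicitly how many output
    // positions the model produces.
    sequence_group->schedule_tokens(sequence_group->get_prompt_len());
    sequence_group->set_output_seq_len(output_sequence_len);
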
Showing 2 changed files with 6 additions and 9 deletions.
8 changes: 4 additions & 4 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -940,8 +940,8 @@ EncodedResults StatefulLLMPipeline::generate(

     auto sequence_group = std::make_shared<SequenceGroup>(
         0 /* request_id */, input_ids, config, 1 /* block_size */);
-    sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
-    sequence_group->schedule_tokens(output_sequence_len);
+    sequence_group->schedule_tokens(sequence_group->get_prompt_len());
+    sequence_group->set_output_seq_len(output_sequence_len);

     // NB: Controls what tokens are ready to be pushed into the streamer
     GenerationHandle handle = std::make_shared<GenerationHandleImpl>(
@@ -1412,8 +1412,8 @@ EncodedResults StatelessLLMPipeline::generate(
     // Retrive only useful logits and work only with them here.
     auto sequence_group = std::make_shared<SequenceGroup>(
         0 /* request_id */, padded_input_ids, config, 1 /* block_size */);
-    sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len);
-    sequence_group->schedule_tokens(output_sequence_len);
+    sequence_group->schedule_tokens(m_kvcache_desc.max_prompt_size);
+    sequence_group->set_output_seq_len(output_sequence_len);

     // NB: Controls what tokens are ready to be pushed into the streamer
     GenerationHandle handle = std::make_shared<GenerationHandleImpl>(
7 changes: 2 additions & 5 deletions src/cpp/src/lm_encoding.cpp
@@ -138,13 +138,10 @@ std::pair<EncodedResults, std::optional<int64_t>> get_lm_encoded_results(

    auto logits = m_llm.get_tensor("logits");

-   // since we have applied `Slice` operation to last MatMul, model output sequence lenght is 1
-   // so, we need to update sequence groups to think that they already have processed all prompt tokens except last ones
-   // and schedule only `output_sequence_len` ones
    int64_t output_sequence_len = logits.get_shape().at(1);
    for (auto& sequence_group : sequence_groups) {
-       sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
-       sequence_group->schedule_tokens(output_sequence_len);
+       sequence_group->schedule_tokens(sequence_group->get_prompt_len());
+       sequence_group->set_output_seq_len(output_sequence_len);
    }

    std::map<size_t, size_t> beam_offets;
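
The comment removed from lm_encoding.cpp carried the reasoning behind the old workaround: with a `Slice` applied after the last MatMul, the model emits logits for a single position, so `logits.get_shape().at(1)` is 1 rather than the prompt length. The replacement keeps that fact but passes it on explicitly; an annotated copy of the new lines (identifiers exactly as in the hunk above, comments added here for illustration):

    auto logits = m_llm.get_tensor("logits");
    // 1 when the Slice optimization is applied, otherwise it matches the input sequence length.
    int64_t output_sequence_len = logits.get_shape().at(1);
    for (auto& sequence_group : sequence_groups) {
        // Schedule the entire prompt for processing ...
        sequence_group->schedule_tokens(sequence_group->get_prompt_len());
        // ... and record how many logit positions the model actually returns,
        // instead of faking the processed-token count as before.
        sequence_group->set_output_seq_len(output_sequence_len);
    }
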
