diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 2e0afaa882..82e326abce 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -120,7 +120,7 @@ jobs: with open('pred.txt', 'r') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') - tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt') + tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -136,7 +136,7 @@ jobs: with open('pred.txt', 'r') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') - tokenized = tokenizer('69', return_tensors='pt') + tokenized = tokenizer('69', return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -152,7 +152,7 @@ jobs: with open('pred.txt', 'r') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') - tokenized = tokenizer('Hi', return_tensors='pt') + tokenized = tokenizer('Hi', 
return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -168,7 +168,7 @@ jobs: with open('pred.txt', 'r') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') - tokenized = tokenizer('return 0', return_tensors='pt') + tokenized = tokenizer('return 0', return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -184,7 +184,7 @@ jobs: with open('pred.txt', 'r', errors='ignore') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') - tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt') + tokenized = tokenizer('你好! 
你好嗎?', return_tensors='pt', add_special_tokens=False) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref.replace('�', '')) @@ -199,6 +199,9 @@ jobs: import transformers with open('pred.txt', 'r', errors='ignore') as file: predictions = file.read() + print('\n\n') + print(predictions) + print('\n\n') tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') prompts = [ 'Alan Turing was a', @@ -206,9 +209,13 @@ jobs: '你好! 你好嗎?' ] for prompt in prompts: - tokenized = tokenizer(prompt, return_tensors='pt') + if tokenizer.chat_template: + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) + print(tokenized) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + print(ref) idx = predictions.find(ref.replace('�', '')) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp index 2a53154c27..6836e57257 100644 --- a/src/cpp/src/llm_pipeline_stateful.cpp +++ b/src/cpp/src/llm_pipeline_stateful.cpp @@ -87,8 +87,14 @@ DecodedResults StatefulLLMPipeline::generate( TokenizedInputs 
encoded_input; if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) { - OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); - encoded_input = m_tokenizer.encode(*input_vector); + std::vector<std::string> templated_input_vector; + for (auto& input : *input_vector) { + ChatHistory history({{{"role", "user"}, {"content", input}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + templated_input_vector.push_back(templated_prompt); + } + encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false)); } else if (auto input_prompt = std::get_if<std::string>(&inputs)) { std::string& prompt = *input_prompt;