diff --git a/07-data-ingestion/document-readers-tika-ollama/README.md b/07-data-ingestion/document-readers-tika-ollama/README.md new file mode 100644 index 0000000..6d70878 --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/README.md @@ -0,0 +1,43 @@ +# Tika Document Readers: Ollama + +Reading and vectorizing documents with LLMs and Tika via Ollama. + +## Running the application + +The application relies on Ollama for providing LLMs. You can either run Ollama locally on your laptop, or rely on the Testcontainers support in Spring Boot to spin up an Ollama service automatically. + +### Ollama as a native application + +First, make sure you have [Ollama](https://ollama.ai) installed on your laptop. +Then, use Ollama to run the _mistral_ large language model. + +```shell +ollama run mistral +``` + +Finally, run the Spring Boot application. + +```shell +./gradlew bootRun +``` + +### Ollama as a dev service with Testcontainers + +The application relies on the native Testcontainers support in Spring Boot to spin up an Ollama service with a _mistral_ model at startup time. + +```shell +./gradlew bootTestRun +``` + +## Calling the application + +You can now call the application that will use Ollama and _mistral_ to load documents as embeddings and generate an answer to your questions based on those documents (RAG pattern). +This example uses [httpie](https://httpie.io) to send HTTP requests. + +```shell +http --raw "What is Iorek's biggest dream?" :8080/chat/doc +``` + +```shell +http --raw "Who is Lucio?" 
:8080/chat/doc +``` diff --git a/07-data-ingestion/document-readers-tika-ollama/build.gradle b/07-data-ingestion/document-readers-tika-ollama/build.gradle new file mode 100644 index 0000000..95d699a --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/build.gradle @@ -0,0 +1,42 @@ +plugins { + id 'java' + id 'org.springframework.boot' + id 'io.spring.dependency-management' + id 'org.graalvm.buildtools.native' +} + +group = 'com.thomasvitale' +version = '0.0.1-SNAPSHOT' + +java { + toolchain { + languageVersion = JavaLanguageVersion.of(23) + } +} + +repositories { + mavenCentral() + maven { url 'https://repo.spring.io/milestone' } + maven { url 'https://repo.spring.io/snapshot' } +} + +dependencies { + implementation platform("org.springframework.ai:spring-ai-bom:${springAiVersion}") + + implementation 'org.springframework.boot:spring-boot-starter-web' + implementation 'org.springframework.ai:spring-ai-ollama-spring-boot-starter' + implementation 'org.springframework.ai:spring-ai-tika-document-reader' + + testAndDevelopmentOnly 'org.springframework.boot:spring-boot-devtools' + + testImplementation 'org.springframework.boot:spring-boot-starter-test' + testImplementation 'org.springframework.boot:spring-boot-starter-webflux' + testImplementation 'org.springframework.boot:spring-boot-testcontainers' + testImplementation 'org.springframework.ai:spring-ai-spring-boot-testcontainers' + testImplementation 'org.testcontainers:junit-jupiter' + testImplementation 'org.testcontainers:ollama' +} + +tasks.named('test') { + useJUnitPlatform() +} diff --git a/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/ChatController.java b/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/ChatController.java new file mode 100644 index 0000000..16aafd3 --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/ChatController.java @@ -0,0 +1,21 @@ +package 
com.thomasvitale.ai.spring; + +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RestController; + +@RestController +class ChatController { + + private final ChatService chatService; + + ChatController(ChatService chatService) { + this.chatService = chatService; + } + + @PostMapping("/chat/doc") + String chatWithDocument(@RequestBody String input) { + return chatService.chatWithDocument(input); + } + +} diff --git a/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/ChatService.java b/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/ChatService.java new file mode 100644 index 0000000..2afe366 --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/ChatService.java @@ -0,0 +1,53 @@ +package com.thomasvitale.ai.spring; + +import org.springframework.ai.chat.client.ChatClient; +import org.springframework.ai.document.Document; +import org.springframework.ai.vectorstore.SearchRequest; +import org.springframework.ai.vectorstore.VectorStore; +import org.springframework.stereotype.Service; + +import java.util.List; +import java.util.stream.Collectors; + +@Service +class ChatService { + + private final ChatClient chatClient; + private final VectorStore vectorStore; + + ChatService(ChatClient.Builder chatClientBuilder, VectorStore vectorStore) { + this.chatClient = chatClientBuilder.build(); + this.vectorStore = vectorStore; + } + + /** + * Answers the user's message with the chat model, using the stored documents most similar to it (top 5) as context. + * + * @param message the user's question; also used as the similarity-search query + * @return the content of the model's response + */ + String chatWithDocument(String message) { + var systemPromptTemplate = """ + You are a helpful assistant, conversing with a user about the subjects contained in a set of documents. + Use the information from the DOCUMENTS section to provide accurate answers. 
If unsure or if the answer + isn't found in the DOCUMENTS section, simply state that you don't know the answer and do not mention + the DOCUMENTS section. + + DOCUMENTS: + {documents} + """; + + List<Document> similarDocuments = vectorStore.similaritySearch(SearchRequest.query(message).withTopK(5)); + String content = similarDocuments.stream().map(Document::getContent).collect(Collectors.joining(System.lineSeparator())); + + return chatClient.prompt() + .system(systemSpec -> systemSpec + .text(systemPromptTemplate) + .param("documents", content) + ) + .user(message) + .call() + .content(); + } + +} diff --git a/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentEtlPipeline.java b/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentEtlPipeline.java new file mode 100644 index 0000000..0097974 --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentEtlPipeline.java @@ -0,0 +1,47 @@ +package com.thomasvitale.ai.spring; + +import jakarta.annotation.PostConstruct; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.ai.document.Document; +import org.springframework.ai.reader.tika.TikaDocumentReader; +import org.springframework.ai.vectorstore.VectorStore; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.core.io.Resource; +import org.springframework.stereotype.Component; + +import java.util.ArrayList; +import java.util.List; + +@Component +public class DocumentEtlPipeline { + + private static final Logger logger = LoggerFactory.getLogger(DocumentEtlPipeline.class); + private final VectorStore vectorStore; + + @Value("classpath:documents/story1.md") + Resource file1; + + @Value("classpath:documents/story2.pdf") + Resource file2; + + public DocumentEtlPipeline(VectorStore vectorStore) { + this.vectorStore = vectorStore; + } + + @PostConstruct + public void run() { + 
List documents = new ArrayList<>(); + + logger.info("Loading files as Documents"); + var tikaReader1 = new TikaDocumentReader(file1); + documents.addAll(tikaReader1.get()); + + var tikaReader2 = new TikaDocumentReader(file2); + documents.addAll(tikaReader2.get()); + + logger.info("Creating and storing Embeddings from Documents"); + vectorStore.add(documents); + } + +} diff --git a/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentReadersTikaOllamaApplication.java b/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentReadersTikaOllamaApplication.java new file mode 100644 index 0000000..9247c29 --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentReadersTikaOllamaApplication.java @@ -0,0 +1,22 @@ +package com.thomasvitale.ai.spring; + +import org.springframework.ai.embedding.EmbeddingModel; +import org.springframework.ai.vectorstore.SimpleVectorStore; +import org.springframework.ai.vectorstore.VectorStore; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.context.annotation.Bean; + +@SpringBootApplication +public class DocumentReadersTikaOllamaApplication { + + public static void main(String[] args) { + SpringApplication.run(DocumentReadersTikaOllamaApplication.class, args); + } + + @Bean + VectorStore vectorStore(EmbeddingModel embeddingModel) { + return new SimpleVectorStore(embeddingModel); + } + +} diff --git a/07-data-ingestion/document-readers-tika-ollama/src/main/resources/application.yml b/07-data-ingestion/document-readers-tika-ollama/src/main/resources/application.yml new file mode 100644 index 0000000..5090f64 --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/src/main/resources/application.yml @@ -0,0 +1,9 @@ +spring: + ai: + ollama: + chat: + options: + model: mistral + embedding: + 
options: + model: mistral diff --git a/07-data-ingestion/document-readers-tika-ollama/src/main/resources/documents/story1.md b/07-data-ingestion/document-readers-tika-ollama/src/main/resources/documents/story1.md new file mode 100644 index 0000000..e9174fd --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/src/main/resources/documents/story1.md @@ -0,0 +1,42 @@ +# The Adventures of Iorek and Pingu + +Iorek was a little polar bear who lived in the Arctic circle. He loved to explore the snowy landscape and +dreamt of one day going on an adventure around the North Pole. One day, he met a penguin named Pingu who +was on a similar quest. They quickly became friends and decided to embark on their journey together. + +Iorek and Pingu set off early in the morning, eager to cover as much ground as possible before nightfall. +The air was crisp and cold, and the snow crunched under their paws as they walked. They chatted excitedly +about their dreams and aspirations, and Iorek told Pingu about his desire to see the Northern Lights. + +As they journeyed onward, they encountered a group of playful seals who were sliding and jumping in the +snow. Iorek and Pingu watched in delight as the seals frolicked and splashed in the water. They even tried +to join in, but their paws kept slipping and they ended up sliding on their stomachs instead. + +After a few hours of walking, Iorek and Pingu came across a cave hidden behind a wall of snow. They +cautiously entered the darkness, their eyes adjusting to the dim light inside. The cave was filled with +glittering ice formations that sparkled like diamonds in the flickering torchlight. + +As they continued their journey, Iorek and Pingu encountered a group of walruses who were lounging on the +ice. They watched in amazement as the walruses lazily rolled over and exposed their tusks for a good +scratch. Pingu even tried to imitate them, but ended up looking more like a clumsy seal than a walrus. 
+ +As the sun began to set, Iorek and Pingu found themselves at the edge of a vast, frozen lake. They gazed +out across the glassy surface, mesmerized by the way the ice glinted in the fading light. They could see +the faint outline of a creature moving beneath the surface, and their hearts raced with excitement. + +Suddenly, a massive narwhal burst through the ice and into the air, its ivory tusk glistening in the +sunset. Iorek and Pingu watched in awe as it soared overhead, its cries echoing across the lake. They felt +as though they were witnessing a magical moment, one that would stay with them forever. + +As the night drew in, Iorek and Pingu settled down to rest in their makeshift camp. They huddled together +for warmth, gazing up at the starry sky above. They chatted about all they had seen and experienced during +their adventure, and Iorek couldn't help but feel grateful for the new friend he had made. + +The next morning, Iorek and Pingu set off once again, determined to explore every inch of the North Pole. +They stumbled upon a hidden cave filled with glittering crystals that sparkled like diamonds in the +sunlight. They marveled at their beauty before continuing on their way. + +As they journeyed onward, Iorek and Pingu encountered many more wonders and adventures. They met a group +of playful reindeer who showed them how to pull sledges across the snow, and even caught a glimpse of the +mythical Loch Ness Monster lurking beneath the icy waters. In the end, their adventure around the North +Pole had been an unforgettable experience, one that they would treasure forever. 
diff --git a/07-data-ingestion/document-readers-tika-ollama/src/main/resources/documents/story2.pdf b/07-data-ingestion/document-readers-tika-ollama/src/main/resources/documents/story2.pdf new file mode 100644 index 0000000..b11b2ce Binary files /dev/null and b/07-data-ingestion/document-readers-tika-ollama/src/main/resources/documents/story2.pdf differ diff --git a/07-data-ingestion/document-readers-tika-ollama/src/test/java/com/thomasvitale/ai/spring/DocumentReadersTikaOllamaApplicationTests.java b/07-data-ingestion/document-readers-tika-ollama/src/test/java/com/thomasvitale/ai/spring/DocumentReadersTikaOllamaApplicationTests.java new file mode 100644 index 0000000..6159e4f --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/src/test/java/com/thomasvitale/ai/spring/DocumentReadersTikaOllamaApplicationTests.java @@ -0,0 +1,15 @@ +package com.thomasvitale.ai.spring; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.springframework.boot.test.context.SpringBootTest; + +@SpringBootTest +@Disabled +class DocumentReadersTikaOllamaApplicationTests { + + @Test + void contextLoads() { + } + +} diff --git a/07-data-ingestion/document-readers-tika-ollama/src/test/java/com/thomasvitale/ai/spring/TestDocumentReadersTikaOllamaApplication.java b/07-data-ingestion/document-readers-tika-ollama/src/test/java/com/thomasvitale/ai/spring/TestDocumentReadersTikaOllamaApplication.java new file mode 100644 index 0000000..8eeb530 --- /dev/null +++ b/07-data-ingestion/document-readers-tika-ollama/src/test/java/com/thomasvitale/ai/spring/TestDocumentReadersTikaOllamaApplication.java @@ -0,0 +1,26 @@ +package com.thomasvitale.ai.spring; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.devtools.restart.RestartScope; +import org.springframework.boot.test.context.TestConfiguration; +import org.springframework.boot.testcontainers.service.connection.ServiceConnection; +import 
org.springframework.context.annotation.Bean; +import org.testcontainers.ollama.OllamaContainer; +import org.testcontainers.utility.DockerImageName; + +@TestConfiguration(proxyBeanMethods = false) +public class TestDocumentReadersTikaOllamaApplication { + + @Bean + @RestartScope + @ServiceConnection + OllamaContainer ollama() { + return new OllamaContainer(DockerImageName.parse("ghcr.io/thomasvitale/ollama-mistral") + .asCompatibleSubstituteFor("ollama/ollama")); + } + + public static void main(String[] args) { + SpringApplication.from(DocumentReadersTikaOllamaApplication::main).with(TestDocumentReadersTikaOllamaApplication.class).run(args); + } + +} diff --git a/README.md b/README.md index ad035c2..8568d80 100644 --- a/README.md +++ b/README.md @@ -72,13 +72,14 @@ Samples showing how to build Java applications powered by Generative AI and Larg ### 7. Data Ingestion -| Project | Description | -|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------| -| [document-readers-json-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-readers-json-ollama) | Reading and vectorizing JSON documents with LLMs via Ollama. | -| [document-readers-pdf-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-readers-text-ollama) | Reading and vectorizing PDF documents with LLMs via Ollama. | -| [document-readers-text-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-readers-text-ollama) | Reading and vectorizing text documents with LLMs via Ollama. 
| -| [document-transformers-metadata-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-transformers-metadata-ollama) | Enrich documents with keywords and summary metadata for enhanced retrieval via Ollama. | -| [document-transformers-splitters-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-transformers-splitters-ollama) | Divide documents into chunks to fit the LLM context window via Ollama. | +| Project | Description | +|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------| +| [document-readers-json-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-readers-json-ollama) | Reading and vectorizing JSON documents with LLMs via Ollama. | +| [document-readers-pdf-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-readers-pdf-ollama) | Reading and vectorizing PDF documents with LLMs via Ollama. | +| [document-readers-text-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-readers-text-ollama) | Reading and vectorizing text documents with LLMs via Ollama. | +| [document-readers-tika-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-readers-tika-ollama) | Reading and vectorizing documents with LLMs and Tika via Ollama. | +| [document-transformers-metadata-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-transformers-metadata-ollama) | Enrich documents with keywords and summary metadata for enhanced retrieval via Ollama. 
| +| [document-transformers-splitters-ollama](https://github.com/ThomasVitale/llm-apps-java-spring-ai/tree/main/07-data-ingestion/document-transformers-splitters-ollama) | Divide documents into chunks to fit the LLM context window via Ollama. | ### 8. Vector Stores diff --git a/settings.gradle b/settings.gradle index ec77c8c..d83274c 100644 --- a/settings.gradle +++ b/settings.gradle @@ -36,6 +36,7 @@ include '06-embedding-models:embedding-models-transformers' include '07-data-ingestion:document-readers-json-ollama' include '07-data-ingestion:document-readers-pdf-ollama' include '07-data-ingestion:document-readers-text-ollama' +include '07-data-ingestion:document-readers-tika-ollama' include '07-data-ingestion:document-transformers-metadata-ollama' include '07-data-ingestion:document-transformers-splitters-ollama'