Document readers samples for RAG

ThomasVitale · Jan 26, 2024 · 2347203 · 2347203
1 parent f5ad2b7
commit 2347203
Show file tree

Hide file tree

Showing 17 changed files with 328 additions and 17 deletions.
diff --git a/05-document-readers/document-readers-json-ollama/README.md b/05-document-readers/document-readers-json-ollama/README.md
@@ -1,25 +1,39 @@
 # JSON Document Readers: Ollama
 
-## Running the application
+Reading and vectorizing JSON documents with LLMs via Ollama.
+
+## Running the application
+
+The application relies on Ollama for providing LLMs. You can either run Ollama locally on your laptop (macOS or Linux), or rely on the Testcontainers support in Spring Boot to spin up an Ollama service automatically.
 
 ### When using Ollama
 
+First, make sure you have [Ollama](https://ollama.ai) installed on your laptop (macOS or Linux).
+Then, use Ollama to run the _llama2_ large language model.
+
 ```shell
 ollama run llama2
 ```
 
+Finally, run the Spring Boot application.
+
 ```shell
 ./gradlew bootRun
 ```
 
 ### When using Docker/Podman
 
+The application relies on the native Testcontainers support in Spring Boot to spin up an Ollama service with a _llama2_ model at startup time.
+
 ```shell
 ./gradlew bootTestRun
 ```
 
 ## Calling the application
 
+You can now call the application that will use Ollama and llama2 to load JSON documents as embeddings and generate an answer to your questions based on those documents (RAG pattern).
+This example uses [httpie](https://httpie.io) to send HTTP requests.
+
 ```shell
 http --raw "What bike is good for city commuting?" :8080/ai/doc/chat
 ```
diff --git a/...rs/document-readers-json-ollama/src/main/java/com/thomasvitale/ai/spring/ChatService.java b/...rs/document-readers-json-ollama/src/main/java/com/thomasvitale/ai/spring/ChatService.java
@@ -28,10 +28,9 @@ class ChatService {
     AssistantMessage chatWithDocument(String message) {
         var systemPromptTemplate = new SystemPromptTemplate("""
                 You're assisting with questions about products in a bicycle catalog.
-                Use the information from the DOCUMENTS section to provide accurate answers.
-                If the answer involves referring to the price or the dimension of the bicycle,
-                include the bicycle name in the response.
-                If unsure, simply state that you don't know.
+                Use the information from the DOCUMENTS section and no prior knowledge.
+                If unsure or if the answer isn't found in the DOCUMENTS section, simply state
+                that you don't know the answer.
                                 
                 DOCUMENTS:
                 {documents}

diff --git a/05-document-readers/document-readers-pdf-ollama/README.md b/05-document-readers/document-readers-pdf-ollama/README.md
@@ -0,0 +1,43 @@
+# PDF Document Readers: Ollama
+
+Reading and vectorizing PDF documents with LLMs via Ollama.
+
+## Running the application
+
+The application relies on Ollama for providing LLMs. You can either run Ollama locally on your laptop (macOS or Linux), or rely on the Testcontainers support in Spring Boot to spin up an Ollama service automatically.
+
+### When using Ollama
+
+First, make sure you have [Ollama](https://ollama.ai) installed on your laptop (macOS or Linux).
+Then, use Ollama to run the _llama2_ large language model.
+
+```shell
+ollama run llama2
+```
+
+Finally, run the Spring Boot application.
+
+```shell
+./gradlew bootRun
+```
+
+### When using Docker/Podman
+
+The application relies on the native Testcontainers support in Spring Boot to spin up an Ollama service with a _llama2_ model at startup time.
+
+```shell
+./gradlew bootTestRun
+```
+
+## Calling the application
+
+You can now call the application that will use Ollama and llama2 to load PDF documents as embeddings and generate an answer to your questions based on those documents (RAG pattern).
+This example uses [httpie](https://httpie.io) to send HTTP requests.
+
+```shell
+http --raw "What is Iorek's biggest dream?" :8080/ai/doc/chat
+```
+
+```shell
+http --raw "Who is Lucio?" :8080/ai/doc/chat
+```
diff --git a/05-document-readers/document-readers-pdf-ollama/build.gradle b/05-document-readers/document-readers-pdf-ollama/build.gradle
@@ -0,0 +1,35 @@
+// Gradle build for the PDF document reader sample (Spring Boot web app + Spring AI with Ollama).
+
+// No plugin versions are declared here — presumably they are managed by the enclosing build; verify.
+plugins {
+    id 'java'
+    id 'org.springframework.boot'
+    id 'io.spring.dependency-management'
+}
+
+group = 'com.thomasvitale'
+version = '0.0.1-SNAPSHOT'
+
+java {
+    sourceCompatibility = '21'
+}
+
+// Besides Maven Central, the Spring milestone and snapshot repositories are consulted.
+repositories {
+    mavenCentral()
+    maven { url 'https://repo.spring.io/milestone' }
+    maven { url 'https://repo.spring.io/snapshot' }
+}
+
+dependencies {
+    implementation 'org.springframework.boot:spring-boot-starter-web'
+
+    // springAiVersion is not defined in this file; it must be supplied by the surrounding build.
+    implementation "org.springframework.ai:spring-ai-ollama-spring-boot-starter:${springAiVersion}"
+    implementation "org.springframework.ai:spring-ai-pdf-document-reader:${springAiVersion}"
+
+    testAndDevelopmentOnly 'org.springframework.boot:spring-boot-devtools'
+
+    // Test-scope dependencies, including the Testcontainers integration.
+    testImplementation 'org.springframework.boot:spring-boot-starter-test'
+    testImplementation 'org.springframework.boot:spring-boot-testcontainers'
+    testImplementation 'org.testcontainers:junit-jupiter'
+}
+
+// Run tests on the JUnit Platform.
+tasks.named('test') {
+    useJUnitPlatform()
+}
diff --git a/.../document-readers-pdf-ollama/src/main/java/com/thomasvitale/ai/spring/ChatController.java b/.../document-readers-pdf-ollama/src/main/java/com/thomasvitale/ai/spring/ChatController.java
@@ -0,0 +1,21 @@
+package com.thomasvitale.ai.spring;
+
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestBody;
+import org.springframework.web.bind.annotation.RestController;
+
+/**
+ * HTTP entry point that forwards questions to {@link ChatService}.
+ */
+@RestController
+class ChatController {
+
+    private final ChatService chatService;
+
+    ChatController(ChatService chatService) {
+        this.chatService = chatService;
+    }
+
+    /**
+     * Handles {@code POST /ai/doc/chat}.
+     *
+     * @param input the raw request body, passed to the chat service unchanged
+     * @return the textual content of the message returned by the chat service
+     */
+    @PostMapping("/ai/doc/chat")
+    String chatWithDocument(@RequestBody String input) {
+        var answer = chatService.chatWithDocument(input);
+        return answer.getContent();
+    }
+
+}
diff --git a/...ers/document-readers-pdf-ollama/src/main/java/com/thomasvitale/ai/spring/ChatService.java b/...ers/document-readers-pdf-ollama/src/main/java/com/thomasvitale/ai/spring/ChatService.java
@@ -0,0 +1,51 @@
+package com.thomasvitale.ai.spring;
+
+import org.springframework.ai.chat.ChatClient;
+import org.springframework.ai.chat.messages.AssistantMessage;
+import org.springframework.ai.chat.messages.UserMessage;
+import org.springframework.ai.chat.prompt.Prompt;
+import org.springframework.ai.chat.prompt.SystemPromptTemplate;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.vectorstore.SearchRequest;
+import org.springframework.ai.vectorstore.SimpleVectorStore;
+import org.springframework.stereotype.Service;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * Answers questions with the help of documents held in the vector store: the documents most
+ * similar to the question are placed in the system prompt, and the chat client is then called
+ * with that system prompt plus the user's question.
+ */
+@Service
+class ChatService {
+
+    // Metadata key expected to hold the originating file name of a document.
+    // NOTE(review): assumed to match PagePdfDocumentReader.METADATA_FILE_NAME ("file_name") —
+    // confirm against the Spring AI version in use.
+    private static final String FILE_NAME_METADATA_KEY = "file_name";
+
+    private final ChatClient chatClient;
+    private final SimpleVectorStore vectorStore;
+
+    ChatService(ChatClient chatClient, SimpleVectorStore vectorStore) {
+        this.chatClient = chatClient;
+        this.vectorStore = vectorStore;
+    }
+
+    /**
+     * Builds a prompt from the two documents most similar to {@code message} and asks the chat
+     * client to answer the question based on them.
+     *
+     * @param message the user's question; also used as the similarity-search query
+     * @return the output message of the chat response
+     */
+    AssistantMessage chatWithDocument(String message) {
+        var systemPromptTemplate = new SystemPromptTemplate("""
+                Answer questions given the context information below (DOCUMENTS section) and no prior knowledge.
+                If the answer is not found in the DOCUMENTS section, simply state that you don't know the answer.
+                In the answer, include the name of the source file from which the context information was extracted.
+
+                DOCUMENTS:
+                {documents}
+                """);
+
+        List<Document> similarDocuments = vectorStore.similaritySearch(SearchRequest.query(message).withTopK(2));
+        // The prompt asks the model to cite the source file, so every document is rendered
+        // together with its file name rather than as bare content.
+        String documents = similarDocuments.stream()
+                .map(ChatService::describe)
+                .collect(Collectors.joining(System.lineSeparator()));
+
+        Map<String,Object> model = Map.of("documents", documents);
+        var systemMessage = systemPromptTemplate.createMessage(model);
+
+        var userMessage = new UserMessage(message);
+        var prompt = new Prompt(List.of(systemMessage, userMessage));
+
+        var chatResponse = chatClient.call(prompt);
+        return chatResponse.getResult().getOutput();
+    }
+
+    /** Renders a document as its source file name (or "unknown" if absent) followed by its content. */
+    private static String describe(Document document) {
+        Object fileName = document.getMetadata().getOrDefault(FILE_NAME_METADATA_KEY, "unknown");
+        return "Source file: " + fileName + System.lineSeparator() + document.getContent();
+    }
+
+}
diff --git a/...ment-readers-pdf-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentInitializer.java b/...ment-readers-pdf-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentInitializer.java
@@ -0,0 +1,57 @@
+package com.thomasvitale.ai.spring;
+
+import jakarta.annotation.PostConstruct;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.reader.ExtractedTextFormatter;
+import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
+import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
+import org.springframework.ai.vectorstore.SimpleVectorStore;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.core.io.Resource;
+import org.springframework.stereotype.Component;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Reads the two bundled PDF resources after construction and adds the resulting documents to
+ * the vector store.
+ */
+@Component
+public class DocumentInitializer {
+
+    private static final Logger log = LoggerFactory.getLogger(DocumentInitializer.class);
+    private final SimpleVectorStore simpleVectorStore;
+
+    @Value("classpath:documents/story1.pdf")
+    Resource pdfFile1;
+
+    @Value("classpath:documents/story2.pdf")
+    Resource pdfFile2;
+
+    public DocumentInitializer(SimpleVectorStore simpleVectorStore) {
+        this.simpleVectorStore = simpleVectorStore;
+    }
+
+    /**
+     * Reads the first PDF with the reader's default configuration and the second one with a
+     * custom text formatter, then adds all resulting documents to the vector store in one call.
+     */
+    @PostConstruct
+    public void run() {
+        log.info("Loading PDF files as Documents");
+        var defaultReader = new PagePdfDocumentReader(pdfFile1);
+        List<Document> documents = new ArrayList<>(defaultReader.get());
+
+        log.info("Loading PDF files as Documents after reformatting");
+        // Formatter: skip 0 top pages before deleting; delete 1 bottom and 1 top text line.
+        var textFormatter = ExtractedTextFormatter.builder()
+                .withNumberOfTopPagesToSkipBeforeDelete(0)
+                .withNumberOfBottomTextLinesToDelete(1)
+                .withNumberOfTopTextLinesToDelete(1)
+                .build();
+        var readerConfig = PdfDocumentReaderConfig.builder()
+                .withPageExtractedTextFormatter(textFormatter)
+                .withPagesPerDocument(1)
+                .build();
+        var formattingReader = new PagePdfDocumentReader(pdfFile2, readerConfig);
+        documents.addAll(formattingReader.get());
+
+        log.info("Creating and storing Embeddings from Documents");
+        simpleVectorStore.add(documents);
+    }
+
+}
diff --git a/...-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentReadersPdfOllamaApplication.java b/...-ollama/src/main/java/com/thomasvitale/ai/spring/DocumentReadersPdfOllamaApplication.java
@@ -0,0 +1,21 @@
+package com.thomasvitale.ai.spring;
+
+import org.springframework.ai.embedding.EmbeddingClient;
+import org.springframework.ai.vectorstore.SimpleVectorStore;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.context.annotation.Bean;
+
+/**
+ * Spring Boot entry point for the PDF document reader sample.
+ */
+@SpringBootApplication
+public class DocumentReadersPdfOllamaApplication {
+
+    /**
+     * Exposes a {@link SimpleVectorStore} built on the given embedding client as a bean.
+     */
+    @Bean
+    SimpleVectorStore documentWriter(EmbeddingClient embeddingClient) {
+        return new SimpleVectorStore(embeddingClient);
+    }
+
+    /** Starts the application with the given command-line arguments. */
+    public static void main(String[] args) {
+        SpringApplication.run(DocumentReadersPdfOllamaApplication.class, args);
+    }
+
+}
diff --git a/05-document-readers/document-readers-pdf-ollama/src/main/resources/application.yml b/05-document-readers/document-readers-pdf-ollama/src/main/resources/application.yml
@@ -0,0 +1,10 @@
+spring:
+  ai:
+    ollama:
+      chat:
+        model: llama2
+      embedding:
+        model: llama2
+  threads:
+    virtual:
+      enabled: true
diff --git a/05-document-readers/document-readers-pdf-ollama/src/main/resources/documents/story1.pdf b/05-document-readers/document-readers-pdf-ollama/src/main/resources/documents/story1.pdf
diff --git a/05-document-readers/document-readers-pdf-ollama/src/main/resources/documents/story2.pdf b/05-document-readers/document-readers-pdf-ollama/src/main/resources/documents/story2.pdf
diff --git a/...ma/src/test/java/com/thomasvitale/ai/spring/DocumentReadersPdfOllamaApplicationTests.java b/...ma/src/test/java/com/thomasvitale/ai/spring/DocumentReadersPdfOllamaApplicationTests.java
@@ -0,0 +1,17 @@
+package com.thomasvitale.ai.spring;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.context.annotation.Import;
+
+/**
+ * Smoke test that starts the application context together with
+ * {@link TestDocumentReadersPdfOllamaApplication}. Currently disabled.
+ */
+@SpringBootTest
+@Import(TestDocumentReadersPdfOllamaApplication.class)
+@Disabled // Only run locally for now
+class DocumentReadersPdfOllamaApplicationTests {
+
+    /** Intentionally empty: the test passes if the application context starts. */
+    @Test
+    void contextLoads() {
+    }
+
+}
diff --git a/...ama/src/test/java/com/thomasvitale/ai/spring/TestDocumentReadersPdfOllamaApplication.java b/...ama/src/test/java/com/thomasvitale/ai/spring/TestDocumentReadersPdfOllamaApplication.java
@@ -0,0 +1,29 @@
+package com.thomasvitale.ai.spring;
+
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.devtools.restart.RestartScope;
+import org.springframework.boot.test.context.TestConfiguration;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Scope;
+import org.springframework.test.context.DynamicPropertyRegistry;
+import org.testcontainers.containers.GenericContainer;
+
+/**
+ * Test-time configuration that provides an Ollama container and points the application at it.
+ */
+@TestConfiguration(proxyBeanMethods = false)
+public class TestDocumentReadersPdfOllamaApplication {
+
+    private static final String OLLAMA_IMAGE = "ghcr.io/thomasvitale/ollama-llama2";
+    private static final int OLLAMA_PORT = 11434;
+
+    /**
+     * Declares the Ollama container and registers {@code spring.ai.ollama.base-url} with a
+     * supplier, so that host and mapped port are read only when the property is resolved.
+     */
+    @Bean
+    @RestartScope
+    @Scope("singleton") // needed because of https://github.com/spring-projects/spring-boot/issues/35786
+    GenericContainer<?> ollama(DynamicPropertyRegistry properties) {
+        var container = new GenericContainer<>(OLLAMA_IMAGE)
+                .withExposedPorts(OLLAMA_PORT);
+        properties.add("spring.ai.ollama.base-url",
+                () -> String.format("http://%s:%s", container.getHost(), container.getMappedPort(OLLAMA_PORT)));
+        return container;
+    }
+
+    /** Launches the main application augmented with this test configuration. */
+    public static void main(String[] args) {
+        var application = SpringApplication.from(DocumentReadersPdfOllamaApplication::main);
+        application.with(TestDocumentReadersPdfOllamaApplication.class).run(args);
+    }
+
+}
diff --git a/05-document-readers/document-readers-text-ollama/README.md b/05-document-readers/document-readers-text-ollama/README.md
@@ -1,25 +1,39 @@
 # Text Document Readers: Ollama
 
-## Running the application
+Reading and vectorizing text documents with LLMs via Ollama.
+
+## Running the application
+
+The application relies on Ollama for providing LLMs. You can either run Ollama locally on your laptop (macOS or Linux), or rely on the Testcontainers support in Spring Boot to spin up an Ollama service automatically.
 
 ### When using Ollama
 
+First, make sure you have [Ollama](https://ollama.ai) installed on your laptop (macOS or Linux).
+Then, use Ollama to run the _llama2_ large language model.
+
 ```shell
 ollama run llama2
 ```
 
+Finally, run the Spring Boot application.
+
 ```shell
 ./gradlew bootRun
 ```
 
 ### When using Docker/Podman
 
+The application relies on the native Testcontainers support in Spring Boot to spin up an Ollama service with a _llama2_ model at startup time.
+
 ```shell
 ./gradlew bootTestRun
 ```
 
 ## Calling the application
 
+You can now call the application that will use Ollama and llama2 to load text documents as embeddings and generate an answer to your questions based on those documents (RAG pattern).
+This example uses [httpie](https://httpie.io) to send HTTP requests.
+
 ```shell
 http --raw "What is Iorek's biggest dream?" :8080/ai/doc/chat
 ```