diff --git a/data/scraping_scripts/create_db.ipynb b/data/scraping_scripts/create_db.ipynb deleted file mode 100644 index 56b585b..0000000 --- a/data/scraping_scripts/create_db.ipynb +++ /dev/null @@ -1,353 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create HF vector database\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv(\"../../.env\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import nest_asyncio\n", - "\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a set of Llama-index Documents with each section in the jsonl file\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core import Document\n", - "from llama_index.core.schema import MetadataMode\n", - "import json\n", - "import pickle\n", - "\n", - "\n", - "def create_docs(input_file):\n", - " with open(input_file, \"r\") as f:\n", - " documents = []\n", - " for i, line in enumerate(f):\n", - " data = json.loads(line)\n", - " documents.append(\n", - " Document(\n", - " doc_id=data[\"doc_id\"],\n", - " text=data[\"content\"],\n", - " metadata={\n", - " \"url\": data[\"url\"],\n", - " \"title\": data[\"name\"],\n", - " \"tokens\": data[\"tokens\"],\n", - " \"retrieve_doc\": data[\"retrieve_doc\"],\n", - " \"source\": data[\"source\"],\n", - " },\n", - " # LLM will see the 'url' of each chunk\n", - " excluded_llm_metadata_keys=[\n", - " # \"url\",\n", - " \"title\",\n", - " \"tokens\",\n", - " \"retrieve_doc\",\n", - " \"source\",\n", - " ],\n", - " # Embedding model will embed the 'title' of each chunk\n", - " excluded_embed_metadata_keys=[\n", - " \"url\",\n", - " # \"title\",\n", - " \"tokens\",\n", - " 
\"retrieve_doc\",\n", - " \"source\",\n", - " ],\n", - " )\n", - " )\n", - " return documents\n", - "\n", - "\n", - "# documents = create_docs(\"../transformers_data.jsonl\")\n", - "# documents = create_docs(\"../peft_data.jsonl\")\n", - "# documents = create_docs(\"../trl_data.jsonl\")\n", - "# documents = create_docs(\"../llama_index_data.jsonl\")\n", - "documents = create_docs(\"../openai-cookbook_data.jsonl\")\n", - "print(documents[0])\n", - "print(documents[0].metadata)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# print(\n", - "# \"The LLM sees this: \\n\",\n", - "# documents[0].get_content(metadata_mode=MetadataMode.LLM),\n", - "# )\n", - "print(\n", - " \"The Embedding model sees this: \\n\",\n", - " documents[0].get_content(metadata_mode=MetadataMode.EMBED),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import chromadb\n", - "\n", - "# create client and a new collection\n", - "DB_COLLECTION = \"chroma-db-openai-cookbooks\"\n", - "chroma_client = chromadb.PersistentClient(path=f\"../{DB_COLLECTION}\")\n", - "chroma_collection = chroma_client.create_collection(DB_COLLECTION)\n", - "\n", - "\n", - "from llama_index.vector_stores.chroma import ChromaVectorStore\n", - "from llama_index.core import StorageContext\n", - "\n", - "# Define a storage context object using the created vector database.\n", - "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", - "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", - "\n", - "document_dict = {doc.doc_id: doc for doc in documents}\n", - "DOCUMENT_NAME = f\"../{DB_COLLECTION}/document_dict_openai.pkl\"\n", - "\n", - "with open(DOCUMENT_NAME, \"wb\") as f:\n", - " pickle.dump(document_dict, f)\n", - "\n", - "# with open(DOCUMENT_NAME, \"rb\") as f:\n", - "# document_dict = pickle.load(f)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core import VectorStoreIndex\n", - "from llama_index.core.node_parser import SentenceSplitter\n", - "from llama_index.embeddings.openai import OpenAIEmbedding\n", - "\n", - "index = VectorStoreIndex.from_documents(\n", - " documents,\n", - " embed_model=OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"similarity\"),\n", - " transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],\n", - " show_progress=True,\n", - " use_async=True,\n", - " storage_context=storage_context,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the DB" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "retriever = index.as_retriever(\n", - " similarity_top_k=10,\n", - " use_async=True,\n", - " embed_model=OpenAIEmbedding(model=\"text-embedding-3-large\", mode=\"similarity\"),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core.data_structs import Node\n", - "from llama_index.core.schema import NodeWithScore, BaseNode, TextNode\n", - "\n", - "\n", - "# query = \"fine-tune a pretrained model\"\n", - "# query = \"fine-tune an llm\"\n", - "query = \"how to fine-tune an llm?\"\n", - "\n", - "nodes_context = []\n", - "nodes = retriever.retrieve(query)\n", - "\n", - "\n", - "# Filter nodes with the same ref_doc_id\n", - "def filter_nodes_by_unique_doc_id(nodes):\n", - " unique_nodes = {}\n", - " for node in nodes:\n", - " doc_id = node.node.ref_doc_id\n", - " if doc_id is not None and doc_id not in unique_nodes:\n", - " unique_nodes[doc_id] = node\n", - " return list(unique_nodes.values())\n", - "\n", - "\n", - "nodes = filter_nodes_by_unique_doc_id(nodes)\n", - "print(len(nodes))\n", - "\n", - "for node in nodes:\n", - " print(\"Node ID\\t\", node.node_id)\n", - " 
print(\"Title\\t\", node.metadata[\"title\"])\n", - " print(\"Text\\t\", node.text)\n", - " print(\"Score\\t\", node.score)\n", - " print(\"Metadata\\t\", node.metadata)\n", - " print(\"-_\" * 20)\n", - " if node.metadata[\"retrieve_doc\"] == True:\n", - " print(\"This node will be replaced by the document\")\n", - " doc = document_dict[node.node.ref_doc_id]\n", - " # print(doc.text)\n", - " new_node = NodeWithScore(\n", - " node=TextNode(text=doc.text, metadata=node.metadata), score=node.score\n", - " )\n", - " print(new_node.text)\n", - " nodes_context.append(new_node)\n", - " else:\n", - " nodes_context.append(node)\n", - "\n", - "print(len(nodes_context))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core import ChatPromptTemplate\n", - "from llama_index.core.llms import ChatMessage, MessageRole\n", - "from pydantic import BaseModel, Field\n", - "\n", - "system_prompt = (\n", - " \"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine-tuning models, giving 'memory' to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, Llama-Index, LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context.\"\n", - " \"You are provided information found in Hugging Face's documentation and the RAG course. \"\n", - " \"Only some information might be relevant to the question, so ignore the irrelevant part and use the relevant part to answer the question.\"\n", - " \"Only respond with information given to you documentation. DO NOT use additional information, even if you know the answer. 
\"\n", - " \"If the answer is somewhere in the documentation, answer the question (depending on the questions and the variety of relevant information in the documentation, give complete and helpful answers.\"\n", - " \"Here is the information you can use, the order is not important: \\n\\n\"\n", - " \"---------------------\\n\"\n", - " \"{context_str}\\n\"\n", - " \"---------------------\\n\\n\"\n", - " \"REMEMBER:\\n\"\n", - " \"You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context.\"\n", - " \"You are provided information found in Hugging Face's documentation and the RAG course. \"\n", - " \"Here are the rules you must follow:\\n\"\n", - " \"* Only respond with information inside the documentation. DO NOT provide additional information, even if you know the answer. \"\n", - " \"* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. \"\n", - " \"* Only use information summarized from the documentation, do not respond otherwise. \"\n", - " \"* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. 
\"\n", - " \"* Do not reference any links, urls or hyperlinks in your answers.\\n\"\n", - " \"* Make sure to format your answers in Markdown format, including code block and snippets.\\n\"\n", - " \"Now answer the following question: \\n\"\n", - ")\n", - "\n", - "chat_text_qa_msgs: list[ChatMessage] = [\n", - " ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),\n", - " ChatMessage(\n", - " role=MessageRole.USER,\n", - " content=\"{query_str}\",\n", - " ),\n", - "]\n", - "\n", - "TEXT_QA_TEMPLATE = ChatPromptTemplate(chat_text_qa_msgs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import Markdown\n", - "from llama_index.core.data_structs import Node\n", - "from llama_index.core.schema import NodeWithScore\n", - "from llama_index.core import get_response_synthesizer\n", - "from llama_index.llms.gemini import Gemini\n", - "from llama_index.llms.openai import OpenAI\n", - "\n", - "# llm = Gemini(model=\"models/gemini-1.5-flash\", temperature=1, max_tokens=None)\n", - "# llm = Gemini(model=\"models/gemini-1.5-pro\", temperature=1, max_tokens=None)\n", - "# llm = OpenAI(temperature=1, model=\"gpt-3.5-turbo\", max_tokens=None)\n", - "llm = OpenAI(temperature=1, model=\"gpt-4o-mini\", max_tokens=None)\n", - "\n", - "response_synthesizer = get_response_synthesizer(\n", - " llm=llm, response_mode=\"simple_summarize\", text_qa_template=TEXT_QA_TEMPLATE\n", - ")\n", - "\n", - "response = response_synthesizer.synthesize(query, nodes=nodes_context)\n", - "# print(response.response)\n", - "display(Markdown(response.response))\n", - "\n", - "# for src in response.source_nodes:\n", - "# print(src.node.ref_doc_id)\n", - "# print(\"Node ID\\t\", src.node_id)\n", - "# print(\"Title\\t\", src.metadata[\"title\"])\n", - "# print(\"Text\\t\", src.text)\n", - "# print(\"Score\\t\", src.score)\n", - "# print(\"Metadata\\t\", src.metadata)\n", - "# print(\"-_\" * 20)" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/data/scraping_scripts/upload_dbs_to_hf.py b/data/scraping_scripts/upload_dbs_to_hf.py new file mode 100644 index 0000000..f4708d9 --- /dev/null +++ b/data/scraping_scripts/upload_dbs_to_hf.py @@ -0,0 +1,34 @@ +""" +Hugging Face Data Upload Script + +Purpose: +This script uploads a local folder to a Hugging Face dataset repository. It's designed to +update or create a dataset on the Hugging Face Hub by uploading the contents of a specified +local folder. + +Usage: +- Run the script: python data/scraping_scripts/upload_dbs_to_hf.py + +The script will: +- Upload the contents of the 'data' folder to the specified Hugging Face dataset repository. +- https://huggingface.co/datasets/towardsai-buster/ai-tutor-vector-db + +Configuration: +- The script is set to upload to the "towardsai-buster/ai-tutor-vector-db" dataset repository. +- It ignores files with extensions .jsonl, .py, .txt, and .ipynb. +- It deletes all existing files in the repository before uploading (due to delete_patterns=["*"]). 
+""" + +from huggingface_hub import HfApi + +api = HfApi() + +api.upload_folder( + folder_path="data", + repo_id="towardsai-buster/ai-tutor-vector-db", + repo_type="dataset", + multi_commits=True, + multi_commits_verbose=True, + delete_patterns=["*"], + ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb"], +) diff --git a/scripts/setup.py b/scripts/setup.py index 25ba5b7..1fb1dd0 100644 --- a/scripts/setup.py +++ b/scripts/setup.py @@ -11,8 +11,7 @@ from llama_index.core.retrievers import VectorIndexRetriever from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.vector_stores.chroma import ChromaVectorStore - -# from utils import init_mongo_db +from utils import init_mongo_db load_dotenv() @@ -108,11 +107,11 @@ def setup_database(db_collection, dict_file_name): # "rag_course", ] -# mongo_db = ( -# init_mongo_db(uri=MONGODB_URI, db_name="towardsai-buster") -# if MONGODB_URI -# else logfire.warn("No mongodb uri found, you will not be able to save data.") -# ) +mongo_db = ( + init_mongo_db(uri=MONGODB_URI, db_name="towardsai-buster") + if MONGODB_URI + else logfire.warn("No mongodb uri found, you will not be able to save data.") +) __all__ = [ "custom_retriever_transformers", @@ -121,8 +120,8 @@ def setup_database(db_collection, dict_file_name): "custom_retriever_llama_index", "custom_retriever_openai_cookbooks", "custom_retriever_langchain", + "mongo_db", "CONCURRENCY_COUNT", - "MONGODB_URI", "AVAILABLE_SOURCES_UI", "AVAILABLE_SOURCES", ]