update scripts
omar-sol committed Jul 29, 2024
1 parent 207bc12 commit 34b6f5e
Showing 3 changed files with 15 additions and 4 deletions.
data/scraping_scripts/create_vector_stores.py (11 changes: 10 additions & 1 deletion)
@@ -28,6 +28,7 @@
 import json
 import os
 import pickle
+import shutil
 from typing import Dict, List
 
 import chromadb
@@ -69,7 +70,7 @@ def create_docs(input_file: str) -> List[Document]:
         Document(
             doc_id=data["doc_id"],
             text=data["content"],
-            metadata={
+            metadata={  # type: ignore
                 "url": data["url"],
                 "title": data["name"],
                 "tokens": data["tokens"],
@@ -95,14 +96,22 @@ def create_docs(input_file: str) -> List[Document]:

 def process_source(source: str):
     config = SOURCE_CONFIGS[source]
 
     input_file = config["input_file"]
     db_name = config["db_name"]
+    db_path = f"data/{db_name}"
 
     print(f"Processing source: {source}")
 
     documents = create_docs(input_file)
     print(f"Created {len(documents)} documents")
 
+    # Check if the folder exists and delete it
+    if os.path.exists(db_path):
+        print(f"Existing database found at {db_path}. Deleting...")
+        shutil.rmtree(db_path)
+        print(f"Deleted existing database at {db_path}")
+
     # Create Chroma client and collection
     chroma_client = chromadb.PersistentClient(path=f"data/{db_name}")
     chroma_collection = chroma_client.create_collection(db_name)
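Taken together, the additions make process_source rebuild its vector store from scratch: the on-disk Chroma directory is deleted before the collection is created, so create_collection cannot collide with data left over from a previous run. A minimal sketch of the pattern, using only the chromadb persistent-client API shown above (the function name is illustrative):

    import os
    import shutil

    import chromadb

    def rebuild_collection(db_name: str):
        db_path = f"data/{db_name}"
        # Remove any previous persistent database so the rebuild starts clean.
        if os.path.exists(db_path):
            shutil.rmtree(db_path)
        client = chromadb.PersistentClient(path=db_path)
        return client.create_collection(db_name)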
data/scraping_scripts/github_to_markdown_ai_docs.py (5 changes: 4 additions & 1 deletion)
@@ -38,8 +38,11 @@
 
 import nbformat
 import requests
+from dotenv import load_dotenv
 from nbconvert import MarkdownExporter
 
+load_dotenv()
+
 # Configuration for different sources
 SOURCE_CONFIGS = {
     "transformers": {
@@ -75,7 +78,7 @@
 }
 
 # GitHub Personal Access Token (replace with your own token)
-GITHUB_TOKEN = "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"
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
 
 # Headers for authenticated requests
 HEADERS = {
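This hunk swaps the hard-coded personal access token for one read from the environment via python-dotenv. A minimal sketch of the resulting setup, assuming a git-ignored .env file next to the script; the HEADERS shape follows the usual GitHub REST API convention and is an assumption, since the diff truncates before the dictionary body:

    # .env (git-ignored), a single line:
    # GITHUB_TOKEN=ghp_your_token_here

    import os

    from dotenv import load_dotenv

    load_dotenv()  # copies .env entries into the process environment

    GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
    if GITHUB_TOKEN is None:
        raise RuntimeError("GITHUB_TOKEN is not set; add it to .env or export it")

    # Headers for authenticated GitHub API requests (assumed shape)
    HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}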
data/scraping_scripts/upload_dbs_to_hf.py (3 changes: 1 addition & 2 deletions)
@@ -15,7 +15,6 @@
 Configuration:
 - The script is set to upload to the "towardsai-buster/test-data" dataset repository.
 - It ignores files with extensions .jsonl, .py, .txt, and .ipynb.
-- It deletes all existing files in the repository before uploading (due to delete_patterns=["*"]).
 """

@@ -30,5 +29,5 @@
     multi_commits=True,
     multi_commits_verbose=True,
     delete_patterns=["*"],
-    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb"],
+    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
 )
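These keyword arguments match huggingface_hub's folder-upload API. A minimal sketch of how the full call might look, assuming the script uses HfApi.upload_folder and uploads a local data/ folder (the folder path is an assumption; the repo id comes from the docstring above):

    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_folder(
        folder_path="data",                    # assumed location of the local DBs
        repo_id="towardsai-buster/test-data",  # dataset repo named in the docstring
        repo_type="dataset",
        multi_commits=True,
        multi_commits_verbose=True,
        delete_patterns=["*"],                 # wipes existing repo files first
        ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
    )

Note that delete_patterns=["*"] remains in the code even though the docstring line describing it was removed, so the wipe-before-upload behavior itself is unchanged.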
