update scripts
omar-sol committed Jul 29, 2024
1 parent 207bc12 commit 34b6f5e
Showing 3 changed files with 15 additions and 4 deletions.
data/scraping_scripts/create_vector_stores.py (11 changes: 10 additions & 1 deletion)
@@ -28,6 +28,7 @@
 import json
 import os
 import pickle
+import shutil
 from typing import Dict, List
 
 import chromadb
@@ -69,7 +70,7 @@ def create_docs(input_file: str) -> List[Document]:
         Document(
             doc_id=data["doc_id"],
             text=data["content"],
-            metadata={
+            metadata={  # type: ignore
                 "url": data["url"],
                 "title": data["name"],
                 "tokens": data["tokens"],
@@ -95,14 +96,22 @@ def create_docs(input_file: str) -> List[Document]:

 def process_source(source: str):
     config = SOURCE_CONFIGS[source]
 
     input_file = config["input_file"]
     db_name = config["db_name"]
+    db_path = f"data/{db_name}"
 
     print(f"Processing source: {source}")
 
     documents = create_docs(input_file)
     print(f"Created {len(documents)} documents")
 
+    # Check if the folder exists and delete it
+    if os.path.exists(db_path):
+        print(f"Existing database found at {db_path}. Deleting...")
+        shutil.rmtree(db_path)
+        print(f"Deleted existing database at {db_path}")
+
     # Create Chroma client and collection
     chroma_client = chromadb.PersistentClient(path=f"data/{db_name}")
     chroma_collection = chroma_client.create_collection(db_name)
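Taken together, the additions make process_source rebuild its vector store from scratch: the on-disk Chroma directory is deleted before the collection is created, so create_collection cannot collide with data left over from a previous run. A minimal sketch of the pattern, using only the chromadb persistent-client API shown above (the function name is illustrative):

    import os
    import shutil

    import chromadb

    def rebuild_collection(db_name: str):
        db_path = f"data/{db_name}"
        # Remove any previous persistent database so the rebuild starts clean.
        if os.path.exists(db_path):
            shutil.rmtree(db_path)
        client = chromadb.PersistentClient(path=db_path)
        return client.create_collection(db_name)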
data/scraping_scripts/github_to_markdown_ai_docs.py (5 changes: 4 additions & 1 deletion)
@@ -38,8 +38,11 @@
 
 import nbformat
 import requests
+from dotenv import load_dotenv
 from nbconvert import MarkdownExporter
 
+load_dotenv()
+
 # Configuration for different sources
 SOURCE_CONFIGS = {
     "transformers": {
@@ -75,7 +78,7 @@
 }
 
 # GitHub Personal Access Token (replace with your own token)
-GITHUB_TOKEN = "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"
+GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
 
 # Headers for authenticated requests
 HEADERS = {
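This hunk swaps the hard-coded personal access token for one read from the environment via python-dotenv. A minimal sketch of the resulting setup, assuming a git-ignored .env file next to the script; the HEADERS shape follows the usual GitHub REST API convention and is an assumption, since the diff truncates before the dictionary body:

    # .env (git-ignored), a single line:
    # GITHUB_TOKEN=ghp_your_token_here

    import os

    from dotenv import load_dotenv

    load_dotenv()  # copies .env entries into the process environment

    GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
    if GITHUB_TOKEN is None:
        raise RuntimeError("GITHUB_TOKEN is not set; add it to .env or export it")

    # Headers for authenticated GitHub API requests (assumed shape)
    HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}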
data/scraping_scripts/upload_dbs_to_hf.py (3 changes: 1 addition & 2 deletions)
@@ -15,7 +15,6 @@
 Configuration:
 - The script is set to upload to the "towardsai-buster/test-data" dataset repository.
 - It ignores files with extensions .jsonl, .py, .txt, and .ipynb.
-- It deletes all existing files in the repository before uploading (due to delete_patterns=["*"]).
 """

@@ -30,5 +29,5 @@
     multi_commits=True,
     multi_commits_verbose=True,
     delete_patterns=["*"],
-    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb"],
+    ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
 )
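These keyword arguments match huggingface_hub's folder-upload API. A minimal sketch of how the full call might look, assuming the script uses HfApi.upload_folder and uploads a local data/ folder (the folder path is an assumption; the repo id comes from the docstring above):

    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_folder(
        folder_path="data",                    # assumed location of the local DBs
        repo_id="towardsai-buster/test-data",  # dataset repo named in the docstring
        repo_type="dataset",
        multi_commits=True,
        multi_commits_verbose=True,
        delete_patterns=["*"],                 # wipes existing repo files first
        ignore_patterns=["*.jsonl", "*.py", "*.txt", "*.ipynb", "*.md", "*.pyc"],
    )

Note that delete_patterns=["*"] remains in the code even though the docstring line describing it was removed, so the wipe-before-upload behavior itself is unchanged.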
