Skip to content

Commit

Permalink
Merge pull request #980 from readthedocs/davidfischer/enable-etl-pipeline
Browse files Browse the repository at this point in the history

Enable ETL pipeline if available
  • Loading branch information
davidfischer authored Jan 23, 2025
2 parents 16aad1c + c796f13 commit 6ba7e59
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 10 deletions.
1 change: 1 addition & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@
if ADSERVER_ANALYZER_BACKEND and ADSERVER_EXT:
    INSTALLED_APPS.append("ethicalads_ext.embedding")
if ADSERVER_EXT:
    INSTALLED_APPS.append("ethicalads_ext.etl")
    INSTALLED_APPS.append("ethicalads_ext.support")

# Whether Do Not Track is enabled for the ad server
Expand Down
5 changes: 5 additions & 0 deletions config/settings/production.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,11 @@
"task": "ethicalads_ext.embedding.tasks.daily_analyze_advertiser_urls",
"schedule": crontab(hour="4", minute="30"),
}
if "ethicalads_ext.etl" in INSTALLED_APPS:
    CELERY_BEAT_SCHEDULE["every-day-etl-pipeline"] = {
        "task": "ethicalads_ext.etl.tasks.daily_etl_pipeline",
        "schedule": crontab(hour="0", minute="30"),
    }


# Sentry settings for error monitoring
Expand Down
8 changes: 8 additions & 0 deletions requirements/analyzer.in
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ trafilatura
# but we need to make sure lxml.html.clean is installed too
lxml[html_clean]


#######################################################################
# ETL Pipeline
#######################################################################
duckdb >= 1.1.0,<1.2
adlfs==2024.12.0


#######################################################################
# Machine learning production requirements
#######################################################################
Expand Down
97 changes: 87 additions & 10 deletions requirements/analyzer.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,36 @@
#
# pip-compile analyzer.in
#
adlfs==2024.12.0
# via -r analyzer.in
aiohappyeyeballs==2.4.4
# via aiohttp
aiohttp==3.11.11
# via adlfs
aiosignal==1.3.2
# via aiohttp
async-timeout==5.0.1
# via aiohttp
attrs==24.3.0
# via aiohttp
azure-core==1.32.0
# via
# adlfs
# azure-identity
# azure-storage-blob
azure-datalake-store==0.0.53
# via adlfs
azure-identity==1.19.0
# via adlfs
azure-storage-blob==12.24.1
# via adlfs
babel==2.16.0
# via courlan
beautifulsoup4==4.12.3
# via -r analyzer.in
blis==0.7.11
# via thinc
cachetools==5.5.0
cachetools==5.5.1
# via textacy
catalogue==2.0.10
# via
Expand All @@ -22,6 +45,10 @@ certifi==2024.12.14
# via
# requests
# trafilatura
cffi==1.17.1
# via
# azure-datalake-store
# cryptography
charset-normalizer==3.4.1
# via
# htmldate
Expand All @@ -37,7 +64,13 @@ confection==0.1.5
# weasel
courlan==1.3.2
# via trafilatura
cymem==2.0.10
cryptography==44.0.0
# via
# azure-identity
# azure-storage-blob
# msal
# pyjwt
cymem==2.0.11
# via
# preshed
# spacy
Expand All @@ -46,18 +79,25 @@ cytoolz==1.0.1
# via textacy
dateparser==1.2.0
# via htmldate
duckdb==1.1.3
# via -r analyzer.in
en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl
# via -r analyzer.in
filelock==3.16.1
filelock==3.17.0
# via
# huggingface-hub
# torch
# transformers
# triton
floret==0.10.5
# via textacy
frozenlist==1.5.0
# via
# aiohttp
# aiosignal
fsspec==2024.12.0
# via
# adlfs
# huggingface-hub
# torch
htmldate==1.9.3
Expand All @@ -68,7 +108,11 @@ huggingface-hub==0.24.7
# sentence-transformers
# transformers
idna==3.10
# via requests
# via
# requests
# yarl
isodate==0.7.2
# via azure-storage-blob
jellyfish==1.1.3
# via textacy
jinja2==3.1.5
Expand Down Expand Up @@ -103,7 +147,18 @@ markupsafe==3.0.2
# via jinja2
mpmath==1.3.0
# via sympy
murmurhash==1.0.11
msal==1.31.1
# via
# azure-datalake-store
# azure-identity
# msal-extensions
msal-extensions==1.2.0
# via azure-identity
multidict==6.1.0
# via
# aiohttp
# yarl
murmurhash==1.0.12
# via
# preshed
# spacy
Expand Down Expand Up @@ -176,17 +231,29 @@ pgvector==0.3.6
# via -r analyzer.in
pillow==11.1.0
# via torchvision
portalocker==2.10.1
# via msal-extensions
preshed==3.0.9
# via
# spacy
# thinc
pydantic==1.10.19
propcache==0.2.1
# via
# aiohttp
# yarl
pycparser==2.22
# via cffi
pydantic==1.10.21
# via
# confection
# spacy
# thinc
# weasel
pyphen==0.17.0
pyjwt[crypto]==2.10.1
# via
# msal
# pyjwt
pyphen==0.17.2
# via textacy
python-dateutil==2.9.0.post0
# via
Expand All @@ -205,16 +272,19 @@ regex==2024.11.6
# transformers
requests==2.32.3
# via
# azure-core
# azure-datalake-store
# huggingface-hub
# msal
# spacy
# textacy
# transformers
# weasel
scikit-learn==1.6.0
scikit-learn==1.6.1
# via
# sentence-transformers
# textacy
scipy==1.14.1
scipy==1.15.1
# via
# scikit-learn
# sentence-transformers
Expand All @@ -225,6 +295,7 @@ sentencepiece==0.2.0
# via sentence-transformers
six==1.17.0
# via
# azure-core
# langdetect
# python-dateutil
smart-open==6.4.0
Expand All @@ -248,7 +319,7 @@ spacy-loggers==1.0.5
# via spacy
spacy-transformers==1.1.9
# via -r analyzer.in
srsly==2.5.0
srsly==2.5.1
# via
# confection
# spacy
Expand Down Expand Up @@ -299,8 +370,12 @@ typer==0.7.0
# weasel
typing-extensions==4.12.2
# via
# azure-core
# azure-identity
# azure-storage-blob
# cloudpathlib
# huggingface-hub
# multidict
# pydantic
# torch
tzlocal==5.2
Expand All @@ -316,6 +391,8 @@ wasabi==0.10.1
# spacy
# thinc
# weasel
yarl==1.18.3
# via aiohttp

# The following packages are considered to be unsafe in a requirements file:
# setuptools

0 comments on commit 6ba7e59

Please sign in to comment.