Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow running multiple analyzer models #800

Merged
merged 2 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion adserver/analyzer/backends/textacynlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class TextacyAnalyzerBackend(NaiveKeywordAnalyzerBackend):
https://textacy.readthedocs.io/en/latest/quickstart.html
"""

TOP_PHRASE_COUNT = 20
TOP_PHRASE_COUNT = 50

# Minimum phrase length where each word isn't required to be in the output phrase
MIN_PHRASE_LENGTH = 6
Expand Down
20 changes: 13 additions & 7 deletions adserver/analyzer/management/commands/runmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from django.core.validators import URLValidator
from django.utils.translation import gettext_lazy as _

from ...utils import get_url_analyzer_backend
from ...utils import get_url_analyzer_backends


class Command(BaseCommand):
Expand All @@ -24,7 +24,7 @@ def add_arguments(self, parser):
def handle(self, *args, **kwargs):
"""Entrypoint to the command."""
self.stdout.write(
_("Using the model from %s") % settings.ADSERVER_ANALYZER_BACKEND
_("Using the model(s) from %s") % settings.ADSERVER_ANALYZER_BACKEND
)

for url in kwargs["urls"]:
Expand All @@ -36,10 +36,16 @@ def handle_url(self, url):
"""Dump questions from metabase to a file."""
self.stdout.write(_("Running against %s") % url)

backend = get_url_analyzer_backend()(url)
keywords = backend.analyze()

if keywords is None:
self.stderr.write(_("Failed to connect/process %s") % url)
keywords = []
for backend in get_url_analyzer_backends():
backend_instance = backend(url)
analyzed_keywords = backend_instance.analyze()
self.stdout.write(
_("Keywords from '%s': %s") % (backend.__name__, analyzed_keywords)
)

if analyzed_keywords:
for kw in analyzed_keywords:
keywords.append(kw)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can just use `keywords.extend(analyzed_keywords)` here instead of the inner `for` loop?


self.stdout.write(_("Keywords/topics: %s") % keywords)
15 changes: 12 additions & 3 deletions adserver/analyzer/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ..models import Publisher
from ..utils import get_day
from .models import AnalyzedUrl
from .utils import get_url_analyzer_backend
from .utils import get_url_analyzer_backends
from .utils import normalize_url
from config.celery_app import app

Expand Down Expand Up @@ -48,10 +48,19 @@ def analyze_url(url, publisher_slug):
return

log.debug("Analyzing url: %s", normalized_url)
keywords = set()

backend = get_url_analyzer_backend()(url)
keywords = backend.analyze() # Can be None
for backend in get_url_analyzer_backends():
backend_instance = backend(url)
analyzed_keywords = backend_instance.analyze() # Can be None
log.debug("Keywords from '%s': %s", backend.__name__, analyzed_keywords)
if analyzed_keywords:
for kw in analyzed_keywords:
keywords.add(kw)

log.debug("Keywords found : %s", keywords)

keywords = list(keywords)
url_obj, created = AnalyzedUrl.objects.get_or_create(
url=normalized_url,
publisher=publisher,
Expand Down
12 changes: 11 additions & 1 deletion adserver/analyzer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,18 @@
from .constants import IGNORED_QUERY_PARAMS


def get_url_analyzer_backends():
    """Yield the analyzer backend class for each configured dotted path.

    Reads ``settings.ADSERVER_ANALYZER_BACKEND`` and passes every non-empty
    entry to ``import_string``, yielding the results in configuration order.

    The setting is normally a list of dotted paths. Two other shapes are
    tolerated so existing configurations keep working:

    * a falsy value (``None``, ``""``, ``[]``) yields nothing;
    * a single string is treated as one dotted path rather than being
      iterated character by character.

    Whatever ``import_string`` raises for a bad path propagates to the caller.
    """
    configured = settings.ADSERVER_ANALYZER_BACKEND
    if not configured:
        # Analyzer disabled: nothing to import.
        return

    if isinstance(configured, str):
        # Single-backend configuration given as a bare dotted path.
        configured = [configured]

    for dotted_path in configured:
        # Skip empty entries (e.g. left over from a trailing comma).
        if dotted_path:
            yield import_string(dotted_path)


def get_url_analyzer_backend():
    """Return the first configured analyzer backend class, or ``None``.

    Delegates to ``get_url_analyzer_backends()`` and takes the first item it
    produces, so only that one backend is imported. Returns ``None`` when
    nothing is produced.
    """
    # iter() keeps this working whether the helper hands back a generator
    # or any other iterable.
    return next(iter(get_url_analyzer_backends()), None)


def normalize_url(url):
Expand Down
8 changes: 4 additions & 4 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,12 +487,12 @@
default="adserver.decisionengine.backends.ProbabilisticFlightBackend",
)

# The backend to be used by the ad server
# The backend(s) to be used by the ad server
# for topic and keyword analysis
# Set to `None` to disable the analyzer entirely
ADSERVER_ANALYZER_BACKEND = env(
# Set to `None` or an empty string to disable the analyzer entirely
ADSERVER_ANALYZER_BACKEND = env.list(
"ADSERVER_ANALYZER_BACKEND",
default="adserver.analyzer.backends.TextacyAnalyzerBackend",
default=["adserver.analyzer.backends.TextacyAnalyzerBackend"],
)
if ADSERVER_ANALYZER_BACKEND:
INSTALLED_APPS.append("adserver.analyzer")
Expand Down